preview of pdfs added
#1
by
koulsahil
- opened
app.py
CHANGED
@@ -9,6 +9,17 @@ import base64
|
|
9 |
import io
|
10 |
from datetime import datetime
|
11 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
# Set page config
|
14 |
st.set_page_config(
|
@@ -27,6 +38,49 @@ This application analyzes SEC filings (10-K, 13F, etc.) to extract:
|
|
27 |
- Potential violations
|
28 |
""")
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
# Sidebar for model selection and settings
|
31 |
st.sidebar.header("Analysis Settings")
|
32 |
|
@@ -79,6 +133,8 @@ risk_keywords_list = [keyword.strip() for keyword in risk_keywords.split(",")]
|
|
79 |
# Add confidence threshold slider
|
80 |
confidence_threshold = st.sidebar.slider("Confidence Threshold", 0.0, 1.0, 0.5)
|
81 |
|
|
|
|
|
82 |
# Function to extract text from PDF
|
83 |
@st.cache_data
|
84 |
def extract_text_from_pdf(pdf_file):
|
@@ -251,9 +307,45 @@ def get_download_link(data, filename, text):
|
|
251 |
return href
|
252 |
|
253 |
# File upload
|
254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
|
256 |
if uploaded_file:
|
|
|
|
|
257 |
with st.spinner("Processing PDF file..."):
|
258 |
# Extract text from PDF
|
259 |
full_text, text_by_page = extract_text_from_pdf(uploaded_file)
|
@@ -500,4 +592,5 @@ else:
|
|
500 |
st.markdown("Download structured analysis results for review by your legal and compliance teams.")
|
501 |
|
502 |
# Add footer with information
|
503 |
-
st.markdown("---")
|
|
|
|
9 |
import io
|
10 |
from datetime import datetime
|
11 |
import json
|
12 |
+
# The libraries below were added to fix the Axios 403 error.
|
13 |
+
from pathlib import Path
|
14 |
+
import os
|
15 |
+
|
16 |
+
|
17 |
+
# The code below matches the Dockerfile config. The app worked without it on Hugging Face, so this needs further investigation.
|
18 |
+
|
19 |
+
#UPLOAD_FOLDER = os.getenv('UPLOAD_FOLDER', '/tmp/uploads')
|
20 |
+
#Path(UPLOAD_FOLDER).mkdir(exist_ok=True) # Ensure directory exists
|
21 |
+
|
22 |
+
|
23 |
|
24 |
# Set page config
|
25 |
st.set_page_config(
|
|
|
38 |
- Potential violations
|
39 |
""")
|
40 |
|
41 |
+
# Function to display PDFs
|
42 |
+
def display_pdf(file, height=350):
    """Render a PDF inline on the Streamlit page as a base64 data-URI iframe.

    Args:
        file: Either a filesystem path (``str`` or ``os.PathLike``) or a
            seekable binary file-like object, e.g. the value returned by
            the file uploader.
        height: Height of the preview iframe, in pixels.

    Returns:
        None. On success the iframe is written to the page. If a path
        cannot be read, an error message is shown and nothing is rendered.
    """
    if isinstance(file, (str, os.PathLike)):
        # Open directly instead of checking os.path.exists() first: this
        # avoids the check-then-open race and lets directories or
        # unreadable files be reported instead of crashing the app.
        try:
            with open(file, "rb") as f:
                pdf_bytes = f.read()
        except FileNotFoundError:
            st.error("Selected PDF not found.")
            return
        except OSError as exc:
            st.error(f"Could not read selected PDF: {exc}")
            return
    else:
        # File-like object: rewind before reading so the whole document is
        # encoded even if an earlier consumer moved the pointer.
        file.seek(0)
        pdf_bytes = file.read()
        # Reset the file pointer to the beginning for later processing.
        file.seek(0)

    base64_pdf = base64.b64encode(pdf_bytes).decode("utf-8")

    pdf_display = f"""
    <iframe
        src="data:application/pdf;base64,{base64_pdf}"
        width="100%"
        height="{height}px"
        style="border: 1px solid #ccc; border-radius: 10px;"
        type="application/pdf">
    </iframe>
    """
    st.markdown(pdf_display, unsafe_allow_html=True)
|
68 |
+
|
69 |
+
# Define sample PDFs
|
70 |
+
# Button label -> path of the bundled sample PDF.
# NOTE(review): the leading "π" in each label looks like a mis-encoded
# emoji; confirm against the original file before changing it.
sample_pdfs = {
    "π Meridian Financial Services, Inc. Annual Report (10-K)": "example.pdf",
    "π Annual Report (10-K)": "Mock_Form_10K.pdf",
    "π Sample Investment Holdings (13F)": "Mock_Form_13F.pdf",
}

# On the first run of a session, default the selection to the first sample.
if "selected_pdf" not in st.session_state:
    st.session_state["selected_pdf"] = next(iter(sample_pdfs.values()))
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
# Sidebar for model selection and settings
|
85 |
st.sidebar.header("Analysis Settings")
|
86 |
|
|
|
133 |
# Add confidence threshold slider
|
134 |
confidence_threshold = st.sidebar.slider("Confidence Threshold", 0.0, 1.0, 0.5)
|
135 |
|
136 |
+
|
137 |
+
|
138 |
# Function to extract text from PDF
|
139 |
@st.cache_data
|
140 |
def extract_text_from_pdf(pdf_file):
|
|
|
307 |
return href
|
308 |
|
309 |
# File upload
|
310 |
+
# Create two columns for PDF preview and file uploader
|
311 |
+
preview_col, upload_col = st.columns([1, 1])

# Upload column: the file uploader plus one button per bundled sample.
with upload_col:
    st.header("Upload Document")
    uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])

    st.markdown("### Or choose a sample:")
    sample_cols = st.columns(len(sample_pdfs))

    for sample_col, (label, file_path) in zip(sample_cols, sample_pdfs.items()):
        with sample_col:
            if not st.button(label):
                continue

            st.session_state["selected_pdf"] = file_path
            # Load the chosen sample into memory so the rest of the script
            # can treat it like an uploaded file.
            try:
                with open(file_path, "rb") as sample_handle:
                    uploaded_file = io.BytesIO(sample_handle.read())
            except FileNotFoundError:
                st.error(f"Sample file {file_path} not found.")
            else:
                uploaded_file.name = file_path

# Preview column: show the uploaded file if there is one, otherwise the
# currently selected sample.
with preview_col:
    st.header("Document Preview")
    preview_source = uploaded_file or st.session_state["selected_pdf"]
    if preview_source:
        display_pdf(preview_source, height=400)
    else:
        st.info("Upload a PDF or select a sample to preview.")
|
343 |
+
|
344 |
+
|
345 |
|
346 |
if uploaded_file:
|
347 |
+
if hasattr(uploaded_file, 'seek'):
|
348 |
+
uploaded_file.seek(0)
|
349 |
with st.spinner("Processing PDF file..."):
|
350 |
# Extract text from PDF
|
351 |
full_text, text_by_page = extract_text_from_pdf(uploaded_file)
|
|
|
592 |
st.markdown("Download structured analysis results for review by your legal and compliance teams.")
|
593 |
|
594 |
# Add footer with information
|
595 |
+
st.markdown("---")
|
596 |
+
st.markdown("Regulatory Report Checker - NLP-powered document analysis for compliance teams")
|