# Set environment variables IMMEDIATELY to prevent root filesystem access
# This must happen before any other imports or operations
import os
import tempfile
import json
from datetime import datetime

# Get a writable temp directory first
try:
    TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
    os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
    try:
        TEMP_DIR = "/tmp/docling_temp"
        os.makedirs(TEMP_DIR, exist_ok=True)
    except Exception:
        TEMP_DIR = os.getcwd()
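# On restricted hosts (e.g. Hugging Face Spaces containers) most of the root
# filesystem is read-only; the system temp directory, /tmp, and the working
# directory are usually the only writable locations, hence the fallback order.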
# Set all environment variables that libraries might use
os.environ.update({
    # Streamlit configuration
    'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
    'STREAMLIT_SERVER_HEADLESS': 'true',
    'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
    'STREAMLIT_SERVER_ENABLE_CORS': 'false',
    'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
    # EasyOCR configuration
    'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
    'HOME': TEMP_DIR,
    'USERPROFILE': TEMP_DIR,
    'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
    'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
    'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
    # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
    'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
    'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
    'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
    'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
    'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
    # Additional Hugging Face specific variables
    'HF_HUB_DISABLE_TELEMETRY': '1',
    'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
    'HF_HUB_OFFLINE': '0',
    # Other ML libraries
    'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
    'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
    'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
    'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
    # Additional cache directories
    'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
    'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
    # Additional environment variables to prevent root access
    'PYTHONPATH': TEMP_DIR,
    'TMPDIR': TEMP_DIR,
    'TEMP': TEMP_DIR,
    'TMP': TEMP_DIR,
    'CACHE': os.path.join(TEMP_DIR, 'cache'),
    'MODELS': os.path.join(TEMP_DIR, 'models'),
    'DATA': os.path.join(TEMP_DIR, 'data'),
    'CONFIG': os.path.join(TEMP_DIR, 'config'),
})
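# Optional sanity check: the critical cache variables should all resolve
# inside TEMP_DIR; a mismatch would suggest the container environment
# overrode the values set above.
for _var in ('HF_HOME', 'HF_HUB_CACHE', 'TORCH_HOME'):
    if not os.environ.get(_var, '').startswith(TEMP_DIR):
        print(f"Warning: {_var} does not point inside {TEMP_DIR}")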
# Create all necessary directories
directories_to_create = [
    os.environ['EASYOCR_MODULE_PATH'],
    os.environ['XDG_CACHE_HOME'],
    os.environ['XDG_CONFIG_HOME'],
    os.environ['XDG_DATA_HOME'],
    os.environ['HF_HOME'],
    os.environ['HF_CACHE_HOME'],
    os.environ['TRANSFORMERS_CACHE'],
    os.environ['HF_DATASETS_CACHE'],
    os.environ['TORCH_HOME'],
    os.environ['TENSORFLOW_HOME'],
    os.environ['KERAS_HOME'],
    os.environ['CACHE_DIR'],
    os.environ['MODEL_CACHE_DIR'],
    os.environ['CACHE'],
    os.environ['MODELS'],
    os.environ['DATA'],
    os.environ['CONFIG'],
    os.environ['HF_HUB_CACHE'],
    os.environ['DIFFUSERS_CACHE'],
    os.environ['ACCELERATE_CACHE'],
]
for directory in directories_to_create:
    try:
        # Create directory and all parent directories
        os.makedirs(directory, mode=0o777, exist_ok=True)
        # Ensure the directory has write permissions
        os.chmod(directory, 0o777)
    except Exception as e:
        print(f"Warning: Could not create directory {directory}: {e}")
# Now import the rest of the modules
import streamlit as st
import logging
import shutil
from processing.document_processor import DocumentProcessor
from processing.sections import ReasoningSectionExtractor
from utils.logging_utils import get_log_handler
from utils.cost_tracker import cost_tracker
from dotenv import load_dotenv
import sys
import difflib
import time

# Configure logging early to avoid issues
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
    force=True
)
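# force=True (Python 3.8+) removes any handlers already attached to the root
# logger, so Streamlit reruns of this script do not stack duplicate handlers.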
# Load environment variables from .env
load_dotenv()
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")

# Log startup information
logging.info("=" * 50)
logging.info("Docling Streamlit App Starting")
logging.info(f"Temp directory: {TEMP_DIR}")
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
logging.info(f"Current working directory: {os.getcwd()}")
logging.info(f"Python version: {sys.version}")
logging.info("=" * 50)
def cleanup_temp_files():
    """Clean up temporary files in the temp directory."""
    try:
        if os.path.exists(TEMP_DIR):
            for filename in os.listdir(TEMP_DIR):
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    try:
                        os.remove(file_path)
                        logging.info(f"Removed temp file: {filename}")
                    except PermissionError as e:
                        logging.warning(f"Permission error removing {filename}: {e}")
                    except Exception as e:
                        logging.warning(f"Error removing {filename}: {e}")
            logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
        else:
            logging.info(f"Temp directory {TEMP_DIR} does not exist")
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
    except Exception as e:
        logging.warning(f"Error cleaning up temp files: {e}")
def clear_all_data():
    """Clear all temporary files and session state data."""
    try:
        # Clean up temp files
        cleanup_temp_files()
        # Clear session state
        for key in ("processed_results", "logs", "original_structures",
                    "show_original", "show_processed", "temp_cleaned",
                    "last_cleanup_time"):
            st.session_state.pop(key, None)
        logging.info("Cleared all session state and temporary files")
        return True
    except Exception as e:
        logging.error(f"Error clearing all data: {e}")
        return False
def get_temp_files_info():
    """Get information about temporary files (count and total size)."""
    try:
        if not os.path.exists(TEMP_DIR):
            return 0, 0
        files = os.listdir(TEMP_DIR)
        total_size = 0
        file_details = []
        for filename in files:
            try:
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    file_size = os.path.getsize(file_path)
                    total_size += file_size
                    file_details.append({
                        'name': filename,
                        'size': file_size,
                        'type': 'file'
                    })
                elif os.path.isdir(file_path):
                    file_details.append({
                        'name': filename,
                        'size': 0,
                        'type': 'directory'
                    })
            except (PermissionError, OSError) as e:
                logging.warning(f"Error accessing file {filename}: {e}")
                file_details.append({
                    'name': filename,
                    'size': 0,
                    'type': 'error'
                })
                continue
        # Log detailed information for debugging
        if file_details:
            logging.info(f"Temp directory contents ({TEMP_DIR}):")
            for detail in file_details:
                logging.info(f"  - {detail['name']} ({detail['type']}): {detail['size']} bytes")
        return len(files), total_size
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
        return 0, 0
    except Exception as e:
        logging.warning(f"Error getting temp files info: {e}")
        return 0, 0
def format_file_size(size_bytes):
    """Format file size in human readable format."""
    if size_bytes == 0:
        return "0 B"
    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1
    return f"{size_bytes:.1f} {size_names[i]}"
def save_uploaded_file(uploaded_file, filename):
    """Save uploaded file to temp directory and return the path."""
    temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
    try:
        uploaded_file.seek(0)  # Reset file pointer to beginning
        file_bytes = uploaded_file.read()
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        logging.info(f"Saved uploaded file to {temp_path}")
        return temp_path
    except PermissionError as e:
        logging.error(f"Permission error saving uploaded file to {temp_path}: {e}")
        raise PermissionError("Cannot save file due to permission restrictions. Please try clearing data or contact support.")
    except Exception as e:
        logging.error(f"Error saving uploaded file: {e}")
        raise
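# Typical usage (as in the button handlers below):
#     temp_path = save_uploaded_file(uploaded_file, selected_file)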
# Configure page layout to use wide mode
st.set_page_config(
    page_title="Medical Document Parser & Redactor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed"
)
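# st.set_page_config must be the first Streamlit command the script executes,
# which is why it precedes every other st.* call below (the functions above
# only touch st.session_state when called later).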
# Add custom CSS for better styling
st.markdown("""
<style>
    /* Custom styling for text areas */
    .stTextArea textarea {
        font-family: 'Courier New', monospace !important;
        font-size: 12px !important;
        line-height: 1.4 !important;
        border: 2px solid #e0e0e0 !important;
        border-radius: 8px !important;
    }
    /* Hover effect for text areas */
    .stTextArea textarea:hover {
        border-color: #1f77b4 !important;
    }
    /* Custom styling for download buttons */
    .stDownloadButton > button {
        border-radius: 8px !important;
        font-weight: 600 !important;
    }
    /* Custom styling for the comparison section */
    .comparison-container {
        background-color: #f8f9fa;
        padding: 20px;
        border-radius: 10px;
        border: 1px solid #e9ecef;
    }
</style>
""", unsafe_allow_html=True)
# Configure root logger only once (avoid duplicate handlers on reruns)
if len(logging.getLogger().handlers) == 0:
    logging.getLogger().setLevel(logging.INFO)
    # (We will attach custom handlers during processing as needed)

# Title and description
st.title("Medical Document Parser & Redactor")
st.write("""
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser)
and automatically **redact specific sections** (e.g., initial and final medication lists).
Use the buttons below to view the original structure or process with redaction.

**💡 Tip:** This is a Hugging Face Space with limited storage. Use the "Clear All Data" button to remove temporary files when you're done processing documents.
""")
# Add clear all data button at the top
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
    if clear_all_data():
        st.success("✅ All data cleared successfully! The application has been reset.")
        cost_tracker.reset_session()  # Reset cost tracking when clearing data
        st.rerun()
    else:
        st.error("❌ Error clearing data. Please try again.")
# File uploader (accept multiple PDF files)
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)

# Clean up temp files on app start (but keep the directory)
if "temp_cleaned" not in st.session_state:
    cleanup_temp_files()
    st.session_state.temp_cleaned = True
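# st.session_state persists across reruns within a browser session, so the
# temp_cleaned flag makes the cleanup above run only once per session.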
# Initialize session state storage for results and logs
if "processed_results" not in st.session_state:
    st.session_state.processed_results = {}  # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
if "logs" not in st.session_state:
    st.session_state.logs = {}  # {filename: log_text}
if "original_structures" not in st.session_state:
    st.session_state.original_structures = {}  # {filename: structured_json}

# Show temp directory status and cleanup button
temp_file_count, total_size = get_temp_files_info()

# Automatic cleanup: if temp files are too old or too large, clean them up
if "last_cleanup_time" not in st.session_state:
    st.session_state.last_cleanup_time = time.time()

# Check if we should do automatic cleanup (every 30 minutes or if files are too large)
current_time = time.time()
time_since_cleanup = current_time - st.session_state.last_cleanup_time
if (time_since_cleanup > 1800 or  # 30 minutes
        total_size > 100 * 1024 * 1024):  # 100MB
    if temp_file_count > 0:
        cleanup_temp_files()
        st.session_state.last_cleanup_time = current_time
        st.info("🧹 Automatic cleanup: Removed old temporary files")
        # Recalculate after cleanup
        temp_file_count, total_size = get_temp_files_info()
# Create a row with temp file status and delete button
col1, col2 = st.columns([3, 1])
with col1:
    if temp_file_count > 0:
        st.caption(f"📁 {temp_file_count} temporary file(s) - Total size: {format_file_size(total_size)}")
        # Show warning if total size is large
        if total_size > 50 * 1024 * 1024:  # 50MB
            st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
        # Debug: Show temp files (expandable)
        with st.expander("🔍 Debug: View temporary files"):
            try:
                if os.path.exists(TEMP_DIR):
                    files = os.listdir(TEMP_DIR)
                    if files:
                        st.write("**Temporary files in directory:**")
                        for filename in files:
                            file_path = os.path.join(TEMP_DIR, filename)
                            try:
                                if os.path.isfile(file_path):
                                    size = os.path.getsize(file_path)
                                    st.write(f"📄 {filename} ({format_file_size(size)})")
                                elif os.path.isdir(file_path):
                                    st.write(f"📁 {filename} (directory)")
                                else:
                                    st.write(f"❓ {filename} (unknown)")
                            except Exception as e:
                                st.write(f"❌ {filename} (error: {e})")
                    else:
                        st.write("No files found in temp directory")
                else:
                    st.write("Temp directory does not exist")
            except Exception as e:
                st.write(f"Error accessing temp directory: {e}")
    else:
        st.caption("📁 No temporary files")
with col2:
    if temp_file_count > 0:
        if st.button("🗑️ Delete Temp Files", type="secondary", help="Remove all temporary files from the server"):
            try:
                cleanup_temp_files()
                st.success(f"✅ Successfully deleted {temp_file_count} temporary file(s)")
                st.rerun()  # Refresh the page to update the file count
            except Exception as e:
                st.error(f"❌ Error deleting temporary files: {e}")
    else:
        st.caption("No files to delete")
if uploaded_files:
    # UI to select which file to work with (if multiple files uploaded)
    file_names = [f.name for f in uploaded_files]
    selected_file = st.selectbox("Select a file to work with", options=file_names)
    if selected_file:
        # Find the selected uploaded file
        uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
        # Create buttons for different actions
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            if st.button("📄 Show Original", type="primary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Also store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                # Display the original structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
        with col2:
            if st.button("🔒 Process with Redaction"):
                # Process the document with redaction
                if selected_file not in st.session_state.processed_results:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Ensure the deployment name is in the cost tracker
                    if AZURE_OPENAI_DEPLOYMENT and AZURE_OPENAI_DEPLOYMENT not in cost_tracker.get_available_models():
                        model_type = cost_tracker.guess_model_type(AZURE_OPENAI_DEPLOYMENT)
                        cost_tracker.add_deployment_pricing(AZURE_OPENAI_DEPLOYMENT, model_type)
                    # Use the new processing function
                    from processing.document_processor import process_document_with_redaction
                    # Attach an in-memory log handler to capture logs for this file
                    log_handler, log_buffer = get_log_handler()
                    root_logger = logging.getLogger()
                    root_logger.addHandler(log_handler)
                    try:
                        # Process the document using the new function
                        processing_result = process_document_with_redaction(
                            file_path=temp_path,
                            endpoint=AZURE_OPENAI_ENDPOINT,
                            api_key=AZURE_OPENAI_KEY,
                            api_version=AZURE_OPENAI_VERSION,
                            deployment=AZURE_OPENAI_DEPLOYMENT,
                        )
                        # Save results in session state (maintaining compatibility with existing UI)
                        st.session_state.processed_results[selected_file] = {
                            "structured_json": processing_result.original_document_json,
                            "redacted_md": processing_result.redacted_document_md,
                            "redacted_json": processing_result.redacted_document_json,  # Now this is actually redacted!
                            "original_markdown": processing_result.original_document_md,
                            "processing_result": processing_result  # Store the new result
                        }
                    finally:
                        # Remove handler and stop capturing logs
                        root_logger.removeHandler(log_handler)
                        # Combine log records into a single text
                        log_text = "\n".join(log_buffer)
                        st.session_state.logs[selected_file] = log_text
                st.session_state.show_original = False
                st.session_state.show_processed = True
        with col3:
            if st.button("🔄 Switch View"):
                # Toggle between views
                if st.session_state.get("show_original", False):
                    st.session_state.show_original = False
                    st.session_state.show_processed = True
                else:
                    st.session_state.show_original = True
                    st.session_state.show_processed = False
        with col4:
            if st.button("📄 Show Original JSON", type="secondary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                    # Store the original YAML for comparison
                    st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
                # Display the original JSON structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
                st.session_state.show_json = True
                st.session_state.show_yaml = False
        with col5:
            if st.button("📄 Show Original YAML", type="secondary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                    # Store the original YAML for comparison
                    st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
                # Display the original YAML structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
                st.session_state.show_json = False
                st.session_state.show_yaml = True
        # Show current view status
        if st.session_state.get("show_original", False):
            st.info("📄 Currently viewing: **Original Document Structure**")
        elif st.session_state.get("show_processed", False):
            st.success("🔒 Currently viewing: **Processed Document with Redaction**")
        else:
            st.info("ℹ️ Select an action above to view document content")

        # Display results based on button clicked
        if st.session_state.get("show_original", False):
            st.markdown("---")
            # Determine what to show based on button clicked
            show_json = st.session_state.get("show_json", False)
            show_yaml = st.session_state.get("show_yaml", False)
            if show_json:
                st.subheader(f"Original Document Structure (JSON) - {selected_file}")
            elif show_yaml:
                st.subheader(f"Original Document Structure (YAML) - {selected_file}")
            else:
                st.subheader(f"Original Document Structure (Markdown) - {selected_file}")
            # Get the original structure
            original_json = st.session_state.original_structures[selected_file]
            original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
            original_yaml = st.session_state.original_structures.get(f"{selected_file}_yaml", "")
            # Display PDF viewer and content side by side
            col1, col2 = st.columns([1, 1])
            with col1:
                st.subheader("📄 Original PDF")
                # Reset file pointer to beginning
                uploaded_file.seek(0)
                # Display PDF using base64 encoding for inline display
                import base64
                pdf_bytes = uploaded_file.getvalue()
                b64_pdf = base64.b64encode(pdf_bytes).decode()
                pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
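                # Note: some browsers restrict data: URIs in iframes, and very
                # large PDFs can exceed practical URL-size limits; if the
                # preview stays blank, offering the bytes through
                # st.download_button is a reasonable fallback.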
            with col2:
                if show_json:
                    st.subheader("📋 Original Document (JSON)")
                    st.caption("Docling-generated JSON structure from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original JSON content",
                        value=json.dumps(original_json, indent=2, ensure_ascii=False),
                        height=600,
                        key="original_json_display",
                        label_visibility="collapsed"
                    )
                elif show_yaml:
                    st.subheader("📋 Original Document (YAML)")
                    st.caption("Docling-generated YAML structure from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original YAML content",
                        value=original_yaml,
                        height=600,
                        key="original_yaml_display",
                        label_visibility="collapsed"
                    )
                else:
                    st.subheader("📋 Original Document (Markdown)")
                    st.caption("Docling-generated markdown from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original markdown content",
                        value=original_markdown,
                        height=600,
                        key="original_markdown_display",
                        label_visibility="collapsed"
                    )
            # Add download buttons for the original content
            st.markdown("---")
            col1, col2, col3 = st.columns(3)
            with col1:
                if show_json:
                    st.download_button(
                        label="📥 Download Original JSON",
                        data=json.dumps(original_json, indent=2, ensure_ascii=False),
                        file_name=f"{selected_file}_original.json",
                        mime="application/json"
                    )
                elif show_yaml:
                    st.download_button(
                        label="📥 Download Original YAML",
                        data=original_yaml,
                        file_name=f"{selected_file}_original.yaml",
                        mime="text/yaml"
                    )
                else:
                    st.download_button(
                        label="📥 Download Original Markdown",
                        data=original_markdown,
                        file_name=f"{selected_file}_original.md",
                        mime="text/markdown"
                    )
            with col2:
                if show_json or show_yaml:
                    st.subheader("📊 Document Structure")
                    st.json(original_json)
                else:
                    st.subheader("📊 JSON Structure")
                    st.json(original_json)
            with col3:
                if show_json or show_yaml:
                    # Show format information
                    st.subheader("📋 Format Info")
                    if show_json:
                        st.info("**JSON Format**: Structured data representation with key-value pairs")
                        st.write("**Use case**: API integration, data processing, programmatic access")
                    elif show_yaml:
                        st.info("**YAML Format**: Human-readable data serialization")
                        st.write("**Use case**: Configuration files, documentation, easy reading")
                else:
                    st.subheader("📋 Markdown Info")
                    st.info("**Markdown Format**: Formatted text with headers, lists, and styling")
                    st.write("**Use case**: Documentation, readable output, web display")
        elif st.session_state.get("show_processed", False):
            st.markdown("---")
            st.subheader(f"Processed Document - {selected_file}")
            # Retrieve stored results
            data = st.session_state.processed_results[selected_file]
            structured_json = data["structured_json"]
            redacted_md = data["redacted_md"]
            redacted_json = data["redacted_json"]
            original_md = data["original_markdown"]
            # Show processing summary
            original_texts = structured_json.get("texts", [])
            redacted_texts = redacted_json.get("texts", [])
            removed_count = len(original_texts) - len(redacted_texts)
            if removed_count > 0:
                st.success(f"✅ Successfully removed {removed_count} text elements containing medication information")
            else:
                st.info("ℹ️ No medication sections were identified for removal")
            # Create tabs for different views
            tab1, tab2, tab3 = st.tabs(["📄 Side-by-Side Comparison", "🔍 JSON Structure", "📊 Processing Details"])
            with tab1:
                st.subheader("Original vs Redacted Content")
                st.caption("Compare the original document content with the redacted version")
                # Get the actual removed indices from the processing result
                actual_removed_indices = []
                if "processing_result" in st.session_state.processed_results[selected_file]:
                    processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                    actual_removed_indices = processing_result.removed_indices
                # Create a more intelligent side-by-side comparison based on JSON structure
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📋 Original Document**")
                    # Display original content with removed sections highlighted
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        # Check if this element was removed
                        is_removed = i in actual_removed_indices
                        if is_removed:
                            # Highlight removed content in red
                            st.markdown(f"""
                            <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px;">
                            <strong>Text {i} ({label}) - REMOVED:</strong><br>
                            {text_content}
                            </div>
                            """, unsafe_allow_html=True)
                        else:
                            # Show normal content
                            content_preview = text_content[:150] + "..." if len(text_content) > 150 else text_content
                            st.markdown(f"""
                            <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
                            <strong>Text {i} ({label}) - {len(text_content)} chars:</strong><br>
                            <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
                            </div>
                            """, unsafe_allow_html=True)
                with col2:
                    st.markdown("**🔒 Redacted Document**")
                    # Display redacted content (only non-removed elements)
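                    # redacted_texts holds only the surviving elements, so a
                    # separate cursor (redacted_index) walks it in lockstep
                    # with the original list, advancing only on preserved items.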
                    redacted_index = 0
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        # Check if this element was removed
                        is_removed = i in actual_removed_indices
                        if is_removed:
                            # Show placeholder for removed content
                            st.markdown(f"""
                            <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px; font-style: italic; opacity: 0.7;">
                            <strong>Text {i} ({label}) - REMOVED</strong><br>
                            [Content removed by redaction]
                            </div>
                            """, unsafe_allow_html=True)
                        else:
                            # Show the actual content from redacted texts
                            if redacted_index < len(redacted_texts):
                                redacted_content = redacted_texts[redacted_index].get("text", "")
                                content_preview = redacted_content[:150] + "..." if len(redacted_content) > 150 else redacted_content
                                st.markdown(f"""
                                <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
                                <strong>Text {i} ({label}) - {len(redacted_content)} chars:</strong><br>
                                <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
                                </div>
                                """, unsafe_allow_html=True)
                                redacted_index += 1
                            else:
                                st.markdown(f"""
                                <div style="padding: 4px; margin: 2px 0; border-radius: 4px; background-color: #f5f5f5;">
                                <strong>Text {i} ({label}):</strong><br>
                                [Content preserved]
                                </div>
                                """, unsafe_allow_html=True)
                # Add legend
                st.markdown("---")
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**🎨 Comparison Legend:**")
                    st.markdown("🔴 **Red background** = Removed content")
                    st.markdown("⚪ **White background** = Preserved content")
                    st.markdown("📝 **Italic text** = Placeholder for removed content")
                with col2:
                    st.markdown("**💡 How to read:**")
                    st.markdown("Left panel shows original with removed sections highlighted")
                    st.markdown("Right panel shows redacted version with placeholders")
                    st.markdown("Compare corresponding text indices to see changes")
                # Add debug information to help identify missing content
                with st.expander("🔍 Debug: Content Analysis"):
                    st.write("**Searching for table content...**")
                    # Search for table-related content in original texts
                    table_elements = []
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
                            table_elements.append({
                                "index": i,
                                "label": label,
                                "content": text_content[:200] + "..." if len(text_content) > 200 else text_content,
                                "is_removed": i in actual_removed_indices
                            })
                    if table_elements:
                        st.write(f"**Found {len(table_elements)} table-related elements:**")
                        for elem in table_elements:
                            status = "🔴 REMOVED" if elem["is_removed"] else "✅ PRESERVED"
                            st.write(f"**Text {elem['index']} ({elem['label']}) - {status}:**")
                            st.write(f"`{elem['content']}`")
                            st.write("---")
                    else:
                        st.write("**No table-related content found in original texts**")
                    # Also check redacted texts
                    st.write("**Table content in redacted texts:**")
                    table_elements_redacted = []
                    for i, text_elem in enumerate(redacted_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
                            table_elements_redacted.append({
                                "index": i,
                                "label": label,
                                "content": text_content[:200] + "..." if len(text_content) > 200 else text_content
                            })
                    if table_elements_redacted:
                        st.write(f"**Found {len(table_elements_redacted)} table-related elements in redacted content:**")
                        for elem in table_elements_redacted:
                            st.write(f"**Text {elem['index']} ({elem['label']}):**")
                            st.write(f"`{elem['content']}`")
                            st.write("---")
                    else:
                        st.write("**No table-related content found in redacted texts**")
                # Add download buttons for redacted content
                st.markdown("---")
                st.subheader("📥 Download Redacted Content")
                col1, col2, col3 = st.columns(3)
                with col1:
                    # Download redacted markdown
                    st.download_button(
                        label="📄 Download Redacted Markdown",
                        data=redacted_md,
                        file_name=f"{selected_file}_redacted.md",
                        mime="text/markdown",
                        help="Download the redacted document as Markdown format"
                    )
                with col2:
                    # Generate and download redacted PDF
                    pdf_generated = False
                    pdf_bytes = None
                    if st.button("📋 Generate Redacted PDF", help="Generate a PDF version of the redacted document"):
                        with st.spinner("Generating redacted PDF..."):
                            try:
                                # Create a DocumentProcessor to access PDF generation
                                temp_path = save_uploaded_file(uploaded_file, selected_file)
                                processor = DocumentProcessor(section_extractor=None)
                                # Generate PDF path
                                base_name = os.path.splitext(selected_file)[0]
                                pdf_path = os.path.join(TEMP_DIR, f"{base_name}_redacted.pdf")
                                # Generate the PDF
                                success = processor.generate_redacted_pdf(redacted_json, pdf_path)
                                if success:
                                    # Read the generated PDF and store for download
                                    with open(pdf_path, "rb") as pdf_file:
                                        pdf_bytes = pdf_file.read()
                                    pdf_generated = True
                                    st.success("✅ PDF generated successfully!")
                                else:
                                    st.error("❌ Failed to generate PDF. Check logs for details.")
                            except Exception as e:
                                st.error(f"❌ Error generating PDF: {e}")
                                st.info("💡 Make sure reportlab is installed: `pip install reportlab`")
                    # Show download button if PDF was generated
                    if pdf_generated and pdf_bytes:
                        st.download_button(
                            label="📥 Download Redacted PDF",
                            data=pdf_bytes,
                            file_name=f"{os.path.splitext(selected_file)[0]}_redacted.pdf",
                            mime="application/pdf",
                            help="Download the redacted document as PDF"
                        )
                        # Show debug information about what's in the PDF
                        with st.expander("🔍 Debug: PDF Content Analysis"):
                            st.write("**Content that will be included in the PDF:**")
                            texts_in_pdf = redacted_json.get("texts", [])
                            st.write(f"Total text elements: {len(texts_in_pdf)}")
                            for i, text_elem in enumerate(texts_in_pdf):
                                text_content = text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")
                                label = text_elem.get("label", "")
                                st.write(f"**Text {i} ({label}):** {text_content}")
                    elif not pdf_generated:
                        st.info("💡 Click 'Generate Redacted PDF' to create a PDF version")
                with col3:
                    # Download redacted JSON structure
                    st.download_button(
                        label="🔧 Download Redacted JSON",
                        data=json.dumps(redacted_json, indent=2, ensure_ascii=False),
                        file_name=f"{selected_file}_redacted.json",
                        mime="application/json",
                        help="Download the redacted document structure as JSON"
                    )
            with tab2:
                st.subheader("Document Structure Analysis")
                # Show JSON structure comparison
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📊 Original Structure (JSON)**")
                    st.json(structured_json)
                with col2:
                    st.markdown("**🔒 Redacted Structure (JSON)**")
                    st.json(redacted_json)
            with tab3:
                st.subheader("Processing Details")
                # Show cost analysis for this processing session
                st.subheader("💰 Cost Analysis")
                # Get cost data from the processing result
                if "processing_result" in st.session_state.processed_results[selected_file]:
                    processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Total Cost", f"${processing_result.cost:.4f}")
                    with col2:
                        st.metric("Input Tokens", f"{processing_result.input_tokens:,}")
                    with col3:
                        st.metric("Output Tokens", f"{processing_result.output_tokens:,}")
                    # Add download button for cost report
                    cost_report = {
                        "timestamp": datetime.now().isoformat(),
                        "total_cost": processing_result.cost,
                        "input_tokens": processing_result.input_tokens,
                        "output_tokens": processing_result.output_tokens,
                        "total_tokens": processing_result.input_tokens + processing_result.output_tokens,
                        "document_processed": selected_file,
                        "model_used": AZURE_OPENAI_DEPLOYMENT
                    }
                    st.download_button(
                        label="📥 Download Cost Report (JSON)",
                        data=json.dumps(cost_report, indent=2),
                        file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                        mime="application/json"
                    )
                    # Show model information
                    model_info = cost_tracker.get_model_info(AZURE_OPENAI_DEPLOYMENT)
                    if model_info:
                        st.subheader("Model Information")
                        st.write(f"**Model:** {model_info.description}")
                        st.write(f"**Input cost:** ${model_info.input_cost_per_1k_tokens:.4f}/1K tokens")
                        st.write(f"**Output cost:** ${model_info.output_cost_per_1k_tokens:.4f}/1K tokens")
                        # Calculate cost breakdown
                        input_cost = (processing_result.input_tokens / 1000) * model_info.input_cost_per_1k_tokens
                        output_cost = (processing_result.output_tokens / 1000) * model_info.output_cost_per_1k_tokens
                        st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
                else:
                    # Fallback to old cost summary method
                    cost_summary = cost_tracker.get_session_summary()
                    if cost_summary["usage_count"] > 0:
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Total Cost", f"${cost_summary['total_cost']:.4f}")
                        with col2:
                            st.metric("Total Tokens", f"{cost_summary['total_tokens']:,}")
                        with col3:
                            st.metric("API Calls", cost_summary["usage_count"])
                        # Add download button for cost report
                        cost_report = {
                            "timestamp": datetime.now().isoformat(),
                            "total_cost": cost_summary["total_cost"],
                            "total_tokens": cost_summary["total_tokens"],
                            "api_calls": cost_summary["usage_count"],
                            "model_breakdown": cost_summary["model_breakdown"],
                            "document_processed": selected_file
                        }
                        st.download_button(
                            label="📥 Download Cost Report (JSON)",
                            data=json.dumps(cost_report, indent=2),
                            file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                            mime="application/json"
                        )
                        # Show detailed model breakdown
                        if cost_summary["model_breakdown"]:
                            st.subheader("Model Usage Breakdown")
                            for model, stats in cost_summary["model_breakdown"].items():
                                model_info = cost_tracker.get_model_info(model)
                                model_display_name = model_info.description if model_info else model
                                with st.expander(f"{model_display_name} - ${stats['cost']:.4f}"):
                                    col1, col2 = st.columns(2)
                                    with col1:
                                        st.write(f"**Input tokens:** {stats['input_tokens']:,}")
                                        st.write(f"**Output tokens:** {stats['output_tokens']:,}")
                                    with col2:
                                        st.write(f"**Total tokens:** {stats['total_tokens']:,}")
                                        st.write(f"**API calls:** {stats['usage_count']}")
                                    # Show cost breakdown
                                    if model_info:
                                        input_cost = (stats['input_tokens'] / 1000) * model_info.input_cost_per_1k_tokens
                                        output_cost = (stats['output_tokens'] / 1000) * model_info.output_cost_per_1k_tokens
                                        st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
                    else:
                        st.info("No API calls recorded for this session")
                # Show what was removed
                if removed_count > 0:
                    st.info(f"**Removed {removed_count} text elements from the document structure.**")
                    # Show the removed text elements - use the actual indices from the processing result
                    st.subheader("Removed Text Elements:")
                    # Get the actual indices that were removed from the processing result
                    if "processing_result" in st.session_state.processed_results[selected_file]:
                        # Get the actual removed indices from the LLM response
                        processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                        actual_removed_indices = processing_result.removed_indices
                        if actual_removed_indices:
                            st.info(f"**Elements removed by LLM analysis ({len(actual_removed_indices)} elements):**")
                            for idx in actual_removed_indices:
                                if idx < len(original_texts):
                                    text_content = original_texts[idx].get("text", "")
                                    st.text(f"Text {idx}: {text_content[:100]}{'...' if len(text_content) > 100 else ''}")
                                else:
                                    st.text(f"Text {idx}: [Index out of bounds]")
                        else:
                            st.info("**No elements were identified for removal by the LLM.**")
                    else:
                        # Fallback to the old method if processing result not available
                        st.warning("**Note: Using fallback calculation method**")
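                        # Note: this index-aligned comparison drifts once any
                        # element has been removed, so it can over-report
                        # differences; it is only a rough fallback when the
                        # precise removed_indices are unavailable.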
                        removed_texts = []
                        for i, text_elem in enumerate(original_texts):
                            if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
                                removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
                        for idx, text in removed_texts:
                            st.text(f"Text {idx}: {text}")
                else:
                    st.info("No text elements were removed during processing.")
                # Show processing logs
                st.subheader("Processing Logs")
                st.text_area(
                    label="Processing logs",
                    value=st.session_state.logs.get(selected_file, ""),
                    height=300,
                    label_visibility="collapsed"
                )