# Set environment variables IMMEDIATELY to prevent root filesystem access
# This must happen before any other imports or operations
import os
import tempfile
import json
from datetime import datetime
# Get a writable temp directory first
try:
    TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
    os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
    try:
        TEMP_DIR = "/tmp/docling_temp"
        os.makedirs(TEMP_DIR, exist_ok=True)
    except Exception:
        TEMP_DIR = os.getcwd()
# Set all environment variables that libraries might use
os.environ.update({
    # Streamlit configuration
    'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
    'STREAMLIT_SERVER_HEADLESS': 'true',
    'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
    'STREAMLIT_SERVER_ENABLE_CORS': 'false',
    'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
    # EasyOCR configuration
    'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
    'HOME': TEMP_DIR,
    'USERPROFILE': TEMP_DIR,
    'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
    'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
    'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
    # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
    'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
    'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
    'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
    'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
    'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
    # Additional Hugging Face specific variables
    'HF_HUB_DISABLE_TELEMETRY': '1',
    'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
    'HF_HUB_OFFLINE': '0',
    # Other ML libraries
    'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
    'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
    'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
    'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
    # Additional cache directories
    'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
    'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
    # Additional environment variables to prevent root access
    'PYTHONPATH': TEMP_DIR,
    'TMPDIR': TEMP_DIR,
    'TEMP': TEMP_DIR,
    'TMP': TEMP_DIR,
    'CACHE': os.path.join(TEMP_DIR, 'cache'),
    'MODELS': os.path.join(TEMP_DIR, 'models'),
    'DATA': os.path.join(TEMP_DIR, 'data'),
    'CONFIG': os.path.join(TEMP_DIR, 'config'),
})
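# With HOME redirected, anything that resolves the user's home directory
# (e.g. os.path.expanduser("~") on POSIX) now lands inside TEMP_DIR instead of
# "/", which is what keeps libraries from touching a read-only root filesystem.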
# Create all necessary directories
directories_to_create = [
    os.environ['EASYOCR_MODULE_PATH'],
    os.environ['XDG_CACHE_HOME'],
    os.environ['XDG_CONFIG_HOME'],
    os.environ['XDG_DATA_HOME'],
    os.environ['HF_HOME'],
    os.environ['HF_CACHE_HOME'],
    os.environ['TRANSFORMERS_CACHE'],
    os.environ['HF_DATASETS_CACHE'],
    os.environ['TORCH_HOME'],
    os.environ['TENSORFLOW_HOME'],
    os.environ['KERAS_HOME'],
    os.environ['CACHE_DIR'],
    os.environ['MODEL_CACHE_DIR'],
    os.environ['CACHE'],
    os.environ['MODELS'],
    os.environ['DATA'],
    os.environ['CONFIG'],
    os.environ['HF_HUB_CACHE'],
    os.environ['DIFFUSERS_CACHE'],
    os.environ['ACCELERATE_CACHE'],
]
for directory in directories_to_create:
    try:
        # Create directory and all parent directories
        os.makedirs(directory, mode=0o777, exist_ok=True)
        # Ensure the directory has write permissions
        os.chmod(directory, 0o777)
    except Exception as e:
        print(f"Warning: Could not create directory {directory}: {e}")
# Now import the rest of the modules
import streamlit as st
import logging
import shutil
from processing.document_processor import DocumentProcessor
from processing.sections import ReasoningSectionExtractor
from utils.logging_utils import get_log_handler
from utils.cost_tracker import cost_tracker
from dotenv import load_dotenv
import sys
import difflib
import time
# Configure logging early to avoid issues
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
    force=True
)
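# force=True (Python 3.8+) removes and closes any handlers already attached to
# the root logger before applying this configuration, so Streamlit reruns do
# not stack duplicate handlers here.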
# Load environment variables from .env
load_dotenv()
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
# Log startup information
logging.info("=" * 50)
logging.info("Docling Streamlit App Starting")
logging.info(f"Temp directory: {TEMP_DIR}")
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
logging.info(f"Current working directory: {os.getcwd()}")
logging.info(f"Python version: {sys.version}")
logging.info("=" * 50)
def cleanup_temp_files():
    """Remove regular files from the temp directory (subdirectories are left in place)."""
    try:
        if os.path.exists(TEMP_DIR):
            for filename in os.listdir(TEMP_DIR):
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    try:
                        os.remove(file_path)
                        logging.info(f"Removed temp file: {filename}")
                    except PermissionError as e:
                        logging.warning(f"Permission error removing {filename}: {e}")
                    except Exception as e:
                        logging.warning(f"Error removing {filename}: {e}")
            logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
        else:
            logging.info(f"Temp directory {TEMP_DIR} does not exist")
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
    except Exception as e:
        logging.warning(f"Error cleaning up temp files: {e}")
def clear_all_data():
    """Clear all temporary files and session state data."""
    try:
        # Clean up temp files
        cleanup_temp_files()
        # Clear session state
        for key in (
            "processed_results",
            "logs",
            "original_structures",
            "show_original",
            "show_processed",
            "temp_cleaned",
            "last_cleanup_time",
        ):
            st.session_state.pop(key, None)
        logging.info("Cleared all session state and temporary files")
        return True
    except Exception as e:
        logging.error(f"Error clearing all data: {e}")
        return False
def get_temp_files_info():
    """Get information about temporary files (count and total size)."""
    try:
        if not os.path.exists(TEMP_DIR):
            return 0, 0
        files = os.listdir(TEMP_DIR)
        total_size = 0
        file_details = []
        for filename in files:
            try:
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    file_size = os.path.getsize(file_path)
                    total_size += file_size
                    file_details.append({
                        'name': filename,
                        'size': file_size,
                        'type': 'file'
                    })
                elif os.path.isdir(file_path):
                    file_details.append({
                        'name': filename,
                        'size': 0,
                        'type': 'directory'
                    })
            except (PermissionError, OSError) as e:
                logging.warning(f"Error accessing file {filename}: {e}")
                file_details.append({
                    'name': filename,
                    'size': 0,
                    'type': 'error'
                })
                continue
        # Log detailed information for debugging
        if file_details:
            logging.info(f"Temp directory contents ({TEMP_DIR}):")
            for detail in file_details:
                logging.info(f"  - {detail['name']} ({detail['type']}): {detail['size']} bytes")
        return len(files), total_size
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
        return 0, 0
    except Exception as e:
        logging.warning(f"Error getting temp files info: {e}")
        return 0, 0
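# Usage (illustrative):
#   count, size = get_temp_files_info()   # e.g. (3, 1536) for three entries
# Note that `count` includes directories, while `size` only sums regular files.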
def format_file_size(size_bytes):
    """Format file size in human readable format."""
    if size_bytes == 0:
        return "0 B"
    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1
    return f"{size_bytes:.1f} {size_names[i]}"
def save_uploaded_file(uploaded_file, filename):
    """Save uploaded file to temp directory and return the path."""
    temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
    try:
        uploaded_file.seek(0)  # Reset file pointer to beginning
        file_bytes = uploaded_file.read()
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        logging.info(f"Saved uploaded file to {temp_path}")
        return temp_path
    except PermissionError as e:
        logging.error(f"Permission error saving uploaded file to {temp_path}: {e}")
        raise PermissionError("Cannot save file due to permission restrictions. Please try clearing data or contact support.")
    except Exception as e:
        logging.error(f"Error saving uploaded file: {e}")
        raise
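# Usage (illustrative): given a Streamlit UploadedFile named "report.pdf",
#   temp_path = save_uploaded_file(uploaded_file, uploaded_file.name)
# yields a path like os.path.join(TEMP_DIR, "temp_report.pdf").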
# Configure page layout to use wide mode
st.set_page_config(
    page_title="Medical Document Parser & Redactor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Add custom CSS for better styling
st.markdown("""
<style>
/* Custom styling for text areas */
.stTextArea textarea {
font-family: 'Courier New', monospace !important;
font-size: 12px !important;
line-height: 1.4 !important;
border: 2px solid #e0e0e0 !important;
border-radius: 8px !important;
}
/* Hover effect for text areas */
.stTextArea textarea:hover {
border-color: #1f77b4 !important;
}
/* Custom styling for download buttons */
.stDownloadButton > button {
border-radius: 8px !important;
font-weight: 600 !important;
}
/* Custom styling for the comparison section */
.comparison-container {
background-color: #f8f9fa;
padding: 20px;
border-radius: 10px;
border: 1px solid #e9ecef;
}
</style>
""", unsafe_allow_html=True)
# Configure root logger only once (avoid duplicate handlers on reruns)
if len(logging.getLogger().handlers) == 0:
    logging.getLogger().setLevel(logging.INFO)
    # (We will attach custom handlers during processing as needed)
# Title and description
st.title("Medical Document Parser & Redactor")
st.write("""
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser)
and automatically **redact specific sections** (e.g., initial and final medication lists).
Use the buttons below to view the original structure or process with redaction.
**💡 Tip:** This is a Hugging Face Space with limited storage. Use the "Clear All Data" button to remove temporary files when you're done processing documents.
""")
# Add clear all data button at the top
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
if clear_all_data():
st.success("✅ All data cleared successfully! The application has been reset.")
cost_tracker.reset_session() # Reset cost tracking when clearing data
st.rerun()
else:
st.error("❌ Error clearing data. Please try again.")
# File uploader (accept multiple PDF files)
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)
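# Note: with accept_multiple_files=True, st.file_uploader returns a list of
# UploadedFile objects (empty, or None on older Streamlit versions, until the
# user uploads something), so the `if uploaded_files:` check further below
# covers both the no-upload and empty-list cases.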
# Clean up temp files on app start (but keep the directory)
if "temp_cleaned" not in st.session_state:
cleanup_temp_files()
st.session_state.temp_cleaned = True
# Initialize session state storage for results and logs
if "processed_results" not in st.session_state:
st.session_state.processed_results = {} # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
if "logs" not in st.session_state:
st.session_state.logs = {} # {filename: log_text}
if "original_structures" not in st.session_state:
st.session_state.original_structures = {} # {filename: structured_json}
# Show temp directory status and cleanup button
temp_file_count, total_size = get_temp_files_info()
# Automatic cleanup: if temp files are too old or too large, clean them up
if "last_cleanup_time" not in st.session_state:
st.session_state.last_cleanup_time = time.time()
# Check if we should do automatic cleanup (every 30 minutes or if files are too large)
current_time = time.time()
time_since_cleanup = current_time - st.session_state.last_cleanup_time
if (time_since_cleanup > 1800 or # 30 minutes
total_size > 100 * 1024 * 1024): # 100MB
if temp_file_count > 0:
cleanup_temp_files()
st.session_state.last_cleanup_time = current_time
st.info("🧹 Automatic cleanup: Removed old temporary files")
# Recalculate after cleanup
temp_file_count, total_size = get_temp_files_info()
# Create a row with temp file status and delete button
col1, col2 = st.columns([3, 1])
with col1:
    if temp_file_count > 0:
        st.caption(f"📁 {temp_file_count} temporary file(s) - Total size: {format_file_size(total_size)}")
        # Show warning if total size is large
        if total_size > 50 * 1024 * 1024:  # 50MB
            st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
        # Debug: Show temp files (expandable)
        with st.expander("🔍 Debug: View temporary files"):
            try:
                if os.path.exists(TEMP_DIR):
                    files = os.listdir(TEMP_DIR)
                    if files:
                        st.write("**Temporary files in directory:**")
                        for filename in files:
                            file_path = os.path.join(TEMP_DIR, filename)
                            try:
                                if os.path.isfile(file_path):
                                    size = os.path.getsize(file_path)
                                    st.write(f"📄 {filename} ({format_file_size(size)})")
                                elif os.path.isdir(file_path):
                                    st.write(f"📁 {filename} (directory)")
                                else:
                                    st.write(f"❓ {filename} (unknown)")
                            except Exception as e:
                                st.write(f"❌ {filename} (error: {e})")
                    else:
                        st.write("No files found in temp directory")
                else:
                    st.write("Temp directory does not exist")
            except Exception as e:
                st.write(f"Error accessing temp directory: {e}")
    else:
        st.caption("📁 No temporary files")
with col2:
    if temp_file_count > 0:
        if st.button("🗑️ Delete Temp Files", type="secondary", help="Remove all temporary files from the server"):
            try:
                cleanup_temp_files()
                st.success(f"✅ Successfully deleted {temp_file_count} temporary file(s)")
                st.rerun()  # Refresh the page to update the file count
            except Exception as e:
                st.error(f"❌ Error deleting temporary files: {e}")
    else:
        st.caption("No files to delete")
if uploaded_files:
    # UI to select which file to work with (if multiple files uploaded)
    file_names = [f.name for f in uploaded_files]
    selected_file = st.selectbox("Select a file to work with", options=file_names)
    if selected_file:
        # Find the selected uploaded file
        uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
        # Create buttons for different actions
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            if st.button("📄 Show Original", type="primary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Also store the original markdown and YAML so the other views
                    # work even when this cached result is reused
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                    st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
                # Display the original structure (markdown view)
                st.session_state.show_original = True
                st.session_state.show_processed = False
                st.session_state.show_json = False
                st.session_state.show_yaml = False
        with col2:
            if st.button("🔒 Process with Redaction"):
                # Process the document with redaction
                if selected_file not in st.session_state.processed_results:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Ensure the deployment name is in the cost tracker
                    if AZURE_OPENAI_DEPLOYMENT and AZURE_OPENAI_DEPLOYMENT not in cost_tracker.get_available_models():
                        model_type = cost_tracker.guess_model_type(AZURE_OPENAI_DEPLOYMENT)
                        cost_tracker.add_deployment_pricing(AZURE_OPENAI_DEPLOYMENT, model_type)
                    # Use the new processing function
                    from processing.document_processor import process_document_with_redaction
                    # Attach an in-memory log handler to capture logs for this file
                    log_handler, log_buffer = get_log_handler()
                    root_logger = logging.getLogger()
                    root_logger.addHandler(log_handler)
                    try:
                        # Process the document using the new function
                        processing_result = process_document_with_redaction(
                            file_path=temp_path,
                            endpoint=AZURE_OPENAI_ENDPOINT,
                            api_key=AZURE_OPENAI_KEY,
                            api_version=AZURE_OPENAI_VERSION,
                            deployment=AZURE_OPENAI_DEPLOYMENT,
                        )
                        # Save results in session state (maintaining compatibility with existing UI)
                        st.session_state.processed_results[selected_file] = {
                            "structured_json": processing_result.original_document_json,
                            "redacted_md": processing_result.redacted_document_md,
                            "redacted_json": processing_result.redacted_document_json,  # Now this is actually redacted!
                            "original_markdown": processing_result.original_document_md,
                            "processing_result": processing_result  # Store the new result
                        }
                    finally:
                        # Remove handler and stop capturing logs
                        root_logger.removeHandler(log_handler)
                        # Combine log records into a single text
                        log_text = "\n".join(log_buffer)
                        st.session_state.logs[selected_file] = log_text
                st.session_state.show_original = False
                st.session_state.show_processed = True
        with col3:
            if st.button("🔄 Switch View"):
                # Toggle between views
                if st.session_state.get("show_original", False):
                    st.session_state.show_original = False
                    st.session_state.show_processed = True
                else:
                    st.session_state.show_original = True
                    st.session_state.show_processed = False
        with col4:
            if st.button("📄 Show Original JSON", type="secondary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                    # Store the original YAML for comparison
                    st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
                # Display the original JSON structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
                st.session_state.show_json = True
                st.session_state.show_yaml = False
        with col5:
            if st.button("📄 Show Original YAML", type="secondary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                    # Store the original YAML for comparison
                    st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
                # Display the original YAML structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
                st.session_state.show_json = False
                st.session_state.show_yaml = True
        # Show current view status
        if st.session_state.get("show_original", False):
            st.info("📄 Currently viewing: **Original Document Structure**")
        elif st.session_state.get("show_processed", False):
            st.success("🔒 Currently viewing: **Processed Document with Redaction**")
        else:
            st.info("ℹ️ Select an action above to view document content")
        # Display results based on button clicked
        if st.session_state.get("show_original", False):
            st.markdown("---")
            # Determine what to show based on button clicked
            show_json = st.session_state.get("show_json", False)
            show_yaml = st.session_state.get("show_yaml", False)
            if show_json:
                st.subheader(f"Original Document Structure (JSON) - {selected_file}")
            elif show_yaml:
                st.subheader(f"Original Document Structure (YAML) - {selected_file}")
            else:
                st.subheader(f"Original Document Structure (Markdown) - {selected_file}")
            # Get the original structure
            original_json = st.session_state.original_structures[selected_file]
            original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
            original_yaml = st.session_state.original_structures.get(f"{selected_file}_yaml", "")
            # Display PDF viewer and content side by side
            col1, col2 = st.columns([1, 1])
            with col1:
                st.subheader("📄 Original PDF")
                # Reset file pointer to beginning
                uploaded_file.seek(0)
                # Display PDF using base64 encoding for inline display
                import base64
                pdf_bytes = uploaded_file.getvalue()
                b64_pdf = base64.b64encode(pdf_bytes).decode()
                pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
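                # Caveat (assumption based on general browser behavior): some
                # browsers restrict PDF rendering from data: URIs inside iframes,
                # and base64 inflates the embedded document by roughly 4/3, so a
                # download button is a safer fallback for very large PDFs.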
            with col2:
                if show_json:
                    st.subheader("📋 Original Document (JSON)")
                    st.caption("Docling-generated JSON structure from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original JSON content",
                        value=json.dumps(original_json, indent=2, ensure_ascii=False),
                        height=600,
                        key="original_json_display",
                        label_visibility="collapsed"
                    )
                elif show_yaml:
                    st.subheader("📋 Original Document (YAML)")
                    st.caption("Docling-generated YAML structure from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original YAML content",
                        value=original_yaml,
                        height=600,
                        key="original_yaml_display",
                        label_visibility="collapsed"
                    )
                else:
                    st.subheader("📋 Original Document (Markdown)")
                    st.caption("Docling-generated markdown from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original markdown content",
                        value=original_markdown,
                        height=600,
                        key="original_markdown_display",
                        label_visibility="collapsed"
                    )
            # Add download buttons for the original content
            st.markdown("---")
            col1, col2, col3 = st.columns(3)
            with col1:
                if show_json:
                    st.download_button(
                        label="📥 Download Original JSON",
                        data=json.dumps(original_json, indent=2, ensure_ascii=False),
                        file_name=f"{selected_file}_original.json",
                        mime="application/json"
                    )
                elif show_yaml:
                    st.download_button(
                        label="📥 Download Original YAML",
                        data=original_yaml,
                        file_name=f"{selected_file}_original.yaml",
                        mime="text/yaml"
                    )
                else:
                    st.download_button(
                        label="📥 Download Original Markdown",
                        data=original_markdown,
                        file_name=f"{selected_file}_original.md",
                        mime="text/markdown"
                    )
            with col2:
                if show_json or show_yaml:
                    st.subheader("📊 Document Structure")
                    st.json(original_json)
                else:
                    st.subheader("📊 JSON Structure")
                    st.json(original_json)
            with col3:
                if show_json or show_yaml:
                    # Show format information
                    st.subheader("📋 Format Info")
                    if show_json:
                        st.info("**JSON Format**: Structured data representation with key-value pairs")
                        st.write("**Use case**: API integration, data processing, programmatic access")
                    elif show_yaml:
                        st.info("**YAML Format**: Human-readable data serialization")
                        st.write("**Use case**: Configuration files, documentation, easy reading")
                else:
                    st.subheader("📋 Markdown Info")
                    st.info("**Markdown Format**: Formatted text with headers, lists, and styling")
                    st.write("**Use case**: Documentation, readable output, web display")
        elif st.session_state.get("show_processed", False):
            st.markdown("---")
            st.subheader(f"Processed Document - {selected_file}")
            # Retrieve stored results
            data = st.session_state.processed_results[selected_file]
            structured_json = data["structured_json"]
            redacted_md = data["redacted_md"]
            redacted_json = data["redacted_json"]
            original_md = data["original_markdown"]
            # Show processing summary
            original_texts = structured_json.get("texts", [])
            redacted_texts = redacted_json.get("texts", [])
            removed_count = len(original_texts) - len(redacted_texts)
            if removed_count > 0:
                st.success(f"✅ Successfully removed {removed_count} text elements containing medication information")
            else:
                st.info("ℹ️ No medication sections were identified for removal")
            # Create tabs for different views
            tab1, tab2, tab3 = st.tabs(["📄 Side-by-Side Comparison", "🔍 JSON Structure", "📊 Processing Details"])
            with tab1:
                st.subheader("Original vs Redacted Content")
                st.caption("Compare the original document content with the redacted version")
                # Get the actual removed indices from the processing result
                actual_removed_indices = []
                if "processing_result" in st.session_state.processed_results[selected_file]:
                    processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                    actual_removed_indices = processing_result.removed_indices
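                # removed_indices is read here as positions into the ORIGINAL
                # texts list (e.g. [3, 7] marks the 4th and 8th elements), which
                # is why the loops below iterate over original_texts and advance
                # a separate cursor through redacted_texts.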
                # Create a more intelligent side-by-side comparison based on JSON structure
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📋 Original Document**")
                    # Display original content with removed sections highlighted
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        # Check if this element was removed
                        is_removed = i in actual_removed_indices
                        if is_removed:
                            # Highlight removed content in red
                            st.markdown(f"""
                            <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px;">
                            <strong>Text {i} ({label}) - REMOVED:</strong><br>
                            {text_content}
                            </div>
                            """, unsafe_allow_html=True)
                        else:
                            # Show normal content
                            content_preview = text_content[:150] + "..." if len(text_content) > 150 else text_content
                            st.markdown(f"""
                            <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
                            <strong>Text {i} ({label}) - {len(text_content)} chars:</strong><br>
                            <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
                            </div>
                            """, unsafe_allow_html=True)
                with col2:
                    st.markdown("**🔒 Redacted Document**")
                    # Display redacted content (only non-removed elements)
                    redacted_index = 0
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        # Check if this element was removed
                        is_removed = i in actual_removed_indices
                        if is_removed:
                            # Show placeholder for removed content
                            st.markdown(f"""
                            <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px; font-style: italic; opacity: 0.7;">
                            <strong>Text {i} ({label}) - REMOVED</strong><br>
                            [Content removed by redaction]
                            </div>
                            """, unsafe_allow_html=True)
                        else:
                            # Show the actual content from redacted texts
                            if redacted_index < len(redacted_texts):
                                redacted_content = redacted_texts[redacted_index].get("text", "")
                                content_preview = redacted_content[:150] + "..." if len(redacted_content) > 150 else redacted_content
                                st.markdown(f"""
                                <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
                                <strong>Text {i} ({label}) - {len(redacted_content)} chars:</strong><br>
                                <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
                                </div>
                                """, unsafe_allow_html=True)
                                redacted_index += 1
                            else:
                                st.markdown(f"""
                                <div style="padding: 4px; margin: 2px 0; border-radius: 4px; background-color: #f5f5f5;">
                                <strong>Text {i} ({label}):</strong><br>
                                [Content preserved]
                                </div>
                                """, unsafe_allow_html=True)
                # Add legend
                st.markdown("---")
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**🎨 Comparison Legend:**")
                    st.markdown("🔴 **Red background** = Removed content")
                    st.markdown("⚪ **White background** = Preserved content")
                    st.markdown("📝 **Italic text** = Placeholder for removed content")
                with col2:
                    st.markdown("**💡 How to read:**")
                    st.markdown("Left panel shows original with removed sections highlighted")
                    st.markdown("Right panel shows redacted version with placeholders")
                    st.markdown("Compare corresponding text indices to see changes")
                # Add debug information to help identify missing content
                with st.expander("🔍 Debug: Content Analysis"):
                    st.write("**Searching for table content...**")
                    # Search for table-related content in original texts
                    table_elements = []
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
                            table_elements.append({
                                "index": i,
                                "label": label,
                                "content": text_content[:200] + "..." if len(text_content) > 200 else text_content,
                                "is_removed": i in actual_removed_indices
                            })
                    if table_elements:
                        st.write(f"**Found {len(table_elements)} table-related elements:**")
                        for elem in table_elements:
                            status = "🔴 REMOVED" if elem["is_removed"] else "✅ PRESERVED"
                            st.write(f"**Text {elem['index']} ({elem['label']}) - {status}:**")
                            st.write(f"`{elem['content']}`")
                            st.write("---")
                    else:
                        st.write("**No table-related content found in original texts**")
                    # Also check redacted texts
                    st.write("**Table content in redacted texts:**")
                    table_elements_redacted = []
                    for i, text_elem in enumerate(redacted_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
                            table_elements_redacted.append({
                                "index": i,
                                "label": label,
                                "content": text_content[:200] + "..." if len(text_content) > 200 else text_content
                            })
                    if table_elements_redacted:
                        st.write(f"**Found {len(table_elements_redacted)} table-related elements in redacted content:**")
                        for elem in table_elements_redacted:
                            st.write(f"**Text {elem['index']} ({elem['label']}):**")
                            st.write(f"`{elem['content']}`")
                            st.write("---")
                    else:
                        st.write("**No table-related content found in redacted texts**")
                # Add download buttons for redacted content
                st.markdown("---")
                st.subheader("📥 Download Redacted Content")
                col1, col2, col3 = st.columns(3)
                with col1:
                    # Download redacted markdown
                    st.download_button(
                        label="📄 Download Redacted Markdown",
                        data=redacted_md,
                        file_name=f"{selected_file}_redacted.md",
                        mime="text/markdown",
                        help="Download the redacted document as Markdown format"
                    )
                with col2:
                    # Generate and download redacted PDF.
                    # Note: these flags are plain locals, so the generated PDF only
                    # survives the rerun in which the button was clicked; persisting
                    # the bytes in st.session_state would keep the download available
                    # across reruns.
                    pdf_generated = False
                    pdf_bytes = None
                    if st.button("📋 Generate Redacted PDF", help="Generate a PDF version of the redacted document"):
                        with st.spinner("Generating redacted PDF..."):
                            try:
                                # Create a DocumentProcessor to access PDF generation
                                temp_path = save_uploaded_file(uploaded_file, selected_file)
                                processor = DocumentProcessor(section_extractor=None)
                                # Generate PDF path
                                base_name = os.path.splitext(selected_file)[0]
                                pdf_path = os.path.join(TEMP_DIR, f"{base_name}_redacted.pdf")
                                # Generate the PDF
                                success = processor.generate_redacted_pdf(redacted_json, pdf_path)
                                if success:
                                    # Read the generated PDF and store for download
                                    with open(pdf_path, "rb") as pdf_file:
                                        pdf_bytes = pdf_file.read()
                                    pdf_generated = True
                                    st.success("✅ PDF generated successfully!")
                                else:
                                    st.error("❌ Failed to generate PDF. Check logs for details.")
                            except Exception as e:
                                st.error(f"❌ Error generating PDF: {e}")
                                st.info("💡 Make sure reportlab is installed: `pip install reportlab`")
                    # Show download button if PDF was generated
                    if pdf_generated and pdf_bytes:
                        st.download_button(
                            label="📥 Download Redacted PDF",
                            data=pdf_bytes,
                            file_name=f"{os.path.splitext(selected_file)[0]}_redacted.pdf",
                            mime="application/pdf",
                            help="Download the redacted document as PDF"
                        )
                        # Show debug information about what's in the PDF
                        with st.expander("🔍 Debug: PDF Content Analysis"):
                            st.write("**Content that will be included in the PDF:**")
                            texts_in_pdf = redacted_json.get("texts", [])
                            st.write(f"Total text elements: {len(texts_in_pdf)}")
                            for i, text_elem in enumerate(texts_in_pdf):
                                text_content = text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")
                                label = text_elem.get("label", "")
                                st.write(f"**Text {i} ({label}):** {text_content}")
                    elif not pdf_generated:
                        st.info("💡 Click 'Generate Redacted PDF' to create a PDF version")
                with col3:
                    # Download redacted JSON structure
                    st.download_button(
                        label="🔧 Download Redacted JSON",
                        data=json.dumps(redacted_json, indent=2, ensure_ascii=False),
                        file_name=f"{selected_file}_redacted.json",
                        mime="application/json",
                        help="Download the redacted document structure as JSON"
                    )
            with tab2:
                st.subheader("Document Structure Analysis")
                # Show JSON structure comparison
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📊 Original Structure (JSON)**")
                    st.json(structured_json)
                with col2:
                    st.markdown("**🔒 Redacted Structure (JSON)**")
                    st.json(redacted_json)
            with tab3:
                st.subheader("Processing Details")
                # Show cost analysis for this processing session
                st.subheader("💰 Cost Analysis")
                # Get cost data from the processing result
                if "processing_result" in st.session_state.processed_results[selected_file]:
                    processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Total Cost", f"${processing_result.cost:.4f}")
                    with col2:
                        st.metric("Input Tokens", f"{processing_result.input_tokens:,}")
                    with col3:
                        st.metric("Output Tokens", f"{processing_result.output_tokens:,}")
                    # Add download button for cost report
                    cost_report = {
                        "timestamp": datetime.now().isoformat(),
                        "total_cost": processing_result.cost,
                        "input_tokens": processing_result.input_tokens,
                        "output_tokens": processing_result.output_tokens,
                        "total_tokens": processing_result.input_tokens + processing_result.output_tokens,
                        "document_processed": selected_file,
                        "model_used": AZURE_OPENAI_DEPLOYMENT
                    }
                    st.download_button(
                        label="📥 Download Cost Report (JSON)",
                        data=json.dumps(cost_report, indent=2),
                        file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                        mime="application/json"
                    )
                    # Show model information
                    model_info = cost_tracker.get_model_info(AZURE_OPENAI_DEPLOYMENT)
                    if model_info:
                        st.subheader("Model Information")
                        st.write(f"**Model:** {model_info.description}")
                        st.write(f"**Input cost:** ${model_info.input_cost_per_1k_tokens:.4f}/1K tokens")
                        st.write(f"**Output cost:** ${model_info.output_cost_per_1k_tokens:.4f}/1K tokens")
                        # Calculate cost breakdown
                        input_cost = (processing_result.input_tokens / 1000) * model_info.input_cost_per_1k_tokens
                        output_cost = (processing_result.output_tokens / 1000) * model_info.output_cost_per_1k_tokens
                        st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
                else:
                    # Fallback to old cost summary method
                    cost_summary = cost_tracker.get_session_summary()
                    if cost_summary["usage_count"] > 0:
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Total Cost", f"${cost_summary['total_cost']:.4f}")
                        with col2:
                            st.metric("Total Tokens", f"{cost_summary['total_tokens']:,}")
                        with col3:
                            st.metric("API Calls", cost_summary["usage_count"])
                        # Add download button for cost report
                        cost_report = {
                            "timestamp": datetime.now().isoformat(),
                            "total_cost": cost_summary["total_cost"],
                            "total_tokens": cost_summary["total_tokens"],
                            "api_calls": cost_summary["usage_count"],
                            "model_breakdown": cost_summary["model_breakdown"],
                            "document_processed": selected_file
                        }
                        st.download_button(
                            label="📥 Download Cost Report (JSON)",
                            data=json.dumps(cost_report, indent=2),
                            file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                            mime="application/json"
                        )
                        # Show detailed model breakdown
                        if cost_summary["model_breakdown"]:
                            st.subheader("Model Usage Breakdown")
                            for model, stats in cost_summary["model_breakdown"].items():
                                model_info = cost_tracker.get_model_info(model)
                                model_display_name = model_info.description if model_info else model
                                with st.expander(f"{model_display_name} - ${stats['cost']:.4f}"):
                                    col1, col2 = st.columns(2)
                                    with col1:
                                        st.write(f"**Input tokens:** {stats['input_tokens']:,}")
                                        st.write(f"**Output tokens:** {stats['output_tokens']:,}")
                                    with col2:
                                        st.write(f"**Total tokens:** {stats['total_tokens']:,}")
                                        st.write(f"**API calls:** {stats['usage_count']}")
                                    # Show cost breakdown
                                    if model_info:
                                        input_cost = (stats['input_tokens'] / 1000) * model_info.input_cost_per_1k_tokens
                                        output_cost = (stats['output_tokens'] / 1000) * model_info.output_cost_per_1k_tokens
                                        st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
                    else:
                        st.info("No API calls recorded for this session")
                # Show what was removed
                if removed_count > 0:
                    st.info(f"**Removed {removed_count} text elements from the document structure.**")
                    # Show the removed text elements - use the actual indices from the processing result
                    st.subheader("Removed Text Elements:")
                    # Get the actual indices that were removed from the processing result
                    if "processing_result" in st.session_state.processed_results[selected_file]:
                        # Get the actual removed indices from the LLM response
                        processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                        actual_removed_indices = processing_result.removed_indices
                        if actual_removed_indices:
                            st.info(f"**Elements removed by LLM analysis ({len(actual_removed_indices)} elements):**")
                            for idx in actual_removed_indices:
                                if idx < len(original_texts):
                                    text_content = original_texts[idx].get("text", "")
                                    st.text(f"Text {idx}: {text_content[:100]}{'...' if len(text_content) > 100 else ''}")
                                else:
                                    st.text(f"Text {idx}: [Index out of bounds]")
                        else:
                            st.info("**No elements were identified for removal by the LLM.**")
                    else:
                        # Fallback to the old method if processing result not available
                        st.warning("**Note: Using fallback calculation method**")
                        removed_texts = []
                        for i, text_elem in enumerate(original_texts):
                            if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
                                removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
                        for idx, text in removed_texts:
                            st.text(f"Text {idx}: {text}")
                else:
                    st.info("No text elements were removed during processing.")
                # Show processing logs
                st.subheader("Processing Logs")
                st.text_area(
                    label="Processing logs",
                    value=st.session_state.logs.get(selected_file, ""),
                    height=300,
                    label_visibility="collapsed"
                )