# Set environment variables IMMEDIATELY to prevent root filesystem access
# This must happen before any other imports or operations
import os
import tempfile
import json
from datetime import datetime

# Get a writable temp directory first
try:
    TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
    os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
    try:
        TEMP_DIR = "/tmp/docling_temp"
        os.makedirs(TEMP_DIR, exist_ok=True)
    except Exception:
        TEMP_DIR = os.getcwd()
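# On restricted hosts (e.g. Hugging Face Spaces containers) most of the root
# filesystem is read-only; the system temp directory, /tmp, and the working
# directory are usually the only writable locations, hence the fallback order.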
# Set all environment variables that libraries might use
os.environ.update({
    # Streamlit configuration
    'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
    'STREAMLIT_SERVER_HEADLESS': 'true',
    'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
    'STREAMLIT_SERVER_ENABLE_CORS': 'false',
    'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
    # EasyOCR configuration
    'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
    'HOME': TEMP_DIR,
    'USERPROFILE': TEMP_DIR,
    'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
    'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
    'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
    # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
    'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
    'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
    'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
    'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
    'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
    # Additional Hugging Face specific variables
    'HF_HUB_DISABLE_TELEMETRY': '1',
    'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
    'HF_HUB_OFFLINE': '0',
    # Other ML libraries
    'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
    'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
    'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
    'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
    # Additional cache directories
    'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
    'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
    # Additional environment variables to prevent root access
    'PYTHONPATH': TEMP_DIR,
    'TMPDIR': TEMP_DIR,
    'TEMP': TEMP_DIR,
    'TMP': TEMP_DIR,
    'CACHE': os.path.join(TEMP_DIR, 'cache'),
    'MODELS': os.path.join(TEMP_DIR, 'models'),
    'DATA': os.path.join(TEMP_DIR, 'data'),
    'CONFIG': os.path.join(TEMP_DIR, 'config'),
})
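# Optional sanity check: the critical cache variables should all resolve
# inside TEMP_DIR; a mismatch would suggest the container environment
# overrode the values set above.
for _var in ('HF_HOME', 'HF_HUB_CACHE', 'TORCH_HOME'):
    if not os.environ.get(_var, '').startswith(TEMP_DIR):
        print(f"Warning: {_var} does not point inside {TEMP_DIR}")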
# Create all necessary directories
directories_to_create = [
    os.environ['EASYOCR_MODULE_PATH'],
    os.environ['XDG_CACHE_HOME'],
    os.environ['XDG_CONFIG_HOME'],
    os.environ['XDG_DATA_HOME'],
    os.environ['HF_HOME'],
    os.environ['HF_CACHE_HOME'],
    os.environ['TRANSFORMERS_CACHE'],
    os.environ['HF_DATASETS_CACHE'],
    os.environ['TORCH_HOME'],
    os.environ['TENSORFLOW_HOME'],
    os.environ['KERAS_HOME'],
    os.environ['CACHE_DIR'],
    os.environ['MODEL_CACHE_DIR'],
    os.environ['CACHE'],
    os.environ['MODELS'],
    os.environ['DATA'],
    os.environ['CONFIG'],
    os.environ['HF_HUB_CACHE'],
    os.environ['DIFFUSERS_CACHE'],
    os.environ['ACCELERATE_CACHE'],
]
for directory in directories_to_create:
    try:
        # Create directory and all parent directories
        os.makedirs(directory, mode=0o777, exist_ok=True)
        # Ensure the directory has write permissions
        os.chmod(directory, 0o777)
    except Exception as e:
        print(f"Warning: Could not create directory {directory}: {e}")
# Now import the rest of the modules
import streamlit as st
import logging
import shutil
from processing.document_processor import DocumentProcessor
from processing.sections import ReasoningSectionExtractor
from utils.logging_utils import get_log_handler
from utils.cost_tracker import cost_tracker
from dotenv import load_dotenv
import sys
import difflib
import time

# Configure logging early to avoid issues
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
    force=True
)
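# force=True (Python 3.8+) removes any handlers already attached to the root
# logger, so Streamlit reruns of this script do not stack duplicate handlers.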
# Load environment variables from .env
load_dotenv()
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")

# Log startup information
logging.info("=" * 50)
logging.info("Docling Streamlit App Starting")
logging.info(f"Temp directory: {TEMP_DIR}")
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
logging.info(f"Current working directory: {os.getcwd()}")
logging.info(f"Python version: {sys.version}")
logging.info("=" * 50)
def cleanup_temp_files():
    """Clean up temporary files in the temp directory."""
    try:
        if os.path.exists(TEMP_DIR):
            for filename in os.listdir(TEMP_DIR):
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    try:
                        os.remove(file_path)
                        logging.info(f"Removed temp file: {filename}")
                    except PermissionError as e:
                        logging.warning(f"Permission error removing {filename}: {e}")
                    except Exception as e:
                        logging.warning(f"Error removing {filename}: {e}")
            logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
        else:
            logging.info(f"Temp directory {TEMP_DIR} does not exist")
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
    except Exception as e:
        logging.warning(f"Error cleaning up temp files: {e}")
def clear_all_data():
    """Clear all temporary files and session state data."""
    try:
        # Clean up temp files
        cleanup_temp_files()
        # Clear session state
        for key in ("processed_results", "logs", "original_structures",
                    "show_original", "show_processed", "temp_cleaned",
                    "last_cleanup_time"):
            st.session_state.pop(key, None)
        logging.info("Cleared all session state and temporary files")
        return True
    except Exception as e:
        logging.error(f"Error clearing all data: {e}")
        return False
def get_temp_files_info():
    """Get information about temporary files (count and total size)."""
    try:
        if not os.path.exists(TEMP_DIR):
            return 0, 0
        files = os.listdir(TEMP_DIR)
        total_size = 0
        file_details = []
        for filename in files:
            try:
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    file_size = os.path.getsize(file_path)
                    total_size += file_size
                    file_details.append({
                        'name': filename,
                        'size': file_size,
                        'type': 'file'
                    })
                elif os.path.isdir(file_path):
                    file_details.append({
                        'name': filename,
                        'size': 0,
                        'type': 'directory'
                    })
            except (PermissionError, OSError) as e:
                logging.warning(f"Error accessing file {filename}: {e}")
                file_details.append({
                    'name': filename,
                    'size': 0,
                    'type': 'error'
                })
                continue
        # Log detailed information for debugging
        if file_details:
            logging.info(f"Temp directory contents ({TEMP_DIR}):")
            for detail in file_details:
                logging.info(f"  - {detail['name']} ({detail['type']}): {detail['size']} bytes")
        return len(files), total_size
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
        return 0, 0
    except Exception as e:
        logging.warning(f"Error getting temp files info: {e}")
        return 0, 0
def format_file_size(size_bytes):
    """Format file size in human readable format."""
    if size_bytes == 0:
        return "0 B"
    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1
    return f"{size_bytes:.1f} {size_names[i]}"
def save_uploaded_file(uploaded_file, filename):
    """Save uploaded file to temp directory and return the path."""
    temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
    try:
        uploaded_file.seek(0)  # Reset file pointer to beginning
        file_bytes = uploaded_file.read()
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        logging.info(f"Saved uploaded file to {temp_path}")
        return temp_path
    except PermissionError as e:
        logging.error(f"Permission error saving uploaded file to {temp_path}: {e}")
        raise PermissionError("Cannot save file due to permission restrictions. Please try clearing data or contact support.")
    except Exception as e:
        logging.error(f"Error saving uploaded file: {e}")
        raise
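# Typical usage (as in the button handlers below):
#     temp_path = save_uploaded_file(uploaded_file, selected_file)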
# Configure page layout to use wide mode
st.set_page_config(
    page_title="Medical Document Parser & Redactor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed"
)
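# st.set_page_config must be the first Streamlit command the script executes,
# which is why it precedes every other st.* call below (the functions above
# only touch st.session_state when called later).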
# Add custom CSS for better styling
st.markdown("""
<style>
    /* Custom styling for text areas */
    .stTextArea textarea {
        font-family: 'Courier New', monospace !important;
        font-size: 12px !important;
        line-height: 1.4 !important;
        border: 2px solid #e0e0e0 !important;
        border-radius: 8px !important;
    }
    /* Hover effect for text areas */
    .stTextArea textarea:hover {
        border-color: #1f77b4 !important;
    }
    /* Custom styling for download buttons */
    .stDownloadButton > button {
        border-radius: 8px !important;
        font-weight: 600 !important;
    }
    /* Custom styling for the comparison section */
    .comparison-container {
        background-color: #f8f9fa;
        padding: 20px;
        border-radius: 10px;
        border: 1px solid #e9ecef;
    }
</style>
""", unsafe_allow_html=True)
# Configure root logger only once (avoid duplicate handlers on reruns)
if len(logging.getLogger().handlers) == 0:
    logging.getLogger().setLevel(logging.INFO)
    # (We will attach custom handlers during processing as needed)

# Title and description
st.title("Medical Document Parser & Redactor")
st.write("""
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser)
and automatically **redact specific sections** (e.g., initial and final medication lists).
Use the buttons below to view the original structure or process with redaction.

**💡 Tip:** This is a Hugging Face Space with limited storage. Use the "Clear All Data" button to remove temporary files when you're done processing documents.
""")
# Add clear all data button at the top
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
    if clear_all_data():
        st.success("✅ All data cleared successfully! The application has been reset.")
        cost_tracker.reset_session()  # Reset cost tracking when clearing data
        st.rerun()
    else:
        st.error("❌ Error clearing data. Please try again.")
# File uploader (accept multiple PDF files)
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)

# Clean up temp files on app start (but keep the directory)
if "temp_cleaned" not in st.session_state:
    cleanup_temp_files()
    st.session_state.temp_cleaned = True
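# st.session_state persists across reruns within a browser session, so the
# temp_cleaned flag makes the cleanup above run only once per session.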
# Initialize session state storage for results and logs
if "processed_results" not in st.session_state:
    st.session_state.processed_results = {}  # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
if "logs" not in st.session_state:
    st.session_state.logs = {}  # {filename: log_text}
if "original_structures" not in st.session_state:
    st.session_state.original_structures = {}  # {filename: structured_json}

# Show temp directory status and cleanup button
temp_file_count, total_size = get_temp_files_info()

# Automatic cleanup: if temp files are too old or too large, clean them up
if "last_cleanup_time" not in st.session_state:
    st.session_state.last_cleanup_time = time.time()

# Check if we should do automatic cleanup (every 30 minutes or if files are too large)
current_time = time.time()
time_since_cleanup = current_time - st.session_state.last_cleanup_time
if (time_since_cleanup > 1800 or  # 30 minutes
        total_size > 100 * 1024 * 1024):  # 100MB
    if temp_file_count > 0:
        cleanup_temp_files()
        st.session_state.last_cleanup_time = current_time
        st.info("🧹 Automatic cleanup: Removed old temporary files")
        # Recalculate after cleanup
        temp_file_count, total_size = get_temp_files_info()
# Create a row with temp file status and delete button
col1, col2 = st.columns([3, 1])
with col1:
    if temp_file_count > 0:
        st.caption(f"📁 {temp_file_count} temporary file(s) - Total size: {format_file_size(total_size)}")
        # Show warning if total size is large
        if total_size > 50 * 1024 * 1024:  # 50MB
            st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
        # Debug: Show temp files (expandable)
        with st.expander("🔍 Debug: View temporary files"):
            try:
                if os.path.exists(TEMP_DIR):
                    files = os.listdir(TEMP_DIR)
                    if files:
                        st.write("**Temporary files in directory:**")
                        for filename in files:
                            file_path = os.path.join(TEMP_DIR, filename)
                            try:
                                if os.path.isfile(file_path):
                                    size = os.path.getsize(file_path)
                                    st.write(f"📄 {filename} ({format_file_size(size)})")
                                elif os.path.isdir(file_path):
                                    st.write(f"📁 {filename} (directory)")
                                else:
                                    st.write(f"❓ {filename} (unknown)")
                            except Exception as e:
                                st.write(f"❌ {filename} (error: {e})")
                    else:
                        st.write("No files found in temp directory")
                else:
                    st.write("Temp directory does not exist")
            except Exception as e:
                st.write(f"Error accessing temp directory: {e}")
    else:
        st.caption("📁 No temporary files")
with col2:
    if temp_file_count > 0:
        if st.button("🗑️ Delete Temp Files", type="secondary", help="Remove all temporary files from the server"):
            try:
                cleanup_temp_files()
                st.success(f"✅ Successfully deleted {temp_file_count} temporary file(s)")
                st.rerun()  # Refresh the page to update the file count
            except Exception as e:
                st.error(f"❌ Error deleting temporary files: {e}")
    else:
        st.caption("No files to delete")
if uploaded_files:
    # UI to select which file to work with (if multiple files uploaded)
    file_names = [f.name for f in uploaded_files]
    selected_file = st.selectbox("Select a file to work with", options=file_names)
    if selected_file:
        # Find the selected uploaded file
        uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
        # Create buttons for different actions
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            if st.button("📄 Show Original", type="primary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Also store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                # Display the original structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
        with col2:
            if st.button("🔒 Process with Redaction"):
                # Process the document with redaction
                if selected_file not in st.session_state.processed_results:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Ensure the deployment name is in the cost tracker
                    if AZURE_OPENAI_DEPLOYMENT and AZURE_OPENAI_DEPLOYMENT not in cost_tracker.get_available_models():
                        model_type = cost_tracker.guess_model_type(AZURE_OPENAI_DEPLOYMENT)
                        cost_tracker.add_deployment_pricing(AZURE_OPENAI_DEPLOYMENT, model_type)
                    # Use the new processing function
                    from processing.document_processor import process_document_with_redaction
                    # Attach an in-memory log handler to capture logs for this file
                    log_handler, log_buffer = get_log_handler()
                    root_logger = logging.getLogger()
                    root_logger.addHandler(log_handler)
                    try:
                        # Process the document using the new function
                        processing_result = process_document_with_redaction(
                            file_path=temp_path,
                            endpoint=AZURE_OPENAI_ENDPOINT,
                            api_key=AZURE_OPENAI_KEY,
                            api_version=AZURE_OPENAI_VERSION,
                            deployment=AZURE_OPENAI_DEPLOYMENT,
                        )
                        # Save results in session state (maintaining compatibility with existing UI)
                        st.session_state.processed_results[selected_file] = {
                            "structured_json": processing_result.original_document_json,
                            "redacted_md": processing_result.redacted_document_md,
                            "redacted_json": processing_result.redacted_document_json,  # Now this is actually redacted!
                            "original_markdown": processing_result.original_document_md,
                            "processing_result": processing_result  # Store the new result
                        }
                    finally:
                        # Remove handler and stop capturing logs
                        root_logger.removeHandler(log_handler)
                        # Combine log records into a single text
                        log_text = "\n".join(log_buffer)
                        st.session_state.logs[selected_file] = log_text
                st.session_state.show_original = False
                st.session_state.show_processed = True
        with col3:
            if st.button("🔄 Switch View"):
                # Toggle between views
                if st.session_state.get("show_original", False):
                    st.session_state.show_original = False
                    st.session_state.show_processed = True
                else:
                    st.session_state.show_original = True
                    st.session_state.show_processed = False
        with col4:
            if st.button("📄 Show Original JSON", type="secondary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                    # Store the original YAML for comparison
                    st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
                # Display the original JSON structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
                st.session_state.show_json = True
                st.session_state.show_yaml = False
        with col5:
            if st.button("📄 Show Original YAML", type="secondary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                    # Store the original YAML for comparison
                    st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
                # Display the original YAML structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
                st.session_state.show_json = False
                st.session_state.show_yaml = True
        # Show current view status
        if st.session_state.get("show_original", False):
            st.info("📄 Currently viewing: **Original Document Structure**")
        elif st.session_state.get("show_processed", False):
            st.success("🔒 Currently viewing: **Processed Document with Redaction**")
        else:
            st.info("ℹ️ Select an action above to view document content")

        # Display results based on button clicked
        if st.session_state.get("show_original", False):
            st.markdown("---")
            # Determine what to show based on button clicked
            show_json = st.session_state.get("show_json", False)
            show_yaml = st.session_state.get("show_yaml", False)
            if show_json:
                st.subheader(f"Original Document Structure (JSON) - {selected_file}")
            elif show_yaml:
                st.subheader(f"Original Document Structure (YAML) - {selected_file}")
            else:
                st.subheader(f"Original Document Structure (Markdown) - {selected_file}")
            # Get the original structure
            original_json = st.session_state.original_structures[selected_file]
            original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
            original_yaml = st.session_state.original_structures.get(f"{selected_file}_yaml", "")
            # Display PDF viewer and content side by side
            col1, col2 = st.columns([1, 1])
            with col1:
                st.subheader("📄 Original PDF")
                # Reset file pointer to beginning
                uploaded_file.seek(0)
                # Display PDF using base64 encoding for inline display
                import base64
                pdf_bytes = uploaded_file.getvalue()
                b64_pdf = base64.b64encode(pdf_bytes).decode()
                pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
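                # Note: some browsers restrict data: URIs in iframes, and very
                # large PDFs can exceed practical URL-size limits; if the
                # preview stays blank, offering the bytes through
                # st.download_button is a reasonable fallback.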
            with col2:
                if show_json:
                    st.subheader("📋 Original Document (JSON)")
                    st.caption("Docling-generated JSON structure from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original JSON content",
                        value=json.dumps(original_json, indent=2, ensure_ascii=False),
                        height=600,
                        key="original_json_display",
                        label_visibility="collapsed"
                    )
                elif show_yaml:
                    st.subheader("📋 Original Document (YAML)")
                    st.caption("Docling-generated YAML structure from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original YAML content",
                        value=original_yaml,
                        height=600,
                        key="original_yaml_display",
                        label_visibility="collapsed"
                    )
                else:
                    st.subheader("📋 Original Document (Markdown)")
                    st.caption("Docling-generated markdown from the PDF")
                    # Use a text area for better readability and scrolling
                    st.text_area(
                        label="Original markdown content",
                        value=original_markdown,
                        height=600,
                        key="original_markdown_display",
                        label_visibility="collapsed"
                    )
            # Add download buttons for the original content
            st.markdown("---")
            col1, col2, col3 = st.columns(3)
            with col1:
                if show_json:
                    st.download_button(
                        label="📥 Download Original JSON",
                        data=json.dumps(original_json, indent=2, ensure_ascii=False),
                        file_name=f"{selected_file}_original.json",
                        mime="application/json"
                    )
                elif show_yaml:
                    st.download_button(
                        label="📥 Download Original YAML",
                        data=original_yaml,
                        file_name=f"{selected_file}_original.yaml",
                        mime="text/yaml"
                    )
                else:
                    st.download_button(
                        label="📥 Download Original Markdown",
                        data=original_markdown,
                        file_name=f"{selected_file}_original.md",
                        mime="text/markdown"
                    )
            with col2:
                if show_json or show_yaml:
                    st.subheader("📊 Document Structure")
                    st.json(original_json)
                else:
                    st.subheader("📊 JSON Structure")
                    st.json(original_json)
            with col3:
                if show_json or show_yaml:
                    # Show format information
                    st.subheader("📋 Format Info")
                    if show_json:
                        st.info("**JSON Format**: Structured data representation with key-value pairs")
                        st.write("**Use case**: API integration, data processing, programmatic access")
                    elif show_yaml:
                        st.info("**YAML Format**: Human-readable data serialization")
                        st.write("**Use case**: Configuration files, documentation, easy reading")
                else:
                    st.subheader("📋 Markdown Info")
                    st.info("**Markdown Format**: Formatted text with headers, lists, and styling")
                    st.write("**Use case**: Documentation, readable output, web display")
        elif st.session_state.get("show_processed", False):
            st.markdown("---")
            st.subheader(f"Processed Document - {selected_file}")
            # Retrieve stored results
            data = st.session_state.processed_results[selected_file]
            structured_json = data["structured_json"]
            redacted_md = data["redacted_md"]
            redacted_json = data["redacted_json"]
            original_md = data["original_markdown"]
            # Show processing summary
            original_texts = structured_json.get("texts", [])
            redacted_texts = redacted_json.get("texts", [])
            removed_count = len(original_texts) - len(redacted_texts)
            if removed_count > 0:
                st.success(f"✅ Successfully removed {removed_count} text elements containing medication information")
            else:
                st.info("ℹ️ No medication sections were identified for removal")
            # Create tabs for different views
            tab1, tab2, tab3 = st.tabs(["📄 Side-by-Side Comparison", "🔍 JSON Structure", "📊 Processing Details"])
            with tab1:
                st.subheader("Original vs Redacted Content")
                st.caption("Compare the original document content with the redacted version")
                # Get the actual removed indices from the processing result
                actual_removed_indices = []
                if "processing_result" in st.session_state.processed_results[selected_file]:
                    processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                    actual_removed_indices = processing_result.removed_indices
                # Create a more intelligent side-by-side comparison based on JSON structure
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📋 Original Document**")
                    # Display original content with removed sections highlighted
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        # Check if this element was removed
                        is_removed = i in actual_removed_indices
                        if is_removed:
                            # Highlight removed content in red
                            st.markdown(f"""
                            <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px;">
                            <strong>Text {i} ({label}) - REMOVED:</strong><br>
                            {text_content}
                            </div>
                            """, unsafe_allow_html=True)
                        else:
                            # Show normal content
                            content_preview = text_content[:150] + "..." if len(text_content) > 150 else text_content
                            st.markdown(f"""
                            <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
                            <strong>Text {i} ({label}) - {len(text_content)} chars:</strong><br>
                            <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
                            </div>
                            """, unsafe_allow_html=True)
                with col2:
                    st.markdown("**🔒 Redacted Document**")
                    # Display redacted content (only non-removed elements)
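                    # redacted_texts holds only the surviving elements, so a
                    # separate cursor (redacted_index) walks it in lockstep
                    # with the original list, advancing only on preserved items.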
                    redacted_index = 0
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        # Check if this element was removed
                        is_removed = i in actual_removed_indices
                        if is_removed:
                            # Show placeholder for removed content
                            st.markdown(f"""
                            <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px; font-style: italic; opacity: 0.7;">
                            <strong>Text {i} ({label}) - REMOVED</strong><br>
                            [Content removed by redaction]
                            </div>
                            """, unsafe_allow_html=True)
                        else:
                            # Show the actual content from redacted texts
                            if redacted_index < len(redacted_texts):
                                redacted_content = redacted_texts[redacted_index].get("text", "")
                                content_preview = redacted_content[:150] + "..." if len(redacted_content) > 150 else redacted_content
                                st.markdown(f"""
                                <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
                                <strong>Text {i} ({label}) - {len(redacted_content)} chars:</strong><br>
                                <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
                                </div>
                                """, unsafe_allow_html=True)
                                redacted_index += 1
                            else:
                                st.markdown(f"""
                                <div style="padding: 4px; margin: 2px 0; border-radius: 4px; background-color: #f5f5f5;">
                                <strong>Text {i} ({label}):</strong><br>
                                [Content preserved]
                                </div>
                                """, unsafe_allow_html=True)
                # Add legend
                st.markdown("---")
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**🎨 Comparison Legend:**")
                    st.markdown("🔴 **Red background** = Removed content")
                    st.markdown("⚪ **White background** = Preserved content")
                    st.markdown("📝 **Italic text** = Placeholder for removed content")
                with col2:
                    st.markdown("**💡 How to read:**")
                    st.markdown("Left panel shows original with removed sections highlighted")
                    st.markdown("Right panel shows redacted version with placeholders")
                    st.markdown("Compare corresponding text indices to see changes")
                # Add debug information to help identify missing content
                with st.expander("🔍 Debug: Content Analysis"):
                    st.write("**Searching for table content...**")
                    # Search for table-related content in original texts
                    table_elements = []
                    for i, text_elem in enumerate(original_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
                            table_elements.append({
                                "index": i,
                                "label": label,
                                "content": text_content[:200] + "..." if len(text_content) > 200 else text_content,
                                "is_removed": i in actual_removed_indices
                            })
                    if table_elements:
                        st.write(f"**Found {len(table_elements)} table-related elements:**")
                        for elem in table_elements:
                            status = "🔴 REMOVED" if elem["is_removed"] else "✅ PRESERVED"
                            st.write(f"**Text {elem['index']} ({elem['label']}) - {status}:**")
                            st.write(f"`{elem['content']}`")
                            st.write("---")
                    else:
                        st.write("**No table-related content found in original texts**")
                    # Also check redacted texts
                    st.write("**Table content in redacted texts:**")
                    table_elements_redacted = []
                    for i, text_elem in enumerate(redacted_texts):
                        text_content = text_elem.get("text", "")
                        label = text_elem.get("label", "")
                        if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
                            table_elements_redacted.append({
                                "index": i,
                                "label": label,
                                "content": text_content[:200] + "..." if len(text_content) > 200 else text_content
                            })
                    if table_elements_redacted:
                        st.write(f"**Found {len(table_elements_redacted)} table-related elements in redacted content:**")
                        for elem in table_elements_redacted:
                            st.write(f"**Text {elem['index']} ({elem['label']}):**")
                            st.write(f"`{elem['content']}`")
                            st.write("---")
                    else:
                        st.write("**No table-related content found in redacted texts**")
                # Add download buttons for redacted content
                st.markdown("---")
                st.subheader("📥 Download Redacted Content")
                col1, col2, col3 = st.columns(3)
                with col1:
                    # Download redacted markdown
                    st.download_button(
                        label="📄 Download Redacted Markdown",
                        data=redacted_md,
                        file_name=f"{selected_file}_redacted.md",
                        mime="text/markdown",
                        help="Download the redacted document as Markdown format"
                    )
                with col2:
                    # Generate and download redacted PDF
                    pdf_generated = False
                    pdf_bytes = None
                    if st.button("📋 Generate Redacted PDF", help="Generate a PDF version of the redacted document"):
                        with st.spinner("Generating redacted PDF..."):
                            try:
                                # Create a DocumentProcessor to access PDF generation
                                temp_path = save_uploaded_file(uploaded_file, selected_file)
                                processor = DocumentProcessor(section_extractor=None)
                                # Generate PDF path
                                base_name = os.path.splitext(selected_file)[0]
                                pdf_path = os.path.join(TEMP_DIR, f"{base_name}_redacted.pdf")
                                # Generate the PDF
                                success = processor.generate_redacted_pdf(redacted_json, pdf_path)
                                if success:
                                    # Read the generated PDF and store for download
                                    with open(pdf_path, "rb") as pdf_file:
                                        pdf_bytes = pdf_file.read()
                                    pdf_generated = True
                                    st.success("✅ PDF generated successfully!")
                                else:
                                    st.error("❌ Failed to generate PDF. Check logs for details.")
                            except Exception as e:
                                st.error(f"❌ Error generating PDF: {e}")
                                st.info("💡 Make sure reportlab is installed: `pip install reportlab`")
                    # Show download button if PDF was generated
                    if pdf_generated and pdf_bytes:
                        st.download_button(
                            label="📥 Download Redacted PDF",
                            data=pdf_bytes,
                            file_name=f"{os.path.splitext(selected_file)[0]}_redacted.pdf",
                            mime="application/pdf",
                            help="Download the redacted document as PDF"
                        )
                        # Show debug information about what's in the PDF
                        with st.expander("🔍 Debug: PDF Content Analysis"):
                            st.write("**Content that will be included in the PDF:**")
                            texts_in_pdf = redacted_json.get("texts", [])
                            st.write(f"Total text elements: {len(texts_in_pdf)}")
                            for i, text_elem in enumerate(texts_in_pdf):
                                text_content = text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")
                                label = text_elem.get("label", "")
                                st.write(f"**Text {i} ({label}):** {text_content}")
                    elif not pdf_generated:
                        st.info("💡 Click 'Generate Redacted PDF' to create a PDF version")
                with col3:
                    # Download redacted JSON structure
                    st.download_button(
                        label="🔧 Download Redacted JSON",
                        data=json.dumps(redacted_json, indent=2, ensure_ascii=False),
                        file_name=f"{selected_file}_redacted.json",
                        mime="application/json",
                        help="Download the redacted document structure as JSON"
                    )
            with tab2:
                st.subheader("Document Structure Analysis")
                # Show JSON structure comparison
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📊 Original Structure (JSON)**")
                    st.json(structured_json)
                with col2:
                    st.markdown("**🔒 Redacted Structure (JSON)**")
                    st.json(redacted_json)
            with tab3:
                st.subheader("Processing Details")
                # Show cost analysis for this processing session
                st.subheader("💰 Cost Analysis")
                # Get cost data from the processing result
                if "processing_result" in st.session_state.processed_results[selected_file]:
                    processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Total Cost", f"${processing_result.cost:.4f}")
                    with col2:
                        st.metric("Input Tokens", f"{processing_result.input_tokens:,}")
                    with col3:
                        st.metric("Output Tokens", f"{processing_result.output_tokens:,}")
                    # Add download button for cost report
                    cost_report = {
                        "timestamp": datetime.now().isoformat(),
                        "total_cost": processing_result.cost,
                        "input_tokens": processing_result.input_tokens,
                        "output_tokens": processing_result.output_tokens,
                        "total_tokens": processing_result.input_tokens + processing_result.output_tokens,
                        "document_processed": selected_file,
                        "model_used": AZURE_OPENAI_DEPLOYMENT
                    }
                    st.download_button(
                        label="📥 Download Cost Report (JSON)",
                        data=json.dumps(cost_report, indent=2),
                        file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                        mime="application/json"
                    )
                    # Show model information
                    model_info = cost_tracker.get_model_info(AZURE_OPENAI_DEPLOYMENT)
                    if model_info:
                        st.subheader("Model Information")
                        st.write(f"**Model:** {model_info.description}")
                        st.write(f"**Input cost:** ${model_info.input_cost_per_1k_tokens:.4f}/1K tokens")
                        st.write(f"**Output cost:** ${model_info.output_cost_per_1k_tokens:.4f}/1K tokens")
                        # Calculate cost breakdown
                        input_cost = (processing_result.input_tokens / 1000) * model_info.input_cost_per_1k_tokens
                        output_cost = (processing_result.output_tokens / 1000) * model_info.output_cost_per_1k_tokens
                        st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
                else:
                    # Fallback to old cost summary method
                    cost_summary = cost_tracker.get_session_summary()
                    if cost_summary["usage_count"] > 0:
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Total Cost", f"${cost_summary['total_cost']:.4f}")
                        with col2:
                            st.metric("Total Tokens", f"{cost_summary['total_tokens']:,}")
                        with col3:
                            st.metric("API Calls", cost_summary["usage_count"])
                        # Add download button for cost report
                        cost_report = {
                            "timestamp": datetime.now().isoformat(),
                            "total_cost": cost_summary["total_cost"],
                            "total_tokens": cost_summary["total_tokens"],
                            "api_calls": cost_summary["usage_count"],
                            "model_breakdown": cost_summary["model_breakdown"],
                            "document_processed": selected_file
                        }
                        st.download_button(
                            label="📥 Download Cost Report (JSON)",
                            data=json.dumps(cost_report, indent=2),
                            file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                            mime="application/json"
                        )
                        # Show detailed model breakdown
                        if cost_summary["model_breakdown"]:
                            st.subheader("Model Usage Breakdown")
                            for model, stats in cost_summary["model_breakdown"].items():
                                model_info = cost_tracker.get_model_info(model)
                                model_display_name = model_info.description if model_info else model
                                with st.expander(f"{model_display_name} - ${stats['cost']:.4f}"):
                                    col1, col2 = st.columns(2)
                                    with col1:
                                        st.write(f"**Input tokens:** {stats['input_tokens']:,}")
                                        st.write(f"**Output tokens:** {stats['output_tokens']:,}")
                                    with col2:
                                        st.write(f"**Total tokens:** {stats['total_tokens']:,}")
                                        st.write(f"**API calls:** {stats['usage_count']}")
                                    # Show cost breakdown
                                    if model_info:
                                        input_cost = (stats['input_tokens'] / 1000) * model_info.input_cost_per_1k_tokens
                                        output_cost = (stats['output_tokens'] / 1000) * model_info.output_cost_per_1k_tokens
                                        st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
                    else:
                        st.info("No API calls recorded for this session")
                # Show what was removed
                if removed_count > 0:
                    st.info(f"**Removed {removed_count} text elements from the document structure.**")
                    # Show the removed text elements - use the actual indices from the processing result
                    st.subheader("Removed Text Elements:")
                    # Get the actual indices that were removed from the processing result
                    if "processing_result" in st.session_state.processed_results[selected_file]:
                        # Get the actual removed indices from the LLM response
                        processing_result = st.session_state.processed_results[selected_file]["processing_result"]
                        actual_removed_indices = processing_result.removed_indices
                        if actual_removed_indices:
                            st.info(f"**Elements removed by LLM analysis ({len(actual_removed_indices)} elements):**")
                            for idx in actual_removed_indices:
                                if idx < len(original_texts):
                                    text_content = original_texts[idx].get("text", "")
                                    st.text(f"Text {idx}: {text_content[:100]}{'...' if len(text_content) > 100 else ''}")
                                else:
                                    st.text(f"Text {idx}: [Index out of bounds]")
                        else:
                            st.info("**No elements were identified for removal by the LLM.**")
                    else:
                        # Fallback to the old method if processing result not available
                        st.warning("**Note: Using fallback calculation method**")
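                        # Note: this index-aligned comparison drifts once any
                        # element has been removed, so it can over-report
                        # differences; it is only a rough fallback when the
                        # precise removed_indices are unavailable.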
                        removed_texts = []
                        for i, text_elem in enumerate(original_texts):
                            if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
                                removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
                        for idx, text in removed_texts:
                            st.text(f"Text {idx}: {text}")
                else:
                    st.info("No text elements were removed during processing.")
                # Show processing logs
                st.subheader("Processing Logs")
                st.text_area(
                    label="Processing logs",
                    value=st.session_state.logs.get(selected_file, ""),
                    height=300,
                    label_visibility="collapsed"
                )