import os

# These environment variables must be set before streamlit is imported,
# otherwise the server has already bound its port/headless mode.
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"

# Must be first Streamlit command
import streamlit as st

st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")

# Import all required libraries
from transformers import pipeline
from PyPDF2 import PdfReader
import docx
import time
import psutil
from pathlib import Path
import torch
from tempfile import gettempdir  # 🔁 New import


# ======================
# CACHE SETUP
# ======================
def setup_environment() -> Path:
    """Ensure cache directories exist in a safe location.

    Returns:
        Path: a writable model-cache directory under the system temp
        directory. (🔁 Changed from /app/models, which is read-only on
        some hosts — TODO confirm this is the motivation.)
    """
    cache_dir = Path(gettempdir()) / "models"
    cache_dir.mkdir(exist_ok=True, parents=True)
    return cache_dir


cache_dir = setup_environment()


# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)
def load_models():
    """Load optimized models for Hugging Face Spaces.

    Returns:
        dict: {'qa': question-answering pipeline,
               'summarizer': summarization pipeline}, both pinned to
        CPU (device=-1). Result is cached for one hour via
        st.cache_resource.

    On failure an error banner is shown and st.stop() halts the script
    run, so callers never see a partial dict.
    """
    try:
        with st.spinner("🔄 Loading AI models (this may take 1-2 minutes)..."):
            return {
                'qa': pipeline(
                    "question-answering",
                    model="distilbert-base-cased-distilled-squad",
                    device=-1,
                ),
                'summarizer': pipeline(
                    "summarization",
                    model="sshleifer/distilbart-cnn-12-6",
                    device=-1,
                ),
            }
    except Exception as e:
        st.error(f"❌ Failed to load models: {str(e)}")
        st.stop()


models = load_models()


# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file) -> str:
    """Extract text from PDF/DOCX files with error handling.

    Args:
        file: a Streamlit UploadedFile (has .type MIME attribute) or None.

    Returns:
        str: extracted text; "" for None input, unsupported MIME types,
        or on a parse error (an error banner is shown in that case).
    """
    if file is None:
        return ""
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            # Bind each page's text once: PyPDF2's extract_text()
            # re-parses the page content stream, so the original
            # double-call did the extraction work twice per page.
            page_texts = (page.extract_text() for page in reader.pages)
            return " ".join(text for text in page_texts if text)
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
    except Exception as e:
        st.error(f"⚠️ Error processing document: {str(e)}")
    # Unsupported MIME type or handled error: explicit "" instead of the
    # implicit None the original fell through to.
    return ""


# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Generate summary with chunking for large documents.

    Args:
        text: document text to summarize (may be empty/None).
        max_length: target summary length in tokens; for chunked input
            the budget is split across chunks (floor of 30 per chunk).

    Returns:
        str: the summary, or "" on empty input or summarizer failure
        (an error banner is shown on failure).
    """
    if not text or not text.strip():
        return ""
    try:
        if len(text) > 10000:
            # Split long documents into fixed-size character chunks and
            # summarize each independently, then stitch the results.
            chunk_size = 3000
            chunks = [text[i:i + chunk_size]
                      for i in range(0, len(text), chunk_size)]
            # Per-chunk length budget is loop-invariant — compute once.
            per_chunk_max = max(max_length // len(chunks), 30)
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    max_length=per_chunk_max,
                    min_length=30,
                    do_sample=False,
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length)[0]['summary_text']
    except Exception as e:
        st.error(f"❌ Summarization failed: {str(e)}")
        return ""


# ======================
# STREAMLIT UI
# ======================
st.title("📄 DocAnalyzer Pro")

# File Upload Section
with st.expander("📤 Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)
    # Uploaded file wins over pasted text when both are present.
    context = extract_text(uploaded_file) if uploaded_file else manual_text

# Main Features
tab1, tab2 = st.tabs(["🔍 Question Answering", "📝 Summarization"])

with tab1:
    if context and context.strip():
        question = st.text_input("Ask about the document:")
        if question and question.strip():
            with st.spinner("Analyzing..."):
                start_time = time.time()
                try:
                    # Cap context to keep QA latency/memory bounded.
                    result = models['qa'](
                        question=question,
                        context=context[:100000],
                    )
                    st.success(f"Answered in {time.time()-start_time:.1f}s")
                    st.markdown(f"**Answer:** {result['answer']}")
                    st.progress(result['score'])
                    st.caption(f"Confidence: {result['score']:.0%}")
                except Exception as e:
                    st.error(f"❌ Question answering failed: {str(e)}")

with tab2:
    if context and context.strip():
        with st.form("summary_form"):
            length = st.slider("Summary Length", 50, 300, 150)
            if st.form_submit_button("Generate Summary"):
                with st.spinner("Summarizing..."):
                    start_time = time.time()
                    summary = generate_summary(context, length)
                    if summary:
                        st.success(f"Generated in {time.time()-start_time:.1f}s")
                        st.markdown(f"**Summary:**\n\n{summary}")

# System Info
with st.expander("⚙️ System Status"):
    try:
        device_status = 'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows SystemExit/KeyboardInterrupt.
        device_status = 'CPU (torch not configured)'
    st.code(f"""
    Models loaded: {', '.join(models.keys())}
    Device: {device_status}
    Memory: {psutil.virtual_memory().percent}% used
    CPU: {psutil.cpu_percent()}% used
    Cache location: {cache_dir}
    """)