import os
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
# Must be first Streamlit command
import streamlit as st
st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")
# Import all required libraries
from transformers import pipeline
from PyPDF2 import PdfReader
import docx
import time
import psutil
from pathlib import Path
import torch
from tempfile import gettempdir  # 🔍 New import
# ======================
# CACHE SETUP
# ======================
def setup_environment():
"""Ensure cache directories exist in a safe location"""
    cache_dir = Path(gettempdir()) / "models"  # 🔍 Changed from /app/models
cache_dir.mkdir(exist_ok=True, parents=True)
return cache_dir
cache_dir = setup_environment()
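
# Note (sketch): cache_dir above is only referenced again in the status panel
# at the bottom; nothing points transformers at it. To actually cache model
# downloads there, the HF_HOME environment variable must be set before
# `transformers` is imported, e.g. at the very top of the file next to the
# Streamlit variables:
#
#   os.environ["HF_HOME"] = str(Path(gettempdir()) / "models")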
# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)
def load_models():
"""Load optimized models for Hugging Face Spaces"""
try:
        with st.spinner("🔄 Loading AI models (this may take 1-2 minutes)..."):
return {
'qa': pipeline(
"question-answering",
model="distilbert-base-cased-distilled-squad",
device=-1
),
'summarizer': pipeline(
"summarization",
model="sshleifer/distilbart-cnn-12-6",
device=-1
)
}
except Exception as e:
        st.error(f"❌ Failed to load models: {str(e)}")
st.stop()
models = load_models()
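
# Return-shape sketch (not executed; the inputs are made up): the QA pipeline
# yields a dict with 'answer', 'score', 'start' and 'end'; the summarizer
# yields a list of dicts keyed by 'summary_text':
#
#   out = models['qa'](question="Who wrote the report?",
#                      context="The report was written by Dana in 2021.")
#   out['answer'], out['score']   # something like ("Dana", 0.97)
#
#   models['summarizer']("Some long passage ...", max_length=60)[0]['summary_text']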
# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file):
    """Extract text from PDF/DOCX files with error handling"""
    if file is None:
        return ""
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            # extract each page once and skip pages with no extractable text
            return " ".join(
                text for page in reader.pages if (text := page.extract_text())
            )
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
    except Exception as e:
        st.error(f"⚠️ Error processing document: {str(e)}")
    return ""  # unsupported type, empty document, or failed extraction
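
# Usage sketch: Streamlit's UploadedFile exposes the browser-reported MIME
# type as `.type`, which is what the branches above match on; PdfReader and
# docx.Document both accept the file-like object directly:
#
#   context = extract_text(uploaded_file)   # "" on None/unsupported/error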
# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Generate summary with chunking for large documents"""
    if not text or not text.strip():
        return ""
    try:
        # distilbart accepts roughly 1024 tokens; ~3000 characters keeps each
        # input safely under that limit instead of being silently truncated
        chunk_size = 3000
        if len(text) > chunk_size:
            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    # spread the requested length across chunks, but never
                    # drop below the 30-token minimum
                    max_length=max(max_length // len(chunks), 30),
                    min_length=30,
                    do_sample=False
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length, do_sample=False)[0]['summary_text']
    except Exception as e:
        st.error(f"❌ Summarization failed: {str(e)}")
        return ""
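
# Alternative chunking sketch (assumption, not used by the app): fixed
# character offsets can cut a sentence mid-word; splitting on sentence
# boundaries keeps each chunk coherent. `split_sentences` is a hypothetical
# helper, shown for illustration only:
#
#   import re
#
#   def split_sentences(text, max_chars=3000):
#       chunks, current = [], ""
#       for sent in re.split(r"(?<=[.!?])\s+", text):
#           if current and len(current) + len(sent) > max_chars:
#               chunks.append(current.strip())
#               current = ""
#           current += sent + " "
#       if current.strip():
#           chunks.append(current.strip())
#       return chunks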
# ======================
# STREAMLIT UI
# ======================
st.title("📄 DocAnalyzer Pro")
# File Upload Section
with st.expander("📤 Upload Document", expanded=True):
uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
manual_text = st.text_area("Or paste text here:", height=150)
context = extract_text(uploaded_file) if uploaded_file else manual_text
# Main Features
tab1, tab2 = st.tabs(["🔍 Question Answering", "📝 Summarization"])
with tab1:
    if context.strip():
        question = st.text_input("Ask about the document:")
        if question.strip():
with st.spinner("Analyzing..."):
start_time = time.time()
try:
result = models['qa'](
question=question,
context=context[:100000]
)
st.success(f"Answered in {time.time()-start_time:.1f}s")
st.markdown(f"**Answer:** {result['answer']}")
st.progress(result['score'])
st.caption(f"Confidence: {result['score']:.0%}")
except Exception as e:
                    st.error(f"❌ Question answering failed: {str(e)}")
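
# Sketch (assumption): extractive QA always returns *some* span, even for
# unanswerable questions. The pipeline accepts handle_impossible_answer to
# surface low-confidence "no answer" cases; a variant of the call above:
#
#   result = models['qa'](question=question,
#                         context=context[:100000],
#                         handle_impossible_answer=True)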
with tab2:
    if context.strip():
with st.form("summary_form"):
length = st.slider("Summary Length", 50, 300, 150)
if st.form_submit_button("Generate Summary"):
with st.spinner("Summarizing..."):
start_time = time.time()
summary = generate_summary(context, length)
if summary:
st.success(f"Generated in {time.time()-start_time:.1f}s")
st.markdown(f"**Summary:**\n\n{summary}")
# System Info
with st.expander("⚙️ System Status"):
    try:
        device_status = 'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'
    except Exception:
        device_status = 'CPU (torch not configured)'
st.code(f"""
Models loaded: {', '.join(models.keys())}
Device: {device_status}
Memory: {psutil.virtual_memory().percent}% used
CPU: {psutil.cpu_percent()}% used
Cache location: {cache_dir}
""")