# (stripped extraction artifacts: file-size header, git-blame hash column, line-number gutter)
import os
# Configure Streamlit's server BEFORE importing streamlit: these env vars are
# read at import time (port 8501 is the Hugging Face Spaces convention).
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
# Must be first Streamlit command
import streamlit as st
st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")
# Import all required libraries
from transformers import pipeline        # HF pipelines for QA + summarization
from PyPDF2 import PdfReader             # PDF text extraction
import docx                              # python-docx, for .docx extraction
import time                              # wall-clock timing shown in the UI
import psutil                            # memory/CPU stats for the status panel
from pathlib import Path
import torch                             # only used for a CUDA availability probe
from tempfile import gettempdir          # safe writable location for the model cache
# ======================
# CACHE SETUP
# ======================
def setup_environment():
    """Create (if missing) and return a writable model-cache directory.

    Lives under the system temp dir rather than a hard-coded /app path so it
    works in sandboxed hosts such as Hugging Face Spaces.
    """
    model_cache = Path(gettempdir()) / "models"
    model_cache.mkdir(parents=True, exist_ok=True)
    return model_cache
cache_dir = setup_environment()
# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)
def load_models():
    """Load the QA and summarization pipelines, cached for one hour.

    Both models run on CPU (device=-1), sized for free-tier hosting. On any
    load failure the error is surfaced in the UI and the script run is halted.
    """
    try:
        with st.spinner("π Loading AI models (this may take 1-2 minutes)..."):
            qa_pipe = pipeline(
                "question-answering",
                model="distilbert-base-cased-distilled-squad",
                device=-1,
            )
            sum_pipe = pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-12-6",
                device=-1,
            )
            return {'qa': qa_pipe, 'summarizer': sum_pipe}
    except Exception as e:
        st.error(f"β Failed to load models: {str(e)}")
        st.stop()
models = load_models()
# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: a Streamlit UploadedFile (has .type, behaves like a binary
              stream), or None when nothing was uploaded.

    Returns:
        The extracted text, or "" for None input, unsupported MIME types,
        or any parse failure (the failure is also shown via st.error).
    """
    if file is None:
        return ""
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            # Call extract_text() only once per page — the original called it
            # twice (filter + join), re-parsing every page's content stream.
            page_texts = (page.extract_text() for page in reader.pages)
            return " ".join(text for text in page_texts if text)
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
        # Unsupported MIME type: fall through to the common "" return below.
        # (Previously this path implicitly returned None, which broke
        # len(context)/context.strip() in the callers.)
    except Exception as e:
        st.error(f"β οΈ Error processing document: {str(e)}")
    return ""
# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Summarize *text* with the cached summarizer pipeline.

    Inputs longer than 10,000 characters are split into 3,000-character
    chunks; each chunk is summarized separately and the partial summaries
    are concatenated. Returns "" for empty input or on any pipeline error
    (the error is also shown via st.error).
    """
    if not text or not text.strip():
        return ""
    try:
        if len(text) <= 10000:
            return models['summarizer'](text, max_length=max_length)[0]['summary_text']
        chunk_size = 3000
        chunks = [text[start:start + chunk_size]
                  for start in range(0, len(text), chunk_size)]
        # Spread the requested budget across chunks, never below 30 tokens each.
        per_chunk_len = max(max_length // len(chunks), 30)
        partials = []
        for piece in chunks:
            out = models['summarizer'](
                piece,
                max_length=per_chunk_len,
                min_length=30,
                do_sample=False,
            )
            partials.append(out[0]['summary_text'])
        return " ".join(partials)
    except Exception as e:
        st.error(f"β Summarization failed: {str(e)}")
        return ""
# ======================
# STREAMLIT UI
# ======================
st.title("π DocAnalyzer Pro")
# File Upload Section
with st.expander("π€ Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)
# An uploaded file takes precedence over pasted text when both are present.
context = extract_text(uploaded_file) if uploaded_file else manual_text
# Main Features
tab1, tab2 = st.tabs(["π Question Answering", "π Summarization"])
with tab1:
    # Only render the question box once non-empty document text is available.
    if context and len(context.strip()) > 0:
        question = st.text_input("Ask about the document:")
        if question and len(question.strip()) > 0:
            with st.spinner("Analyzing..."):
                start_time = time.time()
                try:
                    # Context is truncated to 100k chars to bound QA latency/memory.
                    result = models['qa'](
                        question=question,
                        context=context[:100000]
                    )
                    st.success(f"Answered in {time.time()-start_time:.1f}s")
                    st.markdown(f"**Answer:** {result['answer']}")
                    st.progress(result['score'])  # confidence bar; score is in [0, 1]
                    st.caption(f"Confidence: {result['score']:.0%}")
                except Exception as e:
                    st.error(f"β Question answering failed: {str(e)}")
with tab2:
    if context and len(context.strip()) > 0:
        # A form batches the slider + button so the script reruns only on submit.
        with st.form("summary_form"):
            length = st.slider("Summary Length", 50, 300, 150)
            if st.form_submit_button("Generate Summary"):
                with st.spinner("Summarizing..."):
                    start_time = time.time()
                    summary = generate_summary(context, length)
                    # generate_summary returns "" on failure; it already showed the error.
                    if summary:
                        st.success(f"Generated in {time.time()-start_time:.1f}s")
                        st.markdown(f"**Summary:**\n\n{summary}")
# System Info
with st.expander("βοΈ System Status"):
    # torch.cuda can raise on misconfigured installs; degrade gracefully.
    try:
        # NOTE(review): the original literal was split across two physical
        # lines by a garbled emoji (a syntax error); rejoined onto one line.
        device_status = 'GPU β' if torch.cuda.is_available() else 'CPU β οΈ'
    except Exception:  # was a bare except, which also swallowed SystemExit/KeyboardInterrupt
        device_status = 'CPU (torch not configured)'
    st.code(f"""
Models loaded: {', '.join(models.keys())}
Device: {device_status}
Memory: {psutil.virtual_memory().percent}% used
CPU: {psutil.cpu_percent()}% used
Cache location: {cache_dir}
""")