import os
from tempfile import gettempdir

# Server settings and the model cache location must be set before
# streamlit/transformers are imported so they take effect
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
os.environ["HF_HOME"] = os.path.join(gettempdir(), "models")  # πŸ” writable cache root (was /app/models)

# Must be first Streamlit command
import streamlit as st
st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")

# Import all required libraries
from transformers import pipeline
from PyPDF2 import PdfReader
import docx
import time
import psutil
from pathlib import Path
import torch

# ======================
# CACHE SETUP
# ======================
def setup_environment():
    """Ensure the model cache directory exists in a writable location"""
    cache_dir = Path(os.environ["HF_HOME"])  # πŸ” Changed from /app/models, which may not be writable
    cache_dir.mkdir(exist_ok=True, parents=True)
    return cache_dir

cache_dir = setup_environment()

# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)  # cache models across reruns; reload at most once per hour
def load_models():
    """Load optimized models for Hugging Face Spaces"""
    try:
        with st.spinner("πŸ”„ Loading AI models (this may take 1-2 minutes)..."):
            return {
                'qa': pipeline(
                    "question-answering",
                    model="distilbert-base-cased-distilled-squad",
                    device=-1
                ),
                'summarizer': pipeline(
                    "summarization",
                    model="sshleifer/distilbart-cnn-12-6",
                    device=-1
                )
            }
    except Exception as e:
        st.error(f"❌ Failed to load models: {str(e)}")
        st.stop()

models = load_models()

# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file):
    """Extract text from PDF/DOCX files with error handling"""
    if file is None:
        return ""
    
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            return " ".join(page.extract_text() for page in reader.pages if page.extract_text())
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
    except Exception as e:
        st.error(f"⚠️ Error processing document: {str(e)}")
    return ""  # fall-through for unsupported file types or failed extraction

# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Generate summary with chunking for large documents"""
    if not text or len(text.strip()) == 0:
        return ""
    
    try:
        if len(text) > 10000:
            chunk_size = 3000
            chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    max_length=max(max_length // len(chunks), 30),  # split the length budget across chunks
                    min_length=30,
                    do_sample=False,
                    truncation=True  # distilbart accepts ~1024 input tokens; truncate oversized chunks
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length, truncation=True)[0]['summary_text']
    except Exception as e:
        st.error(f"❌ Summarization failed: {str(e)}")
        return ""

# ======================
# STREAMLIT UI
# ======================
st.title("πŸ“„ DocAnalyzer Pro")

# File Upload Section
with st.expander("πŸ“€ Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)
    context = extract_text(uploaded_file) if uploaded_file else manual_text

# Main Features
tab1, tab2 = st.tabs(["πŸ” Question Answering", "πŸ“ Summarization"])

with tab1:
    if context and len(context.strip()) > 0:
        question = st.text_input("Ask about the document:")
        if question and len(question.strip()) > 0:
            with st.spinner("Analyzing..."):
                start_time = time.time()
                try:
                    result = models['qa'](
                        question=question,
                        context=context[:100000]  # cap characters to bound latency; the QA pipeline chunks long contexts internally
                    )
                    st.success(f"Answered in {time.time()-start_time:.1f}s")
                    st.markdown(f"**Answer:** {result['answer']}")
                    st.progress(result['score'])
                    st.caption(f"Confidence: {result['score']:.0%}")
                except Exception as e:
                    st.error(f"❌ Question answering failed: {str(e)}")

with tab2:
    if context and len(context.strip()) > 0:
        with st.form("summary_form"):
            length = st.slider("Summary Length", 50, 300, 150)
            if st.form_submit_button("Generate Summary"):
                with st.spinner("Summarizing..."):
                    start_time = time.time()
                    summary = generate_summary(context, length)
                    if summary:
                        st.success(f"Generated in {time.time()-start_time:.1f}s")
                        st.markdown(f"**Summary:**\n\n{summary}")

# System Info
with st.expander("βš™οΈ System Status"):
    try:
        device_status = 'GPU βœ…' if torch.cuda.is_available() else 'CPU ⚠️'
    except Exception:
        device_status = 'CPU (torch not configured)'
    
    st.code(
        f"Models loaded: {', '.join(models.keys())}\n"
        f"Device: {device_status}\n"
        f"Memory: {psutil.virtual_memory().percent}% used\n"
        f"CPU: {psutil.cpu_percent()}% used\n"
        f"Cache location: {cache_dir}"
    )