"""DocAnalyzer Pro: a Streamlit app for question answering and summarization
over uploaded PDF/DOCX documents, using distilled Hugging Face models on CPU."""

import os
from pathlib import Path
from tempfile import gettempdir

# Server settings must be in the environment before Streamlit starts.
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"


def setup_environment():
    """Ensure cache directories exist in a safe, writable location."""
    cache_dir = Path(gettempdir()) / "models"
    cache_dir.mkdir(exist_ok=True, parents=True)
    # Point the Hugging Face cache at the writable directory (setdefault so an
    # existing configuration wins). This runs before transformers is imported
    # so the setting takes effect.
    os.environ.setdefault("HF_HOME", str(cache_dir))
    return cache_dir


cache_dir = setup_environment()

import streamlit as st

# set_page_config must be the first Streamlit command in the script.
st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")

from transformers import pipeline
from PyPDF2 import PdfReader
import docx
import time
import psutil
import torch

@st.cache_resource(ttl=3600)
def load_models():
    """Load optimized models for Hugging Face Spaces."""
    try:
        with st.spinner("🔄 Loading AI models (this may take 1-2 minutes)..."):
            return {
                'qa': pipeline(
                    "question-answering",
                    model="distilbert-base-cased-distilled-squad",
                    device=-1  # -1 = run on CPU
                ),
                'summarizer': pipeline(
                    "summarization",
                    model="sshleifer/distilbart-cnn-12-6",
                    device=-1
                )
            }
    except Exception as e:
        st.error(f"❌ Failed to load models: {e}")
        st.stop()


models = load_models()

def extract_text(file):
    """Extract text from PDF/DOCX files with error handling."""
    if file is None:
        return ""

    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            return " ".join(page.extract_text() for page in reader.pages if page.extract_text())
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
        # Unsupported file type: return empty text rather than falling
        # through and implicitly returning None.
        return ""
    except Exception as e:
        st.error(f"⚠️ Error processing document: {e}")
        return ""

def generate_summary(text, max_length=150):
    """Generate summary with chunking for large documents."""
    if not text or len(text.strip()) == 0:
        return ""

    try:
        if len(text) > 10000:
            # Split long documents into fixed-size character chunks,
            # summarize each chunk, and join the partial summaries.
            chunk_size = 3000
            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    # Budget the requested length across chunks, with a floor.
                    max_length=max(max_length // len(chunks), 30),
                    min_length=30,
                    do_sample=False
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length)[0]['summary_text']
    except Exception as e:
        st.error(f"❌ Summarization failed: {e}")
        return ""

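# Worked example of the chunk budget above: a 12,000-character document is
# split into 12000 / 3000 = 4 chunks; with the default max_length=150, each
# chunk's summary is capped at max(150 // 4, 30) = 37 tokens, and the four
# partial summaries are joined into one string.
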
st.title("📄 DocAnalyzer Pro")

with st.expander("📤 Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)
    context = extract_text(uploaded_file) if uploaded_file else manual_text

tab1, tab2 = st.tabs(["🔍 Question Answering", "📝 Summarization"])

with tab1:
    if context and len(context.strip()) > 0:
        question = st.text_input("Ask about the document:")
        if question and len(question.strip()) > 0:
            with st.spinner("Analyzing..."):
                start_time = time.time()
                try:
                    result = models['qa'](
                        question=question,
                        context=context[:100000]  # cap context size to bound latency
                    )
                    st.success(f"Answered in {time.time() - start_time:.1f}s")
                    st.markdown(f"**Answer:** {result['answer']}")
                    st.progress(result['score'])
                    st.caption(f"Confidence: {result['score']:.0%}")
                except Exception as e:
                    st.error(f"❌ Question answering failed: {e}")

with tab2:
    if context and len(context.strip()) > 0:
        with st.form("summary_form"):
            length = st.slider("Summary Length", 50, 300, 150)
            if st.form_submit_button("Generate Summary"):
                with st.spinner("Summarizing..."):
                    start_time = time.time()
                    summary = generate_summary(context, length)
                    if summary:
                        st.success(f"Generated in {time.time() - start_time:.1f}s")
                        st.markdown(f"**Summary:**\n\n{summary}")

with st.expander("⚙️ System Status"):
    try:
        device_status = 'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'
    except Exception:
        device_status = 'CPU (torch not configured)'

    st.code(f"""
Models loaded: {', '.join(models.keys())}
Device: {device_status}
Memory: {psutil.virtual_memory().percent}% used
CPU: {psutil.cpu_percent()}% used
Cache location: {cache_dir}
""")
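
# ---------------------------------------------------------------------------
# Deployment sketch: the dependency list below is inferred from the imports
# above; names are assumptions and versions are not pinned by this script.
#
#   requirements.txt
#     streamlit
#     transformers
#     torch
#     PyPDF2
#     python-docx   # provides the `docx` module imported above
#     psutil
#
# Run locally with, e.g.:
#   streamlit run app.py   # substitute this file's actual name
# ---------------------------------------------------------------------------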