# Captured from a Hugging Face Space page (status shown at capture: "Sleeping").
import docx
import streamlit as st
from pptx import Presentation
from PyPDF2 import PdfReader
from transformers import pipeline
# Title of the app
st.title("π Multi-Document Q&A App")


@st.cache_resource
def _load_qa_pipeline():
    """Build the extractive question-answering pipeline once and reuse it.

    Streamlit re-executes this script on every widget interaction (each
    upload, each submitted question). Without caching, the model would be
    re-instantiated on every one of those reruns.

    Returns:
        The Hugging Face "question-answering" pipeline backed by
        ``distilbert-base-uncased-distilled-squad``.
    """
    return pipeline(
        "question-answering",
        model="distilbert-base-uncased-distilled-squad",
    )


# Load question-answering pipeline from Hugging Face (cached across reruns)
qa_pipeline = _load_qa_pipeline()

# File uploader for multiple file types
uploaded_files = st.file_uploader(
    "Upload PDF, Word (.docx), or PPT (.pptx) files",
    type=["pdf", "docx", "pptx"],
    accept_multiple_files=True,
)

# Combine text from all files; the extraction loop below appends to this.
all_text = ""
# File processing functions
def extract_text_from_pdf(file):
    """Return the text of a PDF with pages separated by newlines.

    Args:
        file: Anything ``PdfReader`` accepts (here, a Streamlit upload).

    Returns:
        One string containing each page's extracted text, joined by "\\n".
    """
    pdf = PdfReader(file)
    page_texts = []
    for page in pdf.pages:
        # A falsy extraction result (e.g. None) is replaced by "" so the
        # page still contributes an (empty) entry to the joined output.
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
def extract_text_from_docx(file):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file: Anything ``docx.Document`` accepts (here, a Streamlit upload).

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, joined by "\\n".
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def extract_text_from_pptx(file):
    """Return the text of every text-bearing shape in a presentation.

    Args:
        file: Anything ``Presentation`` accepts (here, a Streamlit upload).

    Returns:
        The ``text`` of each shape that exposes that attribute, in slide
        order and then shape order, joined by "\\n".
    """
    deck = Presentation(file)
    fragments = []
    for slide in deck.slides:
        # Shapes without a ``text`` attribute are skipped.
        fragments.extend(
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )
    return "\n".join(fragments)
# Extract text from uploaded files
# Map each supported (lower-cased) file extension to its extractor.
extractors_by_extension = {
    "pdf": extract_text_from_pdf,
    "docx": extract_text_from_docx,
    "pptx": extract_text_from_pptx,
}
extracted_chunks = []
for uploaded in uploaded_files:
    # The extension is whatever follows the last "." in the file name.
    extension = uploaded.name.rsplit(".", 1)[-1].lower()
    extractor = extractors_by_extension.get(extension)
    if extractor is not None:
        # Each file's text is terminated by a newline.
        extracted_chunks.append(extractor(uploaded) + "\n")
all_text += "".join(extracted_chunks)
# Show input for question if files were processed
# Each processed file appends at least "\n" to all_text, so plain truthiness
# would report success even when no text could be extracted; check for
# non-whitespace content instead.
if all_text.strip():
    st.success("β Files processed. Ask your question below.")
    question = st.text_input("β Ask a question:")
    # Ignore empty or whitespace-only input rather than querying the model.
    if question.strip():
        # The pipeline result is indexed by 'answer' for the extracted span.
        result = qa_pipeline(question=question, context=all_text)
        st.write("π **Answer:**", result['answer'])
elif uploaded_files:
    # Files were uploaded, but none of them yielded any usable text.
    st.warning("No readable text was found in the uploaded files.")
else:
    st.info("Upload some files to begin...")