RanaZaeem12 committed on
Commit
3c64a05
·
verified ·
1 Parent(s): 3672982

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ import docx
4
+ from pptx import Presentation
5
+ from transformers import pipeline
6
+ import os
7
+
8
st.title("Multi-Document Q&A App 📄💬")


@st.cache_resource
def _load_qa_pipeline():
    """Build the extractive question-answering pipeline once per server process.

    Streamlit re-executes the whole script on every widget interaction, so an
    uncached ``pipeline(...)`` call would reload the model each time the user
    uploads a file or types a question. ``st.cache_resource`` keeps a single
    shared instance alive across reruns and sessions.
    """
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


# Same module-level name as before, so the rest of the script is unaffected.
qa_pipeline = _load_qa_pipeline()

# With accept_multiple_files=True the uploader yields a list of uploaded files.
uploaded_files = st.file_uploader("Upload PDF, Word, or PPT files", type=["pdf", "docx", "pptx"], accept_multiple_files=True)

# Accumulator for the text extracted from every uploaded document.
all_text = ""
15
+
16
def extract_text_from_pdf(file):
    """Return the text of a PDF, with pages separated by newlines.

    ``file`` is handed straight to ``PdfReader``. A page whose
    ``extract_text()`` result is falsy contributes an empty string, so the
    join never sees ``None``.
    """
    page_texts = []
    for page in PdfReader(file).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
19
+
20
def extract_text_from_docx(file):
    """Return the text of a Word document, one paragraph per line.

    ``file`` is handed straight to ``docx.Document``; only the entries of
    ``document.paragraphs`` are read.
    """
    document = docx.Document(file)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)
23
+
24
def extract_text_from_pptx(file):
    """Return the text of a PowerPoint deck, one shape's text per line.

    Slides are visited in order, and within each slide every shape that
    exposes a ``text`` attribute contributes its text; other shapes are
    skipped.
    """
    presentation = Presentation(file)
    return "\n".join(
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
32
+
33
# Map each accepted extension to the extractor that understands it.
_extractors = {
    "pdf": extract_text_from_pdf,
    "docx": extract_text_from_docx,
    "pptx": extract_text_from_pptx,
}

# Collect each document's text in a list and join once, rather than growing
# a string with repeated `+=`. `or []` tolerates an uploader that returns
# None when nothing has been selected.
_extracted_parts = []
for file in uploaded_files or []:
    file_type = file.name.split('.')[-1].lower()
    _extract = _extractors.get(file_type)
    if _extract is not None:
        _extracted_parts.append(_extract(file) + "\n")
all_text += "".join(_extracted_parts)

# Every processed file appends at least "\n", so a plain truthiness check
# would pass even when no document yielded any text (e.g. scanned PDFs).
# Require real, non-whitespace content before offering the question box.
if all_text.strip():
    st.success("Files processed. You can now ask questions.")
    question = st.text_input("Ask a question based on your uploaded files:")

    if question:
        # The pipeline returns a dict; only its 'answer' entry is displayed.
        result = qa_pipeline(question=question, context=all_text)
        st.write("**Answer:**", result['answer'])
elif uploaded_files:
    # Files were uploaded but nothing readable came out of them.
    st.warning("No readable text could be extracted from the uploaded files.")