Aroy1997 committed on
Commit
bbe81e1
·
verified ·
1 Parent(s): a106039

Create app1.py

Browse files
Files changed (1) hide show
  1. app1.py +91 -0
app1.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import gradio as gr
3
+ from transformers import pipeline
4
+ from sentence_transformers import SentenceTransformer
5
+ import faiss
6
+ import numpy as np
7
+
8
# Abstractive summarization model (BART fine-tuned on CNN/DailyMail).
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Sentence encoder used to embed both PDF chunks and user questions.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Extractive question-answering model (DistilBERT distilled on SQuAD).
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-uncased-distilled-squad",
)
16
+
17
def extract_text_from_pdf(file_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined in page order.
    """
    # Opening the document as a context manager closes the underlying file
    # handle even when text extraction raises.
    with fitz.open(file_path) as doc:
        # str.join avoids re-allocating a growing string once per page.
        return "".join(page.get_text() for page in doc)
23
+
24
def chunk_text(text, max_chunk_size=500):
    """Split text into whitespace-normalised chunks of whole words.

    Words are packed greedily into a chunk and joined with single spaces.
    A chunk is closed *before* adding a word that would push it past
    ``max_chunk_size`` characters, so the limit is respected.

    Args:
        text: The text to split; any run of whitespace separates words.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings in original word order. Empty or
        whitespace-only input yields an empty list. A single word longer
        than ``max_chunk_size`` is kept whole as its own chunk rather than
        being split.
    """
    chunks = []
    current_words = []
    current_length = 0  # always equals len(" ".join(current_words))

    for word in text.split():
        # A separating space is only needed when the chunk already has words.
        added = len(word) + (1 if current_words else 0)
        if current_words and current_length + added > max_chunk_size:
            chunks.append(" ".join(current_words))
            current_words = []
            current_length = 0
            added = len(word)
        current_words.append(word)
        current_length += added

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
39
+
40
def build_faiss_index(chunks):
    """Embed text chunks and store them in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of strings to embed with ``embedding_model``.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the float32 embedding matrix with one row per chunk.

    Raises:
        ValueError: If ``chunks`` is empty, since no embedding dimension can
            be derived from an empty batch.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # FAISS only accepts float32 matrices; make the dtype explicit.
    embeddings = np.array(embedding_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings
46
+
47
def retrieve_relevant_chunks(query, chunks, index, embeddings, top_k=3):
    """Return the chunks whose embeddings are closest to the query.

    Args:
        query: The question text to embed with ``embedding_model``.
        chunks: The chunk strings, in the same order they were indexed.
        index: FAISS index searched for nearest neighbours.
        embeddings: Unused; retained so existing callers keep working.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks ordered from most to least similar; an empty
        list when there are no chunks or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the *last* chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = np.array(embedding_model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, k)
    # Defensive filter in case the index holds fewer vectors than chunks.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
52
+
53
def summarize_pdf(file_path):
    """Summarize a PDF by summarizing word-aligned chunks of its text.

    Args:
        file_path: Path of the uploaded PDF, or a falsy value when nothing
            was uploaded.

    Returns:
        The per-chunk summaries joined with single spaces, or a short
        user-facing message when no file was given or the PDF contains no
        extractable text.
    """
    if not file_path:
        return "Please upload a PDF file."

    raw_text = extract_text_from_pdf(file_path)
    # Split on word boundaries so no word is cut in half between chunks
    # (the previous fixed-width character slicing did exactly that).
    chunks = chunk_text(raw_text, max_chunk_size=1024)
    if not chunks:
        return "No extractable text was found in this PDF."

    summaries = [
        summarizer(
            chunk,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,  # guard against inputs longer than the model limit
        )[0]["summary_text"]
        for chunk in chunks
    ]
    return " ".join(summaries).strip()
62
+
63
def answer_question(file_path, question):
    """Answer a question about a PDF using retrieved context.

    The PDF text is chunked and embedded into a FAISS index; the chunks most
    similar to the question are joined into the context handed to
    ``qa_pipeline``.

    Args:
        file_path: Path of the uploaded PDF, or a falsy value when nothing
            was uploaded.
        question: The question text entered by the user.

    Returns:
        The ``answer`` field produced by ``qa_pipeline``, or a short
        user-facing message when the file or question is missing or the PDF
        contains no extractable text.
    """
    # Validate inputs up front so the UI shows a clear message instead of a
    # stack trace from the embedding or indexing steps.
    if not file_path:
        return "Please upload a PDF file."
    if not question or not question.strip():
        return "Please enter a question."

    raw_text = extract_text_from_pdf(file_path)
    chunks = chunk_text(raw_text)
    if not chunks:
        return "No extractable text was found in this PDF."

    index, embeddings = build_faiss_index(chunks)
    relevant_chunks = retrieve_relevant_chunks(question, chunks, index, embeddings)
    context = " ".join(relevant_chunks)
    return qa_pipeline(question=question, context=context)["answer"]
71
+
72
# Gradio UI: one tab per task, each with its own upload widget.
# NOTE(review): the source this was recovered from had lost its indentation;
# each Row is assumed to hold only the input widgets -- confirm the layout.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarizer and Q&A")

    with gr.Tab("Summarization"):
        with gr.Row():
            # file_types restricts the picker to PDFs so other formats are
            # rejected at upload time instead of failing during parsing.
            pdf_input = gr.File(
                type="filepath",
                file_types=[".pdf"],
                label="Upload a PDF",
            )
            summarize_button = gr.Button("Summarize")
        summary_output = gr.Textbox(label="Summary", lines=10)
        summarize_button.click(
            fn=summarize_pdf,
            inputs=pdf_input,
            outputs=summary_output,
        )

    with gr.Tab("Question Answering"):
        with gr.Row():
            pdf_input_qa = gr.File(
                type="filepath",
                file_types=[".pdf"],
                label="Upload a PDF",
            )
            question_input = gr.Textbox(label="Enter your question")
        answer_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer", lines=2)
        answer_button.click(
            fn=answer_question,
            inputs=[pdf_input_qa, question_input],
            outputs=answer_output,
        )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()