Spaces:

Aroy1997
/

Pdf_summariser

Sleeping

App Files Files Community

Aroy1997 committed 30 days ago

Commit

bbe81e1

verified ·

1 Parent(s): a106039

Create app1.py

Browse files

Files changed (1) hide show

app1.py +91 -0

app1.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import fitz  # PyMuPDF
+import gradio as gr
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+# Initialize summarizer pipeline
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+# Initialize embedding model
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+# Initialize question-answering pipeline
+qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+def extract_text_from_pdf(file_path):
+    doc = fitz.open(file_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text
+def chunk_text(text, max_chunk_size=500):
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    for word in words:
+        current_chunk.append(word)
+        current_length += len(word) + 1  # +1 for space
+        if current_length >= max_chunk_size:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = []
+            current_length = 0
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+    return chunks
+def build_faiss_index(chunks):
+    embeddings = embedding_model.encode(chunks)
+    dimension = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(np.array(embeddings))
+    return index, embeddings
+def retrieve_relevant_chunks(query, chunks, index, embeddings, top_k=3):
+    query_embedding = embedding_model.encode([query])
+    distances, indices = index.search(np.array(query_embedding), top_k)
+    retrieved_chunks = [chunks[i] for i in indices[0]]
+    return retrieved_chunks
+def summarize_pdf(file_path):
+    raw_text = extract_text_from_pdf(file_path)
+    max_chunk = 1024
+    chunks = [raw_text[i:i+max_chunk] for i in range(0, len(raw_text), max_chunk)]
+    summary = ""
+    for chunk in chunks:
+        res = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
+        summary += res[0]['summary_text'] + " "
+    return summary.strip()
+def answer_question(file_path, question):
+    raw_text = extract_text_from_pdf(file_path)
+    chunks = chunk_text(raw_text)
+    index, embeddings = build_faiss_index(chunks)
+    relevant_chunks = retrieve_relevant_chunks(question, chunks, index, embeddings)
+    context = " ".join(relevant_chunks)
+    answer = qa_pipeline(question=question, context=context)
+    return answer['answer']
+# Gradio UI
+# Two tabs: one summarizes an uploaded PDF, the other answers a question about
+# an uploaded PDF. Each tab has its own file input, so a file uploaded in one
+# tab is not shared with the other.
+with gr.Blocks() as demo:
+    gr.Markdown("# PDF Summarizer and Q&A")
+    with gr.Tab("Summarization"):
+        with gr.Row():
+            # type="filepath": the callback receives the upload as a path,
+            # which summarize_pdf forwards to extract_text_from_pdf.
+            pdf_input = gr.File(type="filepath", label="Upload a PDF")
+            summarize_button = gr.Button("Summarize")
+        summary_output = gr.Textbox(label="Summary", lines=10)
+        # Clicking the button runs summarize_pdf on the file and shows the result.
+        summarize_button.click(fn=summarize_pdf, inputs=pdf_input, outputs=summary_output)
+    with gr.Tab("Question Answering"):
+        with gr.Row():
+            pdf_input_qa = gr.File(type="filepath", label="Upload a PDF")
+            question_input = gr.Textbox(label="Enter your question")
+            answer_button = gr.Button("Get Answer")
+        answer_output = gr.Textbox(label="Answer", lines=2)
+        # Inputs are passed positionally: (file_path, question).
+        answer_button.click(fn=answer_question, inputs=[pdf_input_qa, question_input], outputs=answer_output)
+# Start the web server only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()