File size: 3,301 Bytes
bbe81e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import fitz  # PyMuPDF
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize summarizer pipeline (BART fine-tuned on CNN/DailyMail).
# NOTE: all three models below are downloaded/loaded at import time, so the
# first startup can take a while and requires network access.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Initialize sentence-embedding model used to vectorize chunks for retrieval.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize extractive question-answering pipeline (SQuAD-distilled DistilBERT).
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def extract_text_from_pdf(file_path):
    """Extract plain text from every page of a PDF.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of all pages (may be empty for image-only PDFs).
    """
    pages = []
    # Context manager ensures the document (and its file handle) is closed
    # even if text extraction raises; the original leaked the handle.
    with fitz.open(file_path) as doc:
        for page in doc:
            pages.append(page.get_text())
    # join() avoids quadratic string concatenation on large documents.
    return "".join(pages)

def chunk_text(text, max_chunk_size=500):
    """Split whitespace-separated text into chunks of roughly max_chunk_size chars.

    Words are accumulated until the running length (each word plus one joining
    space) reaches max_chunk_size, at which point a chunk is emitted. Any
    trailing partial chunk is kept, so no input is dropped.

    Args:
        text: Input string to split.
        max_chunk_size: Approximate maximum characters per chunk.

    Returns:
        List of space-joined chunks; empty list for empty/whitespace input.
    """
    chunks = []
    pending, pending_len = [], 0
    for token in text.split():
        pending.append(token)
        pending_len += len(token) + 1  # +1 accounts for the joining space
        if pending_len >= max_chunk_size:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
    if pending:
        chunks.append(" ".join(pending))
    return chunks

def build_faiss_index(chunks):
    """Embed text chunks and index them for nearest-neighbor search.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        Tuple of (faiss L2 index containing one vector per chunk, the
        embedding matrix returned by the encoder).
    """
    vectors = embedding_model.encode(chunks)
    # Exact (brute-force) L2 index sized to the embedding dimensionality.
    l2_index = faiss.IndexFlatL2(vectors.shape[1])
    l2_index.add(np.array(vectors))
    return l2_index, vectors

def retrieve_relevant_chunks(query, chunks, index, embeddings, top_k=3):
    """Return the chunks most similar to the query, by L2 distance.

    Args:
        query: Natural-language query string.
        chunks: The original text chunks (parallel to the index entries).
        index: faiss index built over the chunk embeddings.
        embeddings: Chunk embedding matrix (unused here; kept for interface
            compatibility with existing callers).
        top_k: Maximum number of chunks to return.

    Returns:
        Up to top_k chunks, ordered most-similar first.
    """
    # Clamp k: asking faiss for more neighbors than indexed vectors makes it
    # pad `indices` with -1, and chunks[-1] would silently return the LAST
    # chunk instead of failing — a wrong-result bug in the original.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), k)
    # Defensive filter in case the index still reports missing slots.
    return [chunks[i] for i in indices[0] if i >= 0]

def summarize_pdf(file_path):
    """Summarize a PDF by chunking its text and summarizing each chunk.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The concatenated per-chunk summaries, or a human-readable message
        when the PDF yields no extractable text (e.g. scanned images).
    """
    raw_text = extract_text_from_pdf(file_path)
    # Guard: an empty/image-only PDF previously fed degenerate input to the
    # summarization pipeline (which warns/errors on inputs below min_length).
    if not raw_text.strip():
        return "No text could be extracted from this PDF."
    # Fixed-size character windows; 1024 chars keeps each piece within
    # BART's input budget for typical English text.
    max_chunk = 1024
    chunks = [raw_text[i:i + max_chunk] for i in range(0, len(raw_text), max_chunk)]
    summaries = []
    for chunk in chunks:
        # Skip whitespace-only windows rather than summarizing nothing.
        if not chunk.strip():
            continue
        res = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
        summaries.append(res[0]['summary_text'])
    return " ".join(summaries).strip()

def answer_question(file_path, question):
    """Answer a question about a PDF using retrieval-augmented extractive QA.

    Args:
        file_path: Path to the PDF file.
        question: Natural-language question about the document.

    Returns:
        The extracted answer span, or a human-readable message when the PDF
        has no extractable text or the question is blank.
    """
    # Guard: a blank question previously crashed inside the QA pipeline.
    if not question or not question.strip():
        return "Please enter a question."
    raw_text = extract_text_from_pdf(file_path)
    chunks = chunk_text(raw_text)
    # Guard: zero chunks previously crashed faiss index construction.
    if not chunks:
        return "No text could be extracted from this PDF."
    index, embeddings = build_faiss_index(chunks)
    relevant_chunks = retrieve_relevant_chunks(question, chunks, index, embeddings)
    context = " ".join(relevant_chunks)
    answer = qa_pipeline(question=question, context=context)
    return answer['answer']

# Gradio UI: two tabs sharing the same PDF-upload pattern.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarizer and Q&A")
    # Tab 1: whole-document summarization.
    with gr.Tab("Summarization"):
        with gr.Row():
            # type="filepath" passes a path string to summarize_pdf.
            pdf_input = gr.File(type="filepath", label="Upload a PDF")
            summarize_button = gr.Button("Summarize")
        summary_output = gr.Textbox(label="Summary", lines=10)
        summarize_button.click(fn=summarize_pdf, inputs=pdf_input, outputs=summary_output)

    # Tab 2: retrieval-augmented question answering over the PDF.
    with gr.Tab("Question Answering"):
        with gr.Row():
            pdf_input_qa = gr.File(type="filepath", label="Upload a PDF")
            question_input = gr.Textbox(label="Enter your question")
            answer_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer", lines=2)
        answer_button.click(fn=answer_question, inputs=[pdf_input_qa, question_input], outputs=answer_output)

# Launch the local Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()