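# PDF Summarizer and Q&A — Gradio app.
# Extracts text from an uploaded PDF with PyMuPDF, summarizes it with BART,
# and answers questions by retrieving relevant chunks with FAISS over MiniLM
# sentence embeddings and running a DistilBERT extractive QA model on them.
# Assumed dependencies (not pinned in this file): pymupdf, gradio, transformers,
# torch, sentence-transformers, faiss-cpu, numpy.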
import fitz # PyMuPDF
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Initialize summarizer pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Initialize question-answering pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()  # release the file handle
    return text
def chunk_text(text, max_chunk_size=500):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_chunk.append(word)
        current_length += len(word) + 1  # +1 for the space
        if current_length >= max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def build_faiss_index(chunks):
    # Embed every chunk and store the vectors in a flat (exact) L2 FAISS index
    embeddings = embedding_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings
def retrieve_relevant_chunks(query, chunks, index, embeddings, top_k=3):
    query_embedding = embedding_model.encode([query])
    # Never ask FAISS for more neighbours than there are chunks (it pads with -1 otherwise)
    top_k = min(top_k, len(chunks))
    distances, indices = index.search(np.array(query_embedding), top_k)
    retrieved_chunks = [chunks[i] for i in indices[0]]
    return retrieved_chunks
def summarize_pdf(file_path):
    raw_text = extract_text_from_pdf(file_path)
    # Split into 1024-character chunks so each piece stays well within BART's 1024-token input limit
    max_chunk = 1024
    chunks = [raw_text[i:i+max_chunk] for i in range(0, len(raw_text), max_chunk)]
    summary = ""
    for chunk in chunks:
        res = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
        summary += res[0]['summary_text'] + " "
    return summary.strip()
def answer_question(file_path, question):
    raw_text = extract_text_from_pdf(file_path)
    chunks = chunk_text(raw_text)
    # Note: the index is rebuilt on every question, which is fine for small PDFs
    index, embeddings = build_faiss_index(chunks)
    relevant_chunks = retrieve_relevant_chunks(question, chunks, index, embeddings)
    context = " ".join(relevant_chunks)
    answer = qa_pipeline(question=question, context=context)
    return answer['answer']
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarizer and Q&A")
    with gr.Tab("Summarization"):
        with gr.Row():
            pdf_input = gr.File(type="filepath", label="Upload a PDF")
            summarize_button = gr.Button("Summarize")
        summary_output = gr.Textbox(label="Summary", lines=10)
        summarize_button.click(fn=summarize_pdf, inputs=pdf_input, outputs=summary_output)
    with gr.Tab("Question Answering"):
        with gr.Row():
            pdf_input_qa = gr.File(type="filepath", label="Upload a PDF")
            question_input = gr.Textbox(label="Enter your question")
        answer_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer", lines=2)
        answer_button.click(fn=answer_question, inputs=[pdf_input_qa, question_input], outputs=answer_output)

if __name__ == "__main__":
    demo.launch()
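# Usage note: run locally with `python app.py` (assuming the dependencies listed at the
# top are installed); Gradio prints a local URL to open in the browser.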