import fitz  # PyMuPDF
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize summarizer pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize question-answering pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


def extract_text_from_pdf(file_path):
    # Read the PDF and concatenate the text of every page
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


def chunk_text(text, max_chunk_size=500):
    # Split text into word-aligned chunks of roughly max_chunk_size characters
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_chunk.append(word)
        current_length += len(word) + 1  # +1 for space
        if current_length >= max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def build_faiss_index(chunks):
    # Embed each chunk and store the vectors in a flat L2 FAISS index
    embeddings = embedding_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings


def retrieve_relevant_chunks(query, chunks, index, embeddings, top_k=3):
    # Embed the query and return the top_k nearest chunks from the index
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    retrieved_chunks = [chunks[i] for i in indices[0]]
    return retrieved_chunks


def summarize_pdf(file_path):
    # Summarize the PDF by summarizing fixed-size character chunks and joining the results
    raw_text = extract_text_from_pdf(file_path)
    max_chunk = 1024
    chunks = [raw_text[i:i + max_chunk] for i in range(0, len(raw_text), max_chunk)]
    summary = ""
    for chunk in chunks:
        res = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
        summary += res[0]["summary_text"] + " "
    return summary.strip()


def answer_question(file_path, question):
    # Retrieve the chunks most relevant to the question and run extractive QA over them
    raw_text = extract_text_from_pdf(file_path)
    chunks = chunk_text(raw_text)
    index, embeddings = build_faiss_index(chunks)
    relevant_chunks = retrieve_relevant_chunks(question, chunks, index, embeddings)
    context = " ".join(relevant_chunks)
    answer = qa_pipeline(question=question, context=context)
    return answer["answer"]


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarizer and Q&A")

    with gr.Tab("Summarization"):
        with gr.Row():
            pdf_input = gr.File(type="filepath", label="Upload a PDF")
        summarize_button = gr.Button("Summarize")
        summary_output = gr.Textbox(label="Summary", lines=10)
        summarize_button.click(fn=summarize_pdf, inputs=pdf_input, outputs=summary_output)

    with gr.Tab("Question Answering"):
        with gr.Row():
            pdf_input_qa = gr.File(type="filepath", label="Upload a PDF")
            question_input = gr.Textbox(label="Enter your question")
        answer_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer", lines=2)
        answer_button.click(fn=answer_question, inputs=[pdf_input_qa, question_input], outputs=answer_output)


if __name__ == "__main__":
    demo.launch()