"""Gradio app that answers questions about uploaded PDF files.

Text is extracted from the PDFs, split into chunks, embedded into a FAISS
index, and the chunks most similar to the question are handed to a
FLAN-T5 text2text pipeline as context for the answer.
"""

import os
from functools import lru_cache
from typing import Optional

import gradio as gr
from PyPDF2 import PdfReader

# LangChain components
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate  # noqa: F401  (kept: may be used elsewhere)

# Hugging Face Transformers
from transformers import pipeline

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
TOP_K_CHUNKS = 4
ANSWER_PREFIX = "Answer:"
NO_ANSWER_MESSAGE = (
    "I couldn't find a clear answer to your question in the provided "
    "documents. Please try rephrasing your question or check if the "
    "relevant information is in the uploaded PDFs."
)


# ---------------- Load LLM ----------------
def load_llm():
    """Build the text2text generation pipeline.

    Returns:
        The Transformers pipeline, or ``None`` if loading failed (the
        failure is printed rather than raised so the UI can still start).
    """
    try:
        # Use a model that's good at instruction following.
        # No `temperature` here: generation is greedy (do_sample=False), so a
        # temperature would be ignored and only trigger warnings.
        pipe = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            max_length=512,
        )
    except Exception as e:
        print(f"⚠️ Failed to load model: {e}")
        return None
    else:
        print("✅ Successfully loaded model: google/flan-t5-base")
        return pipe


llm = load_llm()


# ---------------- Process PDF ----------------
@lru_cache(maxsize=1)
def _get_embeddings():
    """Load the sentence-embedding model once and reuse it across questions."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


def _pdf_source(pdf):
    """Return something ``PdfReader`` can open for one uploaded file.

    Depending on the Gradio version, an upload arrives either as a path
    string or as a temp-file wrapper exposing the path in ``.name``.
    """
    if isinstance(pdf, (str, os.PathLike)):
        return pdf
    return getattr(pdf, "name", pdf)


def process_pdf(pdf_files) -> Optional[FAISS]:
    """Extract text from the PDFs and index it in a FAISS vector store.

    Args:
        pdf_files: Iterable of uploaded files (paths or file wrappers).

    Returns:
        A FAISS store built from the text chunks, or ``None`` when no text
        could be extracted from any page.
    """
    page_texts = []
    for pdf in pdf_files:
        reader = PdfReader(_pdf_source(pdf))
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted)

    text = "\n".join(page_texts)
    if not text.strip():
        return None

    # Split text into chunks. The text is joined with single newlines, so the
    # splitter must split on "\n"; its default separator ("\n\n") would rarely
    # match and could yield chunks far larger than chunk_size.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=100,
    )
    texts = splitter.split_text(text)

    # Embeddings & vector store
    return FAISS.from_texts(texts, _get_embeddings())


# ---------------- Ask Questions ----------------
def _build_prompt(context: str, question: str) -> str:
    """Assemble the instruction prompt from retrieved context and question."""
    return (
        "Based on the following information, answer the question clearly "
        f"and concisely. Information: {context} Question: {question} Answer:"
    )


def _clean_response(raw: str) -> str:
    """Trim whitespace and drop a single leading ``Answer:`` echo, if any."""
    response = raw.strip()
    if response.startswith(ANSWER_PREFIX):
        # Strip only the leading prefix; later occurrences are real content.
        response = response[len(ANSWER_PREFIX):].strip()
    return response


def ask_question(pdf_files, question):
    """Answer ``question`` using the content of the uploaded PDFs.

    Args:
        pdf_files: Uploaded PDF files from the Gradio ``File`` component.
        question: The user's question text.

    Returns:
        The generated answer, or a user-facing warning/error message. This
        function is the UI boundary, so it never raises: any exception is
        converted into an error string.
    """
    try:
        if not pdf_files:
            return "⚠️ Please upload at least one PDF file."

        if not question or not question.strip():
            return "⚠️ Please enter a question."
        question = question.strip()

        if llm is None:
            return "⚠️ Language model failed to load. Please try again later."

        db = process_pdf(pdf_files)
        if db is None:
            return "⚠️ No text found in the uploaded PDF(s)."

        # Plain similarity search for the top-k chunks.
        docs = db.similarity_search(question, k=TOP_K_CHUNKS)

        # Combine retrieved context and collapse excessive whitespace.
        context = "\n".join(doc.page_content for doc in docs)
        context = " ".join(context.split())

        # Generate response (greedy decoding, single sequence).
        result = llm(
            _build_prompt(context, question),
            max_length=300,
            num_return_sequences=1,
            do_sample=False,
        )
        response = _clean_response(result[0]["generated_text"])

        # Only an empty answer triggers the fallback; short answers such as
        # "Yes" or a year are legitimate and must not be discarded.
        if not response:
            return NO_ANSWER_MESSAGE

        return response

    except Exception as e:
        return f"⚠️ Error: {str(e)}"


# ---------------- Gradio UI ----------------
with gr.Blocks() as demo:
    gr.Markdown("## 📚 PDF Question Answering System")
    gr.Markdown("Upload PDF files and ask questions about their content.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF Files",
                file_types=[".pdf"],
                file_count="multiple",
            )
        with gr.Column():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What would you like to know about the document?",
                lines=2,
            )
            submit_btn = gr.Button("Ask Question", variant="primary")

    with gr.Row():
        output = gr.Textbox(
            label="Answer",
            lines=4,
            interactive=False,
        )

    # Examples
    gr.Examples(
        examples=[
            ["What is the main topic of this document?"],
            ["Can you summarize the key points?"],
            ["What are the main findings or conclusions?"],
            ["Who are the authors and what are their credentials?"],
        ],
        inputs=question_input,
        label="Example Questions",
    )

    # Handle both button click and enter key
    submit_btn.click(
        ask_question,
        inputs=[pdf_input, question_input],
        outputs=output,
    )
    question_input.submit(
        ask_question,
        inputs=[pdf_input, question_input],
        outputs=output,
    )

demo.launch()