# Author: himanshukumar378
# Commit: cb6ff7e (verified) - "Update app.py"
import gradio as gr
from PyPDF2 import PdfReader
# LangChain components
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
# Hugging Face Transformers
from transformers import pipeline
# ---------------- Load LLM ----------------
def load_llm():
    """Build the text2text-generation pipeline used to answer questions.

    Loads ``google/flan-t5-base`` with ``max_length=512`` and greedy
    decoding.

    Returns:
        The pipeline object, or ``None`` when construction fails for any
        reason. The failure is printed rather than raised, so callers must
        check for ``None`` before using the result.
    """
    try:
        pipe = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            max_length=512,
            # Greedy decoding, stated explicitly. A temperature only takes
            # effect when sampling is enabled; passing one here without
            # do_sample=True had no effect on the output and only produced
            # generation-config warnings.
            do_sample=False,
        )
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller
        # degrade gracefully instead of crashing at import time.
        print(f"⚠️ Failed to load model: {e}")
        return None
    print("✅ Successfully loaded model: google/flan-t5-base")
    return pipe
# Load the model once at import time. `llm` is None when loading failed;
# ask_question checks for that before trying to generate an answer.
llm = load_llm()
# ---------------- Process PDF ----------------
def process_pdf(pdf_files):
    """Extract text from the given PDFs and index it in a FAISS vector store.

    Args:
        pdf_files: Iterable of PDF sources; each item is passed unchanged
            to ``PdfReader``.

    Returns:
        A FAISS store built from the chunked text, or ``None`` when no page
        yields any non-whitespace text.
    """
    page_texts = []
    for pdf in pdf_files:
        reader = PdfReader(pdf)
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:  # skip pages with no extractable text
                page_texts.append(extracted)

    # Each page's text is terminated by a single newline.
    text = "".join(f"{page_text}\n" for page_text in page_texts)
    if not text.strip():
        return None

    # Split on single newlines. CharacterTextSplitter only breaks text at its
    # separator, which defaults to a blank line ("\n\n"); the text assembled
    # above is joined with single newlines, so with the default separator the
    # splitter may find few or no split points and produce chunks far larger
    # than chunk_size.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=100,
    )
    chunks = splitter.split_text(text)

    # NOTE(review): the embedding model and the index are rebuilt on every
    # call; consider caching them if this becomes a latency problem.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(chunks, embeddings)
# ---------------- Ask Questions ----------------
def ask_question(pdf_files, question):
    """Answer ``question`` using the content of the uploaded PDFs.

    The PDFs are indexed with ``process_pdf``, the four chunks most similar
    to the question are retrieved, and the module-level ``llm`` pipeline is
    prompted with that context.

    Args:
        pdf_files: Uploaded PDF files as delivered by the UI; falsy when
            nothing was uploaded.
        question: The user's question text.

    Returns:
        The model's answer, or a user-facing message explaining why no
        answer was produced. Exceptions are not propagated: any failure is
        reported as an ``"⚠️ Error: ..."`` string.
    """
    try:
        if not pdf_files:
            return "⚠️ Please upload at least one PDF file."
        # Reject blank questions up front instead of embedding an empty
        # query and prompting the model with nothing to answer.
        if not question or not question.strip():
            return "⚠️ Please enter a question."
        if llm is None:
            return "⚠️ Language model failed to load. Please try again later."

        db = process_pdf(pdf_files)
        if db is None:
            return "⚠️ No text found in the uploaded PDF(s)."

        # Direct similarity search for the top 4 chunks. This is what the
        # default retriever does, without going through the deprecated
        # get_relevant_documents() entry point.
        docs = db.similarity_search(question, k=4)

        # Merge the retrieved chunks and collapse all whitespace runs.
        context = "\n".join(doc.page_content for doc in docs)
        context = " ".join(context.split())

        prompt = (
            "Based on the following information, answer the question clearly and concisely.\n"
            "Information:\n"
            f"{context}\n"
            f"Question: {question}\n"
            "Answer:"
        )

        # Deterministic (greedy) generation. No temperature is passed: it
        # only applies when sampling, which is disabled here.
        result = llm(
            prompt,
            max_length=300,
            num_return_sequences=1,
            do_sample=False,
        )
        response = result[0]["generated_text"].strip()

        # Drop an echoed "Answer:" label, but only as a leading prefix so
        # occurrences inside the answer text are left alone.
        if response.startswith("Answer:"):
            response = response[len("Answer:"):].strip()

        # Fall back only when the model produced nothing; short answers
        # such as "Yes" or a year are legitimate and must be returned.
        if not response:
            return "I couldn't find a clear answer to your question in the provided documents. Please try rephrasing your question or check if the relevant information is in the uploaded PDFs."
        return response
    except Exception as e:
        # UI boundary: surface the failure to the user instead of raising.
        return f"⚠️ Error: {str(e)}"
# ---------------- Gradio UI ----------------
# Page layout: upload widget and question box side by side, answer below.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 PDF Question Answering System")
    gr.Markdown("Upload PDF files and ask questions about their content.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF Files", file_types=[".pdf"], file_count="multiple"
            )
        with gr.Column():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What would you like to know about the document?",
                lines=2,
            )
            submit_btn = gr.Button("Ask Question", variant="primary")

    with gr.Row():
        output = gr.Textbox(label="Answer", lines=4, interactive=False)

    # Clickable sample questions that fill the question box.
    _sample_questions = (
        "What is the main topic of this document?",
        "Can you summarize the key points?",
        "What are the main findings or conclusions?",
        "Who are the authors and what are their credentials?",
    )
    gr.Examples(
        examples=[[sample] for sample in _sample_questions],
        inputs=question_input,
        label="Example Questions",
    )

    # The button click and pressing Enter in the question box both run
    # the same handler with the same inputs and output.
    for _register in (submit_btn.click, question_input.submit):
        _register(ask_question, inputs=[pdf_input, question_input], outputs=output)

demo.launch()