File size: 4,882 Bytes
67d85a4 0eefb20 c630cd2 10d6816 c630cd2 cb6ff7e ccfc149 c630cd2 cb6ff7e ccfc149 cb6ff7e c630cd2 cb6ff7e c630cd2 67d85a4 c630cd2 6f67ac4 c630cd2 60db15e 6f67ac4 67d85a4 60db15e cb6ff7e c630cd2 0eefb20 c630cd2 60db15e c630cd2 60db15e 67d85a4 60db15e c630cd2 6f67ac4 cb6ff7e 6f67ac4 c630cd2 cb6ff7e c630cd2 6f67ac4 cb6ff7e c630cd2 cb6ff7e 4f755fe cb6ff7e c630cd2 4f755fe cb6ff7e 4f755fe cb6ff7e c630cd2 cb6ff7e c630cd2 67d85a4 cb6ff7e 6f67ac4 cb6ff7e c630cd2 cb6ff7e 9689af9 cb6ff7e 6f67ac4 cb6ff7e 6f67ac4 c979a25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import gradio as gr
from PyPDF2 import PdfReader
# LangChain components
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
# Hugging Face Transformers
from transformers import pipeline
# ---------------- Load LLM ----------------
def load_llm():
    """Create the text2text-generation pipeline, or return None if loading fails."""
    try:
        # flan-t5-base is a small instruction-tuned model, well suited to QA prompts
        generator = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            max_length=512,
            temperature=0.1  # Lower temperature for more focused answers
        )
    except Exception as e:
        print(f"⚠️ Failed to load model: {e}")
        return None
    # print() cannot raise here, so reporting success outside the try is equivalent
    print("✅ Successfully loaded model: google/flan-t5-base")
    return generator
llm = load_llm()
# ---------------- Process PDF ----------------
def process_pdf(pdf_files):
    """Extract text from every page of the given PDFs and index it in FAISS.

    Returns the FAISS vector store, or None when no extractable text is found.
    """
    # Collect per-page text (skipping pages that yield nothing), newline-separated
    fragments = []
    for pdf in pdf_files:
        for page in PdfReader(pdf).pages:
            extracted = page.extract_text()
            if extracted:
                fragments.append(extracted + "\n")
    text = "".join(fragments)
    if not text.strip():
        return None
    # Split into overlapping chunks sized for retrieval
    chunks = CharacterTextSplitter(chunk_size=800, chunk_overlap=100).split_text(text)
    # Embed the chunks and build the searchable index
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embedder)
# ---------------- Ask Questions ----------------
def ask_question(pdf_files, question):
    """Answer a question about the uploaded PDFs via retrieval + the loaded LLM.

    Always returns a user-facing string; errors are reported, never raised,
    because this is wired directly to the Gradio UI.
    """
    try:
        # Guard clauses: no files, no model, or no extractable text
        if not pdf_files:
            return "⚠️ Please upload at least one PDF file."
        if not llm:
            return "⚠️ Language model failed to load. Please try again later."
        db = process_pdf(pdf_files)
        if not db:
            return "⚠️ No text found in the uploaded PDF(s)."
        # Fetch the 4 chunks most relevant to the question
        retriever = db.as_retriever(search_kwargs={"k": 4})
        docs = retriever.get_relevant_documents(question)
        # Merge retrieved chunks and collapse all runs of whitespace to single spaces
        context = " ".join("\n".join(doc.page_content for doc in docs).split())
        # Instruction-style prompt that pushes the model toward a direct answer
        prompt = f"""Based on the following information, answer the question clearly and concisely.
Information:
{context}
Question: {question}
Answer:"""
        # Greedy decoding (do_sample=False) keeps the answer deterministic
        result = llm(
            prompt,
            max_length=300,
            num_return_sequences=1,
            do_sample=False,
            temperature=0.1
        )
        answer = result[0]['generated_text'].strip()
        # Drop an echoed "Answer:" prefix if the model repeats the prompt scaffold
        if answer.startswith("Answer:"):
            answer = answer.replace("Answer:", "").strip()
        # Fall back to a canned message when generation is empty or trivially short
        if not answer or len(answer) < 10:
            return "I couldn't find a clear answer to your question in the provided documents. Please try rephrasing your question or check if the relevant information is in the uploaded PDFs."
        return answer
    except Exception as e:
        return f"⚠️ Error: {str(e)}"
# ---------------- Gradio UI ----------------
with gr.Blocks() as demo:
    gr.Markdown("## 📚 PDF Question Answering System")
    gr.Markdown("Upload PDF files and ask questions about their content.")

    with gr.Row():
        with gr.Column():
            # One or more PDFs may be uploaded at once
            files_in = gr.File(
                label="Upload PDF Files",
                file_types=[".pdf"],
                file_count="multiple"
            )
        with gr.Column():
            query_in = gr.Textbox(
                label="Your Question",
                placeholder="What would you like to know about the document?",
                lines=2
            )
            ask_btn = gr.Button("Ask Question", variant="primary")

    with gr.Row():
        answer_out = gr.Textbox(
            label="Answer",
            lines=4,
            interactive=False
        )

    # Clickable sample questions that populate the question box
    gr.Examples(
        examples=[
            ["What is the main topic of this document?"],
            ["Can you summarize the key points?"],
            ["What are the main findings or conclusions?"],
            ["Who are the authors and what are their credentials?"]
        ],
        inputs=query_in,
        label="Example Questions"
    )

    # Trigger the same handler from the button click and Enter in the textbox
    for trigger in (ask_btn.click, query_in.submit):
        trigger(ask_question, inputs=[files_in, query_in], outputs=answer_out)

demo.launch()