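"""PDF Question Answering demo.

Upload one or more PDFs, split their text into chunks, index the chunks in a
FAISS vector store using sentence-transformers embeddings, and answer
questions with FLAN-T5 through a Gradio interface.

Suggested install (package names inferred from the imports below; the
original pins no versions):
    pip install gradio PyPDF2 langchain langchain-community faiss-cpu \
        sentence-transformers transformers
"""
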
import gradio as gr
from PyPDF2 import PdfReader

# LangChain components
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Hugging Face Transformers
from transformers import pipeline


# ---------------- Load LLM ----------------
def load_llm():
    try:
        # FLAN-T5 is instruction-tuned, so it follows QA-style prompts well
        pipe = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            max_length=512
        )
        print("✅ Successfully loaded model: google/flan-t5-base")
        return pipe
    except Exception as e:
        print(f"⚠️ Failed to load model: {e}")
        return None


llm = load_llm()
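
# flan-t5-base is small enough to run on a CPU; a larger checkpoint such as
# "google/flan-t5-large" generally answers better at the cost of memory and latency.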


# ---------------- Process PDF ----------------
def process_pdf(pdf_files):
    text = ""
    for pdf in pdf_files:
        reader = PdfReader(pdf)
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:  # extract_text() yields empty text for image-only pages
                text += extracted + "\n"

    if not text.strip():
        return None

    # Split text into overlapping chunks (CharacterTextSplitter splits on "\n\n"
    # by default, so individual chunks can exceed chunk_size)
    splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    texts = splitter.split_text(text)

    # Embeddings & vector store
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_texts(texts, embeddings)

    return db
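

# Optional: ask_question() below rebuilds the FAISS index for every question.
# A minimal cache keyed by the uploaded file paths (a sketch, not part of the
# original script) avoids recomputing embeddings on repeated questions:
_db_cache = {}

def get_db_cached(pdf_files):
    # Gradio passes uploads as file paths or objects exposing a .name path
    key = tuple(sorted(getattr(f, "name", str(f)) for f in pdf_files))
    if key not in _db_cache:
        db = process_pdf(pdf_files)
        if db is None:  # don't cache failed/empty extractions
            return None
        _db_cache[key] = db
    return _db_cache[key]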


# ---------------- Ask Questions ----------------
def ask_question(pdf_files, question):
    try:
        if not pdf_files:
            return "⚠️ Please upload at least one PDF file."
        
        if not llm:
            return "⚠️ Language model failed to load. Please try again later."
            
        db = process_pdf(pdf_files)
        if not db:
            return "⚠️ No text found in the uploaded PDF(s)."

        retriever = db.as_retriever(search_kwargs={"k": 4})  # fetch top-4 chunks
        docs = retriever.invoke(question)

        # Combine retrieved context
        context = "\n".join([doc.page_content for doc in docs])
        
        # Clean up context to remove excessive whitespace
        context = " ".join(context.split())

        # Prompt that grounds the model's answer in the retrieved context
        prompt = f"""Based on the following information, answer the question clearly and concisely.

Information:
{context}

Question: {question}

Answer:"""

        # Generate response
        result = llm(
            prompt,
            max_length=300,
            num_return_sequences=1,
            do_sample=False  # greedy decoding: deterministic, focused answers
        )
        
        response = result[0]['generated_text'].strip()
        
        # Clean up the response
        if response.startswith("Answer:"):
            response = response.replace("Answer:", "").strip()
        
        # If response is empty or just repeats the prompt, provide fallback
        if not response or len(response) < 10:
            return "I couldn't find a clear answer to your question in the provided documents. Please try rephrasing your question or check if the relevant information is in the uploaded PDFs."
        
        return response

    except Exception as e:
        return f"⚠️ Error: {str(e)}"


# ---------------- Gradio UI ----------------
with gr.Blocks() as demo:
    gr.Markdown("## 📚 PDF Question Answering System")
    gr.Markdown("Upload PDF files and ask questions about their content.")
    
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF Files",
                file_types=[".pdf"],
                file_count="multiple"
            )
        with gr.Column():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What would you like to know about the document?",
                lines=2
            )
            submit_btn = gr.Button("Ask Question", variant="primary")
    
    with gr.Row():
        output = gr.Textbox(
            label="Answer",
            lines=4,
            interactive=False
        )
    
    # Examples
    gr.Examples(
        examples=[
            ["What is the main topic of this document?"],
            ["Can you summarize the key points?"],
            ["What are the main findings or conclusions?"],
            ["Who are the authors and what are their credentials?"]
        ],
        inputs=question_input,
        label="Example Questions"
    )
    
    # Handle both button click and enter key
    submit_btn.click(ask_question, inputs=[pdf_input, question_input], outputs=output)
    question_input.submit(ask_question, inputs=[pdf_input, question_input], outputs=output)

demo.launch()
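
# Run with `python app.py` (the filename here is an assumption) and open the
# local URL Gradio prints; demo.launch(share=True) would also create a public link.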