File size: 4,295 Bytes
fa2c767
0dd4a89
 
 
d0b1c15
0dd4a89
b2c7a5c
d0b1c15
0dd4a89
 
 
 
fa2c767
0dd4a89
fa2c767
 
0dd4a89
c912965
 
d0b1c15
0dd4a89
 
 
d0b1c15
b2c7a5c
d0b1c15
 
 
 
 
 
0dd4a89
 
 
 
 
 
d0b1c15
b2c7a5c
 
 
c912965
 
b2c7a5c
 
0dd4a89
c912965
 
 
 
 
 
 
 
 
0dd4a89
 
fa2c767
 
 
 
 
 
 
 
 
 
0dd4a89
 
39400c1
0dd4a89
 
 
fa2c767
c912965
d0b1c15
 
0dd4a89
d0b1c15
 
 
 
 
 
 
0dd4a89
 
 
 
 
 
d0b1c15
b2c7a5c
0dd4a89
 
 
 
b2c7a5c
 
0dd4a89
fa2c767
5b8505f
 
d0b1c15
fa2c767
d0b1c15
fa2c767
d0b1c15
fa2c767
0dd4a89
fa2c767
 
0dd4a89
fa2c767
d0b1c15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import gradio as gr
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

# Load environment variables, then configure the Google Generative AI client.
# The statements below depend on this order: the key is read only after
# load_dotenv() has run, and genai.configure() receives whatever was read.
load_dotenv()
# os.getenv returns None when GOOGLE_API_KEY is not set; no check is made here.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Directory to save FAISS index (written by create_vector_store, read by
# load_vector_store). Relative path, so it resolves against the working directory.
INDEX_PATH = "faiss_index"

def get_pdf_text(pdf_files):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_files: Iterable of uploaded files. Each item is either a
            filesystem path (``str`` / ``os.PathLike``) or an object whose
            ``name`` attribute holds the path of the file on disk.

    Returns:
        The extracted text of all pages, joined in order with no separator.
        If any file cannot be read, the string
        ``"Error reading PDF: <reason>"`` is returned instead and the text
        collected from earlier files is discarded.
    """
    page_texts = []
    for pdf in pdf_files:
        try:
            # Plain paths are passed through unchanged; upload wrappers expose
            # the on-disk path as ``.name``. (Path objects must not go through
            # ``.name``, which would be only the basename.)
            source = pdf if isinstance(pdf, (str, os.PathLike)) else pdf.name
            pdf_reader = PdfReader(source)
            for page in pdf_reader.pages:
                extracted_text = page.extract_text()
                # Skip pages for which no text (None or "") was extracted.
                if extracted_text:
                    page_texts.append(extracted_text)
        except Exception as e:
            # Errors are reported as a string because the caller displays the
            # return value directly as a status message.
            return f"Error reading PDF: {str(e)}"
    # Join once at the end instead of repeated string concatenation.
    return "".join(page_texts)

def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
    """Split text into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        chunk_size: Maximum size of each chunk, as measured by the splitter.
            Defaults to 10000, the value previously hard-coded here.
        chunk_overlap: Amount of text shared between consecutive chunks.
            Defaults to 1000, the value previously hard-coded here.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

def create_vector_store(text_chunks):
    """Embed the text chunks, build a FAISS index and save it to INDEX_PATH.

    Args:
        text_chunks: List of text chunks to embed and index.

    Returns:
        A human-readable status string: a success message, or a message
        starting with ``"Error creating vector store: "`` on failure. This
        function does not raise; every failure is reported through the
        returned string.
    """
    # Fail early with a clear message instead of letting an empty list reach
    # the embedding/index code and surface as an opaque internal error.
    if not text_chunks:
        return "Error creating vector store: no text chunks to index."
    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
        vector_store.save_local(INDEX_PATH)  # Save to disk
        return "PDFs processed successfully! Vector store saved. Now you can ask questions."
    except Exception as e:
        return f"Error creating vector store: {str(e)}"

def load_vector_store():
    """Load the FAISS index previously saved at INDEX_PATH.

    Returns:
        The loaded FAISS vector store, or ``None`` when no index exists on
        disk or when loading fails. Failures are logged with a traceback
        rather than silently discarded, so a broken index or credentials
        problem is distinguishable from "nothing processed yet" in the logs.
    """
    import logging  # function-scope import keeps this block self-contained

    try:
        # Check for the index first so no embeddings client is constructed
        # when there is nothing to load.
        if not os.path.exists(INDEX_PATH):
            return None
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        # NOTE(security): allow_dangerous_deserialization=True opts in to
        # loading serialized data that could execute code. This is acceptable
        # only because the index is written by this application itself; never
        # point INDEX_PATH at files from an untrusted source.
        return FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
    except Exception:
        logging.getLogger(__name__).exception("Failed to load the FAISS index")
        return None

def get_conversational_chain():
    """Build the question-answering chain used to answer from PDF context.

    Returns:
        A "stuff"-type QA chain from ``load_qa_chain`` that combines a
        ``gemini-2.0-flash`` chat model (temperature 0.3) with a prompt
        taking ``context`` and ``question`` variables.
    """
    # The template text is sent to the model verbatim, whitespace included.
    template_text = """
    Answer the question as detailed as possible from the provided context. 
    If the answer is not in the provided context, respond with "answer is not available in the context".
    Do not provide incorrect information.
    
    Context:
    {context}
    
    Question: 
    {question}
    
    Answer:
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.3)
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )
    qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
    return qa_chain

def query_pdf(user_question):
    """Answer a question using the saved FAISS index of the processed PDFs.

    Args:
        user_question: The question text entered by the user.

    Returns:
        The model's answer, or a human-readable message when the question is
        blank, no index has been created yet, or the query fails. This
        function reports query failures through the returned string.
    """
    # Reject blank input up front: there is nothing meaningful to search for,
    # and it avoids needless index loading and model calls.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    vector_store = load_vector_store()
    if vector_store is None:
        return "Please process a PDF first by uploading and submitting it."

    try:
        # Retrieve the chunks most similar to the question and pass them to
        # the QA chain as its context documents.
        docs = vector_store.similarity_search(user_question)
        chain = get_conversational_chain()
        response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
        return response["output_text"]
    except Exception as e:
        return f"Error querying the PDF: {str(e)}"

def process_pdfs(pdf_files):
    """Extract text from the uploaded PDFs and build the saved vector store.

    Args:
        pdf_files: The uploaded files passed in by the UI; may be ``None`` or
            empty when nothing was uploaded.

    Returns:
        A human-readable status string: a prompt to upload a file, the read
        error reported by ``get_pdf_text``, a "no text" notice, or the result
        message of ``create_vector_store``.
    """
    if not pdf_files:
        return "Please upload at least one PDF."

    raw_text = get_pdf_text(pdf_files)
    # get_pdf_text signals failure by returning a message with this exact
    # prefix. Match the prefix rather than searching for the word "Error"
    # anywhere, which wrongly rejected any PDF whose content mentions "Error".
    if raw_text.startswith("Error reading PDF: "):
        return raw_text
    if not raw_text.strip():
        return "No extractable text found in the uploaded PDFs."

    text_chunks = get_text_chunks(raw_text)
    return create_vector_store(text_chunks)

# Gradio UI
# Components are declared top to bottom inside the Blocks context: upload and
# processing controls first, then the question/answer controls.
with gr.Blocks(title="Chat with PDF") as demo:
    gr.Markdown("## Chat with PDF 💁")
    # Multiple-file upload restricted to the .pdf extension.
    pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF(s)", file_count="multiple")
    process_button = gr.Button("Submit & Process")
    status_output = gr.Textbox(label="Status", placeholder="Status updates will appear here...")
    question_input = gr.Textbox(label="Ask a Question from the PDF")
    answer_output = gr.Textbox(label="Reply", placeholder="Answers will appear here...")
    ask_button = gr.Button("Get Answer")
    
    # Wire the buttons: processing writes its status string to the Status box,
    # and asking writes the answer (or an error message) to the Reply box.
    process_button.click(process_pdfs, inputs=[pdf_input], outputs=[status_output])
    ask_button.click(query_pdf, inputs=[question_input], outputs=[answer_output])

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()