File size: 4,863 Bytes
7646bd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import gradio as gr
import PyPDF2
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from groq import Groq

# -------- Step 1: Set and Verify Groq API Key --------
# Read the key from the environment instead of hard-coding it in source:
# committing a real key is a credential leak, and assigning a placeholder
# immediately before os.getenv() made the "missing key" check dead code
# (it could never fail, and the client was built with a bogus key).
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    raise ValueError("❌ GROQ_API_KEY not found.")

client = Groq(api_key=api_key)

# -------- Step 2: Setup Model --------
# Sentence-embedding model shared by chunk indexing and query encoding.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Module-level state written by process_pdf() and read by answer_question().
faiss_index = None  # faiss.IndexFlatL2 over chunk embeddings, or None until a PDF is processed
chunks_list = []    # text chunks aligned row-for-row with the FAISS index

# -------- Step 3: Extract Text from PDF --------
def extract_text_from_pdf(pdf_path, logs):
    """Read every page of the PDF at *pdf_path* and return (text, logs).

    Pages that yield no extractable text are skipped. On any failure the
    error is appended to *logs* and (None, logs) is returned instead.
    """
    try:
        logs += "πŸ“– Step 1: Reading PDF...\n"
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            page_texts = [page.extract_text() for page in reader.pages]
        # Drop pages where extract_text() returned None/empty before joining.
        text = "".join(chunk for chunk in page_texts if chunk)
        if not text.strip():
            raise ValueError("❌ No readable text found in PDF.")
        logs += "βœ… Text extracted successfully.\n"
        return text, logs
    except Exception as e:
        logs += f"❌ Error during PDF text extraction: {str(e)}\n"
        return None, logs

# -------- Step 4: Chunking --------
def create_chunks(text, chunk_size, logs):
    """Split *text* into whitespace-tokenized chunks of at most *chunk_size* words.

    Returns (chunks, logs); on error appends the message to *logs* and
    returns (None, logs).
    """
    try:
        logs += "🧩 Step 2: Creating text chunks...\n"
        words = text.split()
        chunks = []
        for start in range(0, len(words), chunk_size):
            chunks.append(' '.join(words[start:start + chunk_size]))
        logs += f"βœ… {len(chunks)} chunks created.\n"
        return chunks, logs
    except Exception as e:
        logs += f"❌ Error in chunking: {str(e)}\n"
        return None, logs

# -------- Step 5: Embeddings & Indexing --------
def embed_chunks(chunks, logs):
    """Encode *chunks* with the global model and build a FAISS L2 index over them.

    Returns (index, chunks, logs); on error returns (None, None, logs)
    with the failure appended to *logs*.
    """
    try:
        logs += "πŸ“Š Step 3: Generating embeddings and creating FAISS index...\n"
        vectors = model.encode(chunks)
        dim = vectors.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(np.array(vectors))
        logs += "βœ… Embeddings & index created successfully.\n"
        return index, chunks, logs
    except Exception as e:
        logs += f"❌ Error in embedding/indexing: {str(e)}\n"
        return None, None, logs

# -------- Step 6: Process PDF --------
def process_pdf(file, chunk_size=200):
    """Run the ingestion pipeline: extract text -> chunk -> embed/index.

    *file* may be a filepath string (what gr.File(type="filepath") passes)
    or an object exposing the temp path via .name (legacy Gradio uploads).
    On success populates the module-level faiss_index / chunks_list globals.
    Always returns the accumulated log text for display in the UI.
    """
    global faiss_index, chunks_list
    logs = "πŸ“‚ File uploaded successfully.\n"
    try:
        # The UI declares gr.File(type="filepath"), which hands this
        # function a plain path string — calling .name on it would raise
        # AttributeError. Accept both shapes.
        pdf_path = file if isinstance(file, str) else file.name

        text, logs = extract_text_from_pdf(pdf_path, logs)
        if not text:
            return logs

        chunks, logs = create_chunks(text, chunk_size, logs)
        if not chunks:
            return logs

        index, chunks, logs = embed_chunks(chunks, logs)
        # Truthiness of a FAISS index object is not a documented contract;
        # test explicitly for the None that embed_chunks returns on failure.
        if index is None:
            return logs

        faiss_index = index
        chunks_list = chunks
        logs += "πŸŽ‰ Step 4: PDF processed and ready for Q&A.\n"
        return logs
    except Exception as e:
        logs += f"❌ Error during processing: {str(e)}\n"
        return logs

# -------- Step 7: Ask Question --------
def answer_question(query):
    """Answer *query* via RAG: retrieve top chunks from FAISS, ask the LLM.

    Returns the model's answer followed by the log text, or a standalone
    error/log string when retrieval or generation fails.
    """
    logs = "πŸ€– Step 5: Processing your question...\n"
    try:
        if faiss_index is None:
            return "❌ Please process a PDF first."

        query_embedding = model.encode([query])
        _, I = faiss_index.search(np.array(query_embedding), k=3)
        # FAISS pads results with -1 when fewer than k vectors are indexed.
        # The previous guard `i < len(chunks_list)` let -1 through, so
        # chunks_list[-1] silently duplicated the last chunk. Require a
        # valid non-negative row index.
        relevant_chunks = [chunks_list[i] for i in I[0] if 0 <= i < len(chunks_list)]

        if not relevant_chunks:
            return "❌ No relevant content found to answer your question."

        context = "\n".join(relevant_chunks)
        prompt = f"Answer the question based on the context below:\n\nContext:\n{context}\n\nQuestion: {query}"

        logs += "🧠 Sending to Groq LLaMA3 model...\n"
        response = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[{"role": "user", "content": prompt}]
        )

        answer = response.choices[0].message.content
        logs += "βœ… Answer generated successfully.\n"
        return f"{answer}\n\n{logs}"

    except Exception as e:
        logs += f"❌ Error during answering: {str(e)}\n"
        return logs

# -------- Step 8: Gradio UI --------
# Two-row layout: row 1 uploads a PDF, runs the ingestion pipeline, and
# shows its logs; row 2 asks a question against the indexed chunks.
with gr.Blocks() as demo:
    gr.Markdown("## πŸ€– RAG PDF Q&A App with Groq + FAISS (Debug-Friendly)")

    with gr.Row():
        # type="filepath" makes Gradio pass a path string to process_pdf.
        pdf_input = gr.File(label="πŸ“‚ Upload PDF", type="filepath")
        process_btn = gr.Button("βš™οΈ Process PDF")
        log_output = gr.Textbox(label="πŸ“ Logs", lines=20)

    with gr.Row():
        question_input = gr.Textbox(label="❓ Ask a Question")
        answer_btn = gr.Button("πŸ’¬ Get Answer")
        answer_output = gr.Textbox(label="πŸ“œ Answer + Logs", lines=20)

    # Wire buttons to the pipeline functions; each returns the text shown
    # in its respective output textbox.
    process_btn.click(fn=process_pdf, inputs=pdf_input, outputs=log_output)
    answer_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)

demo.launch()