import gradio as gr
import os
import tempfile
from typing import List, Tuple
from config import Config
from pdf_processor import PDFProcessor
from vector_store import VectorStore
from rag_engine import RAGEngine
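
# High-level flow (per the imports above and the System Information tab below):
# PDFProcessor splits uploaded PDFs into overlapping text chunks, VectorStore
# embeds them (Sentence Transformers) and indexes the vectors with FAISS, and
# RAGEngine retrieves the top-K chunks to ground answers from Google Gemini.
# The internals of those classes live in their own modules, not in this file.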

# Initialize components
pdf_processor = PDFProcessor(
    chunk_size=Config.CHUNK_SIZE,
    chunk_overlap=Config.CHUNK_OVERLAP
)

vector_store = VectorStore(
    model_name=Config.EMBEDDING_MODEL,
    vector_db_path=Config.VECTOR_DB_PATH
)

rag_engine = RAGEngine(vector_store)
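# Note: these components are module-level singletons, so every Gradio session
# shares the same vector index; uploads made in one browser tab are visible
# to all others until the database is cleared.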

def upload_and_process_pdfs(files: List[tempfile._TemporaryFileWrapper]) -> str:
    """Process uploaded PDF files and add them to the vector store."""
    if not files:
        return "โŒ No files uploaded."

    try:
        uploaded_files = []
        total_chunks = 0

        for file in files:
            if file is None:
                continue

            file_path = file.name
            filename = os.path.basename(file_path)

            # Check if it's a PDF
            if not filename.lower().endswith('.pdf'):
                continue

            # Process PDF
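            # extract_text_from_pdf is assumed to return one record per chunk,
            # carrying page-level metadata (source_file, page_number, content
            # preview) that the chat tab later surfaces as citations.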
            chunks = pdf_processor.extract_text_from_pdf(file_path)

            # Add to vector store
            vector_store.add_documents(chunks)

            uploaded_files.append(filename)
            total_chunks += len(chunks)

        if uploaded_files:
            stats = vector_store.get_stats()
            return f"โœ… Successfully processed {len(uploaded_files)} PDF(s):\n" + \
                   f"๐Ÿ“„ Files: {', '.join(uploaded_files)}\n" + \
                   f"๐Ÿ“Š Total chunks created: {total_chunks}\n" + \
                   f"๐Ÿ—ƒ๏ธ Database now contains {stats['total_documents']} total documents"
        else:
            return "โŒ No valid PDF files found."

    except Exception as e:
        return f"โŒ Error processing files: {str(e)}"

def get_database_stats() -> str:
    """Get current database statistics."""
    stats = vector_store.get_stats()
    return f"๐Ÿ“Š **Database Statistics**\n\n" + \
           f"๐Ÿ“„ Total Documents: {stats['total_documents']}\n" + \
           f"๐Ÿ” Index Size: {stats['index_size']}\n" + \
           f"๐Ÿ“ Vector Dimension: {stats.get('dimension', 'N/A')}"

def clear_database() -> str:
    """Clear the entire vector database."""
    try:
        vector_store.clear_index()
        return "โœ… Database cleared successfully!"
    except Exception as e:
        return f"โŒ Error clearing database: {str(e)}"

def respond(message: str, chat_history: List[dict]) -> Tuple[str, List[dict]]:
    """Chat function that handles the new messages format."""
    if not message.strip():
        return "", chat_history

    try:
        # Get response from RAG engine
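        # generate_answer is expected to return a dict with the generated
        # 'answer' plus an optional 'sources' list of retrieved chunks,
        # as consumed below.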
        result = rag_engine.generate_answer(message, top_k=Config.TOP_K)

        response = result['answer']
        sources = result.get('sources', [])

        # Add source information to response
        if sources:
            response += "\n\n**๐Ÿ“š Sources:**\n"
            for i, source in enumerate(sources[:3], 1):
                response += f"{i}. ๐Ÿ“„ **{source['source_file']}** (Page {source['page_number']})\n"
                response += f"   ๐Ÿ“ _{source['content_preview']}_\n"

        # Add user message to chat history
        chat_history.append({"role": "user", "content": message})

        # Add assistant response to chat history
        chat_history.append({"role": "assistant", "content": response})

        return "", chat_history

    except Exception as e:
        error_response = f"โŒ Error: {str(e)}"

        # Add user message and error response to chat history
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": error_response})

        return "", chat_history

def create_interface():
    """Create the Gradio interface."""

    with gr.Blocks(title="PDF RAG System") as interface:

        # Header
        gr.Markdown("# ๐Ÿค– PDF RAG Assistant")
        gr.Markdown("Upload PDFs and ask intelligent questions about their content using AI")

        with gr.Tabs():

            # Tab 1: Document Management
            with gr.Tab("๐Ÿ“ Document Management"):

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("## ๐Ÿ“ค Upload PDF Documents")
                        gr.Markdown("Drag and drop your PDF files or click to browse")

                        file_upload = gr.File(
                            file_count="multiple",
                            file_types=[".pdf"],
                            label="Select PDF files to upload"
                        )

                        upload_btn = gr.Button(
                            "๐Ÿš€ Process PDFs",
                            variant="primary",
                            size="lg"
                        )

                        upload_status = gr.Textbox(
                            label="๐Ÿ“Š Upload Status",
                            interactive=False,
                            max_lines=8
                        )

                    with gr.Column(scale=1):
                        gr.Markdown("## ๐Ÿ—„๏ธ Database Management")

                        stats_display = gr.Markdown(get_database_stats())
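                        # The stats Markdown above is rendered once when the UI
                        # is built; the Refresh/Clear handlers below re-run
                        # get_database_stats() to update it.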

                        with gr.Row():
                            refresh_btn = gr.Button("🔄 Refresh", size="sm", variant="secondary")
                            clear_btn = gr.Button("🗑️ Clear Database", size="sm", variant="stop")

                        clear_status = gr.Textbox(
                            label="๐Ÿ”ง Database Status",
                            interactive=False,
                            max_lines=3
                        )

                # Event handlers for document management
                def update_stats_display():
                    return get_database_stats()
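
                # upload_btn.click(...).then(...) chains a second step onto the
                # upload event: after the PDFs are processed, the stats panel
                # is refreshed with the new document count.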

                upload_btn.click(
                    fn=upload_and_process_pdfs,
                    inputs=[file_upload],
                    outputs=[upload_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                refresh_btn.click(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                clear_btn.click(
                    fn=clear_database,
                    outputs=[clear_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

            # Tab 2: Chat Interface
            with gr.Tab("๐Ÿ’ฌ AI Assistant"):

                gr.Markdown("## ๐Ÿค– Ask questions about your uploaded documents")
                gr.Markdown("**๐Ÿ’ก Tips:** Upload PDFs first, then ask specific questions about their content for detailed answers with source references.")

                # Create chat interface with messages format
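                # With type="messages", the chat history is a list of
                # {"role": ..., "content": ...} dicts rather than [user, bot]
                # pairs, which is the format respond() appends to.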
                chatbot = gr.Chatbot(
                    height=500,
                    show_label=False,
                    type="messages",
                    value=[{
                        "role": "assistant",
                        "content": "๐Ÿ‘‹ **Welcome to PDF RAG Assistant!**\n\nI'm here to help you analyze and understand your PDF documents. \n\n๐Ÿ“‹ **Getting started:**\n1. Upload PDFs in the 'Document Management' tab\n2. Come back here and ask me questions\n3. I'll provide detailed answers with source references\n\n๐Ÿš€ **Ready to get started?**"
                    }]
                )

                with gr.Row():
                    msg_input = gr.Textbox(
                        placeholder="๐Ÿ’ญ Ask a question about your documents...",
                        label="Your Question",
                        lines=2,
                        scale=4
                    )
                    send_btn = gr.Button(
                        "๐Ÿ“จ Send",
                        variant="primary",
                        size="lg",
                        scale=1
                    )

                clear_chat_btn = gr.Button(
                    "๐Ÿงน Clear Chat",
                    variant="secondary",
                    size="sm"
                )

                # Event handlers for chat
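                # Both the Send button click and pressing Enter in the textbox
                # route through respond(); the empty string it returns clears
                # the input box.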
                send_btn.click(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                msg_input.submit(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                clear_chat_btn.click(
                    fn=lambda: [{
                        "role": "assistant",
                        "content": "๐Ÿ‘‹ **Welcome back!**\n\nI'm ready to help you with your PDF documents again. What would you like to know?"
                    }],
                    outputs=[chatbot]
                )

            # Tab 3: System Information
            with gr.Tab("โ„น๏ธ System Information"):

                gr.Markdown("# โš™๏ธ System Configuration & Information")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## ๐Ÿ”ง Current Settings")

                        settings_info = f"""
**🧠 Embedding Model:** `{Config.EMBEDDING_MODEL}`

**📏 Chunk Size:** {Config.CHUNK_SIZE} characters

**🔗 Chunk Overlap:** {Config.CHUNK_OVERLAP} characters

**🎯 Search Results:** Top {Config.TOP_K} most relevant chunks

**📁 Max File Size:** 16MB per PDF
"""
                        gr.Markdown(settings_info)

                    with gr.Column():
                        gr.Markdown("## ๐Ÿš€ Key Features")

                        features_info = """
✅ Multiple PDF upload and processing

✅ Intelligent text chunking

✅ Vector similarity search using FAISS

✅ AI-powered Q&A with Google Gemini

✅ Source attribution with page numbers

✅ Persistent vector database storage

✅ Real-time chat interface

✅ Responsive modern UI
"""
                        gr.Markdown(features_info)

                gr.Markdown("## ๐Ÿ› ๏ธ Technology Stack")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("**๐Ÿ–ฅ๏ธ Framework:** Gradio 4.44+")
                        gr.Markdown("**๐Ÿ“„ PDF Processing:** PyMuPDF")
                    with gr.Column():
                        gr.Markdown("**๐Ÿงฎ Embeddings:** Sentence Transformers")
                        gr.Markdown("**๐Ÿ—ƒ๏ธ Vector Database:** FAISS")
                    with gr.Column():
                        gr.Markdown("**๐Ÿค– Language Model:** Google Gemini 1.5")

                gr.Markdown("## ๐Ÿ“ Quick Start Guide")

                guide_info = """
**1.** Upload Documents - Go to 'Document Management' tab and upload your PDF files

**2.** Process & Index - Wait for the system to extract text and create embeddings

**3.** Ask Questions - Switch to 'AI Assistant' tab and start asking questions

**4.** Get Intelligent Answers - Receive detailed responses with source references and page numbers
"""
                gr.Markdown(guide_info)

    return interface

if __name__ == "__main__":
    # Create and launch the interface
    interface = create_interface()
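    # 0.0.0.0 binds all network interfaces (useful in containers/Spaces);
    # 7860 is Gradio's default port, and share=False avoids creating a
    # public share link.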
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )