pradeepsengarr committed on
Commit
aee5caa
·
verified ·
1 Parent(s): 26dd37e

Update app.py

Files changed (1)
  1. app.py +290 -217
app.py CHANGED
@@ -1,264 +1,337 @@
- import re
  import os
  import faiss
- import numpy as np
  import gradio as gr
- from typing import List
  from sentence_transformers import SentenceTransformer
  from transformers import pipeline
- from PyPDF2 import PdfReader
- import docx2txt
-
- # === Helper functions ===
-
- def clean_text(text: str) -> str:
-     """Clean and normalize text."""
-     text = re.sub(r'\s+', ' ', text)  # normalize whitespace
-     text = text.strip()
-     return text
-
- def chunk_text(text: str, max_chunk_size: int = 300, overlap: int = 50) -> List[str]:
-     """Split text into smaller overlapping chunks for better semantic search."""
-     sentences = re.split(r'(?<=[.?!])\s+', text)
-     chunks = []
-     chunk = ""
-     for sentence in sentences:
-         if len(chunk) + len(sentence) <= max_chunk_size:
-             chunk += sentence + " "
-         else:
-             chunks.append(chunk.strip())
-             chunk = sentence + " "
-     if chunk:
-         chunks.append(chunk.strip())
-     # Add overlapping between chunks to retain context
-     overlapped_chunks = []
-     for i in range(len(chunks)):
-         combined = chunks[i]
-         if i > 0:
-             combined = chunks[i-1][-overlap:] + " " + combined
-         overlapped_chunks.append(clean_text(combined))
-     return overlapped_chunks
-
- def extract_text_from_pdf(file_path: str) -> str:
-     """Extract text from PDF file."""
-     text = ""
-     try:
-         reader = PdfReader(file_path)
-         for page in reader.pages:
-             text += page.extract_text() + " "
-     except Exception as e:
-         print(f"Error reading PDF {file_path}: {e}")
-     return clean_text(text)
-
- def extract_text_from_docx(file_path: str) -> str:
-     """Extract text from DOCX file."""
-     try:
-         text = docx2txt.process(file_path)
-         return clean_text(text)
-     except Exception as e:
-         print(f"Error reading DOCX {file_path}: {e}")
-         return ""
-
- def extract_text_from_txt(file_path: str) -> str:
-     """Extract text from TXT file."""
-     try:
-         with open(file_path, 'r', encoding='utf-8') as f:
-             text = f.read()
-         return clean_text(text)
-     except Exception as e:
-         print(f"Error reading TXT {file_path}: {e}")
-         return ""
-
- # === Main RAG System ===

  class SmartDocumentRAG:
-     def __init__(self):
-         # Model & embedding initialization
-         self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-         self.qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
          self.documents = []
-         self.chunks = []
          self.index = None
          self.is_indexed = False
-         self.document_summary = ""

-     def process_documents(self, uploaded_files) -> str:
-         """Load, extract, chunk, embed, and index documents."""
-         if not uploaded_files:
-             return "⚠️ No files uploaded."
-
-         self.documents.clear()
-         self.chunks.clear()
-         all_text = ""
-
-         # Extract text from each uploaded file
-         for file_obj in uploaded_files:
-             # Save file temporarily to disk to process
-             file_path = file_obj.name
-             ext = os.path.splitext(file_path)[1].lower()
-             text = ""
-             if ext == ".pdf":
-                 text = extract_text_from_pdf(file_path)
-             elif ext == ".docx":
-                 text = extract_text_from_docx(file_path)
-             elif ext == ".txt":
-                 text = extract_text_from_txt(file_path)
              else:
-                 continue  # skip unsupported
-
-             if text:
-                 self.documents.append(text)
-                 all_text += text + " "
-
-         if not all_text.strip():
-             return "⚠️ No extractable text found in uploaded files."
-
-         # Create chunks for semantic search
-         self.chunks = chunk_text(all_text)
-
-         # Create embeddings for chunks
-         embeddings = self.embedder.encode(self.chunks, convert_to_numpy=True)
-         embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)  # normalize
-
-         # Create FAISS index
-         dim = embeddings.shape[1]
-         self.index = faiss.IndexFlatIP(dim)
-         self.index.add(embeddings.astype('float32'))
-         self.is_indexed = True
-
-         # Create simple summary
-         self.document_summary = self.generate_summary(all_text)
-
-         return f"✅ Processed {len(self.documents)} document(s), {len(self.chunks)} chunks indexed."
-
-     def generate_summary(self, text: str) -> str:
-         """Generate a simple summary using top sentences."""
-         sentences = re.split(r'(?<=[.?!])\s+', text)
-         summary = ' '.join(sentences[:5])  # first 5 sentences as naive summary
-         return summary

      def find_relevant_content(self, query: str, top_k: int = 3) -> str:
-         """Perform semantic search to find relevant content chunks."""
-         if not self.is_indexed or not self.chunks:
              return ""
-         query_emb = self.embedder.encode([query], convert_to_numpy=True)
-         query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
-
-         scores, indices = self.index.search(query_emb.astype('float32'), min(top_k, len(self.chunks)))
-
-         relevant_chunks = []
-         for i, idx in enumerate(indices[0]):
-             if scores[0][i] > 0.1:
-                 relevant_chunks.append(self.chunks[idx])
-         return " ".join(relevant_chunks)
-
      def extract_direct_answer(self, query: str, context: str) -> str:
-         """Simple regex-based fallback extraction."""
-         q = query.lower()
-         if any(word in q for word in ['name', 'who is', 'who']):
              names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
              if names:
                  return f"**Name:** {names[0]}"
-
-         if any(word in q for word in ['experience', 'years']):
-             years = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
-             if years:
-                 return f"**Experience:** {years[0]} years"
-
-         if any(word in q for word in ['skill', 'technology', 'tech']):
-             skills = re.findall(r'\b(?:Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b', context, re.I)
-             if skills:
-                 unique_skills = sorted(set(skills), key=skills.index)
-                 return f"**Skills:** {', '.join(unique_skills)}"
-
-         if any(word in q for word in ['education', 'degree', 'university']):
              edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
              if edu:
                  return f"**Education:** {edu[0]}"
-
-         # Fallback: first sentence from context
-         sentences = [s.strip() for s in context.split('.') if s.strip()]
          if sentences:
              return f"**Answer:** {sentences[0]}"
-         return "I found relevant content but could not extract a specific answer."
-
-     def answer_question(self, query: str) -> str:
-         if not query.strip():
-             return "❓ Please ask a question."
-         if not self.is_indexed:
-             return "📝 Please upload and process documents first."
-
-         q_lower = query.lower()
-         if any(word in q_lower for word in ['summary', 'summarize', 'overview', 'about']):
-             return f"📄 **Document Summary:**\n\n{self.document_summary}"
-
-         context = self.find_relevant_content(query, top_k=3)
-         if not context:
-             return "🔍 No relevant information found. Try rephrasing your question."
-
-         try:
-             # Use model for QA
-             result = self.qa_pipeline(question=query, context=context)
-             answer = result.get('answer', '').strip()
-             score = result.get('score', 0)
-
-             # Confidence threshold to fallback to regex extraction
-             if score < 0.1 or not answer:
-                 return self.extract_direct_answer(query, context)
-             return f"**Answer:** {answer}\n\n**Context:** {context[:200]}..."
-
-         except Exception as e:
-             print(f"QA model error: {e}")
-             return self.extract_direct_answer(query, context)
 
- # === Gradio UI ===

- def main():
-     rag = SmartDocumentRAG()

-     def process_files(files):
-         return rag.process_documents(files)
-
-     def ask_question(question):
-         return rag.answer_question(question)
-
-     def get_summary():
-         return rag.answer_question("summary")
-
      with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
          gr.Markdown("""
          # 🧠 Enhanced Document Q&A System

-         **Optimized with Better Models & Semantic Search**

-         - Upload PDF, DOCX, TXT files
-         - Semantic search + QA pipeline
-         - Direct answer extraction fallback
          """)
-
          with gr.Tab("📤 Upload & Process"):
              with gr.Row():
                  with gr.Column():
-                     file_upload = gr.File(label="📁 Upload Documents", file_types=['.pdf','.docx','.txt'], file_count="multiple", height=150)
-                     process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
                  with gr.Column():
-                     process_status = gr.Textbox(label="📋 Processing Status", lines=10, interactive=False)
-             process_btn.click(fn=process_files, inputs=file_upload, outputs=process_status)
-
          with gr.Tab("❓ Q&A"):
              with gr.Row():
                  with gr.Column():
-                     question_input = gr.Textbox(label="🤔 Ask Your Question", lines=3,
-                                                 placeholder="Name? Experience? Skills? Education?")
                      with gr.Row():
                          ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                          summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                  with gr.Column():
                      answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
-             ask_btn.click(fn=ask_question, inputs=question_input, outputs=answer_output)
-             summary_btn.click(fn=get_summary, inputs=None, outputs=answer_output)
-
-     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

  if __name__ == "__main__":
-     main()

  import os
+ import re
  import faiss
+ import docx
+ import PyPDF2
  import gradio as gr
+ import numpy as np
+ from typing import List, Dict
  from sentence_transformers import SentenceTransformer
  from transformers import pipeline
+

  class SmartDocumentRAG:
+     def __init__(self, embedder_model='sentence-transformers/all-MiniLM-L6-v2', qa_model='distilbert-base-cased-distilled-squad'):
+         # Load sentence embedding model
+         self.embedder = SentenceTransformer(embedder_model)
+
+         # Load Q&A pipeline model
+         self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)
+
+         # Document and index initialization
          self.documents = []
+         self.document_metadata = []
+         self.raw_text = ""
+         self.document_summary = ""
+         self.document_type = ""
          self.index = None
          self.is_indexed = False
+         self.model_type = "distilbert-qa"  # Can add flan-t5 or others as needed

+     ####################
+     # Text Extraction
+     ####################
+     def extract_text_from_file(self, file_path: str) -> str:
+         ext = os.path.splitext(file_path)[1].lower()
+         try:
+             if ext == '.pdf':
+                 return self.extract_from_pdf(file_path)
+             elif ext == '.docx':
+                 return self.extract_from_docx(file_path)
+             elif ext == '.txt':
+                 return self.extract_from_txt(file_path)
              else:
+                 return f"Unsupported file type: {ext}"
+         except Exception as e:
+             return f"Error reading file: {e}"
+
+     def extract_from_pdf(self, file_path: str) -> str:
+         text = ""
+         try:
+             with open(file_path, 'rb') as f:
+                 reader = PyPDF2.PdfReader(f)
+                 for page in reader.pages:
+                     txt = page.extract_text() or ""
+                     cleaned = self.clean_text(txt)
+                     text += cleaned + "\n"
+             return text.strip()
+         except Exception as e:
+             return f"Error reading PDF: {e}"
+
+     def extract_from_docx(self, file_path: str) -> str:
+         try:
+             doc = docx.Document(file_path)
+             paragraphs = [self.clean_text(p.text) for p in doc.paragraphs if p.text.strip()]
+             return "\n".join(paragraphs)
+         except Exception as e:
+             return f"Error reading DOCX: {e}"
+
+     def extract_from_txt(self, file_path: str) -> str:
+         encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+         for enc in encodings:
+             try:
+                 with open(file_path, 'r', encoding=enc) as f:
+                     return self.clean_text(f.read())
+             except UnicodeDecodeError:
+                 continue
+             except Exception as e:
+                 return f"Error reading TXT: {e}"
+         return "Could not decode TXT file."
+
+     def clean_text(self, text: str) -> str:
+         # Normalize whitespace, fix broken words, remove weird chars
+         text = re.sub(r'\s+', ' ', text)
+         text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Fix camel case merges
+         text = text.strip()
+         return text

+     ####################
+     # Document Type Detection & Summary
+     ####################
+     def detect_document_type(self, text: str) -> str:
+         lower_text = text.lower()
+         if any(k in lower_text for k in ['abstract', 'study', 'research', 'methodology']):
+             return 'research'
+         elif any(k in lower_text for k in ['company', 'business', 'organization', 'financial']):
+             return 'business'
+         else:
+             return 'general'
+
+     def create_document_summary(self, text: str) -> str:
+         sentences = re.split(r'(?<=[.!?]) +', text)
+         sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+
+         if self.document_type == 'research':
+             return self.extract_research_summary(sentences)
+         elif self.document_type == 'business':
+             return self.extract_business_summary(sentences)
+         else:
+             return self.extract_general_summary(sentences)
+
+     def extract_research_summary(self, sentences: List[str]) -> str:
+         for s in sentences[:7]:
+             if any(w in s.lower() for w in ['abstract', 'study', 'research']):
+                 return s[:300] + ('...' if len(s) > 300 else '')
+         return sentences[0][:300] if sentences else "Research document."
+
+     def extract_business_summary(self, sentences: List[str]) -> str:
+         for s in sentences[:5]:
+             if any(w in s.lower() for w in ['company', 'business', 'organization']):
+                 return s[:300] + ('...' if len(s) > 300 else '')
+         return sentences[0][:300] if sentences else "Business document."
+
+     def extract_general_summary(self, sentences: List[str]) -> str:
+         return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '') if sentences else "General document."
+
+     ####################
+     # Chunking
+     ####################
+     def enhanced_chunk_text(self, text: str, chunk_size: int = 3, overlap: int = 1) -> List[Dict]:
+         if not text.strip():
+             return []
+
+         sentences = re.split(r'(?<=[.!?]) +', text)
+         sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+
+         chunks = []
+         for i in range(0, len(sentences), chunk_size - overlap):
+             chunk_sents = sentences[i:i + chunk_size]
+             if chunk_sents:
+                 chunk_text = " ".join(chunk_sents)
+                 chunks.append({
+                     "text": chunk_text,
+                     "sentence_indices": list(range(i, min(i + chunk_size, len(sentences)))),
+                     "doc_type": self.document_type
+                 })
+         return chunks
+
+     ####################
+     # Processing uploaded files
+     ####################
+     def process_documents(self, files) -> str:
+         if not files:
+             return "❌ No files uploaded!"
+
+         try:
+             all_text = ""
+             processed_files = []
+
+             for file in files:
+                 if file is None:
+                     continue
+                 file_text = self.extract_text_from_file(file.name)
+                 if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
+                     all_text += " " + file_text
+                     processed_files.append(os.path.basename(file.name))
+                 else:
+                     return f"❌ {file_text}"
+
+             if not all_text.strip():
+                 return "❌ No text extracted from files!"
+
+             self.raw_text = all_text.strip()
+             self.document_type = self.detect_document_type(self.raw_text)
+             self.document_summary = self.create_document_summary(self.raw_text)
+
+             chunks = self.enhanced_chunk_text(self.raw_text)
+             if not chunks:
+                 return "❌ No valid chunks created!"
+
+             self.documents = [c["text"] for c in chunks]
+             self.document_metadata = chunks
+
+             embeddings = self.embedder.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
+             dimension = embeddings.shape[1]
+
+             self.index = faiss.IndexFlatIP(dimension)
+             faiss.normalize_L2(embeddings)
+             self.index.add(embeddings.astype('float32'))
+
+             self.is_indexed = True
+
+             return (f"✅ Processed {len(processed_files)} files: {', '.join(processed_files)}\n"
+                     f"📄 Document Type: {self.document_type.title()}\n"
+                     f"🔍 Created {len(self.documents)} chunks\n"
+                     f"📝 Summary: {self.document_summary}\n"
+                     f"🚀 Ready for Q&A!")
+
+         except Exception as e:
+             return f"❌ Error processing documents: {e}"
+
+     ####################
+     # Search & Answer
+     ####################
      def find_relevant_content(self, query: str, top_k: int = 3) -> str:
+         if not self.is_indexed:
              return ""
+
+         try:
+             query_embedding = self.embedder.encode([query], convert_to_numpy=True)
+             faiss.normalize_L2(query_embedding)
+
+             k = min(top_k, len(self.documents))
+             scores, indices = self.index.search(query_embedding.astype('float32'), k)
+
+             relevant_chunks = []
+             for score, idx in zip(scores[0], indices[0]):
+                 if idx < len(self.documents) and score > 0.15:
+                     relevant_chunks.append(self.documents[idx])
+
+             return " ".join(relevant_chunks)
+
+         except Exception as e:
+             print(f"Search error: {e}")
+             return ""
+
+     def answer_question(self, query: str) -> str:
+         if not query.strip():
+             return "❓ Please ask a question!"
+
+         if not self.is_indexed:
+             return "📝 Please upload and process documents first!"
+
+         try:
+             lower_query = query.lower()
+             if any(k in lower_query for k in ['summary', 'summarize', 'about', 'overview']):
+                 return f"📄 **Document Summary:**\n\n{self.document_summary}"
+
+             context = self.find_relevant_content(query, top_k=3)
+             if not context:
+                 return "🔍 No relevant information found. Try rephrasing your question."
+
+             # Use Q&A pipeline
+             result = self.qa_pipeline(question=query, context=context)
+             answer = result.get('answer', '').strip()
+             score = result.get('score', 0.0)
+
+             if score < 0.15 or not answer:
+                 # Fallback to direct extraction
+                 return self.extract_direct_answer(query, context)
+
+             return f"**Answer:** {answer}\n\n**Context:** {context[:300]}..."
+
+         except Exception as e:
+             return f"❌ Error answering question: {e}"
+
      def extract_direct_answer(self, query: str, context: str) -> str:
+         lower_query = query.lower()
+
+         # Extract names (simple heuristic)
+         if any(k in lower_query for k in ['name', 'who is', 'who']):
              names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
              if names:
                  return f"**Name:** {names[0]}"
+
+         # Extract experience years
+         if any(k in lower_query for k in ['experience', 'years']):
+             exp = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
+             if exp:
+                 return f"**Experience:** {exp[0]} years"
+
+         # Extract skills
+         if any(k in lower_query for k in ['skill', 'technology', 'tech']):
+             skills_regex = r'\b(Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
+             skills_found = list(set(re.findall(skills_regex, context, re.I)))
+             if skills_found:
+                 return f"**Skills mentioned:** {', '.join(skills_found)}"
+
+         # Extract education
+         if any(k in lower_query for k in ['education', 'degree', 'university']):
              edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
              if edu:
                  return f"**Education:** {edu[0]}"
+
+         # Fallback: first sentence
+         sentences = re.split(r'(?<=[.!?]) +', context)
          if sentences:
              return f"**Answer:** {sentences[0]}"

+         return "I found relevant information but could not extract a precise answer."

+ # Gradio interface creation
+ def create_interface():
+     rag_system = SmartDocumentRAG()
+
      with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
          gr.Markdown("""
          # 🧠 Enhanced Document Q&A System

+         **Optimized with Better Chunking, Summaries, and Reduced Hallucination**

+         **Features:**
+         - 🎯 DistilBERT Q&A pipeline for accurate answers
+         - ⚡ SentenceTransformer embeddings + FAISS semantic search
+         - 📊 Improved document summaries & chunking
+         - 🔍 Direct answer fallback for fact extraction
          """)
+
          with gr.Tab("📤 Upload & Process"):
              with gr.Row():
                  with gr.Column():
+                     file_upload = gr.File(label="📁 Upload Documents", file_types=[".pdf", ".docx", ".txt"], file_count="multiple", interactive=True)
+                     process_btn = gr.Button("🔄 Process Documents", variant="primary")
                  with gr.Column():
+                     process_status = gr.Textbox(label="📋 Processing Status", lines=8, interactive=False)
+
+             process_btn.click(fn=rag_system.process_documents, inputs=[file_upload], outputs=[process_status])
+
          with gr.Tab("❓ Q&A"):
              with gr.Row():
                  with gr.Column():
+                     question_input = gr.Textbox(label="🤔 Ask Your Question", placeholder="Enter your question here...", lines=3)
                      with gr.Row():
                          ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                          summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                  with gr.Column():
                      answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
+
+             ask_btn.click(fn=rag_system.answer_question, inputs=[question_input], outputs=[answer_output])
+             summary_btn.click(fn=lambda: rag_system.answer_question("summary"), inputs=[], outputs=[answer_output])
+
+     return demo
+

  if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
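
A quick way to sanity-check the updated class outside the Gradio UI is a small driver script. The sketch below is illustrative and not part of this commit: it assumes the new file is saved as app.py with its dependencies installed (faiss-cpu, PyPDF2, python-docx, gradio, numpy, sentence-transformers, transformers), and "resume.txt" is a placeholder path. Since process_documents() only reads each upload's .name attribute, a SimpleNamespace can stand in for Gradio's file objects.

# smoke_test.py: illustrative sketch, not part of this commit
from types import SimpleNamespace

from app import SmartDocumentRAG  # assumes the new file is saved as app.py

rag = SmartDocumentRAG()

# process_documents() reads each upload via its .name attribute, so a
# SimpleNamespace with a .name field mimics Gradio's uploaded-file object.
files = [SimpleNamespace(name="resume.txt")]  # placeholder path
print(rag.process_documents(files))

print(rag.answer_question("What skills are mentioned?"))
print(rag.answer_question("summary"))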