pradeepsengarr committed on
Commit
253bfed
·
verified ·
1 Parent(s): 3406461

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +397 -445
app.py CHANGED
@@ -11,36 +11,65 @@ import os
11
  import re
12
  from typing import List, Optional, Dict, Tuple
13
  import json
 
14
 
15
  class SmartDocumentRAG:
16
  def __init__(self):
17
- print("πŸš€ Initializing Smart RAG System...")
18
 
19
- # Initialize embedding model (lightweight)
20
- self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
21
- print("βœ… Embedding model loaded")
22
 
23
  # Initialize quantized LLM
24
  self.setup_llm()
25
 
26
  # Document storage
27
  self.documents = []
28
- self.document_metadata = [] # Store metadata about each chunk
29
  self.index = None
30
  self.is_indexed = False
31
  self.raw_text = ""
32
- self.document_type = "general" # Auto-detect document type
33
- self.document_summary = "" # Store document summary
 
 
34
 
35
  def setup_llm(self):
36
- """Setup quantized Mistral model with fallback"""
37
  try:
38
- # Check if CUDA is available
39
  if not torch.cuda.is_available():
40
  print("⚠️ CUDA not available, using CPU-optimized model")
41
  self.setup_cpu_model()
42
  return
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  quantization_config = BitsAndBytesConfig(
45
  load_in_4bit=True,
46
  bnb_4bit_compute_dtype=torch.float16,
@@ -50,35 +79,27 @@ class SmartDocumentRAG:
50
 
51
  model_name = "mistralai/Mistral-7B-Instruct-v0.1"
52
 
53
- self.tokenizer = AutoTokenizer.from_pretrained(
54
- model_name,
55
- trust_remote_code=True
56
- )
57
-
58
- if self.tokenizer.pad_token is None:
59
- self.tokenizer.pad_token = self.tokenizer.eos_token
60
-
61
  self.model = AutoModelForCausalLM.from_pretrained(
62
  model_name,
63
  quantization_config=quantization_config,
64
  device_map="auto",
65
- torch_dtype=torch.float16,
66
- trust_remote_code=True,
67
- low_cpu_mem_usage=True
68
  )
69
 
70
- print("βœ… Quantized Mistral model loaded successfully")
 
 
 
71
 
72
  except Exception as e:
73
- print(f"❌ Error loading Mistral: {e}")
74
- print("πŸ”„ Falling back to CPU model...")
75
  self.setup_cpu_model()
76
 
77
  def setup_cpu_model(self):
78
  """Setup CPU-friendly model"""
79
  try:
80
- # Use GPT-2 for better text generation on CPU
81
- model_name = "gpt2-medium"
82
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
83
  self.model = AutoModelForCausalLM.from_pretrained(model_name)
84
 
@@ -87,87 +108,155 @@ class SmartDocumentRAG:
87
 
88
  print("βœ… CPU model loaded")
89
  except Exception as e:
90
- print(f"❌ CPU model failed: {e}")
91
  self.model = None
92
  self.tokenizer = None
93
- print("⚠�� Using context-only mode")
94
 
95
  def detect_document_type(self, text: str) -> str:
96
- """Intelligently detect document type"""
97
  text_lower = text.lower()
98
 
99
- # Count keywords for different document types
100
- resume_keywords = ['experience', 'skills', 'education', 'linkedin', 'email', 'phone', 'internship']
101
- research_keywords = ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'study', 'analysis']
102
- business_keywords = ['company', 'revenue', 'market', 'strategy', 'business', 'financial', 'quarter']
103
- technical_keywords = ['implementation', 'algorithm', 'system', 'technical', 'specification', 'architecture']
104
- legal_keywords = ['contract', 'agreement', 'terms', 'conditions', 'legal', 'clause', 'liability']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  scores = {
107
- 'resume': sum(1 for kw in resume_keywords if kw in text_lower),
108
- 'research': sum(1 for kw in research_keywords if kw in text_lower),
109
- 'business': sum(1 for kw in business_keywords if kw in text_lower),
110
- 'technical': sum(1 for kw in technical_keywords if kw in text_lower),
111
- 'legal': sum(1 for kw in legal_keywords if kw in text_lower)
112
  }
113
 
114
- return max(scores, key=scores.get) if max(scores.values()) > 2 else 'general'
 
 
 
115
 
116
  def create_document_summary(self, text: str) -> str:
117
- """Create intelligent document summary"""
118
  try:
119
- # Split into paragraphs and find key information
120
- paragraphs = [p.strip() for p in text.split('\n\n') if p.strip() and len(p) > 50]
121
-
122
- if not paragraphs:
123
- return "Document contains basic text information."
124
-
125
- # Take first few paragraphs for summary context
126
- summary_text = ' '.join(paragraphs[:3])[:1000]
127
-
128
- if self.model and self.tokenizer:
129
- # Generate AI summary
130
- prompt = f"""Summarize the following document in 2-3 sentences, focusing on the main points and key information:
131
-
132
- {summary_text}
133
-
134
- Summary:"""
 
135
 
136
- try:
137
- inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
138
- if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
139
- inputs = {k: v.cuda() for k, v in inputs.items()}
140
-
141
- with torch.no_grad():
142
- outputs = self.model.generate(
143
- **inputs,
144
- max_new_tokens=100,
145
- temperature=0.7,
146
- do_sample=True,
147
- top_p=0.9,
148
- pad_token_id=self.tokenizer.pad_token_id
149
- )
150
-
151
- summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
152
- summary = summary.split("Summary:")[-1].strip()
153
-
154
- if len(summary) > 20:
155
- return summary
156
-
157
- except Exception as e:
158
- print(f"Error generating AI summary: {e}")
159
-
160
- # Fallback: Extract key sentences
161
- sentences = re.split(r'[.!?]+', summary_text)
162
- key_sentences = [s.strip() for s in sentences if len(s.strip()) > 30][:2]
163
-
164
- return '. '.join(key_sentences) + '.' if key_sentences else "Document contains relevant information."
165
-
166
  except Exception as e:
 
167
  return "Document summary not available."
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  def extract_text_from_file(self, file_path: str) -> str:
170
- """Extract text from various file formats with better error handling"""
171
  try:
172
  file_extension = os.path.splitext(file_path)[1].lower()
173
 
@@ -184,7 +273,7 @@ Summary:"""
184
  return f"Error reading file: {str(e)}"
185
 
186
  def extract_from_pdf(self, file_path: str) -> str:
187
- """Enhanced PDF extraction"""
188
  text = ""
189
  try:
190
  with open(file_path, 'rb') as file:
@@ -192,10 +281,12 @@ Summary:"""
192
  for page_num, page in enumerate(pdf_reader.pages):
193
  page_text = page.extract_text()
194
  if page_text.strip():
195
- text += f"\n[Page {page_num + 1}]\n{page_text}\n"
 
 
196
  except Exception as e:
197
  text = f"Error reading PDF: {str(e)}"
198
- return text
199
 
200
  def extract_from_docx(self, file_path: str) -> str:
201
  """Enhanced DOCX extraction"""
@@ -204,19 +295,22 @@ Summary:"""
204
  text = ""
205
  for paragraph in doc.paragraphs:
206
  if paragraph.text.strip():
207
- text += paragraph.text + "\n"
208
- return text
209
  except Exception as e:
210
  return f"Error reading DOCX: {str(e)}"
211
 
212
  def extract_from_txt(self, file_path: str) -> str:
213
- """Enhanced TXT extraction with encoding detection"""
214
  encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
215
 
216
  for encoding in encodings:
217
  try:
218
  with open(file_path, 'r', encoding=encoding) as file:
219
- return file.read()
 
 
 
220
  except UnicodeDecodeError:
221
  continue
222
  except Exception as e:
@@ -224,81 +318,39 @@ Summary:"""
224
 
225
  return "Error: Could not decode file with any supported encoding"
226
 
227
- def intelligent_chunk_text(self, text: str, doc_type: str) -> List[Dict]:
228
- """Intelligent chunking based on document type"""
229
  if not text.strip():
230
  return []
231
 
232
  chunks = []
233
- lines = [line.strip() for line in text.split('\n') if line.strip()]
234
 
235
- if doc_type == 'research':
236
- # For research papers, chunk by sections
237
- current_chunk = ""
238
- current_section = "introduction"
239
-
240
- for line in lines:
241
- line_lower = line.lower()
242
-
243
- # Detect section headers
244
- if any(header in line_lower for header in ['abstract', 'introduction', 'methodology', 'results', 'conclusion', 'references']):
245
- if current_chunk:
246
- chunks.append({
247
- 'text': current_chunk.strip(),
248
- 'section': current_section,
249
- 'doc_type': doc_type
250
- })
251
- current_chunk = line
252
- current_section = line_lower.split()[0] if line_lower.split() else "section"
253
- else:
254
- current_chunk += "\n" + line
255
-
256
- # Limit chunk size
257
- if len(current_chunk.split()) > 200:
258
- chunks.append({
259
- 'text': current_chunk.strip(),
260
- 'section': current_section,
261
- 'doc_type': doc_type
262
- })
263
- current_chunk = ""
264
-
265
- if current_chunk:
266
- chunks.append({
267
- 'text': current_chunk.strip(),
268
- 'section': current_section,
269
- 'doc_type': doc_type
270
- })
271
-
272
- else:
273
- # General intelligent chunking
274
- current_chunk = ""
275
- sentence_count = 0
276
-
277
- for line in lines:
278
- current_chunk += line + "\n"
279
- sentence_count += len(re.findall(r'[.!?]+', line))
280
-
281
- # Create chunk based on sentence count or word count
282
- if sentence_count >= 5 or len(current_chunk.split()) > 150:
283
  chunks.append({
284
- 'text': current_chunk.strip(),
285
- 'section': 'content',
286
- 'doc_type': doc_type
287
  })
288
- current_chunk = ""
289
- sentence_count = 0
290
-
291
- if current_chunk:
292
- chunks.append({
293
- 'text': current_chunk.strip(),
294
- 'section': 'content',
295
- 'doc_type': doc_type
296
- })
297
 
298
  return chunks
299
 
300
  def process_documents(self, files) -> str:
301
- """Enhanced document processing with intelligent analysis"""
302
  if not files:
303
  return "❌ No files uploaded!"
304
 
@@ -306,14 +358,13 @@ Summary:"""
306
  all_text = ""
307
  processed_files = []
308
 
309
- # Extract text from all files
310
  for file in files:
311
  if file is None:
312
  continue
313
 
314
  file_text = self.extract_text_from_file(file.name)
315
  if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
316
- all_text += f"\n\n--- {os.path.basename(file.name)} ---\n\n{file_text}"
317
  processed_files.append(os.path.basename(file.name))
318
  else:
319
  return f"❌ {file_text}"
@@ -321,17 +372,13 @@ Summary:"""
321
  if not all_text.strip():
322
  return "❌ No text extracted from files!"
323
 
324
- # Store raw text
325
  self.raw_text = all_text
326
-
327
- # Detect document type
328
  self.document_type = self.detect_document_type(all_text)
329
-
330
- # Create document summary
331
  self.document_summary = self.create_document_summary(all_text)
332
 
333
- # Intelligent chunking
334
- chunk_data = self.intelligent_chunk_text(all_text, self.document_type)
335
 
336
  if not chunk_data:
337
  return "❌ No valid text chunks created!"
@@ -339,15 +386,20 @@ Summary:"""
339
  self.documents = [chunk['text'] for chunk in chunk_data]
340
  self.document_metadata = chunk_data
341
 
342
- # Create embeddings
343
  print(f"πŸ“„ Creating embeddings for {len(self.documents)} chunks...")
344
- embeddings = self.embedder.encode(self.documents, show_progress_bar=True)
 
 
 
 
 
345
 
346
  # Build FAISS index
347
  dimension = embeddings.shape[1]
348
  self.index = faiss.IndexFlatIP(dimension)
349
 
350
- # Normalize embeddings for cosine similarity
351
  faiss.normalize_L2(embeddings)
352
  self.index.add(embeddings.astype('float32'))
353
 
@@ -356,244 +408,144 @@ Summary:"""
356
  return f"βœ… Successfully processed {len(processed_files)} files:\n" + \
357
  f"πŸ“„ Files: {', '.join(processed_files)}\n" + \
358
  f"πŸ“Š Document Type: {self.document_type.title()}\n" + \
359
- f"πŸ” Created {len(self.documents)} intelligent chunks\n" + \
360
- f"πŸ“ Summary: {self.document_summary[:200]}...\n" + \
361
- f"πŸš€ Ready for smart Q&A!"
362
 
363
  except Exception as e:
364
  return f"❌ Error processing documents: {str(e)}"
365
 
366
- def smart_retrieve_context(self, query: str, k: int = 4) -> Tuple[str, List[Dict]]:
367
- """Enhanced context retrieval with intelligent ranking"""
368
  if not self.is_indexed:
369
  return "", []
370
 
371
  try:
372
- # Get query embedding
 
 
 
373
  query_embedding = self.embedder.encode([query])
374
  faiss.normalize_L2(query_embedding)
375
 
376
- # Search for similar chunks
377
- scores, indices = self.index.search(query_embedding.astype('float32'), min(k * 2, len(self.documents)))
378
 
379
- # Analyze query intent
380
- query_lower = query.lower()
381
- is_summary_request = any(word in query_lower for word in ['summary', 'summarize', 'overview', 'what is', 'about'])
382
- is_specific_request = any(word in query_lower for word in ['how', 'why', 'when', 'where', 'which'])
383
 
384
- relevant_chunks = []
 
 
385
 
386
- for i, idx in enumerate(indices[0]):
387
- if idx < len(self.documents):
388
- score = scores[0][i]
389
- chunk_data = self.document_metadata[idx]
390
-
391
- # Adjust scoring based on query type and document structure
392
- adjusted_score = score
393
-
394
- if is_summary_request:
395
- # Boost introductory sections for summary requests
396
- if chunk_data['section'] in ['introduction', 'abstract', 'content']:
397
- adjusted_score += 0.1
398
-
399
- if adjusted_score > 0.15: # Threshold for relevance
400
- relevant_chunks.append({
401
- 'text': self.documents[idx],
402
- 'score': adjusted_score,
403
- 'metadata': chunk_data
404
- })
405
 
406
- # Sort by adjusted score
407
- relevant_chunks.sort(key=lambda x: x['score'], reverse=True)
408
 
409
- # Take top chunks
410
- top_chunks = relevant_chunks[:k]
411
- context = "\n\n".join([chunk['text'] for chunk in top_chunks])
 
 
412
 
413
- return context, top_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
  except Exception as e:
416
- print(f"Error in retrieval: {e}")
417
  return "", []
418
 
419
- def generate_smart_answer(self, query: str, context: str, chunks_data: List[Dict]) -> str:
420
- """Generate intelligent answers based on query type and context"""
421
  if not context:
422
- return "No relevant information found in the documents."
423
-
424
- query_lower = query.lower()
425
-
426
- # Determine answer type
427
- is_summary_request = any(word in query_lower for word in ['summary', 'summarize', 'overview', 'what is', 'about'])
428
- is_comparison_request = any(word in query_lower for word in ['compare', 'difference', 'versus', 'vs'])
429
- is_specific_question = any(word in query_lower for word in ['how', 'why', 'when', 'where', 'which'])
430
-
431
- if self.model and self.tokenizer:
432
- try:
433
- # Create intelligent prompt based on query type
434
- if is_summary_request:
435
- prompt = self.create_summary_prompt(query, context)
436
- elif is_comparison_request:
437
- prompt = self.create_comparison_prompt(query, context)
438
- else:
439
- prompt = self.create_general_prompt(query, context)
440
-
441
- # Generate response
442
- inputs = self.tokenizer(
443
- prompt,
444
- return_tensors="pt",
445
- max_length=800,
446
- truncation=True,
447
- padding=True
448
- )
449
-
450
- if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
451
- inputs = {k: v.cuda() for k, v in inputs.items()}
452
-
453
- with torch.no_grad():
454
- outputs = self.model.generate(
455
- **inputs,
456
- max_new_tokens=150,
457
- temperature=0.3,
458
- do_sample=True,
459
- top_p=0.9,
460
- repetition_penalty=1.1,
461
- pad_token_id=self.tokenizer.pad_token_id,
462
- eos_token_id=self.tokenizer.eos_token_id
463
- )
464
-
465
- # Extract and clean answer
466
- full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
467
- answer = self.extract_answer_from_response(full_response, prompt)
468
-
469
- if answer and len(answer) > 20:
470
- return self.clean_and_validate_answer(answer)
471
-
472
- except Exception as e:
473
- print(f"Error in AI generation: {e}")
474
-
475
- # Fallback to intelligent context-based answering
476
- return self.context_based_smart_answer(query, context, chunks_data)
477
-
478
- def create_summary_prompt(self, query: str, context: str) -> str:
479
- """Create prompt for summary requests"""
480
- return f"""Based on the document content below, provide a comprehensive summary addressing the question.
481
-
482
- Document Content:
483
- {context[:1000]}
484
-
485
- Question: {query}
486
-
487
- Provide a clear, informative summary that addresses the question:"""
488
-
489
- def create_comparison_prompt(self, query: str, context: str) -> str:
490
- """Create prompt for comparison requests"""
491
- return f"""Analyze the document content and provide a comparison as requested.
492
-
493
- Document Content:
494
- {context[:1000]}
495
-
496
- Question: {query}
497
-
498
- Provide a detailed comparison based on the information:"""
499
-
500
- def create_general_prompt(self, query: str, context: str) -> str:
501
- """Create prompt for general questions"""
502
- return f"""Answer the question based on the document content provided.
503
-
504
- Document Content:
505
- {context[:1000]}
506
-
507
- Question: {query}
508
-
509
- Provide a specific, accurate answer:"""
510
-
511
- def extract_answer_from_response(self, response: str, prompt: str) -> str:
512
- """Extract clean answer from model response"""
513
- # Remove the prompt part
514
- if prompt in response:
515
- answer = response.replace(prompt, "").strip()
516
- else:
517
- # Try to find the answer after common patterns
518
- patterns = ["Answer:", "Summary:", "Response:", "answer:", "summary:", "response:"]
519
- answer = response
520
- for pattern in patterns:
521
- if pattern in response:
522
- answer = response.split(pattern)[-1].strip()
523
- break
524
 
525
- return answer
526
-
527
- def context_based_smart_answer(self, query: str, context: str, chunks_data: List[Dict]) -> str:
528
- """Intelligent context-based answering as fallback"""
529
  query_lower = query.lower()
530
-
531
- # For summary requests
532
- if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
533
- return self.create_context_summary(context, chunks_data)
534
-
535
- # For specific questions, find most relevant sentences
536
- context_sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 20]
537
- query_words = set(query_lower.split())
538
-
539
- # Score sentences by relevance
540
- scored_sentences = []
541
- for sentence in context_sentences:
542
- sentence_words = set(sentence.lower().split())
543
- overlap = len(query_words.intersection(sentence_words))
544
- if overlap > 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
  scored_sentences.append((sentence, overlap))
 
 
 
 
 
 
 
 
546
 
547
- # Sort by relevance and combine top sentences
548
- scored_sentences.sort(key=lambda x: x[1], reverse=True)
549
-
550
- if scored_sentences:
551
- top_sentences = [s[0] for s in scored_sentences[:3]]
552
- return '. '.join(top_sentences) + '.'
553
-
554
- return "I found relevant information but couldn't extract a specific answer. Please try rephrasing your question."
555
-
556
- def create_context_summary(self, context: str, chunks_data: List[Dict]) -> str:
557
- """Create summary from context"""
558
- # Get key sentences from different sections
559
- sentences_by_section = {}
560
-
561
- for chunk in chunks_data:
562
- section = chunk['metadata']['section']
563
- sentences = [s.strip() for s in chunk['text'].split('.') if len(s.strip()) > 30]
564
- if sentences:
565
- if section not in sentences_by_section:
566
- sentences_by_section[section] = []
567
- sentences_by_section[section].extend(sentences[:2]) # Top 2 sentences per section
568
-
569
- # Combine sentences from different sections
570
- summary_parts = []
571
- for section, sentences in sentences_by_section.items():
572
- if sentences:
573
- summary_parts.extend(sentences[:1]) # One sentence per section
574
-
575
- if summary_parts:
576
- return '. '.join(summary_parts[:4]) + '.' # Max 4 sentences
577
-
578
- return self.document_summary if self.document_summary else "Document contains relevant information on the requested topic."
579
-
580
- def clean_and_validate_answer(self, answer: str) -> str:
581
- """Clean and validate the generated answer"""
582
- # Remove unwanted patterns
583
- answer = re.sub(r'--- \w+.*? ---', '', answer)
584
- answer = re.sub(r'\[Page \d+\]', '', answer)
585
-
586
- # Clean up whitespace and formatting
587
- answer = ' '.join(answer.split())
588
-
589
- # Ensure proper sentence structure
590
- if answer and not answer.endswith(('.', '!', '?')):
591
- answer += '.'
592
-
593
- return answer.strip()
594
 
595
  def answer_question(self, query: str) -> str:
596
- """Main function to answer questions intelligently"""
597
  if not query.strip():
598
  return "❓ Please ask a question!"
599
 
@@ -601,42 +553,46 @@ Provide a specific, accurate answer:"""
601
  return "πŸ“ Please upload and process documents first!"
602
 
603
  try:
604
- # Special handling for document-level questions
605
  query_lower = query.lower()
606
- if query_lower in ['summary', 'summarize this document', 'what is this about']:
607
- return f"πŸ“„ Document Summary:\n\n{self.document_summary}"
608
 
609
- # Retrieve relevant context with intelligence
610
- context, chunks_data = self.smart_retrieve_context(query, k=4)
611
 
612
  if not context:
613
- return "πŸ” No relevant information found for your question. Try rephrasing or asking about different aspects of the document."
 
 
 
614
 
615
- # Generate intelligent answer
616
- answer = self.generate_smart_answer(query, context, chunks_data)
 
617
 
618
- return answer if answer else "I couldn't generate a specific answer. Please try asking in a different way."
619
 
620
  except Exception as e:
621
  return f"❌ Error processing question: {str(e)}"
622
 
623
- # Initialize the enhanced RAG system
624
- print("Initializing Smart Document RAG System...")
625
  rag_system = SmartDocumentRAG()
626
 
627
- # Enhanced Gradio Interface
628
  def create_interface():
629
- with gr.Blocks(title="🧠 Smart Document Q&A", theme=gr.themes.Soft()) as demo:
630
  gr.Markdown("""
631
- # 🧠 Smart Document Q&A System
632
 
633
- Upload documents and get intelligent answers with summaries and insights!
634
 
635
- **Features:**
636
- - 🎯 Intelligent document type detection
637
- - πŸ“Š Smart summarization
638
- - πŸ” Context-aware answers
639
- - πŸ“š Multi-format support (PDF, DOCX, TXT)
640
  """)
641
 
642
  with gr.Tab("πŸ“€ Upload & Process"):
@@ -652,7 +608,7 @@ def create_interface():
652
 
653
  with gr.Column():
654
  process_status = gr.Textbox(
655
- label="πŸ“‹ Processing Status & Document Analysis",
656
  lines=10,
657
  interactive=False
658
  )
@@ -663,22 +619,22 @@ def create_interface():
663
  outputs=[process_status]
664
  )
665
 
666
- with gr.Tab("❓ Smart Q&A"):
667
  with gr.Row():
668
  with gr.Column():
669
  question_input = gr.Textbox(
670
- label="πŸ€” Ask Anything",
671
- placeholder="What is this document about? / Summarize the main points / How does X work?",
672
  lines=3
673
  )
674
 
675
  with gr.Row():
676
- ask_btn = gr.Button("🧠 Get Smart Answer", variant="primary")
677
  summary_btn = gr.Button("πŸ“Š Get Summary", variant="secondary")
678
 
679
  with gr.Column():
680
  answer_output = gr.Textbox(
681
- label="πŸ’‘ Smart Answer",
682
  lines=8,
683
  interactive=False
684
  )
@@ -695,52 +651,48 @@ def create_interface():
695
  outputs=[answer_output]
696
  )
697
 
698
- # Enhanced example questions
699
  gr.Markdown("""
700
- ### πŸ’‘ Smart Question Examples:
701
-
702
- **πŸ“Š For Summaries:**
703
- - "What is this document about?"
704
- - "Summarize the main points"
705
- - "Give me an overview"
706
-
707
- **πŸ” For Specific Information:**
708
- - "How does [topic] work?"
709
- - "What are the key findings?"
710
- - "Explain [concept] from the document"
711
-
712
- **🎯 For Analysis:**
713
- - "What are the pros and cons?"
714
- - "Compare [A] and [B]"
715
- - "What conclusions can be drawn?"
716
  """)
717
 
718
- with gr.Tab("ℹ️ Tips"):
719
  gr.Markdown("""
720
- ### πŸš€ How to Get the Best Results:
721
-
722
- **πŸ“„ Document Types Supported:**
723
- - Research papers & academic documents
724
- - Business reports & presentations
725
- - Technical documentation
726
- - Legal documents
727
- - General text documents
728
-
729
- **❓ Question Tips:**
730
- - Be specific about what you want to know
731
- - Use "summarize" or "overview" for general summaries
732
- - Ask "how", "why", "what" for detailed explanations
733
- - Request comparisons with "compare" or "difference"
734
-
735
- **🎯 Best Practices:**
736
- - Upload clear, well-formatted documents
737
- - Ask one question at a time for focused answers
738
- - Try rephrasing if the first answer isn't what you expected
739
  """)
740
 
741
  return demo
742
 
743
- # Launch the enhanced app
744
  if __name__ == "__main__":
745
  demo = create_interface()
746
  demo.launch(
 
11
  import re
12
  from typing import List, Optional, Dict, Tuple
13
  import json
14
+ from collections import Counter
15
 
16
  class SmartDocumentRAG:
17
  def __init__(self):
18
+ print("πŸš€ Initializing Enhanced Smart RAG System...")
19
 
20
+ # Initialize better embedding model
21
+ self.embedder = SentenceTransformer('all-mpnet-base-v2') # Better than MiniLM
22
+ print("βœ… Enhanced embedding model loaded")
23
 
24
  # Initialize quantized LLM
25
  self.setup_llm()
26
 
27
  # Document storage
28
  self.documents = []
29
+ self.document_metadata = []
30
  self.index = None
31
  self.is_indexed = False
32
  self.raw_text = ""
33
+ self.document_type = "general"
34
+ self.document_summary = ""
35
+ self.sentence_embeddings = [] # Store sentence-level embeddings
36
+ self.sentences = [] # Store individual sentences
37
 
38
  def setup_llm(self):
39
+ """Setup optimized model for better text generation"""
40
  try:
 
41
  if not torch.cuda.is_available():
42
  print("⚠️ CUDA not available, using CPU-optimized model")
43
  self.setup_cpu_model()
44
  return
45
 
46
+ # Use a better model for instruction following
47
+ model_name = "microsoft/DialoGPT-medium" # Better for Q&A
48
+
49
+ try:
50
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
51
+ self.model = AutoModelForCausalLM.from_pretrained(
52
+ model_name,
53
+ torch_dtype=torch.float16,
54
+ device_map="auto"
55
+ )
56
+
57
+ if self.tokenizer.pad_token is None:
58
+ self.tokenizer.pad_token = self.tokenizer.eos_token
59
+
60
+ print("βœ… Enhanced Q&A model loaded successfully")
61
+
62
+ except Exception as e:
63
+ print(f"Falling back to Mistral: {e}")
64
+ self.setup_mistral_model()
65
+
66
+ except Exception as e:
67
+ print(f"❌ Error loading models: {e}")
68
+ self.setup_cpu_model()
69
+
70
+ def setup_mistral_model(self):
71
+ """Setup Mistral with better configuration"""
72
+ try:
73
  quantization_config = BitsAndBytesConfig(
74
  load_in_4bit=True,
75
  bnb_4bit_compute_dtype=torch.float16,
 
79
 
80
  model_name = "mistralai/Mistral-7B-Instruct-v0.1"
81
 
82
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 
 
 
 
 
83
  self.model = AutoModelForCausalLM.from_pretrained(
84
  model_name,
85
  quantization_config=quantization_config,
86
  device_map="auto",
87
+ torch_dtype=torch.float16
 
 
88
  )
89
 
90
+ if self.tokenizer.pad_token is None:
91
+ self.tokenizer.pad_token = self.tokenizer.eos_token
92
+
93
+ print("βœ… Mistral model loaded")
94
 
95
  except Exception as e:
96
+ print(f"❌ Mistral failed: {e}")
 
97
  self.setup_cpu_model()
98
 
99
  def setup_cpu_model(self):
100
  """Setup CPU-friendly model"""
101
  try:
102
+ model_name = "distilgpt2" # Lighter than GPT-2 medium
 
103
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
104
  self.model = AutoModelForCausalLM.from_pretrained(model_name)
105
 
 
108
 
109
  print("βœ… CPU model loaded")
110
  except Exception as e:
111
+ print(f"❌ All models failed: {e}")
112
  self.model = None
113
  self.tokenizer = None
 
114
 
115
  def detect_document_type(self, text: str) -> str:
116
+ """Enhanced document type detection"""
117
  text_lower = text.lower()
118
 
119
+ # More comprehensive keyword matching
120
+ resume_patterns = [
121
+ 'experience', 'skills', 'education', 'linkedin', 'email', 'phone',
122
+ 'work experience', 'employment', 'resume', 'cv', 'curriculum vitae',
123
+ 'internship', 'projects', 'achievements', 'career', 'profile'
124
+ ]
125
+
126
+ research_patterns = [
127
+ 'abstract', 'introduction', 'methodology', 'conclusion', 'references',
128
+ 'literature review', 'hypothesis', 'study', 'research', 'findings',
129
+ 'data analysis', 'results', 'discussion', 'bibliography'
130
+ ]
131
+
132
+ business_patterns = [
133
+ 'company', 'revenue', 'market', 'strategy', 'business', 'financial',
134
+ 'quarter', 'profit', 'sales', 'growth', 'investment', 'stakeholder',
135
+ 'operations', 'management', 'corporate', 'enterprise'
136
+ ]
137
+
138
+ technical_patterns = [
139
+ 'implementation', 'algorithm', 'system', 'technical', 'specification',
140
+ 'architecture', 'development', 'software', 'programming', 'api',
141
+ 'database', 'framework', 'deployment', 'infrastructure'
142
+ ]
143
+
144
+ # Count matches with higher weights for exact phrases
145
+ def count_matches(patterns, text):
146
+ score = 0
147
+ for pattern in patterns:
148
+ if pattern in text:
149
+ score += text.count(pattern)
150
+ return score
151
 
152
  scores = {
153
+ 'resume': count_matches(resume_patterns, text_lower),
154
+ 'research': count_matches(research_patterns, text_lower),
155
+ 'business': count_matches(business_patterns, text_lower),
156
+ 'technical': count_matches(technical_patterns, text_lower)
 
157
  }
158
 
159
+ max_score = max(scores.values())
160
+ if max_score > 3:
161
+ return max(scores, key=scores.get)
162
+ return 'general'
163
 
164
  def create_document_summary(self, text: str) -> str:
165
+ """Enhanced document summary creation"""
166
  try:
167
+ # Clean and prepare text
168
+ clean_text = re.sub(r'\s+', ' ', text).strip()
169
+ sentences = re.split(r'[.!?]+', clean_text)
170
+ sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
171
+
172
+ if not sentences:
173
+ return "Document contains basic information."
174
+
175
+ # Extract key information based on document type
176
+ if self.document_type == 'resume':
177
+ return self.extract_resume_summary(sentences)
178
+ elif self.document_type == 'research':
179
+ return self.extract_research_summary(sentences)
180
+ elif self.document_type == 'business':
181
+ return self.extract_business_summary(sentences)
182
+ else:
183
+ return self.extract_general_summary(sentences)
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  except Exception as e:
186
+ print(f"Summary creation error: {e}")
187
  return "Document summary not available."
188
 
189
def extract_resume_summary(self, sentences: List[str]) -> str:
    """Pick up to two resume-relevant sentences as a summary.

    Only the first 10 sentences are inspected. A sentence qualifies when
    it mentions a role keyword (engineer, developer, ...) or an
    experience keyword (years, experience, worked). Each qualifying
    sentence is used at most once, so a sentence that matches both
    keyword groups is not repeated in the output.

    Returns the selected sentences joined with ``'. '`` and terminated by
    a period, or a generic fallback line when nothing qualifies.
    """
    role_words = ['engineer', 'developer', 'manager', 'analyst', 'specialist']
    experience_words = ['years', 'experience', 'worked']
    key_info = []

    # Look for name, role, experience
    for sentence in sentences[:10]:  # Check first 10 sentences
        lower = sentence.lower()
        mentions_role = any(word in lower for word in role_words)
        mentions_experience = any(word in lower for word in experience_words)
        if (mentions_role or mentions_experience) and sentence not in key_info:
            key_info.append(sentence)
        if len(key_info) >= 2:
            break

    if key_info:
        return '. '.join(key_info[:2]) + '.'
    return "Resume of a professional with relevant experience and skills."
206
+
207
def extract_research_summary(self, sentences: List[str]) -> str:
    """Pick up to two sentences describing a research document.

    Sentences mentioning study/research/analysis/findings are kept when
    longer than 50 characters; sentences without those words that mention
    propose/method/approach are kept as a second-priority group. The
    first two of (findings group + method group) form the summary.

    Returns the joined sentences terminated by a period, or a generic
    fallback line when neither group has entries.
    """
    abstract_sentences = []
    intro_sentences = []

    for sentence in sentences:
        lower = sentence.lower()
        # NOTE(review): the source's indentation was lost; the method
        # check is assumed to be the alternative to the keyword check
        # (not to the length check) -- confirm against the real file.
        if any(word in lower for word in ['study', 'research', 'analysis', 'findings']):
            if len(sentence) > 50:  # Substantial sentences
                abstract_sentences.append(sentence)
        elif any(word in lower for word in ['propose', 'method', 'approach']):
            intro_sentences.append(sentence)

    summary_sentences = (abstract_sentences + intro_sentences)[:2]
    if summary_sentences:
        return '. '.join(summary_sentences) + '.'
    return "Research document with methodology and findings."
224
+
225
+ def extract_business_summary(self, sentences: List[str]) -> str:
226
+ """Extract business document summary"""
227
+ business_sentences = []
228
+
229
+ for sentence in sentences:
230
+ lower = sentence.lower()
231
+ if any(word in lower for word in ['company', 'business', 'market', 'strategy', 'revenue']):
232
+ if len(sentence) > 40:
233
+ business_sentences.append(sentence)
234
+
235
+ if business_sentences:
236
+ return '. '.join(business_sentences[:2]) + '.'
237
+ return "Business document containing strategic and operational information."
238
+
239
+ def extract_general_summary(self, sentences: List[str]) -> str:
240
+ """Extract general document summary"""
241
+ # Take the most informative sentences (longer ones with key terms)
242
+ scored_sentences = []
243
+
244
+ for sentence in sentences:
245
+ score = len(sentence.split()) # Word count as base score
246
+ if any(word in sentence.lower() for word in ['important', 'key', 'main', 'primary']):
247
+ score += 10
248
+ scored_sentences.append((sentence, score))
249
+
250
+ # Sort by score and take top sentences
251
+ scored_sentences.sort(key=lambda x: x[1], reverse=True)
252
+ top_sentences = [s[0] for s in scored_sentences[:2]]
253
+
254
+ if top_sentences:
255
+ return '. '.join(top_sentences) + '.'
256
+ return "Document contains relevant information and details."
257
+
258
  def extract_text_from_file(self, file_path: str) -> str:
259
+ """Enhanced text extraction with better error handling"""
260
  try:
261
  file_extension = os.path.splitext(file_path)[1].lower()
262
 
 
273
  return f"Error reading file: {str(e)}"
274
 
275
  def extract_from_pdf(self, file_path: str) -> str:
276
+ """Enhanced PDF extraction with better text cleaning"""
277
  text = ""
278
  try:
279
  with open(file_path, 'rb') as file:
 
281
  for page_num, page in enumerate(pdf_reader.pages):
282
  page_text = page.extract_text()
283
  if page_text.strip():
284
+ # Clean the text
285
+ page_text = re.sub(r'\s+', ' ', page_text)
286
+ text += f"{page_text}\n"
287
  except Exception as e:
288
  text = f"Error reading PDF: {str(e)}"
289
+ return text.strip()
290
 
291
  def extract_from_docx(self, file_path: str) -> str:
292
  """Enhanced DOCX extraction"""
 
295
  text = ""
296
  for paragraph in doc.paragraphs:
297
  if paragraph.text.strip():
298
+ text += paragraph.text.strip() + "\n"
299
+ return text.strip()
300
  except Exception as e:
301
  return f"Error reading DOCX: {str(e)}"
302
 
303
  def extract_from_txt(self, file_path: str) -> str:
304
+ """Enhanced TXT extraction"""
305
  encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
306
 
307
  for encoding in encodings:
308
  try:
309
  with open(file_path, 'r', encoding=encoding) as file:
310
+ content = file.read()
311
+ # Clean the content
312
+ content = re.sub(r'\s+', ' ', content)
313
+ return content.strip()
314
  except UnicodeDecodeError:
315
  continue
316
  except Exception as e:
 
318
 
319
  return "Error: Could not decode file with any supported encoding"
320
 
321
+ def enhanced_chunk_text(self, text: str) -> List[Dict]:
322
+ """Enhanced chunking strategy for better retrieval"""
323
  if not text.strip():
324
  return []
325
 
326
  chunks = []
 
327
 
328
+ # Split into sentences first
329
+ sentences = re.split(r'[.!?]+', text)
330
+ sentences = [s.strip() for s in sentences if len(s.strip()) > 15]
331
+
332
+ # Store sentences for fine-grained retrieval
333
+ self.sentences = sentences
334
+
335
+ # Create overlapping chunks
336
+ chunk_size = 3 # sentences per chunk
337
+ overlap = 1 # sentence overlap
338
+
339
+ for i in range(0, len(sentences), chunk_size - overlap):
340
+ chunk_sentences = sentences[i:i + chunk_size]
341
+ if chunk_sentences:
342
+ chunk_text = '. '.join(chunk_sentences)
343
+ if len(chunk_text.strip()) > 20:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  chunks.append({
345
+ 'text': chunk_text + '.',
346
+ 'sentence_indices': list(range(i, min(i + chunk_size, len(sentences)))),
347
+ 'doc_type': self.document_type
348
  })
 
 
 
 
 
 
 
 
 
349
 
350
  return chunks
351
 
352
  def process_documents(self, files) -> str:
353
+ """Enhanced document processing"""
354
  if not files:
355
  return "❌ No files uploaded!"
356
 
 
358
  all_text = ""
359
  processed_files = []
360
 
 
361
  for file in files:
362
  if file is None:
363
  continue
364
 
365
  file_text = self.extract_text_from_file(file.name)
366
  if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
367
+ all_text += f"\n{file_text}"
368
  processed_files.append(os.path.basename(file.name))
369
  else:
370
  return f"❌ {file_text}"
 
372
  if not all_text.strip():
373
  return "❌ No text extracted from files!"
374
 
375
+ # Store and analyze
376
  self.raw_text = all_text
 
 
377
  self.document_type = self.detect_document_type(all_text)
 
 
378
  self.document_summary = self.create_document_summary(all_text)
379
 
380
+ # Enhanced chunking
381
+ chunk_data = self.enhanced_chunk_text(all_text)
382
 
383
  if not chunk_data:
384
  return "❌ No valid text chunks created!"
 
386
  self.documents = [chunk['text'] for chunk in chunk_data]
387
  self.document_metadata = chunk_data
388
 
389
+ # Create embeddings for chunks
390
  print(f"πŸ“„ Creating embeddings for {len(self.documents)} chunks...")
391
+ embeddings = self.embedder.encode(self.documents, show_progress_bar=False)
392
+
393
+ # Also create sentence-level embeddings for fine-grained search
394
+ if self.sentences:
395
+ print(f"πŸ“ Creating sentence embeddings for {len(self.sentences)} sentences...")
396
+ self.sentence_embeddings = self.embedder.encode(self.sentences, show_progress_bar=False)
397
 
398
  # Build FAISS index
399
  dimension = embeddings.shape[1]
400
  self.index = faiss.IndexFlatIP(dimension)
401
 
402
+ # Normalize for cosine similarity
403
  faiss.normalize_L2(embeddings)
404
  self.index.add(embeddings.astype('float32'))
405
 
 
408
  return f"βœ… Successfully processed {len(processed_files)} files:\n" + \
409
  f"πŸ“„ Files: {', '.join(processed_files)}\n" + \
410
  f"πŸ“Š Document Type: {self.document_type.title()}\n" + \
411
+ f"πŸ” Created {len(self.documents)} chunks and {len(self.sentences)} sentences\n" + \
412
+ f"πŸ“ Summary: {self.document_summary}\n" + \
413
+ f"πŸš€ Ready for enhanced Q&A!"
414
 
415
  except Exception as e:
416
  return f"❌ Error processing documents: {str(e)}"
417
 
418
+ def find_relevant_content(self, query: str, k: int = 5) -> Tuple[str, List[str]]:
419
+ """Enhanced content retrieval using multiple strategies"""
420
  if not self.is_indexed:
421
  return "", []
422
 
423
  try:
424
+ query_lower = query.lower()
425
+ relevant_content = []
426
+
427
+ # Strategy 1: Semantic search using embeddings
428
  query_embedding = self.embedder.encode([query])
429
  faiss.normalize_L2(query_embedding)
430
 
431
+ scores, indices = self.index.search(query_embedding.astype('float32'), min(k, len(self.documents)))
 
432
 
433
+ semantic_matches = []
434
+ for i, idx in enumerate(indices[0]):
435
+ if idx < len(self.documents) and scores[0][i] > 0.2: # Relevance threshold
436
+ semantic_matches.append(self.documents[idx])
437
 
438
+ # Strategy 2: Keyword matching in sentences
439
+ query_words = set(query_lower.split())
440
+ keyword_matches = []
441
 
442
+ for sentence in self.sentences:
443
+ sentence_words = set(sentence.lower().split())
444
+ overlap = len(query_words.intersection(sentence_words))
445
+ if overlap >= 2: # At least 2 word overlap
446
+ keyword_matches.append(sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
 
448
+ # Strategy 3: Pattern matching for specific question types
449
+ pattern_matches = []
450
 
451
+ if any(word in query_lower for word in ['name', 'who']):
452
+ # Look for names and identities
453
+ for sentence in self.sentences:
454
+ if re.search(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', sentence): # Name pattern
455
+ pattern_matches.append(sentence)
456
 
457
+ if any(word in query_lower for word in ['experience', 'work', 'job']):
458
+ # Look for experience-related content
459
+ for sentence in self.sentences:
460
+ if any(word in sentence.lower() for word in ['year', 'experience', 'work', 'company', 'role']):
461
+ pattern_matches.append(sentence)
462
+
463
+ if any(word in query_lower for word in ['skill', 'technology', 'tech']):
464
+ # Look for skills and technologies
465
+ for sentence in self.sentences:
466
+ if any(word in sentence.lower() for word in ['skill', 'technology', 'programming', 'software']):
467
+ pattern_matches.append(sentence)
468
+
469
+ # Combine all strategies
470
+ all_matches = list(set(semantic_matches + keyword_matches + pattern_matches))
471
+
472
+ # Sort by relevance (prefer shorter, more specific sentences)
473
+ all_matches.sort(key=lambda x: len(x.split()))
474
+
475
+ return '\n'.join(all_matches[:k]), all_matches[:k]
476
 
477
  except Exception as e:
478
+ print(f"Error in content retrieval: {e}")
479
  return "", []
480
 
481
+ def generate_direct_answer(self, query: str, context: str) -> str:
482
+ """Generate direct, relevant answers"""
483
  if not context:
484
+ return "No relevant information found in the document."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
 
 
 
 
486
  query_lower = query.lower()
487
+ context_sentences = [s.strip() for s in context.split('\n') if s.strip()]
488
+
489
+ # Handle specific question types with direct extraction
490
+ if any(word in query_lower for word in ['name', 'who is']):
491
+ # Extract names
492
+ for sentence in context_sentences:
493
+ names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', sentence)
494
+ if names:
495
+ return f"The person mentioned is {names[0]}."
496
+
497
+ if any(word in query_lower for word in ['experience', 'years']):
498
+ # Extract experience information
499
+ for sentence in context_sentences:
500
+ exp_match = re.search(r'(\d+)\s*(?:years?|yr)', sentence.lower())
501
+ if exp_match:
502
+ return f"The experience mentioned is {exp_match.group(1)} years. {sentence}"
503
+
504
+ if any(word in query_lower for word in ['skill', 'technology']):
505
+ # Extract skills
506
+ skills = []
507
+ for sentence in context_sentences:
508
+ # Look for programming languages, frameworks, etc.
509
+ tech_words = ['python', 'java', 'javascript', 'react', 'node', 'sql', 'aws', 'docker']
510
+ found_tech = [word for word in tech_words if word in sentence.lower()]
511
+ if found_tech:
512
+ skills.extend(found_tech)
513
+
514
+ if skills:
515
+ return f"Technologies/skills mentioned include: {', '.join(set(skills))}. {context_sentences[0] if context_sentences else ''}"
516
+
517
+ if any(word in query_lower for word in ['education', 'degree', 'university', 'college']):
518
+ # Extract education information
519
+ for sentence in context_sentences:
520
+ if any(word in sentence.lower() for word in ['degree', 'university', 'college', 'bachelor', 'master']):
521
+ return sentence
522
+
523
+ if any(word in query_lower for word in ['summary', 'about', 'overview']):
524
+ return self.document_summary
525
+
526
+ # For other questions, return the most relevant sentence
527
+ if context_sentences:
528
+ # Score sentences by query word overlap
529
+ query_words = set(query_lower.split())
530
+ scored_sentences = []
531
+
532
+ for sentence in context_sentences:
533
+ sentence_words = set(sentence.lower().split())
534
+ overlap = len(query_words.intersection(sentence_words))
535
  scored_sentences.append((sentence, overlap))
536
+
537
+ # Sort by overlap and return best match
538
+ scored_sentences.sort(key=lambda x: x[1], reverse=True)
539
+
540
+ if scored_sentences and scored_sentences[0][1] > 0:
541
+ return scored_sentences[0][0]
542
+ else:
543
+ return context_sentences[0] # Return first relevant sentence
544
 
545
+ return "I found relevant content but couldn't extract a specific answer."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
 
547
  def answer_question(self, query: str) -> str:
548
+ """Main question answering function with enhanced accuracy"""
549
  if not query.strip():
550
  return "❓ Please ask a question!"
551
 
 
553
  return "πŸ“ Please upload and process documents first!"
554
 
555
  try:
556
+ # Handle summary requests directly
557
  query_lower = query.lower()
558
+ if query_lower in ['summary', 'summarize', 'about', 'overview']:
559
+ return f"πŸ“„ **Document Summary:**\n\n{self.document_summary}"
560
 
561
+ # Find relevant content using enhanced retrieval
562
+ context, matches = self.find_relevant_content(query, k=5)
563
 
564
  if not context:
565
+ return "πŸ” No relevant information found. Try rephrasing your question or asking about different aspects of the document."
566
+
567
+ # Generate direct answer
568
+ answer = self.generate_direct_answer(query, context)
569
 
570
+ # Add context if answer is too brief
571
+ if len(answer) < 50 and matches:
572
+ answer += f"\n\n**Additional context:** {matches[0][:200]}..."
573
 
574
+ return answer
575
 
576
  except Exception as e:
577
  return f"❌ Error processing question: {str(e)}"
578
 
579
# Initialize the enhanced system.
# NOTE: this runs at import time; SmartDocumentRAG.__init__ loads the
# embedding model and calls setup_llm(), so importing this module is slow.
print("Initializing Enhanced Smart RAG System...")
rag_system = SmartDocumentRAG()
582
 
583
+ # Create the interface
584
  def create_interface():
585
+ with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
586
  gr.Markdown("""
587
+ # 🧠 Enhanced Document Q&A System
588
 
589
+ **Improved for Better Accuracy & Relevance!**
590
 
591
+ **New Features:**
592
+ - 🎯 Multi-strategy content retrieval
593
+ - πŸ“Š Direct answer extraction
594
+ - πŸ” Enhanced keyword and pattern matching
595
+ - πŸ“š Better handling of resumes, research papers, and business docs
596
  """)
597
 
598
  with gr.Tab("πŸ“€ Upload & Process"):
 
608
 
609
  with gr.Column():
610
  process_status = gr.Textbox(
611
+ label="πŸ“‹ Processing Status & Analysis",
612
  lines=10,
613
  interactive=False
614
  )
 
619
  outputs=[process_status]
620
  )
621
 
622
+ with gr.Tab("❓ Enhanced Q&A"):
623
  with gr.Row():
624
  with gr.Column():
625
  question_input = gr.Textbox(
626
+ label="πŸ€” Ask Your Question",
627
+ placeholder="What is the person's name? / How many years of experience? / What are their skills?",
628
  lines=3
629
  )
630
 
631
  with gr.Row():
632
+ ask_btn = gr.Button("🧠 Get Answer", variant="primary")
633
  summary_btn = gr.Button("πŸ“Š Get Summary", variant="secondary")
634
 
635
  with gr.Column():
636
  answer_output = gr.Textbox(
637
+ label="πŸ’‘ Enhanced Answer",
638
  lines=8,
639
  interactive=False
640
  )
 
651
  outputs=[answer_output]
652
  )
653
 
 
654
  gr.Markdown("""
655
+ ### πŸ’‘ Try These Specific Questions:
656
+
657
+ **For Resumes:**
658
+ - "What is the person's name?"
659
+ - "How many years of experience do they have?"
660
+ - "What are their technical skills?"
661
+ - "What is their educational background?"
662
+ - "What companies have they worked for?"
663
+
664
+ **For Any Document:**
665
+ - "Summarize this document"
666
+ - "What is the main topic?"
667
+ - "List the key points"
 
 
 
668
  """)
669
 
670
+ with gr.Tab("πŸ”§ System Info"):
671
  gr.Markdown("""
672
+ ### πŸš€ Enhanced Features:
673
+
674
+ **Better Retrieval:**
675
+ - Semantic search using embeddings
676
+ - Keyword matching with context
677
+ - Pattern recognition for names, dates, skills
678
+ - Multi-level chunking (sentences + paragraphs)
679
+
680
+ **Improved Answers:**
681
+ - Direct information extraction
682
+ - Question-type specific processing
683
+ - Context-aware responses
684
+ - Relevance scoring and filtering
685
+
686
+ **Document Types:**
687
+ - βœ… Resumes (name, experience, skills extraction)
688
+ - βœ… Research papers (methodology, findings)
689
+ - βœ… Business documents (strategy, metrics)
690
+ - βœ… Technical documentation (specifications)
691
  """)
692
 
693
  return demo
694
 
695
+ # Launch the app
696
  if __name__ == "__main__":
697
  demo = create_interface()
698
  demo.launch(