pradeepsengarr committed
Commit dcc21e5 · verified · 1 Parent(s): c8716d2

Update app.py

Files changed (1)
  1. app.py +90 -43

app.py CHANGED
@@ -361,33 +361,38 @@ class SmartDocumentRAG:
 
         return "Error: Could not decode file"
 
-    def enhanced_chunk_text(self, text: str) -> List[Dict]:
-        """Enhanced chunking with better overlap"""
-        if not text.strip():
-            return []
+    def enhanced_chunk_text(self, text: str, max_chunk_size: int = 300, overlap: int = 50) -> list[str]:
+        """
+        Splits text into smaller overlapping chunks for better semantic search.
+
+        Args:
+            text (str): The full text to chunk.
+            max_chunk_size (int): Maximum tokens/words per chunk.
+            overlap (int): Number of words overlapping between consecutive chunks.
+
+        Returns:
+            list[str]: List of text chunks.
+        """
+        import re
+
+        # Clean and normalize whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
 
+        words = text.split()
         chunks = []
-
-        # Split into sentences
-        sentences = re.split(r'[.!?]+', text)
-        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
-        self.sentences = sentences
-
-        # Create overlapping chunks
-        chunk_size = 4  # sentences per chunk
-        overlap = 2  # sentence overlap
-
-        for i in range(0, len(sentences), chunk_size - overlap):
-            chunk_sentences = sentences[i:i + chunk_size]
-            if chunk_sentences:
-                chunk_text = '. '.join(chunk_sentences) + '.'
-                chunks.append({
-                    'text': chunk_text,
-                    'sentence_indices': list(range(i, min(i + chunk_size, len(sentences)))),
-                    'doc_type': self.document_type
-                })
-
+        start = 0
+        text_len = len(words)
+
+        while start < text_len:
+            end = min(start + max_chunk_size, text_len)
+            chunk_words = words[start:end]
+            chunk = ' '.join(chunk_words)
+            chunks.append(chunk)
+            # Move start forward by chunk size minus overlap to create overlap
+            start += max_chunk_size - overlap
+
         return chunks
+
 
     def process_documents(self, files) -> str:
         """Enhanced document processing"""
@@ -451,7 +456,7 @@ class SmartDocumentRAG:
         return f"❌ Error processing documents: {str(e)}"
 
     def find_relevant_content(self, query: str, k: int = 3) -> str:
-        """Improved content retrieval"""
+        """Improved content retrieval with stricter relevance filter"""
         if not self.is_indexed:
             return ""
 
@@ -464,17 +469,19 @@ class SmartDocumentRAG:
 
             relevant_chunks = []
             for i, idx in enumerate(indices[0]):
-                if idx < len(self.documents) and scores[0][i] > 0.1:  # Lower threshold
+                score = scores[0][i]
+                if idx < len(self.documents) and score > 0.4:  # ✅ stricter similarity filter
                     relevant_chunks.append(self.documents[idx])
 
             return ' '.join(relevant_chunks)
-
+
         except Exception as e:
             print(f"Error in content retrieval: {e}")
             return ""
+
 
     def answer_question(self, query: str) -> str:
-        """Enhanced question answering with better model usage"""
+        """Enhanced question answering with better model usage and hallucination reduction."""
         if not query.strip():
             return "❓ Please ask a question!"
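Whether 0.4 is a meaningful cutoff depends on what the index returns: the comparison reads like a cosine-style similarity, which holds for an inner-product FAISS index over L2-normalized embeddings. A minimal sketch under that assumption (toy random vectors; the 384-dim size matching a MiniLM-style embedder is illustrative, not taken from app.py):

```python
import numpy as np
import faiss  # pip install faiss-cpu

dim = 384
rng = np.random.default_rng(0)

docs = rng.normal(size=(10, dim)).astype('float32')
faiss.normalize_L2(docs)  # unit vectors: inner product == cosine similarity

index = faiss.IndexFlatIP(dim)
index.add(docs)

query = rng.normal(size=(1, dim)).astype('float32')
faiss.normalize_L2(query)

scores, indices = index.search(query, k=3)
# Same gate as find_relevant_content: keep only hits scoring above 0.4
relevant = [int(idx) for i, idx in enumerate(indices[0]) if scores[0][i] > 0.4]
print(scores[0], relevant)
```

Raising the threshold from 0.1 to 0.4 trades recall for precision: off-topic chunks are dropped, but on short or heavily paraphrased queries all k hits can fall below 0.4, leaving the context empty.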
 
@@ -484,48 +491,61 @@ class SmartDocumentRAG:
         try:
             query_lower = query.lower()
 
-            # Handle summary requests
+            # Handle summary requests explicitly
             if any(word in query_lower for word in ['summary', 'summarize', 'about', 'overview']):
                 return f"📄 **Document Summary:**\n\n{self.document_summary}"
 
-            # Get relevant content
+            # Retrieve relevant content chunks via semantic search
             context = self.find_relevant_content(query, k=3)
 
             if not context:
                 return "🔍 No relevant information found. Try rephrasing your question."
 
-            # Use appropriate model for answering
+            # If no QA pipeline, fall back to direct extraction
             if self.qa_pipeline is None:
                 return self.extract_direct_answer(query, context)
 
             try:
-                if self.model_type == "distilbert-qa" or self.model_type == "fallback":
-                    # Use Q&A pipeline
+                if self.model_type in ["distilbert-qa", "fallback"]:
+                    # Use extractive Q&A pipeline
                     result = self.qa_pipeline(question=query, context=context)
-                    answer = result['answer']
-                    confidence = result['score']
+                    answer = result.get('answer', '').strip()
+                    confidence = result.get('score', 0)
 
-                    if confidence > 0.1:  # Reasonable confidence
+                    if confidence > 0.1 and answer:
                         return f"**Answer:** {answer}\n\n**Context:** {context[:200]}..."
                     else:
                         return self.extract_direct_answer(query, context)
-
+
                 elif self.model_type == "flan-t5":
-                    # Use text generation pipeline
-                    prompt = f"Answer the question based on the context.\nContext: {context}\nQuestion: {query}\nAnswer:"
-                    result = self.qa_pipeline(prompt, max_length=200, num_return_sequences=1)
-                    answer = result[0]['generated_text'].replace(prompt, '').strip()
-                    return f"**Answer:** {answer}"
+                    # Use generative model with improved prompt to reduce hallucination
+                    prompt = (
+                        f"Answer concisely and strictly based on the following context.\n\n"
+                        f"Context:\n{context}\n\n"
+                        f"Question:\n{query}\n\n"
+                        f"If the answer is not contained in the context, reply with 'Not found in document.'\n"
+                        f"Answer:"
+                    )
+                    result = self.qa_pipeline(prompt, max_length=256, num_return_sequences=1)
+                    generated_text = result[0].get('generated_text', '')
+                    answer = generated_text.replace(prompt, '').strip()
 
+                    if answer.lower() in ["not found in document.", "no answer", "unknown", ""]:
+                        return "🔍 Sorry, the answer was not found in the documents."
+                    else:
+                        return f"**Answer:** {answer}"
+
                 else:
+                    # Default fallback extraction
                     return self.extract_direct_answer(query, context)
-
+
             except Exception as e:
                 print(f"Model inference error: {e}")
                 return self.extract_direct_answer(query, context)
 
         except Exception as e:
             return f"❌ Error processing question: {str(e)}"
+
 
     def extract_direct_answer(self, query: str, context: str) -> str:
         """Direct answer extraction as fallback"""
@@ -570,6 +590,33 @@ class SmartDocumentRAG:
 
         return "I found relevant content but couldn't extract a specific answer."
 
+    def clean_text(self, text: str) -> str:
+        """
+        Clean and normalize raw text by:
+        - Removing excessive whitespace
+        - Fixing merged words (camel case separation)
+        - Removing unwanted characters (optional)
+        - Lowercasing or preserving case (optional)
+        """
+        import re
+
+        # Replace multiple whitespace/newlines/tabs with single space
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        # Fix merged words like 'wordAnotherWord' -> 'word Another Word'
+        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
+
+        # Optional: remove special characters except basic punctuation
+        # text = re.sub(r'[^a-zA-Z0-9,.!?;:\'\"()\-\s]', '', text)
+
+        return text
+
+
 # Initialize the system
 print("Initializing Enhanced Smart RAG System...")
 rag_system = SmartDocumentRAG()
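The lowercase-to-uppercase regex in the new clean_text is the piece that most benefits from a quick test, since it repairs words merged during PDF extraction. A standalone sketch of the same two substitutions:

```python
import re

def clean_text(text: str) -> str:
    text = re.sub(r'\s+', ' ', text).strip()          # collapse runs of whitespace
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # split 'wordWord' boundaries
    return text

print(clean_text("Quarterly  ReportRevenue grew\nsharply"))
# 'Quarterly Report Revenue grew sharply'
```

The pattern only fires on a lowercase letter followed directly by an uppercase one, so all-caps runs like 'PDFFile' and digit/letter boundaries stay merged.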
 