pradeepsengarr committed on
Commit
c8716d2
·
verified ·
1 Parent(s): 253bfed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +290 -335
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
4
  from sentence_transformers import SentenceTransformer
5
  import faiss
6
  import numpy as np
@@ -12,16 +12,18 @@ import re
12
  from typing import List, Optional, Dict, Tuple
13
  import json
14
  from collections import Counter
 
 
15
 
16
  class SmartDocumentRAG:
17
  def __init__(self):
18
  print("🚀 Initializing Enhanced Smart RAG System...")
19
 
20
  # Initialize better embedding model
21
- self.embedder = SentenceTransformer('all-mpnet-base-v2') # Better than MiniLM
22
- print("✅ Enhanced embedding model loaded")
23
 
24
- # Initialize quantized LLM
25
  self.setup_llm()
26
 
27
  # Document storage
@@ -32,121 +34,157 @@ class SmartDocumentRAG:
32
  self.raw_text = ""
33
  self.document_type = "general"
34
  self.document_summary = ""
35
- self.sentence_embeddings = [] # Store sentence-level embeddings
36
- self.sentences = [] # Store individual sentences
37
 
38
  def setup_llm(self):
39
- """Setup optimized model for better text generation"""
40
  try:
41
- if not torch.cuda.is_available():
42
- print("⚠️ CUDA not available, using CPU-optimized model")
 
 
 
 
 
43
  self.setup_cpu_model()
44
- return
45
-
46
- # Use a better model for instruction following
47
- model_name = "microsoft/DialoGPT-medium" # Better for Q&A
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  try:
 
 
 
 
50
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
51
  self.model = AutoModelForCausalLM.from_pretrained(
52
  model_name,
 
 
53
  torch_dtype=torch.float16,
54
- device_map="auto"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  )
56
 
57
  if self.tokenizer.pad_token is None:
58
  self.tokenizer.pad_token = self.tokenizer.eos_token
59
 
60
- print("✅ Enhanced Q&A model loaded successfully")
61
-
62
- except Exception as e:
63
- print(f"Falling back to Mistral: {e}")
64
- self.setup_mistral_model()
65
 
66
  except Exception as e:
67
- print(f"❌ Error loading models: {e}")
68
  self.setup_cpu_model()
69
 
70
- def setup_mistral_model(self):
71
- """Setup Mistral with better configuration"""
72
  try:
73
- quantization_config = BitsAndBytesConfig(
74
- load_in_4bit=True,
75
- bnb_4bit_compute_dtype=torch.float16,
76
- bnb_4bit_use_double_quant=True,
77
- bnb_4bit_quant_type="nf4"
 
 
 
78
  )
79
-
80
- model_name = "mistralai/Mistral-7B-Instruct-v0.1"
81
-
82
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
83
- self.model = AutoModelForCausalLM.from_pretrained(
84
- model_name,
85
- quantization_config=quantization_config,
86
- device_map="auto",
87
- torch_dtype=torch.float16
88
- )
89
-
90
- if self.tokenizer.pad_token is None:
91
- self.tokenizer.pad_token = self.tokenizer.eos_token
92
-
93
- print("✅ Mistral model loaded")
94
 
95
  except Exception as e:
96
- print(f"❌ Mistral failed: {e}")
97
- self.setup_cpu_model()
98
 
99
- def setup_cpu_model(self):
100
- """Setup CPU-friendly model"""
101
  try:
102
- model_name = "distilgpt2" # Lighter than GPT-2 medium
103
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
104
- self.model = AutoModelForCausalLM.from_pretrained(model_name)
105
-
106
- if self.tokenizer.pad_token is None:
107
- self.tokenizer.pad_token = self.tokenizer.eos_token
108
-
109
- print("✅ CPU model loaded")
110
  except Exception as e:
111
  print(f"❌ All models failed: {e}")
112
- self.model = None
113
- self.tokenizer = None
114
 
115
  def detect_document_type(self, text: str) -> str:
116
  """Enhanced document type detection"""
117
  text_lower = text.lower()
118
 
119
- # More comprehensive keyword matching
120
  resume_patterns = [
121
  'experience', 'skills', 'education', 'linkedin', 'email', 'phone',
122
  'work experience', 'employment', 'resume', 'cv', 'curriculum vitae',
123
- 'internship', 'projects', 'achievements', 'career', 'profile'
124
  ]
125
 
126
  research_patterns = [
127
  'abstract', 'introduction', 'methodology', 'conclusion', 'references',
128
  'literature review', 'hypothesis', 'study', 'research', 'findings',
129
- 'data analysis', 'results', 'discussion', 'bibliography'
130
  ]
131
 
132
  business_patterns = [
133
  'company', 'revenue', 'market', 'strategy', 'business', 'financial',
134
  'quarter', 'profit', 'sales', 'growth', 'investment', 'stakeholder',
135
- 'operations', 'management', 'corporate', 'enterprise'
136
  ]
137
 
138
  technical_patterns = [
139
  'implementation', 'algorithm', 'system', 'technical', 'specification',
140
  'architecture', 'development', 'software', 'programming', 'api',
141
- 'database', 'framework', 'deployment', 'infrastructure'
142
  ]
143
 
144
- # Count matches with higher weights for exact phrases
145
  def count_matches(patterns, text):
146
  score = 0
147
  for pattern in patterns:
148
- if pattern in text:
149
- score += text.count(pattern)
150
  return score
151
 
152
  scores = {
@@ -157,24 +195,23 @@ class SmartDocumentRAG:
157
  }
158
 
159
  max_score = max(scores.values())
160
- if max_score > 3:
161
  return max(scores, key=scores.get)
162
  return 'general'
163
 
164
  def create_document_summary(self, text: str) -> str:
165
  """Enhanced document summary creation"""
166
  try:
167
- # Clean and prepare text
168
  clean_text = re.sub(r'\s+', ' ', text).strip()
169
  sentences = re.split(r'[.!?]+', clean_text)
170
- sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
171
 
172
  if not sentences:
173
  return "Document contains basic information."
174
 
175
- # Extract key information based on document type
176
  if self.document_type == 'resume':
177
- return self.extract_resume_summary(sentences)
178
  elif self.document_type == 'research':
179
  return self.extract_research_summary(sentences)
180
  elif self.document_type == 'business':
@@ -186,77 +223,85 @@ class SmartDocumentRAG:
186
  print(f"Summary creation error: {e}")
187
  return "Document summary not available."
188
 
189
- def extract_resume_summary(self, sentences: List[str]) -> str:
190
- """Extract resume-specific summary"""
191
- key_info = []
192
-
193
- # Look for name, role, experience
194
- for sentence in sentences[:10]: # Check first 10 sentences
195
- lower = sentence.lower()
196
- if any(word in lower for word in ['engineer', 'developer', 'manager', 'analyst', 'specialist']):
197
- key_info.append(sentence)
198
- if any(word in lower for word in ['years', 'experience', 'worked']):
199
- key_info.append(sentence)
200
- if len(key_info) >= 2:
201
- break
202
-
203
- if key_info:
204
- return '. '.join(key_info[:2]) + '.'
205
- return "Resume of a professional with relevant experience and skills."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  def extract_research_summary(self, sentences: List[str]) -> str:
208
  """Extract research paper summary"""
209
- abstract_sentences = []
210
- intro_sentences = []
211
-
212
- for sentence in sentences:
213
- lower = sentence.lower()
214
- if any(word in lower for word in ['study', 'research', 'analysis', 'findings']):
215
- if len(sentence) > 50: # Substantial sentences
216
- abstract_sentences.append(sentence)
217
- elif any(word in lower for word in ['propose', 'method', 'approach']):
218
- intro_sentences.append(sentence)
219
-
220
- summary_sentences = (abstract_sentences + intro_sentences)[:2]
221
- if summary_sentences:
222
- return '. '.join(summary_sentences) + '.'
223
- return "Research document with methodology and findings."
224
 
225
  def extract_business_summary(self, sentences: List[str]) -> str:
226
  """Extract business document summary"""
227
- business_sentences = []
228
-
229
- for sentence in sentences:
230
- lower = sentence.lower()
231
- if any(word in lower for word in ['company', 'business', 'market', 'strategy', 'revenue']):
232
- if len(sentence) > 40:
233
- business_sentences.append(sentence)
234
 
235
- if business_sentences:
236
- return '. '.join(business_sentences[:2]) + '.'
237
- return "Business document containing strategic and operational information."
238
 
239
  def extract_general_summary(self, sentences: List[str]) -> str:
240
  """Extract general document summary"""
241
- # Take the most informative sentences (longer ones with key terms)
242
- scored_sentences = []
243
-
244
- for sentence in sentences:
245
- score = len(sentence.split()) # Word count as base score
246
- if any(word in sentence.lower() for word in ['important', 'key', 'main', 'primary']):
247
- score += 10
248
- scored_sentences.append((sentence, score))
249
-
250
- # Sort by score and take top sentences
251
- scored_sentences.sort(key=lambda x: x[1], reverse=True)
252
- top_sentences = [s[0] for s in scored_sentences[:2]]
253
-
254
- if top_sentences:
255
- return '. '.join(top_sentences) + '.'
256
- return "Document contains relevant information and details."
257
 
258
  def extract_text_from_file(self, file_path: str) -> str:
259
- """Enhanced text extraction with better error handling"""
260
  try:
261
  file_extension = os.path.splitext(file_path)[1].lower()
262
 
@@ -273,16 +318,17 @@ class SmartDocumentRAG:
273
  return f"Error reading file: {str(e)}"
274
 
275
  def extract_from_pdf(self, file_path: str) -> str:
276
- """Enhanced PDF extraction with better text cleaning"""
277
  text = ""
278
  try:
279
  with open(file_path, 'rb') as file:
280
  pdf_reader = PyPDF2.PdfReader(file)
281
- for page_num, page in enumerate(pdf_reader.pages):
282
  page_text = page.extract_text()
283
  if page_text.strip():
284
- # Clean the text
285
  page_text = re.sub(r'\s+', ' ', page_text)
 
286
  text += f"{page_text}\n"
287
  except Exception as e:
288
  text = f"Error reading PDF: {str(e)}"
@@ -307,45 +353,39 @@ class SmartDocumentRAG:
307
  for encoding in encodings:
308
  try:
309
  with open(file_path, 'r', encoding=encoding) as file:
310
- content = file.read()
311
- # Clean the content
312
- content = re.sub(r'\s+', ' ', content)
313
- return content.strip()
314
  except UnicodeDecodeError:
315
  continue
316
  except Exception as e:
317
  return f"Error reading TXT: {str(e)}"
318
 
319
- return "Error: Could not decode file with any supported encoding"
320
 
321
  def enhanced_chunk_text(self, text: str) -> List[Dict]:
322
- """Enhanced chunking strategy for better retrieval"""
323
  if not text.strip():
324
  return []
325
 
326
  chunks = []
327
 
328
- # Split into sentences first
329
  sentences = re.split(r'[.!?]+', text)
330
- sentences = [s.strip() for s in sentences if len(s.strip()) > 15]
331
-
332
- # Store sentences for fine-grained retrieval
333
  self.sentences = sentences
334
 
335
  # Create overlapping chunks
336
- chunk_size = 3 # sentences per chunk
337
- overlap = 1 # sentence overlap
338
 
339
  for i in range(0, len(sentences), chunk_size - overlap):
340
  chunk_sentences = sentences[i:i + chunk_size]
341
  if chunk_sentences:
342
- chunk_text = '. '.join(chunk_sentences)
343
- if len(chunk_text.strip()) > 20:
344
- chunks.append({
345
- 'text': chunk_text + '.',
346
- 'sentence_indices': list(range(i, min(i + chunk_size, len(sentences)))),
347
- 'doc_type': self.document_type
348
- })
349
 
350
  return chunks
351
 
@@ -386,15 +426,10 @@ class SmartDocumentRAG:
386
  self.documents = [chunk['text'] for chunk in chunk_data]
387
  self.document_metadata = chunk_data
388
 
389
- # Create embeddings for chunks
390
  print(f"📄 Creating embeddings for {len(self.documents)} chunks...")
391
  embeddings = self.embedder.encode(self.documents, show_progress_bar=False)
392
 
393
- # Also create sentence-level embeddings for fine-grained search
394
- if self.sentences:
395
- print(f"πŸ“ Creating sentence embeddings for {len(self.sentences)} sentences...")
396
- self.sentence_embeddings = self.embedder.encode(self.sentences, show_progress_bar=False)
397
-
398
  # Build FAISS index
399
  dimension = embeddings.shape[1]
400
  self.index = faiss.IndexFlatIP(dimension)
@@ -408,144 +443,38 @@ class SmartDocumentRAG:
408
  return f"✅ Successfully processed {len(processed_files)} files:\n" + \
409
  f"📄 Files: {', '.join(processed_files)}\n" + \
410
  f"📊 Document Type: {self.document_type.title()}\n" + \
411
- f"πŸ” Created {len(self.documents)} chunks and {len(self.sentences)} sentences\n" + \
412
  f"📝 Summary: {self.document_summary}\n" + \
413
- f"🚀 Ready for enhanced Q&A!"
414
 
415
  except Exception as e:
416
  return f"❌ Error processing documents: {str(e)}"
417
 
418
- def find_relevant_content(self, query: str, k: int = 5) -> Tuple[str, List[str]]:
419
- """Enhanced content retrieval using multiple strategies"""
420
  if not self.is_indexed:
421
- return "", []
422
 
423
  try:
424
- query_lower = query.lower()
425
- relevant_content = []
426
-
427
- # Strategy 1: Semantic search using embeddings
428
  query_embedding = self.embedder.encode([query])
429
  faiss.normalize_L2(query_embedding)
430
 
431
  scores, indices = self.index.search(query_embedding.astype('float32'), min(k, len(self.documents)))
432
 
433
- semantic_matches = []
434
  for i, idx in enumerate(indices[0]):
435
- if idx < len(self.documents) and scores[0][i] > 0.2: # Relevance threshold
436
- semantic_matches.append(self.documents[idx])
437
-
438
- # Strategy 2: Keyword matching in sentences
439
- query_words = set(query_lower.split())
440
- keyword_matches = []
441
-
442
- for sentence in self.sentences:
443
- sentence_words = set(sentence.lower().split())
444
- overlap = len(query_words.intersection(sentence_words))
445
- if overlap >= 2: # At least 2 word overlap
446
- keyword_matches.append(sentence)
447
-
448
- # Strategy 3: Pattern matching for specific question types
449
- pattern_matches = []
450
 
451
- if any(word in query_lower for word in ['name', 'who']):
452
- # Look for names and identities
453
- for sentence in self.sentences:
454
- if re.search(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', sentence): # Name pattern
455
- pattern_matches.append(sentence)
456
-
457
- if any(word in query_lower for word in ['experience', 'work', 'job']):
458
- # Look for experience-related content
459
- for sentence in self.sentences:
460
- if any(word in sentence.lower() for word in ['year', 'experience', 'work', 'company', 'role']):
461
- pattern_matches.append(sentence)
462
-
463
- if any(word in query_lower for word in ['skill', 'technology', 'tech']):
464
- # Look for skills and technologies
465
- for sentence in self.sentences:
466
- if any(word in sentence.lower() for word in ['skill', 'technology', 'programming', 'software']):
467
- pattern_matches.append(sentence)
468
-
469
- # Combine all strategies
470
- all_matches = list(set(semantic_matches + keyword_matches + pattern_matches))
471
-
472
- # Sort by relevance (prefer shorter, more specific sentences)
473
- all_matches.sort(key=lambda x: len(x.split()))
474
-
475
- return '\n'.join(all_matches[:k]), all_matches[:k]
476
 
477
  except Exception as e:
478
  print(f"Error in content retrieval: {e}")
479
- return "", []
480
-
481
- def generate_direct_answer(self, query: str, context: str) -> str:
482
- """Generate direct, relevant answers"""
483
- if not context:
484
- return "No relevant information found in the document."
485
-
486
- query_lower = query.lower()
487
- context_sentences = [s.strip() for s in context.split('\n') if s.strip()]
488
-
489
- # Handle specific question types with direct extraction
490
- if any(word in query_lower for word in ['name', 'who is']):
491
- # Extract names
492
- for sentence in context_sentences:
493
- names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', sentence)
494
- if names:
495
- return f"The person mentioned is {names[0]}."
496
-
497
- if any(word in query_lower for word in ['experience', 'years']):
498
- # Extract experience information
499
- for sentence in context_sentences:
500
- exp_match = re.search(r'(\d+)\s*(?:years?|yr)', sentence.lower())
501
- if exp_match:
502
- return f"The experience mentioned is {exp_match.group(1)} years. {sentence}"
503
-
504
- if any(word in query_lower for word in ['skill', 'technology']):
505
- # Extract skills
506
- skills = []
507
- for sentence in context_sentences:
508
- # Look for programming languages, frameworks, etc.
509
- tech_words = ['python', 'java', 'javascript', 'react', 'node', 'sql', 'aws', 'docker']
510
- found_tech = [word for word in tech_words if word in sentence.lower()]
511
- if found_tech:
512
- skills.extend(found_tech)
513
-
514
- if skills:
515
- return f"Technologies/skills mentioned include: {', '.join(set(skills))}. {context_sentences[0] if context_sentences else ''}"
516
-
517
- if any(word in query_lower for word in ['education', 'degree', 'university', 'college']):
518
- # Extract education information
519
- for sentence in context_sentences:
520
- if any(word in sentence.lower() for word in ['degree', 'university', 'college', 'bachelor', 'master']):
521
- return sentence
522
-
523
- if any(word in query_lower for word in ['summary', 'about', 'overview']):
524
- return self.document_summary
525
-
526
- # For other questions, return the most relevant sentence
527
- if context_sentences:
528
- # Score sentences by query word overlap
529
- query_words = set(query_lower.split())
530
- scored_sentences = []
531
-
532
- for sentence in context_sentences:
533
- sentence_words = set(sentence.lower().split())
534
- overlap = len(query_words.intersection(sentence_words))
535
- scored_sentences.append((sentence, overlap))
536
-
537
- # Sort by overlap and return best match
538
- scored_sentences.sort(key=lambda x: x[1], reverse=True)
539
-
540
- if scored_sentences and scored_sentences[0][1] > 0:
541
- return scored_sentences[0][0]
542
- else:
543
- return context_sentences[0] # Return first relevant sentence
544
-
545
- return "I found relevant content but couldn't extract a specific answer."
546
 
547
  def answer_question(self, query: str) -> str:
548
- """Main question answering function with enhanced accuracy"""
549
  if not query.strip():
550
  return "❓ Please ask a question!"
551
 
@@ -553,30 +482,95 @@ class SmartDocumentRAG:
553
  return "πŸ“ Please upload and process documents first!"
554
 
555
  try:
556
- # Handle summary requests directly
557
  query_lower = query.lower()
558
- if query_lower in ['summary', 'summarize', 'about', 'overview']:
 
 
559
  return f"📄 **Document Summary:**\n\n{self.document_summary}"
560
 
561
- # Find relevant content using enhanced retrieval
562
- context, matches = self.find_relevant_content(query, k=5)
563
 
564
  if not context:
565
- return "🔍 No relevant information found. Try rephrasing your question or asking about different aspects of the document."
566
-
567
- # Generate direct answer
568
- answer = self.generate_direct_answer(query, context)
569
 
570
- # Add context if answer is too brief
571
- if len(answer) < 50 and matches:
572
- answer += f"\n\n**Additional context:** {matches[0][:200]}..."
573
 
574
- return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
  except Exception as e:
577
  return f"❌ Error processing question: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
 
579
- # Initialize the enhanced system
580
  print("Initializing Enhanced Smart RAG System...")
581
  rag_system = SmartDocumentRAG()
582
 
@@ -586,13 +580,13 @@ def create_interface():
586
  gr.Markdown("""
587
  # 🧠 Enhanced Document Q&A System
588
 
589
- **Improved for Better Accuracy & Relevance!**
590
 
591
- **New Features:**
592
- - 🎯 Multi-strategy content retrieval
 
593
  - 📊 Direct answer extraction
594
- - 🔍 Enhanced keyword and pattern matching
595
- - 📚 Better handling of resumes, research papers, and business docs
596
  """)
597
 
598
  with gr.Tab("📤 Upload & Process"):
@@ -608,7 +602,7 @@ def create_interface():
608
 
609
  with gr.Column():
610
  process_status = gr.Textbox(
611
- label="📋 Processing Status & Analysis",
612
  lines=10,
613
  interactive=False
614
  )
@@ -619,12 +613,12 @@ def create_interface():
619
  outputs=[process_status]
620
  )
621
 
622
- with gr.Tab("❓ Enhanced Q&A"):
623
  with gr.Row():
624
  with gr.Column():
625
  question_input = gr.Textbox(
626
  label="🤔 Ask Your Question",
627
- placeholder="What is the person's name? / How many years of experience? / What are their skills?",
628
  lines=3
629
  )
630
 
@@ -634,7 +628,7 @@ def create_interface():
634
 
635
  with gr.Column():
636
  answer_output = gr.Textbox(
637
- label="💡 Enhanced Answer",
638
  lines=8,
639
  interactive=False
640
  )
@@ -650,45 +644,6 @@ def create_interface():
650
  inputs=[],
651
  outputs=[answer_output]
652
  )
653
-
654
- gr.Markdown("""
655
- ### 💡 Try These Specific Questions:
656
-
657
- **For Resumes:**
658
- - "What is the person's name?"
659
- - "How many years of experience do they have?"
660
- - "What are their technical skills?"
661
- - "What is their educational background?"
662
- - "What companies have they worked for?"
663
-
664
- **For Any Document:**
665
- - "Summarize this document"
666
- - "What is the main topic?"
667
- - "List the key points"
668
- """)
669
-
670
- with gr.Tab("🔧 System Info"):
671
- gr.Markdown("""
672
- ### 🚀 Enhanced Features:
673
-
674
- **Better Retrieval:**
675
- - Semantic search using embeddings
676
- - Keyword matching with context
677
- - Pattern recognition for names, dates, skills
678
- - Multi-level chunking (sentences + paragraphs)
679
-
680
- **Improved Answers:**
681
- - Direct information extraction
682
- - Question-type specific processing
683
- - Context-aware responses
684
- - Relevance scoring and filtering
685
-
686
- **Document Types:**
687
- - ✅ Resumes (name, experience, skills extraction)
688
- - ✅ Research papers (methodology, findings)
689
- - ✅ Business documents (strategy, metrics)
690
- - ✅ Technical documentation (specifications)
691
- """)
692
 
693
  return demo
694
 
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
4
  from sentence_transformers import SentenceTransformer
5
  import faiss
6
  import numpy as np
 
12
  from typing import List, Optional, Dict, Tuple
13
  import json
14
  from collections import Counter
15
+ import warnings
16
+ warnings.filterwarnings("ignore")
17
 
18
  class SmartDocumentRAG:
19
  def __init__(self):
20
  print("🚀 Initializing Enhanced Smart RAG System...")
21
 
22
  # Initialize better embedding model
23
+ self.embedder = SentenceTransformer('all-MiniLM-L6-v2') # Faster and good quality
24
+ print("✅ Embedding model loaded")
25
 
26
+ # Initialize optimized LLM with better quantization
27
  self.setup_llm()
28
 
29
  # Document storage
 
34
  self.raw_text = ""
35
  self.document_type = "general"
36
  self.document_summary = ""
37
+ self.sentence_embeddings = []
38
+ self.sentences = []
39
 
40
  def setup_llm(self):
41
+ """Setup optimized model with better quantization"""
42
  try:
43
+ # Check CUDA availability
44
+ device = "cuda" if torch.cuda.is_available() else "cpu"
45
+ print(f"🔧 Using device: {device}")
46
+
47
+ if device == "cuda":
48
+ self.setup_gpu_model()
49
+ else:
50
  self.setup_cpu_model()
51
+
52
+ except Exception as e:
53
+ print(f"❌ Error loading models: {e}")
54
+ self.setup_fallback_model()
55
+
56
+ def setup_gpu_model(self):
57
+ """Setup GPU model with proper quantization"""
58
+ try:
59
+ # Use Phi-2 - excellent for Q&A and reasoning
60
+ model_name = "microsoft/DialoGPT-medium"
61
+
62
+ # Better quantization config
63
+ quantization_config = BitsAndBytesConfig(
64
+ load_in_4bit=True,
65
+ bnb_4bit_compute_dtype=torch.float16,
66
+ bnb_4bit_use_double_quant=True,
67
+ bnb_4bit_quant_type="nf4",
68
+ bnb_4bit_quant_storage=torch.uint8
69
+ )
70
 
71
  try:
72
+ # Try Flan-T5 first - excellent for Q&A
73
+ model_name = "google/flan-t5-base"
74
+ print(f"🤖 Loading {model_name}...")
75
+
76
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
77
  self.model = AutoModelForCausalLM.from_pretrained(
78
  model_name,
79
+ quantization_config=quantization_config,
80
+ device_map="auto",
81
  torch_dtype=torch.float16,
82
+ trust_remote_code=True
83
+ )
84
+
85
+ # Create pipeline for easier use
86
+ self.qa_pipeline = pipeline(
87
+ "text2text-generation",
88
+ model=self.model,
89
+ tokenizer=self.tokenizer,
90
+ max_length=512,
91
+ do_sample=True,
92
+ temperature=0.3,
93
+ top_p=0.9
94
+ )
95
+
96
+ print("✅ Flan-T5 model loaded successfully")
97
+ self.model_type = "flan-t5"
98
+
99
+ except Exception as e:
100
+ print(f"Flan-T5 failed, trying Phi-2: {e}")
101
+ # Try Phi-2 as backup
102
+ model_name = "microsoft/phi-2"
103
+ print(f"🤖 Loading {model_name}...")
104
+
105
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
106
+ self.model = AutoModelForCausalLM.from_pretrained(
107
+ model_name,
108
+ quantization_config=quantization_config,
109
+ device_map="auto",
110
+ torch_dtype=torch.float16,
111
+ trust_remote_code=True
112
  )
113
 
114
  if self.tokenizer.pad_token is None:
115
  self.tokenizer.pad_token = self.tokenizer.eos_token
116
 
117
+ print("✅ Phi-2 model loaded successfully")
118
+ self.model_type = "phi-2"
 
 
 
119
 
120
  except Exception as e:
121
+ print(f"❌ GPU models failed: {e}")
122
  self.setup_cpu_model()
123
 
124
+ def setup_cpu_model(self):
125
+ """Setup CPU-optimized model"""
126
  try:
127
+ # Use DistilBERT for Q&A - much better than DialoGPT for this task
128
+ model_name = "distilbert-base-cased-distilled-squad"
129
+ print(f"🤖 Loading CPU model: {model_name}")
130
+
131
+ self.qa_pipeline = pipeline(
132
+ "question-answering",
133
+ model=model_name,
134
+ tokenizer=model_name
135
  )
136
+ self.model_type = "distilbert-qa"
137
+ print("✅ DistilBERT Q&A model loaded successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  except Exception as e:
140
+ print(f"❌ CPU model failed: {e}")
141
+ self.setup_fallback_model()
142
 
143
+ def setup_fallback_model(self):
144
+ """Fallback to basic model"""
145
  try:
146
+ print("🤖 Loading fallback model...")
147
+ self.qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
148
+ self.model_type = "fallback"
149
+ print("✅ Fallback model loaded")
 
 
 
 
150
  except Exception as e:
151
  print(f"❌ All models failed: {e}")
152
+ self.qa_pipeline = None
153
+ self.model_type = "none"
154
 
155
  def detect_document_type(self, text: str) -> str:
156
  """Enhanced document type detection"""
157
  text_lower = text.lower()
158
 
 
159
  resume_patterns = [
160
  'experience', 'skills', 'education', 'linkedin', 'email', 'phone',
161
  'work experience', 'employment', 'resume', 'cv', 'curriculum vitae',
162
+ 'internship', 'projects', 'achievements', 'career', 'profile', 'objective'
163
  ]
164
 
165
  research_patterns = [
166
  'abstract', 'introduction', 'methodology', 'conclusion', 'references',
167
  'literature review', 'hypothesis', 'study', 'research', 'findings',
168
+ 'data analysis', 'results', 'discussion', 'bibliography', 'journal'
169
  ]
170
 
171
  business_patterns = [
172
  'company', 'revenue', 'market', 'strategy', 'business', 'financial',
173
  'quarter', 'profit', 'sales', 'growth', 'investment', 'stakeholder',
174
+ 'operations', 'management', 'corporate', 'enterprise', 'budget'
175
  ]
176
 
177
  technical_patterns = [
178
  'implementation', 'algorithm', 'system', 'technical', 'specification',
179
  'architecture', 'development', 'software', 'programming', 'api',
180
+ 'database', 'framework', 'deployment', 'infrastructure', 'code'
181
  ]
182
 
 
183
  def count_matches(patterns, text):
184
  score = 0
185
  for pattern in patterns:
186
+ count = text.count(pattern)
187
+ score += count * (2 if len(pattern.split()) > 1 else 1) # Weight phrases higher
188
  return score
189
 
190
  scores = {
 
195
  }
196
 
197
  max_score = max(scores.values())
198
+ if max_score > 5: # Higher threshold
199
  return max(scores, key=scores.get)
200
  return 'general'
201
 
202
  def create_document_summary(self, text: str) -> str:
203
  """Enhanced document summary creation"""
204
  try:
 
205
  clean_text = re.sub(r'\s+', ' ', text).strip()
206
  sentences = re.split(r'[.!?]+', clean_text)
207
+ sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
208
 
209
  if not sentences:
210
  return "Document contains basic information."
211
 
212
+ # Use first few sentences and key information
213
  if self.document_type == 'resume':
214
+ return self.extract_resume_summary(sentences, clean_text)
215
  elif self.document_type == 'research':
216
  return self.extract_research_summary(sentences)
217
  elif self.document_type == 'business':
 
223
  print(f"Summary creation error: {e}")
224
  return "Document summary not available."
225
 
226
+ def extract_resume_summary(self, sentences: List[str], full_text: str) -> str:
227
+ """Extract resume-specific summary with better name detection"""
228
+ summary_parts = []
229
+
230
+ # Extract name using multiple patterns
231
+ name = self.extract_name(full_text)
232
+ if name:
233
+ summary_parts.append(f"Resume of {name}")
234
+
235
+ # Extract role/title
236
+ role_patterns = [
237
+ r'(?:software|senior|junior|lead|principal)?\s*(?:engineer|developer|analyst|manager|designer|architect|consultant)',
238
+ r'(?:full stack|frontend|backend|data|ml|ai)\s*(?:engineer|developer)',
239
+ r'(?:product|project|technical)\s*manager'
240
+ ]
241
+
242
+ for sentence in sentences[:5]:
243
+ for pattern in role_patterns:
244
+ matches = re.findall(pattern, sentence.lower())
245
+ if matches:
246
+ summary_parts.append(f"working as {matches[0].title()}")
247
+ break
248
+
249
+ # Extract experience
250
+ exp_match = re.search(r'(\d+)[\+\-\s]*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)', full_text.lower())
251
+ if exp_match:
252
+ summary_parts.append(f"with {exp_match.group(1)}+ years of experience")
253
+
254
+ return '. '.join(summary_parts) + '.' if summary_parts else "Professional resume with career details."
255
+
256
+ def extract_name(self, text: str) -> str:
257
+ """Extract name from document using multiple strategies"""
258
+ # Strategy 1: Look for name patterns at the beginning
259
+ lines = text.split('\n')[:10] # First 10 lines
260
+
261
+ for line in lines:
262
+ line = line.strip()
263
+ if len(line) < 50 and len(line) > 3: # Likely a header line
264
+ # Check if it looks like a name
265
+ name_match = re.match(r'^([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)(?:\s|$)', line)
266
+ if name_match:
267
+ return name_match.group(1)
268
+
269
+ # Strategy 2: Look for "Name:" pattern
270
+ name_patterns = [
271
+ r'(?:name|full name):\s*([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
272
+ r'^([A-Z][a-z]+\s+[A-Z][a-z]+)(?:\s*\n|\s*email|\s*phone|\s*linkedin)',
273
+ ]
274
+
275
+ for pattern in name_patterns:
276
+ match = re.search(pattern, text, re.MULTILINE | re.IGNORECASE)
277
+ if match:
278
+ return match.group(1)
279
+
280
+ return ""
281
 
282
  def extract_research_summary(self, sentences: List[str]) -> str:
283
  """Extract research paper summary"""
284
+ # Look for abstract or introduction
285
+ for sentence in sentences[:5]:
286
+ if any(word in sentence.lower() for word in ['abstract', 'study', 'research', 'paper']):
287
+ return sentence[:200] + ('...' if len(sentence) > 200 else '')
288
+
289
+ return "Research document with academic content."
 
 
 
 
 
 
 
 
 
290
 
291
def extract_business_summary(self, sentences: List[str]) -> str:
    """Summarize a business document from its opening sentences.

    Picks the first of the leading three sentences that mentions a
    business keyword and returns it, cut to 200 characters with a trailing
    ellipsis when it was longer. Falls back to a generic description.
    """
    keywords = ('company', 'business', 'organization')
    hit = next(
        (s for s in sentences[:3] if any(k in s.lower() for k in keywords)),
        None,
    )
    if hit is None:
        return "Business document with organizational information."

    suffix = '...' if len(hit) > 200 else ''
    return hit[:200] + suffix
 
 
298
 
299
def extract_general_summary(self, sentences: List[str]) -> str:
    """Summarize a generic document with its first sentence.

    Returns the first sentence cut to 200 characters (with a trailing
    ellipsis when it was longer), or a generic label for an empty list.
    """
    if not sentences:
        return "General document."

    first = sentences[0]
    suffix = '...' if len(first) > 200 else ''
    return first[:200] + suffix
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  def extract_text_from_file(self, file_path: str) -> str:
304
+ """Enhanced text extraction"""
305
  try:
306
  file_extension = os.path.splitext(file_path)[1].lower()
307
 
 
318
  return f"Error reading file: {str(e)}"
319
 
320
  def extract_from_pdf(self, file_path: str) -> str:
321
+ """Enhanced PDF extraction"""
322
  text = ""
323
  try:
324
  with open(file_path, 'rb') as file:
325
  pdf_reader = PyPDF2.PdfReader(file)
326
+ for page in pdf_reader.pages:
327
  page_text = page.extract_text()
328
  if page_text.strip():
329
+ # Better text cleaning
330
  page_text = re.sub(r'\s+', ' ', page_text)
331
+ page_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', page_text) # Fix merged words
332
  text += f"{page_text}\n"
333
  except Exception as e:
334
  text = f"Error reading PDF: {str(e)}"
 
353
  for encoding in encodings:
354
  try:
355
  with open(file_path, 'r', encoding=encoding) as file:
356
+ return file.read().strip()
 
 
 
357
  except UnicodeDecodeError:
358
  continue
359
  except Exception as e:
360
  return f"Error reading TXT: {str(e)}"
361
 
362
+ return "Error: Could not decode file"
363
 
364
def enhanced_chunk_text(self, text: str, chunk_size: int = 4, overlap: int = 2) -> List[Dict]:
    """Split text into overlapping sentence-window chunks.

    The text is split on runs of ``.``, ``!`` and ``?``; fragments of 20
    characters or fewer are dropped. The surviving sentences are stored on
    ``self.sentences`` and grouped into windows of ``chunk_size`` sentences
    that advance by ``chunk_size - overlap``.

    Args:
        text: Raw document text.
        chunk_size: Number of sentences per chunk.
        overlap: Number of sentences shared by consecutive chunks.

    Returns:
        A list of dicts with keys ``text`` (sentences re-joined with
        ``'. '``), ``sentence_indices`` (positions in ``self.sentences``)
        and ``doc_type`` (the current ``self.document_type``). Empty for
        blank input.

    Raises:
        ValueError: If ``chunk_size`` is below 1 or ``overlap`` is not in
            ``[0, chunk_size)``, which would make the window stand still.
    """
    if chunk_size < 1 or not 0 <= overlap < chunk_size:
        raise ValueError("chunk_size must be >= 1 and 0 <= overlap < chunk_size")

    if not text.strip():
        return []

    # Split into sentences, discarding very short fragments.
    sentences = [s.strip() for s in re.split(r'[.!?]+', text) if len(s.strip()) > 20]
    self.sentences = sentences

    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(sentences), step):
        end = min(start + chunk_size, len(sentences))
        chunks.append({
            'text': '. '.join(sentences[start:end]) + '.',
            'sentence_indices': list(range(start, end)),
            'doc_type': self.document_type,
        })
        # Once a window reaches the last sentence, any further window would
        # only repeat a suffix of this one, so stop instead of emitting
        # redundant chunks.
        if end == len(sentences):
            break

    return chunks
391
 
 
426
  self.documents = [chunk['text'] for chunk in chunk_data]
427
  self.document_metadata = chunk_data
428
 
429
+ # Create embeddings
430
  print(f"πŸ“„ Creating embeddings for {len(self.documents)} chunks...")
431
  embeddings = self.embedder.encode(self.documents, show_progress_bar=False)
432
 
 
 
 
 
 
433
  # Build FAISS index
434
  dimension = embeddings.shape[1]
435
  self.index = faiss.IndexFlatIP(dimension)
 
443
  return f"βœ… Successfully processed {len(processed_files)} files:\n" + \
444
  f"πŸ“„ Files: {', '.join(processed_files)}\n" + \
445
  f"πŸ“Š Document Type: {self.document_type.title()}\n" + \
446
+ f"πŸ” Created {len(self.documents)} chunks\n" + \
447
  f"πŸ“ Summary: {self.document_summary}\n" + \
448
+ f"πŸš€ Ready for Q&A!"
449
 
450
  except Exception as e:
451
  return f"❌ Error processing documents: {str(e)}"
452
 
453
def find_relevant_content(self, query: str, k: int = 3, min_score: float = 0.1) -> str:
    """Return the stored chunks most similar to ``query``, joined by spaces.

    The query is embedded with ``self.embedder``, L2-normalized and searched
    against ``self.index``. Scores are inner products, so they act as cosine
    similarities provided the indexed vectors were normalized as well.

    Args:
        query: The user's question.
        k: Maximum number of chunks to retrieve.
        min_score: Hits scoring at or below this value are discarded.

    Returns:
        The matching chunk texts joined with single spaces, or an empty
        string when nothing is indexed, nothing passes the threshold, or
        retrieval fails (the error is printed, not raised).
    """
    if not self.is_indexed:
        return ""

    # Nothing to search for an empty store or a non-positive k.
    top_k = min(k, len(self.documents))
    if top_k <= 0:
        return ""

    try:
        # Semantic search. normalize_L2 works in place on float32 data, so
        # convert before normalizing rather than after.
        query_embedding = np.ascontiguousarray(
            self.embedder.encode([query]), dtype='float32'
        )
        faiss.normalize_L2(query_embedding)

        scores, indices = self.index.search(query_embedding, top_k)

        # FAISS pads missing results with index -1; without the lower bound
        # that would silently select the last chunk.
        relevant_chunks = [
            self.documents[idx]
            for score, idx in zip(scores[0], indices[0])
            if 0 <= idx < len(self.documents) and score > min_score
        ]

        return ' '.join(relevant_chunks)

    except Exception as e:
        # Best-effort retrieval: report and let the caller handle "no context".
        print(f"Error in content retrieval: {e}")
        return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
 
476
  def answer_question(self, query: str) -> str:
477
+ """Enhanced question answering with better model usage"""
478
  if not query.strip():
479
  return "❓ Please ask a question!"
480
 
 
482
  return "πŸ“ Please upload and process documents first!"
483
 
484
  try:
 
485
  query_lower = query.lower()
486
+
487
+ # Handle summary requests
488
+ if any(word in query_lower for word in ['summary', 'summarize', 'about', 'overview']):
489
  return f"πŸ“„ **Document Summary:**\n\n{self.document_summary}"
490
 
491
+ # Get relevant content
492
+ context = self.find_relevant_content(query, k=3)
493
 
494
  if not context:
495
+ return "πŸ” No relevant information found. Try rephrasing your question."
 
 
 
496
 
497
+ # Use appropriate model for answering
498
+ if self.qa_pipeline is None:
499
+ return self.extract_direct_answer(query, context)
500
 
501
+ try:
502
+ if self.model_type == "distilbert-qa" or self.model_type == "fallback":
503
+ # Use Q&A pipeline
504
+ result = self.qa_pipeline(question=query, context=context)
505
+ answer = result['answer']
506
+ confidence = result['score']
507
+
508
+ if confidence > 0.1: # Reasonable confidence
509
+ return f"**Answer:** {answer}\n\n**Context:** {context[:200]}..."
510
+ else:
511
+ return self.extract_direct_answer(query, context)
512
+
513
+ elif self.model_type == "flan-t5":
514
+ # Use text generation pipeline
515
+ prompt = f"Answer the question based on the context.\nContext: {context}\nQuestion: {query}\nAnswer:"
516
+ result = self.qa_pipeline(prompt, max_length=200, num_return_sequences=1)
517
+ answer = result[0]['generated_text'].replace(prompt, '').strip()
518
+ return f"**Answer:** {answer}"
519
+
520
+ else:
521
+ return self.extract_direct_answer(query, context)
522
+
523
+ except Exception as e:
524
+ print(f"Model inference error: {e}")
525
+ return self.extract_direct_answer(query, context)
526
 
527
  except Exception as e:
528
  return f"❌ Error processing question: {str(e)}"
529
+
530
def extract_direct_answer(self, query: str, context: str) -> str:
    """Answer a question with regex heuristics when no model answer is used.

    The query is checked, in order, for name, experience, skills and
    education keywords; the first category that also yields a match in
    ``context`` produces the answer. Otherwise the first sentence of the
    context is returned.

    Args:
        query: The user's question.
        context: Retrieved document text to search.

    Returns:
        A Markdown-formatted answer string, or a fixed fallback message when
        the context contains no usable sentence.
    """
    query_lower = query.lower()

    # Name extraction. "who" is matched as a whole word so that queries
    # containing "whole" or "whose" are not treated as name questions.
    if 'name' in query_lower or re.search(r'\bwho\b', query_lower):
        names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
        if names:
            return f"**Name:** {names[0]}"

    # Experience extraction
    if any(word in query_lower for word in ['experience', 'years']):
        exp_matches = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
        if exp_matches:
            return f"**Experience:** {exp_matches[0]} years"

    # Skills extraction
    if any(word in query_lower for word in ['skill', 'technology', 'tech']):
        # Common tech skills
        skill_pattern = (
            r'\b(?:Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git'
            r'|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
        )
        # De-duplicate case-insensitively and keep first-seen order; a plain
        # set() would list "Python" and "python" separately and in an order
        # that varies between runs.
        unique_skills = {}
        for skill in re.findall(skill_pattern, context, re.IGNORECASE):
            unique_skills.setdefault(skill.lower(), skill)

        if unique_skills:
            return f"**Skills mentioned:** {', '.join(unique_skills.values())}"

    # Education extraction
    if any(word in query_lower for word in ['education', 'degree', 'university']):
        # Word boundaries keep degree abbreviations from matching inside
        # ordinary words (e.g. the "MS" in "ITEMS").
        edu_matches = re.findall(
            r'\b(?:Bachelor|Master|PhD|(?:B\.?S|M\.?S|B\.?A|M\.?A)\.?(?![A-Za-z]))'
            r'.*?\b(?:in|of)\s+([^.]+)',
            context,
        )
        if edu_matches:
            return f"**Education:** {edu_matches[0]}"

    # Return first relevant sentence
    sentences = [s.strip() for s in context.split('.') if s.strip()]
    if sentences:
        return f"**Answer:** {sentences[0]}"

    return "I found relevant content but couldn't extract a specific answer."
572
 
573
+ # Initialize the system
574
  print("Initializing Enhanced Smart RAG System...")
575
  rag_system = SmartDocumentRAG()
576
 
 
580
  gr.Markdown("""
581
  # 🧠 Enhanced Document Q&A System
582
 
583
+ **Optimized with Better Models & Quantization!**
584
 
585
+ **Features:**
586
+ - 🎯 Flan-T5 or DistilBERT for accurate Q&A
587
+ - ⚑ 4-bit quantization for GPU efficiency
588
  - πŸ“Š Direct answer extraction
589
+ - πŸ” Enhanced semantic search
 
590
  """)
591
 
592
  with gr.Tab("πŸ“€ Upload & Process"):
 
602
 
603
  with gr.Column():
604
  process_status = gr.Textbox(
605
+ label="πŸ“‹ Processing Status",
606
  lines=10,
607
  interactive=False
608
  )
 
613
  outputs=[process_status]
614
  )
615
 
616
+ with gr.Tab("❓ Q&A"):
617
  with gr.Row():
618
  with gr.Column():
619
  question_input = gr.Textbox(
620
  label="πŸ€” Ask Your Question",
621
+ placeholder="What is the person's name? / How many years of experience? / What skills do they have?",
622
  lines=3
623
  )
624
 
 
628
 
629
  with gr.Column():
630
  answer_output = gr.Textbox(
631
+ label="πŸ’‘ Answer",
632
  lines=8,
633
  interactive=False
634
  )
 
644
  inputs=[],
645
  outputs=[answer_output]
646
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
647
 
648
  return demo
649