pradeepsengarr committed on
Commit 3406461 · verified · 1 Parent(s): d7bf74b

Update app.py

Files changed (1)
  1. app.py +468 -363
app.py CHANGED
@@ -9,11 +9,12 @@ import docx
 import io
 import os
 import re
-from typing import List, Optional
-
-class DocumentRAG:
     def __init__(self):
-        print("🚀 Initializing RAG System...")
 
         # Initialize embedding model (lightweight)
         self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
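
For orientation, a minimal standalone sketch of how the all-MiniLM-L6-v2 embedder is typically used; the sample texts are illustrative, not from the commit:

    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    chunks = ["Pradeep is a data scientist.", "Skills: Python, SQL, machine learning."]
    embeddings = embedder.encode(chunks)  # numpy array of shape (2, 384)
    print(embeddings.shape)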
@@ -24,17 +25,20 @@ class DocumentRAG:
 
         # Document storage
         self.documents = []
         self.index = None
         self.is_indexed = False
-        self.raw_text = ""  # Store raw text for fallback
 
     def setup_llm(self):
-        """Setup quantized Mistral model"""
         try:
             # Check if CUDA is available
             if not torch.cuda.is_available():
-                print("⚠️ CUDA not available, falling back to CPU or alternative model")
-                self.setup_fallback_model()
                 return
 
             quantization_config = BitsAndBytesConfig(
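
The fields of this BitsAndBytesConfig fall outside the hunk, so they are not shown. A typical 4-bit setup looks like the sketch below; the exact values used in this commit are assumptions:

    import torch
    from transformers import BitsAndBytesConfig

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,                     # store weights in 4-bit NF4
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
        bnb_4bit_use_double_quant=True,        # quantize the quantization constants too
    )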
@@ -46,17 +50,14 @@ class DocumentRAG:
 
             model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
-            # Load tokenizer first
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 trust_remote_code=True
             )
 
-            # Fix padding token issue
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
-            # Load model with quantization
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 quantization_config=quantization_config,
@@ -69,169 +70,104 @@ class DocumentRAG:
             print("✅ Quantized Mistral model loaded successfully")
 
         except Exception as e:
-            print(f"❌ Error loading model: {e}")
-            print("🔄 Falling back to alternative model...")
-            self.setup_fallback_model()
 
-    def setup_fallback_model(self):
-        """Fallback to smaller model if Mistral fails"""
         try:
-            # Use a model that's better for factual Q&A and less prone to hallucination
-            model_name = "microsoft/DialoGPT-small"
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(model_name)
 
-            # Fix padding token for fallback model too
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
-            print("✅ Fallback model loaded")
         except Exception as e:
-            print(f"❌ Fallback model failed: {e}")
-            # Try an even simpler approach - return context-based answers without generation
             self.model = None
             self.tokenizer = None
-            print("⚠️ Using context-only mode (no text generation)")
 
-    def extract_profile_info(self, text: str) -> dict:
-        """Extract key profile information from resume text"""
-        profile = {
-            'name': '',
-            'role': '',
-            'skills': [],
-            'experience': [],
-            'education': [],
-            'projects': []
         }
 
-        lines = text.split('\n')
-        current_section = None
-
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-
-            line_lower = line.lower()
-
-            # Extract name (usually first meaningful line)
-            if not profile['name'] and len(line.split()) <= 4 and not any(char in line for char in ['@', '.com', '+91', 'linkedin']):
-                if not any(word in line_lower for word in ['resume', 'cv', 'experience', 'education', 'skills']):
-                    profile['name'] = line
-
-            # Look for role/title indicators
-            if any(keyword in line_lower for keyword in ['data scientist', 'software engineer', 'developer', 'analyst', 'intern']):
-                if 'data scientist' in line_lower:
-                    profile['role'] = 'Data Scientist'
-                elif 'software engineer' in line_lower:
-                    profile['role'] = 'Software Engineer'
-                elif 'developer' in line_lower:
-                    profile['role'] = 'Developer'
-                elif 'analyst' in line_lower:
-                    profile['role'] = 'Analyst'
-
-            # Extract skills
-            if any(keyword in line_lower for keyword in ['python', 'machine learning', 'react', 'javascript', 'sql']):
-                if 'python' in line_lower:
-                    profile['skills'].append('Python')
-                if 'machine learning' in line_lower:
-                    profile['skills'].append('Machine Learning')
-                if 'react' in line_lower:
-                    profile['skills'].append('React')
-                if 'javascript' in line_lower:
-                    profile['skills'].append('JavaScript')
-
-        return profile
 
-    def simple_context_answer(self, query: str, context: str) -> str:
-        """Improved smart answering based on context analysis"""
-        if not context:
-            return "No relevant information found in the documents."
-
-        query_lower = query.lower()
-
-        # Extract profile information first
-        profile = self.extract_profile_info(self.raw_text if self.raw_text else context)
-
-        # Handle "who is" questions specifically
-        if "who is" in query_lower:
-            name_in_query = re.search(r'who is (\w+)', query_lower)
-            person_name = name_in_query.group(1) if name_in_query else "this person"
 
-            # Build answer from profile
-            answer_parts = []
 
-            if profile['name']:
-                if profile['role']:
-                    answer_parts.append(f"{profile['name']} is a {profile['role']}")
-                else:
-                    # Try to infer role from context
-                    context_lower = context.lower()
-                    if 'data scientist' in context_lower or ('python' in context_lower and 'machine learning' in context_lower):
-                        answer_parts.append(f"{profile['name']} is a Data Scientist")
-                    elif 'software' in context_lower and 'developer' in context_lower:
-                        answer_parts.append(f"{profile['name']} is a Software Developer")
-                    else:
-                        answer_parts.append(f"{profile['name']} is a professional")
-            else:
-                # Use name from query
-                context_lower = context.lower()
-                if 'data scientist' in context_lower or ('python' in context_lower and 'machine learning' in context_lower):
-                    answer_parts.append(f"{person_name.title()} is a Data Scientist")
-                elif 'software' in context_lower and 'developer' in context_lower:
-                    answer_parts.append(f"{person_name.title()} is a Software Developer")
-                else:
-                    answer_parts.append(f"{person_name.title()} is a professional")
-
-            # Add key skills if available
-            if profile['skills']:
-                top_skills = profile['skills'][:3]  # Top 3 skills
-                answer_parts.append(f"with expertise in {', '.join(top_skills)}")
-
-            if answer_parts:
-                return '. '.join(answer_parts) + '.'
-
-        # Handle other question types
-        elif any(keyword in query_lower for keyword in ['what', 'skills', 'experience', 'work']):
-            if 'skills' in query_lower:
-                if profile['skills']:
-                    return f"Key skills include: {', '.join(profile['skills'])}."
-            elif 'experience' in query_lower or 'work' in query_lower:
-                # Look for experience indicators in context
-                exp_lines = []
-                for line in context.split('\n'):
-                    if any(word in line.lower() for word in ['experience', 'worked', 'internship', 'project']):
-                        exp_lines.append(line.strip())
-                if exp_lines:
-                    return exp_lines[0]
-
-        # Fallback to keyword matching
-        query_words = set(query_lower.split())
-        context_sentences = [s.strip() for s in context.split('.') if s.strip()]
-
-        # Find most relevant sentence
-        best_sentence = ""
-        max_matches = 0
-
-        for sentence in context_sentences:
-            if len(sentence) < 20:  # Skip very short sentences
-                continue
 
-            sentence_words = set(sentence.lower().split())
-            matches = len(query_words.intersection(sentence_words))
 
-            if matches > max_matches:
-                max_matches = matches
-                best_sentence = sentence
-
-        if best_sentence:
-            return best_sentence + '.'
-
-        # Final fallback
-        return "Based on the document, I found relevant information but cannot provide a specific answer."
 
     def extract_text_from_file(self, file_path: str) -> str:
-        """Extract text from various file formats"""
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
@@ -248,99 +184,121 @@ class DocumentRAG:
             return f"Error reading file: {str(e)}"
 
     def extract_from_pdf(self, file_path: str) -> str:
-        """Extract text from PDF"""
         text = ""
         try:
             with open(file_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
-                for page in pdf_reader.pages:
-                    text += page.extract_text() + "\n"
         except Exception as e:
             text = f"Error reading PDF: {str(e)}"
         return text
 
     def extract_from_docx(self, file_path: str) -> str:
-        """Extract text from DOCX"""
         try:
             doc = docx.Document(file_path)
             text = ""
             for paragraph in doc.paragraphs:
-                text += paragraph.text + "\n"
             return text
         except Exception as e:
             return f"Error reading DOCX: {str(e)}"
 
     def extract_from_txt(self, file_path: str) -> str:
-        """Extract text from TXT"""
-        try:
-            with open(file_path, 'r', encoding='utf-8') as file:
-                return file.read()
-        except Exception as e:
             try:
-                with open(file_path, 'r', encoding='latin-1') as file:
                     return file.read()
-            except Exception as e2:
-                return f"Error reading TXT: {str(e2)}"
 
-    def smart_chunk_text(self, text: str) -> List[str]:
-        """Smart chunking that preserves important information together"""
         if not text.strip():
             return []
 
         chunks = []
-        lines = text.split('\n')
-
-        # Create chunks based on semantic meaning
-        current_chunk = ""
-        chunk_type = None
 
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
 
-            line_lower = line.lower()
-
-            # Identify section types
-            new_chunk_type = None
-            if any(keyword in line_lower for keyword in ['name', 'email', 'phone', 'linkedin', 'github']):
-                new_chunk_type = 'contact'
-            elif any(keyword in line_lower for keyword in ['experience', 'work', 'internship']):
-                new_chunk_type = 'experience'
-            elif any(keyword in line_lower for keyword in ['education', 'degree', 'university', 'college']):
-                new_chunk_type = 'education'
-            elif any(keyword in line_lower for keyword in ['skills', 'technologies', 'programming']):
-                new_chunk_type = 'skills'
-            elif any(keyword in line_lower for keyword in ['project', 'developed', 'built']):
-                new_chunk_type = 'projects'
-
-            # If section type changes, save current chunk and start new one
-            if new_chunk_type != chunk_type and current_chunk:
-                chunks.append(current_chunk.strip())
-                current_chunk = line
-                chunk_type = new_chunk_type
-            else:
-                # Add to current chunk
-                if current_chunk:
-                    current_chunk += "\n" + line
-                else:
                     current_chunk = line
-                    chunk_type = new_chunk_type
-
-            # Limit chunk size
-            if len(current_chunk.split()) > 150:
-                chunks.append(current_chunk.strip())
-                current_chunk = ""
-                chunk_type = None
-
-        # Add the last chunk
-        if current_chunk:
-            chunks.append(current_chunk.strip())
 
         return chunks
 
     def process_documents(self, files) -> str:
-        """Process uploaded files and create embeddings"""
         if not files:
             return "❌ No files uploaded!"
@@ -363,15 +321,24 @@ class DocumentRAG:
             if not all_text.strip():
                 return "❌ No text extracted from files!"
 
-            # Store raw text for smart answering
             self.raw_text = all_text
 
-            # Smart chunk the text
-            self.documents = self.smart_chunk_text(all_text)
 
-            if not self.documents:
                 return "❌ No valid text chunks created!"
 
             # Create embeddings
             print(f"📄 Creating embeddings for {len(self.documents)} chunks...")
            embeddings = self.embedder.encode(self.documents, show_progress_bar=True)
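
The hunk ends just before the index is built. The indexing step consistent with the normalize_L2 and index.search calls elsewhere in this file would look roughly like the sketch below; the exact FAISS index class used in the commit is not visible here, so IndexFlatIP is an assumption:

    import faiss
    import numpy as np

    embeddings = np.asarray(embeddings, dtype='float32')
    faiss.normalize_L2(embeddings)                  # normalize so inner product == cosine
    index = faiss.IndexFlatIP(embeddings.shape[1])  # 384 dims for all-MiniLM-L6-v2
    index.add(embeddings)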
@@ -388,16 +355,18 @@ class DocumentRAG:
 
             return f"✅ Successfully processed {len(processed_files)} files:\n" + \
                    f"📄 Files: {', '.join(processed_files)}\n" + \
-                   f"📊 Created {len(self.documents)} text chunks\n" + \
-                   f"🔍 Ready for Q&A!"
 
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
 
-    def retrieve_context(self, query: str, k: int = 3) -> str:
-        """Retrieve relevant context with improved filtering"""
         if not self.is_indexed:
-            return ""
 
         try:
             # Get query embedding
@@ -405,139 +374,226 @@ class DocumentRAG:
             faiss.normalize_L2(query_embedding)
 
             # Search for similar chunks
-            scores, indices = self.index.search(query_embedding.astype('float32'), min(k, len(self.documents)))
 
-            # Get relevant documents with reasonable threshold
-            relevant_docs = []
             query_lower = query.lower()
 
             for i, idx in enumerate(indices[0]):
                 if idx < len(self.documents):
-                    doc = self.documents[idx]
                     score = scores[0][i]
 
-                    # For "who is" questions, prioritize contact/basic info chunks
-                    if "who is" in query_lower:
-                        doc_lower = doc.lower()
-                        if any(keyword in doc_lower for keyword in ['name', 'email', 'linkedin', 'data scientist', 'developer']):
-                            relevant_docs.insert(0, doc)  # Put at beginning
-                        elif score > 0.15:  # Lower threshold for other relevant content
-                            relevant_docs.append(doc)
-                    else:
-                        if score > 0.2:  # Standard threshold
-                            relevant_docs.append(doc)
-
-            # If no good matches for "who is", get the first few chunks
-            if "who is" in query_lower and not relevant_docs:
-                relevant_docs = self.documents[:2]
-
-            return "\n\n".join(relevant_docs[:3])  # Limit to top 3 chunks
 
         except Exception as e:
             print(f"Error in retrieval: {e}")
-            return ""
 
-    def generate_answer(self, query: str, context: str) -> str:
-        """Generate answer using the LLM with improved prompting"""
-        if self.model is None or self.tokenizer is None:
-            return self.simple_context_answer(query, context)
 
-        try:
-            # Check if using Mistral (has specific prompt format) or fallback model
-            model_name = getattr(self.model.config, '_name_or_path', '').lower()
-            is_mistral = 'mistral' in model_name
-
-            if is_mistral:
-                # Focused prompt for Mistral
-                prompt = f"""<s>[INST] Answer the question about the person based on their resume. Be concise and direct.
 
-Resume Information:
-{context[:800]}
 
 Question: {query}
 
-Provide a brief, specific answer in 1 sentence. [/INST]"""
-            else:
-                # Focused prompt for fallback models
-                prompt = f"""Resume: {context[:600]}
 
 Question: {query}
-Answer briefly:"""
 
-            # Tokenize
-            inputs = self.tokenizer(
-                prompt,
-                return_tensors="pt",
-                max_length=600,
-                truncation=True,
-                padding=True
-            )
-
-            # Move to same device as model
-            if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
-                inputs = {k: v.cuda() for k, v in inputs.items()}
-
-            # Generate with focused parameters
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=50,  # Much shorter for focused answers
-                    temperature=0.1,  # Very low for deterministic responses
-                    do_sample=True,
-                    top_p=0.9,
-                    early_stopping=True,
-                    repetition_penalty=1.1,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id
-                )
-
-            # Decode response
-            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            # Extract answer
-            if is_mistral and "[/INST]" in full_response:
-                answer = full_response.split("[/INST]")[-1].strip()
-            else:
-                answer = full_response[len(prompt):].strip()
-
-            # Clean and validate answer
-            answer = self.clean_answer(answer)
-
-            # If answer is too long or poor quality, use fallback
-            if not answer or len(answer) > 200:
-                return self.simple_context_answer(query, context)
-
-            return answer
-
-        except Exception as e:
-            print(f"Error in generation: {e}")
-            return self.simple_context_answer(query, context)
 
-    def clean_answer(self, answer: str) -> str:
-        """Clean up the generated answer"""
-        if not answer or len(answer) < 5:
-            return ""
 
         # Remove unwanted patterns
         answer = re.sub(r'--- \w+.*? ---', '', answer)
-        answer = re.sub(r'\b\w+@\w+\.\w+\b', '', answer)  # Remove emails
-        answer = re.sub(r'\+91-?\d+', '', answer)  # Remove phone numbers
-        answer = answer.replace('LinkedIn:', '').replace('Github:', '')
 
-        # Clean up whitespace
         answer = ' '.join(answer.split())
 
-        # Take only the first sentence if multiple
-        sentences = answer.split('.')
-        if sentences:
-            first_sentence = sentences[0].strip()
-            if len(first_sentence) > 10:
-                return first_sentence + '.'
 
         return answer.strip()
 
     def answer_question(self, query: str) -> str:
-        """Main function to answer questions"""
         if not query.strip():
             return "❓ Please ask a question!"
@@ -545,52 +601,59 @@ Answer briefly:"""
             return "📝 Please upload and process documents first!"
 
         try:
-            # Retrieve relevant context
-            context = self.retrieve_context(query, k=3)
 
             if not context:
-                return "🔍 No relevant information found in the uploaded documents."
 
-            # Generate answer
-            answer = self.generate_answer(query, context)
 
-            if answer and len(answer) > 5:
-                return answer
-            else:
-                return "I couldn't generate a specific answer from the document content."
 
         except Exception as e:
-            return f"❌ Error answering question: {str(e)}"
 
-# Initialize the RAG system
-print("Initializing Document RAG System...")
-rag_system = DocumentRAG()
 
-# Gradio Interface
 def create_interface():
-    with gr.Blocks(title="📚 Document Q&A with RAG", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
-        # 📚 Document Q&A System
 
-        Upload your documents and ask questions about them!
 
-        **Supported formats:** PDF, DOCX, TXT
         """)
 
-        with gr.Tab("📤 Upload Documents"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
-                        label="Upload Documents",
                        file_count="multiple",
-                        file_types=[".pdf", ".docx", ".txt"]
                    )
-                    process_btn = gr.Button("🔄 Process Documents", variant="primary")
 
                with gr.Column():
                    process_status = gr.Textbox(
-                        label="Processing Status",
-                        lines=8,
                        interactive=False
                    )
 
@@ -600,20 +663,23 @@ def create_interface():
                outputs=[process_status]
            )
 
-        with gr.Tab("❓ Ask Questions"):
            with gr.Row():
                with gr.Column():
                    question_input = gr.Textbox(
-                        label="Your Question",
-                        placeholder="Who is Pradeep?",
                        lines=3
                    )
-                    ask_btn = gr.Button("🔍 Get Answer", variant="primary")
 
                with gr.Column():
                    answer_output = gr.Textbox(
-                        label="Answer",
-                        lines=6,
                        interactive=False
                    )
 
@@ -623,19 +689,58 @@ def create_interface():
                outputs=[answer_output]
            )
 
-        # Example questions
        gr.Markdown("""
-        ### 💡 Example Questions:
-        - Who is [Name]?
-        - What are [Name]'s skills?
-        - What experience does [Name] have?
-        - What projects has [Name] worked on?
-        - What is [Name]'s educational background?
        """)
 
    return demo
 
-# Launch the app
 if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
 import io
 import os
 import re
+from typing import List, Optional, Dict, Tuple
+import json
 
+class SmartDocumentRAG:
     def __init__(self):
+        print("🚀 Initializing Smart RAG System...")
 
         # Initialize embedding model (lightweight)
         self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
 
         # Document storage
         self.documents = []
+        self.document_metadata = []  # Store metadata about each chunk
         self.index = None
         self.is_indexed = False
+        self.raw_text = ""
+        self.document_type = "general"  # Auto-detect document type
+        self.document_summary = ""  # Store document summary
 
     def setup_llm(self):
+        """Setup quantized Mistral model with fallback"""
         try:
             # Check if CUDA is available
             if not torch.cuda.is_available():
+                print("⚠️ CUDA not available, using CPU-optimized model")
+                self.setup_cpu_model()
                 return
 
             quantization_config = BitsAndBytesConfig(
 
             model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 trust_remote_code=True
             )
 
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 quantization_config=quantization_config,
 
             print("✅ Quantized Mistral model loaded successfully")
 
         except Exception as e:
+            print(f"❌ Error loading Mistral: {e}")
+            print("🔄 Falling back to CPU model...")
+            self.setup_cpu_model()
 
+    def setup_cpu_model(self):
+        """Setup CPU-friendly model"""
         try:
+            # Use GPT-2 for better text generation on CPU
+            model_name = "gpt2-medium"
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(model_name)
 
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
+            print("✅ CPU model loaded")
         except Exception as e:
+            print(f"❌ CPU model failed: {e}")
             self.model = None
             self.tokenizer = None
+            print("⚠️ Using context-only mode")
 
+    def detect_document_type(self, text: str) -> str:
+        """Intelligently detect document type"""
+        text_lower = text.lower()
+
+        # Count keywords for different document types
+        resume_keywords = ['experience', 'skills', 'education', 'linkedin', 'email', 'phone', 'internship']
+        research_keywords = ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'study', 'analysis']
+        business_keywords = ['company', 'revenue', 'market', 'strategy', 'business', 'financial', 'quarter']
+        technical_keywords = ['implementation', 'algorithm', 'system', 'technical', 'specification', 'architecture']
+        legal_keywords = ['contract', 'agreement', 'terms', 'conditions', 'legal', 'clause', 'liability']
+
+        scores = {
+            'resume': sum(1 for kw in resume_keywords if kw in text_lower),
+            'research': sum(1 for kw in research_keywords if kw in text_lower),
+            'business': sum(1 for kw in business_keywords if kw in text_lower),
+            'technical': sum(1 for kw in technical_keywords if kw in text_lower),
+            'legal': sum(1 for kw in legal_keywords if kw in text_lower)
         }
 
+        return max(scores, key=scores.get) if max(scores.values()) > 2 else 'general'
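
A quick standalone illustration of the keyword-vote heuristic above; it mirrors detect_document_type rather than invoking the class, whose constructor loads the embedding model:

    text_lower = "abstract, introduction, methodology and conclusion of the study".lower()
    research_keywords = ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'study', 'analysis']
    resume_keywords = ['experience', 'skills', 'education', 'linkedin', 'email', 'phone', 'internship']
    scores = {
        'research': sum(1 for kw in research_keywords if kw in text_lower),
        'resume': sum(1 for kw in resume_keywords if kw in text_lower),
    }
    doc_type = max(scores, key=scores.get) if max(scores.values()) > 2 else 'general'
    print(doc_type)  # 'research': five keyword hits clear the > 2 threshold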
+    def create_document_summary(self, text: str) -> str:
+        """Create intelligent document summary"""
+        try:
+            # Split into paragraphs and find key information
+            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip() and len(p) > 50]
+
+            if not paragraphs:
+                return "Document contains basic text information."
+
+            # Take first few paragraphs for summary context
+            summary_text = ' '.join(paragraphs[:3])[:1000]
+
+            if self.model and self.tokenizer:
+                # Generate AI summary
+                prompt = f"""Summarize the following document in 2-3 sentences, focusing on the main points and key information:
+
+{summary_text}
+
+Summary:"""
+
+                try:
+                    inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
+                    if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
+                        inputs = {k: v.cuda() for k, v in inputs.items()}
+
+                    with torch.no_grad():
+                        outputs = self.model.generate(
+                            **inputs,
+                            max_new_tokens=100,
+                            temperature=0.7,
+                            do_sample=True,
+                            top_p=0.9,
+                            pad_token_id=self.tokenizer.pad_token_id
+                        )
+
+                    summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                    summary = summary.split("Summary:")[-1].strip()
+
+                    if len(summary) > 20:
+                        return summary
+
+                except Exception as e:
+                    print(f"Error generating AI summary: {e}")
 
+            # Fallback: Extract key sentences
+            sentences = re.split(r'[.!?]+', summary_text)
+            key_sentences = [s.strip() for s in sentences if len(s.strip()) > 30][:2]
+
+            return '. '.join(key_sentences) + '.' if key_sentences else "Document contains relevant information."
+
+        except Exception as e:
+            return "Document summary not available."
 
     def extract_text_from_file(self, file_path: str) -> str:
+        """Extract text from various file formats with better error handling"""
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
 
             return f"Error reading file: {str(e)}"
 
     def extract_from_pdf(self, file_path: str) -> str:
+        """Enhanced PDF extraction"""
         text = ""
         try:
             with open(file_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
+                for page_num, page in enumerate(pdf_reader.pages):
+                    page_text = page.extract_text()
+                    if page_text.strip():
+                        text += f"\n[Page {page_num + 1}]\n{page_text}\n"
         except Exception as e:
             text = f"Error reading PDF: {str(e)}"
         return text
 
     def extract_from_docx(self, file_path: str) -> str:
+        """Enhanced DOCX extraction"""
         try:
             doc = docx.Document(file_path)
             text = ""
             for paragraph in doc.paragraphs:
+                if paragraph.text.strip():
+                    text += paragraph.text + "\n"
             return text
         except Exception as e:
             return f"Error reading DOCX: {str(e)}"
 
     def extract_from_txt(self, file_path: str) -> str:
+        """Enhanced TXT extraction with encoding detection"""
+        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+
+        for encoding in encodings:
             try:
+                with open(file_path, 'r', encoding=encoding) as file:
                     return file.read()
+            except UnicodeDecodeError:
+                continue
+            except Exception as e:
+                return f"Error reading TXT: {str(e)}"
+
+        return "Error: Could not decode file with any supported encoding"
 
+    def intelligent_chunk_text(self, text: str, doc_type: str) -> List[Dict]:
+        """Intelligent chunking based on document type"""
         if not text.strip():
             return []
 
         chunks = []
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
 
+        if doc_type == 'research':
+            # For research papers, chunk by sections
+            current_chunk = ""
+            current_section = "introduction"
 
+            for line in lines:
+                line_lower = line.lower()
+
+                # Detect section headers
+                if any(header in line_lower for header in ['abstract', 'introduction', 'methodology', 'results', 'conclusion', 'references']):
+                    if current_chunk:
+                        chunks.append({
+                            'text': current_chunk.strip(),
+                            'section': current_section,
+                            'doc_type': doc_type
+                        })
                     current_chunk = line
+                    current_section = line_lower.split()[0] if line_lower.split() else "section"
+                else:
+                    current_chunk += "\n" + line
+
+                # Limit chunk size
+                if len(current_chunk.split()) > 200:
+                    chunks.append({
+                        'text': current_chunk.strip(),
+                        'section': current_section,
+                        'doc_type': doc_type
+                    })
+                    current_chunk = ""
+
+            if current_chunk:
+                chunks.append({
+                    'text': current_chunk.strip(),
+                    'section': current_section,
+                    'doc_type': doc_type
+                })
+
+        else:
+            # General intelligent chunking
+            current_chunk = ""
+            sentence_count = 0
+
+            for line in lines:
+                current_chunk += line + "\n"
+                sentence_count += len(re.findall(r'[.!?]+', line))
+
+                # Create chunk based on sentence count or word count
+                if sentence_count >= 5 or len(current_chunk.split()) > 150:
+                    chunks.append({
+                        'text': current_chunk.strip(),
+                        'section': 'content',
+                        'doc_type': doc_type
+                    })
+                    current_chunk = ""
+                    sentence_count = 0
+
+            if current_chunk:
+                chunks.append({
+                    'text': current_chunk.strip(),
+                    'section': 'content',
+                    'doc_type': doc_type
+                })
 
         return chunks
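
A standalone sketch of the general-chunking rule above: close a chunk once five sentence terminators (or roughly 150 words) have accumulated. The input lines are made up:

    import re

    lines = ["One. Two. Three.", "Four. Five.", "Six."]
    chunks, current, sentences = [], "", 0
    for line in lines:
        current += line + "\n"
        sentences += len(re.findall(r'[.!?]+', line))
        if sentences >= 5 or len(current.split()) > 150:
            chunks.append({'text': current.strip(), 'section': 'content', 'doc_type': 'general'})
            current, sentences = "", 0
    if current:
        chunks.append({'text': current.strip(), 'section': 'content', 'doc_type': 'general'})
    print(len(chunks))  # 2: the first two lines close a chunk, "Six." forms the tail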
     def process_documents(self, files) -> str:
+        """Enhanced document processing with intelligent analysis"""
         if not files:
             return "❌ No files uploaded!"
 
             if not all_text.strip():
                 return "❌ No text extracted from files!"
 
+            # Store raw text
             self.raw_text = all_text
 
+            # Detect document type
+            self.document_type = self.detect_document_type(all_text)
+
+            # Create document summary
+            self.document_summary = self.create_document_summary(all_text)
 
+            # Intelligent chunking
+            chunk_data = self.intelligent_chunk_text(all_text, self.document_type)
+
+            if not chunk_data:
                 return "❌ No valid text chunks created!"
 
+            self.documents = [chunk['text'] for chunk in chunk_data]
+            self.document_metadata = chunk_data
+
             # Create embeddings
             print(f"📄 Creating embeddings for {len(self.documents)} chunks...")
             embeddings = self.embedder.encode(self.documents, show_progress_bar=True)
 
             return f"✅ Successfully processed {len(processed_files)} files:\n" + \
                    f"📄 Files: {', '.join(processed_files)}\n" + \
+                   f"📊 Document Type: {self.document_type.title()}\n" + \
+                   f"🔍 Created {len(self.documents)} intelligent chunks\n" + \
+                   f"📝 Summary: {self.document_summary[:200]}...\n" + \
+                   f"🚀 Ready for smart Q&A!"
 
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
 
+    def smart_retrieve_context(self, query: str, k: int = 4) -> Tuple[str, List[Dict]]:
+        """Enhanced context retrieval with intelligent ranking"""
         if not self.is_indexed:
+            return "", []
 
         try:
             # Get query embedding
             faiss.normalize_L2(query_embedding)
 
             # Search for similar chunks
+            scores, indices = self.index.search(query_embedding.astype('float32'), min(k * 2, len(self.documents)))
 
+            # Analyze query intent
             query_lower = query.lower()
+            is_summary_request = any(word in query_lower for word in ['summary', 'summarize', 'overview', 'what is', 'about'])
+            is_specific_request = any(word in query_lower for word in ['how', 'why', 'when', 'where', 'which'])
+
+            relevant_chunks = []
 
             for i, idx in enumerate(indices[0]):
                 if idx < len(self.documents):
                     score = scores[0][i]
+                    chunk_data = self.document_metadata[idx]
+
+                    # Adjust scoring based on query type and document structure
+                    adjusted_score = score
 
+                    if is_summary_request:
+                        # Boost introductory sections for summary requests
+                        if chunk_data['section'] in ['introduction', 'abstract', 'content']:
+                            adjusted_score += 0.1
+
+                    if adjusted_score > 0.15:  # Threshold for relevance
+                        relevant_chunks.append({
+                            'text': self.documents[idx],
+                            'score': adjusted_score,
+                            'metadata': chunk_data
+                        })
+
+            # Sort by adjusted score
+            relevant_chunks.sort(key=lambda x: x['score'], reverse=True)
+
+            # Take top chunks
+            top_chunks = relevant_chunks[:k]
+            context = "\n\n".join([chunk['text'] for chunk in top_chunks])
+
+            return context, top_chunks
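
An illustrative call pattern for the retriever above, assuming documents have already been processed and indexed; the names follow this file:

    context, top_chunks = rag_system.smart_retrieve_context("What are the key findings?", k=4)
    for chunk in top_chunks:
        print(round(chunk['score'], 3), chunk['metadata']['section'])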
         except Exception as e:
             print(f"Error in retrieval: {e}")
+            return "", []
 
+    def generate_smart_answer(self, query: str, context: str, chunks_data: List[Dict]) -> str:
+        """Generate intelligent answers based on query type and context"""
+        if not context:
+            return "No relevant information found in the documents."
 
+        query_lower = query.lower()
+
+        # Determine answer type
+        is_summary_request = any(word in query_lower for word in ['summary', 'summarize', 'overview', 'what is', 'about'])
+        is_comparison_request = any(word in query_lower for word in ['compare', 'difference', 'versus', 'vs'])
+        is_specific_question = any(word in query_lower for word in ['how', 'why', 'when', 'where', 'which'])
+
+        if self.model and self.tokenizer:
+            try:
+                # Create intelligent prompt based on query type
+                if is_summary_request:
+                    prompt = self.create_summary_prompt(query, context)
+                elif is_comparison_request:
+                    prompt = self.create_comparison_prompt(query, context)
+                else:
+                    prompt = self.create_general_prompt(query, context)
+
+                # Generate response
+                inputs = self.tokenizer(
+                    prompt,
+                    return_tensors="pt",
+                    max_length=800,
+                    truncation=True,
+                    padding=True
+                )
+
+                if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
+                    inputs = {k: v.cuda() for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_new_tokens=150,
+                        temperature=0.3,
+                        do_sample=True,
+                        top_p=0.9,
+                        repetition_penalty=1.1,
+                        pad_token_id=self.tokenizer.pad_token_id,
+                        eos_token_id=self.tokenizer.eos_token_id
+                    )
+
+                # Extract and clean answer
+                full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                answer = self.extract_answer_from_response(full_response, prompt)
+
+                if answer and len(answer) > 20:
+                    return self.clean_and_validate_answer(answer)
+
+            except Exception as e:
+                print(f"Error in AI generation: {e}")
+
+        # Fallback to intelligent context-based answering
+        return self.context_based_smart_answer(query, context, chunks_data)
+
+    def create_summary_prompt(self, query: str, context: str) -> str:
+        """Create prompt for summary requests"""
+        return f"""Based on the document content below, provide a comprehensive summary addressing the question.
 
+Document Content:
+{context[:1000]}
 
 Question: {query}
 
+Provide a clear, informative summary that addresses the question:"""
+
+    def create_comparison_prompt(self, query: str, context: str) -> str:
+        """Create prompt for comparison requests"""
+        return f"""Analyze the document content and provide a comparison as requested.
+
+Document Content:
+{context[:1000]}
 
 Question: {query}
 
+Provide a detailed comparison based on the information:"""
 
+    def create_general_prompt(self, query: str, context: str) -> str:
+        """Create prompt for general questions"""
+        return f"""Answer the question based on the document content provided.
+
+Document Content:
+{context[:1000]}
+
+Question: {query}
+
+Provide a specific, accurate answer:"""
+
+    def extract_answer_from_response(self, response: str, prompt: str) -> str:
+        """Extract clean answer from model response"""
+        # Remove the prompt part
+        if prompt in response:
+            answer = response.replace(prompt, "").strip()
+        else:
+            # Try to find the answer after common patterns
+            patterns = ["Answer:", "Summary:", "Response:", "answer:", "summary:", "response:"]
+            answer = response
+            for pattern in patterns:
+                if pattern in response:
+                    answer = response.split(pattern)[-1].strip()
+                    break
+
+        return answer
+
+    def context_based_smart_answer(self, query: str, context: str, chunks_data: List[Dict]) -> str:
+        """Intelligent context-based answering as fallback"""
+        query_lower = query.lower()
+
+        # For summary requests
+        if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
+            return self.create_context_summary(context, chunks_data)
+
+        # For specific questions, find most relevant sentences
+        context_sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 20]
+        query_words = set(query_lower.split())
+
+        # Score sentences by relevance
+        scored_sentences = []
+        for sentence in context_sentences:
+            sentence_words = set(sentence.lower().split())
+            overlap = len(query_words.intersection(sentence_words))
+            if overlap > 0:
+                scored_sentences.append((sentence, overlap))
+
+        # Sort by relevance and combine top sentences
+        scored_sentences.sort(key=lambda x: x[1], reverse=True)
+
+        if scored_sentences:
+            top_sentences = [s[0] for s in scored_sentences[:3]]
+            return '. '.join(top_sentences) + '.'
 
+        return "I found relevant information but couldn't extract a specific answer. Please try rephrasing your question."
+
+    def create_context_summary(self, context: str, chunks_data: List[Dict]) -> str:
+        """Create summary from context"""
+        # Get key sentences from different sections
+        sentences_by_section = {}
+
+        for chunk in chunks_data:
+            section = chunk['metadata']['section']
+            sentences = [s.strip() for s in chunk['text'].split('.') if len(s.strip()) > 30]
+            if sentences:
+                if section not in sentences_by_section:
+                    sentences_by_section[section] = []
+                sentences_by_section[section].extend(sentences[:2])  # Top 2 sentences per section
+
+        # Combine sentences from different sections
+        summary_parts = []
+        for section, sentences in sentences_by_section.items():
+            if sentences:
+                summary_parts.extend(sentences[:1])  # One sentence per section
+
+        if summary_parts:
+            return '. '.join(summary_parts[:4]) + '.'  # Max 4 sentences
+
+        return self.document_summary if self.document_summary else "Document contains relevant information on the requested topic."
+
+    def clean_and_validate_answer(self, answer: str) -> str:
+        """Clean and validate the generated answer"""
         # Remove unwanted patterns
         answer = re.sub(r'--- \w+.*? ---', '', answer)
+        answer = re.sub(r'\[Page \d+\]', '', answer)
 
+        # Clean up whitespace and formatting
         answer = ' '.join(answer.split())
 
+        # Ensure proper sentence structure
+        if answer and not answer.endswith(('.', '!', '?')):
+            answer += '.'
 
         return answer.strip()
 
     def answer_question(self, query: str) -> str:
+        """Main function to answer questions intelligently"""
         if not query.strip():
             return "❓ Please ask a question!"
 
             return "📝 Please upload and process documents first!"
 
         try:
+            # Special handling for document-level questions
+            query_lower = query.lower()
+            if query_lower in ['summary', 'summarize this document', 'what is this about']:
+                return f"📄 Document Summary:\n\n{self.document_summary}"
+
+            # Retrieve relevant context with intelligence
+            context, chunks_data = self.smart_retrieve_context(query, k=4)
 
             if not context:
+                return "🔍 No relevant information found for your question. Try rephrasing or asking about different aspects of the document."
 
+            # Generate intelligent answer
+            answer = self.generate_smart_answer(query, context, chunks_data)
 
+            return answer if answer else "I couldn't generate a specific answer. Please try asking in a different way."
 
         except Exception as e:
+            return f"❌ Error processing question: {str(e)}"
 
+# Initialize the enhanced RAG system
+print("Initializing Smart Document RAG System...")
+rag_system = SmartDocumentRAG()
 
+# Enhanced Gradio Interface
 def create_interface():
+    with gr.Blocks(title="🧠 Smart Document Q&A", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
+        # 🧠 Smart Document Q&A System
 
+        Upload documents and get intelligent answers with summaries and insights!
 
+        **Features:**
+        - 🎯 Intelligent document type detection
+        - 📊 Smart summarization
+        - 🔍 Context-aware answers
+        - 📚 Multi-format support (PDF, DOCX, TXT)
         """)
 
+        with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
                     file_upload = gr.File(
+                        label="📁 Upload Documents",
                         file_count="multiple",
+                        file_types=[".pdf", ".docx", ".txt"],
+                        height=150
                     )
+                    process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
 
                 with gr.Column():
                     process_status = gr.Textbox(
+                        label="📋 Processing Status & Document Analysis",
+                        lines=10,
                         interactive=False
                     )
 
                 outputs=[process_status]
             )
 
+        with gr.Tab("❓ Smart Q&A"):
             with gr.Row():
                 with gr.Column():
                     question_input = gr.Textbox(
+                        label="🤔 Ask Anything",
+                        placeholder="What is this document about? / Summarize the main points / How does X work?",
                         lines=3
                     )
+
+                    with gr.Row():
+                        ask_btn = gr.Button("🧠 Get Smart Answer", variant="primary")
+                        summary_btn = gr.Button("📊 Get Summary", variant="secondary")
 
                 with gr.Column():
                     answer_output = gr.Textbox(
+                        label="💡 Smart Answer",
+                        lines=8,
                         interactive=False
                     )
 
                 outputs=[answer_output]
             )
 
+            summary_btn.click(
+                fn=lambda: rag_system.answer_question("summary"),
+                inputs=[],
+                outputs=[answer_output]
+            )
+
+            # Enhanced example questions
+            gr.Markdown("""
+            ### 💡 Smart Question Examples:
+
+            **📊 For Summaries:**
+            - "What is this document about?"
+            - "Summarize the main points"
+            - "Give me an overview"
+
+            **🔍 For Specific Information:**
+            - "How does [topic] work?"
+            - "What are the key findings?"
+            - "Explain [concept] from the document"
+
+            **🎯 For Analysis:**
+            - "What are the pros and cons?"
+            - "Compare [A] and [B]"
+            - "What conclusions can be drawn?"
+            """)
+
+        with gr.Tab("ℹ️ Tips"):
             gr.Markdown("""
+            ### 🚀 How to Get the Best Results:
+
+            **📄 Document Types Supported:**
+            - Research papers & academic documents
+            - Business reports & presentations
+            - Technical documentation
+            - Legal documents
+            - General text documents
+
+            **❓ Question Tips:**
+            - Be specific about what you want to know
+            - Use "summarize" or "overview" for general summaries
+            - Ask "how", "why", "what" for detailed explanations
+            - Request comparisons with "compare" or "difference"
+
+            **🎯 Best Practices:**
+            - Upload clear, well-formatted documents
+            - Ask one question at a time for focused answers
+            - Try rephrasing if the first answer isn't what you expected
             """)
 
     return demo
 
+# Launch the enhanced app
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(