pradeepsengarr committed
Commit 38c113a · verified · 1 Parent(s): 8b78b3b

Update app.py

Files changed (1)
  1. app.py +94 -31
app.py CHANGED
@@ -96,8 +96,50 @@ class DocumentRAG:
         if not context:
             return "No relevant information found in the documents."
 
-        # Improved keyword matching approach
-        query_words = set(query.lower().split())
+        query_lower = query.lower()
+
+        # Handle "who is" questions specifically
+        if "who is" in query_lower:
+            # Extract name from query
+            name_part = query_lower.replace("who is", "").strip()
+
+            # Look for professional information in context
+            lines = context.split('\n')
+            name_info = []
+            professional_info = []
+
+            for line in lines:
+                line = line.strip()
+                if not line or line.startswith('---'):
+                    continue
+
+                line_lower = line.lower()
+
+                # Look for job titles, companies, roles
+                if any(keyword in line_lower for keyword in [
+                    'scientist', 'engineer', 'analyst', 'developer', 'manager',
+                    'consultant', 'specialist', 'coordinator', 'associate', 'intern',
+                    'at ', 'working at', 'employed', 'position', 'role'
+                ]):
+                    professional_info.append(line)
+
+                # Look for name and basic info
+                elif any(keyword in line_lower for keyword in [
+                    'name', 'email', 'phone', 'linkedin', 'github', 'experience'
+                ]):
+                    name_info.append(line)
+
+            # Construct answer
+            if professional_info:
+                answer = f"Based on the resume, {name_part} is " + professional_info[0]
+                if len(professional_info) > 1:
+                    answer += f" and also {professional_info[1]}"
+                return answer
+            elif name_info:
+                return f"The document shows information about {name_part}: " + "; ".join(name_info[:2])
+
+        # For other questions, use improved keyword matching
+        query_words = set(query_lower.split())
         context_sentences = context.split('.')
 
         # Find sentences that contain query keywords
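The "who is" branch added above is a pure string heuristic, so it can be exercised without loading any model. A minimal standalone sketch, with an invented resume snippet and a trimmed keyword list:

    # Standalone sketch of the heuristic above; the resume text is made up.
    context = "Jane Doe\nData Scientist at Acme Corp\nEmail: jane@example.com"
    query_lower = "who is jane doe"
    name_part = query_lower.replace("who is", "").strip()

    professional_info = [
        line.strip() for line in context.split('\n')
        if any(k in line.lower() for k in ['scientist', 'engineer', 'at '])
    ]
    if professional_info:
        print(f"Based on the resume, {name_part} is " + professional_info[0])
    # -> Based on the resume, jane doe is Data Scientist at Acme Corp

Note that name_part is derived from query_lower, so the name is rendered in lowercase in the final answer.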
@@ -115,7 +157,7 @@ class DocumentRAG:
 
         if relevant_sentences:
             # Return the most relevant sentences
-            return '. '.join(relevant_sentences[:3]) + '.'
+            return '. '.join(relevant_sentences[:2]) + '.'
         else:
             # If no exact matches, return first few sentences of context
             first_sentences = context_sentences[:2]
@@ -303,25 +345,27 @@ class DocumentRAG:
         is_mistral = 'mistral' in model_name
 
         if is_mistral:
-            # Improved prompt for Mistral - more flexible
-            prompt = f"""<s>[INST] You are a helpful document assistant. Answer the question based on the provided context. If the exact answer isn't in the context, provide the most relevant information available.
+            # Improved prompt for Mistral with specific instructions
+            prompt = f"""<s>[INST] You are a helpful assistant that answers questions about people based on their resume/document information.
+
+Answer the question clearly and concisely. For "who is" questions, provide a brief professional summary.
 
-Context:
+Context from document:
 {context[:1500]}
 
 Question: {query}
 
-Please provide a helpful answer based on the available information. [/INST]"""
+Provide a clear, direct answer in 1-2 sentences. [/INST]"""
         else:
             # Improved prompt for fallback models
-            prompt = f"""Based on the following information, please answer the question:
+            prompt = f"""Answer the question about the person based on their resume information:
 
-Context:
+Resume Information:
 {context[:1000]}
 
 Question: {query}
 
-Answer:"""
+Answer (be direct and concise):"""
 
         # Tokenize with proper handling
         inputs = self.tokenizer(
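The hand-rolled <s>[INST] ... [/INST] wrapper matches Mistral's instruct chat format. As a sketch of an alternative (not what this commit does), a transformers tokenizer that ships a chat template can render the same markers from a message list; the checkpoint name below is only an example:

    from transformers import AutoTokenizer

    # Example Mistral-instruct checkpoint; assumes its tokenizer ships a chat template.
    tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    messages = [{"role": "user", "content": "Context from document:\n...\n\nQuestion: ..."}]
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered string should carry the same [INST] ... [/INST] markers as the manual template.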
@@ -336,17 +380,17 @@ Answer:"""
         if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
             inputs = {k: v.cuda() for k, v in inputs.items()}
 
-        # Generate with more flexible parameters
+        # Generate with more focused parameters
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_new_tokens=150,
-                temperature=0.3,  # Slightly higher for more natural responses
+                max_new_tokens=100,  # Shorter for more focused answers
+                temperature=0.2,  # Lower for more deterministic responses
                 do_sample=True,
-                top_p=0.9,
-                num_beams=2,
+                top_p=0.8,
+                num_beams=3,
                 early_stopping=True,
-                repetition_penalty=1.1,
+                repetition_penalty=1.2,
                 pad_token_id=self.tokenizer.pad_token_id,
                 eos_token_id=self.tokenizer.eos_token_id
             )
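One interaction to be aware of: in transformers, do_sample=True combined with num_beams=3 selects beam-search multinomial sampling, and early_stopping only takes effect because num_beams > 1. If plain nucleus sampling is what's intended, a sketch of the call without the beam-only flags (same objects as in the surrounding code):

    # Sketch: pure nucleus sampling, no beam search; beam-only flags dropped.
    outputs = self.model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.2,
        top_p=0.8,
        repetition_penalty=1.2,
        pad_token_id=self.tokenizer.pad_token_id,
        eos_token_id=self.tokenizer.eos_token_id
    )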
@@ -359,7 +403,9 @@ Answer:"""
             answer = full_response.split("[/INST]")[-1].strip()
         else:
             # For other models, remove the prompt
-            if "Answer:" in full_response:
+            if "Answer (be direct and concise):" in full_response:
+                answer = full_response.split("Answer (be direct and concise):")[-1].strip()
+            elif "Answer:" in full_response:
                 answer = full_response.split("Answer:")[-1].strip()
             else:
                 answer = full_response[len(prompt):].strip()
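The marker-based extraction relies on the decoded text still containing the prompt, which holds for decoder-only models whose generate() output includes the input tokens. A marker-free sketch is to slice the prompt off by token count instead (inputs/outputs as in the surrounding code):

    # Sketch: drop the prompt by token count rather than by string matching.
    prompt_len = inputs["input_ids"].shape[1]
    answer = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()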
@@ -374,28 +420,45 @@ Answer:"""
             return self.simple_context_answer(query, context)
 
     def clean_answer(self, answer: str) -> str:
-        """Clean up the generated answer"""
+        """Clean up the generated answer with better formatting"""
         if not answer or len(answer) < 5:
             return ""
 
-        # Remove obvious problematic patterns
-        lines = answer.split('\n')
-        cleaned_lines = []
+        # Remove file markers and cleanup
+        answer = answer.replace('--- ', '').replace(' ---', '')
+        answer = answer.replace('.pdf', '').replace('.docx', '').replace('.txt', '')
 
-        for line in lines:
-            line = line.strip()
-            if line and not any(pattern in line.lower() for pattern in [
+        # Split into sentences and clean each
+        sentences = answer.split('.')
+        cleaned_sentences = []
+
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+
+            # Skip problematic patterns
+            if any(pattern in sentence.lower() for pattern in [
                 'what are you doing', 'what do you think', 'how are you',
-                'i am an ai', 'i cannot', 'i don\'t know'
+                'i am an ai', 'i cannot', 'i don\'t know', 'linkedin: www',
+                'github:', 'email:', 'mobile:', '+91-'
             ]):
-                cleaned_lines.append(line)
+                continue
+
+            # Clean up common formatting issues
+            sentence = sentence.replace('  ', ' ')
+            if sentence and len(sentence) > 3:
+                cleaned_sentences.append(sentence)
 
-        cleaned_answer = ' '.join(cleaned_lines)
+        if not cleaned_sentences:
+            return ""
 
-        # Limit length to prevent rambling
-        if len(cleaned_answer) > 500:
-            sentences = cleaned_answer.split('.')
-            cleaned_answer = '. '.join(sentences[:3]) + '.'
+        # Reconstruct answer
+        cleaned_answer = '. '.join(cleaned_sentences[:2])  # Limit to 2 sentences
 
+        # Add period if missing
+        if cleaned_answer and not cleaned_answer.endswith('.'):
+            cleaned_answer += '.'
 
         return cleaned_answer.strip()
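End to end, the rewritten clean_answer strips contact-detail and chatty sentences and caps the result at two sentences. A quick sketch with an invented input, where rag stands for a hypothetical DocumentRAG instance:

    raw = "Jane is a Data Scientist at Acme. Email: jane@x.com. She leads the ML team. What do you think?"
    print(rag.clean_answer(raw))  # rag: hypothetical DocumentRAG instance
    # -> Jane is a Data Scientist at Acme. She leads the ML team.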