Update app.py
app.py
CHANGED
@@ -96,8 +96,50 @@ class DocumentRAG:
         if not context:
             return "No relevant information found in the documents."
 
-
-
+        query_lower = query.lower()
+
+        # Handle "who is" questions specifically
+        if "who is" in query_lower:
+            # Extract name from query
+            name_part = query_lower.replace("who is", "").strip()
+
+            # Look for professional information in context
+            lines = context.split('\n')
+            name_info = []
+            professional_info = []
+
+            for line in lines:
+                line = line.strip()
+                if not line or line.startswith('---'):
+                    continue
+
+                line_lower = line.lower()
+
+                # Look for job titles, companies, roles
+                if any(keyword in line_lower for keyword in [
+                    'scientist', 'engineer', 'analyst', 'developer', 'manager',
+                    'consultant', 'specialist', 'coordinator', 'associate', 'intern',
+                    'at ', 'working at', 'employed', 'position', 'role'
+                ]):
+                    professional_info.append(line)
+
+                # Look for name and basic info
+                elif any(keyword in line_lower for keyword in [
+                    'name', 'email', 'phone', 'linkedin', 'github', 'experience'
+                ]):
+                    name_info.append(line)
+
+            # Construct answer
+            if professional_info:
+                answer = f"Based on the resume, {name_part} is " + professional_info[0]
+                if len(professional_info) > 1:
+                    answer += f" and also {professional_info[1]}"
+                return answer
+            elif name_info:
+                return f"The document shows information about {name_part}: " + "; ".join(name_info[:2])
+
+        # For other questions, use improved keyword matching
+        query_words = set(query_lower.split())
         context_sentences = context.split('.')
 
         # Find sentences that contain query keywords
@@ -115,7 +157,7 @@ class DocumentRAG:
 
         if relevant_sentences:
             # Return the most relevant sentences
-            return '. '.join(relevant_sentences[:
+            return '. '.join(relevant_sentences[:2]) + '.'
         else:
             # If no exact matches, return first few sentences of context
             first_sentences = context_sentences[:2]
@@ -303,25 +345,27 @@ class DocumentRAG:
         is_mistral = 'mistral' in model_name
 
         if is_mistral:
-            # Improved prompt for Mistral
-            prompt = f"""<s>[INST] You are a helpful
+            # Improved prompt for Mistral with specific instructions
+            prompt = f"""<s>[INST] You are a helpful assistant that answers questions about people based on their resume/document information.
+
+Answer the question clearly and concisely. For "who is" questions, provide a brief professional summary.
 
-Context:
+Context from document:
 {context[:1500]}
 
 Question: {query}
 
-
+Provide a clear, direct answer in 1-2 sentences. [/INST]"""
         else:
             # Improved prompt for fallback models
-            prompt = f"""
+            prompt = f"""Answer the question about the person based on their resume information:
 
-
+Resume Information:
 {context[:1000]}
 
 Question: {query}
 
-Answer:"""
+Answer (be direct and concise):"""
 
         # Tokenize with proper handling
         inputs = self.tokenizer(
@@ -336,17 +380,17 @@ Answer:"""
         if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
             inputs = {k: v.cuda() for k, v in inputs.items()}
 
-        # Generate with more
+        # Generate with more focused parameters
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_new_tokens=
-                temperature=0.
+                max_new_tokens=100,  # Shorter for more focused answers
+                temperature=0.2,  # Lower for more deterministic responses
                 do_sample=True,
-                top_p=0.
-                num_beams=
+                top_p=0.8,
+                num_beams=3,
                 early_stopping=True,
-                repetition_penalty=1.
+                repetition_penalty=1.2,
                 pad_token_id=self.tokenizer.pad_token_id,
                 eos_token_id=self.tokenizer.eos_token_id
             )
@@ -359,7 +403,9 @@ Answer:"""
             answer = full_response.split("[/INST]")[-1].strip()
         else:
             # For other models, remove the prompt
-            if "Answer:" in full_response:
+            if "Answer (be direct and concise):" in full_response:
+                answer = full_response.split("Answer (be direct and concise):")[-1].strip()
+            elif "Answer:" in full_response:
                 answer = full_response.split("Answer:")[-1].strip()
             else:
                 answer = full_response[len(prompt):].strip()
@@ -374,28 +420,45 @@ Answer:"""
         return self.simple_context_answer(query, context)
 
     def clean_answer(self, answer: str) -> str:
-        """Clean up the generated answer"""
+        """Clean up the generated answer with better formatting"""
         if not answer or len(answer) < 5:
             return ""
 
-        # Remove
-
-
+        # Remove file markers and cleanup
+        answer = answer.replace('--- ', '').replace(' ---', '')
+        answer = answer.replace('.pdf', '').replace('.docx', '').replace('.txt', '')
 
-
-
-
+        # Split into sentences and clean each
+        sentences = answer.split('.')
+        cleaned_sentences = []
+
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+
+            # Skip problematic patterns
+            if any(pattern in sentence.lower() for pattern in [
                 'what are you doing', 'what do you think', 'how are you',
-                'i am an ai', 'i cannot', 'i don\'t know'
+                'i am an ai', 'i cannot', 'i don\'t know', 'linkedin: www',
+                'github:', 'email:', 'mobile:', '+91-'
             ]):
-
+                continue
+
+            # Clean up common formatting issues
+            sentence = sentence.replace('  ', ' ')
+            if sentence and len(sentence) > 3:
+                cleaned_sentences.append(sentence)
 
-
+        if not cleaned_sentences:
+            return ""
+
+        # Reconstruct answer
+        cleaned_answer = '. '.join(cleaned_sentences[:2])  # Limit to 2 sentences
 
-        #
-        if
-
-        cleaned_answer = '. '.join(sentences[:3]) + '.'
+        # Add period if missing
+        if cleaned_answer and not cleaned_answer.endswith('.'):
+            cleaned_answer += '.'
 
         return cleaned_answer.strip()
 
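The "who is" branch added in the first hunk is a pure string heuristic, so it can be exercised without loading a model. Below is a minimal standalone sketch of the same logic with a shortened keyword list; the function name and the sample resume text are invented for illustration and are not part of app.py. Note that name_part keeps any trailing punctuation from the query, since .strip() only removes whitespace.

from typing import Optional

def who_is_answer(query: str, context: str) -> Optional[str]:
    # Mirrors the heuristic in simple_context_answer (shortened keyword list).
    query_lower = query.lower()
    if "who is" not in query_lower:
        return None
    name_part = query_lower.replace("who is", "").strip()

    professional_info = []
    for line in context.split('\n'):
        line = line.strip()
        if not line or line.startswith('---'):
            continue
        if any(k in line.lower() for k in ['scientist', 'engineer', 'developer', 'at ']):
            professional_info.append(line)

    if professional_info:
        answer = f"Based on the resume, {name_part} is " + professional_info[0]
        if len(professional_info) > 1:
            answer += f" and also {professional_info[1]}"
        return answer
    return None

sample = "Jane Doe\nData Scientist at Acme Corp\nEmail: jane@example.com"
print(who_is_answer("Who is Jane Doe", sample))
# Based on the resume, jane doe is Data Scientist at Acme Corp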
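The fallback prompt and the answer extraction in the later hunk work as a pair: a causal LM echoes the prompt in its output, so the answer is recovered by splitting on the exact marker the prompt ends with. A minimal sketch of that round trip, with a fabricated full_response standing in for model output:

# Sketch of the prompt-marker round trip used by the fallback path.
context = "Jane Doe. Data Scientist at Acme Corp."
query = "Who is Jane Doe?"

prompt = f"""Answer the question about the person based on their resume information:

Resume Information:
{context[:1000]}

Question: {query}

Answer (be direct and concise):"""

# full_response is fabricated here; a real causal LM would echo the prompt.
full_response = prompt + " Jane Doe is a Data Scientist at Acme Corp."

marker = "Answer (be direct and concise):"
if marker in full_response:
    answer = full_response.split(marker)[-1].strip()
else:
    answer = full_response[len(prompt):].strip()

print(answer)  # Jane Doe is a Data Scientist at Acme Corp.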
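On the generation settings: in transformers, passing do_sample=True together with num_beams=3 selects beam-search multinomial sampling, and early_stopping then controls when the beam search halts; with temperature 0.2 and top_p 0.8 the sampling is close to greedy. A minimal runnable sketch of the same call, using distilgpt2 as an illustrative stand-in for the app's model:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

inputs = tokenizer("Question: Who is Jane Doe?\nAnswer:", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,   # shorter, more focused answers
        temperature=0.2,      # low temperature: near-deterministic
        do_sample=True,
        top_p=0.8,
        num_beams=3,          # with do_sample=True this is beam sampling
        early_stopping=True,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))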