Spaces:

pradeepsengarr
/

Custom_Rag_Bot

Running

App Files Files Community

pradeepsengarr committed on Jun 7

Commit

d7bf74b

verified ·

1 Parent(s): 38c113a

Update app.py

Browse files

Files changed (1) hide show

app.py +246 -173

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import PyPDF2
 import docx
 import io
 import os
 from typing import List, Optional
 class DocumentRAG:
@@ -25,6 +26,7 @@ class DocumentRAG:
         self.documents = []
         self.index = None
         self.is_indexed = False
     def setup_llm(self):
         """Setup quantized Mistral model"""
@@ -91,79 +93,142 @@ class DocumentRAG:
             self.tokenizer = None
             print("⚠️ Using context-only mode (no text generation)")
     def simple_context_answer(self, query: str, context: str) -> str:
-        """Improved context-based answering when model is not available"""
         if not context:
             return "No relevant information found in the documents."
         query_lower = query.lower()
         # Handle "who is" questions specifically
         if "who is" in query_lower:
-            # Extract name from query
-            name_part = query_lower.replace("who is", "").strip()
-            # Look for professional information in context
-            lines = context.split('\n')
-            name_info = []
-            professional_info = []
-            for line in lines:
-                line = line.strip()
-                if not line or line.startswith('---'):
-                    continue
-                line_lower = line.lower()
-                # Look for job titles, companies, roles
-                if any(keyword in line_lower for keyword in [
-                    'scientist', 'engineer', 'analyst', 'developer', 'manager',
-                    'consultant', 'specialist', 'coordinator', 'associate', 'intern',
-                    'at ', 'working at', 'employed', 'position', 'role'
-                ]):
-                    professional_info.append(line)
-                # Look for name and basic info
-                elif any(keyword in line_lower for keyword in [
-                    'name', 'email', 'phone', 'linkedin', 'github', 'experience'
-                ]):
-                    name_info.append(line)
-            # Construct answer
-            if professional_info:
-                answer = f"Based on the resume, {name_part} is " + professional_info[0]
-                if len(professional_info) > 1:
-                    answer += f" and also {professional_info[1]}"
-                return answer
-            elif name_info:
-                return f"The document shows information about {name_part}: " + "; ".join(name_info[:2])
-        # For other questions, use improved keyword matching
         query_words = set(query_lower.split())
-        context_sentences = context.split('.')
-        # Find sentences that contain query keywords
-        relevant_sentences = []
         for sentence in context_sentences:
-            sentence = sentence.strip()
-            if len(sentence) < 10:  # Skip very short sentences
                 continue
             sentence_words = set(sentence.lower().split())
-            # Check if sentence contains query keywords
-            common_words = query_words.intersection(sentence_words)
-            if len(common_words) >= 1:  # Lowered threshold
-                relevant_sentences.append(sentence)
-        if relevant_sentences:
-            # Return the most relevant sentences
-            return '. '.join(relevant_sentences[:2]) + '.'
-        else:
-            # If no exact matches, return first few sentences of context
-            first_sentences = context_sentences[:2]
-            if first_sentences:
-                return '. '.join([s.strip() for s in first_sentences if s.strip()]) + '.'
-            return "Based on the document content, I found some information but cannot provide a specific answer to your question."
     def extract_text_from_file(self, file_path: str) -> str:
         """Extract text from various file formats"""
@@ -217,36 +282,61 @@ class DocumentRAG:
             except Exception as e2:
                 return f"Error reading TXT: {str(e2)}"
-    def chunk_text(self, text: str, chunk_size: int = 200, overlap: int = 30) -> List[str]:
-        """Split text into overlapping chunks with better sentence preservation"""
         if not text.strip():
             return []
-        # Split by sentences first, then group into chunks
-        sentences = text.replace('\n', ' ').split('. ')
         chunks = []
         current_chunk = ""
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if not sentence:
                 continue
-            # Add sentence to current chunk
-            test_chunk = current_chunk + ". " + sentence if current_chunk else sentence
-            # If chunk gets too long, save it and start new one
-            if len(test_chunk.split()) > chunk_size:
-                if current_chunk:
-                    chunks.append(current_chunk.strip())
-                current_chunk = sentence
             else:
-                current_chunk = test_chunk
         # Add the last chunk
         if current_chunk:
             chunks.append(current_chunk.strip())
         return chunks
     def process_documents(self, files) -> str:
@@ -273,8 +363,11 @@ class DocumentRAG:
             if not all_text.strip():
                 return "❌ No text extracted from files!"
-            # Chunk the text
-            self.documents = self.chunk_text(all_text)
             if not self.documents:
                 return "❌ No valid text chunks created!"
@@ -301,8 +394,8 @@ class DocumentRAG:
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
-    def retrieve_context(self, query: str, k: int = 5) -> str:
-        """Retrieve relevant context for the query with improved retrieval"""
         if not self.is_indexed:
             return ""
@@ -312,23 +405,33 @@ class DocumentRAG:
             faiss.normalize_L2(query_embedding)
             # Search for similar chunks
-            scores, indices = self.index.search(query_embedding.astype('float32'), k)
-            # Get relevant documents with MUCH LOWER threshold
             relevant_docs = []
-            for i, idx in enumerate(indices[0]):
-                if idx < len(self.documents) and scores[0][i] > 0.05:  # Much lower threshold
-                    relevant_docs.append(self.documents[idx])
-            # If no high-similarity matches, take the top results anyway
-            if not relevant_docs:
-                for i, idx in enumerate(indices[0]):
-                    if idx < len(self.documents):
-                        relevant_docs.append(self.documents[idx])
-                        if len(relevant_docs) >= 3:  # Take at least 3 chunks
-                            break
-            return "\n\n".join(relevant_docs)
         except Exception as e:
             print(f"Error in retrieval: {e}")
@@ -345,33 +448,27 @@ class DocumentRAG:
             is_mistral = 'mistral' in model_name
             if is_mistral:
-                # Improved prompt for Mistral with specific instructions
-                prompt = f"""<s>[INST] You are a helpful assistant that answers questions about people based on their resume/document information.
-Answer the question clearly and concisely. For "who is" questions, provide a brief professional summary.
-Context from document:
-{context[:1500]}
 Question: {query}
-Provide a clear, direct answer in 1-2 sentences. [/INST]"""
             else:
-                # Improved prompt for fallback models
-                prompt = f"""Answer the question about the person based on their resume information:
-Resume Information:
-{context[:1000]}
 Question: {query}
-Answer (be direct and concise):"""
-            # Tokenize with proper handling
             inputs = self.tokenizer(
                 prompt,
                 return_tensors="pt",
-                max_length=800,
                 truncation=True,
                 padding=True
             )
@@ -380,17 +477,16 @@ Answer (be direct and concise):"""
             if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
                 inputs = {k: v.cuda() for k, v in inputs.items()}
-            # Generate with more focused parameters
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
-                    max_new_tokens=100,   # Shorter for more focused answers
-                    temperature=0.2,      # Lower for more deterministic responses
                     do_sample=True,
-                    top_p=0.8,
-                    num_beams=3,
                     early_stopping=True,
-                    repetition_penalty=1.2,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id
                 )
@@ -398,72 +494,50 @@ Answer (be direct and concise):"""
             # Decode response
             full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # Extract answer based on model type
             if is_mistral and "[/INST]" in full_response:
                 answer = full_response.split("[/INST]")[-1].strip()
             else:
-                # For other models, remove the prompt
-                if "Answer (be direct and concise):" in full_response:
-                    answer = full_response.split("Answer (be direct and concise):")[-1].strip()
-                elif "Answer:" in full_response:
-                    answer = full_response.split("Answer:")[-1].strip()
-                else:
-                    answer = full_response[len(prompt):].strip()
-            # Clean up the answer
             answer = self.clean_answer(answer)
-            return answer if answer else self.simple_context_answer(query, context)
         except Exception as e:
             print(f"Error in generation: {e}")
             return self.simple_context_answer(query, context)
     def clean_answer(self, answer: str) -> str:
-        """Clean up the generated answer with better formatting"""
         if not answer or len(answer) < 5:
             return ""
-        # Remove file markers and cleanup
-        answer = answer.replace('--- ', '').replace(' ---', '')
-        answer = answer.replace('.pdf', '').replace('.docx', '').replace('.txt', '')
-        # Split into sentences and clean each
-        sentences = answer.split('.')
-        cleaned_sentences = []
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if not sentence:
-                continue
-            # Skip problematic patterns
-            if any(pattern in sentence.lower() for pattern in [
-                'what are you doing', 'what do you think', 'how are you',
-                'i am an ai', 'i cannot', 'i don\'t know', 'linkedin: www',
-                'github:', 'email:', 'mobile:', '+91-'
-            ]):
-                continue
-            # Clean up common formatting issues
-            sentence = sentence.replace('  ', ' ')
-            if sentence and len(sentence) > 3:
-                cleaned_sentences.append(sentence)
-        if not cleaned_sentences:
-            return ""
-        # Reconstruct answer
-        cleaned_answer = '. '.join(cleaned_sentences[:2])  # Limit to 2 sentences
-        # Add period if missing
-        if cleaned_answer and not cleaned_answer.endswith('.'):
-            cleaned_answer += '.'
-        return cleaned_answer.strip()
     def answer_question(self, query: str) -> str:
-        """Main function to answer questions with improved handling"""
         if not query.strip():
             return "❓ Please ask a question!"
@@ -472,19 +546,18 @@ Answer (be direct and concise):"""
         try:
             # Retrieve relevant context
-            context = self.retrieve_context(query, k=7)  # Get more chunks
             if not context:
-                return "🔍 No relevant information found in the uploaded documents for your question."
             # Generate answer
             answer = self.generate_answer(query, context)
-            if answer and len(answer) > 10:
-                return f"💡 **Answer:** {answer}\n\n📄 **Source Context:**\n{context[:300]}..."
             else:
-                # Fallback to simple context display
-                return f"📄 **Based on the document content:**\n{context[:500]}..."
         except Exception as e:
             return f"❌ Error answering question: {str(e)}"
@@ -532,7 +605,7 @@ def create_interface():
                 with gr.Column():
                     question_input = gr.Textbox(
                         label="Your Question",
-                        placeholder="What would you like to know about your documents?",
                         lines=3
                     )
                     ask_btn = gr.Button("🔍 Get Answer", variant="primary")
@@ -540,7 +613,7 @@ def create_interface():
                 with gr.Column():
                     answer_output = gr.Textbox(
                         label="Answer",
-                        lines=12,
                         interactive=False
                     )
@@ -553,11 +626,11 @@ def create_interface():
             # Example questions
             gr.Markdown("""
             ### 💡 Example Questions:
-            - What is the main topic of the document?
-            - Can you summarize the key points?
-            - What are the conclusions mentioned?
-            - Are there any specific numbers or statistics?
-            - Who are the main people or organizations mentioned?
             """)
     return demo

 import docx
 import io
 import os
+import re
 from typing import List, Optional
 class DocumentRAG:
         self.documents = []
         self.index = None
         self.is_indexed = False
+        self.raw_text = ""  # Store raw text for fallback
     def setup_llm(self):
         """Setup quantized Mistral model"""
             self.tokenizer = None
             print("⚠️ Using context-only mode (no text generation)")
+    def extract_profile_info(self, text: str) -> dict:
+        """Extract key profile information from resume text"""
+        profile = {
+            'name': '',
+            'role': '',
+            'skills': [],
+            'experience': [],
+            'education': [],
+            'projects': []
+        }
+        lines = text.split('\n')
+        current_section = None
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+            line_lower = line.lower()
+            # Extract name (usually first meaningful line)
+            if not profile['name'] and len(line.split()) <= 4 and not any(char in line for char in ['@', '.com', '+91', 'linkedin']):
+                if not any(word in line_lower for word in ['resume', 'cv', 'experience', 'education', 'skills']):
+                    profile['name'] = line
+            # Look for role/title indicators
+            if any(keyword in line_lower for keyword in ['data scientist', 'software engineer', 'developer', 'analyst', 'intern']):
+                if 'data scientist' in line_lower:
+                    profile['role'] = 'Data Scientist'
+                elif 'software engineer' in line_lower:
+                    profile['role'] = 'Software Engineer'
+                elif 'developer' in line_lower:
+                    profile['role'] = 'Developer'
+                elif 'analyst' in line_lower:
+                    profile['role'] = 'Analyst'
+            # Extract skills
+            if any(keyword in line_lower for keyword in ['python', 'machine learning', 'react', 'javascript', 'sql']):
+                if 'python' in line_lower:
+                    profile['skills'].append('Python')
+                if 'machine learning' in line_lower:
+                    profile['skills'].append('Machine Learning')
+                if 'react' in line_lower:
+                    profile['skills'].append('React')
+                if 'javascript' in line_lower:
+                    profile['skills'].append('JavaScript')
+        return profile
     def simple_context_answer(self, query: str, context: str) -> str:
         """Answer a question from the context using heuristics only.

         Args:
             query: The user's question.
             context: Retrieved document text relevant to the question.

         Returns:
             A short answer string. For "who is" questions this is a single
             sentence built from the extracted profile; for skills/experience
             questions it is taken from the profile or the first matching
             context line; otherwise it is the context sentence sharing the
             most words with the query, or a generic fallback message.
         """
         if not context:
             return "No relevant information found in the documents."
         query_lower = query.lower()
         context_lower = context.lower()
         # Prefer the full document text for profile extraction; fall back to
         # the retrieved context when no raw text has been stored.
         profile = self.extract_profile_info(self.raw_text if self.raw_text else context)
         # Handle "who is" questions specifically
         if "who is" in query_lower:
             name_in_query = re.search(r'who is (\w+)', query_lower)
             person_name = name_in_query.group(1) if name_in_query else "this person"
             subject = profile['name'] if profile['name'] else person_name.title()
             # Use the extracted role when there is one, otherwise infer it
             # from the context (same rules whether or not a name was found).
             if profile['role']:
                 role = profile['role']
             elif 'data scientist' in context_lower or (
                 'python' in context_lower and 'machine learning' in context_lower
             ):
                 role = 'Data Scientist'
             elif 'software' in context_lower and 'developer' in context_lower:
                 role = 'Software Developer'
             else:
                 role = 'professional'
             article = 'an' if role[0].lower() in 'aeiou' else 'a'
             answer = f"{subject} is {article} {role}"
             # The skills clause continues the same sentence, so it is joined
             # with a space rather than a sentence break.
             if profile['skills']:
                 answer += f" with expertise in {', '.join(profile['skills'][:3])}"
             return answer + '.'
         # Handle other question types
         if 'skills' in query_lower:
             if profile['skills']:
                 return f"Key skills include: {', '.join(profile['skills'])}."
         elif 'experience' in query_lower or 'work' in query_lower:
             # Return the first context line that mentions experience.
             for line in context.split('\n'):
                 if any(word in line.lower() for word in ['experience', 'worked', 'internship', 'project']):
                     return line.strip()
         # Fallback to keyword matching: pick the sentence sharing the most
         # whitespace-separated words with the query (first one wins on ties).
         query_words = set(query_lower.split())
         best_sentence = ""
         max_matches = 0
         for sentence in (part.strip() for part in context.split('.')):
             if len(sentence) < 20:  # Skip very short sentences
                 continue
             matches = len(query_words & set(sentence.lower().split()))
             if matches > max_matches:
                 max_matches = matches
                 best_sentence = sentence
         if best_sentence:
             return best_sentence + '.'
         # Final fallback
         return "Based on the document, I found relevant information but cannot provide a specific answer."
     def extract_text_from_file(self, file_path: str) -> str:
         """Extract text from various file formats"""
             except Exception as e2:
                 return f"Error reading TXT: {str(e2)}"
+    def smart_chunk_text(self, text: str) -> List[str]:
+        """Smart chunking that preserves important information together"""
         if not text.strip():
             return []
         chunks = []
+        lines = text.split('\n')
+        # Create chunks based on semantic meaning
         current_chunk = ""
+        chunk_type = None
+        for line in lines:
+            line = line.strip()
+            if not line:
                 continue
+            line_lower = line.lower()
+            # Identify section types
+            new_chunk_type = None
+            if any(keyword in line_lower for keyword in ['name', 'email', 'phone', 'linkedin', 'github']):
+                new_chunk_type = 'contact'
+            elif any(keyword in line_lower for keyword in ['experience', 'work', 'internship']):
+                new_chunk_type = 'experience'
+            elif any(keyword in line_lower for keyword in ['education', 'degree', 'university', 'college']):
+                new_chunk_type = 'education'
+            elif any(keyword in line_lower for keyword in ['skills', 'technologies', 'programming']):
+                new_chunk_type = 'skills'
+            elif any(keyword in line_lower for keyword in ['project', 'developed', 'built']):
+                new_chunk_type = 'projects'
+            # If section type changes, save current chunk and start new one
+            if new_chunk_type != chunk_type and current_chunk:
+                chunks.append(current_chunk.strip())
+                current_chunk = line
+                chunk_type = new_chunk_type
             else:
+                # Add to current chunk
+                if current_chunk:
+                    current_chunk += "\n" + line
+                else:
+                    current_chunk = line
+                    chunk_type = new_chunk_type
+            # Limit chunk size
+            if len(current_chunk.split()) > 150:
+                chunks.append(current_chunk.strip())
+                current_chunk = ""
+                chunk_type = None
         # Add the last chunk
         if current_chunk:
             chunks.append(current_chunk.strip())
         return chunks
     def process_documents(self, files) -> str:
             if not all_text.strip():
                 return "❌ No text extracted from files!"
+            # Store raw text for smart answering
+            self.raw_text = all_text
+            # Smart chunk the text
+            self.documents = self.smart_chunk_text(all_text)
             if not self.documents:
                 return "❌ No valid text chunks created!"
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
+    def retrieve_context(self, query: str, k: int = 3) -> str:
+        """Retrieve relevant context with improved filtering"""
         if not self.is_indexed:
             return ""
             faiss.normalize_L2(query_embedding)
             # Search for similar chunks
+            scores, indices = self.index.search(query_embedding.astype('float32'), min(k, len(self.documents)))
+            # Get relevant documents with reasonable threshold
             relevant_docs = []
+            query_lower = query.lower()
+            for i, idx in enumerate(indices[0]):
+                if idx < len(self.documents):
+                    doc = self.documents[idx]
+                    score = scores[0][i]
+                    # For "who is" questions, prioritize contact/basic info chunks
+                    if "who is" in query_lower:
+                        doc_lower = doc.lower()
+                        if any(keyword in doc_lower for keyword in ['name', 'email', 'linkedin', 'data scientist', 'developer']):
+                            relevant_docs.insert(0, doc)  # Put at beginning
+                        elif score > 0.15:  # Lower threshold for other relevant content
+                            relevant_docs.append(doc)
+                    else:
+                        if score > 0.2:  # Standard threshold
+                            relevant_docs.append(doc)
+            # If no good matches for "who is", get the first few chunks
+            if "who is" in query_lower and not relevant_docs:
+                relevant_docs = self.documents[:2]
+            return "\n\n".join(relevant_docs[:3])  # Limit to top 3 chunks
         except Exception as e:
             print(f"Error in retrieval: {e}")
             is_mistral = 'mistral' in model_name
             if is_mistral:
+                # Focused prompt for Mistral
+                prompt = f"""<s>[INST] Answer the question about the person based on their resume. Be concise and direct.
+Resume Information:
+{context[:800]}
 Question: {query}
+Provide a brief, specific answer in 1 sentence. [/INST]"""
             else:
+                # Focused prompt for fallback models
+                prompt = f"""Resume: {context[:600]}
 Question: {query}
+Answer briefly:"""
+            # Tokenize
             inputs = self.tokenizer(
                 prompt,
                 return_tensors="pt",
+                max_length=600,
                 truncation=True,
                 padding=True
             )
             if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
                 inputs = {k: v.cuda() for k, v in inputs.items()}
+            # Generate with focused parameters
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
+                    max_new_tokens=50,    # Much shorter for focused answers
+                    temperature=0.1,      # Very low for deterministic responses
                     do_sample=True,
+                    top_p=0.9,
                     early_stopping=True,
+                    repetition_penalty=1.1,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id
                 )
             # Decode response
             full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Extract answer
             if is_mistral and "[/INST]" in full_response:
                 answer = full_response.split("[/INST]")[-1].strip()
             else:
+                answer = full_response[len(prompt):].strip()
+            # Clean and validate answer
             answer = self.clean_answer(answer)
+            # If answer is too long or poor quality, use fallback
+            if not answer or len(answer) > 200:
+                return self.simple_context_answer(query, context)
+            return answer
         except Exception as e:
             print(f"Error in generation: {e}")
             return self.simple_context_answer(query, context)
     def clean_answer(self, answer: str) -> str:
         """Clean up the generated answer and reduce it to one sentence.

         File markers of the form ``--- name ---``, e-mail addresses, ``+91``
         phone numbers and ``LinkedIn:``/``GitHub:`` labels are removed and
         whitespace is collapsed. The first sentence is then returned when it
         is longer than 10 characters; otherwise the whole cleaned text is
         returned.

         Args:
             answer: Raw text produced by the model.

         Returns:
             The cleaned answer, or an empty string when ``answer`` is empty
             or shorter than 5 characters.
         """
         if not answer or len(answer) < 5:
             return ""
         # Remove unwanted patterns
         answer = re.sub(r'--- \w+.*? ---', '', answer)
         # Local parts may contain dots/plus/hyphens and domains several labels.
         answer = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', '', answer)
         answer = re.sub(r'\+91-?\d+', '', answer)  # Remove phone numbers
         answer = re.sub(r'(?:linkedin|github):', '', answer, flags=re.IGNORECASE)
         # Clean up whitespace
         answer = ' '.join(answer.split())
         # A sentence ends at ., ! or ? followed by whitespace, so a dot inside
         # a number such as "3.5" does not cut the answer short.
         first_sentence = re.split(r'(?<=[.!?])\s+', answer, maxsplit=1)[0].strip()
         if len(first_sentence.rstrip('.!?')) > 10:
             if first_sentence[-1] not in '.!?':
                 first_sentence += '.'
             return first_sentence
         return answer.strip()
     def answer_question(self, query: str) -> str:
+        """Main function to answer questions"""
         if not query.strip():
             return "❓ Please ask a question!"
         try:
             # Retrieve relevant context
+            context = self.retrieve_context(query, k=3)
             if not context:
+                return "🔍 No relevant information found in the uploaded documents."
             # Generate answer
             answer = self.generate_answer(query, context)
+            if answer and len(answer) > 5:
+                return answer
             else:
+                return "I couldn't generate a specific answer from the document content."
         except Exception as e:
             return f"❌ Error answering question: {str(e)}"
                 with gr.Column():
                     question_input = gr.Textbox(
                         label="Your Question",
+                        placeholder="Who is Pradeep?",
                         lines=3
                     )
                     ask_btn = gr.Button("🔍 Get Answer", variant="primary")
                 with gr.Column():
                     answer_output = gr.Textbox(
                         label="Answer",
+                        lines=6,
                         interactive=False
                     )
             # Example questions
             gr.Markdown("""
             ### 💡 Example Questions:
+            - Who is [Name]?
+            - What are [Name]'s skills?
+            - What experience does [Name] have?
+            - What projects has [Name] worked on?
+            - What is [Name]'s educational background?
             """)
     return demo