Update app.py
app.py
CHANGED
@@ -11,36 +11,65 @@ import os
 import re
 from typing import List, Optional, Dict, Tuple
 import json
 
 class SmartDocumentRAG:
     def __init__(self):
-        print("🚀 Initializing Smart RAG System...")
 
-        # Initialize embedding model
-        self.embedder = SentenceTransformer('all-
-        print("✅
 
         # Initialize quantized LLM
         self.setup_llm()
 
         # Document storage
         self.documents = []
-        self.document_metadata = []
         self.index = None
         self.is_indexed = False
         self.raw_text = ""
-        self.document_type = "general"
-        self.document_summary = ""
 
     def setup_llm(self):
-        """Setup
         try:
-            # Check if CUDA is available
             if not torch.cuda.is_available():
                 print("⚠️ CUDA not available, using CPU-optimized model")
                 self.setup_cpu_model()
                 return
 
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_compute_dtype=torch.float16,
@@ -50,35 +79,27 @@ class SmartDocumentRAG:
 
             model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_name,
-                trust_remote_code=True
-            )
-
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 quantization_config=quantization_config,
                 device_map="auto",
-                torch_dtype=torch.float16
-                trust_remote_code=True,
-                low_cpu_mem_usage=True
             )
 
-
         except Exception as e:
-            print(f"❌
-            print("🔄 Falling back to CPU model...")
             self.setup_cpu_model()
 
     def setup_cpu_model(self):
         """Setup CPU-friendly model"""
         try:
-            #
-            model_name = "gpt2-medium"
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(model_name)
 
@@ -87,87 +108,155 @@ Summary:"""
 
             print("✅ CPU model loaded")
         except Exception as e:
-            print(f"❌
             self.model = None
             self.tokenizer = None
-            print("⚠️ Using context-only mode")
 
     def detect_document_type(self, text: str) -> str:
-        """
         text_lower = text.lower()
 
-        #
-
 
         scores = {
-            'resume':
-            'research':
-            'business':
-            'technical':
-            'legal': sum(1 for kw in legal_keywords if kw in text_lower)
         }
 
-
 
     def create_document_summary(self, text: str) -> str:
-        """
         try:
-            #
-            if
-            if self.
-
-            try:
-                inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
-                if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
-                    inputs = {k: v.cuda() for k, v in inputs.items()}
-
-                with torch.no_grad():
-                    outputs = self.model.generate(
-                        **inputs,
-                        max_new_tokens=100,
-                        temperature=0.7,
-                        do_sample=True,
-                        top_p=0.9,
-                        pad_token_id=self.tokenizer.pad_token_id
-                    )
-
-                summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-                summary = summary.split("Summary:")[-1].strip()
-
-                if len(summary) > 20:
-                    return summary
-
-            except Exception as e:
-                print(f"Error generating AI summary: {e}")
-
-            # Fallback: Extract key sentences
-            sentences = re.split(r'[.!?]+', summary_text)
-            key_sentences = [s.strip() for s in sentences if len(s.strip()) > 30][:2]
-
-            return '. '.join(key_sentences) + '.' if key_sentences else "Document contains relevant information."
-
         except Exception as e:
             return "Document summary not available."
 
@@ -184,7 +273,7 @@ Summary:"""
             return f"Error reading file: {str(e)}"
 
     def extract_from_pdf(self, file_path: str) -> str:
-        """Enhanced PDF extraction"""
         text = ""
         try:
             with open(file_path, 'rb') as file:
@@ -192,10 +281,12 @@ Summary:"""
                 for page_num, page in enumerate(pdf_reader.pages):
                     page_text = page.extract_text()
                     if page_text.strip():
-
         except Exception as e:
             text = f"Error reading PDF: {str(e)}"
-        return text
 
     def extract_from_docx(self, file_path: str) -> str:
         """Enhanced DOCX extraction"""
@@ -204,19 +295,22 @@ Summary:"""
             text = ""
             for paragraph in doc.paragraphs:
                 if paragraph.text.strip():
-                    text += paragraph.text + "\n"
-            return text
         except Exception as e:
             return f"Error reading DOCX: {str(e)}"
 
     def extract_from_txt(self, file_path: str) -> str:
-        """Enhanced TXT extraction
         encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
 
         for encoding in encodings:
             try:
                 with open(file_path, 'r', encoding=encoding) as file:
-
             except UnicodeDecodeError:
                 continue
             except Exception as e:
@@ -224,81 +318,39 @@ Summary:"""
 
         return "Error: Could not decode file with any supported encoding"
 
-    def
-        """
         if not text.strip():
             return []
 
         chunks = []
-        lines = [line.strip() for line in text.split('\n') if line.strip()]
 
-
-                    current_chunk = line
-                    current_section = line_lower.split()[0] if line_lower.split() else "section"
-                else:
-                    current_chunk += "\n" + line
-
-                # Limit chunk size
-                if len(current_chunk.split()) > 200:
-                    chunks.append({
-                        'text': current_chunk.strip(),
-                        'section': current_section,
-                        'doc_type': doc_type
-                    })
-                    current_chunk = ""
-
-            if current_chunk:
-                chunks.append({
-                    'text': current_chunk.strip(),
-                    'section': current_section,
-                    'doc_type': doc_type
-                })
-
-        else:
-            # General intelligent chunking
-            current_chunk = ""
-            sentence_count = 0
-
-            for line in lines:
-                current_chunk += line + "\n"
-                sentence_count += len(re.findall(r'[.!?]+', line))
-
-                # Create chunk based on sentence count or word count
-                if sentence_count >= 5 or len(current_chunk.split()) > 150:
                     chunks.append({
-                        'text':
-                        '
-                        'doc_type':
                     })
-                    current_chunk = ""
-                    sentence_count = 0
-
-            if current_chunk:
-                chunks.append({
-                    'text': current_chunk.strip(),
-                    'section': 'content',
-                    'doc_type': doc_type
-                })
 
         return chunks
 
     def process_documents(self, files) -> str:
-        """Enhanced document processing
         if not files:
             return "❌ No files uploaded!"
 
@@ -306,14 +358,13 @@ Summary:"""
             all_text = ""
             processed_files = []
 
-            # Extract text from all files
             for file in files:
                 if file is None:
                     continue
 
                 file_text = self.extract_text_from_file(file.name)
                 if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
-                    all_text += f"\n
                     processed_files.append(os.path.basename(file.name))
                 else:
                     return f"❌ {file_text}"
@@ -321,17 +372,13 @@ Summary:"""
             if not all_text.strip():
                 return "❌ No text extracted from files!"
 
-            # Store
             self.raw_text = all_text
-
-            # Detect document type
             self.document_type = self.detect_document_type(all_text)
-
-            # Create document summary
             self.document_summary = self.create_document_summary(all_text)
 
-            #
-            chunk_data = self.
 
             if not chunk_data:
                 return "❌ No valid text chunks created!"
@@ -339,15 +386,20 @@ Summary:"""
             self.documents = [chunk['text'] for chunk in chunk_data]
             self.document_metadata = chunk_data
 
-            # Create embeddings
             print(f"🔍 Creating embeddings for {len(self.documents)} chunks...")
-            embeddings = self.embedder.encode(self.documents, show_progress_bar=
 
             # Build FAISS index
             dimension = embeddings.shape[1]
             self.index = faiss.IndexFlatIP(dimension)
 
-            # Normalize
             faiss.normalize_L2(embeddings)
             self.index.add(embeddings.astype('float32'))
@@ -356,244 +408,144 @@ Summary:"""
             return f"✅ Successfully processed {len(processed_files)} files:\n" + \
                    f"📁 Files: {', '.join(processed_files)}\n" + \
                    f"📊 Document Type: {self.document_type.title()}\n" + \
-                   f"🔢 Created {len(self.documents)}
-                   f"📝 Summary: {self.document_summary
-                   f"🚀 Ready for
 
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
 
-    def
-        """Enhanced
         if not self.is_indexed:
             return "", []
 
         try:
-
             query_embedding = self.embedder.encode([query])
             faiss.normalize_L2(query_embedding)
 
-
-            scores, indices = self.index.search(query_embedding.astype('float32'), min(k * 2, len(self.documents)))
 
-
-            for
-
-                # Adjust scoring based on query type and document structure
-                adjusted_score = score
-
-                if is_summary_request:
-                    # Boost introductory sections for summary requests
-                    if chunk_data['section'] in ['introduction', 'abstract', 'content']:
-                        adjusted_score += 0.1
-
-                if adjusted_score > 0.15:  # Threshold for relevance
-                    relevant_chunks.append({
-                        'text': self.documents[idx],
-                        'score': adjusted_score,
-                        'metadata': chunk_data
-                    })
 
-            #
-
 
         except Exception as e:
-            print(f"Error in retrieval: {e}")
             return "", []
 
-    def
-        """Generate
         if not context:
-            return "No relevant information found in the
-
-        query_lower = query.lower()
-
-        # Determine answer type
-        is_summary_request = any(word in query_lower for word in ['summary', 'summarize', 'overview', 'what is', 'about'])
-        is_comparison_request = any(word in query_lower for word in ['compare', 'difference', 'versus', 'vs'])
-        is_specific_question = any(word in query_lower for word in ['how', 'why', 'when', 'where', 'which'])
-
-        if self.model and self.tokenizer:
-            try:
-                # Create intelligent prompt based on query type
-                if is_summary_request:
-                    prompt = self.create_summary_prompt(query, context)
-                elif is_comparison_request:
-                    prompt = self.create_comparison_prompt(query, context)
-                else:
-                    prompt = self.create_general_prompt(query, context)
-
-                # Generate response
-                inputs = self.tokenizer(
-                    prompt,
-                    return_tensors="pt",
-                    max_length=800,
-                    truncation=True,
-                    padding=True
-                )
-
-                if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
-                    inputs = {k: v.cuda() for k, v in inputs.items()}
-
-                with torch.no_grad():
-                    outputs = self.model.generate(
-                        **inputs,
-                        max_new_tokens=150,
-                        temperature=0.3,
-                        do_sample=True,
-                        top_p=0.9,
-                        repetition_penalty=1.1,
-                        pad_token_id=self.tokenizer.pad_token_id,
-                        eos_token_id=self.tokenizer.eos_token_id
-                    )
-
-                # Extract and clean answer
-                full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-                answer = self.extract_answer_from_response(full_response, prompt)
-
-                if answer and len(answer) > 20:
-                    return self.clean_and_validate_answer(answer)
-
-            except Exception as e:
-                print(f"Error in AI generation: {e}")
-
-        # Fallback to intelligent context-based answering
-        return self.context_based_smart_answer(query, context, chunks_data)
-
-    def create_summary_prompt(self, query: str, context: str) -> str:
-        """Create prompt for summary requests"""
-        return f"""Based on the document content below, provide a comprehensive summary addressing the question.
-
-Document Content:
-{context[:1000]}
-
-Question: {query}
-
-Provide a clear, informative summary that addresses the question:"""
-
-    def create_comparison_prompt(self, query: str, context: str) -> str:
-        """Create prompt for comparison requests"""
-        return f"""Analyze the document content and provide a comparison as requested.
-
-Document Content:
-{context[:1000]}
-
-Question: {query}
-
-Provide a detailed comparison based on the information:"""
-
-    def create_general_prompt(self, query: str, context: str) -> str:
-        """Create prompt for general questions"""
-        return f"""Answer the question based on the document content provided.
-
-Document Content:
-{context[:1000]}
-
-Question: {query}
-
-Provide a specific, accurate answer:"""
-
-    def extract_answer_from_response(self, response: str, prompt: str) -> str:
-        """Extract clean answer from model response"""
-        # Remove the prompt part
-        if prompt in response:
-            answer = response.replace(prompt, "").strip()
-        else:
-            # Try to find the answer after common patterns
-            patterns = ["Answer:", "Summary:", "Response:", "answer:", "summary:", "response:"]
-            answer = response
-            for pattern in patterns:
-                if pattern in response:
-                    answer = response.split(pattern)[-1].strip()
-                    break
 
-        return answer
-
-    def context_based_smart_answer(self, query: str, context: str, chunks_data: List[Dict]) -> str:
-        """Intelligent context-based answering as fallback"""
         query_lower = query.lower()
-
                 scored_sentences.append((sentence, overlap))
 
-
-        scored_sentences.sort(key=lambda x: x[1], reverse=True)
-
-        if scored_sentences:
-            top_sentences = [s[0] for s in scored_sentences[:3]]
-            return '. '.join(top_sentences) + '.'
-
-        return "I found relevant information but couldn't extract a specific answer. Please try rephrasing your question."
-
-    def create_context_summary(self, context: str, chunks_data: List[Dict]) -> str:
-        """Create summary from context"""
-        # Get key sentences from different sections
-        sentences_by_section = {}
-
-        for chunk in chunks_data:
-            section = chunk['metadata']['section']
-            sentences = [s.strip() for s in chunk['text'].split('.') if len(s.strip()) > 30]
-            if sentences:
-                if section not in sentences_by_section:
-                    sentences_by_section[section] = []
-                sentences_by_section[section].extend(sentences[:2])  # Top 2 sentences per section
-
-        # Combine sentences from different sections
-        summary_parts = []
-        for section, sentences in sentences_by_section.items():
-            if sentences:
-                summary_parts.extend(sentences[:1])  # One sentence per section
-
-        if summary_parts:
-            return '. '.join(summary_parts[:4]) + '.'  # Max 4 sentences
-
-        return self.document_summary if self.document_summary else "Document contains relevant information on the requested topic."
-
-    def clean_and_validate_answer(self, answer: str) -> str:
-        """Clean and validate the generated answer"""
-        # Remove unwanted patterns
-        answer = re.sub(r'--- \w+.*? ---', '', answer)
-        answer = re.sub(r'\[Page \d+\]', '', answer)
-
-        # Clean up whitespace and formatting
-        answer = ' '.join(answer.split())
-
-        # Ensure proper sentence structure
-        if answer and not answer.endswith(('.', '!', '?')):
-            answer += '.'
-
-        return answer.strip()
 
     def answer_question(self, query: str) -> str:
-        """Main function
         if not query.strip():
             return "❌ Please ask a question!"
@@ -601,42 +553,46 @@ Provide a specific, accurate answer:"""
             return "📚 Please upload and process documents first!"
 
         try:
-            #
            query_lower = query.lower()
-            if query_lower in ['summary', 'summarize
-                return f"📄 Document Summary
 
-            #
-            context,
 
             if not context:
-                return "🔍 No relevant information found
 
-            #
-            answer
 
-            return answer
 
         except Exception as e:
             return f"❌ Error processing question: {str(e)}"
 
-# Initialize the enhanced
-print("Initializing Smart
 rag_system = SmartDocumentRAG()
 
-#
 def create_interface():
-    with gr.Blocks(title="🧠
     gr.Markdown("""
-    # 🧠
 
-
-    **Features:**
-    - 🎯
-    - 📄
-    - 🔍
-    - 🚀
     """)
 
     with gr.Tab("📤 Upload & Process"):
@@ -652,7 +608,7 @@ def create_interface():
 
             with gr.Column():
                 process_status = gr.Textbox(
-                    label="📊 Processing Status &
                     lines=10,
                     interactive=False
                 )
@@ -663,22 +619,22 @@ def create_interface():
             outputs=[process_status]
         )
 
-        with gr.Tab("❓
             with gr.Row():
                 with gr.Column():
                     question_input = gr.Textbox(
-                        label="🤔 Ask
-                        placeholder="What is
                         lines=3
                     )
 
                     with gr.Row():
-                        ask_btn = gr.Button("🧠 Get
                         summary_btn = gr.Button("📋 Get Summary", variant="secondary")
 
                 with gr.Column():
                     answer_output = gr.Textbox(
-                        label="💡
                         lines=8,
                         interactive=False
                     )
@@ -695,52 +651,48 @@ def create_interface():
             outputs=[answer_output]
         )
 
-        # Enhanced example questions
         gr.Markdown("""
-        ### 💡
-
-        - "What is
-        - "
-        - "
-
-        - "
-
-        - "What are the pros and cons?"
-        - "Compare [A] and [B]"
-        - "What conclusions can be drawn?"
         """)
 
-        with gr.Tab("
             gr.Markdown("""
-            ### 📊
-
-            -
-            -
-            -
-            -
-
-            -
-            -
-            -
-
-            -
-            -
-            -
             """)
 
     return demo
 
-# Launch the
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(
app.py (updated file, the + side of the same hunks):

 import re
 from typing import List, Optional, Dict, Tuple
 import json
+from collections import Counter
 
 class SmartDocumentRAG:
     def __init__(self):
+        print("🚀 Initializing Enhanced Smart RAG System...")
 
+        # Initialize better embedding model
+        self.embedder = SentenceTransformer('all-mpnet-base-v2')  # Better than MiniLM
+        print("✅ Enhanced embedding model loaded")
 
         # Initialize quantized LLM
         self.setup_llm()
 
         # Document storage
         self.documents = []
+        self.document_metadata = []
         self.index = None
         self.is_indexed = False
         self.raw_text = ""
+        self.document_type = "general"
+        self.document_summary = ""
+        self.sentence_embeddings = []  # Store sentence-level embeddings
+        self.sentences = []  # Store individual sentences
 
     def setup_llm(self):
+        """Setup optimized model for better text generation"""
         try:
 
             if not torch.cuda.is_available():
                 print("⚠️ CUDA not available, using CPU-optimized model")
                 self.setup_cpu_model()
                 return
 
+            # Use a better model for instruction following
+            model_name = "microsoft/DialoGPT-medium"  # Better for Q&A
+
+            try:
+                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch.float16,
+                    device_map="auto"
+                )
+
+                if self.tokenizer.pad_token is None:
+                    self.tokenizer.pad_token = self.tokenizer.eos_token
+
+                print("✅ Enhanced Q&A model loaded successfully")
+
+            except Exception as e:
+                print(f"Falling back to Mistral: {e}")
+                self.setup_mistral_model()
+
+        except Exception as e:
+            print(f"❌ Error loading models: {e}")
+            self.setup_cpu_model()
+
+    def setup_mistral_model(self):
+        """Setup Mistral with better configuration"""
+        try:
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_compute_dtype=torch.float16,
 
             model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 quantization_config=quantization_config,
                 device_map="auto",
+                torch_dtype=torch.float16
             )
 
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            print("✅ Mistral model loaded")
 
         except Exception as e:
+            print(f"❌ Mistral failed: {e}")
             self.setup_cpu_model()
 
     def setup_cpu_model(self):
         """Setup CPU-friendly model"""
         try:
+            model_name = "distilgpt2"  # Lighter than GPT-2 medium
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(model_name)
 
 
             print("✅ CPU model loaded")
         except Exception as e:
+            print(f"❌ All models failed: {e}")
             self.model = None
             self.tokenizer = None
 
     def detect_document_type(self, text: str) -> str:
+        """Enhanced document type detection"""
         text_lower = text.lower()
 
+        # More comprehensive keyword matching
+        resume_patterns = [
+            'experience', 'skills', 'education', 'linkedin', 'email', 'phone',
+            'work experience', 'employment', 'resume', 'cv', 'curriculum vitae',
+            'internship', 'projects', 'achievements', 'career', 'profile'
+        ]
+
+        research_patterns = [
+            'abstract', 'introduction', 'methodology', 'conclusion', 'references',
+            'literature review', 'hypothesis', 'study', 'research', 'findings',
+            'data analysis', 'results', 'discussion', 'bibliography'
+        ]
+
+        business_patterns = [
+            'company', 'revenue', 'market', 'strategy', 'business', 'financial',
+            'quarter', 'profit', 'sales', 'growth', 'investment', 'stakeholder',
+            'operations', 'management', 'corporate', 'enterprise'
+        ]
+
+        technical_patterns = [
+            'implementation', 'algorithm', 'system', 'technical', 'specification',
+            'architecture', 'development', 'software', 'programming', 'api',
+            'database', 'framework', 'deployment', 'infrastructure'
+        ]
+
+        # Count matches with higher weights for exact phrases
+        def count_matches(patterns, text):
+            score = 0
+            for pattern in patterns:
+                if pattern in text:
+                    score += text.count(pattern)
+            return score
 
         scores = {
+            'resume': count_matches(resume_patterns, text_lower),
+            'research': count_matches(research_patterns, text_lower),
+            'business': count_matches(business_patterns, text_lower),
+            'technical': count_matches(technical_patterns, text_lower)
         }
 
+        max_score = max(scores.values())
+        if max_score > 3:
+            return max(scores, key=scores.get)
+        return 'general'
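
For intuition, the detector above just counts substring hits per category and only commits to a type once the best total clears the > 3 threshold. A toy re-run of that scoring, with sample text invented for illustration:

    resume_patterns = ['experience', 'skills', 'education', 'resume']
    text = "resume: education, skills, and 5 years of work experience".lower()
    score = sum(text.count(p) for p in resume_patterns if p in text)
    print(score)  # 4 -> clears the > 3 threshold, so the type resolves to 'resume'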
 
     def create_document_summary(self, text: str) -> str:
+        """Enhanced document summary creation"""
         try:
+            # Clean and prepare text
+            clean_text = re.sub(r'\s+', ' ', text).strip()
+            sentences = re.split(r'[.!?]+', clean_text)
+            sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
+
+            if not sentences:
+                return "Document contains basic information."
+
+            # Extract key information based on document type
+            if self.document_type == 'resume':
+                return self.extract_resume_summary(sentences)
+            elif self.document_type == 'research':
+                return self.extract_research_summary(sentences)
+            elif self.document_type == 'business':
+                return self.extract_business_summary(sentences)
+            else:
+                return self.extract_general_summary(sentences)
 
         except Exception as e:
+            print(f"Summary creation error: {e}")
             return "Document summary not available."
 
+    def extract_resume_summary(self, sentences: List[str]) -> str:
+        """Extract resume-specific summary"""
+        key_info = []
+
+        # Look for name, role, experience
+        for sentence in sentences[:10]:  # Check first 10 sentences
+            lower = sentence.lower()
+            if any(word in lower for word in ['engineer', 'developer', 'manager', 'analyst', 'specialist']):
+                key_info.append(sentence)
+            if any(word in lower for word in ['years', 'experience', 'worked']):
+                key_info.append(sentence)
+            if len(key_info) >= 2:
+                break
+
+        if key_info:
+            return '. '.join(key_info[:2]) + '.'
+        return "Resume of a professional with relevant experience and skills."
+
+    def extract_research_summary(self, sentences: List[str]) -> str:
+        """Extract research paper summary"""
+        abstract_sentences = []
+        intro_sentences = []
+
+        for sentence in sentences:
+            lower = sentence.lower()
+            if any(word in lower for word in ['study', 'research', 'analysis', 'findings']):
+                if len(sentence) > 50:  # Substantial sentences
+                    abstract_sentences.append(sentence)
+            elif any(word in lower for word in ['propose', 'method', 'approach']):
+                intro_sentences.append(sentence)
+
+        summary_sentences = (abstract_sentences + intro_sentences)[:2]
+        if summary_sentences:
+            return '. '.join(summary_sentences) + '.'
+        return "Research document with methodology and findings."
+
+    def extract_business_summary(self, sentences: List[str]) -> str:
+        """Extract business document summary"""
+        business_sentences = []
+
+        for sentence in sentences:
+            lower = sentence.lower()
+            if any(word in lower for word in ['company', 'business', 'market', 'strategy', 'revenue']):
+                if len(sentence) > 40:
+                    business_sentences.append(sentence)
+
+        if business_sentences:
+            return '. '.join(business_sentences[:2]) + '.'
+        return "Business document containing strategic and operational information."
+
+    def extract_general_summary(self, sentences: List[str]) -> str:
+        """Extract general document summary"""
+        # Take the most informative sentences (longer ones with key terms)
+        scored_sentences = []
+
+        for sentence in sentences:
+            score = len(sentence.split())  # Word count as base score
+            if any(word in sentence.lower() for word in ['important', 'key', 'main', 'primary']):
+                score += 10
+            scored_sentences.append((sentence, score))
+
+        # Sort by score and take top sentences
+        scored_sentences.sort(key=lambda x: x[1], reverse=True)
+        top_sentences = [s[0] for s in scored_sentences[:2]]
+
+        if top_sentences:
+            return '. '.join(top_sentences) + '.'
+        return "Document contains relevant information and details."
 
     def extract_text_from_file(self, file_path: str) -> str:
+        """Enhanced text extraction with better error handling"""
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
 
             return f"Error reading file: {str(e)}"
 
     def extract_from_pdf(self, file_path: str) -> str:
+        """Enhanced PDF extraction with better text cleaning"""
         text = ""
         try:
             with open(file_path, 'rb') as file:
                 for page_num, page in enumerate(pdf_reader.pages):
                     page_text = page.extract_text()
                     if page_text.strip():
+                        # Clean the text
+                        page_text = re.sub(r'\s+', ' ', page_text)
+                        text += f"{page_text}\n"
         except Exception as e:
             text = f"Error reading PDF: {str(e)}"
+        return text.strip()
 
     def extract_from_docx(self, file_path: str) -> str:
         """Enhanced DOCX extraction"""
             text = ""
             for paragraph in doc.paragraphs:
                 if paragraph.text.strip():
+                    text += paragraph.text.strip() + "\n"
+            return text.strip()
         except Exception as e:
             return f"Error reading DOCX: {str(e)}"
 
     def extract_from_txt(self, file_path: str) -> str:
+        """Enhanced TXT extraction"""
         encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
 
         for encoding in encodings:
             try:
                 with open(file_path, 'r', encoding=encoding) as file:
+                    content = file.read()
+                    # Clean the content
+                    content = re.sub(r'\s+', ' ', content)
+                    return content.strip()
             except UnicodeDecodeError:
                 continue
             except Exception as e:
 
         return "Error: Could not decode file with any supported encoding"
 
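One property worth noting about the encoding fallback loop above: 'latin-1' assigns a character to every byte value 0-255, so in practice the loop always yields a decoded string (possibly with wrong glyphs) before reaching the final error return. A small self-contained illustration, with invented bytes:

    data = "café".encode("utf-8")   # b'caf\xc3\xa9'
    print(data.decode("latin-1"))   # 'cafÃ©' - wrong glyphs, but no UnicodeDecodeError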
+    def enhanced_chunk_text(self, text: str) -> List[Dict]:
+        """Enhanced chunking strategy for better retrieval"""
         if not text.strip():
             return []
 
         chunks = []
 
+        # Split into sentences first
+        sentences = re.split(r'[.!?]+', text)
+        sentences = [s.strip() for s in sentences if len(s.strip()) > 15]
+
+        # Store sentences for fine-grained retrieval
+        self.sentences = sentences
+
+        # Create overlapping chunks
+        chunk_size = 3  # sentences per chunk
+        overlap = 1  # sentence overlap
+
+        for i in range(0, len(sentences), chunk_size - overlap):
+            chunk_sentences = sentences[i:i + chunk_size]
+            if chunk_sentences:
+                chunk_text = '. '.join(chunk_sentences)
+                if len(chunk_text.strip()) > 20:
                     chunks.append({
+                        'text': chunk_text + '.',
+                        'sentence_indices': list(range(i, min(i + chunk_size, len(sentences)))),
+                        'doc_type': self.document_type
                     })
 
         return chunks
 
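A quick sketch of the window arithmetic in the loop above: with chunk_size=3 and overlap=1 the stride is 2, so windows of three sentences start at indices 0, 2, 4, ... and consecutive chunks share exactly one sentence (toy sentence list, invented for illustration; the real method then drops windows whose joined text is 20 characters or shorter):

    sentences = ["s0", "s1", "s2", "s3", "s4"]
    for i in range(0, len(sentences), 3 - 1):
        print(sentences[i:i + 3])
    # ['s0', 's1', 's2'] / ['s2', 's3', 's4'] / ['s4']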
     def process_documents(self, files) -> str:
+        """Enhanced document processing"""
         if not files:
             return "❌ No files uploaded!"
 
             all_text = ""
             processed_files = []
 
             for file in files:
                 if file is None:
                     continue
 
                 file_text = self.extract_text_from_file(file.name)
                 if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
+                    all_text += f"\n{file_text}"
                     processed_files.append(os.path.basename(file.name))
                 else:
                     return f"❌ {file_text}"
 
             if not all_text.strip():
                 return "❌ No text extracted from files!"
 
+            # Store and analyze
             self.raw_text = all_text
             self.document_type = self.detect_document_type(all_text)
             self.document_summary = self.create_document_summary(all_text)
 
+            # Enhanced chunking
+            chunk_data = self.enhanced_chunk_text(all_text)
 
             if not chunk_data:
                 return "❌ No valid text chunks created!"
 
             self.documents = [chunk['text'] for chunk in chunk_data]
             self.document_metadata = chunk_data
 
+            # Create embeddings for chunks
             print(f"🔍 Creating embeddings for {len(self.documents)} chunks...")
+            embeddings = self.embedder.encode(self.documents, show_progress_bar=False)
+
+            # Also create sentence-level embeddings for fine-grained search
+            if self.sentences:
+                print(f"🔍 Creating sentence embeddings for {len(self.sentences)} sentences...")
+                self.sentence_embeddings = self.embedder.encode(self.sentences, show_progress_bar=False)
 
             # Build FAISS index
             dimension = embeddings.shape[1]
             self.index = faiss.IndexFlatIP(dimension)
 
+            # Normalize for cosine similarity
             faiss.normalize_L2(embeddings)
             self.index.add(embeddings.astype('float32'))
 
             return f"✅ Successfully processed {len(processed_files)} files:\n" + \
                    f"📁 Files: {', '.join(processed_files)}\n" + \
                    f"📊 Document Type: {self.document_type.title()}\n" + \
+                   f"🔢 Created {len(self.documents)} chunks and {len(self.sentences)} sentences\n" + \
+                   f"📝 Summary: {self.document_summary}\n" + \
+                   f"🚀 Ready for enhanced Q&A!"
 
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
 
+    def find_relevant_content(self, query: str, k: int = 5) -> Tuple[str, List[str]]:
+        """Enhanced content retrieval using multiple strategies"""
         if not self.is_indexed:
             return "", []
 
         try:
+            query_lower = query.lower()
+            relevant_content = []
+
+            # Strategy 1: Semantic search using embeddings
             query_embedding = self.embedder.encode([query])
             faiss.normalize_L2(query_embedding)
 
+            scores, indices = self.index.search(query_embedding.astype('float32'), min(k, len(self.documents)))
 
+            semantic_matches = []
+            for i, idx in enumerate(indices[0]):
+                if idx < len(self.documents) and scores[0][i] > 0.2:  # Relevance threshold
+                    semantic_matches.append(self.documents[idx])
 
+            # Strategy 2: Keyword matching in sentences
+            query_words = set(query_lower.split())
+            keyword_matches = []
 
+            for sentence in self.sentences:
+                sentence_words = set(sentence.lower().split())
+                overlap = len(query_words.intersection(sentence_words))
+                if overlap >= 2:  # At least 2 word overlap
+                    keyword_matches.append(sentence)
 
+            # Strategy 3: Pattern matching for specific question types
+            pattern_matches = []
 
+            if any(word in query_lower for word in ['name', 'who']):
+                # Look for names and identities
+                for sentence in self.sentences:
+                    if re.search(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', sentence):  # Name pattern
+                        pattern_matches.append(sentence)
 
+            if any(word in query_lower for word in ['experience', 'work', 'job']):
+                # Look for experience-related content
+                for sentence in self.sentences:
+                    if any(word in sentence.lower() for word in ['year', 'experience', 'work', 'company', 'role']):
+                        pattern_matches.append(sentence)
+
+            if any(word in query_lower for word in ['skill', 'technology', 'tech']):
+                # Look for skills and technologies
+                for sentence in self.sentences:
+                    if any(word in sentence.lower() for word in ['skill', 'technology', 'programming', 'software']):
+                        pattern_matches.append(sentence)
+
+            # Combine all strategies
+            all_matches = list(set(semantic_matches + keyword_matches + pattern_matches))
+
+            # Sort by relevance (prefer shorter, more specific sentences)
+            all_matches.sort(key=lambda x: len(x.split()))
+
+            return '\n'.join(all_matches[:k]), all_matches[:k]
 
         except Exception as e:
+            print(f"Error in content retrieval: {e}")
             return "", []
 
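The semantic branch above relies on a standard FAISS identity: after faiss.normalize_L2, inner product equals cosine similarity, so IndexFlatIP scores are cosine scores. A self-contained sketch with random vectors, illustrative only:

    import numpy as np
    import faiss

    vecs = np.random.rand(4, 8).astype('float32')
    faiss.normalize_L2(vecs)
    index = faiss.IndexFlatIP(8)
    index.add(vecs)

    scores, ids = index.search(vecs[:1], 2)  # query with the first stored vector
    print(ids[0][0], round(float(scores[0][0]), 3))  # 0 1.0 -> an exact match scores ~1.0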
+    def generate_direct_answer(self, query: str, context: str) -> str:
+        """Generate direct, relevant answers"""
         if not context:
+            return "No relevant information found in the document."
 
         query_lower = query.lower()
+        context_sentences = [s.strip() for s in context.split('\n') if s.strip()]
+
+        # Handle specific question types with direct extraction
+        if any(word in query_lower for word in ['name', 'who is']):
+            # Extract names
+            for sentence in context_sentences:
+                names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', sentence)
+                if names:
+                    return f"The person mentioned is {names[0]}."
+
+        if any(word in query_lower for word in ['experience', 'years']):
+            # Extract experience information
+            for sentence in context_sentences:
+                exp_match = re.search(r'(\d+)\s*(?:years?|yr)', sentence.lower())
+                if exp_match:
+                    return f"The experience mentioned is {exp_match.group(1)} years. {sentence}"
+
+        if any(word in query_lower for word in ['skill', 'technology']):
+            # Extract skills
+            skills = []
+            for sentence in context_sentences:
+                # Look for programming languages, frameworks, etc.
+                tech_words = ['python', 'java', 'javascript', 'react', 'node', 'sql', 'aws', 'docker']
+                found_tech = [word for word in tech_words if word in sentence.lower()]
+                if found_tech:
+                    skills.extend(found_tech)
+
+            if skills:
+                return f"Technologies/skills mentioned include: {', '.join(set(skills))}. {context_sentences[0] if context_sentences else ''}"
+
+        if any(word in query_lower for word in ['education', 'degree', 'university', 'college']):
+            # Extract education information
+            for sentence in context_sentences:
+                if any(word in sentence.lower() for word in ['degree', 'university', 'college', 'bachelor', 'master']):
+                    return sentence
+
+        if any(word in query_lower for word in ['summary', 'about', 'overview']):
+            return self.document_summary
+
+        # For other questions, return the most relevant sentence
+        if context_sentences:
+            # Score sentences by query word overlap
+            query_words = set(query_lower.split())
+            scored_sentences = []
+
+            for sentence in context_sentences:
+                sentence_words = set(sentence.lower().split())
+                overlap = len(query_words.intersection(sentence_words))
                 scored_sentences.append((sentence, overlap))
+
+            # Sort by overlap and return best match
+            scored_sentences.sort(key=lambda x: x[1], reverse=True)
+
+            if scored_sentences and scored_sentences[0][1] > 0:
+                return scored_sentences[0][0]
+            else:
+                return context_sentences[0]  # Return first relevant sentence
 
+        return "I found relevant content but couldn't extract a specific answer."
 
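The name and experience branches above are pure regex heuristics. A toy check of both patterns (strings invented for illustration), including the obvious false-positive risk of the two-capitalized-words name pattern:

    import re
    print(re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', "reach out to Jane Doe today"))  # ['Jane Doe']
    print(re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', "based in New York"))            # ['New York'] - not a person
    m = re.search(r'(\d+)\s*(?:years?|yr)', "over 7 years of backend work".lower())
    print(m.group(1))  # '7'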
     def answer_question(self, query: str) -> str:
+        """Main question answering function with enhanced accuracy"""
         if not query.strip():
             return "❌ Please ask a question!"
 
             return "📚 Please upload and process documents first!"
 
         try:
+            # Handle summary requests directly
             query_lower = query.lower()
+            if query_lower in ['summary', 'summarize', 'about', 'overview']:
+                return f"📄 **Document Summary:**\n\n{self.document_summary}"
 
+            # Find relevant content using enhanced retrieval
+            context, matches = self.find_relevant_content(query, k=5)
 
             if not context:
+                return "🔍 No relevant information found. Try rephrasing your question or asking about different aspects of the document."
+
+            # Generate direct answer
+            answer = self.generate_direct_answer(query, context)
 
+            # Add context if answer is too brief
+            if len(answer) < 50 and matches:
+                answer += f"\n\n**Additional context:** {matches[0][:200]}..."
 
+            return answer
 
         except Exception as e:
             return f"❌ Error processing question: {str(e)}"
 
+# Initialize the enhanced system
+print("Initializing Enhanced Smart RAG System...")
 rag_system = SmartDocumentRAG()
 
+# Create the interface
 def create_interface():
+    with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
+        # 🧠 Enhanced Document Q&A System
 
+        **Improved for Better Accuracy & Relevance!**
 
+        **New Features:**
+        - 🎯 Multi-strategy content retrieval
+        - 📝 Direct answer extraction
+        - 🔍 Enhanced keyword and pattern matching
+        - 📊 Better handling of resumes, research papers, and business docs
         """)
 
         with gr.Tab("📤 Upload & Process"):
 
             with gr.Column():
                 process_status = gr.Textbox(
+                    label="📊 Processing Status & Analysis",
                     lines=10,
                     interactive=False
                 )
 
             outputs=[process_status]
         )
 
+        with gr.Tab("❓ Enhanced Q&A"):
             with gr.Row():
                 with gr.Column():
                     question_input = gr.Textbox(
+                        label="🤔 Ask Your Question",
+                        placeholder="What is the person's name? / How many years of experience? / What are their skills?",
                         lines=3
                     )
 
                     with gr.Row():
+                        ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                         summary_btn = gr.Button("📋 Get Summary", variant="secondary")
 
                 with gr.Column():
                     answer_output = gr.Textbox(
+                        label="💡 Enhanced Answer",
                         lines=8,
                         interactive=False
                     )
 
             outputs=[answer_output]
         )
 
         gr.Markdown("""
+        ### 💡 Try These Specific Questions:
+
+        **For Resumes:**
+        - "What is the person's name?"
+        - "How many years of experience do they have?"
+        - "What are their technical skills?"
+        - "What is their educational background?"
+        - "What companies have they worked for?"
+
+        **For Any Document:**
+        - "Summarize this document"
+        - "What is the main topic?"
+        - "List the key points"
         """)
 
+        with gr.Tab("🔧 System Info"):
             gr.Markdown("""
+            ### 🚀 Enhanced Features:
+
+            **Better Retrieval:**
+            - Semantic search using embeddings
+            - Keyword matching with context
+            - Pattern recognition for names, dates, skills
+            - Multi-level chunking (sentences + paragraphs)
+
+            **Improved Answers:**
+            - Direct information extraction
+            - Question-type specific processing
+            - Context-aware responses
+            - Relevance scoring and filtering
+
+            **Document Types:**
+            - ✅ Resumes (name, experience, skills extraction)
+            - ✅ Research papers (methodology, findings)
+            - ✅ Business documents (strategy, metrics)
+            - ✅ Technical documentation (specifications)
             """)
 
     return demo
 
+# Launch the app
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(