pradeepsengarr committed on
Commit 3406461 · verified · 1 Parent(s): d7bf74b

Update app.py

Files changed (1)
  1. app.py +468 -363
app.py CHANGED
@@ -9,11 +9,12 @@ import docx
 import io
 import os
 import re
-from typing import List, Optional
-
-class DocumentRAG:
     def __init__(self):
-        print("🚀 Initializing RAG System...")
 
         # Initialize embedding model (lightweight)
         self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
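
For orientation, a minimal standalone sketch of how the all-MiniLM-L6-v2 embedder is typically used; the sample texts are illustrative, not from the commit:

    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    chunks = ["Pradeep is a data scientist.", "Skills: Python, SQL, machine learning."]
    embeddings = embedder.encode(chunks)  # numpy array of shape (2, 384)
    print(embeddings.shape)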
@@ -24,17 +25,20 @@ class DocumentRAG:
 
         # Document storage
         self.documents = []
         self.index = None
         self.is_indexed = False
-        self.raw_text = ""  # Store raw text for fallback
 
     def setup_llm(self):
-        """Setup quantized Mistral model"""
         try:
             # Check if CUDA is available
             if not torch.cuda.is_available():
-                print("⚠️ CUDA not available, falling back to CPU or alternative model")
-                self.setup_fallback_model()
                 return
 
             quantization_config = BitsAndBytesConfig(
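
The fields of this BitsAndBytesConfig fall outside the hunk, so they are not shown. A typical 4-bit setup looks like the sketch below; the exact values used in this commit are assumptions:

    import torch
    from transformers import BitsAndBytesConfig

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,                     # store weights in 4-bit NF4
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
        bnb_4bit_use_double_quant=True,        # quantize the quantization constants too
    )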
@@ -46,17 +50,14 @@ class DocumentRAG:
 
             model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
-            # Load tokenizer first
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 trust_remote_code=True
             )
 
-            # Fix padding token issue
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
-            # Load model with quantization
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 quantization_config=quantization_config,
@@ -69,169 +70,104 @@ class DocumentRAG:
             print("✅ Quantized Mistral model loaded successfully")
 
         except Exception as e:
-            print(f"❌ Error loading model: {e}")
-            print("🔄 Falling back to alternative model...")
-            self.setup_fallback_model()
 
-    def setup_fallback_model(self):
-        """Fallback to smaller model if Mistral fails"""
         try:
-            # Use a model that's better for factual Q&A and less prone to hallucination
-            model_name = "microsoft/DialoGPT-small"
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(model_name)
 
-            # Fix padding token for fallback model too
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
-            print("✅ Fallback model loaded")
         except Exception as e:
-            print(f"❌ Fallback model failed: {e}")
-            # Try an even simpler approach - return context-based answers without generation
             self.model = None
             self.tokenizer = None
-            print("⚠️ Using context-only mode (no text generation)")
 
-    def extract_profile_info(self, text: str) -> dict:
-        """Extract key profile information from resume text"""
-        profile = {
-            'name': '',
-            'role': '',
-            'skills': [],
-            'experience': [],
-            'education': [],
-            'projects': []
         }
 
-        lines = text.split('\n')
-        current_section = None
-
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-
-            line_lower = line.lower()
-
-            # Extract name (usually first meaningful line)
-            if not profile['name'] and len(line.split()) <= 4 and not any(char in line for char in ['@', '.com', '+91', 'linkedin']):
-                if not any(word in line_lower for word in ['resume', 'cv', 'experience', 'education', 'skills']):
-                    profile['name'] = line
-
-            # Look for role/title indicators
-            if any(keyword in line_lower for keyword in ['data scientist', 'software engineer', 'developer', 'analyst', 'intern']):
-                if 'data scientist' in line_lower:
-                    profile['role'] = 'Data Scientist'
-                elif 'software engineer' in line_lower:
-                    profile['role'] = 'Software Engineer'
-                elif 'developer' in line_lower:
-                    profile['role'] = 'Developer'
-                elif 'analyst' in line_lower:
-                    profile['role'] = 'Analyst'
-
-            # Extract skills
-            if any(keyword in line_lower for keyword in ['python', 'machine learning', 'react', 'javascript', 'sql']):
-                if 'python' in line_lower:
-                    profile['skills'].append('Python')
-                if 'machine learning' in line_lower:
-                    profile['skills'].append('Machine Learning')
-                if 'react' in line_lower:
-                    profile['skills'].append('React')
-                if 'javascript' in line_lower:
-                    profile['skills'].append('JavaScript')
-
-        return profile
 
-    def simple_context_answer(self, query: str, context: str) -> str:
-        """Improved smart answering based on context analysis"""
-        if not context:
-            return "No relevant information found in the documents."
-
-        query_lower = query.lower()
-
-        # Extract profile information first
-        profile = self.extract_profile_info(self.raw_text if self.raw_text else context)
-
-        # Handle "who is" questions specifically
-        if "who is" in query_lower:
-            name_in_query = re.search(r'who is (\w+)', query_lower)
-            person_name = name_in_query.group(1) if name_in_query else "this person"
 
-            # Build answer from profile
-            answer_parts = []
 
-            if profile['name']:
-                if profile['role']:
-                    answer_parts.append(f"{profile['name']} is a {profile['role']}")
-                else:
-                    # Try to infer role from context
-                    context_lower = context.lower()
-                    if 'data scientist' in context_lower or ('python' in context_lower and 'machine learning' in context_lower):
-                        answer_parts.append(f"{profile['name']} is a Data Scientist")
-                    elif 'software' in context_lower and 'developer' in context_lower:
-                        answer_parts.append(f"{profile['name']} is a Software Developer")
-                    else:
-                        answer_parts.append(f"{profile['name']} is a professional")
-            else:
-                # Use name from query
-                context_lower = context.lower()
-                if 'data scientist' in context_lower or ('python' in context_lower and 'machine learning' in context_lower):
-                    answer_parts.append(f"{person_name.title()} is a Data Scientist")
-                elif 'software' in context_lower and 'developer' in context_lower:
-                    answer_parts.append(f"{person_name.title()} is a Software Developer")
-                else:
-                    answer_parts.append(f"{person_name.title()} is a professional")
-
-            # Add key skills if available
-            if profile['skills']:
-                top_skills = profile['skills'][:3]  # Top 3 skills
-                answer_parts.append(f"with expertise in {', '.join(top_skills)}")
-
-            if answer_parts:
-                return '. '.join(answer_parts) + '.'
-
-        # Handle other question types
-        elif any(keyword in query_lower for keyword in ['what', 'skills', 'experience', 'work']):
-            if 'skills' in query_lower:
-                if profile['skills']:
-                    return f"Key skills include: {', '.join(profile['skills'])}."
-            elif 'experience' in query_lower or 'work' in query_lower:
-                # Look for experience indicators in context
-                exp_lines = []
-                for line in context.split('\n'):
-                    if any(word in line.lower() for word in ['experience', 'worked', 'internship', 'project']):
-                        exp_lines.append(line.strip())
-                if exp_lines:
-                    return exp_lines[0]
-
-        # Fallback to keyword matching
-        query_words = set(query_lower.split())
-        context_sentences = [s.strip() for s in context.split('.') if s.strip()]
-
-        # Find most relevant sentence
-        best_sentence = ""
-        max_matches = 0
-
-        for sentence in context_sentences:
-            if len(sentence) < 20:  # Skip very short sentences
-                continue
 
-            sentence_words = set(sentence.lower().split())
-            matches = len(query_words.intersection(sentence_words))
 
-            if matches > max_matches:
-                max_matches = matches
-                best_sentence = sentence
-
-        if best_sentence:
-            return best_sentence + '.'
-
-        # Final fallback
-        return "Based on the document, I found relevant information but cannot provide a specific answer."
 
     def extract_text_from_file(self, file_path: str) -> str:
-        """Extract text from various file formats"""
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
@@ -248,99 +184,121 @@ class DocumentRAG:
             return f"Error reading file: {str(e)}"
 
     def extract_from_pdf(self, file_path: str) -> str:
-        """Extract text from PDF"""
         text = ""
         try:
             with open(file_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
-                for page in pdf_reader.pages:
-                    text += page.extract_text() + "\n"
         except Exception as e:
             text = f"Error reading PDF: {str(e)}"
         return text
 
     def extract_from_docx(self, file_path: str) -> str:
-        """Extract text from DOCX"""
         try:
             doc = docx.Document(file_path)
             text = ""
             for paragraph in doc.paragraphs:
-                text += paragraph.text + "\n"
             return text
         except Exception as e:
             return f"Error reading DOCX: {str(e)}"
 
     def extract_from_txt(self, file_path: str) -> str:
-        """Extract text from TXT"""
-        try:
-            with open(file_path, 'r', encoding='utf-8') as file:
-                return file.read()
-        except Exception as e:
             try:
-                with open(file_path, 'r', encoding='latin-1') as file:
                     return file.read()
-            except Exception as e2:
-                return f"Error reading TXT: {str(e2)}"
 
-    def smart_chunk_text(self, text: str) -> List[str]:
-        """Smart chunking that preserves important information together"""
         if not text.strip():
             return []
 
         chunks = []
-        lines = text.split('\n')
-
-        # Create chunks based on semantic meaning
-        current_chunk = ""
-        chunk_type = None
 
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
 
-            line_lower = line.lower()
-
-            # Identify section types
-            new_chunk_type = None
-            if any(keyword in line_lower for keyword in ['name', 'email', 'phone', 'linkedin', 'github']):
-                new_chunk_type = 'contact'
-            elif any(keyword in line_lower for keyword in ['experience', 'work', 'internship']):
-                new_chunk_type = 'experience'
-            elif any(keyword in line_lower for keyword in ['education', 'degree', 'university', 'college']):
-                new_chunk_type = 'education'
-            elif any(keyword in line_lower for keyword in ['skills', 'technologies', 'programming']):
-                new_chunk_type = 'skills'
-            elif any(keyword in line_lower for keyword in ['project', 'developed', 'built']):
-                new_chunk_type = 'projects'
-
-            # If section type changes, save current chunk and start new one
-            if new_chunk_type != chunk_type and current_chunk:
-                chunks.append(current_chunk.strip())
-                current_chunk = line
-                chunk_type = new_chunk_type
-            else:
-                # Add to current chunk
-                if current_chunk:
-                    current_chunk += "\n" + line
-                else:
                     current_chunk = line
-                    chunk_type = new_chunk_type
-
-            # Limit chunk size
-            if len(current_chunk.split()) > 150:
-                chunks.append(current_chunk.strip())
-                current_chunk = ""
-                chunk_type = None
-
-        # Add the last chunk
-        if current_chunk:
-            chunks.append(current_chunk.strip())
 
         return chunks
 
     def process_documents(self, files) -> str:
-        """Process uploaded files and create embeddings"""
         if not files:
             return "❌ No files uploaded!"
@@ -363,15 +321,24 @@ class DocumentRAG:
             if not all_text.strip():
                 return "❌ No text extracted from files!"
 
-            # Store raw text for smart answering
             self.raw_text = all_text
 
-            # Smart chunk the text
-            self.documents = self.smart_chunk_text(all_text)
 
-            if not self.documents:
                 return "❌ No valid text chunks created!"
 
             # Create embeddings
             print(f"📄 Creating embeddings for {len(self.documents)} chunks...")
            embeddings = self.embedder.encode(self.documents, show_progress_bar=True)
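
The hunk ends just before the index is built. The indexing step consistent with the normalize_L2 and index.search calls elsewhere in this file would look roughly like the sketch below; the exact FAISS index class used in the commit is not visible here, so IndexFlatIP is an assumption:

    import faiss
    import numpy as np

    embeddings = np.asarray(embeddings, dtype='float32')
    faiss.normalize_L2(embeddings)                  # normalize so inner product == cosine
    index = faiss.IndexFlatIP(embeddings.shape[1])  # 384 dims for all-MiniLM-L6-v2
    index.add(embeddings)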
@@ -388,16 +355,18 @@ class DocumentRAG:
 
             return f"✅ Successfully processed {len(processed_files)} files:\n" + \
                    f"📄 Files: {', '.join(processed_files)}\n" + \
-                   f"📊 Created {len(self.documents)} text chunks\n" + \
-                   f"🔍 Ready for Q&A!"
 
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
 
-    def retrieve_context(self, query: str, k: int = 3) -> str:
-        """Retrieve relevant context with improved filtering"""
         if not self.is_indexed:
-            return ""
 
         try:
             # Get query embedding
@@ -405,139 +374,226 @@ class DocumentRAG:
             faiss.normalize_L2(query_embedding)
 
             # Search for similar chunks
-            scores, indices = self.index.search(query_embedding.astype('float32'), min(k, len(self.documents)))
 
-            # Get relevant documents with reasonable threshold
-            relevant_docs = []
             query_lower = query.lower()
 
             for i, idx in enumerate(indices[0]):
                 if idx < len(self.documents):
-                    doc = self.documents[idx]
                     score = scores[0][i]
 
-                    # For "who is" questions, prioritize contact/basic info chunks
-                    if "who is" in query_lower:
-                        doc_lower = doc.lower()
-                        if any(keyword in doc_lower for keyword in ['name', 'email', 'linkedin', 'data scientist', 'developer']):
-                            relevant_docs.insert(0, doc)  # Put at beginning
-                        elif score > 0.15:  # Lower threshold for other relevant content
-                            relevant_docs.append(doc)
-                    else:
-                        if score > 0.2:  # Standard threshold
-                            relevant_docs.append(doc)
-
-            # If no good matches for "who is", get the first few chunks
-            if "who is" in query_lower and not relevant_docs:
-                relevant_docs = self.documents[:2]
-
-            return "\n\n".join(relevant_docs[:3])  # Limit to top 3 chunks
 
         except Exception as e:
             print(f"Error in retrieval: {e}")
-            return ""
 
-    def generate_answer(self, query: str, context: str) -> str:
-        """Generate answer using the LLM with improved prompting"""
-        if self.model is None or self.tokenizer is None:
-            return self.simple_context_answer(query, context)
 
-        try:
-            # Check if using Mistral (has specific prompt format) or fallback model
-            model_name = getattr(self.model.config, '_name_or_path', '').lower()
-            is_mistral = 'mistral' in model_name
-
-            if is_mistral:
-                # Focused prompt for Mistral
-                prompt = f"""<s>[INST] Answer the question about the person based on their resume. Be concise and direct.
 
-Resume Information:
-{context[:800]}
 
 Question: {query}
 
-Provide a brief, specific answer in 1 sentence. [/INST]"""
-            else:
-                # Focused prompt for fallback models
-                prompt = f"""Resume: {context[:600]}
 
 Question: {query}
-Answer briefly:"""
 
-            # Tokenize
-            inputs = self.tokenizer(
-                prompt,
-                return_tensors="pt",
-                max_length=600,
-                truncation=True,
-                padding=True
-            )
-
-            # Move to same device as model
-            if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
-                inputs = {k: v.cuda() for k, v in inputs.items()}
-
-            # Generate with focused parameters
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=50,  # Much shorter for focused answers
-                    temperature=0.1,  # Very low for deterministic responses
-                    do_sample=True,
-                    top_p=0.9,
-                    early_stopping=True,
-                    repetition_penalty=1.1,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id
-                )
-
-            # Decode response
-            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            # Extract answer
-            if is_mistral and "[/INST]" in full_response:
-                answer = full_response.split("[/INST]")[-1].strip()
-            else:
-                answer = full_response[len(prompt):].strip()
-
-            # Clean and validate answer
-            answer = self.clean_answer(answer)
-
-            # If answer is too long or poor quality, use fallback
-            if not answer or len(answer) > 200:
-                return self.simple_context_answer(query, context)
-
-            return answer
-
-        except Exception as e:
-            print(f"Error in generation: {e}")
-            return self.simple_context_answer(query, context)
 
-    def clean_answer(self, answer: str) -> str:
-        """Clean up the generated answer"""
-        if not answer or len(answer) < 5:
-            return ""
 
         # Remove unwanted patterns
         answer = re.sub(r'--- \w+.*? ---', '', answer)
-        answer = re.sub(r'\b\w+@\w+\.\w+\b', '', answer)  # Remove emails
-        answer = re.sub(r'\+91-?\d+', '', answer)  # Remove phone numbers
-        answer = answer.replace('LinkedIn:', '').replace('Github:', '')
 
-        # Clean up whitespace
         answer = ' '.join(answer.split())
 
-        # Take only the first sentence if multiple
-        sentences = answer.split('.')
-        if sentences:
-            first_sentence = sentences[0].strip()
-            if len(first_sentence) > 10:
-                return first_sentence + '.'
 
         return answer.strip()
 
     def answer_question(self, query: str) -> str:
-        """Main function to answer questions"""
         if not query.strip():
             return "❓ Please ask a question!"
@@ -545,52 +601,59 @@ Answer briefly:"""
             return "📝 Please upload and process documents first!"
 
         try:
-            # Retrieve relevant context
-            context = self.retrieve_context(query, k=3)
 
             if not context:
-                return "🔍 No relevant information found in the uploaded documents."
 
-            # Generate answer
-            answer = self.generate_answer(query, context)
 
-            if answer and len(answer) > 5:
-                return answer
-            else:
-                return "I couldn't generate a specific answer from the document content."
 
         except Exception as e:
-            return f"❌ Error answering question: {str(e)}"
 
-# Initialize the RAG system
-print("Initializing Document RAG System...")
-rag_system = DocumentRAG()
 
-# Gradio Interface
 def create_interface():
-    with gr.Blocks(title="📚 Document Q&A with RAG", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
-        # 📚 Document Q&A System
 
-        Upload your documents and ask questions about them!
 
-        **Supported formats:** PDF, DOCX, TXT
         """)
 
-        with gr.Tab("📤 Upload Documents"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
-                        label="Upload Documents",
                        file_count="multiple",
-                        file_types=[".pdf", ".docx", ".txt"]
                    )
-                    process_btn = gr.Button("🔄 Process Documents", variant="primary")
 
                with gr.Column():
                    process_status = gr.Textbox(
-                        label="Processing Status",
-                        lines=8,
                        interactive=False
                    )
 
@@ -600,20 +663,23 @@ def create_interface():
                outputs=[process_status]
            )
 
-        with gr.Tab("❓ Ask Questions"):
            with gr.Row():
                with gr.Column():
                    question_input = gr.Textbox(
-                        label="Your Question",
-                        placeholder="Who is Pradeep?",
                        lines=3
                    )
-                    ask_btn = gr.Button("🔍 Get Answer", variant="primary")
 
                with gr.Column():
                    answer_output = gr.Textbox(
-                        label="Answer",
-                        lines=6,
                        interactive=False
                    )
 
@@ -623,19 +689,58 @@ def create_interface():
                outputs=[answer_output]
            )
 
-        # Example questions
        gr.Markdown("""
-        ### 💡 Example Questions:
-        - Who is [Name]?
-        - What are [Name]'s skills?
-        - What experience does [Name] have?
-        - What projects has [Name] worked on?
-        - What is [Name]'s educational background?
        """)
 
    return demo
 
-# Launch the app
 if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
 import io
 import os
 import re
+from typing import List, Optional, Dict, Tuple
+import json
 
+class SmartDocumentRAG:
     def __init__(self):
+        print("🚀 Initializing Smart RAG System...")
 
         # Initialize embedding model (lightweight)
         self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
 
         # Document storage
         self.documents = []
+        self.document_metadata = []  # Store metadata about each chunk
         self.index = None
         self.is_indexed = False
+        self.raw_text = ""
+        self.document_type = "general"  # Auto-detect document type
+        self.document_summary = ""  # Store document summary
 
     def setup_llm(self):
+        """Setup quantized Mistral model with fallback"""
         try:
             # Check if CUDA is available
             if not torch.cuda.is_available():
+                print("⚠️ CUDA not available, using CPU-optimized model")
+                self.setup_cpu_model()
                 return
 
             quantization_config = BitsAndBytesConfig(
 
             model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 trust_remote_code=True
             )
 
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 quantization_config=quantization_config,
 
             print("✅ Quantized Mistral model loaded successfully")
 
         except Exception as e:
+            print(f"❌ Error loading Mistral: {e}")
+            print("🔄 Falling back to CPU model...")
+            self.setup_cpu_model()
 
+    def setup_cpu_model(self):
+        """Setup CPU-friendly model"""
         try:
+            # Use GPT-2 for better text generation on CPU
+            model_name = "gpt2-medium"
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(model_name)
 
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
+            print("✅ CPU model loaded")
         except Exception as e:
+            print(f"❌ CPU model failed: {e}")
             self.model = None
             self.tokenizer = None
+            print("⚠️ Using context-only mode")
 
+    def detect_document_type(self, text: str) -> str:
+        """Intelligently detect document type"""
+        text_lower = text.lower()
+
+        # Count keywords for different document types
+        resume_keywords = ['experience', 'skills', 'education', 'linkedin', 'email', 'phone', 'internship']
+        research_keywords = ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'study', 'analysis']
+        business_keywords = ['company', 'revenue', 'market', 'strategy', 'business', 'financial', 'quarter']
+        technical_keywords = ['implementation', 'algorithm', 'system', 'technical', 'specification', 'architecture']
+        legal_keywords = ['contract', 'agreement', 'terms', 'conditions', 'legal', 'clause', 'liability']
+
+        scores = {
+            'resume': sum(1 for kw in resume_keywords if kw in text_lower),
+            'research': sum(1 for kw in research_keywords if kw in text_lower),
+            'business': sum(1 for kw in business_keywords if kw in text_lower),
+            'technical': sum(1 for kw in technical_keywords if kw in text_lower),
+            'legal': sum(1 for kw in legal_keywords if kw in text_lower)
         }
 
+        return max(scores, key=scores.get) if max(scores.values()) > 2 else 'general'
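
A quick standalone illustration of the keyword-vote heuristic above; it mirrors detect_document_type rather than invoking the class, whose constructor loads the embedding model:

    text_lower = "abstract, introduction, methodology and conclusion of the study".lower()
    research_keywords = ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'study', 'analysis']
    resume_keywords = ['experience', 'skills', 'education', 'linkedin', 'email', 'phone', 'internship']
    scores = {
        'research': sum(1 for kw in research_keywords if kw in text_lower),
        'resume': sum(1 for kw in resume_keywords if kw in text_lower),
    }
    doc_type = max(scores, key=scores.get) if max(scores.values()) > 2 else 'general'
    print(doc_type)  # 'research': five keyword hits clear the > 2 threshold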
+    def create_document_summary(self, text: str) -> str:
+        """Create intelligent document summary"""
+        try:
+            # Split into paragraphs and find key information
+            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip() and len(p) > 50]
+
+            if not paragraphs:
+                return "Document contains basic text information."
+
+            # Take first few paragraphs for summary context
+            summary_text = ' '.join(paragraphs[:3])[:1000]
+
+            if self.model and self.tokenizer:
+                # Generate AI summary
+                prompt = f"""Summarize the following document in 2-3 sentences, focusing on the main points and key information:
+
+{summary_text}
+
+Summary:"""
+
+                try:
+                    inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
+                    if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
+                        inputs = {k: v.cuda() for k, v in inputs.items()}
+
+                    with torch.no_grad():
+                        outputs = self.model.generate(
+                            **inputs,
+                            max_new_tokens=100,
+                            temperature=0.7,
+                            do_sample=True,
+                            top_p=0.9,
+                            pad_token_id=self.tokenizer.pad_token_id
+                        )
+
+                    summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                    summary = summary.split("Summary:")[-1].strip()
+
+                    if len(summary) > 20:
+                        return summary
+
+                except Exception as e:
+                    print(f"Error generating AI summary: {e}")
 
+            # Fallback: Extract key sentences
+            sentences = re.split(r'[.!?]+', summary_text)
+            key_sentences = [s.strip() for s in sentences if len(s.strip()) > 30][:2]
+
+            return '. '.join(key_sentences) + '.' if key_sentences else "Document contains relevant information."
+
+        except Exception as e:
+            return "Document summary not available."
 
     def extract_text_from_file(self, file_path: str) -> str:
+        """Extract text from various file formats with better error handling"""
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
 
             return f"Error reading file: {str(e)}"
 
     def extract_from_pdf(self, file_path: str) -> str:
+        """Enhanced PDF extraction"""
         text = ""
         try:
             with open(file_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
+                for page_num, page in enumerate(pdf_reader.pages):
+                    page_text = page.extract_text()
+                    if page_text.strip():
+                        text += f"\n[Page {page_num + 1}]\n{page_text}\n"
         except Exception as e:
             text = f"Error reading PDF: {str(e)}"
         return text
 
     def extract_from_docx(self, file_path: str) -> str:
+        """Enhanced DOCX extraction"""
         try:
             doc = docx.Document(file_path)
             text = ""
             for paragraph in doc.paragraphs:
+                if paragraph.text.strip():
+                    text += paragraph.text + "\n"
             return text
         except Exception as e:
             return f"Error reading DOCX: {str(e)}"
 
     def extract_from_txt(self, file_path: str) -> str:
+        """Enhanced TXT extraction with encoding detection"""
+        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+
+        for encoding in encodings:
             try:
+                with open(file_path, 'r', encoding=encoding) as file:
                     return file.read()
+            except UnicodeDecodeError:
+                continue
+            except Exception as e:
+                return f"Error reading TXT: {str(e)}"
+
+        return "Error: Could not decode file with any supported encoding"
 
+    def intelligent_chunk_text(self, text: str, doc_type: str) -> List[Dict]:
+        """Intelligent chunking based on document type"""
         if not text.strip():
             return []
 
         chunks = []
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
 
+        if doc_type == 'research':
+            # For research papers, chunk by sections
+            current_chunk = ""
+            current_section = "introduction"
 
+            for line in lines:
+                line_lower = line.lower()
+
+                # Detect section headers
+                if any(header in line_lower for header in ['abstract', 'introduction', 'methodology', 'results', 'conclusion', 'references']):
+                    if current_chunk:
+                        chunks.append({
+                            'text': current_chunk.strip(),
+                            'section': current_section,
+                            'doc_type': doc_type
+                        })
                     current_chunk = line
+                    current_section = line_lower.split()[0] if line_lower.split() else "section"
+                else:
+                    current_chunk += "\n" + line
+
+                # Limit chunk size
+                if len(current_chunk.split()) > 200:
+                    chunks.append({
+                        'text': current_chunk.strip(),
+                        'section': current_section,
+                        'doc_type': doc_type
+                    })
+                    current_chunk = ""
+
+            if current_chunk:
+                chunks.append({
+                    'text': current_chunk.strip(),
+                    'section': current_section,
+                    'doc_type': doc_type
+                })
+
+        else:
+            # General intelligent chunking
+            current_chunk = ""
+            sentence_count = 0
+
+            for line in lines:
+                current_chunk += line + "\n"
+                sentence_count += len(re.findall(r'[.!?]+', line))
+
+                # Create chunk based on sentence count or word count
+                if sentence_count >= 5 or len(current_chunk.split()) > 150:
+                    chunks.append({
+                        'text': current_chunk.strip(),
+                        'section': 'content',
+                        'doc_type': doc_type
+                    })
+                    current_chunk = ""
+                    sentence_count = 0
+
+            if current_chunk:
+                chunks.append({
+                    'text': current_chunk.strip(),
+                    'section': 'content',
+                    'doc_type': doc_type
+                })
 
         return chunks
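
A standalone sketch of the general-chunking rule above: close a chunk once five sentence terminators (or roughly 150 words) have accumulated. The input lines are made up:

    import re

    lines = ["One. Two. Three.", "Four. Five.", "Six."]
    chunks, current, sentences = [], "", 0
    for line in lines:
        current += line + "\n"
        sentences += len(re.findall(r'[.!?]+', line))
        if sentences >= 5 or len(current.split()) > 150:
            chunks.append({'text': current.strip(), 'section': 'content', 'doc_type': 'general'})
            current, sentences = "", 0
    if current:
        chunks.append({'text': current.strip(), 'section': 'content', 'doc_type': 'general'})
    print(len(chunks))  # 2: the first two lines close a chunk, "Six." forms the tail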
     def process_documents(self, files) -> str:
+        """Enhanced document processing with intelligent analysis"""
         if not files:
             return "❌ No files uploaded!"
 
             if not all_text.strip():
                 return "❌ No text extracted from files!"
 
+            # Store raw text
             self.raw_text = all_text
 
+            # Detect document type
+            self.document_type = self.detect_document_type(all_text)
+
+            # Create document summary
+            self.document_summary = self.create_document_summary(all_text)
 
+            # Intelligent chunking
+            chunk_data = self.intelligent_chunk_text(all_text, self.document_type)
+
+            if not chunk_data:
                 return "❌ No valid text chunks created!"
 
+            self.documents = [chunk['text'] for chunk in chunk_data]
+            self.document_metadata = chunk_data
+
             # Create embeddings
             print(f"📄 Creating embeddings for {len(self.documents)} chunks...")
             embeddings = self.embedder.encode(self.documents, show_progress_bar=True)
 
             return f"✅ Successfully processed {len(processed_files)} files:\n" + \
                    f"📄 Files: {', '.join(processed_files)}\n" + \
+                   f"📊 Document Type: {self.document_type.title()}\n" + \
+                   f"🔍 Created {len(self.documents)} intelligent chunks\n" + \
+                   f"📝 Summary: {self.document_summary[:200]}...\n" + \
+                   f"🚀 Ready for smart Q&A!"
 
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
 
+    def smart_retrieve_context(self, query: str, k: int = 4) -> Tuple[str, List[Dict]]:
+        """Enhanced context retrieval with intelligent ranking"""
         if not self.is_indexed:
+            return "", []
 
         try:
             # Get query embedding
             faiss.normalize_L2(query_embedding)
 
             # Search for similar chunks
+            scores, indices = self.index.search(query_embedding.astype('float32'), min(k * 2, len(self.documents)))
 
+            # Analyze query intent
             query_lower = query.lower()
+            is_summary_request = any(word in query_lower for word in ['summary', 'summarize', 'overview', 'what is', 'about'])
+            is_specific_request = any(word in query_lower for word in ['how', 'why', 'when', 'where', 'which'])
+
+            relevant_chunks = []
 
             for i, idx in enumerate(indices[0]):
                 if idx < len(self.documents):
                     score = scores[0][i]
+                    chunk_data = self.document_metadata[idx]
+
+                    # Adjust scoring based on query type and document structure
+                    adjusted_score = score
 
+                    if is_summary_request:
+                        # Boost introductory sections for summary requests
+                        if chunk_data['section'] in ['introduction', 'abstract', 'content']:
+                            adjusted_score += 0.1
+
+                    if adjusted_score > 0.15:  # Threshold for relevance
+                        relevant_chunks.append({
+                            'text': self.documents[idx],
+                            'score': adjusted_score,
+                            'metadata': chunk_data
+                        })
+
+            # Sort by adjusted score
+            relevant_chunks.sort(key=lambda x: x['score'], reverse=True)
+
+            # Take top chunks
+            top_chunks = relevant_chunks[:k]
+            context = "\n\n".join([chunk['text'] for chunk in top_chunks])
+
+            return context, top_chunks
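
An illustrative call pattern for the retriever above, assuming documents have already been processed and indexed; the names follow this file:

    context, top_chunks = rag_system.smart_retrieve_context("What are the key findings?", k=4)
    for chunk in top_chunks:
        print(round(chunk['score'], 3), chunk['metadata']['section'])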
         except Exception as e:
             print(f"Error in retrieval: {e}")
+            return "", []
 
+    def generate_smart_answer(self, query: str, context: str, chunks_data: List[Dict]) -> str:
+        """Generate intelligent answers based on query type and context"""
+        if not context:
+            return "No relevant information found in the documents."
 
+        query_lower = query.lower()
+
+        # Determine answer type
+        is_summary_request = any(word in query_lower for word in ['summary', 'summarize', 'overview', 'what is', 'about'])
+        is_comparison_request = any(word in query_lower for word in ['compare', 'difference', 'versus', 'vs'])
+        is_specific_question = any(word in query_lower for word in ['how', 'why', 'when', 'where', 'which'])
+
+        if self.model and self.tokenizer:
+            try:
+                # Create intelligent prompt based on query type
+                if is_summary_request:
+                    prompt = self.create_summary_prompt(query, context)
+                elif is_comparison_request:
+                    prompt = self.create_comparison_prompt(query, context)
+                else:
+                    prompt = self.create_general_prompt(query, context)
+
+                # Generate response
+                inputs = self.tokenizer(
+                    prompt,
+                    return_tensors="pt",
+                    max_length=800,
+                    truncation=True,
+                    padding=True
+                )
+
+                if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
+                    inputs = {k: v.cuda() for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_new_tokens=150,
+                        temperature=0.3,
+                        do_sample=True,
+                        top_p=0.9,
+                        repetition_penalty=1.1,
+                        pad_token_id=self.tokenizer.pad_token_id,
+                        eos_token_id=self.tokenizer.eos_token_id
+                    )
+
+                # Extract and clean answer
+                full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                answer = self.extract_answer_from_response(full_response, prompt)
+
+                if answer and len(answer) > 20:
+                    return self.clean_and_validate_answer(answer)
+
+            except Exception as e:
+                print(f"Error in AI generation: {e}")
+
+        # Fallback to intelligent context-based answering
+        return self.context_based_smart_answer(query, context, chunks_data)
+
+    def create_summary_prompt(self, query: str, context: str) -> str:
+        """Create prompt for summary requests"""
+        return f"""Based on the document content below, provide a comprehensive summary addressing the question.
 
+Document Content:
+{context[:1000]}
 
 Question: {query}
 
+Provide a clear, informative summary that addresses the question:"""
+
+    def create_comparison_prompt(self, query: str, context: str) -> str:
+        """Create prompt for comparison requests"""
+        return f"""Analyze the document content and provide a comparison as requested.
+
+Document Content:
+{context[:1000]}
 
 Question: {query}
 
+Provide a detailed comparison based on the information:"""
 
+    def create_general_prompt(self, query: str, context: str) -> str:
+        """Create prompt for general questions"""
+        return f"""Answer the question based on the document content provided.
+
+Document Content:
+{context[:1000]}
+
+Question: {query}
+
+Provide a specific, accurate answer:"""
+
+    def extract_answer_from_response(self, response: str, prompt: str) -> str:
+        """Extract clean answer from model response"""
+        # Remove the prompt part
+        if prompt in response:
+            answer = response.replace(prompt, "").strip()
+        else:
+            # Try to find the answer after common patterns
+            patterns = ["Answer:", "Summary:", "Response:", "answer:", "summary:", "response:"]
+            answer = response
+            for pattern in patterns:
+                if pattern in response:
+                    answer = response.split(pattern)[-1].strip()
+                    break
+
+        return answer
+
+    def context_based_smart_answer(self, query: str, context: str, chunks_data: List[Dict]) -> str:
+        """Intelligent context-based answering as fallback"""
+        query_lower = query.lower()
+
+        # For summary requests
+        if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
+            return self.create_context_summary(context, chunks_data)
+
+        # For specific questions, find most relevant sentences
+        context_sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 20]
+        query_words = set(query_lower.split())
+
+        # Score sentences by relevance
+        scored_sentences = []
+        for sentence in context_sentences:
+            sentence_words = set(sentence.lower().split())
+            overlap = len(query_words.intersection(sentence_words))
+            if overlap > 0:
+                scored_sentences.append((sentence, overlap))
+
+        # Sort by relevance and combine top sentences
+        scored_sentences.sort(key=lambda x: x[1], reverse=True)
+
+        if scored_sentences:
+            top_sentences = [s[0] for s in scored_sentences[:3]]
+            return '. '.join(top_sentences) + '.'
 
+        return "I found relevant information but couldn't extract a specific answer. Please try rephrasing your question."
+
+    def create_context_summary(self, context: str, chunks_data: List[Dict]) -> str:
+        """Create summary from context"""
+        # Get key sentences from different sections
+        sentences_by_section = {}
+
+        for chunk in chunks_data:
+            section = chunk['metadata']['section']
+            sentences = [s.strip() for s in chunk['text'].split('.') if len(s.strip()) > 30]
+            if sentences:
+                if section not in sentences_by_section:
+                    sentences_by_section[section] = []
+                sentences_by_section[section].extend(sentences[:2])  # Top 2 sentences per section
+
+        # Combine sentences from different sections
+        summary_parts = []
+        for section, sentences in sentences_by_section.items():
+            if sentences:
+                summary_parts.extend(sentences[:1])  # One sentence per section
+
+        if summary_parts:
+            return '. '.join(summary_parts[:4]) + '.'  # Max 4 sentences
+
+        return self.document_summary if self.document_summary else "Document contains relevant information on the requested topic."
+
+    def clean_and_validate_answer(self, answer: str) -> str:
+        """Clean and validate the generated answer"""
         # Remove unwanted patterns
         answer = re.sub(r'--- \w+.*? ---', '', answer)
+        answer = re.sub(r'\[Page \d+\]', '', answer)
 
+        # Clean up whitespace and formatting
         answer = ' '.join(answer.split())
 
+        # Ensure proper sentence structure
+        if answer and not answer.endswith(('.', '!', '?')):
+            answer += '.'
 
         return answer.strip()
 
     def answer_question(self, query: str) -> str:
+        """Main function to answer questions intelligently"""
         if not query.strip():
             return "❓ Please ask a question!"
 
             return "📝 Please upload and process documents first!"
 
         try:
+            # Special handling for document-level questions
+            query_lower = query.lower()
+            if query_lower in ['summary', 'summarize this document', 'what is this about']:
+                return f"📄 Document Summary:\n\n{self.document_summary}"
+
+            # Retrieve relevant context with intelligence
+            context, chunks_data = self.smart_retrieve_context(query, k=4)
 
             if not context:
+                return "🔍 No relevant information found for your question. Try rephrasing or asking about different aspects of the document."
 
+            # Generate intelligent answer
+            answer = self.generate_smart_answer(query, context, chunks_data)
 
+            return answer if answer else "I couldn't generate a specific answer. Please try asking in a different way."
 
         except Exception as e:
+            return f"❌ Error processing question: {str(e)}"
 
+# Initialize the enhanced RAG system
+print("Initializing Smart Document RAG System...")
+rag_system = SmartDocumentRAG()
 
+# Enhanced Gradio Interface
 def create_interface():
+    with gr.Blocks(title="🧠 Smart Document Q&A", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
+        # 🧠 Smart Document Q&A System
 
+        Upload documents and get intelligent answers with summaries and insights!
 
+        **Features:**
+        - 🎯 Intelligent document type detection
+        - 📊 Smart summarization
+        - 🔍 Context-aware answers
+        - 📚 Multi-format support (PDF, DOCX, TXT)
         """)
 
+        with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
                     file_upload = gr.File(
+                        label="📁 Upload Documents",
                         file_count="multiple",
+                        file_types=[".pdf", ".docx", ".txt"],
+                        height=150
                     )
+                    process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
 
                 with gr.Column():
                     process_status = gr.Textbox(
+                        label="📋 Processing Status & Document Analysis",
+                        lines=10,
                         interactive=False
                     )
 
                 outputs=[process_status]
             )
 
+        with gr.Tab("❓ Smart Q&A"):
             with gr.Row():
                 with gr.Column():
                     question_input = gr.Textbox(
+                        label="🤔 Ask Anything",
+                        placeholder="What is this document about? / Summarize the main points / How does X work?",
                         lines=3
                     )
+
+                    with gr.Row():
+                        ask_btn = gr.Button("🧠 Get Smart Answer", variant="primary")
+                        summary_btn = gr.Button("📊 Get Summary", variant="secondary")
 
                 with gr.Column():
                     answer_output = gr.Textbox(
+                        label="💡 Smart Answer",
+                        lines=8,
                         interactive=False
                     )
 
                 outputs=[answer_output]
             )
 
+            summary_btn.click(
+                fn=lambda: rag_system.answer_question("summary"),
+                inputs=[],
+                outputs=[answer_output]
+            )
+
+            # Enhanced example questions
+            gr.Markdown("""
+            ### 💡 Smart Question Examples:
+
+            **📊 For Summaries:**
+            - "What is this document about?"
+            - "Summarize the main points"
+            - "Give me an overview"
+
+            **🔍 For Specific Information:**
+            - "How does [topic] work?"
+            - "What are the key findings?"
+            - "Explain [concept] from the document"
+
+            **🎯 For Analysis:**
+            - "What are the pros and cons?"
+            - "Compare [A] and [B]"
+            - "What conclusions can be drawn?"
+            """)
+
+        with gr.Tab("ℹ️ Tips"):
             gr.Markdown("""
+            ### 🚀 How to Get the Best Results:
+
+            **📄 Document Types Supported:**
+            - Research papers & academic documents
+            - Business reports & presentations
+            - Technical documentation
+            - Legal documents
+            - General text documents
+
+            **❓ Question Tips:**
+            - Be specific about what you want to know
+            - Use "summarize" or "overview" for general summaries
+            - Ask "how", "why", "what" for detailed explanations
+            - Request comparisons with "compare" or "difference"
+
+            **🎯 Best Practices:**
+            - Upload clear, well-formatted documents
+            - Ask one question at a time for focused answers
+            - Try rephrasing if the first answer isn't what you expected
             """)
 
     return demo
 
+# Launch the enhanced app
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(