pradeepsengarr committed on
Commit d64804c · verified · 1 Parent(s): adc1d58

Update app.py

Files changed (1)
  1. app.py +285 -167
app.py CHANGED
@@ -1,245 +1,363 @@
  import re
  import faiss
  import numpy as np
- from typing import List
  from sentence_transformers import SentenceTransformer
  from transformers import pipeline
- import gradio as gr

- # Helper: clean and normalize text
- def clean_text(text: str) -> str:
-     text = re.sub(r'\s+', ' ', text)
-     text = text.strip()
-     return text

- # Main class for Document Retrieval & Q&A
  class SmartDocumentRAG:
-     def __init__(self,
-                  embedder_model='sentence-transformers/all-MiniLM-L6-v2',
-                  qa_model='distilbert-base-cased-distilled-squad',
-                  summarization_model='facebook/bart-large-cnn'):
-
-         print("Loading models... this may take a moment.")
-
-         # Embedding model for semantic search
          self.embedder = SentenceTransformer(embedder_model)

-         # Q&A pipeline for answering questions
          self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)

-         # Summarization pipeline for document summaries
-         self.summarizer = pipeline('summarization', model=summarization_model, tokenizer=summarization_model)
-
-         # Initialize document storage and index
-         self.documents: List[str] = []
          self.index = None
          self.is_indexed = False
-         self.document_summary = ""
-         self.raw_text = ""

-     # --- Document processing ---

-     def chunk_text(self, text: str, max_len: int = 250) -> List[str]:
-         # Split text into chunks of roughly max_len words each
-         words = text.split()
          chunks = []
-         for i in range(0, len(words), max_len):
-             chunk = ' '.join(words[i:i+max_len])
-             chunks.append(clean_text(chunk))
          return chunks
-
      def process_documents(self, files) -> str:
          if not files:
              return "❌ No files uploaded!"
-
-         all_text = ""
          try:
-             for file_obj in files:
-                 filename = file_obj.name
-                 file_bytes = file_obj.read()
-                 ext = filename.split('.')[-1].lower()
-
-                 text = ""
-                 if ext == 'pdf':
-                     import fitz  # PyMuPDF
-                     doc = fitz.open(stream=file_bytes, filetype="pdf")
-                     for page in doc:
-                         text += page.get_text()
-                     doc.close()
-                 elif ext == 'docx':
-                     import docx2txt
-                     import io
-                     # docx2txt accepts a path or a file-like object; use BytesIO
-                     text = docx2txt.process(io.BytesIO(file_bytes))
-                 elif ext in ['txt', 'text']:
-                     text = file_bytes.decode('utf-8', errors='ignore')
                  else:
-                     return f"❌ Unsupported file type: {ext}"
-
-                 all_text += "\n\n" + text
-
-             all_text = clean_text(all_text)
-             self.raw_text = all_text
-             # Chunk documents
-             self.documents = self.chunk_text(all_text)
-
-             if not self.documents:
-                 return "❌ No text extracted from documents."
-
-             # Build FAISS index
-             embeddings = self.embedder.encode(self.documents, convert_to_numpy=True, show_progress_bar=True)
-             embeddings = embeddings.astype('float32')
-
              dimension = embeddings.shape[1]
              self.index = faiss.IndexFlatIP(dimension)
              faiss.normalize_L2(embeddings)
-             self.index.add(embeddings)
-
              self.is_indexed = True
-
-             # Create summary
-             self.document_summary = self.create_document_summary(all_text)
-
-             return f"✅ Processed {len(self.documents)} text chunks from documents. Summary generated."
-
          except Exception as e:
-             return f"❌ Error processing documents: {str(e)}"
-
-     # --- Semantic search ---
      def find_relevant_content(self, query: str, top_k: int = 3) -> str:
-         if not self.is_indexed or not self.index:
              return ""
          try:
              query_embedding = self.embedder.encode([query], convert_to_numpy=True)
              faiss.normalize_L2(query_embedding)
-             scores, indices = self.index.search(query_embedding.astype('float32'), top_k)
-
              relevant_chunks = []
              for score, idx in zip(scores[0], indices[0]):
-                 if idx < len(self.documents) and score > 0.15:  # threshold tuned to reduce noise
                      relevant_chunks.append(self.documents[idx])
-
-             if not relevant_chunks:
-                 return ""
-
-             return ' '.join(relevant_chunks)
          except Exception as e:
-             print(f"Error in semantic search: {e}")
              return ""

-     # --- Summarization ---
-     def create_document_summary(self, text: str) -> str:
-         try:
-             # Limit the summarizer input to ~1000 characters to avoid issues
-             max_input_length = 1000
-             input_text = text[:max_input_length] + ('...' if len(text) > max_input_length else '')
-             summary_output = self.summarizer(input_text, max_length=150, min_length=40, do_sample=False)
-             summary = summary_output[0]['summary_text']
-             return summary
-         except Exception as e:
-             # Fallback: simple heuristic summary
-             sentences = re.split(r'(?<=[.!?]) +', text)
-             return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '')

-     # --- Question answering ---
-     def answer_question(self, query: str) -> str:
          if not query.strip():
-             return "❓ Please ask a question!"
          if not self.is_indexed:
-             return "📝 Please upload and process documents first!"
-
          query_lower = query.lower()
-         # Summary shortcut
          if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
-             return f"📄 Document Summary:\n\n{self.document_summary}"
-
-         # Get relevant context
          context = self.find_relevant_content(query, top_k=3)
          if not context:
-             return "🔍 No relevant information found for your question."
-
          try:
-             # The Q&A pipeline expects the question and context separately
              result = self.qa_pipeline(question=query, context=context)
-
              answer = result.get('answer', '').strip()
              score = result.get('score', 0.0)
-
-             # Confidence thresholding & hallucination check
              if score < 0.20 or not answer or answer.lower() in ['no answer', '']:
-                 return "I don't know based on the provided documents."
-
-             # Optional heuristic: if the answer is too short or unrelated to the question, fall back
              if len(answer) < 3 or (query_lower not in answer.lower() and score < 0.35):
-                 return "I don't know based on the provided documents."
-
-             # Return the answer plus a context snippet for transparency
-             return f"**Answer:** {answer}\n\n*Context snippet:* {context[:300]}..."
          except Exception as e:
-             return f"❌ Error answering question: {str(e)}"

- # --- Gradio UI ---

  def create_interface():
      rag_system = SmartDocumentRAG()
-
-     with gr.Blocks(title="🧠 Enhanced Document Q&A System", theme=gr.themes.Soft()) as demo:
          gr.Markdown("""
          # 🧠 Enhanced Document Q&A System

          **Features:**
-         - 🎯 DistilBERT for Q&A with confidence checks
-         - ⚡ Sentence-BERT + FAISS semantic search
-         - 📊 Strong summarization with BART-large-CNN
-         - 🔍 Transparent answers with context snippets
          """)
-
          with gr.Tab("📤 Upload & Process"):
              with gr.Row():
                  with gr.Column():
-                     file_upload = gr.File(
-                         label="📁 Upload Documents (PDF, DOCX, TXT)",
-                         file_count="multiple",
-                         file_types=[".pdf", ".docx", ".txt"],
-                         height=150
-                     )
-                     process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
                  with gr.Column():
-                     process_status = gr.Textbox(label="📋 Processing Status", lines=10, interactive=False)
-
-             process_btn.click(
-                 fn=rag_system.process_documents,
-                 inputs=[file_upload],
-                 outputs=[process_status]
-             )
-
          with gr.Tab("❓ Q&A"):
              with gr.Row():
                  with gr.Column():
-                     question_input = gr.Textbox(
-                         label="🤔 Ask Your Question",
-                         placeholder="e.g., What is the person's name? How many years of experience? What skills do they have?",
-                         lines=3
-                     )
                      with gr.Row():
                          ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                          summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                  with gr.Column():
                      answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
-
-             ask_btn.click(
-                 fn=rag_system.answer_question,
-                 inputs=[question_input],
-                 outputs=[answer_output]
-             )
-
-             summary_btn.click(
-                 fn=lambda: rag_system.answer_question("summary"),
-                 inputs=[],
-                 outputs=[answer_output]
-             )
-
      return demo
 
+ import os
  import re
  import faiss
+ import docx
+ import PyPDF2
+ import gradio as gr
  import numpy as np
+ from typing import List, Dict
  from sentence_transformers import SentenceTransformer
  from transformers import pipeline


  class SmartDocumentRAG:
+     def __init__(self, embedder_model='sentence-transformers/all-MiniLM-L6-v2', qa_model='distilbert-base-cased-distilled-squad'):
+         # Load the sentence embedding model
          self.embedder = SentenceTransformer(embedder_model)

+         # Load the Q&A pipeline model
          self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)

+         # Document and index initialization
+         self.documents = []
+         self.document_metadata = []
+         self.raw_text = ""
+         self.document_summary = ""
+         self.document_type = ""
          self.index = None
          self.is_indexed = False
+         self.model_type = "distilbert-qa"  # Can add flan-t5 or others as needed
+
+     ####################
+     # Text Extraction
+     ####################
+     def extract_text_from_file(self, file_path: str) -> str:
+         ext = os.path.splitext(file_path)[1].lower()
+         try:
+             if ext == '.pdf':
+                 return self.extract_from_pdf(file_path)
+             elif ext == '.docx':
+                 return self.extract_from_docx(file_path)
+             elif ext == '.txt':
+                 return self.extract_from_txt(file_path)
+             else:
+                 return f"Unsupported file type: {ext}"
+         except Exception as e:
+             return f"Error reading file: {e}"
+
+     def extract_from_pdf(self, file_path: str) -> str:
+         text = ""
+         try:
+             with open(file_path, 'rb') as f:
+                 reader = PyPDF2.PdfReader(f)
+                 for page in reader.pages:
+                     txt = page.extract_text() or ""
+                     cleaned = self.clean_text(txt)
+                     text += cleaned + "\n"
+             return text.strip()
+         except Exception as e:
+             return f"Error reading PDF: {e}"
+
+     def extract_from_docx(self, file_path: str) -> str:
+         try:
+             doc = docx.Document(file_path)
+             paragraphs = [self.clean_text(p.text) for p in doc.paragraphs if p.text.strip()]
+             return "\n".join(paragraphs)
+         except Exception as e:
+             return f"Error reading DOCX: {e}"
+
+     def extract_from_txt(self, file_path: str) -> str:
+         encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+         for enc in encodings:
+             try:
+                 with open(file_path, 'r', encoding=enc) as f:
+                     return self.clean_text(f.read())
+             except UnicodeDecodeError:
+                 continue
+             except Exception as e:
+                 return f"Error reading TXT: {e}"
+         return "Could not decode TXT file."
+
+     def clean_text(self, text: str) -> str:
+         # Normalize whitespace, fix merged words, strip stray characters
+         text = re.sub(r'\s+', ' ', text)
+         text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Split camel-case merges
+         text = text.strip()
+         return text

+     ####################
+     # Document Type Detection & Summary
+     ####################
+     def detect_document_type(self, text: str) -> str:
+         lower_text = text.lower()
+         if any(k in lower_text for k in ['abstract', 'study', 'research', 'methodology']):
+             return 'research'
+         elif any(k in lower_text for k in ['company', 'business', 'organization', 'financial']):
+             return 'business'
+         else:
+             return 'general'
+
+     def create_document_summary(self, text: str) -> str:
+         sentences = re.split(r'(?<=[.!?]) +', text)
+         sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+
+         if self.document_type == 'research':
+             return self.extract_research_summary(sentences)
+         elif self.document_type == 'business':
+             return self.extract_business_summary(sentences)
+         else:
+             return self.extract_general_summary(sentences)
+
+     def extract_research_summary(self, sentences: List[str]) -> str:
+         for s in sentences[:7]:
+             if any(w in s.lower() for w in ['abstract', 'study', 'research']):
+                 return s[:300] + ('...' if len(s) > 300 else '')
+         return sentences[0][:300] if sentences else "Research document."
+
+     def extract_business_summary(self, sentences: List[str]) -> str:
+         for s in sentences[:5]:
+             if any(w in s.lower() for w in ['company', 'business', 'organization']):
+                 return s[:300] + ('...' if len(s) > 300 else '')
+         return sentences[0][:300] if sentences else "Business document."
+
+     def extract_general_summary(self, sentences: List[str]) -> str:
+         return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '') if sentences else "General document."
+
+     ####################
+     # Chunking
+     ####################
+     def enhanced_chunk_text(self, text: str, chunk_size: int = 3, overlap: int = 1) -> List[Dict]:
+         if not text.strip():
+             return []
+
+         sentences = re.split(r'(?<=[.!?]) +', text)
+         sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
          chunks = []
+         for i in range(0, len(sentences), chunk_size - overlap):
+             chunk_sents = sentences[i:i + chunk_size]
+             if chunk_sents:
+                 chunk_text = " ".join(chunk_sents)
+                 chunks.append({
+                     "text": chunk_text,
+                     "sentence_indices": list(range(i, min(i + chunk_size, len(sentences)))),
+                     "doc_type": self.document_type
+                 })
          return chunks
+
+     ####################
+     # Processing uploaded files
+     ####################
      def process_documents(self, files) -> str:
          if not files:
              return "❌ No files uploaded!"
+
          try:
+             all_text = ""
+             processed_files = []
+
+             for file in files:
+                 if file is None:
+                     continue
+                 file_text = self.extract_text_from_file(file.name)
+                 if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
+                     all_text += " " + file_text
+                     processed_files.append(os.path.basename(file.name))
                  else:
+                     return f"❌ {file_text}"
+
+             if not all_text.strip():
+                 return "❌ No text extracted from files!"
+
+             self.raw_text = all_text.strip()
+             self.document_type = self.detect_document_type(self.raw_text)
+             self.document_summary = self.create_document_summary(self.raw_text)
+
+             chunks = self.enhanced_chunk_text(self.raw_text)
+             if not chunks:
+                 return "❌ No valid chunks created!"
+
+             self.documents = [c["text"] for c in chunks]
+             self.document_metadata = chunks
+
+             embeddings = self.embedder.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
              dimension = embeddings.shape[1]
+
              self.index = faiss.IndexFlatIP(dimension)
              faiss.normalize_L2(embeddings)
+             self.index.add(embeddings.astype('float32'))
+
              self.is_indexed = True
+
+             return (f"✅ Processed {len(processed_files)} files: {', '.join(processed_files)}\n"
+                     f"📄 Document Type: {self.document_type.title()}\n"
+                     f"🔍 Created {len(self.documents)} chunks\n"
+                     f"📝 Summary: {self.document_summary}\n"
+                     f"🚀 Ready for Q&A!")
+
          except Exception as e:
+             return f"❌ Error processing documents: {e}"
+
+     ####################
+     # Search & Answer
+     ####################
      def find_relevant_content(self, query: str, top_k: int = 3) -> str:
+         if not self.is_indexed:
              return ""
+
          try:
              query_embedding = self.embedder.encode([query], convert_to_numpy=True)
              faiss.normalize_L2(query_embedding)
+
+             k = min(top_k, len(self.documents))
+             scores, indices = self.index.search(query_embedding.astype('float32'), k)
+
              relevant_chunks = []
              for score, idx in zip(scores[0], indices[0]):
+                 if idx < len(self.documents) and score > 0.15:
                      relevant_chunks.append(self.documents[idx])
+
+             return " ".join(relevant_chunks)
+
          except Exception as e:
+             print(f"Search error: {e}")
              return ""
+
+     def answer_question(self, query: str) -> str:
+         """
+         Answer the user's question based on the processed documents.

+         Features:
+         - Returns the document summary if the query asks for one.
+         - Uses semantic search to find relevant context.
+         - Runs the extractive QA pipeline on the retrieved context.
+         - Applies a confidence threshold to reduce hallucinations.
+         - Returns a fallback message if the answer is unreliable.
+         """
          if not query.strip():
+             return "❓ Please ask a valid question."
+
          if not self.is_indexed:
+             return "📝 Please upload and process documents before asking questions."
+
          query_lower = query.lower()
+
+         # Handle summary requests
          if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
+             if self.document_summary:
+                 return f"📄 Document Summary:\n\n{self.document_summary}"
+             else:
+                 return "⚠️ Summary not available. Please process documents first."
+
+         # Find relevant chunks for context
          context = self.find_relevant_content(query, top_k=3)
          if not context:
+             return "🔍 Sorry, no relevant information was found for your question. Try rephrasing."
+
          try:
+             # The DistilBERT QA pipeline takes the question and context as separate arguments
              result = self.qa_pipeline(question=query, context=context)
+
              answer = result.get('answer', '').strip()
              score = result.get('score', 0.0)
+
+             # Confidence threshold to prevent hallucination
              if score < 0.20 or not answer or answer.lower() in ['no answer', '']:
+                 return "🤔 I couldn't find a confident answer to your question based on the documents."
+
+             # Optional heuristic: reject answers that are too short or unrelated
              if len(answer) < 3 or (query_lower not in answer.lower() and score < 0.35):
+                 return "🤔 I couldn't find a confident answer to your question based on the documents."
+
+             # Return the answer with a snippet of context for transparency
+             snippet = context[:300].strip()
+             if len(context) > 300:
+                 snippet += "..."
+
+             return f"**Answer:** {answer}\n\n*Context snippet:* {snippet}"
+
          except Exception as e:
+             # If the model fails, fall back to a simple error message
+             return f"❌ An error occurred while answering your question: {str(e)}"
+
+     def extract_direct_answer(self, query: str, context: str) -> str:
+         lower_query = query.lower()
+
+         # Extract names (simple heuristic)
+         if any(k in lower_query for k in ['name', 'who is', 'who']):
+             names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
+             if names:
+                 return f"**Name:** {names[0]}"
+
+         # Extract years of experience
+         if any(k in lower_query for k in ['experience', 'years']):
+             exp = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
+             if exp:
+                 return f"**Experience:** {exp[0]} years"
+
+         # Extract skills
+         if any(k in lower_query for k in ['skill', 'technology', 'tech']):
+             skills_regex = r'\b(Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
+             skills_found = list(set(re.findall(skills_regex, context, re.I)))
+             if skills_found:
+                 return f"**Skills mentioned:** {', '.join(skills_found)}"
+
+         # Extract education
+         if any(k in lower_query for k in ['education', 'degree', 'university']):
+             edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
+             if edu:
+                 return f"**Education:** {edu[0]}"
+
+         # Fallback: first sentence of the context
+         sentences = re.split(r'(?<=[.!?]) +', context)
+         if sentences:
+             return f"**Answer:** {sentences[0]}"

+         return "I found relevant information but could not extract a precise answer."


+ # Gradio interface creation
  def create_interface():
      rag_system = SmartDocumentRAG()
+
+     with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
          gr.Markdown("""
          # 🧠 Enhanced Document Q&A System

+         **Optimized with Better Chunking, Summaries, and Reduced Hallucination**
+
          **Features:**
+         - 🎯 DistilBERT Q&A pipeline for accurate answers
+         - ⚡ SentenceTransformer embeddings + FAISS semantic search
+         - 📊 Improved document summaries & chunking
+         - 🔍 Direct-answer fallback for fact extraction
          """)
+
          with gr.Tab("📤 Upload & Process"):
              with gr.Row():
                  with gr.Column():
+                     file_upload = gr.File(label="📁 Upload Documents", file_types=[".pdf", ".docx", ".txt"], file_count="multiple", interactive=True)
+                     process_btn = gr.Button("🔄 Process Documents", variant="primary")
                  with gr.Column():
+                     process_status = gr.Textbox(label="📋 Processing Status", lines=8, interactive=False)
+
+             process_btn.click(fn=rag_system.process_documents, inputs=[file_upload], outputs=[process_status])
+
          with gr.Tab("❓ Q&A"):
              with gr.Row():
                  with gr.Column():
+                     question_input = gr.Textbox(label="🤔 Ask Your Question", placeholder="Enter your question here...", lines=3)
                      with gr.Row():
                          ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                          summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                  with gr.Column():
                      answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
+
+             ask_btn.click(fn=rag_system.answer_question, inputs=[question_input], outputs=[answer_output])
+             summary_btn.click(fn=lambda: rag_system.answer_question("summary"), inputs=[], outputs=[answer_output])
+
      return demo
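Note: this hunk ends at `return demo`, and no `demo.launch()` call is visible in the diff (the file's final lines appear blank in this view). If the launch call is in fact absent, a minimal entry point for running the app locally would look like the following sketch; the `__main__` guard is an assumption, not part of this commit:

# Hypothetical entry point (not shown in this diff)
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()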
362
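Note on the new chunking: `enhanced_chunk_text` walks the sentence list with a stride of `chunk_size - overlap`, so the defaults (chunk_size=3, overlap=1) produce three-sentence chunks where consecutive chunks share one sentence. A standalone sketch of the same windowing arithmetic:

# Stride used by enhanced_chunk_text: chunk_size - overlap = 2
sentences = ["S0.", "S1.", "S2.", "S3.", "S4."]
chunk_size, overlap = 3, 1
windows = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size - overlap)]
print(windows)  # [['S0.', 'S1.', 'S2.'], ['S2.', 'S3.', 'S4.'], ['S4.']]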
 
363
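Note on the index: both the chunk embeddings and the query embedding are passed through `faiss.normalize_L2` before the `IndexFlatIP` search, so the inner-product scores are cosine similarities and the `score > 0.15` filter in `find_relevant_content` is effectively a cosine cutoff. A minimal sketch of that equivalence (assumes faiss and numpy are installed):

import faiss
import numpy as np

vecs = np.random.rand(4, 8).astype('float32')
faiss.normalize_L2(vecs)          # in-place L2 normalization
index = faiss.IndexFlatIP(8)      # inner product on unit vectors == cosine similarity
index.add(vecs)
scores, ids = index.search(vecs[:1].copy(), 3)
print(scores[0][0])               # ~1.0: the self-match has cosine similarity 1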