pradeepsengarr committed on
Commit
adc1d58
·
verified ·
1 Parent(s): aee5caa

Update app.py

Files changed (1)
app.py +172 -261
app.py CHANGED
@@ -1,334 +1,245 @@
-import os
 import re
 import faiss
-import docx
-import PyPDF2
-import gradio as gr
 import numpy as np
-from typing import List, Dict
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline


 class SmartDocumentRAG:
-    def __init__(self, embedder_model='sentence-transformers/all-MiniLM-L6-v2', qa_model='distilbert-base-cased-distilled-squad'):
-        # Load sentence embedding model
         self.embedder = SentenceTransformer(embedder_model)

-        # Load Q&A pipeline model
         self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)

-        # Document and index initialization
-        self.documents = []
-        self.document_metadata = []
-        self.raw_text = ""
-        self.document_summary = ""
-        self.document_type = ""
         self.index = None
         self.is_indexed = False
-        self.model_type = "distilbert-qa"  # Can add flan-t5 or others as needed
-
-    ####################
-    # Text Extraction
-    ####################
-    def extract_text_from_file(self, file_path: str) -> str:
-        ext = os.path.splitext(file_path)[1].lower()
-        try:
-            if ext == '.pdf':
-                return self.extract_from_pdf(file_path)
-            elif ext == '.docx':
-                return self.extract_from_docx(file_path)
-            elif ext == '.txt':
-                return self.extract_from_txt(file_path)
-            else:
-                return f"Unsupported file type: {ext}"
-        except Exception as e:
-            return f"Error reading file: {e}"
-
-    def extract_from_pdf(self, file_path: str) -> str:
-        text = ""
-        try:
-            with open(file_path, 'rb') as f:
-                reader = PyPDF2.PdfReader(f)
-                for page in reader.pages:
-                    txt = page.extract_text() or ""
-                    cleaned = self.clean_text(txt)
-                    text += cleaned + "\n"
-            return text.strip()
-        except Exception as e:
-            return f"Error reading PDF: {e}"
-
-    def extract_from_docx(self, file_path: str) -> str:
-        try:
-            doc = docx.Document(file_path)
-            paragraphs = [self.clean_text(p.text) for p in doc.paragraphs if p.text.strip()]
-            return "\n".join(paragraphs)
-        except Exception as e:
-            return f"Error reading DOCX: {e}"
-
-    def extract_from_txt(self, file_path: str) -> str:
-        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-        for enc in encodings:
-            try:
-                with open(file_path, 'r', encoding=enc) as f:
-                    return self.clean_text(f.read())
-            except UnicodeDecodeError:
-                continue
-            except Exception as e:
-                return f"Error reading TXT: {e}"
-        return "Could not decode TXT file."
-
-    def clean_text(self, text: str) -> str:
-        # Normalize whitespace, fix broken words, remove weird chars
-        text = re.sub(r'\s+', ' ', text)
-        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Fix camel case merges
-        text = text.strip()
-        return text

-    ####################
-    # Document Type Detection & Summary
-    ####################
-    def detect_document_type(self, text: str) -> str:
-        lower_text = text.lower()
-        if any(k in lower_text for k in ['abstract', 'study', 'research', 'methodology']):
-            return 'research'
-        elif any(k in lower_text for k in ['company', 'business', 'organization', 'financial']):
-            return 'business'
-        else:
-            return 'general'
-
-    def create_document_summary(self, text: str) -> str:
-        sentences = re.split(r'(?<=[.!?]) +', text)
-        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
-
-        if self.document_type == 'research':
-            return self.extract_research_summary(sentences)
-        elif self.document_type == 'business':
-            return self.extract_business_summary(sentences)
-        else:
-            return self.extract_general_summary(sentences)
-
-    def extract_research_summary(self, sentences: List[str]) -> str:
-        for s in sentences[:7]:
-            if any(w in s.lower() for w in ['abstract', 'study', 'research']):
-                return s[:300] + ('...' if len(s) > 300 else '')
-        return sentences[0][:300] if sentences else "Research document."
-
-    def extract_business_summary(self, sentences: List[str]) -> str:
-        for s in sentences[:5]:
-            if any(w in s.lower() for w in ['company', 'business', 'organization']):
-                return s[:300] + ('...' if len(s) > 300 else '')
-        return sentences[0][:300] if sentences else "Business document."
-
-    def extract_general_summary(self, sentences: List[str]) -> str:
-        return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '') if sentences else "General document."
-
-    ####################
-    # Chunking
-    ####################
-    def enhanced_chunk_text(self, text: str, chunk_size: int = 3, overlap: int = 1) -> List[Dict]:
-        if not text.strip():
-            return []
-
-        sentences = re.split(r'(?<=[.!?]) +', text)
-        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

         chunks = []
-        for i in range(0, len(sentences), chunk_size - overlap):
-            chunk_sents = sentences[i:i + chunk_size]
-            if chunk_sents:
-                chunk_text = " ".join(chunk_sents)
-                chunks.append({
-                    "text": chunk_text,
-                    "sentence_indices": list(range(i, min(i + chunk_size, len(sentences)))),
-                    "doc_type": self.document_type
-                })
         return chunks
-
-    ####################
-    # Processing uploaded files
-    ####################
     def process_documents(self, files) -> str:
         if not files:
             return "❌ No files uploaded!"
-
         try:
-            all_text = ""
-            processed_files = []
-
-            for file in files:
-                if file is None:
-                    continue
-                file_text = self.extract_text_from_file(file.name)
-                if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
-                    all_text += " " + file_text
-                    processed_files.append(os.path.basename(file.name))
                 else:
-                    return f"❌ {file_text}"
-
-            if not all_text.strip():
-                return "❌ No text extracted from files!"
-
-            self.raw_text = all_text.strip()
-            self.document_type = self.detect_document_type(self.raw_text)
-            self.document_summary = self.create_document_summary(self.raw_text)
-
-            chunks = self.enhanced_chunk_text(self.raw_text)
-            if not chunks:
-                return "❌ No valid chunks created!"
-
-            self.documents = [c["text"] for c in chunks]
-            self.document_metadata = chunks
-
-            embeddings = self.embedder.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
             dimension = embeddings.shape[1]
-
             self.index = faiss.IndexFlatIP(dimension)
             faiss.normalize_L2(embeddings)
-            self.index.add(embeddings.astype('float32'))
-
             self.is_indexed = True
-
-            return (f"✅ Processed {len(processed_files)} files: {', '.join(processed_files)}\n"
-                    f"📄 Document Type: {self.document_type.title()}\n"
-                    f"🔍 Created {len(self.documents)} chunks\n"
-                    f"📝 Summary: {self.document_summary}\n"
-                    f"🚀 Ready for Q&A!")
-
         except Exception as e:
-            return f"❌ Error processing documents: {e}"
-
-    ####################
-    # Search & Answer
-    ####################
     def find_relevant_content(self, query: str, top_k: int = 3) -> str:
-        if not self.is_indexed:
             return ""
-
         try:
             query_embedding = self.embedder.encode([query], convert_to_numpy=True)
             faiss.normalize_L2(query_embedding)
-
-            k = min(top_k, len(self.documents))
-            scores, indices = self.index.search(query_embedding.astype('float32'), k)
-
             relevant_chunks = []
             for score, idx in zip(scores[0], indices[0]):
-                if idx < len(self.documents) and score > 0.15:
                     relevant_chunks.append(self.documents[idx])
-
-            return " ".join(relevant_chunks)
-
         except Exception as e:
-            print(f"Search error: {e}")
             return ""
-
     def answer_question(self, query: str) -> str:
         if not query.strip():
             return "❓ Please ask a question!"
-
         if not self.is_indexed:
             return "📝 Please upload and process documents first!"
-
         try:
-            lower_query = query.lower()
-            if any(k in lower_query for k in ['summary', 'summarize', 'about', 'overview']):
-                return f"📄 **Document Summary:**\n\n{self.document_summary}"
-
-            context = self.find_relevant_content(query, top_k=3)
-            if not context:
-                return "🔍 No relevant information found. Try rephrasing your question."
-
-            # Use Q&A pipeline
             result = self.qa_pipeline(question=query, context=context)
             answer = result.get('answer', '').strip()
             score = result.get('score', 0.0)
-
-            if score < 0.15 or not answer:
-                # Fallback to direct extraction
-                return self.extract_direct_answer(query, context)
-
-            return f"**Answer:** {answer}\n\n**Context:** {context[:300]}..."
-
         except Exception as e:
-            return f"❌ Error answering question: {e}"
-
-    def extract_direct_answer(self, query: str, context: str) -> str:
-        lower_query = query.lower()
-
-        # Extract names (simple heuristic)
-        if any(k in lower_query for k in ['name', 'who is', 'who']):
-            names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
-            if names:
-                return f"**Name:** {names[0]}"

-        # Extract experience years
-        if any(k in lower_query for k in ['experience', 'years']):
-            exp = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
-            if exp:
-                return f"**Experience:** {exp[0]} years"

-        # Extract skills
-        if any(k in lower_query for k in ['skill', 'technology', 'tech']):
-            skills_regex = r'\b(Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
-            skills_found = list(set(re.findall(skills_regex, context, re.I)))
-            if skills_found:
-                return f"**Skills mentioned:** {', '.join(skills_found)}"

-        # Extract education
-        if any(k in lower_query for k in ['education', 'degree', 'university']):
-            edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
-            if edu:
-                return f"**Education:** {edu[0]}"
-
-        # Fallback: first sentence
-        sentences = re.split(r'(?<=[.!?]) +', context)
-        if sentences:
-            return f"**Answer:** {sentences[0]}"
-
-        return "I found relevant information but could not extract a precise answer."
-
-
-# Gradio interface creation
 def create_interface():
     rag_system = SmartDocumentRAG()
-
-    with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🧠 Enhanced Document Q&A System

-        **Optimized with Better Chunking, Summaries, and Reduced Hallucination**
-
         **Features:**
-        - 🎯 DistilBERT Q&A pipeline for accurate answers
-        - ⚡ SentenceTransformer embeddings + FAISS semantic search
-        - 📊 Improved document summaries & chunking
-        - 🔍 Direct answer fallback for facts extraction
         """)
-
         with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
-                    file_upload = gr.File(label="📁 Upload Documents", file_types=[".pdf", ".docx", ".txt"], file_count="multiple", interactive=True)
-                    process_btn = gr.Button("🔄 Process Documents", variant="primary")
                 with gr.Column():
-                    process_status = gr.Textbox(label="📋 Processing Status", lines=8, interactive=False)
-
-            process_btn.click(fn=rag_system.process_documents, inputs=[file_upload], outputs=[process_status])
-
         with gr.Tab("❓ Q&A"):
             with gr.Row():
                 with gr.Column():
-                    question_input = gr.Textbox(label="🤔 Ask Your Question", placeholder="Enter your question here...", lines=3)
                     with gr.Row():
                         ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                         summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                 with gr.Column():
                     answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
-
-            ask_btn.click(fn=rag_system.answer_question, inputs=[question_input], outputs=[answer_output])
-            summary_btn.click(fn=lambda: rag_system.answer_question("summary"), inputs=[], outputs=[answer_output])
-
     return demo

 import re
 import faiss
 import numpy as np
+from typing import List
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+import gradio as gr

+# Helper: clean and normalize text
+def clean_text(text: str) -> str:
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text

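The new module-level helper is the whole text-normalization story in this version (the old `clean_text` method also de-merged camelCase words; that step is gone). A quick self-contained check of what the committed helper does:

```python
import re

def clean_text(text: str) -> str:
    # Mirrors the helper above: collapse every whitespace run, then trim.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

print(clean_text("  Line one\n\tLine   two  "))  # -> 'Line one Line two'
```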
+# Main class for Document Retrieval & Q&A
 class SmartDocumentRAG:
+    def __init__(self,
+                 embedder_model='sentence-transformers/all-MiniLM-L6-v2',
+                 qa_model='distilbert-base-cased-distilled-squad',
+                 summarization_model='facebook/bart-large-cnn'):
+
+        print("Loading models... this may take a moment.")
+
+        # Embedding model for semantic search
         self.embedder = SentenceTransformer(embedder_model)

+        # Q&A pipeline for answering questions
         self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)

+        # Summarization pipeline for document summaries
+        self.summarizer = pipeline('summarization', model=summarization_model, tokenizer=summarization_model)
+
+        # Initialize document storage and index
+        self.documents: List[str] = []
         self.index = None
         self.is_indexed = False
+        self.document_summary = ""
+        self.raw_text = ""

+    # --- Document processing ---
+
+    def chunk_text(self, text: str, max_len: int = 250) -> List[str]:
+        # Split text into chunks of at most max_len words (a rough proxy for tokens)
+        words = text.split()
         chunks = []
+        for i in range(0, len(words), max_len):
+            chunk = ' '.join(words[i:i + max_len])
+            chunks.append(clean_text(chunk))
         return chunks
+
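`chunk_text` replaces the old sentence-window chunker (3 sentences per chunk, 1-sentence overlap) with fixed, non-overlapping windows of up to 250 words; note that without overlap, a fact straddling a window boundary ends up split across two chunks. A standalone sketch of the same word-window logic:

```python
from typing import List

def chunk_text(text: str, max_len: int = 250) -> List[str]:
    # Non-overlapping windows of at most max_len words each
    words = text.split()
    return [' '.join(words[i:i + max_len])
            for i in range(0, len(words), max_len)]

doc = "word " * 600  # a 600-word document
print([len(c.split()) for c in chunk_text(doc)])  # -> [250, 250, 100]
```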
     def process_documents(self, files) -> str:
         if not files:
             return "❌ No files uploaded!"
+
+        all_text = ""
         try:
+            for file_obj in files:
+                filename = file_obj.name
+                file_bytes = file_obj.read()
+                ext = filename.split('.')[-1].lower()
+
+                text = ""
+                if ext == 'pdf':
+                    import fitz  # PyMuPDF
+                    doc = fitz.open(stream=file_bytes, filetype="pdf")
+                    for page in doc:
+                        text += page.get_text()
+                    doc.close()
+                elif ext == 'docx':
+                    import docx2txt
+                    import io
+                    # docx2txt accepts a path or a file-like object; use BytesIO
+                    text = docx2txt.process(io.BytesIO(file_bytes))
+                elif ext in ['txt', 'text']:
+                    text = file_bytes.decode('utf-8', errors='ignore')
                 else:
+                    return f"❌ Unsupported file type: {ext}"
+
+                all_text += "\n\n" + text
+
+            all_text = clean_text(all_text)
+            self.raw_text = all_text
+            # Chunk documents
+            self.documents = self.chunk_text(all_text)
+
+            if not self.documents:
+                return "❌ No text extracted from documents."
+
+            # Build FAISS index
+            embeddings = self.embedder.encode(self.documents, convert_to_numpy=True, show_progress_bar=True)
+            embeddings = embeddings.astype('float32')
+
             dimension = embeddings.shape[1]
             self.index = faiss.IndexFlatIP(dimension)
             faiss.normalize_L2(embeddings)
+            self.index.add(embeddings)
+
             self.is_indexed = True
+
+            # Create summary
+            self.document_summary = self.create_document_summary(all_text)
+
+            return f"✅ Processed {len(self.documents)} text chunks from documents. Summary generated."
+
         except Exception as e:
+            return f"❌ Error processing documents: {str(e)}"
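Because every embedding is L2-normalized before entering the `IndexFlatIP`, the inner products the index returns are cosine similarities in [-1, 1]; that is what makes the fixed `0.15` threshold in `find_relevant_content` below comparable across documents. A minimal sketch of the pattern with toy vectors:

```python
import faiss
import numpy as np

vecs = np.array([[1.0, 0.0, 0.0, 0.0],
                 [1.0, 1.0, 0.0, 0.0]], dtype='float32')
query = np.array([[1.0, 0.0, 0.0, 0.0]], dtype='float32')

index = faiss.IndexFlatIP(vecs.shape[1])  # raw inner-product index
faiss.normalize_L2(vecs)                  # in-place unit normalization
index.add(vecs)

faiss.normalize_L2(query)
scores, ids = index.search(query, 2)
print(scores)  # [[1.0, 0.7071]] -- inner products of unit vectors are cosines
```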
+    # --- Semantic search ---
+
     def find_relevant_content(self, query: str, top_k: int = 3) -> str:
+        if not self.is_indexed or not self.index:
             return ""
         try:
             query_embedding = self.embedder.encode([query], convert_to_numpy=True)
             faiss.normalize_L2(query_embedding)
+            scores, indices = self.index.search(query_embedding.astype('float32'), top_k)
+
             relevant_chunks = []
             for score, idx in zip(scores[0], indices[0]):
+                if idx < len(self.documents) and score > 0.15:  # threshold tuned to reduce noise
                     relevant_chunks.append(self.documents[idx])
+
+            if not relevant_chunks:
+                return ""
+
+            return ' '.join(relevant_chunks)
         except Exception as e:
+            print(f"Error in semantic search: {e}")
             return ""
+
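One subtlety the `score > 0.15` check quietly handles: when fewer than `top_k` vectors are indexed, FAISS typically pads the result with id `-1` and a hugely negative score. The `idx < len(self.documents)` guard alone would not catch that, since `self.documents[-1]` is a valid (last-element) lookup in Python. A quick demonstration, assuming recent FAISS padding behavior:

```python
import faiss
import numpy as np

index = faiss.IndexFlatIP(4)
index.add(np.eye(1, 4, dtype='float32'))  # index holds a single vector

scores, ids = index.search(np.eye(1, 4, dtype='float32'), 3)
print(ids)     # [[ 0 -1 -1]] -- missing neighbours padded with -1
print(scores)  # padded scores are very large negative values,
               # so the 0.15 similarity threshold filters them out
```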
+    # --- Summarization ---
+    def create_document_summary(self, text: str) -> str:
+        try:
+            # Limit summarizer input to the first ~1000 characters to stay
+            # well under the model's input limit
+            max_input_length = 1000
+            input_text = text[:max_input_length] + ('...' if len(text) > max_input_length else '')
+            summary_output = self.summarizer(input_text, max_length=150, min_length=40, do_sample=False)
+            summary = summary_output[0]['summary_text']
+            return summary
+        except Exception as e:
+            # Fallback: simple heuristic summary (first sentence)
+            sentences = re.split(r'(?<=[.!?]) +', text)
+            return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '')
+
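Note the truncation is by characters, not tokens: only roughly the first 150-250 words of the document ever reach BART, so the summary reflects the opening of the text. `facebook/bart-large-cnn` itself accepts at most 1024 input tokens; an alternative (a sketch, not the committed behavior) is to let the tokenizer do the cutting:

```python
from transformers import pipeline

summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

long_text = "Your document text here. " * 400  # far beyond the model limit
# truncation=True asks the tokenizer to clip at the model's 1024-token cap
# instead of erroring on over-length input.
out = summarizer(long_text, max_length=150, min_length=40,
                 do_sample=False, truncation=True)
print(out[0]['summary_text'])
```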
+    # --- Question answering ---
     def answer_question(self, query: str) -> str:
         if not query.strip():
             return "❓ Please ask a question!"
         if not self.is_indexed:
             return "📝 Please upload and process documents first!"
+
+        query_lower = query.lower()
+        # Summary shortcut
+        if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
+            return f"📄 Document Summary:\n\n{self.document_summary}"
+
+        # Get relevant context
+        context = self.find_relevant_content(query, top_k=3)
+        if not context:
+            return "🔍 No relevant information found for your question."
+
         try:
+            # The Q&A pipeline takes the question and the context separately
             result = self.qa_pipeline(question=query, context=context)
+
             answer = result.get('answer', '').strip()
             score = result.get('score', 0.0)
+
+            # Confidence thresholding & hallucination check
+            if score < 0.20 or not answer or answer.lower() in ['no answer', '']:
+                return "I don't know based on the provided documents."
+
+            # Heuristic: if the answer is very short or only weakly related to the question, fall back
+            if len(answer) < 3 or (query_lower not in answer.lower() and score < 0.35):
+                return "I don't know based on the provided documents."
+
+            # Return answer + snippet from context for transparency
+            return f"**Answer:** {answer}\n\n*Context snippet:* {context[:300]}..."
         except Exception as e:
+            return f"❌ Error answering question: {str(e)}"

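The confidence gating above relies on the shape of the extractive QA pipeline's output: a dict with the answer span, its character offsets, and a score in [0, 1]. For reference, a standalone call (output values illustrative):

```python
from transformers import pipeline

qa = pipeline('question-answering',
              model='distilbert-base-cased-distilled-squad')

result = qa(
    question="How many years of experience does the candidate have?",
    context="Jane Doe is a software engineer with 7 years of experience in Python.",
)
print(result)
# e.g. {'score': 0.9, 'start': 37, 'end': 44, 'answer': '7 years'}
# -> 'score' is what feeds the < 0.20 / < 0.35 thresholds in answer_question
```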
+    # --- Gradio UI ---

 def create_interface():
     rag_system = SmartDocumentRAG()
+
+    with gr.Blocks(title="🧠 Enhanced Document Q&A System", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🧠 Enhanced Document Q&A System

         **Features:**
+        - 🎯 DistilBERT for Q&A with confidence checks
+        - ⚡ Sentence-BERT + FAISS semantic search
+        - 📊 Strong summarization with BART-large-CNN
+        - 🔍 Transparent answers with context snippets
         """)
+
         with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
+                    file_upload = gr.File(
+                        label="📁 Upload Documents (PDF, DOCX, TXT)",
+                        file_count="multiple",
+                        file_types=[".pdf", ".docx", ".txt"],
+                        height=150
+                    )
+                    process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
                 with gr.Column():
+                    process_status = gr.Textbox(label="📋 Processing Status", lines=10, interactive=False)
+
+            process_btn.click(
+                fn=rag_system.process_documents,
+                inputs=[file_upload],
+                outputs=[process_status]
+            )
+
         with gr.Tab("❓ Q&A"):
             with gr.Row():
                 with gr.Column():
+                    question_input = gr.Textbox(
+                        label="🤔 Ask Your Question",
+                        placeholder="e.g., What is the person's name? How many years of experience? What skills do they have?",
+                        lines=3
+                    )
                     with gr.Row():
                         ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                         summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                 with gr.Column():
                     answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
+
+            ask_btn.click(
+                fn=rag_system.answer_question,
+                inputs=[question_input],
+                outputs=[answer_output]
+            )
+
+            summary_btn.click(
+                fn=lambda: rag_system.answer_question("summary"),
+                inputs=[],
+                outputs=[answer_output]
+            )
+
     return demo

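The hunk ends at `return demo`; no module entry point is visible in this capture of the diff. For a Gradio Space, app.py typically ends with something like the following (hypothetical, not part of the shown diff):

```python
# Hypothetical entry point -- not shown in the diff above.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
```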