pradeepsengarr committed on
Commit
adc1d58
·
verified ·
1 Parent(s): aee5caa

Update app.py

Files changed (1)
app.py +172 -261
app.py CHANGED
@@ -1,334 +1,245 @@
-import os
 import re
 import faiss
-import docx
-import PyPDF2
-import gradio as gr
 import numpy as np
-from typing import List, Dict
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline


 class SmartDocumentRAG:
-    def __init__(self, embedder_model='sentence-transformers/all-MiniLM-L6-v2', qa_model='distilbert-base-cased-distilled-squad'):
-        # Load sentence embedding model
         self.embedder = SentenceTransformer(embedder_model)

-        # Load Q&A pipeline model
         self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)

-        # Document and index initialization
-        self.documents = []
-        self.document_metadata = []
-        self.raw_text = ""
-        self.document_summary = ""
-        self.document_type = ""
         self.index = None
         self.is_indexed = False
-        self.model_type = "distilbert-qa"  # Can add flan-t5 or others as needed
-
-    ####################
-    # Text Extraction
-    ####################
-    def extract_text_from_file(self, file_path: str) -> str:
-        ext = os.path.splitext(file_path)[1].lower()
-        try:
-            if ext == '.pdf':
-                return self.extract_from_pdf(file_path)
-            elif ext == '.docx':
-                return self.extract_from_docx(file_path)
-            elif ext == '.txt':
-                return self.extract_from_txt(file_path)
-            else:
-                return f"Unsupported file type: {ext}"
-        except Exception as e:
-            return f"Error reading file: {e}"
-
-    def extract_from_pdf(self, file_path: str) -> str:
-        text = ""
-        try:
-            with open(file_path, 'rb') as f:
-                reader = PyPDF2.PdfReader(f)
-                for page in reader.pages:
-                    txt = page.extract_text() or ""
-                    cleaned = self.clean_text(txt)
-                    text += cleaned + "\n"
-            return text.strip()
-        except Exception as e:
-            return f"Error reading PDF: {e}"
-
-    def extract_from_docx(self, file_path: str) -> str:
-        try:
-            doc = docx.Document(file_path)
-            paragraphs = [self.clean_text(p.text) for p in doc.paragraphs if p.text.strip()]
-            return "\n".join(paragraphs)
-        except Exception as e:
-            return f"Error reading DOCX: {e}"
-
-    def extract_from_txt(self, file_path: str) -> str:
-        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-        for enc in encodings:
-            try:
-                with open(file_path, 'r', encoding=enc) as f:
-                    return self.clean_text(f.read())
-            except UnicodeDecodeError:
-                continue
-            except Exception as e:
-                return f"Error reading TXT: {e}"
-        return "Could not decode TXT file."
-
-    def clean_text(self, text: str) -> str:
-        # Normalize whitespace, fix broken words, remove weird chars
-        text = re.sub(r'\s+', ' ', text)
-        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Fix camel case merges
-        text = text.strip()
-        return text

-    ####################
-    # Document Type Detection & Summary
-    ####################
-    def detect_document_type(self, text: str) -> str:
-        lower_text = text.lower()
-        if any(k in lower_text for k in ['abstract', 'study', 'research', 'methodology']):
-            return 'research'
-        elif any(k in lower_text for k in ['company', 'business', 'organization', 'financial']):
-            return 'business'
-        else:
-            return 'general'
-
-    def create_document_summary(self, text: str) -> str:
-        sentences = re.split(r'(?<=[.!?]) +', text)
-        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
-
-        if self.document_type == 'research':
-            return self.extract_research_summary(sentences)
-        elif self.document_type == 'business':
-            return self.extract_business_summary(sentences)
-        else:
-            return self.extract_general_summary(sentences)
-
-    def extract_research_summary(self, sentences: List[str]) -> str:
-        for s in sentences[:7]:
-            if any(w in s.lower() for w in ['abstract', 'study', 'research']):
-                return s[:300] + ('...' if len(s) > 300 else '')
-        return sentences[0][:300] if sentences else "Research document."
-
-    def extract_business_summary(self, sentences: List[str]) -> str:
-        for s in sentences[:5]:
-            if any(w in s.lower() for w in ['company', 'business', 'organization']):
-                return s[:300] + ('...' if len(s) > 300 else '')
-        return sentences[0][:300] if sentences else "Business document."
-
-    def extract_general_summary(self, sentences: List[str]) -> str:
-        return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '') if sentences else "General document."
-
-    ####################
-    # Chunking
-    ####################
-    def enhanced_chunk_text(self, text: str, chunk_size: int = 3, overlap: int = 1) -> List[Dict]:
-        if not text.strip():
-            return []
-
-        sentences = re.split(r'(?<=[.!?]) +', text)
-        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

         chunks = []
-        for i in range(0, len(sentences), chunk_size - overlap):
-            chunk_sents = sentences[i:i + chunk_size]
-            if chunk_sents:
-                chunk_text = " ".join(chunk_sents)
-                chunks.append({
-                    "text": chunk_text,
-                    "sentence_indices": list(range(i, min(i + chunk_size, len(sentences)))),
-                    "doc_type": self.document_type
-                })
         return chunks
-
-    ####################
-    # Processing uploaded files
-    ####################
     def process_documents(self, files) -> str:
         if not files:
             return "❌ No files uploaded!"
-
         try:
-            all_text = ""
-            processed_files = []
-
-            for file in files:
-                if file is None:
-                    continue
-                file_text = self.extract_text_from_file(file.name)
-                if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
-                    all_text += " " + file_text
-                    processed_files.append(os.path.basename(file.name))
                 else:
-                    return f"❌ {file_text}"
-
-            if not all_text.strip():
-                return "❌ No text extracted from files!"
-
-            self.raw_text = all_text.strip()
-            self.document_type = self.detect_document_type(self.raw_text)
-            self.document_summary = self.create_document_summary(self.raw_text)
-
-            chunks = self.enhanced_chunk_text(self.raw_text)
-            if not chunks:
-                return "❌ No valid chunks created!"
-
-            self.documents = [c["text"] for c in chunks]
-            self.document_metadata = chunks
-
-            embeddings = self.embedder.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
             dimension = embeddings.shape[1]
-
             self.index = faiss.IndexFlatIP(dimension)
             faiss.normalize_L2(embeddings)
-            self.index.add(embeddings.astype('float32'))
-
             self.is_indexed = True
-
-            return (f"✅ Processed {len(processed_files)} files: {', '.join(processed_files)}\n"
-                    f"📄 Document Type: {self.document_type.title()}\n"
-                    f"🔍 Created {len(self.documents)} chunks\n"
-                    f"📝 Summary: {self.document_summary}\n"
-                    f"🚀 Ready for Q&A!")
-
         except Exception as e:
-            return f"❌ Error processing documents: {e}"
-
-    ####################
-    # Search & Answer
-    ####################
     def find_relevant_content(self, query: str, top_k: int = 3) -> str:
-        if not self.is_indexed:
             return ""
-
         try:
             query_embedding = self.embedder.encode([query], convert_to_numpy=True)
             faiss.normalize_L2(query_embedding)
-
-            k = min(top_k, len(self.documents))
-            scores, indices = self.index.search(query_embedding.astype('float32'), k)
-
             relevant_chunks = []
             for score, idx in zip(scores[0], indices[0]):
-                if idx < len(self.documents) and score > 0.15:
                     relevant_chunks.append(self.documents[idx])
-
-            return " ".join(relevant_chunks)
-
         except Exception as e:
-            print(f"Search error: {e}")
             return ""
-
     def answer_question(self, query: str) -> str:
         if not query.strip():
             return "❓ Please ask a question!"
-
         if not self.is_indexed:
             return "📝 Please upload and process documents first!"
-
         try:
-            lower_query = query.lower()
-            if any(k in lower_query for k in ['summary', 'summarize', 'about', 'overview']):
-                return f"📄 **Document Summary:**\n\n{self.document_summary}"
-
-            context = self.find_relevant_content(query, top_k=3)
-            if not context:
-                return "🔍 No relevant information found. Try rephrasing your question."
-
-            # Use Q&A pipeline
             result = self.qa_pipeline(question=query, context=context)
             answer = result.get('answer', '').strip()
             score = result.get('score', 0.0)
-
-            if score < 0.15 or not answer:
-                # Fallback to direct extraction
-                return self.extract_direct_answer(query, context)
-
-            return f"**Answer:** {answer}\n\n**Context:** {context[:300]}..."
-
         except Exception as e:
-            return f"❌ Error answering question: {e}"
-
-    def extract_direct_answer(self, query: str, context: str) -> str:
-        lower_query = query.lower()
-
-        # Extract names (simple heuristic)
-        if any(k in lower_query for k in ['name', 'who is', 'who']):
-            names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
-            if names:
-                return f"**Name:** {names[0]}"

-        # Extract experience years
-        if any(k in lower_query for k in ['experience', 'years']):
-            exp = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
-            if exp:
-                return f"**Experience:** {exp[0]} years"

-        # Extract skills
-        if any(k in lower_query for k in ['skill', 'technology', 'tech']):
-            skills_regex = r'\b(Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
-            skills_found = list(set(re.findall(skills_regex, context, re.I)))
-            if skills_found:
-                return f"**Skills mentioned:** {', '.join(skills_found)}"

-        # Extract education
-        if any(k in lower_query for k in ['education', 'degree', 'university']):
-            edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
-            if edu:
-                return f"**Education:** {edu[0]}"
-
-        # Fallback: first sentence
-        sentences = re.split(r'(?<=[.!?]) +', context)
-        if sentences:
-            return f"**Answer:** {sentences[0]}"
-
-        return "I found relevant information but could not extract a precise answer."
-
-
-# Gradio interface creation
 def create_interface():
     rag_system = SmartDocumentRAG()
-
-    with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🧠 Enhanced Document Q&A System

-        **Optimized with Better Chunking, Summaries, and Reduced Hallucination**
-
         **Features:**
-        - 🎯 DistilBERT Q&A pipeline for accurate answers
-        - ⚡ SentenceTransformer embeddings + FAISS semantic search
-        - 📊 Improved document summaries & chunking
-        - 🔍 Direct answer fallback for facts extraction
         """)
-
         with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
-                    file_upload = gr.File(label="📁 Upload Documents", file_types=[".pdf", ".docx", ".txt"], file_count="multiple", interactive=True)
-                    process_btn = gr.Button("🔄 Process Documents", variant="primary")
                 with gr.Column():
-                    process_status = gr.Textbox(label="📋 Processing Status", lines=8, interactive=False)
-
-            process_btn.click(fn=rag_system.process_documents, inputs=[file_upload], outputs=[process_status])
-
         with gr.Tab("❓ Q&A"):
             with gr.Row():
                 with gr.Column():
-                    question_input = gr.Textbox(label="🤔 Ask Your Question", placeholder="Enter your question here...", lines=3)
                     with gr.Row():
                         ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                         summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                 with gr.Column():
                     answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
-
-            ask_btn.click(fn=rag_system.answer_question, inputs=[question_input], outputs=[answer_output])
-            summary_btn.click(fn=lambda: rag_system.answer_question("summary"), inputs=[], outputs=[answer_output])
-
     return demo

 import re
 import faiss
 import numpy as np
+from typing import List
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+import gradio as gr

+# Helper: clean and normalize text
+def clean_text(text: str) -> str:
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text

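The new module-level helper is the whole text-normalization story in this version (the old `clean_text` method also de-merged camelCase words; that step is gone). A quick self-contained check of what the committed helper does:

```python
import re

def clean_text(text: str) -> str:
    # Mirrors the helper above: collapse every whitespace run, then trim.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

print(clean_text("  Line one\n\tLine   two  "))  # -> 'Line one Line two'
```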
+# Main class for Document Retrieval & Q&A
 class SmartDocumentRAG:
+    def __init__(self,
+                 embedder_model='sentence-transformers/all-MiniLM-L6-v2',
+                 qa_model='distilbert-base-cased-distilled-squad',
+                 summarization_model='facebook/bart-large-cnn'):
+
+        print("Loading models... this may take a moment.")
+
+        # Embedding model for semantic search
         self.embedder = SentenceTransformer(embedder_model)

+        # Q&A pipeline for answering questions
         self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)

+        # Summarization pipeline for document summaries
+        self.summarizer = pipeline('summarization', model=summarization_model, tokenizer=summarization_model)
+
+        # Initialize document storage and index
+        self.documents: List[str] = []
         self.index = None
         self.is_indexed = False
+        self.document_summary = ""
+        self.raw_text = ""

+    # --- Document processing ---
+
+    def chunk_text(self, text: str, max_len: int = 250) -> List[str]:
+        # Split text into chunks of at most max_len words (a rough proxy for tokens)
+        words = text.split()
         chunks = []
+        for i in range(0, len(words), max_len):
+            chunk = ' '.join(words[i:i + max_len])
+            chunks.append(clean_text(chunk))
         return chunks
+
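`chunk_text` replaces the old sentence-window chunker (3 sentences per chunk, 1-sentence overlap) with fixed, non-overlapping windows of up to 250 words; note that without overlap, a fact straddling a window boundary ends up split across two chunks. A standalone sketch of the same word-window logic:

```python
from typing import List

def chunk_text(text: str, max_len: int = 250) -> List[str]:
    # Non-overlapping windows of at most max_len words each
    words = text.split()
    return [' '.join(words[i:i + max_len])
            for i in range(0, len(words), max_len)]

doc = "word " * 600  # a 600-word document
print([len(c.split()) for c in chunk_text(doc)])  # -> [250, 250, 100]
```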
     def process_documents(self, files) -> str:
         if not files:
             return "❌ No files uploaded!"
+
+        all_text = ""
         try:
+            for file_obj in files:
+                filename = file_obj.name
+                file_bytes = file_obj.read()
+                ext = filename.split('.')[-1].lower()
+
+                text = ""
+                if ext == 'pdf':
+                    import fitz  # PyMuPDF
+                    doc = fitz.open(stream=file_bytes, filetype="pdf")
+                    for page in doc:
+                        text += page.get_text()
+                    doc.close()
+                elif ext == 'docx':
+                    import docx2txt
+                    import io
+                    # docx2txt accepts a path or a file-like object; use BytesIO
+                    text = docx2txt.process(io.BytesIO(file_bytes))
+                elif ext in ['txt', 'text']:
+                    text = file_bytes.decode('utf-8', errors='ignore')
                 else:
+                    return f"❌ Unsupported file type: {ext}"
+
+                all_text += "\n\n" + text
+
+            all_text = clean_text(all_text)
+            self.raw_text = all_text
+            # Chunk documents
+            self.documents = self.chunk_text(all_text)
+
+            if not self.documents:
+                return "❌ No text extracted from documents."
+
+            # Build FAISS index
+            embeddings = self.embedder.encode(self.documents, convert_to_numpy=True, show_progress_bar=True)
+            embeddings = embeddings.astype('float32')
+
             dimension = embeddings.shape[1]
             self.index = faiss.IndexFlatIP(dimension)
             faiss.normalize_L2(embeddings)
+            self.index.add(embeddings)
+
             self.is_indexed = True
+
+            # Create summary
+            self.document_summary = self.create_document_summary(all_text)
+
+            return f"✅ Processed {len(self.documents)} text chunks from documents. Summary generated."
+
         except Exception as e:
+            return f"❌ Error processing documents: {str(e)}"
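Because every embedding is L2-normalized before entering the `IndexFlatIP`, the inner products the index returns are cosine similarities in [-1, 1]; that is what makes the fixed `0.15` threshold in `find_relevant_content` below comparable across documents. A minimal sketch of the pattern with toy vectors:

```python
import faiss
import numpy as np

vecs = np.array([[1.0, 0.0, 0.0, 0.0],
                 [1.0, 1.0, 0.0, 0.0]], dtype='float32')
query = np.array([[1.0, 0.0, 0.0, 0.0]], dtype='float32')

index = faiss.IndexFlatIP(vecs.shape[1])  # raw inner-product index
faiss.normalize_L2(vecs)                  # in-place unit normalization
index.add(vecs)

faiss.normalize_L2(query)
scores, ids = index.search(query, 2)
print(scores)  # [[1.0, 0.7071]] -- inner products of unit vectors are cosines
```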
+    # --- Semantic search ---
+
     def find_relevant_content(self, query: str, top_k: int = 3) -> str:
+        if not self.is_indexed or not self.index:
             return ""
         try:
             query_embedding = self.embedder.encode([query], convert_to_numpy=True)
             faiss.normalize_L2(query_embedding)
+            scores, indices = self.index.search(query_embedding.astype('float32'), top_k)
+
             relevant_chunks = []
             for score, idx in zip(scores[0], indices[0]):
+                if idx < len(self.documents) and score > 0.15:  # threshold tuned to reduce noise
                     relevant_chunks.append(self.documents[idx])
+
+            if not relevant_chunks:
+                return ""
+
+            return ' '.join(relevant_chunks)
         except Exception as e:
+            print(f"Error in semantic search: {e}")
             return ""
+
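One subtlety the `score > 0.15` check quietly handles: when fewer than `top_k` vectors are indexed, FAISS typically pads the result with id `-1` and a hugely negative score. The `idx < len(self.documents)` guard alone would not catch that, since `self.documents[-1]` is a valid (last-element) lookup in Python. A quick demonstration, assuming recent FAISS padding behavior:

```python
import faiss
import numpy as np

index = faiss.IndexFlatIP(4)
index.add(np.eye(1, 4, dtype='float32'))  # index holds a single vector

scores, ids = index.search(np.eye(1, 4, dtype='float32'), 3)
print(ids)     # [[ 0 -1 -1]] -- missing neighbours padded with -1
print(scores)  # padded scores are very large negative values,
               # so the 0.15 similarity threshold filters them out
```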
+    # --- Summarization ---
+    def create_document_summary(self, text: str) -> str:
+        try:
+            # Limit summarizer input to the first ~1000 characters to stay
+            # well under the model's input limit
+            max_input_length = 1000
+            input_text = text[:max_input_length] + ('...' if len(text) > max_input_length else '')
+            summary_output = self.summarizer(input_text, max_length=150, min_length=40, do_sample=False)
+            summary = summary_output[0]['summary_text']
+            return summary
+        except Exception as e:
+            # Fallback: simple heuristic summary (first sentence)
+            sentences = re.split(r'(?<=[.!?]) +', text)
+            return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '')
+
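Note the truncation is by characters, not tokens: only roughly the first 150-250 words of the document ever reach BART, so the summary reflects the opening of the text. `facebook/bart-large-cnn` itself accepts at most 1024 input tokens; an alternative (a sketch, not the committed behavior) is to let the tokenizer do the cutting:

```python
from transformers import pipeline

summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

long_text = "Your document text here. " * 400  # far beyond the model limit
# truncation=True asks the tokenizer to clip at the model's 1024-token cap
# instead of erroring on over-length input.
out = summarizer(long_text, max_length=150, min_length=40,
                 do_sample=False, truncation=True)
print(out[0]['summary_text'])
```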
+    # --- Question answering ---
     def answer_question(self, query: str) -> str:
         if not query.strip():
             return "❓ Please ask a question!"
         if not self.is_indexed:
             return "📝 Please upload and process documents first!"
+
+        query_lower = query.lower()
+        # Summary shortcut
+        if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
+            return f"📄 Document Summary:\n\n{self.document_summary}"
+
+        # Get relevant context
+        context = self.find_relevant_content(query, top_k=3)
+        if not context:
+            return "🔍 No relevant information found for your question."
+
         try:
+            # The Q&A pipeline takes the question and the context separately
             result = self.qa_pipeline(question=query, context=context)
+
             answer = result.get('answer', '').strip()
             score = result.get('score', 0.0)
+
+            # Confidence thresholding & hallucination check
+            if score < 0.20 or not answer or answer.lower() in ['no answer', '']:
+                return "I don't know based on the provided documents."
+
+            # Heuristic: if the answer is very short or only weakly related to the question, fall back
+            if len(answer) < 3 or (query_lower not in answer.lower() and score < 0.35):
+                return "I don't know based on the provided documents."
+
+            # Return answer + snippet from context for transparency
+            return f"**Answer:** {answer}\n\n*Context snippet:* {context[:300]}..."
         except Exception as e:
+            return f"❌ Error answering question: {str(e)}"

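The confidence gating above relies on the shape of the extractive QA pipeline's output: a dict with the answer span, its character offsets, and a score in [0, 1]. For reference, a standalone call (output values illustrative):

```python
from transformers import pipeline

qa = pipeline('question-answering',
              model='distilbert-base-cased-distilled-squad')

result = qa(
    question="How many years of experience does the candidate have?",
    context="Jane Doe is a software engineer with 7 years of experience in Python.",
)
print(result)
# e.g. {'score': 0.9, 'start': 37, 'end': 44, 'answer': '7 years'}
# -> 'score' is what feeds the < 0.20 / < 0.35 thresholds in answer_question
```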
+    # --- Gradio UI ---

 def create_interface():
     rag_system = SmartDocumentRAG()
+
+    with gr.Blocks(title="🧠 Enhanced Document Q&A System", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🧠 Enhanced Document Q&A System

         **Features:**
+        - 🎯 DistilBERT for Q&A with confidence checks
+        - ⚡ Sentence-BERT + FAISS semantic search
+        - 📊 Strong summarization with BART-large-CNN
+        - 🔍 Transparent answers with context snippets
         """)
+
         with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
+                    file_upload = gr.File(
+                        label="📁 Upload Documents (PDF, DOCX, TXT)",
+                        file_count="multiple",
+                        file_types=[".pdf", ".docx", ".txt"],
+                        height=150
+                    )
+                    process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
                 with gr.Column():
+                    process_status = gr.Textbox(label="📋 Processing Status", lines=10, interactive=False)
+
+            process_btn.click(
+                fn=rag_system.process_documents,
+                inputs=[file_upload],
+                outputs=[process_status]
+            )
+
         with gr.Tab("❓ Q&A"):
             with gr.Row():
                 with gr.Column():
+                    question_input = gr.Textbox(
+                        label="🤔 Ask Your Question",
+                        placeholder="e.g., What is the person's name? How many years of experience? What skills do they have?",
+                        lines=3
+                    )
                     with gr.Row():
                         ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                         summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                 with gr.Column():
                     answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
+
+            ask_btn.click(
+                fn=rag_system.answer_question,
+                inputs=[question_input],
+                outputs=[answer_output]
+            )
+
+            summary_btn.click(
+                fn=lambda: rag_system.answer_question("summary"),
+                inputs=[],
+                outputs=[answer_output]
+            )
+
     return demo

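The hunk ends at `return demo`; no module entry point is visible in this capture of the diff. For a Gradio Space, app.py typically ends with something like the following (hypothetical, not part of the shown diff):

```python
# Hypothetical entry point -- not shown in the diff above.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
```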