import PyPDF2
import faiss
import numpy as np
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer

# Embedding model and global retrieval state shared across calls.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
faiss_index = None
pdf_chunks = []
chunk_texts = []


def process_pdfs(pdf_files):
    """Extract text from the uploaded PDFs, chunk it, and build a FAISS index."""
    global faiss_index, pdf_chunks, chunk_texts
    all_text = ""
    chunk_texts = []
    for pdf_file in pdf_files:
        reader = PyPDF2.PdfReader(pdf_file.name)
        for page in reader.pages:
            # extract_text() can return None for image-only or malformed pages.
            all_text += (page.extract_text() or "") + "\n"

    # Fixed-size character chunking, 500 characters per chunk.
    chunk_size = 500
    pdf_chunks = [all_text[i:i + chunk_size] for i in range(0, len(all_text), chunk_size)]
    chunk_texts = pdf_chunks
    if not pdf_chunks:
        return "No extractable text found in the uploaded PDF(s)."

    # Embed every chunk and index the vectors for exact L2 search.
    embeddings = embedder.encode(pdf_chunks, convert_to_numpy=True).astype(np.float32)
    dim = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(embeddings)
    return f"Processed {len(pdf_chunks)} chunks from {len(pdf_files)} PDF(s)."


def semantic_search(query, top_k=3):
    """Return the top_k chunks whose embeddings are closest to the query embedding."""
    global faiss_index, chunk_texts
    if faiss_index is None or not chunk_texts:
        return []
    query_emb = embedder.encode([query], convert_to_numpy=True).astype(np.float32)
    _, indices = faiss_index.search(query_emb, top_k)
    # FAISS pads results with -1 when fewer than top_k vectors are indexed,
    # so keep only valid, non-negative indices before looking up chunks.
    return [chunk_texts[i] for i in indices[0] if 0 <= i < len(chunk_texts)]


def keyword_search(query, top_k=3):
    """Fuzzy-score every chunk against the query and return the best top_k."""
    global chunk_texts
    if not chunk_texts:
        return []
    scored = [
        (chunk, fuzz.partial_ratio(query.lower(), chunk.lower()))
        for chunk in chunk_texts
    ]
    scored.sort(key=lambda x: x[1], reverse=True)
    return [chunk for chunk, _ in scored[:top_k]]


def retrieve_context(query, top_k=3):
    """Hybrid retrieval: merge semantic and keyword results, dropping duplicates."""
    semantic_results = semantic_search(query, top_k)
    keyword_results = keyword_search(query, top_k)
    combined = []
    seen = set()
    for chunk in semantic_results + keyword_results:
        if chunk not in seen:
            combined.append(chunk)
            seen.add(chunk)
    return "\n".join(combined)
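

# Usage sketch (an assumption, not part of the original module): process_pdfs
# reads pdf_file.name, which matches the file objects Gradio passes to upload
# handlers; "report.pdf" below is a hypothetical filename for illustration.
if __name__ == "__main__":
    from types import SimpleNamespace

    uploaded = [SimpleNamespace(name="report.pdf")]  # stand-in for an upload object
    print(process_pdfs(uploaded))
    print(retrieve_context("What are the key findings?"))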