import PyPDF2
import faiss
import numpy as np
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer

# Shared state: the embedding model, the FAISS index, and the indexed chunks.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
faiss_index = None
chunk_texts = []
|
|
def process_pdfs(pdf_files):
    """Extract text from the given PDFs, chunk it, and build a FAISS index."""
    global faiss_index, chunk_texts

    all_text = ""
    for pdf_file in pdf_files:
        reader = PyPDF2.PdfReader(pdf_file.name)
        for page in reader.pages:
            # extract_text() returns None for pages with no extractable text.
            all_text += (page.extract_text() or "") + "\n"

    # Fixed-size character chunks with no overlap.
    chunk_size = 500
    chunk_texts = [all_text[i:i + chunk_size] for i in range(0, len(all_text), chunk_size)]
    if not chunk_texts:
        return "No extractable text found in the uploaded PDF(s)."

    # FAISS expects float32 vectors; IndexFlatL2 performs exact L2 search.
    embeddings = embedder.encode(chunk_texts, convert_to_numpy=True)
    embeddings = np.asarray(embeddings, dtype=np.float32)
    faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
    faiss_index.add(embeddings)

    return f"Processed {len(chunk_texts)} chunks from {len(pdf_files)} PDF(s)."
|
|
def semantic_search(query, top_k=3):
    """Return the top_k chunks closest to the query in embedding space."""
    if faiss_index is None or not chunk_texts:
        return []

    query_emb = embedder.encode([query], convert_to_numpy=True)
    _, indices = faiss_index.search(np.asarray(query_emb, dtype=np.float32), top_k)
    # FAISS pads results with -1 when fewer than top_k vectors are indexed,
    # so guard against negative indices as well as out-of-range ones.
    return [chunk_texts[i] for i in indices[0] if 0 <= i < len(chunk_texts)]
|
|
def keyword_search(query, top_k=3):
    """Return the top_k chunks ranked by fuzzy substring match with the query."""
    if not chunk_texts:
        return []

    # partial_ratio scores the best-matching substring on a 0-100 scale.
    scored = [(chunk, fuzz.partial_ratio(query.lower(), chunk.lower())) for chunk in chunk_texts]
    scored.sort(key=lambda x: x[1], reverse=True)
    return [chunk for chunk, _ in scored[:top_k]]
|
|
def retrieve_context(query, top_k=3):
    """Merge semantic and keyword results, deduplicated, semantic hits first."""
    semantic_results = semantic_search(query, top_k)
    keyword_results = keyword_search(query, top_k)

    combined = []
    seen = set()
    for chunk in semantic_results + keyword_results:
        if chunk not in seen:
            combined.append(chunk)
            seen.add(chunk)
    return "\n".join(combined)
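

# Usage sketch: a minimal end-to-end run, assuming each entry in `pdf_files`
# exposes a `.name` file path (as Gradio upload objects do). "sample.pdf" is
# a hypothetical placeholder path, not a file shipped with this module.
if __name__ == "__main__":
    class _Upload:
        """Stand-in for an upload object carrying a `.name` file path."""
        def __init__(self, path):
            self.name = path

    print(process_pdfs([_Upload("sample.pdf")]))  # hypothetical input file
    print(retrieve_context("What are the key findings?", top_k=3))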