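"""Hybrid PDF retrieval helpers: extract and chunk text from uploaded PDFs,
index the chunks with FAISS, and answer queries with a combination of
semantic (embedding) search and fuzzy keyword search."""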
import PyPDF2
import faiss
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer

# Shared module state: one embedding model instance plus the FAISS index
# and chunk store built by process_pdfs().
embedder = SentenceTransformer("all-MiniLM-L6-v2")
faiss_index = None
pdf_chunks = []
chunk_texts = []

def process_pdfs(pdf_files):
    """Extract text from uploaded PDFs, chunk it, and build a FAISS index."""
    global faiss_index, pdf_chunks, chunk_texts
    all_text = ""
    for pdf_file in pdf_files:
        # pdf_file is assumed to be an upload object exposing a .name path
        # (e.g. a Gradio file upload).
        reader = PyPDF2.PdfReader(pdf_file.name)
        for page in reader.pages:
            # extract_text() can return None for image-only pages.
            all_text += (page.extract_text() or "") + "\n"
    # Naive fixed-size character chunking with no overlap.
    chunk_size = 500
    pdf_chunks = [all_text[i:i + chunk_size] for i in range(0, len(all_text), chunk_size)]
    chunk_texts = pdf_chunks
    if not pdf_chunks:
        return "No extractable text found in the uploaded PDF(s)."
    # FAISS expects float32 vectors; IndexFlatL2 performs exact L2 search.
    embeddings = embedder.encode(pdf_chunks, convert_to_numpy=True).astype("float32")
    faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
    faiss_index.add(embeddings)
    return f"Processed {len(pdf_chunks)} chunks from {len(pdf_files)} PDF(s)."

def semantic_search(query, top_k=3):
    """Return the top_k chunks closest to the query in embedding space."""
    if faiss_index is None or not chunk_texts:
        return []
    query_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
    _, indices = faiss_index.search(query_emb, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are indexed,
    # so guard against negative indices as well as out-of-range ones.
    return [chunk_texts[i] for i in indices[0] if 0 <= i < len(chunk_texts)]

def keyword_search(query, top_k=3):
    """Return the top_k chunks with the highest fuzzy-match score for the query."""
    if not chunk_texts:
        return []
    scored = [(chunk, fuzz.partial_ratio(query.lower(), chunk.lower())) for chunk in chunk_texts]
    scored.sort(key=lambda x: x[1], reverse=True)
    return [chunk for chunk, _ in scored[:top_k]]

def retrieve_context(query, top_k=3):
    """Merge semantic and keyword results, deduplicated in rank order."""
    combined = []
    seen = set()
    for chunk in semantic_search(query, top_k) + keyword_search(query, top_k):
        if chunk not in seen:
            combined.append(chunk)
            seen.add(chunk)
    return "\n".join(combined)