File size: 2,046 Bytes
5f3b20a
 
 
 
 
 
 
670c138
 
5f3b20a
 
670c138
1d98a7e
5f3b20a
 
 
 
 
 
670c138
5f3b20a
 
670c138
5f3b20a
 
670c138
5f3b20a
670c138
5f3b20a
670c138
5f3b20a
 
 
670c138
5f3b20a
 
 
 
670c138
5f3b20a
 
670c138
5f3b20a
670c138
5f3b20a
670c138
5f3b20a
 
670c138
5f3b20a
670c138
5f3b20a
 
 
670c138
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import glob
from langchain.schema.document import Document
from e5_embeddings import E5Embeddings
from langchain_community.vectorstores import FAISS
from document_processor import load_pdf_with_pymupdf, split_documents

# Path configuration
FOLDER = "cleaned_pdfs"  # Folder containing the cleaned PDFs
VECTOR_STORE_PATH = "vector_db"  # Directory where the FAISS index is loaded from / saved to

# 1. Load the embedding model
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    """Construct an E5Embeddings wrapper configured for the given device.

    Embeddings are L2-normalized at encode time.

    NOTE(review): the default checkpoint is MiniLM, not an E5 model, despite
    the E5Embeddings wrapper class — confirm this is intentional.
    """
    model_kwargs = {'device': device}
    encode_kwargs = {'normalize_embeddings': True}
    return E5Embeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

# 2. Load existing vector store
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
    """Load a persisted FAISS index from *load_path*.

    Raises FileNotFoundError when the path does not exist. Deserialization is
    explicitly allowed because the index is produced locally by this pipeline.
    """
    if os.path.exists(load_path):
        return FAISS.load_local(
            load_path, embeddings, allow_dangerous_deserialization=True
        )
    raise FileNotFoundError(f"Cannot find vector store: {load_path}")

# 3. Embed only the cleaned PDFs
def embed_cleaned_pdfs(folder, vectorstore, embeddings):
    """Embed every 'cleaned*.pdf' under *folder* and add the chunks to *vectorstore*.

    The updated index is persisted to VECTOR_STORE_PATH. The *embeddings*
    parameter is unused here (kept for interface compatibility) — the
    vectorstore instance already carries its own embedding function.
    """
    pattern = os.path.join(folder, "cleaned*.pdf")
    pdf_files = glob.glob(pattern)
    print(f"Number of target PDFs: {len(pdf_files)}")

    new_documents = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path}")
        text = load_pdf_with_pymupdf(pdf_path)
        # Skip PDFs that yielded no extractable text
        if text.strip():
            new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))

    print(f"Number of documents: {len(new_documents)}")

    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
    print(f"Number of chunks: {len(chunks)}")

    # Guard: adding an empty batch to FAISS can raise, and re-saving an
    # unchanged index is pointless — bail out early instead.
    if not chunks:
        print("No new chunks to add; vector store left unchanged.")
        return

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Save completed: {VECTOR_STORE_PATH}")

# Execution
if __name__ == "__main__":
    # Build the embedder, open the existing index, then add the cleaned PDFs.
    emb = get_embeddings()
    store = load_vector_store(emb)
    embed_cleaned_pdfs(FOLDER, store, emb)