import os
import glob
from langchain.schema.document import Document
from e5_embeddings import E5Embeddings
from langchain_community.vectorstores import FAISS
from document_processor import load_pdf_with_pymupdf, split_documents

# Path configuration
FOLDER = "cleaned_pdfs"  # Folder containing the cleaned PDFs
VECTOR_STORE_PATH = "vector_db"

# 1. Load the embedding model
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    return E5Embeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True}
    )

# 2. Load existing vector store
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
    return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)

# 3. Embed only the cleaned PDFs
def embed_cleaned_pdfs(folder, vectorstore, embeddings):
    pattern = os.path.join(folder, "cleaned*.pdf")
    pdf_files = glob.glob(pattern)
    print(f"Number of target PDFs: {len(pdf_files)}")

    new_documents = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path}")
        text = load_pdf_with_pymupdf(pdf_path)
        if text.strip():
            new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))

    print(f"Number of documents: {len(new_documents)}")
    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
    print(f"Number of chunks: {len(chunks)}")

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Save completed: {VECTOR_STORE_PATH}")

# Execution
if __name__ == "__main__":
    embeddings = get_embeddings()
    vectorstore = load_vector_store(embeddings)
    embed_cleaned_pdfs(FOLDER, vectorstore, embeddings)