Spaces:
Runtime error
Runtime error
File size: 2,046 Bytes
5f3b20a 670c138 5f3b20a 670c138 1d98a7e 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 5f3b20a 670c138 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
import glob
from langchain.schema.document import Document
from e5_embeddings import E5Embeddings
from langchain_community.vectorstores import FAISS
from document_processor import load_pdf_with_pymupdf, split_documents
# Path configuration
FOLDER = "cleaned_pdfs" # Folder containing the cleaned PDFs
VECTOR_STORE_PATH = "vector_db"  # Default path load_vector_store reads from; embed_cleaned_pdfs saves here
# 1. Load the embedding model
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    """Build the E5Embeddings object used to embed document chunks.

    Args:
        model_name: Model identifier forwarded to ``E5Embeddings``.
        device: Device string placed in ``model_kwargs``; defaults to "cuda".

    Returns:
        An ``E5Embeddings`` instance whose encode kwargs request
        ``normalize_embeddings=True``.
    """
    model_options = {'device': device}
    encode_options = {'normalize_embeddings': True}
    return E5Embeddings(
        model_name=model_name,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )
# 2. Load existing vector store
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
    """Load the FAISS vector store saved at ``load_path``.

    Args:
        embeddings: Embedding object passed through to ``FAISS.load_local``.
        load_path: Location of the saved store; defaults to VECTOR_STORE_PATH.

    Returns:
        Whatever ``FAISS.load_local`` returns for that path.

    Raises:
        FileNotFoundError: If ``load_path`` does not exist.
    """
    store_present = os.path.exists(load_path)
    if store_present:
        # NOTE(review): allow_dangerous_deserialization=True opts in to loading
        # pickled data; confirm this is only ever pointed at trusted stores.
        return FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    raise FileNotFoundError(f"Cannot find vector store: {load_path}")
# 3. Embed only the cleaned PDFs
def embed_cleaned_pdfs(folder, vectorstore, embeddings):
    """Embed every ``cleaned*.pdf`` in ``folder`` and add it to ``vectorstore``.

    Each matching PDF is read with ``load_pdf_with_pymupdf``; files whose text
    is empty or whitespace-only are skipped. The remaining documents are split
    into chunks (chunk_size=300, chunk_overlap=50), added to ``vectorstore``,
    and the store is saved to ``VECTOR_STORE_PATH``.

    If there is nothing to add (no matching PDFs, only empty PDFs, or no
    chunks produced), the function returns early without touching or saving
    the store.

    Args:
        folder: Directory searched (non-recursively) for ``cleaned*.pdf``.
        vectorstore: Store exposing ``index.ntotal``, ``add_documents`` and
            ``save_local``.
        embeddings: Unused here; kept so existing callers keep working.

    Returns:
        None.
    """
    pattern = os.path.join(folder, "cleaned*.pdf")
    # glob returns matches in arbitrary order; sort so repeated runs process
    # files (and therefore insert vectors) in the same order.
    pdf_files = sorted(glob.glob(pattern))
    print(f"Number of target PDFs: {len(pdf_files)}")

    new_documents = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path}")
        text = load_pdf_with_pymupdf(pdf_path)
        if text.strip():
            new_documents.append(
                Document(page_content=text, metadata={"source": pdf_path})
            )
    print(f"Number of documents: {len(new_documents)}")
    if not new_documents:
        # Nothing to embed: avoid passing an empty batch downstream and
        # avoid rewriting an unchanged store on disk.
        print("No non-empty PDFs found; vector store left unchanged.")
        return

    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
    print(f"Number of chunks: {len(chunks)}")
    if not chunks:
        print("No chunks produced; vector store left unchanged.")
        return

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Save completed: {VECTOR_STORE_PATH}")
# Execution
if __name__ == "__main__":
    # Build the embedding model, open the existing store, then add the
    # cleaned PDFs from FOLDER to it.
    embeddings = get_embeddings()
    vectorstore = load_vector_store(embeddings)
    embed_cleaned_pdfs(FOLDER, vectorstore, embeddings)