"""Append newly added documents to an existing FAISS vector store.

Run as a script, this loads the vector store saved under
``VECTOR_STORE_PATH``, embeds the documents found in ``NEW_FOLDER``, adds
them to the store, and saves the store back to the same location.
"""

import os

from langchain.schema.document import Document
from langchain_community.vectorstores import FAISS

from document_processor_image import load_documents, split_documents  # This function is required!
from e5_embeddings import E5Embeddings

# Path configuration
NEW_FOLDER = "new_documents"  # Folder containing the new documents
VECTOR_STORE_PATH = "vector_db"


# 1. Loading the embedding model
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    """Build the embedding model wrapper used for the vector store.

    Args:
        model_name: Name of the model handed to ``E5Embeddings``.
        device: Device string forwarded via ``model_kwargs`` (default "cuda").

    Returns:
        An ``E5Embeddings`` instance configured with normalized embeddings.
    """
    # NOTE(review): the model presumably has to match the one that was used
    # to build the existing index — confirm before changing the default.
    return E5Embeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )


# 2. Load existing vector store
def load_vector_store(embeddings, load_path="vector_db"):
    """Load a previously saved FAISS vector store from disk.

    Args:
        embeddings: Embedding object passed to ``FAISS.load_local``.
        load_path: Directory the vector store was saved to.

    Returns:
        The loaded ``FAISS`` vector store.

    Raises:
        FileNotFoundError: If ``load_path`` does not exist.
    """
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
    # SECURITY: allow_dangerous_deserialization=True opts in to loading
    # pickled data. Only point this at vector stores you created yourself.
    return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)


# 3. Embed and Add New Documents
def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
    """Load, split, and add the documents in ``new_folder`` to ``vectorstore``.

    The vector store is modified in place; nothing is written to disk here.

    Args:
        new_folder: Folder passed to ``load_documents``.
        vectorstore: Vector store that receives the new chunks.
        embeddings: Unused in this function; kept so existing callers that
            pass it keep working.
    """
    print(f"Loading new documents: {new_folder}")
    new_docs = load_documents(new_folder)
    # TODO: the parameters chunk_size=800, chunk_overlap=100 are still missing
    # here; they cannot be filled in sensibly without knowing the signature
    # of split_documents.
    new_chunks = split_documents(new_docs)
    print(f"Number of new chunks: {len(new_chunks)}")

    if not new_chunks:
        # Nothing to embed. Skip the add call: an empty batch adds nothing
        # useful and may raise inside the underlying index.
        print("No new chunks found; the vector store was not modified.")
        return

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(new_chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")
    print("New documents have been added to the vector store.")


# 4. Main Execution
def main():
    """Load the store, add the documents from ``NEW_FOLDER``, and save it."""
    embeddings = get_embeddings()
    vectorstore = load_vector_store(embeddings, VECTOR_STORE_PATH)
    add_new_documents_to_vector_store(NEW_FOLDER, vectorstore, embeddings)
    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Vector store save completed: {VECTOR_STORE_PATH}")


if __name__ == "__main__":
    main()