File size: 2,000 Bytes
5f3b20a
 
 
 
 
8c95c04
5f3b20a
8c95c04
 
5f3b20a
 
8c95c04
3c9ce69
5f3b20a
 
 
 
 
 
8c95c04
5f3b20a
 
8c95c04
5f3b20a
 
8c95c04
5f3b20a
8c95c04
5f3b20a
8c95c04
 
5f3b20a
8c95c04
 
5f3b20a
8c95c04
5f3b20a
8c95c04
5f3b20a
8c95c04
5f3b20a
 
 
 
 
8c95c04
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
from langchain.schema.document import Document
from e5_embeddings import E5Embeddings
from langchain_community.vectorstores import FAISS

from document_processor_image import load_documents, split_documents  # This function is required!

# Path configuration (both values are relative paths)
NEW_FOLDER = "new_documents"  # Folder containing the new documents
VECTOR_STORE_PATH = "vector_db"  # Vector store location; the script entry point loads from it and saves back to it

# 1. Loading the embedding model
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    """Build the embedding model wrapper used to read and extend the vector store.

    Args:
        model_name: Model identifier forwarded to ``E5Embeddings``.
        device: Device string forwarded through ``model_kwargs`` (defaults to
            ``"cuda"``).

    Returns:
        An ``E5Embeddings`` instance configured with ``normalize_embeddings``
        enabled in its encode options.
    """
    # NOTE(review): the default model is a MiniLM checkpoint while the wrapper
    # class is named E5 -- confirm this pairing is intentional.
    model_options = {'device': device}
    encode_options = {'normalize_embeddings': True}
    return E5Embeddings(
        model_name=model_name,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

# 2. Load existing vector store
def load_vector_store(embeddings, load_path="vector_db"):
    """Open a FAISS vector store that was previously saved at *load_path*.

    Args:
        embeddings: Embedding object handed to ``FAISS.load_local``.
        load_path: Location of the saved store.

    Returns:
        The store returned by ``FAISS.load_local``.

    Raises:
        FileNotFoundError: If *load_path* does not exist.
    """
    if os.path.exists(load_path):
        # allow_dangerous_deserialization=True opts in to loading the pickled
        # part of the saved store; only point this at files you trust.
        store = FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
        return store
    raise FileNotFoundError(f"Cannot find vector store: {load_path}")

# 3. Embed and Add New Documents
def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
    """Load, split, and append the documents found in *new_folder* to *vectorstore*.

    The store is only modified in memory; saving it is left to the caller.
    When no chunks are produced the store is left untouched and the function
    returns early.

    Args:
        new_folder: Folder handed to ``load_documents``.
        vectorstore: Store exposing ``add_documents`` and ``index.ntotal``
            (the script entry point passes a FAISS store).
        embeddings: Not used by this function; kept in the signature so
            existing callers keep working.

    Returns:
        None.
    """
    print(f"Loading new documents: {new_folder}")
    new_docs = load_documents(new_folder)
    # TODO: chunk_size=800 and chunk_overlap=100 are not passed yet; whether
    # split_documents accepts them has to be checked against its definition.
    new_chunks = split_documents(new_docs)

    chunk_count = len(new_chunks)
    print(f"Number of new chunks: {chunk_count}")
    if chunk_count == 0:
        # Adding an empty batch has nothing to embed and is expected to fail
        # inside the store, so skip it and report the no-op explicitly.
        print("No new chunks found; the vector store was left unchanged.")
        return

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(new_chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    print("New documents have been added to the vector store.")

# 4. Main Execution
if __name__ == "__main__":
    # Build the embedding model first; the stored index is opened with it.
    embedder = get_embeddings()
    store = load_vector_store(embeddings=embedder, load_path=VECTOR_STORE_PATH)

    # Extend the in-memory store with the contents of NEW_FOLDER.
    add_new_documents_to_vector_store(
        new_folder=NEW_FOLDER,
        vectorstore=store,
        embeddings=embedder,
    )

    # Write the store back to the same location it was loaded from.
    store.save_local(VECTOR_STORE_PATH)
    print(f"Vector store save completed: {VECTOR_STORE_PATH}")