File size: 917 Bytes
32e0ef9
 
2ee3353
32e0ef9
 
3c7db82
 
 
 
0ee80e8
 
3c7db82
 
 
 
 
 
 
 
 
 
 
 
 
32e0ef9
3c7db82
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pickle
from pathlib import Path
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

def main() -> None:
    """Load pickled chunks and add them to the on-disk Chroma store in batches.

    Paths are resolved relative to the grandparent directory of this file:
    the chunks are read from ``output/chunks.pkl`` and the vector store is
    persisted under ``db`` (created if missing). Progress is printed after
    every batch.
    """
    project_root = Path(__file__).resolve().parent.parent
    chunks_file = project_root / "output" / "chunks.pkl"
    db_dir = project_root / "db"

    # Make sure the persistence directory exists before Chroma is pointed at it.
    db_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # if chunks.pkl is a locally produced, trusted artifact - confirm.
    with chunks_file.open("rb") as handle:
        chunks = pickle.load(handle)

    embedding = OpenAIEmbeddings(model="text-embedding-3-small")
    store = Chroma(embedding_function=embedding, persist_directory=str(db_dir))

    # Documents are submitted in fixed-size slices rather than all at once.
    batch_size = 100
    total = len(chunks)
    print(f"🧠 Embedding and adding {total} chunks in batches...")
    for batch_number, start in enumerate(range(0, total, batch_size), start=1):
        store.add_documents(chunks[start:start + batch_size])
        print(f"✅ Added batch {batch_number}")

# Run the ingestion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()