"""Embed pre-chunked documents and add them to a Chroma vector store."""

import pickle
from pathlib import Path

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Paths are resolved relative to the directory two levels above this file.
BASE_DIR = Path(__file__).resolve().parent.parent
CHUNKS_PATH = BASE_DIR / "output" / "chunks.pkl"
DB_DIR = BASE_DIR / "db"

# Number of chunks passed to the vector store per add_documents() call.
BATCH_SIZE = 100


def main() -> None:
    """Load pickled chunks and add them to the Chroma store in batches.

    Reads ``CHUNKS_PATH``, builds a ``Chroma`` store configured with
    ``persist_directory=DB_DIR`` and an ``OpenAIEmbeddings`` embedding
    function, then adds the chunks ``BATCH_SIZE`` at a time, printing one
    progress line per batch.

    Raises:
        FileNotFoundError: If ``CHUNKS_PATH`` does not exist (raised by
            ``open``).
    """
    # Create the db folder if it does not exist yet.
    DB_DIR.mkdir(parents=True, exist_ok=True)

    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only run this against a chunks file produced by a trusted step of this
    # project.
    with open(CHUNKS_PATH, "rb") as f:
        # Presumably a list of LangChain documents; it must at least support
        # len() and slicing -- confirm against the step that writes it.
        chunks = pickle.load(f)

    # NOTE(review): the embedding client presumably takes its API credentials
    # from the environment -- confirm before running.
    embedding = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorstore = Chroma(
        persist_directory=str(DB_DIR),
        embedding_function=embedding,
    )

    print(f"🧠 Embedding and adding {len(chunks)} chunks in batches...")

    # Add in fixed-size slices rather than one call for the whole list; the
    # last slice may be shorter than BATCH_SIZE.
    for i in range(0, len(chunks), BATCH_SIZE):
        batch = chunks[i:i + BATCH_SIZE]
        vectorstore.add_documents(batch)
        print(f"✅ Added batch {i // BATCH_SIZE + 1}")


if __name__ == "__main__":
    main()