Spaces:
Sleeping
Sleeping
import pickle | |
from pathlib import Path | |
from langchain_chroma import Chroma | |
from langchain_openai import OpenAIEmbeddings | |
def main(batch_size: int = 100) -> None:
    """Embed pickled document chunks and add them to a persistent Chroma store.

    Loads ``output/chunks.pkl`` from the parent of this script's directory,
    creates an OpenAI embedding client for ``text-embedding-3-small``, and
    adds the chunks to a Chroma vector store persisted under ``db/`` in
    consecutive slices of ``batch_size`` items, printing progress per batch.

    Args:
        batch_size: Number of chunks passed to each ``add_documents`` call.
            Must be a positive integer. Defaults to 100.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        FileNotFoundError: If the chunks pickle does not exist.
    """
    # Fail loudly up front: a negative step would make the loop silently do
    # nothing, and a zero step would raise an opaque ``range()`` error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    base_dir = Path(__file__).resolve().parent.parent
    chunks_path = base_dir / "output" / "chunks.pkl"
    db_dir = base_dir / "db"
    db_dir.mkdir(parents=True, exist_ok=True)  # create the db folder if it does not exist

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load a chunks.pkl produced by a trusted step of this project.
    with open(chunks_path, "rb") as f:
        # Presumably a list of LangChain Document objects -- confirm against
        # the script that writes chunks.pkl.
        chunks = pickle.load(f)

    embedding = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorstore = Chroma(persist_directory=str(db_dir), embedding_function=embedding)

    print(f"🧠 Embedding and adding {len(chunks)} chunks in batches...")
    # Add the chunks slice by slice rather than in one call (presumably to
    # keep each embedding request small -- confirm the intended limit).
    batch_starts = range(0, len(chunks), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        vectorstore.add_documents(chunks[start:start + batch_size])
        print(f"✅ Added batch {batch_number}")
# Run the ingestion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()