# Spaces: Running on CPU Upgrade
| import json | |
| import argparse | |
| from pathlib import Path | |
| import chromadb | |
| from chromadb.config import Settings | |
def import_collection(
    client,
    json_file: Path,
    include_embeddings=False,
    *,
    batch_size: int = 1000,
) -> None:
    """Import a JSON file into a ChromaDB collection.

    The collection is named after the file stem and is created if it does
    not exist yet. The file must contain a JSON array of objects, each with
    an ``"id"`` key and optional ``"document"``, ``"metadata"`` and
    ``"embedding"`` keys.

    Args:
        client: Object exposing ``get_or_create_collection(name)``; the
            returned collection must expose ``add(ids=..., documents=...,
            metadatas=..., embeddings=...)``.
        json_file: Path of the JSON export to load.
        include_embeddings: When true, pass the stored ``"embedding"``
            values to the collection; otherwise ``embeddings=None`` is sent.
        batch_size: Maximum number of records sent per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive or the JSON top-level
            value is not a list.
        KeyError: If a record has no ``"id"`` key.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    collection_name = json_file.stem
    print(f"📥 Importing {collection_name} from {json_file}")

    records = json.loads(json_file.read_text(encoding="utf-8"))
    if not isinstance(records, list):
        raise ValueError(
            f"Expected a JSON array in {json_file}, "
            f"got {type(records).__name__}"
        )

    collection = client.get_or_create_collection(collection_name)

    # Send the records in bounded chunks: a single huge add() can exceed the
    # store's maximum batch size, and an empty add() is rejected outright.
    # With no records the loop body never runs, so add() is never called.
    for start in range(0, len(records), batch_size):
        chunk = records[start:start + batch_size]
        collection.add(
            ids=[item["id"] for item in chunk],
            documents=[item.get("document") for item in chunk],
            # An empty metadata dict is normalised to None ("no metadata").
            metadatas=[item.get("metadata") or None for item in chunk],
            embeddings=(
                [item.get("embedding") for item in chunk]
                if include_embeddings
                else None
            ),
        )

    print(f"✅ Imported {len(records)} items into {collection_name}")
def main(argv=None):
    """Import every ``*.json`` file of a folder into ChromaDB collections.

    Parses the command line, opens a persistent ChromaDB client at
    ``--db-path`` and calls ``import_collection`` once per JSON file found
    directly inside ``--input``.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default), argparse reads ``sys.argv[1:]``.

    Returns:
        None. Problems with the input folder are reported on stdout and the
        function returns early without touching the database.
    """
    parser = argparse.ArgumentParser(
        description="Import JSON files into ChromaDB collections."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the target chromadb_store folder",
    )
    parser.add_argument(
        "--input",
        type=str,
        default="chroma_exports",
        help="Folder containing JSON files to import",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Load embeddings from JSON (off by default)",
    )
    args = parser.parse_args(argv)

    db_path = Path(args.db_path).expanduser().resolve()
    input_dir = Path(args.input).expanduser().resolve()

    if not input_dir.exists():
        print(f"❌ Input folder does not exist: {input_dir}")
        return
    # A regular file would pass exists() but can never contain exports.
    if not input_dir.is_dir():
        print(f"❌ Input path is not a directory: {input_dir}")
        return

    # Connect to ChromaDB
    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    # Sort so collections are imported in a reproducible order.
    json_files = sorted(input_dir.glob("*.json"))
    if not json_files:
        # Do not claim success when there was nothing to import.
        print(f"⚠️ No JSON files found in {input_dir}")
        return

    for json_file in json_files:
        import_collection(client, json_file, args.include_embeddings)

    print("\n🎉 All JSON files imported!")
# Script entry point: run the importer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()