"""Export ChromaDB collections to JSON files, one file per collection."""

import argparse
import json
from pathlib import Path

import numpy as np


def make_json_safe(value):
    """Recursively convert numpy values into JSON-serializable Python types.

    Args:
        value: Any object. Numpy arrays become (nested) lists, numpy
            integer/floating/bool scalars become ``int``/``float``/``bool``,
            and dicts, lists and tuples are rebuilt with their contents
            converted the same way (tuples come back as lists, which is how
            JSON would represent them anyway).

    Returns:
        The converted value. Anything that is not one of the types above is
        returned unchanged.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, dict):
        return {key: make_json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [make_json_safe(item) for item in value]
    return value


def _value_at(values, idx):
    """Return ``values[idx]``, or ``None`` when the field is missing or empty."""
    # ``len()`` is used instead of truthiness because ``values`` may be a
    # numpy array, whose truth value is ambiguous and raises ValueError.
    if values is None or len(values) == 0:
        return None
    return values[idx]


def export_collection(collection, output_dir: Path, include_embeddings=False):
    """Export one ChromaDB collection to ``<output_dir>/<collection.name>.json``.

    The file holds a JSON list with one object per stored item, carrying the
    keys ``id``, ``document`` and ``metadata`` (plus ``embedding`` when
    requested). Numpy values are converted to plain Python types before
    serialization.

    Args:
        collection: Object exposing ``name`` and ``get(include=[...])`` that
            returns a mapping with an ``ids`` entry and the requested fields.
        output_dir: Existing directory that receives the JSON file.
        include_embeddings: When true, embeddings are fetched and written
            under the ``embedding`` key of each record.
    """
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")

    items = collection.get(include=include_fields)
    documents = items.get("documents")
    metadatas = items.get("metadatas")
    # Depending on the ChromaDB version this may be a list of lists or a
    # numpy array, so it is never tested for truthiness directly.
    embeddings = items.get("embeddings") if include_embeddings else None

    data = []
    for idx, item_id in enumerate(items["ids"]):
        record = {
            "id": item_id,
            "document": _value_at(documents, idx),
            "metadata": _value_at(metadatas, idx),
        }
        if include_embeddings:
            record["embedding"] = _value_at(embeddings, idx)
        # json.dumps cannot encode numpy arrays or scalars; normalize first.
        data.append(make_json_safe(record))

    out_path = output_dir / f"{collection.name}.json"
    out_path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    print(f"✔ Exported {collection.name} ({len(data)} docs) → {out_path}")


def main():
    """Parse command-line arguments and export one or all collections."""
    # Imported here, where they are used, so the export helpers above stay
    # importable without a ChromaDB installation.
    import chromadb
    from chromadb.config import Settings

    parser = argparse.ArgumentParser(
        description="Export ChromaDB collections to MongoDB-ready JSON."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for JSON files",
    )
    parser.add_argument(
        "--collection",
        type=str,
        help="Name of a single collection to export (omit to export all)",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export",
    )

    args = parser.parse_args()

    db_path = Path(args.db_path).expanduser().resolve()
    # Reject a mistyped path up front instead of handing the client a
    # location that holds no store.
    if not db_path.is_dir():
        parser.error(f"--db-path is not an existing directory: {db_path}")

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    client = chromadb.PersistentClient(
        path=str(db_path), settings=Settings(anonymized_telemetry=False)
    )

    if args.collection:
        # Single collection
        print(f"▶ Exporting collection: {args.collection}")
        collection = client.get_collection(args.collection)
        export_collection(collection, output_dir, args.include_embeddings)
    else:
        # All collections
        print("▶ Exporting ALL collections")
        for entry in client.list_collections():
            # list_collections() yields collection objects on some releases
            # and plain name strings on others; accept both.
            name = entry if isinstance(entry, str) else entry.name
            collection = client.get_collection(name)
            export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 Export complete!")


if __name__ == "__main__":
    main()