Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,139 Bytes
8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c e7cf65c 8b5be8c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import json
import argparse
from pathlib import Path
import chromadb
from chromadb.config import Settings
import numpy as np
def make_json_safe(value):
    """Convert numpy types to JSON-serializable Python types.

    Arrays become (nested) lists and numpy scalars become the matching
    Python ``int``/``float``/``bool``. Dicts, lists and tuples are walked
    recursively so numpy values nested inside them are converted too;
    tuples come back as lists, which is how JSON represents them anyway.
    Any other value is returned unchanged.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, dict):
        return {key: make_json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [make_json_safe(item) for item in value]
    return value
def export_collection(collection, output_dir: Path, include_embeddings=False):
    """Export one ChromaDB collection to a JSON file with JSON-safe values.

    Writes ``<output_dir>/<collection.name>.json`` containing a list of
    records shaped ``{"id", "document", "metadata"}``, plus ``"embedding"``
    when *include_embeddings* is true.

    Args:
        collection: Object exposing ``name`` and ``get(include=[...])``; the
            result must provide ``"ids"`` and the requested fields.
        output_dir: Directory the JSON file is written into.
        include_embeddings: Also fetch and export each record's embedding.
    """
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")
    items = collection.get(include=include_fields)

    documents = items.get("documents")
    metadatas = items.get("metadatas")
    embeddings = items.get("embeddings") if include_embeddings else None
    # Embeddings may come back as a numpy array, whose truth value is
    # ambiguous (bool() raises); check presence via None/len instead.
    has_embeddings = embeddings is not None and len(embeddings) > 0

    data = []
    for idx, _id in enumerate(items["ids"]):
        metadata = metadatas[idx] if metadatas else None
        if isinstance(metadata, dict):
            # Convert per value so numpy scalars in metadata serialize cleanly.
            metadata = {key: make_json_safe(val) for key, val in metadata.items()}
        record = {
            "id": _id,
            "document": documents[idx] if documents else None,
            "metadata": metadata,
        }
        if include_embeddings:
            # numpy rows are not JSON serializable; turn them into plain lists.
            record["embedding"] = (
                make_json_safe(embeddings[idx]) if has_embeddings else None
            )
        data.append(record)

    out_path = output_dir / f"{collection.name}.json"
    out_path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"✔ Exported {collection.name} ({len(data)} docs) → {out_path}")
def main():
    """Parse CLI arguments and export one or all ChromaDB collections to JSON.

    Exits with an argparse usage error when ``--db-path`` is not an existing
    directory, because opening a persistent client on a missing path would
    create a new empty store there instead of failing.
    """
    parser = argparse.ArgumentParser(
        description="Export ChromaDB collections to MongoDB-ready JSON."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for JSON files",
    )
    parser.add_argument(
        "--collection",
        type=str,
        help="Name of a single collection to export (omit to export all)",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export",
    )
    args = parser.parse_args()

    db_path = Path(args.db_path).expanduser().resolve()
    if not db_path.is_dir():
        parser.error(f"--db-path is not an existing directory: {db_path}")

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    if args.collection:
        # Single collection
        print(f"▶ Exporting collection: {args.collection}")
        collection = client.get_collection(args.collection)
        export_collection(collection, output_dir, args.include_embeddings)
    else:
        # All collections
        print("▶ Exporting ALL collections")
        for entry in client.list_collections():
            # Depending on the chromadb version, entries are either plain
            # collection names (strings) or Collection objects with `.name`.
            cname = entry if isinstance(entry, str) else entry.name
            collection = client.get_collection(cname)
            export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 Export complete!")
# Run the exporter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|