File size: 3,139 Bytes
8b5be8c
 
 
 
 
e7cf65c
 
 
 
 
 
 
 
 
 
 
8b5be8c
 
 
e7cf65c
8b5be8c
 
 
 
 
e7cf65c
8b5be8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7cf65c
 
 
 
 
 
8b5be8c
 
 
e7cf65c
 
 
8b5be8c
 
 
 
e7cf65c
8b5be8c
 
 
 
 
e7cf65c
 
 
 
 
 
8b5be8c
 
 
 
e7cf65c
8b5be8c
 
 
 
 
 
 
 
 
 
 
 
 
e7cf65c
 
 
 
8b5be8c
 
e7cf65c
 
 
 
 
 
 
 
8b5be8c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
import argparse
from pathlib import Path
import chromadb
from chromadb.config import Settings
import numpy as np

def make_json_safe(value):
    """Recursively convert numpy values into JSON-serializable Python objects.

    Args:
        value: Any object. numpy arrays become (nested) lists, numpy
            bool/integer/floating scalars become ``bool``/``int``/``float``.
            Dict values and list/tuple items are converted recursively;
            dict keys are left untouched.

    Returns:
        The converted value. Tuples come back as lists (JSON has only one
        array type); anything that is not a numpy value or a container is
        returned unchanged.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    # np.bool_ is not a subclass of np.integer, so it needs its own branch.
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    # Containers may hide numpy values at any depth (e.g. metadata dicts).
    if isinstance(value, dict):
        return {key: make_json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [make_json_safe(item) for item in value]
    return value


def export_collection(collection, output_dir: Path, include_embeddings=False):
    """Export one ChromaDB collection to ``<output_dir>/<collection.name>.json``.

    The file holds a JSON array with one object per stored item, carrying
    the keys ``id``, ``document``, ``metadata`` and, when requested,
    ``embedding``.

    Args:
        collection: Collection object exposing ``name`` and
            ``get(include=[...])``.
        output_dir: Existing directory the JSON file is written into.
        include_embeddings: When true, embeddings are fetched and written
            under the ``embedding`` key of every record.
    """
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")

    items = collection.get(include=include_fields)

    documents = items.get("documents")
    metadatas = items.get("metadatas")
    embeddings = items.get("embeddings") if include_embeddings else None

    data = []
    for idx, _id in enumerate(items["ids"]):
        record = {
            "id": _id,
            "document": documents[idx] if documents is not None else None,
            "metadata": (
                make_json_safe(metadatas[idx]) if metadatas is not None else None
            ),
        }

        if include_embeddings:
            # Embeddings can come back as a numpy array: compare against
            # None explicitly (array truthiness raises ValueError) and
            # convert each row so json.dumps can serialize it.
            record["embedding"] = (
                make_json_safe(embeddings[idx]) if embeddings is not None else None
            )

        data.append(record)

    out_path = output_dir / f"{collection.name}.json"
    out_path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False),
        encoding="utf-8"
    )

    print(f"✔ Exported {collection.name} ({len(data)} docs) → {out_path}")


def main():
    """Command-line entry point: export ChromaDB collections to JSON files.

    Options:
        --db-path: Folder of the persistent ChromaDB store (required, must
            already exist).
        --output: Folder the JSON files are written to; created if missing.
        --collection: Export only this collection; all collections when
            omitted.
        --include-embeddings: Also write the embeddings of every item.
    """
    parser = argparse.ArgumentParser(
        description="Export ChromaDB collections to MongoDB-ready JSON."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for JSON files",
    )
    parser.add_argument(
        "--collection",
        type=str,
        help="Name of a single collection to export (omit to export all)",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export",
    )

    args = parser.parse_args()

    db_path = Path(args.db_path).expanduser().resolve()
    # Fail early on a wrong path: opening a persistent client on a missing
    # folder would otherwise start a new, empty store and export nothing.
    if not db_path.is_dir():
        parser.error(f"--db-path is not an existing directory: {db_path}")

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False)
    )

    # 🔹 Single collection
    if args.collection:
        print(f"▶ Exporting collection: {args.collection}")
        collection = client.get_collection(args.collection)
        export_collection(collection, output_dir, args.include_embeddings)

    # 🔹 All collections
    else:
        print("▶ Exporting ALL collections")
        for entry in client.list_collections():
            # Depending on the chromadb release, list_collections() yields
            # Collection objects or plain name strings; accept both.
            cname = getattr(entry, "name", entry)
            collection = client.get_collection(cname)
            export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 Export complete!")


if __name__ == "__main__":
    main()