Spaces:

vikramvasudevan
/

sanatan_ai

Running on CPU Upgrade

File size: 3,139 Bytes

8b5be8c
 
 
 
 
e7cf65c
 
 
 
 
 
 
 
 
 
 
8b5be8c
 
 
e7cf65c
8b5be8c
 
 
 
 
e7cf65c
8b5be8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7cf65c
 
 
 
 
 
8b5be8c
 
 
e7cf65c
 
 
8b5be8c
 
 
 
e7cf65c
8b5be8c
 
 
 
 
e7cf65c
 
 
 
 
 
8b5be8c
 
 
 
e7cf65c
8b5be8c
 
 
 
 
 
 
 
 
 
 
 
 
e7cf65c
 
 
 
8b5be8c
 
e7cf65c
 
 
 
 
 
 
 
8b5be8c

import json
import argparse
from pathlib import Path
import chromadb
from chromadb.config import Settings
import numpy as np

def make_json_safe(value):
    """Recursively convert numpy values into JSON-serializable Python objects.

    Conversions applied:
      * ``np.ndarray``  -> nested ``list`` (via ``ndarray.tolist()``)
      * ``np.bool_``    -> ``bool``
      * ``np.integer``  -> ``int``
      * ``np.floating`` -> ``float``
      * ``dict``        -> new ``dict`` with keys and values converted
      * ``list``/``tuple`` -> new container of the same kind with converted
        elements

    Any other value is returned unchanged.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    # np.bool_ is neither np.integer nor np.floating, so it needs its own case.
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    # Containers: numpy values nested inside metadata dicts or lists would
    # otherwise reach json.dumps unconverted and raise TypeError.
    if isinstance(value, dict):
        return {make_json_safe(k): make_json_safe(v) for k, v in value.items()}
    if isinstance(value, list):
        return [make_json_safe(v) for v in value]
    if isinstance(value, tuple):
        return tuple(make_json_safe(v) for v in value)
    return value


def export_collection(collection, output_dir: Path, include_embeddings=False):
    """Export one ChromaDB collection to ``<output_dir>/<collection.name>.json``.

    The file holds a JSON array with one object per stored item:
    ``{"id": ..., "document": ..., "metadata": ...}``, plus an ``"embedding"``
    key when *include_embeddings* is true.

    Args:
        collection: Object exposing ``name`` and ``get(include=[...])``. The
            result of ``get`` is read as a mapping with an ``"ids"`` list and
            optional parallel ``"documents"``, ``"metadatas"`` and
            ``"embeddings"`` sequences.
        output_dir: Directory the JSON file is written into (not created here).
        include_embeddings: Also request and export each item's embedding.
    """
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")

    items = collection.get(include=include_fields)

    def _column(key):
        """Return the per-item sequence stored under *key*, or None if absent/empty."""
        column = items.get(key)
        # Use None/len checks rather than truthiness: embeddings may be a
        # numpy array, and `bool(array)` raises ValueError for >1 element.
        if column is None or len(column) == 0:
            return None
        return column

    documents = _column("documents")
    metadatas = _column("metadatas")
    embeddings = _column("embeddings") if include_embeddings else None

    data = []
    for idx, _id in enumerate(items["ids"]):
        record = {
            "id": _id,
            "document": documents[idx] if documents is not None else None,
            "metadata": metadatas[idx] if metadatas is not None else None,
        }

        if include_embeddings:
            # np.asarray(...).tolist() yields plain Python floats whether the
            # row is a numpy array or a list of (numpy) floats, so the record
            # is always accepted by json.dumps.
            record["embedding"] = (
                np.asarray(embeddings[idx]).tolist()
                if embeddings is not None
                else None
            )

        data.append(record)

    out_path = output_dir / f"{collection.name}.json"
    out_path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    print(f"✔ Exported {collection.name} ({len(data)} docs) → {out_path}")


def main():
    """Command-line entry point: export ChromaDB collections to JSON files.

    Options:
        --db-path             Path to the persistent Chroma store (required).
        --output              Output folder for JSON files (default
                              ``chroma_exports``); created if missing.
        --collection          Export only this collection; omit to export all.
        --include-embeddings  Also write each item's embedding vector.

    Exits with an argparse usage error if ``--db-path`` is not an existing
    directory.
    """
    parser = argparse.ArgumentParser(
        description="Export ChromaDB collections to MongoDB-ready JSON."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for JSON files",
    )
    parser.add_argument(
        "--collection",
        type=str,
        help="Name of a single collection to export (omit to export all)",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export",
    )

    args = parser.parse_args()

    db_path = Path(args.db_path).expanduser().resolve()
    # Fail early on a wrong path: opening a persistent client on a missing
    # directory would otherwise start a fresh, empty store and "export" nothing.
    if not db_path.is_dir():
        parser.error(f"--db-path is not an existing directory: {db_path}")

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False)
    )

    # 🔹 Single collection
    if args.collection:
        print(f"▶ Exporting collection: {args.collection}")
        collection = client.get_collection(args.collection)
        export_collection(collection, output_dir, args.include_embeddings)

    # 🔹 All collections
    else:
        print("▶ Exporting ALL collections")
        for entry in client.list_collections():
            # Depending on the chromadb version, list_collections() yields
            # either Collection objects or plain collection-name strings;
            # accept both.
            cname = getattr(entry, "name", entry)
            collection = client.get_collection(cname)
            export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 Export complete!")


if __name__ == "__main__":
    main()