# sanatan_ai/modules/db/import.py
# Uploaded by vikramvasudevan: "Upload folder using huggingface_hub"
# (commit 8b5be8c, verified).
import json
import argparse
from pathlib import Path
import chromadb
from chromadb.config import Settings
def import_collection(client, json_file: Path, include_embeddings=False):
    """Import one exported JSON file into a ChromaDB collection.

    The target collection is named after the file's stem (``verses.json``
    goes into collection ``verses``) and is created if it does not exist.
    The file must contain a JSON list of objects; each object needs an
    ``"id"`` key and may carry ``"document"``, ``"metadata"`` and
    ``"embedding"`` keys (missing optional keys are passed on as ``None``).

    Args:
        client: ChromaDB client. Only ``get_or_create_collection`` is
            required; when the client also exposes ``get_max_batch_size()``
            the records are added in chunks no larger than that limit.
        json_file: Path to the JSON export to load.
        include_embeddings: When true, forward the stored ``"embedding"``
            values; otherwise ``embeddings=None`` is passed to ``add``.

    Raises:
        ValueError: If the top-level JSON value is not a list.
        KeyError: If a record has no ``"id"`` key.
    """
    collection_name = json_file.stem
    print(f"📥 Importing {collection_name} from {json_file}")

    data = json.loads(json_file.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        # Fail with a message that names the file instead of an obscure
        # TypeError/KeyError from the field extraction below.
        raise ValueError(
            f"{json_file}: expected a JSON list of records, "
            f"got {type(data).__name__}"
        )

    ids = [item["id"] for item in data]
    documents = [item.get("document") for item in data]
    metadatas = [item.get("metadata") for item in data]
    embeddings = (
        [item.get("embedding") for item in data] if include_embeddings else None
    )

    # Create the collection even for an empty export so that an empty
    # collection still exists after the import.
    collection = client.get_or_create_collection(collection_name)

    # Respect the client's per-call limit when it advertises one; otherwise
    # fall back to a single add() call covering every record.
    get_limit = getattr(client, "get_max_batch_size", None)
    batch_size = get_limit() if callable(get_limit) else None
    if not isinstance(batch_size, int) or batch_size <= 0:
        batch_size = len(ids)

    # With no records the range is empty, so add() is never called with
    # empty lists.
    for start in range(0, len(ids), max(batch_size, 1)):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            embeddings=None if embeddings is None else embeddings[start:stop],
        )

    print(f"✔ Imported {len(ids)} items into {collection_name}")
def main(argv=None):
    """Command-line entry point: import every ``*.json`` file in a folder.

    Each JSON file found directly inside the ``--input`` folder is loaded
    into the persistent ChromaDB store at ``--db-path`` via
    ``import_collection``; the file stem becomes the collection name.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        Process exit status: ``0`` on success (including when the folder
        holds no JSON files), ``1`` when the input folder is missing or is
        not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Import JSON files into ChromaDB collections."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the target chromadb_store folder",
    )
    parser.add_argument(
        "--input",
        type=str,
        default="chroma_exports",
        help="Folder containing JSON files to import",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Load embeddings from JSON (off by default)",
    )
    args = parser.parse_args(argv)

    db_path = Path(args.db_path).expanduser().resolve()
    input_dir = Path(args.input).expanduser().resolve()

    # is_dir() also rejects a regular file passed by mistake, which a bare
    # exists() check would let through and then silently glob nothing.
    if not input_dir.is_dir():
        print(f"❌ Input folder does not exist or is not a directory: {input_dir}")
        return 1

    # Sort for a deterministic import order; glob() order depends on the
    # filesystem.
    json_files = sorted(input_dir.glob("*.json"))
    if not json_files:
        # Bail out before opening the store so an empty run neither creates
        # the database nor claims that files were imported.
        print(f"⚠️ No JSON files found in: {input_dir}")
        return 0

    # Connect to ChromaDB only once there is something to import.
    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    for json_file in json_files:
        import_collection(client, json_file, args.include_embeddings)

    print("\n🎉 All JSON files imported!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and CI can detect a failed run.
    raise SystemExit(main())