# app/main.py
import argparse
import os
from pathlib import Path

import pandas as pd
from dotenv import dotenv_values

from app.ingest import ingest
from app.search import search
from app.paths import DATA_DIR, DOCSTORE_DIR, INDEX_DIR, EXPORT_DIR  # ← canonical paths


def get_env():
    """
    Load environment with safe, repo-relative defaults from app.paths.

    - Honors .env (dotenv) and real env vars if set.
    - Falls back to ./data, ./data/docstore, ./data/index, ./data/exports,
      which work on macOS AND Hugging Face Spaces.
    """
    # 1) Start with .env (if present)
    env = dict(dotenv_values(".env") or {})

    # 2) Merge in process env (so Space secrets / shell vars override .env)
    env.update(os.environ)

    # 3) Provide safe defaults from app.paths if not specified
    env.setdefault("DATA_DIR", str(DATA_DIR))
    env.setdefault("DOCSTORE_DIR", str(DOCSTORE_DIR))
    env.setdefault("INDEX_DIR", str(INDEX_DIR))
    env.setdefault("EXPORT_DIR", str(EXPORT_DIR))

    # Optional UI/debug flags
    env.setdefault("SHOW_DEV", "0")

    # 4) Ensure directories exist
    for key in ("DATA_DIR", "DOCSTORE_DIR", "INDEX_DIR", "EXPORT_DIR"):
        Path(env[key]).mkdir(parents=True, exist_ok=True)

    return env


def ensure_index_exists(env: dict):
    """
    Ensure a FAISS index exists in env['INDEX_DIR'].
    If missing, run a minimal ingest using config/sources.yaml.
    """
    index_dir = Path(env["INDEX_DIR"])
    faiss_idx = index_dir / "faiss.index"
    meta_json = index_dir / "meta.json"
    if faiss_idx.exists() and meta_json.exists():
        return  # already built

    print("Index not found. Building now via ingest() …")
    # Ingest reads the config and writes index/meta/docstore.
    # If your ingest needs API keys, set them in Space Settings → Variables.
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")


def cmd_ingest(_args):
    env = get_env()
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")
def cmd_search(args):
    env = get_env()
    ensure_index_exists(env)

    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")

    res = search(args.q, env, top_k=args.k, filters=filters)
    for r in res:
        geo = r.get("geo")
        if isinstance(geo, list):
            geo = ",".join(geo)
        print(f"- {r.get('title', '(no title)')} [{r.get('source', '')}] ({geo}) score={r.get('score', 0):.3f}")
        print(f"  {r.get('url', '')}")


def cmd_export(args):
    env = get_env()
    ensure_index_exists(env)

    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")

    res = search(args.q, env, top_k=args.k, filters=filters)
    out = Path(env["EXPORT_DIR"]) / (args.out or "results.csv")
    pd.DataFrame(res).to_csv(out, index=False)
    print(f"Exported {len(res)} rows to {out}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    sub = p.add_subparsers(dest="cmd")

    p_ing = sub.add_parser("ingest", help="Ingest sources and build index")
    p_ing.set_defaults(func=cmd_ingest)

    p_search = sub.add_parser("search", help="Search index")
    p_search.add_argument("--q", required=True)
    p_search.add_argument("--k", type=int, default=15)
    p_search.add_argument("--geo", default="")
    p_search.add_argument("--categories", default="")
    p_search.set_defaults(func=cmd_search)

    p_export = sub.add_parser("export", help="Export search results to CSV")
    p_export.add_argument("--q", required=True)
    p_export.add_argument("--k", type=int, default=50)
    p_export.add_argument("--geo", default="")
    p_export.add_argument("--categories", default="")
    p_export.add_argument("--out", default="results.csv")
    p_export.set_defaults(func=cmd_export)

    args = p.parse_args()
    if not args.cmd:
        p.print_help()
    else:
        args.func(args)
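
# --- Usage sketch (assumed entry point; illustrative values) ---
# Assuming the repo root is the working directory and the package is
# importable as `app`, the CLI above can be driven as below. The query
# string and --geo filter are placeholders, not real data:
#
#   python -m app.main ingest
#   python -m app.main search --q "solar subsidies" --geo US,EU --k 10
#   python -m app.main export --q "solar subsidies" --k 50 --out results.csv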