# app/main.py
import argparse
import os
from pathlib import Path
from dotenv import dotenv_values
import pandas as pd
from app.ingest import ingest
from app.search import search
from app.paths import DATA_DIR, DOCSTORE_DIR, INDEX_DIR, EXPORT_DIR # ← canonical paths
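
# A note on app.paths (a sketch, not the actual module): based on the
# docstring of get_env() below, it is expected to export repo-relative
# pathlib.Path constants, roughly:
#   ROOT = Path(__file__).resolve().parents[1]
#   DATA_DIR = ROOT / "data"
#   DOCSTORE_DIR = DATA_DIR / "docstore"
#   INDEX_DIR = DATA_DIR / "index"
#   EXPORT_DIR = DATA_DIR / "exports"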

def get_env():
    """
    Load environment with safe, repo-relative defaults from app.paths.
    - Honors .env (dotenv) and real env vars if set.
    - Falls back to ./data, ./data/docstore, ./data/index, ./data/exports,
      which work on macOS AND Hugging Face Spaces.
    """
    # 1) Start with .env (if present)
    env = dict(dotenv_values(".env") or {})
    # 2) Merge in process env (so Space secrets / shell vars override .env)
    env.update(os.environ)
    # 3) Provide safe defaults from app.paths if not specified
    env.setdefault("DATA_DIR", str(DATA_DIR))
    env.setdefault("DOCSTORE_DIR", str(DOCSTORE_DIR))
    env.setdefault("INDEX_DIR", str(INDEX_DIR))
    env.setdefault("EXPORT_DIR", str(EXPORT_DIR))
    # Optional UI/debug flags
    env.setdefault("SHOW_DEV", "0")
    # 4) Ensure directories exist
    for k in ("DATA_DIR", "DOCSTORE_DIR", "INDEX_DIR", "EXPORT_DIR"):
        Path(env[k]).mkdir(parents=True, exist_ok=True)
    return env
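
# Precedence example (hypothetical values, not shipped with the repo):
# with DATA_DIR=/tmp/data in .env and nothing exported in the shell,
# get_env()["DATA_DIR"] == "/tmp/data". Running
#   DATA_DIR=/mnt/data python -m app.main search --q "..."
# lets the process environment win, so the same lookup yields "/mnt/data".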

def ensure_index_exists(env: dict):
    """
    Ensure a FAISS index exists in env['INDEX_DIR'].
    If missing, run a minimal ingest using config/sources.yaml.
    """
    index_dir = Path(env["INDEX_DIR"])
    faiss_idx = index_dir / "faiss.index"
    meta_json = index_dir / "meta.json"
    if faiss_idx.exists() and meta_json.exists():
        return  # already built
    print("Index not found. Building now via ingest() …")
    # Ingest reads config and writes index/meta/docstore.
    # If your ingest needs API keys, set them in Space Settings → Variables.
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")

def cmd_ingest(_args):
    env = get_env()
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")

def cmd_search(args):
    env = get_env()
    ensure_index_exists(env)
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")
    res = search(args.q, env, top_k=args.k, filters=filters)
    for r in res:
        geo = r.get("geo")
        if isinstance(geo, list):
            geo = ",".join(geo)
        print(f"- {r.get('title', '(no title)')} [{r.get('source', '')}] ({geo}) score={r.get('score', 0):.3f}")
        print(f"  {r.get('url', '')}")

def cmd_export(args):
    env = get_env()
    ensure_index_exists(env)
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")
    res = search(args.q, env, top_k=args.k, filters=filters)
    out = Path(env["EXPORT_DIR"]) / (args.out or "results.csv")
    pd.DataFrame(res).to_csv(out, index=False)
    print(f"Exported {len(res)} rows to {out}")

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    sub = p.add_subparsers(dest="cmd")

    p_ing = sub.add_parser("ingest", help="Ingest sources and build index")
    p_ing.set_defaults(func=cmd_ingest)

    p_search = sub.add_parser("search", help="Search index")
    p_search.add_argument("--q", required=True)
    p_search.add_argument("--k", type=int, default=15)
    p_search.add_argument("--geo", default="")
    p_search.add_argument("--categories", default="")
    p_search.set_defaults(func=cmd_search)

    p_export = sub.add_parser("export", help="Export search results to CSV")
    p_export.add_argument("--q", required=True)
    p_export.add_argument("--k", type=int, default=50)
    p_export.add_argument("--geo", default="")
    p_export.add_argument("--categories", default="")
    p_export.add_argument("--out", default="results.csv")
    p_export.set_defaults(func=cmd_export)

    args = p.parse_args()
    if not args.cmd:
        p.print_help()
    else:
        args.func(args)
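
# Example invocations (run from the repo root so the app.* imports resolve;
# query strings are placeholders):
#   python -m app.main ingest
#   python -m app.main search --q "youth employment" --geo DE,AT --k 10
#   python -m app.main export --q "youth employment" --out hits.csv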