Spaces:
Running
Running
# app/main.py
import argparse, os, json
from pathlib import Path
from dotenv import dotenv_values
import pandas as pd
from app.ingest import ingest
from app.search import search
from app.paths import DATA_DIR, DOCSTORE_DIR, INDEX_DIR, EXPORT_DIR  # ← canonical paths
def get_env():
    """
    Build the settings mapping shared by the CLI commands.

    Values are layered, lowest to highest priority:
      1. entries from a local ``.env`` file (if present),
      2. the process environment (so shell variables / Space secrets
         override ``.env``),
      3. defaults from ``app.paths`` for DATA_DIR, DOCSTORE_DIR, INDEX_DIR
         and EXPORT_DIR, plus ``SHOW_DEV="0"``.

    A directory setting that is missing, empty, or value-less in ``.env``
    (``dotenv_values`` yields ``None`` for a bare ``KEY`` line) falls back
    to its default, so each directory entry is always a usable path string.

    Returns:
        dict: the merged settings. The four directory entries have been
        created on disk (including parents) by the time this returns.
    """
    dir_defaults = {
        "DATA_DIR": DATA_DIR,
        "DOCSTORE_DIR": DOCSTORE_DIR,
        "INDEX_DIR": INDEX_DIR,
        "EXPORT_DIR": EXPORT_DIR,
    }

    # 1) Start with .env (if present)
    env = dict(dotenv_values(".env") or {})

    # 2) Merge in process env (so Space secrets / shell vars override .env)
    env.update(os.environ)

    # 3) Provide safe defaults from app.paths. A plain setdefault() would keep
    #    a None/"" value, which then breaks (or misdirects) Path() below.
    for key, default in dir_defaults.items():
        if not env.get(key):
            env[key] = str(default)

    # Optional UI/debug flags
    env.setdefault("SHOW_DEV", "0")

    # 4) Ensure directories exist
    for key in dir_defaults:
        Path(env[key]).mkdir(parents=True, exist_ok=True)

    return env
def ensure_index_exists(env: dict):
    """
    Make sure a FAISS index is present in ``env['INDEX_DIR']``.

    The index counts as built only when both ``faiss.index`` and
    ``meta.json`` exist in that directory. If either file is missing,
    ``ingest`` is run against ``config/sources.yaml`` and a short summary
    is printed to stdout.

    Args:
        env: settings mapping; must contain ``INDEX_DIR``. It is passed
            through unchanged to ``ingest``.

    Returns:
        None.
    """
    index_dir = Path(env["INDEX_DIR"])
    required_files = (index_dir / "faiss.index", index_dir / "meta.json")
    if all(f.exists() for f in required_files):
        return  # already built

    print("Index not found. Building now via ingest() …")
    # Ingest is expected to read the config and write index/meta/docstore.
    # If your ingest needs API keys, set them in Space Settings → Variables.
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")
def cmd_ingest(_args):
    """
    CLI handler for the ``ingest`` sub-command.

    Runs ``ingest`` on ``config/sources.yaml`` with the settings from
    ``get_env()`` and prints a one-line summary. The parsed arguments are
    accepted only so all handlers share one call signature; they are unused.
    """
    docstore_path, record_count = ingest("config/sources.yaml", get_env())
    print(f"Ingest complete. {record_count} records. Docstore: {docstore_path}")
def cmd_search(args):
    """
    CLI handler for the ``search`` sub-command.

    Builds the index first if it is missing, runs ``search`` with the query
    (``args.q``), result limit (``args.k``) and optional comma-separated
    ``geo`` / ``categories`` filters, then prints two lines per result:
    a title/source/geo/score line followed by the URL.
    """
    env = get_env()
    ensure_index_exists(env)

    # Only non-empty options become filters; each is split on commas as-is.
    filters = {
        key: raw.split(",")
        for key, raw in (("geo", args.geo), ("categories", args.categories))
        if raw
    }

    hits = search(args.q, env, top_k=args.k, filters=filters)
    for hit in hits:
        geo = hit.get("geo")
        # A list of regions is shown comma-joined; anything else is printed as-is.
        geo_text = ",".join(geo) if isinstance(geo, list) else geo
        title = hit.get("title", "(no title)")
        source = hit.get("source", "")
        score = hit.get("score", 0)
        print(f"- {title} [{source}] ({geo_text}) score={score:.3f}")
        print(f" {hit.get('url','')}")
def cmd_export(args):
    """
    CLI handler for the ``export`` sub-command.

    Builds the index first if it is missing, runs ``search`` with the query
    (``args.q``), result limit (``args.k``) and optional comma-separated
    ``geo`` / ``categories`` filters, and writes the results to a CSV file
    under ``env['EXPORT_DIR']``.

    ``args.out`` names the output file (``results.csv`` when empty). It may
    include sub-directories, which are created as needed.
    """
    env = get_env()
    ensure_index_exists(env)

    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")

    res = search(args.q, env, top_k=args.k, filters=filters)

    out = Path(env["EXPORT_DIR"]) / (args.out or "results.csv")
    # get_env() only guarantees EXPORT_DIR itself; an --out such as
    # "runs/today.csv" needs its own parent directory before to_csv can write.
    out.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(res).to_csv(out, index=False)
    print(f"Exported {len(res)} rows to {out}")
if __name__ == "__main__":
    # Command-line entry point: three sub-commands (ingest / search / export),
    # each bound to its handler through the parser's `func` default.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd")

    ingest_cmd = commands.add_parser("ingest", help="Ingest sources and build index")
    ingest_cmd.set_defaults(func=cmd_ingest)

    search_cmd = commands.add_parser("search", help="Search index")
    export_cmd = commands.add_parser("export", help="Export search results to CSV")

    # search and export take the same query options; only the default
    # result limit differs (15 for search, 50 for export).
    for query_cmd, default_k in ((search_cmd, 15), (export_cmd, 50)):
        query_cmd.add_argument("--q", required=True)
        query_cmd.add_argument("--k", type=int, default=default_k)
        query_cmd.add_argument("--geo", default="")
        query_cmd.add_argument("--categories", default="")
    export_cmd.add_argument("--out", default="results.csv")

    search_cmd.set_defaults(func=cmd_search)
    export_cmd.set_defaults(func=cmd_export)

    args = parser.parse_args()
    if args.cmd:
        args.func(args)
    else:
        # No sub-command given: show usage instead of failing.
        parser.print_help()