Spaces:
Running
Running
File size: 4,143 Bytes
99589b3 598f5cb 99589b3 598f5cb f0c8529 598f5cb a58d2db 99589b3 a58d2db 99589b3 a58d2db 99589b3 a58d2db 99589b3 a58d2db 99589b3 a58d2db 99589b3 a58d2db 598f5cb f0c8529 99589b3 f0c8529 99589b3 598f5cb f0c8529 598f5cb 99589b3 598f5cb 99589b3 598f5cb f0c8529 598f5cb 99589b3 598f5cb f0c8529 598f5cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# app/main.py
import argparse, os, json
from pathlib import Path
from dotenv import dotenv_values
import pandas as pd
from app.ingest import ingest
from app.search import search
from app.paths import DATA_DIR, DOCSTORE_DIR, INDEX_DIR, EXPORT_DIR  # canonical paths
def get_env():
    """
    Build the runtime settings dict used by every CLI command.

    Precedence, lowest to highest:
    - values parsed from a local ".env" file (if any),
    - the process environment (os.environ), which overrides ".env",
    and, for keys still missing afterwards, the directory constants
    imported from app.paths plus SHOW_DEV="0".

    Side effect: the four configured directories are created if they do
    not already exist. Returns the merged dict.
    """
    # Lowest priority: whatever the .env file declares (may be absent).
    settings = dict(dotenv_values(".env") or {})

    # Real environment variables win over .env entries.
    settings.update(os.environ)

    # Fill in directory locations that nobody configured explicitly.
    directory_defaults = {
        "DATA_DIR": DATA_DIR,
        "DOCSTORE_DIR": DOCSTORE_DIR,
        "INDEX_DIR": INDEX_DIR,
        "EXPORT_DIR": EXPORT_DIR,
    }
    for name, fallback in directory_defaults.items():
        settings.setdefault(name, str(fallback))

    # Optional UI/debug flag, off unless explicitly set.
    settings.setdefault("SHOW_DEV", "0")

    # Make sure every configured directory is present on disk.
    for name in directory_defaults:
        Path(settings[name]).mkdir(parents=True, exist_ok=True)

    return settings
def ensure_index_exists(env: dict):
    """
    Build the search index on first use if it is not already on disk.

    The index counts as present only when BOTH "faiss.index" and
    "meta.json" exist inside env["INDEX_DIR"]. If either is missing,
    ingest() is run against "config/sources.yaml" with the same env.

    Args:
        env: settings mapping; must contain "INDEX_DIR". It is passed
            through unchanged to ingest().

    Returns:
        None. Progress is reported on stdout.
    """
    index_dir = Path(env["INDEX_DIR"])
    required_files = (index_dir / "faiss.index", index_dir / "meta.json")
    if all(artifact.exists() for artifact in required_files):
        return  # already built

    print("Index not found. Building now via ingest() …")
    # ingest() is expected to read the config and produce the index files
    # (its behavior is defined in app.ingest, not here).
    # If your ingest needs API keys, set them in Space Settings → Variables.
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")
def cmd_ingest(_args):
    """
    CLI handler for the "ingest" sub-command.

    Runs ingest() on "config/sources.yaml" with the settings from
    get_env() and prints a one-line summary. The parsed arguments are
    accepted for handler-signature uniformity but not used.
    """
    docstore_path, record_count = ingest("config/sources.yaml", get_env())
    print(f"Ingest complete. {record_count} records. Docstore: {docstore_path}")
def cmd_search(args):
    """
    CLI handler for the "search" sub-command.

    Ensures an index exists, runs search() for args.q limited to args.k
    results, and prints each hit as two lines: a summary line (title,
    source, geo, score) followed by the URL.

    Args:
        args: parsed argparse namespace with attributes q, k, geo and
            categories. geo and categories are comma-separated strings;
            an empty string means "no filter".
    """
    env = get_env()
    ensure_index_exists(env)

    filters = {}
    for key, raw in (("geo", args.geo), ("categories", args.categories)):
        # str.split(",") keeps surrounding whitespace and empty pieces, so
        # "US, CA" or "US," would otherwise yield values such as " CA" or "".
        # Trim each token and drop blanks before filtering.
        values = [token.strip() for token in (raw or "").split(",") if token.strip()]
        if values:
            filters[key] = values

    for hit in search(args.q, env, top_k=args.k, filters=filters):
        geo = hit.get("geo")
        if isinstance(geo, list):
            # Collapse a list of geo values into one printable string.
            geo = ",".join(geo)
        print(f"- {hit.get('title','(no title)')} [{hit.get('source','')}] ({geo}) score={hit.get('score',0):.3f}")
        print(f" {hit.get('url','')}")
def cmd_export(args):
    """
    CLI handler for the "export" sub-command.

    Ensures an index exists, runs search() for args.q limited to args.k
    results, and writes the results to a CSV file under env["EXPORT_DIR"].

    Args:
        args: parsed argparse namespace with attributes q, k, geo,
            categories and out. geo and categories are comma-separated
            strings (empty means "no filter"); out is the output file
            name, defaulting to "results.csv" when empty.
    """
    env = get_env()
    ensure_index_exists(env)

    filters = {}
    for key, raw in (("geo", args.geo), ("categories", args.categories)):
        # str.split(",") keeps surrounding whitespace and empty pieces, so
        # "US, CA" or "US," would otherwise yield values such as " CA" or "".
        # Trim each token and drop blanks before filtering.
        values = [token.strip() for token in (raw or "").split(",") if token.strip()]
        if values:
            filters[key] = values

    res = search(args.q, env, top_k=args.k, filters=filters)

    out = Path(env["EXPORT_DIR"]) / (args.out or "results.csv")
    # --out may name a sub-path (e.g. "runs/today.csv"); to_csv does not
    # create missing directories, so make sure the parent exists first.
    out.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(res).to_csv(out, index=False)
    print(f"Exported {len(res)} rows to {out}")
if __name__ == "__main__":
    # Command-line entry point: three sub-commands, each bound to its
    # handler through the "func" default.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd")

    ingest_parser = commands.add_parser("ingest", help="Ingest sources and build index")
    ingest_parser.set_defaults(func=cmd_ingest)

    search_parser = commands.add_parser("search", help="Search index")
    export_parser = commands.add_parser("export", help="Export search results to CSV")

    # "search" and "export" share the same query options; only the default
    # result count and the handler differ.
    for query_parser, default_k, handler in (
        (search_parser, 15, cmd_search),
        (export_parser, 50, cmd_export),
    ):
        query_parser.add_argument("--q", required=True)
        query_parser.add_argument("--k", type=int, default=default_k)
        query_parser.add_argument("--geo", default="")
        query_parser.add_argument("--categories", default="")
        query_parser.set_defaults(func=handler)

    # Only "export" writes a file.
    export_parser.add_argument("--out", default="results.csv")

    args = parser.parse_args()
    if args.cmd:
        args.func(args)
    else:
        # No sub-command given: show usage instead of failing.
        parser.print_help()
|