File size: 4,143 Bytes
99589b3
598f5cb
 
 
 
 
 
 
99589b3
598f5cb
f0c8529
598f5cb
a58d2db
99589b3
 
 
 
a58d2db
99589b3
 
a58d2db
99589b3
 
 
a58d2db
99589b3
 
 
 
 
a58d2db
99589b3
 
a58d2db
99589b3
 
 
a58d2db
598f5cb
 
f0c8529
 
 
 
 
 
 
 
 
 
 
 
 
 
99589b3
 
f0c8529
 
 
 
99589b3
598f5cb
 
 
 
f0c8529
598f5cb
 
99589b3
598f5cb
 
 
 
 
 
 
99589b3
 
 
 
 
598f5cb
f0c8529
598f5cb
 
99589b3
598f5cb
 
 
 
 
 
 
 
 
 
f0c8529
598f5cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# app/main.py
import argparse, os, json
from pathlib import Path
from dotenv import dotenv_values
import pandas as pd

from app.ingest import ingest
from app.search import search
from app.paths import DATA_DIR, DOCSTORE_DIR, INDEX_DIR, EXPORT_DIR  # ← canonical paths


def get_env():
    """
    Build the runtime environment mapping with repo-relative defaults.

    Precedence (lowest to highest): canonical defaults from app.paths,
    values read from a local .env file, then real process environment
    variables (so Space secrets / shell vars always win).
    Also guarantees that every configured directory exists on disk.
    """
    # .env values form the base layer (absent file yields an empty mapping).
    env = dict(dotenv_values(".env") or {})

    # Process environment overrides anything that came from .env.
    env.update(os.environ)

    # Fill in any path still missing with the canonical app.paths values.
    # These repo-relative defaults work on macOS and Hugging Face Spaces.
    path_defaults = {
        "DATA_DIR": DATA_DIR,
        "DOCSTORE_DIR": DOCSTORE_DIR,
        "INDEX_DIR": INDEX_DIR,
        "EXPORT_DIR": EXPORT_DIR,
    }
    for key, default in path_defaults.items():
        env.setdefault(key, str(default))

    # Optional UI/debug flag, off unless explicitly enabled.
    env.setdefault("SHOW_DEV", "0")

    # Make sure every directory we rely on is actually present.
    for key in path_defaults:
        Path(env[key]).mkdir(parents=True, exist_ok=True)

    return env


def ensure_index_exists(env: dict):
    """
    Make sure a FAISS index is present under env['INDEX_DIR'].

    When either the index file or its metadata is missing, a minimal
    ingest is run from config/sources.yaml to (re)build everything.
    """
    index_dir = Path(env["INDEX_DIR"])
    have_index = (index_dir / "faiss.index").exists()
    have_meta = (index_dir / "meta.json").exists()

    if have_index and have_meta:
        return  # nothing to do — index was built previously

    print("Index not found. Building now via ingest() …")
    # ingest() reads the YAML config and writes index/meta/docstore.
    # If it needs API keys, configure them in Space Settings → Variables.
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")


def cmd_ingest(_args):
    """CLI handler: run a full ingest from config/sources.yaml."""
    path, n = ingest("config/sources.yaml", get_env())
    print(f"Ingest complete. {n} records. Docstore: {path}")


def cmd_search(args):
    """CLI handler: query the index and pretty-print each hit."""
    env = get_env()
    ensure_index_exists(env)

    # Comma-separated CLI values become list-valued filters; empty → omitted.
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")

    for hit in search(args.q, env, top_k=args.k, filters=filters):
        geo = hit.get("geo")
        geo = ",".join(geo) if isinstance(geo, list) else geo
        title = hit.get("title", "(no title)")
        source = hit.get("source", "")
        score = hit.get("score", 0)
        print(f"- {title} [{source}] ({geo}) score={score:.3f}")
        print(f"  {hit.get('url','')}")


def cmd_export(args):
    """CLI handler: run a search and dump the result rows to a CSV file."""
    env = get_env()
    ensure_index_exists(env)

    # Comma-separated CLI values become list-valued filters; empty → omitted.
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")

    rows = search(args.q, env, top_k=args.k, filters=filters)

    # Fall back to results.csv should --out arrive empty.
    filename = args.out or "results.csv"
    target = Path(env["EXPORT_DIR"]) / filename
    pd.DataFrame(rows).to_csv(target, index=False)
    print(f"Exported {len(rows)} rows to {target}")


def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser with ingest/search/export subcommands.

    Each subparser binds its handler via set_defaults(func=...), so
    dispatch in main() is a single attribute call.
    """
    p = argparse.ArgumentParser()
    sub = p.add_subparsers(dest="cmd")

    p_ing = sub.add_parser("ingest", help="Ingest sources and build index")
    p_ing.set_defaults(func=cmd_ingest)

    p_search = sub.add_parser("search", help="Search index")
    p_search.add_argument("--q", required=True)
    p_search.add_argument("--k", type=int, default=15)
    p_search.add_argument("--geo", default="")
    p_search.add_argument("--categories", default="")
    p_search.set_defaults(func=cmd_search)

    p_export = sub.add_parser("export", help="Export search results to CSV")
    p_export.add_argument("--q", required=True)
    p_export.add_argument("--k", type=int, default=50)
    p_export.add_argument("--geo", default="")
    p_export.add_argument("--categories", default="")
    p_export.add_argument("--out", default="results.csv")
    p_export.set_defaults(func=cmd_export)

    return p


def main() -> None:
    """Parse CLI arguments and dispatch to the chosen subcommand handler.

    Prints help when no subcommand was given (subparsers are optional
    here, so args.cmd may be None).
    """
    parser = _build_parser()
    args = parser.parse_args()
    if not args.cmd:
        parser.print_help()
    else:
        args.func(args)


if __name__ == "__main__":
    main()