grants-rag / app /normalize.py
Gen. Overseer Lupo
Add local extra JSON source and update config
99589b3
from typing import Any, Dict, Callable, Optional
from datetime import datetime
def _iso(d: Any) -> Optional[str]:
    """Coerce a date-like value to a ``YYYY-MM-DD`` string.

    The value is stringified, stripped of surrounding whitespace, and tried
    against a few explicit ISO-style layouts before falling back to
    ``datetime.fromisoformat``. Any time-of-day component is discarded.

    Args:
        d: Raw value to parse (typically a string). Falsy values are
            treated as "no date".

    Returns:
        The calendar date in ISO format, or ``None`` when *d* is falsy or
        matches none of the supported layouts (for example ``MM/DD/YYYY``
        is not recognized).
    """
    if not d:
        return None
    s = str(d).strip()
    if not s:
        return None
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"):
        try:
            return datetime.strptime(s, fmt).date().isoformat()
        except ValueError:
            # Layout mismatch: try the next candidate format.
            continue
    # fromisoformat() only accepts a trailing "Z" (UTC designator) on
    # Python 3.11+; retry with an explicit offset so older interpreters
    # parse such timestamps too. The untouched string is tried first so
    # anything that parsed before still parses the same way.
    candidates = [s]
    if s.endswith("Z"):
        candidates.append(s[:-1] + "+00:00")
    for candidate in candidates:
        try:
            return datetime.fromisoformat(candidate).date().isoformat()
        except ValueError:
            continue
    return None
def _first(x: Any) -> Any:
    """Return the leading element of a non-empty list or tuple.

    Anything else, including an empty list or tuple, is returned unchanged.
    """
    if not isinstance(x, (list, tuple)):
        return x
    if not x:
        # Empty sequence: nothing to unwrap, hand it back as-is.
        return x
    return x[0]
def _list(x: Any) -> list:
    """Coerce *x* to a list.

    A list is returned as the same object (no copy), ``None`` becomes an
    empty list, a set or tuple is converted, and any other value is wrapped
    in a single-element list.
    """
    if isinstance(x, list):
        return x
    if x is None:
        return []
    return list(x) if isinstance(x, (set, tuple)) else [x]
# Source key -> function that converts a raw record into the unified schema.
MAPPERS: Dict[str, Callable[[Dict[str, Any]], Dict[str, Any]]] = {}


def mapper(name: str):
    """Build a decorator that files the decorated function in ``MAPPERS``.

    The function is stored under *name* (replacing any earlier entry with
    the same key) and returned unmodified, so it remains directly callable.
    """

    def register(func: Callable[[Dict[str, Any]], Dict[str, Any]]):
        MAPPERS.update({name: func})
        return func

    return register
@mapper("grants_gov")
def _map_grants_gov(h: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a raw ``grants_gov`` record into the unified schema.

    Most fields are read from the raw key first and fall back to the
    already-unified key when the raw one is missing or falsy. The original
    record is kept under ``"raw"``.

    Args:
        h: Raw record as a dict.

    Returns:
        A new dict in the unified schema. The optional award/eligibility
        keys are included only when the raw record carries a non-``None``
        value for them.
    """
    opp_id = h.get("id")
    opp_number = h.get("number")
    aln_values = h.get("alnist") or h.get("aln") or []

    # Prefer an explicit URL; otherwise derive the detail page from the id.
    link = h.get("url")
    if not link:
        link = f"https://www.grants.gov/search-results-detail/{opp_id}" if opp_id else None

    out: Dict[str, Any] = {
        "id": f"gg:{opp_number or opp_id}",
        "source": "grants.gov",
        "title": h.get("title"),
        "agency": h.get("agencyName") or h.get("agencyCode") or h.get("agency"),
        "program_number": _first(aln_values) or h.get("program_number"),
        "posted_date": _iso(h.get("openDate") or h.get("posted_date")),
        "deadline": _iso(h.get("closeDate") or h.get("deadline")),
        "synopsis": h.get("synopsis") or h.get("summary"),
        "location_scope": h.get("location_scope") or ["US"],
        "tags": h.get("tags") or [],
        "url": link,
        "raw": h,
    }

    # Optional fields: the unified key wins over the raw key, and the field
    # is omitted entirely when neither holds a non-None value.
    optional_fields = (
        ("awardFloor", "award_floor"),
        ("awardCeiling", "award_ceiling"),
        ("expectedNumberOfAwards", "expected_awards"),
        ("eligibility", "eligibility"),
    )
    for raw_key, unified_key in optional_fields:
        value = h.get(unified_key)
        if value is None:
            value = h.get(raw_key)
        if value is not None:
            out[unified_key] = value
    return out
@mapper("local_sample")
def _map_local_sample(op: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a raw ``local_sample`` record into the unified schema.

    Program number, tags and URL are not read from the record and get fixed
    placeholder values; location scope is always ``["US"]``. The original
    record is kept under ``"raw"``.
    """
    number = op.get("opportunityNumber")
    posted = _iso(op.get("postedDate"))
    closes = _iso(op.get("closeDate"))

    record: Dict[str, Any] = {}
    record["id"] = f"sample:{number}"
    record["source"] = "sample_local"
    record["title"] = op.get("opportunityTitle")
    record["agency"] = op.get("agency")
    record["program_number"] = None
    record["posted_date"] = posted
    record["deadline"] = closes
    record["synopsis"] = op.get("synopsis")
    record["location_scope"] = ["US"]
    record["tags"] = []
    record["url"] = None
    record["raw"] = op
    return record
def normalize(source_key: str, raw: Dict[str, Any], static: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Map a raw record to the unified schema and attach static metadata.

    Args:
        source_key: Key of the mapper registered in ``MAPPERS``.
        raw: Raw record handed to that mapper.
        static: Optional per-source metadata. A truthy ``"geo"`` value is
            copied onto the record; ``"categories"`` (a single value or a
            collection) are merged into the record's ``"categories"`` and
            mirrored into its ``"tags"``.

    Returns:
        The mapped record, with ``"categories"`` always present and
        ``"tags"`` de-duplicated in first-seen order (the record's own tags
        first, then the static categories).

    Raises:
        KeyError: If no mapper is registered for *source_key*.
    """
    if source_key not in MAPPERS:
        raise KeyError(f"No mapper registered for {source_key!r}")
    rec = MAPPERS[source_key](raw)
    static = static or {}

    # attach geo
    if static.get("geo"):
        rec["geo"] = static["geo"]

    # attach categories and mirror into tags
    cats = _list(static.get("categories"))
    categories = rec.setdefault("categories", [])
    for c in cats:
        if c not in categories:
            categories.append(c)

    # dict.fromkeys de-duplicates while keeping insertion order. A set would
    # yield an arbitrary order that can differ between interpreter runs for
    # string tags, making the output non-reproducible.
    rec["tags"] = list(dict.fromkeys(_list(rec.get("tags")) + cats))
    return rec