gabrielaltay's picture
update regex for legis url replacement
9ee8cbe
"""Text processing utilities for LegisQA"""
import re
# Maps the short legislation-type code found in a legislation ID
# (the middle component, e.g. "hr") to the path segment that
# Congress.gov uses for that type in bill URLs.
CONGRESS_GOV_TYPE_MAP = dict(
    hconres="house-concurrent-resolution",
    hjres="house-joint-resolution",
    hr="house-bill",
    hres="house-resolution",
    s="senate-bill",
    sconres="senate-concurrent-resolution",
    sjres="senate-joint-resolution",
    sres="senate-resolution",
)
def escape_markdown(text: str) -> str:
    """Return `text` with markdown special characters backslash-escaped.

    Every occurrence of one of the characters in the special set
    (backslash, backtick, ``*_{}[]()#+-.!$``) is prefixed with a single
    backslash; all other characters are passed through untouched.
    """
    specials = r"\`*_{}[]()#+-.!$"
    # One translation table, one pass over the input: each special
    # character maps to itself preceded by a backslash.
    escape_table = str.maketrans({ch: "\\" + ch for ch in specials})
    return text.translate(escape_table)
def get_sponsor_url(bioguide_id: str) -> str:
    """Build the bioguide.congress.gov biography URL for `bioguide_id`."""
    bio_base = "https://bioguide.congress.gov/search/bio"
    return f"{bio_base}/{bioguide_id}"
def get_congress_gov_url(congress_num: int, legis_type: str, legis_num: int) -> str:
    """Generate the Congress.gov URL for a piece of legislation.

    Args:
        congress_num: Congress number (an int, or anything `int()` accepts,
            such as a numeric string).
        legis_type: Short legislation-type code; must be a key of
            `CONGRESS_GOV_TYPE_MAP` (e.g. "hr", "sres").
        legis_num: Legislation number (an int or numeric string).

    Returns:
        A URL of the form
        ``https://www.congress.gov/bill/<N><suffix>-congress/<type>/<num>``
        where ``<suffix>`` is the English ordinal suffix for ``N``
        ("118th", "103rd", "101st", ...).

    Raises:
        KeyError: if `legis_type` is not in `CONGRESS_GOV_TYPE_MAP`.
        ValueError: if `congress_num` or `legis_num` cannot be converted
            to an int.
    """
    lt = CONGRESS_GOV_TYPE_MAP[legis_type]
    congress = int(congress_num)
    # English ordinals: 11-13 (and the rest of the teens) always take
    # "th"; otherwise the last digit decides (1 -> st, 2 -> nd, 3 -> rd).
    if 10 <= congress % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(congress % 10, "th")
    return (
        f"https://www.congress.gov/bill/{congress}{suffix}-congress/{lt}/{int(legis_num)}"
    )
def legis_id_to_link(legis_id: str) -> str:
    """Turn a ``<congress>-<type>-<number>`` legislation ID into its Congress.gov URL.

    The ID is split on "-" and must yield exactly three components;
    anything else raises ValueError from the unpacking.
    """
    pieces = legis_id.split("-")
    congress, kind, number = pieces
    return get_congress_gov_url(
        congress_num=congress,
        legis_type=kind,
        legis_num=number,
    )
def legis_id_match_to_link(matchobj):
    """Convert a regex match for a legislation ID into a markdown link.

    Intended for use as the replacement callback of `re.sub`.

    Args:
        matchobj: A match object whose full match is a legislation ID of
            the form ``<congress>-<type>-<number>``.

    Returns:
        ``[<id>](<url>)`` where ``<url>`` comes from `legis_id_to_link`.
        If the type code in the ID is not a known legislation type, the
        matched text is returned unchanged so the surrounding text is
        left intact.
    """
    mstring = matchobj.group(0)
    try:
        url = legis_id_to_link(mstring)
    except KeyError:
        # The matching pattern accepts any lowercase word as the type
        # code; an unrecognised one is not a real legislation ID, so
        # leave the text as-is rather than aborting the substitution.
        return mstring
    return f"[{mstring}]({url})"
def replace_legis_ids_with_urls(text: str) -> str:
    """Return `text` with each legislation ID rewritten as a markdown link.

    An ID is a three-digit congress number, a lowercase type code and a
    1-5 digit legislation number joined by hyphens; each match is handed
    to `legis_id_match_to_link` to produce its replacement.
    """
    legis_id_re = re.compile("1[12][3456789]-[a-z]+-\\d{1,5}")
    return legis_id_re.sub(legis_id_match_to_link, text)