"""Text processing utilities for LegisQA""" import re CONGRESS_GOV_TYPE_MAP = { "hconres": "house-concurrent-resolution", "hjres": "house-joint-resolution", "hr": "house-bill", "hres": "house-resolution", "s": "senate-bill", "sconres": "senate-concurrent-resolution", "sjres": "senate-joint-resolution", "sres": "senate-resolution", } def escape_markdown(text: str) -> str: """Escape markdown special characters in text""" MD_SPECIAL_CHARS = r"\`*_{}[]()#+-.!$" for char in MD_SPECIAL_CHARS: text = text.replace(char, "\\" + char) return text def get_sponsor_url(bioguide_id: str) -> str: """Generate URL for a sponsor's bioguide page""" return f"https://bioguide.congress.gov/search/bio/{bioguide_id}" def get_congress_gov_url(congress_num: int, legis_type: str, legis_num: int) -> str: """Generate Congress.gov URL for a piece of legislation""" lt = CONGRESS_GOV_TYPE_MAP[legis_type] return f"https://www.congress.gov/bill/{int(congress_num)}th-congress/{lt}/{int(legis_num)}" def legis_id_to_link(legis_id: str) -> str: """Convert a legislation ID to a Congress.gov URL""" congress_num, legis_type, legis_num = legis_id.split("-") return get_congress_gov_url(congress_num, legis_type, legis_num) def legis_id_match_to_link(matchobj): """Convert a regex match object to a markdown link""" mstring = matchobj.string[matchobj.start() : matchobj.end()] url = legis_id_to_link(mstring) link = f"[{mstring}]({url})" return link def replace_legis_ids_with_urls(text: str) -> str: """Replace legislation IDs in text with markdown links""" pattern = "1[12][3456789]-[a-z]+-\\d{1,5}" rtext = re.sub(pattern, legis_id_match_to_link, text) return rtext