MedCodeMCP / src /parse_tabular.py
gpaasch's picture
repo structure cleanup (https://chatgpt.com/share/6841a61b-3dfc-8004-aec7-9c48fba857fd)
8e61455
raw
history blame
1.58 kB
import xml.etree.ElementTree as ET
import json
import sys
import os
def main(xml_path, out_path="icd_to_description.json"):
    """Flatten a tabular XML file into a JSON map of code -> description.

    Every ``<diag>`` element in the document is visited, at any nesting
    depth. For each one, the text of its direct ``<name>`` child (the ICD-10
    code) and direct ``<desc>`` child (the human-readable description) is
    stripped of surrounding whitespace and stored. A ``<diag>`` that lacks
    either child, or whose text is missing or blank, is skipped. If the same
    code occurs more than once, the last occurrence wins.

    Args:
        xml_path: Path to the tabular XML file to read.
        out_path: Path of the JSON file to write. Defaults to
            ``icd_to_description.json`` in the current working directory.

    Raises:
        SystemExit: With status 1 when ``xml_path`` is not an existing file.
        xml.etree.ElementTree.ParseError: Propagated unchanged when the file
            is not well-formed XML.
    """
    if not os.path.isfile(xml_path):
        # Diagnostics go to stderr so they never mix with regular output.
        print(f"ERROR: cannot find tabular XML at '{xml_path}'", file=sys.stderr)
        sys.exit(1)

    root = ET.parse(xml_path).getroot()

    icd_to_description = {}
    for diag in root.iter("diag"):
        # findtext() only inspects direct children, so a parent <diag> never
        # picks up the <name>/<desc> of one of its nested sub-codes. It
        # returns None for a missing child and "" for an empty one; both
        # collapse to "" here and are rejected by the check below.
        code = (diag.findtext("name") or "").strip()
        description = (diag.findtext("desc") or "").strip()
        if code and description:
            icd_to_description[code] = description

    # ensure_ascii=False keeps any non-ASCII description text readable
    # instead of emitting \uXXXX escapes.
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(icd_to_description, fp, indent=2, ensure_ascii=False)

    print(f"Wrote {len(icd_to_description)} code entries to {out_path}")
if __name__ == "__main__":
    # The script takes exactly one positional argument: the path to the
    # tabular XML file. Anything else prints the usage line and exits with 1.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python parse_tabular.py <path/to/icd10cm_tabular_2025.xml>")
        sys.exit(1)

    main(cli_args[0])