Spaces:

Agents-MCP-Hackathon
/

MedCodeMCP

Running

MedCodeMCP / src /parse_tabular.py

repo structure cleanup (https://chatgpt.com/share/6841a61b-3dfc-8004-aec7-9c48fba857fd)

8e61455 about 2 months ago

1.58 kB

	import xml.etree.ElementTree as ET
	import json
	import sys
	import os

	def main(xml_path, out_path="icd_to_description.json"):
	    """Convert an ICD-10 tabular XML file into a flat JSON code -> description map.

	    Every <diag> element in the document (at any nesting depth) that has both
	    a non-empty <name> and a non-empty <desc> contributes one entry. The
	    mapping is written to *out_path* as UTF-8 JSON and a one-line summary is
	    printed to stdout.

	    Args:
	        xml_path: Path to the tabular XML file to read.
	        out_path: Path of the JSON file to write. Defaults to
	            "icd_to_description.json" in the current working directory.

	    Raises:
	        SystemExit: With exit status 1 when *xml_path* is not an existing
	            file or does not contain well-formed XML. The reason is printed
	            to stderr.
	    """
	    if not os.path.isfile(xml_path):
	        print(f"ERROR: cannot find tabular XML at '{xml_path}'", file=sys.stderr)
	        sys.exit(1)

	    # A malformed file should fail the same way a missing one does: a short
	    # message and exit status 1, not an uncaught-exception traceback.
	    try:
	        root = ET.parse(xml_path).getroot()
	    except ET.ParseError as err:
	        print(f"ERROR: '{xml_path}' is not well-formed XML: {err}", file=sys.stderr)
	        sys.exit(1)

	    icd_to_description = _collect_diag_descriptions(root)

	    # Write out a flat JSON mapping code → description.
	    # ensure_ascii=False keeps non-ASCII description text readable in the file.
	    with open(out_path, "w", encoding="utf-8") as fp:
	        json.dump(icd_to_description, fp, indent=2, ensure_ascii=False)

	    print(f"Wrote {len(icd_to_description)} code entries to {out_path}")


	def _collect_diag_descriptions(root):
	    """Return a dict mapping each <diag> code to its description.

	    Walks every <diag> element under *root* recursively (including *root*
	    itself if it is a <diag>). Each <diag> is expected to have a direct
	    <name> child (the ICD-10 code) and a direct <desc> child (the
	    human-readable description); nested <diag> children are visited as
	    separate entries. Text is stripped of surrounding whitespace, and a
	    <diag> whose code or description is missing or blank is skipped. If the
	    same code occurs more than once, the last occurrence wins.
	    """
	    icd_to_description = {}
	    for diag in root.iter("diag"):
	        # findtext() gives None for a missing child and "" for an empty one;
	        # `or ""` folds both cases into the single emptiness check below.
	        code = (diag.findtext("name") or "").strip()
	        description = (diag.findtext("desc") or "").strip()
	        # Only store non-empty strings.
	        if code and description:
	            icd_to_description[code] = description
	    return icd_to_description


	# Script entry point: requires exactly one command-line argument, the path
	# to the tabular XML file, which is handed to main().
	if __name__ == "__main__":
	    cli_args = sys.argv[1:]
	    if len(cli_args) != 1:
	        print("Usage: python parse_tabular.py <path/to/icd10cm_tabular_2025.xml>")
	        sys.exit(1)
	    main(cli_args[0])