Spaces:
Running
Running
File size: 44,950 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 |
#!/usr/bin/env python3
# update_docx_from_json.py
import sys, json, re
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from docx import Document
from docx.shared import RGBColor, Pt # add Pt
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from copy import deepcopy
from docx.oxml.ns import qn
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
# Font colours used by the helpers below: RED is what is_red_run() looks for,
# BLACK is what the writer helpers apply to the text they insert.
BLACK = RGBColor(0x00, 0x00, 0x00)
RED = RGBColor(0xFF, 0x00, 0x00)
# ----------------------------- text helpers -----------------------------
def _find_table_with_headers(doc: Document, must_have: list[str]) -> Optional[Table]:
    """Return the first table whose first row mentions every label in `must_have`.

    The first row's cell texts are joined and normalised with `canon`; each
    wanted label is normalised with `canon_label` and searched as a substring.
    Tables without rows are skipped. Returns None when no table matches.
    """
    wanted = [canon_label(label) for label in must_have]
    for table in doc.tables:
        if not table.rows:
            continue
        header = canon(" ".join(cell_text(cell) for cell in table.rows[0].cells))
        if all(label in header for label in wanted):
            return table
    return None
def ensure_auditor_decl_headers(doc: Document) -> bool:
    """Force the auditor-declaration header row to its two fixed labels.

    The target is the first table with at least two rows and two header cells
    whose top-left cell reads "Print Name" (per the original author's note this
    is the table under 'NHVAS APPROVED AUDITOR DECLARATION'; the code itself
    recognises it only by that shape). A header cell is rewritten in black when
    its label differs from the expected one or when it still contains a red run.
    Only row 0 is touched; the values row below is never modified.

    Returns True when at least one header cell was rewritten.
    """
    expected_left = "Print Name"
    expected_right = "NHVR or Exemplar Global Auditor Registration Number"

    def needs_reset(cell, expected: str) -> bool:
        # Rewrite when the label is wrong or any run is still a red placeholder.
        if canon_label(cell_text(cell)) != canon_label(expected):
            return True
        return any(is_red_run(run) for para in cell.paragraphs for run in para.runs)

    changed = False
    for table in doc.tables:
        if not table.rows or not table.rows[0].cells:
            continue
        header_cells = table.rows[0].cells
        looks_like_auditor_table = (
            canon_label(cell_text(header_cells[0])) == "print name"
            and len(header_cells) >= 2
            and len(table.rows) >= 2
        )
        if not looks_like_auditor_table:
            continue
        for cell, expected in ((header_cells[0], expected_left), (header_cells[1], expected_right)):
            if needs_reset(cell, expected):
                _set_cell_text_black(cell, expected)
                changed = True
        # Only the first matching table is handled.
        break
    return changed
def fill_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
    """Write the operator's name and title into row 1 of the declaration table.

    The table is the first one whose header row mentions both "Print Name" and
    "Position Title". A value cell is overwritten (in black) only if it still
    holds a red run; cells without red text are left alone.

    Returns False when no suitable table (two or more rows, two or more header
    cells) exists, True otherwise, even if no cell needed replacing.
    """
    table = _find_table_with_headers(doc, ["Print Name", "Position Title"])
    if not table:
        return False
    if len(table.rows) < 2 or len(table.rows[0].cells) < 2:
        return False
    value_row = table.rows[1]
    targets = ((value_row.cells[0], print_name), (value_row.cells[1], position_title))
    for cell, value in targets:
        has_red = any(is_red_run(run) for para in cell.paragraphs for run in para.runs)
        if has_red:
            _set_cell_text_black(cell, value)
    return True
def find_heading_index_from_end(doc: Document, heading: str) -> Optional[int]:
    """Return the index of the LAST paragraph containing `heading`.

    The index refers to the list produced by `iter_paragraphs(doc)`; matching is
    a substring test on `canon`-normalised text. Returns None if nothing matches.
    """
    needle = canon(heading)
    paragraphs = iter_paragraphs(doc)
    for offset, para in enumerate(reversed(paragraphs)):
        if needle in canon(para_text(para)):
            return len(paragraphs) - 1 - offset
    return None
def set_date_by_heading_from_end(doc: Document, heading: str, date_text: str, max_scan: int = 60) -> bool:
    """Write `date_text` into the first red placeholder after the last `heading`.

    Looks up the last paragraph containing `heading`, then scans at most
    `max_scan` following paragraphs and replaces the red runs of the first one
    that has any. Returns True on a replacement; False if `date_text` is empty,
    the heading is missing, or no red run is found in the window.
    """
    if not date_text:
        return False
    paragraphs = iter_paragraphs(doc)
    start = find_heading_index_from_end(doc, heading)
    if start is None:
        return False
    window = paragraphs[start + 1 : start + 1 + max_scan]
    # any() stops at the first paragraph that reports a replacement.
    return any(replace_red_in_paragraph(para, date_text) for para in window)
def set_date_by_paragraph_from_end(doc: Document, paragraph_text: str, date_text: str, max_scan: int = 60) -> bool:
    """Write `date_text` into the first red placeholder after the last matching paragraph.

    The anchor is the LAST paragraph (in `iter_paragraphs` order) whose
    normalised text contains `paragraph_text`. Up to `max_scan` paragraphs after
    it are scanned; the first one with red runs receives the date.
    Returns True on success, False otherwise.
    """
    if not date_text:
        return False
    needle = canon(paragraph_text)
    paragraphs = iter_paragraphs(doc)
    anchor = next(
        (i for i in range(len(paragraphs) - 1, -1, -1) if needle in canon(para_text(paragraphs[i]))),
        None,
    )
    if anchor is None:
        return False
    window = paragraphs[anchor + 1 : anchor + 1 + max_scan]
    return any(replace_red_in_paragraph(para, date_text) for para in window)
def set_layer3_name_after_management_heading(doc: Document, mid_heading: str, allowed_prev_titles: List[str], name: str) -> bool:
    """Overwrite the line that follows `mid_heading` with `name`.

    For every paragraph whose normalised text equals `mid_heading`:
      * the nearest non-empty paragraph before it must be one of
        `allowed_prev_titles`, otherwise the occurrence is skipped;
      * the nearest non-empty paragraph after it is cleared and rewritten in
        black with `name`;
      * every run of that paragraph gets the heading's effective font size,
        or 16 pt when none can be determined.

    Returns True if at least one paragraph was rewritten.
    """
    if not name:
        return False
    paragraphs = iter_paragraphs(doc)
    total = len(paragraphs)
    heading_key = canon(mid_heading)
    title_keys = {canon(title) for title in allowed_prev_titles}

    def is_blank(index: int) -> bool:
        return not nz(para_text(paragraphs[index]))

    wrote = False
    for here in range(total):
        if canon(para_text(paragraphs[here])) != heading_key:
            continue
        # Walk back to the previous non-empty paragraph.
        before = here - 1
        while before >= 0 and is_blank(before):
            before -= 1
        if before < 0 or canon(para_text(paragraphs[before])) not in title_keys:
            continue
        # Walk forward to the next non-empty paragraph: the line to overwrite.
        after = here + 1
        while after < total and is_blank(after):
            after += 1
        if after >= total:
            continue
        size = _para_effective_font_size(paragraphs[here]) or Pt(16)
        target = paragraphs[after]
        _clear_para_and_write_black(target, name)
        for run in target.runs:
            run.font.size = size  # explicit run size overrides the style
        wrote = True
    return wrote
def _para_effective_font_size(p: Paragraph):
    """Return the first explicit run font size, else the paragraph style's size, else None."""
    explicit = next((run.font.size for run in p.runs if run.font.size), None)
    if explicit:
        return explicit
    style = p.style
    if style and style.font and style.font.size:
        return style.font.size
    return None
# --- helpers for summary tables (detection and overwrite) ---
def _std_key(s: str) -> str:
    """Reduce a label to its leading 'std N' key.

    e.g. 'Std 7. Internal Review' -> 'std 7'. Labels that do not start with
    'std <number>' are returned as their `canon_label` form.
    """
    label = canon_label(s)
    found = re.match(r"(std\s+\d+)", label)
    if found is None:
        return label
    return found.group(1)
def _looks_like_summary_table(table: Table) -> Optional[Tuple[int, int]]:
    """Detect a Summary table and return (label_col_idx, details_col_idx).

    A table qualifies when its first row has at least two cells and one of them
    contains "detail". The label column is the first header cell naming one of
    the management sections; if there is none, column 0 is used (or column 1
    when DETAILS itself is column 0). Returns None for non-summary tables.
    """
    if not table.rows:
        return None
    header_cells = table.rows[0].cells
    if len(header_cells) < 2:
        return None
    headers = [canon(cell_text(cell)) for cell in header_cells]

    details_col = next((idx for idx, text in enumerate(headers) if "detail" in text), None)
    if details_col is None:
        return None

    section_names = ["maintenance management", "mass management", "fatigue management"]
    label_col = next(
        (idx for idx, text in enumerate(headers) if any(name in text for name in section_names)),
        None,
    )
    if label_col is None:
        # Fallback: first column that is not the DETAILS column.
        label_col = 1 if details_col == 0 else 0
    return (label_col, details_col)
def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
    """Heuristically count header rows.

    Returns the index of the first row (within the first `scan_up_to` rows)
    whose first cell is a bare number such as '1' or '1.'; returns 1 if no such
    row is found.

    NOTE: a later definition with the same name in this file rebinds
    `count_header_rows`, so this version is not the one callers end up using.
    """
    numbered = (
        index
        for index, row in enumerate(table.rows[:scan_up_to])
        if re.match(r"^\d+\.?$", cell_text(row.cells[0]).strip())
    )
    return next(numbered, 1)
def _header_col_texts(table: Table, scan_rows: int = 5) -> List[str]:
    """Return one normalised header string per column.

    The first `scan_rows` rows (capped at the table length) are stacked: for
    each column index up to the widest scanned row, the cell texts of every
    scanned row that has that column are joined with spaces and passed through
    `canon`. An empty table yields [].
    """
    scan_rows = min(scan_rows, len(table.rows))
    if scan_rows == 0:
        return []
    scanned = [table.rows[i] for i in range(scan_rows)]
    widest = max(len(row.cells) for row in scanned)
    return [
        canon(" ".join(cell_text(row.cells[col]) for row in scanned if col < len(row.cells)))
        for col in range(widest)
    ]
def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
    """Return how many rows precede the first numbered data row.

    The header ends right before the first row (among the first `scan_up_to`)
    whose first cell is just a number, optionally followed by a dot ('1', '1.').
    When no such row is found the function assumes a single header row.
    """
    rows_to_check = min(scan_up_to, len(table.rows))
    index = 0
    while index < rows_to_check:
        leading = cell_text(table.rows[index].cells[0]).strip()
        if re.match(r"^\d+\.?$", leading):
            return index
        index += 1
    return 1
def map_cols_mass_strict(table: Table) -> Dict[str, int]:
    """Map logical Mass-table column keys to column indexes.

    Header texts come from `_header_col_texts(table, 5)`. Each key is bound to
    the first column whose header contains all of its keywords; keys with no
    matching column are omitted from the result.

    Keys: "no", "reg", "wv", "rfs", "susp", "trip", "frs".
    """
    cols = _header_col_texts(table, 5)

    def first_col(*needles) -> Optional[int]:
        for j, text in enumerate(cols):
            if all(n in text for n in needles):
                return j
        return None

    def first_of(*needle_sets) -> Optional[int]:
        # Try each keyword set in turn. Compare against None explicitly:
        # column index 0 is a valid hit and must not fall through to the
        # next alternative (which `a or b` would do).
        for needles in needle_sets:
            j = first_col(*needles)
            if j is not None:
                return j
        return None

    idx = {
        "no": first_col("no"),
        "reg": first_of(("registration", "number"), ("registration",)),
        "wv": first_col("weight", "verification"),
        "rfs": first_of(("rfs", "cert"), ("rfs", "certification")),
        "susp": first_col("suspension", "maintenance"),
        "trip": first_col("trip", "record"),
        "frs": first_of(("fault", "suspension"), ("fault", "reporting", "suspension")),
    }
    return {k: v for k, v in idx.items() if v is not None}
def find_mass_vehicle_numbers_table(doc: Document) -> Optional[Table]:
    """Pick the Mass vehicle-number table by its column set.

    Each table is scored by how many of six expected column headers it has.
    Tables whose combined header text contains "details" (Summary tables) are
    rejected, as are tables with fewer than four hits. A numbering column adds
    0.5 and each row adds 0.01, so ties favour numbered, longer tables.
    Returns the best-scoring table, or None.
    """
    signature = (
        ("registration", "number"),
        ("weight", "verification"),
        ("rfs", "cert"),  # "certification" also contains "cert"
        ("suspension", "maintenance"),
        ("trip", "record"),
        ("fault", "suspension"),
    )
    best = None
    best_score = -1
    for table in iter_tables(doc):
        cols = _header_col_texts(table, 5)
        if "details" in " ".join(cols):
            continue
        hits = sum(
            1
            for needles in signature
            if any(all(n in c for n in needles) for c in cols)
        )
        if hits < 4:
            continue
        numbered = any(c == "no" or c.startswith("no ") for c in cols)
        score = hits + (0.5 if numbered else 0) + (len(table.rows) / 100.0)
        if score > best_score:
            best, best_score = table, score
    return best
def update_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
    """Fill the operator declaration's 'Print Name' and 'Position Title'.

    1) Table approach: locate each label cell (trying that label's own spelling
       variants only) and write the value into the adjacent value cell.
    2) Fallback, used only if nothing was written in step 1: find the
       'OPERATOR DECLARATION' heading and fill the first two red runs found in
       the 20 paragraphs after it.

    Returns True if any value was written.
    """
    changed = False
    # Variants are scoped per label so that a missing "Print Name" cell can
    # never resolve to the "Position Title" cell (and vice versa).
    label_variants = {
        "Print Name": ("PrintName", "Print Name:"),
        "Position Title": ("PositionTitle", "Position Title:"),
    }
    # 1) Table label approach
    for lbl, val in (("Print Name", print_name), ("Position Title", position_title)):
        if not val:
            continue
        loc = find_label_cell(doc, lbl)
        if not loc:
            for alt in label_variants[lbl]:
                loc = find_label_cell(doc, alt)
                if loc:
                    break
        if loc:
            t, r, c = loc
            cell = get_adjacent_value_cell(t, r, c)
            if not replace_red_in_cell(cell, val):
                _set_cell_text_black(cell, val)
            changed = True
    if changed:
        return True

    # 2) Fallback: heading-scoped red placeholders
    head = "OPERATOR DECLARATION"
    p = find_heading_paragraph(doc, head) or find_heading_paragraph(doc, head.title())
    if not p:
        return False
    allp = iter_paragraphs(doc)
    # iter_paragraphs() builds new Paragraph wrappers on each call, so locate
    # the heading by its underlying XML element instead of list.index(), which
    # would compare wrapper objects. Falls back to 0 as before if not found.
    i = next((n for n, q in enumerate(allp) if q._element is p._element), 0)

    red_targets = []
    for q in allp[i + 1:i + 1 + 20]:
        reds = [r for r in q.runs if is_red_run(r)]
        if reds:
            red_targets.extend(reds)
            if len(red_targets) >= 2:
                break

    wrote = False
    if print_name and red_targets:
        _set_text_and_black(red_targets[0], print_name)
        wrote = True
    if position_title and len(red_targets) >= 2:
        _set_text_and_black(red_targets[1], position_title)
        wrote = True
    return wrote
def fill_mass_vehicle_table_preserve_headers(table: Table, arrays: Dict[str, List[str]]):
    """Rebuild the Mass vehicle table's data rows from `arrays`, keeping its headers.

    Does nothing when no registration column can be mapped. Otherwise every row
    after the detected header rows is removed, one row per entry of
    arrays["Registration Number"] is added, and each mapped column is filled
    from its array (missing entries become an empty string).
    """
    colmap = map_cols_mass_strict(table)
    if "reg" not in colmap:
        return
    header_count = count_header_rows(table, 6)
    regs = arrays.get("Registration Number", [])
    wanted_total = header_count + len(regs)

    # Drop existing data rows, then grow the table to the required size.
    while len(table.rows) > header_count:
        table._tbl.remove(table.rows[-1]._tr)
    while len(table.rows) < wanted_total:
        table.add_row()

    optional_columns = (
        ("wv", "Weight Verification Records"),
        ("rfs", "RFS Suspension Certification #"),
        ("susp", "Suspension System Maintenance"),
        ("trip", "Trip Records"),
        ("frs", "Fault Recording/ Reporting on Suspension System"),
    )
    for offset, reg in enumerate(regs):
        row = table.rows[header_count + offset]
        replace_red_in_cell(row.cells[colmap["reg"]], nz(reg))
        for key, source in optional_columns:
            if key not in colmap:
                continue
            values = arrays.get(source, [])
            text = nz(values[offset]) if offset < len(values) else ""
            replace_red_in_cell(row.cells[colmap[key]], text)
def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
    """Overwrite the DETAILS cell of each matching row in a section's Summary tables.

    `section_dict` maps row labels (e.g. 'Std 7. Internal Review') to the text
    to write. Labels on both sides are reduced with `_std_key`, so rows are
    paired on their exact 'std N' key. Only tables recognised by
    `_looks_like_summary_table` whose first two rows mention the first word of
    `section_name` ("maintenance" | "mass" | "fatigue") are touched.

    Returns the number of DETAILS cells rewritten (in black).
    """
    desired: Dict[str, str] = {_std_key(k): join_value(v) for k, v in section_dict.items()}
    name_words = str(section_name or "").split()
    if not name_words or not desired:
        return 0
    wanted_prefix = canon_label(name_words[0])
    if not wanted_prefix:
        # An empty prefix would be "contained" in every header and match all tables.
        return 0

    updated = 0
    for t in doc.tables:
        cols = _looks_like_summary_table(t)
        if not cols:
            continue
        label_col, details_col = cols
        head_txt = table_header_text(t, up_to_rows=2)
        if wanted_prefix not in head_txt:  # keep to the correct section
            continue
        for i in range(1, len(t.rows)):
            cells = t.rows[i].cells
            if label_col >= len(cells) or details_col >= len(cells):
                continue  # row narrower than the header row
            key = _std_key(cell_text(cells[label_col]))
            # Exact key match only. A "startswith" fallback would pair
            # 'std 1' with 'std 10' and write the wrong text.
            cand = desired.get(key)
            if not cand:
                continue
            _set_cell_text_black(cells[details_col], cand)  # full overwrite, black
            updated += 1
    return updated
# Whitespace that follows sentence-ending punctuation.
SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")
# Dates written like "3rd March 2024".
ORDINAL_DATE_PAT = re.compile(r"\b(\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4})\b", re.I)


def split_sentences_keep(text: str) -> List[str]:
    """Split `text` into sentences, keeping each sentence's end punctuation.

    Whitespace is collapsed first; the split happens on whitespace that follows
    '.', '?' or '!'. Empty or None input yields [].
    """
    normalised = " ".join(str(text or "").split())
    if not normalised:
        return []
    *leading, tail = SPLIT_SENT_PAT.split(normalised)
    sentences = [piece.strip() for piece in leading]
    tail = tail.strip()
    if tail:
        sentences.append(tail)
    return sentences
# Sentence boundary: whitespace after '.', '!' or '?', or a run of newlines.
_sent_split = re.compile(r'(?<=[.!?])\s+|\n+')
# Dates in "3rd March 2024", "3/3/2024" or "March 3, 2024" form.
_date_pat = re.compile(r'\b(?:\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4}|\d{1,2}/\d{1,2}/\d{2,4}|[A-Za-z]+\s+\d{1,2},\s*\d{4})\b')


def extract_summary_snippets(desired_text: str):
    """Pull reusable fragments out of a desired summary text.

    Returns a dict with:
      sheet_sent   - first sentence mentioning "daily check" or "sheet"
      sheet_phrase - result of `_extract_sheet_phrase_from_desired`
      review       - first sentence mentioning "internal review"
      qcs          - first sentence with "quarterly ... compliance", else the
                     first sentence with "quarterly"
      dates        - every date string matched by `_date_pat`
      sents        - all sentences as returned by `_sentences`
    Sentence lookups are case-insensitive and give None when nothing matches.
    """
    sentences = _sentences(desired_text)
    found_dates = _date_pat.findall(desired_text)

    def first_matching(pattern):
        for sentence in sentences:
            if re.search(pattern, sentence, re.I):
                return sentence
        return None

    snippets = {}
    snippets["sheet_sent"] = first_matching(r'\b(daily\s+check|sheet)\b')
    snippets["sheet_phrase"] = _extract_sheet_phrase_from_desired(desired_text)
    snippets["review"] = first_matching(r'\binternal\s+review\b')
    snippets["qcs"] = first_matching(r'\bquarterly\b.*\bcompliance\b') or first_matching(r'\bquarterly\b')
    snippets["dates"] = found_dates
    snippets["sents"] = sentences
    return snippets
def fill_management_summary_tables(doc: Document, section_key: str, section_data: Dict[str, List[str]]):
    """Patch the DETAILS cells of every summary table belonging to `section_key`.

    `section_key` is 'maintenance' | 'mass' | 'fatigue'. For each data row
    (row 0 is skipped as header) the normalised left-hand label is compared
    with every key of `section_data`; when the key is contained in the row
    label, the row's DETAILS cell is handed to `patch_details_cell_from_json`
    together with the joined value. Returns None.
    """
    tables = [entry for entry in find_all_summary_tables(doc) if entry[0] == section_key]
    if not tables:
        return

    # (normalised label, text to write); labels that normalise to "" are dropped.
    wanted = []
    for label, vals in section_data.items():
        needle = canon_label(label)
        if needle:
            wanted.append((needle, join_value(vals)))

    for _section, table, label_col, details_col in tables:
        for row_index in range(1, len(table.rows)):
            row_label = canon_label(cell_text(table.rows[row_index].cells[label_col]))
            if not row_label:
                continue
            for needle, text in wanted:
                # Loose "contains" match tolerates minor punctuation differences.
                if needle in row_label:
                    patch_details_cell_from_json(table.rows[row_index].cells[details_col], text)
def _set_text_and_black(run, new_text: str):
    """Replace a run's text and force its colour to explicit black.

    `new_text` of None is written as an empty string; anything else is
    converted with str().
    """
    run.text = "" if new_text is None else str(new_text)
    # Assigning an RGB value already discards any theme colour on the run.
    # Do NOT follow it with `theme_color = None`: in python-docx that removes
    # the whole colour specification, including the black just applied, and
    # the run would inherit its colour from the style again.
    run.font.color.rgb = BLACK
def update_business_summary_once(doc: Document, value) -> bool:
    """Write the business summary once, into the red placeholder of its value cell.

    The value cell is the one adjacent to the 'Nature of the Operators Business
    (Summary)' label. The first paragraph containing red runs receives the
    text (in black) and any further red paragraphs are blanked; paragraphs
    without red runs are left as they are. If the cell has no red paragraph,
    the text goes into the first paragraph instead.

    Returns False when the label cannot be found, True otherwise.
    """
    loc = None
    for label in (
        "Nature of the Operators Business (Summary)",
        "Nature of the Operators Business (Summary):",
    ):
        loc = find_label_cell(doc, label)
        if loc:
            break
    if not loc:
        return False

    table, row_idx, col_idx = loc
    cell = get_adjacent_value_cell(table, row_idx, col_idx)
    if not cell.paragraphs:
        cell.add_paragraph("")
    summary = join_value(value)

    placeholders = [p for p in cell.paragraphs if any(is_red_run(run) for run in p.runs)]
    if not placeholders:
        _clear_para_and_write_black(cell.paragraphs[0], summary)
        return True

    first, *extras = placeholders
    _clear_para_and_write_black(first, summary)
    for para in extras:
        _clear_para_and_write_black(para, "")
    return True
def _nuke_cell_paragraphs(cell: _Cell):
    """Detach every paragraph element of `cell` from its parent (a real delete)."""
    elements = [para._element for para in cell.paragraphs]
    for element in elements:
        element.getparent().remove(element)
def _clear_para_and_write_black(paragraph, text: str):
    """Blank every existing run of `paragraph` and append `text` as a new black run.

    Existing runs are kept (with empty text) so their formatting elements stay
    in place; falsy `text` results in an empty run.
    """
    for run in list(paragraph.runs):
        run.text = ""
    fresh = paragraph.add_run(str(text or ""))
    # Setting rgb is sufficient. A following `theme_color = None` would make
    # python-docx drop the colour element again and lose the explicit black.
    fresh.font.color.rgb = BLACK
def _set_cell_text_black(cell, text: str):
    """Blank all runs in `cell` and write `text` as one black run in its first paragraph.

    A paragraph is added when the cell has none. Falsy `text` results in an
    empty run.
    """
    for para in cell.paragraphs:
        for run in para.runs:
            run.text = ""
    first_para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
    fresh = first_para.add_run(str(text or ""))
    # Setting rgb is sufficient. A following `theme_color = None` would make
    # python-docx drop the colour element again and lose the explicit black.
    fresh.font.color.rgb = BLACK
def nz(x: Optional[str]) -> str:
    """Return `x` stripped of surrounding whitespace; None or empty input gives ''."""
    return x.strip() if x else ""
def canon(s: str) -> str:
    """Normalise text for comparisons.

    Collapses whitespace, lowercases, maps en/em dashes to '-', and drops every
    character other than a-z, 0-9, space and ``/ # ( ) + , . -``.
    """
    text = re.sub(r"\s+", " ", str(s))
    text = text.strip().lower()
    for dash in ("–", "—"):
        text = text.replace(dash, "-")
    return re.sub(r"[^a-z0-9/#()+,.\- ]+", "", text)
def canon_label(s: str) -> str:
    """Normalise a label: lowercase, letters and digits only, single spaces.

    Every run of characters other than a-z, 0-9 and space becomes one space, so
    labels that differ only in punctuation or casing compare equal.
    """
    text = re.sub(r"\s+", " ", str(s)).strip().lower()
    for dash in ("–", "—"):
        text = text.replace(dash, "-")
    words_only = re.sub(r"[^a-z0-9 ]+", " ", text)
    # Only a-z, 0-9 and spaces remain here, so split/join collapses the spaces.
    return " ".join(words_only.split())
def join_value(value) -> str:
    """Turn a JSON value into the text to write into the document.

    * list  -> items joined with newlines; None items and items that are blank
               after stripping are skipped; non-string items are converted
               with str() instead of raising
    * None  -> "" (so the literal text "None" is never written)
    * other -> str(value)
    """
    if value is None:
        return ""
    if isinstance(value, list):
        parts = ["" if item is None else str(item) for item in value]
        return "\n".join(part for part in parts if part.strip())
    return str(value)
def split_digits(s: str) -> List[str]:
    """Return every decimal digit of `s`, in order, as one-character strings.

    None yields []; non-string input (e.g. an ACN parsed from JSON as a number)
    is converted with str() first instead of raising TypeError.
    """
    if s is None:
        return []
    return re.findall(r"\d", str(s))
def para_text(p: Paragraph) -> str:
    """Return the concatenated text of the paragraph's runs."""
    pieces = [run.text for run in p.runs]
    return "".join(pieces)
def cell_text(c: _Cell) -> str:
    """Return the cell's paragraph texts joined with newlines."""
    lines = [para_text(para) for para in c.paragraphs]
    return "\n".join(lines)
def is_red_run(run) -> bool:
    """Return True only if the run has an explicit RGB colour equal to RED.

    Runs without a colour object or without an RGB value (e.g. theme-coloured
    text) are never treated as red.
    """
    color = run.font.color
    if not color or color.rgb is None:
        return False
    return color.rgb == RED
def replace_red_in_paragraph(p: Paragraph, new_text: str) -> bool:
    """Collapse the paragraph's red runs into one black run holding `new_text`.

    The first red run receives the text (written in black); the remaining red
    runs are emptied. Returns False, changing nothing, when there is no red run.
    """
    red_runs = [run for run in p.runs if is_red_run(run)]
    if not red_runs:
        return False
    keeper, *leftovers = red_runs
    _set_text_and_black(keeper, new_text)
    for run in leftovers:
        run.text = ""
    return True
def replace_red_in_cell(cell: _Cell, new_text: str) -> bool:
    """Write `new_text` into the cell's red placeholder, exactly once.

    The first paragraph that contains red runs receives the value (in black).
    Red runs in later paragraphs are emptied rather than filled, so the value
    is not repeated in every placeholder paragraph. If the cell has no red run
    at all, the whole cell is replaced by a single black run.

    Always returns True (the cell is written one way or the other).
    """
    wrote = False
    for para in cell.paragraphs:
        if not wrote:
            wrote = replace_red_in_paragraph(para, new_text)
            continue
        # Value already written: just blank any leftover red placeholders.
        for run in para.runs:
            if is_red_run(run):
                run.text = ""
    if not wrote:
        # Fallback: clear the cell and set a single black run.
        _set_cell_text_black(cell, new_text)
    return True
def parse_attendance_lines(value) -> List[str]:
    """Split an attendance value into "Name - Title" items.

    Example:
        "Peter Sheppard - Compliance Greg Dyer - Auditor"
    becomes:
        ["Peter Sheppard - Compliance", "Greg Dyer - Auditor"]

    Accepted separators: list items, newlines, semicolons and pipes. List
    items and lines count as boundaries only when every one of them is a
    complete "Name - Title" pair; otherwise (e.g. a pair wrapped over two
    lines) the text is flattened to one line and parsed as a whole, as before.
    Within a chunk, several space-separated pairs are pulled apart by looking
    for capitalised names (1-4 words) followed by " - ".
    """
    def squash(text: str) -> str:
        return re.sub(r"\s+", " ", text).strip()

    def is_pair(text: str) -> bool:
        left, sep, right = text.partition(" - ")
        return bool(sep) and bool(left.strip()) and bool(right.strip())

    if isinstance(value, list):
        raw = "\n".join(str(v) for v in value if v)
    else:
        raw = str(value or "")

    # Decide whether line breaks are real item boundaries BEFORE collapsing
    # whitespace; collapsing first would erase them.
    lines = [ln for ln in (squash(part) for part in re.split(r"[\r\n]+", raw)) if ln]
    if len(lines) > 1 and all(is_pair(ln) for ln in lines):
        s = "\n".join(lines)
    else:
        s = squash(raw)
    if not s:
        return []

    # First split on explicit separators; then extract Name - Title pairs per chunk.
    chunks = re.split(r"\s*[\n;|]\s*", s)
    items: List[str] = []
    pair_pat = re.compile(
        r"([A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3})\s*-\s*"
        r"([^-\n]+?)(?=\s+[A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3}\s*-\s*|$)"
    )
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        pairs = [f"{m.group(1).strip()} - {m.group(2).strip()}" for m in pair_pat.finditer(chunk)]
        if pairs:
            items.extend(pairs)
        elif " - " in chunk:
            # Fallback: single "Name - Title" that the pattern did not recognise.
            a, b = chunk.split(" - ", 1)
            items.append(f"{a.strip()} - {b.strip()}")
        else:
            items.append(chunk)
    return items
def fill_attendance_block(doc: Document, value) -> bool:
    """Rewrite the attendance-list cell with one "Name - Title" line per paragraph.

    The target is the cell directly below the 'Attendance List (Names and
    Position Titles)' label when that cell exists, otherwise the label's
    adjacent value cell. Final content, all written in black:
      1. as many parsed items as the cell had red paragraphs,
      2. existing non-red lines that look like "Name - Title",
      3. remaining parsed items not already present (compared ignoring case
         and spacing).

    Returns False when nothing could be parsed or the label is missing.
    """
    items = parse_attendance_lines(value)
    if not items:
        return False
    loc = find_label_cell(doc, "Attendance List (Names and Position Titles)")
    if not loc:
        return False
    table, row_idx, col_idx = loc

    below = row_idx + 1
    if below < len(table.rows) and col_idx < len(table.rows[below].cells):
        target = table.rows[below].cells[col_idx]
    else:
        target = get_adjacent_value_cell(table, row_idx, col_idx)

    def has_red(para) -> bool:
        return any(is_red_run(run) for run in para.runs)

    def is_name_title(text: str) -> bool:
        left, sep, right = text.partition(" - ")
        return bool(sep) and bool(left.strip()) and bool(right.strip())

    def normalise(text: str) -> str:
        return re.sub(r"\s+", " ", text.strip().lower())

    # Read only the target cell; the rest of the row is not touched.
    current = list(target.paragraphs)
    placeholder_count = sum(1 for para in current if has_red(para))
    kept_black = [
        para_text(para).strip()
        for para in current
        if not has_red(para) and is_name_title(para_text(para))
    ]

    lines: List[str] = items[:placeholder_count] + kept_black
    seen = {normalise(line) for line in lines}
    for extra in items[placeholder_count:]:
        key = normalise(extra)
        if key not in seen:
            lines.append(extra)
            seen.add(key)

    # Hard-clear the cell, then write each line as a fresh paragraph.
    _nuke_cell_paragraphs(target)
    for line in (lines or [""]):
        _clear_para_and_write_black(target.add_paragraph(), line)
    return True
# ----------------------------- document search -----------------------------
def iter_tables(doc: Document) -> List[Table]:
    """Return the tables exposed by `doc.tables` as a list."""
    return [table for table in doc.tables]
def iter_paragraphs(doc: Document) -> List[Paragraph]:
    """Return body paragraphs followed by all paragraphs inside `doc.tables`.

    Table paragraphs come in table, row, cell order, after the document-level
    ones, so the list is not in reading order.
    """
    paragraphs = list(doc.paragraphs)
    paragraphs.extend(
        para
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
        for para in cell.paragraphs
    )
    return paragraphs
def find_heading_paragraph(doc: Document, heading_text: str, window: int = 60) -> Optional[Paragraph]:
    """Return the first paragraph matching `heading_text`.

    Paragraphs whose normalised text STARTS with the heading are preferred; if
    there is none, the first paragraph that merely contains it is returned.
    `window` is accepted but not used. Returns None when nothing matches.
    """
    needle = canon(heading_text)
    normalised = [(para, canon(para_text(para))) for para in iter_paragraphs(doc)]
    for para, text in normalised:
        if text.startswith(needle):
            return para
    # Fuzzy pass: substring match anywhere in the paragraph.
    for para, text in normalised:
        if needle in text:
            return para
    return None
def find_label_cell_in_table(table: Table, label: str) -> Optional[Tuple[int, int]]:
    """Return (row, col) of the cell holding `label`, or None.

    Comparison uses `canon_label`. An exact match anywhere in the table wins
    over a "contains" match; within each pass cells are visited row by row,
    left to right. An empty normalised label never matches by containment.
    """
    wanted = canon_label(label)
    grid = [
        (r_i, c_i, canon_label(cell_text(cell)))
        for r_i, row in enumerate(table.rows)
        for c_i, cell in enumerate(row.cells)
    ]
    for r_i, c_i, text in grid:
        if text == wanted:
            return (r_i, c_i)
    if wanted:
        for r_i, c_i, text in grid:
            if wanted in text:
                return (r_i, c_i)
    return None
def find_label_cell(doc: Document, label: str) -> Optional[Tuple[Table, int, int]]:
    """Return (table, row, col) for the first table that contains `label`, else None."""
    for table in iter_tables(doc):
        position = find_label_cell_in_table(table, label)
        if position:
            row_idx, col_idx = position
            return (table, row_idx, col_idx)
    return None
def get_adjacent_value_cell(table: Table, r: int, c: int) -> _Cell:
    """Return the cell that holds the value for the label at (r, c).

    Preference order: the cell to the right in the same row, else the cell in
    the same column of the next row, else the label cell itself. Bounds are
    checked against the row actually being indexed, so tables whose rows have
    different cell counts do not raise IndexError or skip an existing right
    neighbour.
    """
    row_cells = table.rows[r].cells
    if c + 1 < len(row_cells):
        return row_cells[c + 1]
    if r + 1 < len(table.rows):
        below = table.rows[r + 1].cells
        if c < len(below):
            return below[c]
    return row_cells[c]
# ----------------------------- label/value updates -----------------------------
def update_label_value_in_tables(doc: Document, label: str, value) -> bool:
    """Write `value` into the cell adjacent to the first cell matching `label`.

    Returns False when the label is not found in any table; otherwise the
    result of `replace_red_in_cell` on the value cell.
    """
    location = find_label_cell(doc, label)
    text = join_value(value)
    if not location:
        return False
    table, row_idx, col_idx = location
    return replace_red_in_cell(get_adjacent_value_cell(table, row_idx, col_idx), text)
def update_heading_followed_red(doc: Document, heading: str, value, max_scan: int = 12) -> bool:
    """Replace the first red placeholder that follows `heading`.

    The heading paragraph is located with `find_heading_paragraph`; the next
    `max_scan` paragraphs in `iter_paragraphs` order (which includes table
    cells) are scanned and the first one with red runs receives the joined
    value. Returns True on a replacement, False if the heading is missing or
    no red run is found in the window.
    """
    start = find_heading_paragraph(doc, heading)
    if not start:
        return False
    allp = iter_paragraphs(doc)
    # iter_paragraphs() builds new Paragraph wrappers on each call, so the
    # heading is located by its underlying XML element. list.index() would
    # compare wrapper objects, miss, and restart the scan at the document top.
    idx = next((i for i, p in enumerate(allp) if p._element is start._element), 0)
    new_text = join_value(value)
    for p in allp[idx + 1: idx + 1 + max_scan]:
        if replace_red_in_paragraph(p, new_text):
            return True
    return False
# ----------------------------- ACN per-digit fill -----------------------------
def fill_acn_digits(doc: Document, acn_value: str) -> bool:
    """Write the ACN one digit per cell after the 'Australian Company Number' label.

    Target cells are those to the right of the label in its row; if there are
    fewer cells than digits, whole following rows are appended (left to right)
    until there are enough. Each used cell is cleared and receives one black
    digit. Surplus digits are dropped when the table runs out of cells.

    Returns False when there are no digits, no label, or no target cell.
    """
    digits = split_digits(acn_value)
    if not digits:
        return False
    loc = find_label_cell(doc, "Australian Company Number")
    if not loc:
        return False
    table, row_idx, col_idx = loc

    cells: List[_Cell] = list(table.rows[row_idx].cells[col_idx + 1:])
    next_row = row_idx + 1
    while len(cells) < len(digits) and next_row < len(table.rows):
        cells.extend(table.rows[next_row].cells)
        next_row += 1

    cells = cells[:len(digits)]
    if not cells:
        return False
    for digit, cell in zip(digits, cells):
        _set_cell_text_black(cell, digit)
    return True
# ----------------------------- vehicle tables -----------------------------
def table_header_text(table: Table, up_to_rows: int = 3) -> str:
    """Return the normalised, space-joined text of the first `up_to_rows` rows."""
    texts = [cell_text(cell) for row in table.rows[:up_to_rows] for cell in row.cells]
    return canon(" ".join(texts))
def find_vehicle_table(doc: Document, want: str) -> Optional[Table]:
    """Find the vehicle table for `want` ("maintenance" or "mass").

    * maintenance: header mentions "registration", "maintenance" and "fault"
      but not "suspension"
    * mass: header mentions both "suspension" and "weight"
    Among the candidates, the table with the most rows is returned; None when
    there is no candidate (or `want` is neither of the two values).
    """
    def matches(header: str) -> bool:
        if want == "maintenance":
            has_all = all(k in header for k in ["registration", "maintenance", "fault"])
            return has_all and "suspension" not in header
        if want == "mass":
            return "suspension" in header and "weight" in header
        return False

    candidates = [table for table in iter_tables(doc) if matches(table_header_text(table))]
    if not candidates:
        return None
    return max(candidates, key=lambda tb: len(tb.rows))
def map_cols(table: Table, want: str) -> Dict[str, int]:
    """Map logical column keys to column indexes using the first two rows as headers.

    For each column of row 0, the header text is the normalised join of that
    column's cells in rows 0 and 1. Each key is bound to the first column whose
    header contains all of its keywords. `want == "maintenance"` selects the
    maintenance key set; any other value selects the mass key set. Keys without
    a matching column are omitted.
    """
    header_rows = table.rows[:2]
    width = len(table.rows[0].cells)
    col_texts = [
        canon(" ".join(cell_text(row.cells[j]) for row in header_rows if j < len(row.cells)))
        for j in range(width)
    ]

    def first_col(*needles) -> Optional[int]:
        for j, text in enumerate(col_texts):
            if all(n in text for n in needles):
                return j
        return None

    if want == "maintenance":
        spec = {
            "reg": ("registration",),
            "rw": ("roadworthiness",),
            "mr": ("maintenance", "records"),
            "daily": ("daily", "check"),
            "fr": ("fault", "recording"),
            "rep": ("fault", "repair"),
        }
    else:
        spec = {
            "reg": ("registration",),
            "wv": ("weight", "verification"),
            "rfs": ("rfs", "cert"),
            "susp": ("suspension", "maintenance"),
            "trip": ("trip", "record"),
            "frs": ("fault", "suspension"),
        }
    found = {key: first_col(*needles) for key, needles in spec.items()}
    return {key: col for key, col in found.items() if col is not None}
def clear_data_rows_keep_headers(table: Table, header_rows: int = 1):
    """Delete every row after the first ``header_rows`` rows of ``table``.

    Each surplus row is removed by detaching its ``_tr`` element from
    ``table._tbl``. A table that already has ``header_rows`` rows or fewer
    is left untouched.

    Args:
        table: Object exposing ``rows`` (each row having ``_tr``) and
            ``_tbl.remove``.
        header_rows: Number of leading rows to keep. Values below zero are
            treated as zero, i.e. all rows are removed.
    """
    keep = max(header_rows, 0)
    # Snapshot the rows once: re-reading ``table.rows`` on every iteration
    # rebuilds the whole row sequence each time.
    for row in list(table.rows)[keep:]:
        table._tbl.remove(row._tr)
def ensure_rows(table: Table, need_rows: int, header_rows: int = 1):
    """Grow ``table`` until it holds ``header_rows + need_rows`` rows.

    Rows are appended with ``table.add_row()``; a table that is already
    large enough is left unchanged (rows are never removed here).

    Args:
        table: Object exposing ``rows`` and ``add_row()``.
        need_rows: Number of data rows required below the header.
        header_rows: Number of header rows assumed at the top of the table.
            Defaults to 1, which is the layout the existing callers rely on.
    """
    # Compute the shortfall once; range() of a non-positive number is empty.
    missing = need_rows + header_rows - len(table.rows)
    for _ in range(missing):
        table.add_row()
def fill_vehicle_table(table: Table, want: str, arrays: Dict[str, List[str]]):
    """Rewrite the data rows of a vehicle table from per-column value lists.

    Columns are located with ``map_cols(table, want)``; if no registration
    column is found the table is left untouched. Otherwise all rows after
    the first are removed and one row is created per entry of
    ``arrays["Registration Number"]``. For each row the registration value
    and every other mapped column's value are passed through ``nz`` and
    written with ``replace_red_in_cell``. A column whose list is shorter
    than the registration list is simply not written for the extra rows.

    Args:
        table: Vehicle table to fill.
        want: ``"maintenance"`` selects the maintenance columns; any other
            value selects the mass columns.
        arrays: Mapping from the JSON column label to its list of values.
    """
    colmap = map_cols(table, want)
    if "reg" not in colmap:
        return

    # (column key from map_cols, label of the matching list in ``arrays``)
    if want == "maintenance":
        sources = (
            ("rw", "Roadworthiness Certificates"),
            ("mr", "Maintenance Records"),
            ("daily", "Daily Checks"),
            ("fr", "Fault Recording/ Reporting"),
            ("rep", "Fault Repair"),
        )
    else:
        sources = (
            ("wv", "Weight Verification Records"),
            ("rfs", "RFS Suspension Certification #"),
            ("susp", "Suspension System Maintenance"),
            ("trip", "Trip Records"),
            ("frs", "Fault Recording/ Reporting on Suspension System"),
        )

    regs = arrays.get("Registration Number", [])
    # Only columns that were actually found in the header get written.
    columns = [
        (colmap[key], arrays.get(label, []))
        for key, label in sources
        if key in colmap
    ]
    reg_col = colmap["reg"]
    n = len(regs)

    # Keep the single header row, then make room for exactly n data rows.
    clear_data_rows_keep_headers(table, header_rows=1)
    ensure_rows(table, n)

    # Read the row sequence once instead of re-indexing table.rows per row.
    data_rows = list(table.rows)[1:n + 1]
    for i, (row, reg) in enumerate(zip(data_rows, regs)):
        cells = row.cells
        replace_red_in_cell(cells[reg_col], nz(reg))
        for col, vals in columns:
            if i < len(vals):
                replace_red_in_cell(cells[col], nz(vals[i]))
# ----------------------------- driver table -----------------------------
def find_driver_table(doc: Document) -> Optional[Table]:
    """Return the first table that looks like the driver/scheduler table.

    A table qualifies when its ``table_header_text`` contains
    "driver / scheduler" together with "fit for duty" or "work diary".
    Returns ``None`` when no table yielded by ``iter_tables(doc)`` qualifies.
    """

    def is_driver_header(header: str) -> bool:
        if "driver / scheduler" not in header:
            return False
        return "fit for duty" in header or "work diary" in header

    matches = (t for t in iter_tables(doc) if is_driver_header(table_header_text(t)))
    return next(matches, None)
def map_driver_cols(table: Table) -> Dict[str,int]:
    """Map logical column keys to column indexes of the driver table.

    The header text of column ``j`` is the space-joined ``cell_text`` of
    cell ``j`` in each of the first two rows (rows with fewer cells are
    skipped for that column), passed through ``canon``. The column count is
    taken from the first row.

    Keys and the substrings their header must contain:

    * ``"name"``: "driver" and "name"
    * ``"roster"``: "roster" and "safe"
    * ``"fit"``: "fit for duty"
    * ``"wd"``: "work diary"

    Each key maps to the first matching column; keys without a match are
    omitted from the result.
    """
    header_rows = table.rows[:2]
    cols = len(table.rows[0].cells)
    col_texts = [
        canon(" ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells)))
        for j in range(cols)
    ]

    def first_col(*needles) -> Optional[int]:
        for j, t in enumerate(col_texts):
            if all(n in t for n in needles):
                return j
        return None

    idx = {
        "name": first_col("driver", "name"),
        "roster": first_col("roster", "safe"),
        "fit": first_col("fit for duty"),
        # A single lookup is enough: any header containing
        # "electronic work diary" also contains "work diary". The former
        # `first_col(...) or first_col(...)` chain additionally treated a
        # valid match at column 0 as "not found" because 0 is falsy.
        "wd": first_col("work diary"),
    }
    # Compare against None explicitly so that column index 0 is kept.
    return {k: v for k, v in idx.items() if v is not None}
def fill_driver_table(table: Table, arrays: Dict[str, List[str]]):
    """Rewrite the data rows of the driver/scheduler table.

    Columns come from ``map_driver_cols``; when none are found the table is
    left untouched. Otherwise all rows after the first are removed and one
    row is created per index up to the longest of the four value lists.
    Each mapped column receives its value via ``replace_red_in_cell``, or an
    empty string once its list is exhausted. The name column is written only
    if at least one name is non-blank after ``str(...).strip()``.
    """
    colmap = map_driver_cols(table)
    if not colmap:
        return

    names = arrays.get("Driver / Scheduler Name", [])
    rosters = arrays.get("Roster / Schedule / Safe Driving Plan (Date Range)", [])
    fit = arrays.get("Fit for Duty Statement Completed (Yes/No)", [])
    wd = arrays.get("Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)", [])

    n = max(len(rosters), len(fit), len(wd), len(names))
    clear_data_rows_keep_headers(table, header_rows=1)
    ensure_rows(table, n)

    has_any_name = any(str(x).strip() for x in names)

    # (column index, values) in write order; the name column participates
    # only when there is at least one real name to write.
    write_plan = [
        (colmap[key], values)
        for key, values, enabled in (
            ("name", names, has_any_name),
            ("roster", rosters, True),
            ("fit", fit, True),
            ("wd", wd, True),
        )
        if enabled and key in colmap
    ]

    for i in range(n):
        row = table.rows[i + 1]
        for col, values in write_plan:
            replace_red_in_cell(row.cells[col], values[i] if i < len(values) else "")
# ----------------------------- main mapping -----------------------------
def flatten_simple_sections(data: Dict) -> Dict[str, str]:
    """Flatten the plain label/value sections of ``data`` into one mapping.

    Every top-level section whose value is a dict and whose name is not in
    the skip list contributes one entry per label, keyed
    ``"<section>::<label>"`` with the value passed through ``join_value``.
    The skipped sections are handled by dedicated steps elsewhere.
    """
    skip_sections = {
        "Vehicle Registration Numbers Maintenance",
        "Vehicle Registration Numbers Mass",
        "Driver / Scheduler Records Examined",
        "paragraphs",
        "Attendance List (Names and Position Titles)",
        "Nature of the Operators Business (Summary)",
        "Maintenance Management Summary",
        "Mass Management Summary",
        "Fatigue Management Summary",
    }
    return {
        f"{sec}::{label}": join_value(val)
        for sec, kv in data.items()
        if sec not in skip_sections and isinstance(kv, dict)
        for label, val in kv.items()
    }
def run(input_json: Path, template_docx: Path, output_docx: Path):
    """Fill the Word template from the JSON payload and save the result.

    The JSON file is loaded, the template is opened with ``Document``, a
    fixed sequence of fill steps is applied in order (see the numbered
    comments below), and the document is saved to ``output_docx``.
    """
    with open(input_json, "r", encoding="utf-8") as f:
        data = json.load(f)
    doc = Document(str(template_docx))

    # 1) Simple label/value sections. Try the label in the tables first;
    #    if that reports failure, fall back to the section heading.
    for key, value in flatten_simple_sections(data).items():
        section, sep, rest = key.partition("::")
        label = rest if sep else key
        # The ACN is skipped here and handled by its own step (3).
        if canon_label(label) == "australian company number":
            continue
        if not update_label_value_in_tables(doc, label, value):
            update_heading_followed_red(doc, section, value)

    # 2) Entries of the "paragraphs" section.
    paras = data.get("paragraphs", {})

    # 2a) The three management headings.
    for head in ("MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"):
        name_val = join_value(paras.get(head, ""))
        if name_val:
            update_heading_followed_red(doc, head, name_val, max_scan=6)

    # 2b) Date keyed by the auditor declaration heading.
    aud_head = "NHVAS APPROVED AUDITOR DECLARATION"
    aud_date = join_value(paras.get(aud_head, ""))
    if aud_date:
        set_date_by_heading_from_end(doc, aud_head, aud_date, max_scan=40)

    # 2c) Date keyed by the long acknowledgement paragraph.
    ack_head = ("I hereby acknowledge and agree with the findings detailed in this NHVAS Audit Summary Report. "
                "I have read and understand the conditions applicable to the Scheme, including the NHVAS Business Rules and Standards.")
    ack_date = join_value(paras.get(ack_head, ""))
    if ack_date:
        set_date_by_paragraph_from_end(doc, ack_head, ack_date, max_scan=40)

    # 2d) Name line after each management heading, with the anchor
    #     heading(s) passed alongside it.
    layer3_targets = (
        ("MAINTENANCE MANAGEMENT", ["Vehicle Registration Numbers of Records Examined"]),
        ("MASS MANAGEMENT", ["Vehicle Registration Numbers of Records Examined"]),
        ("FATIGUE MANAGEMENT", ["Driver / Scheduler Records Examined"]),
    )
    for heading, anchors in layer3_targets:
        person = join_value(paras.get(heading, ""))
        if person:
            set_layer3_name_after_management_heading(doc, heading, anchors, person)

    # 3) ACN digits
    op_info = data.get("Operator Information", {})
    acn_val = join_value(op_info.get("Australian Company Number", ""))
    if acn_val:
        fill_acn_digits(doc, acn_val)

    # 4) Vehicle tables
    maint = data.get("Vehicle Registration Numbers Maintenance", {})
    mass = data.get("Vehicle Registration Numbers Mass", {})
    t_m = find_vehicle_table(doc, "maintenance")
    if t_m and maint:
        fill_vehicle_table(t_m, "maintenance", maint)
    t_ms = find_mass_vehicle_numbers_table(doc)
    if t_ms and mass:
        fill_mass_vehicle_table_preserve_headers(t_ms, mass)

    # 5) Driver table
    drivers = data.get("Driver / Scheduler Records Examined", {})
    t_d = find_driver_table(doc)
    if t_d and drivers:
        fill_driver_table(t_d, drivers)

    # 6) Audit declaration date, located via its heading text
    decl = data.get("Audit Declaration dates", {})
    if decl.get("Audit was conducted on"):
        update_heading_followed_red(doc, "Audit was conducted on", decl["Audit was conducted on"])

    # 7) Operator declaration
    op_decl = data.get("Operator Declaration", {})
    if op_decl:
        fill_operator_declaration(
            doc,
            join_value(op_decl.get("Print Name", "")),
            join_value(op_decl.get("Position Title", "")),
        )

    # Make sure the auditor declaration table's header row holds labels.
    # NOTE(review): the flattened source does not show whether this call was
    # nested under `if op_decl:`; it is kept unconditional here - confirm.
    ensure_auditor_decl_headers(doc)

    # 8) Attendance list
    atts = data.get("Attendance List (Names and Position Titles)", {})
    att_val = atts.get("Attendance List (Names and Position Titles)")
    if att_val:
        fill_attendance_block(doc, att_val)

    # 9) Business summary: prefer the exact label, else the first value
    biz = data.get("Nature of the Operators Business (Summary)", {})
    if biz:
        val = biz.get("Nature of the Operators Business (Summary):") or next(iter(biz.values()), "")
        if val:
            update_business_summary_once(doc, val)

    # 10) Summary tables: overwrite the details cells from the JSON
    for section in (
        "Maintenance Management Summary",
        "Mass Management Summary",
        "Fatigue Management Summary",
    ):
        details = data.get(section, {})
        if details:
            overwrite_summary_details_cells(doc, section, details)

    doc.save(str(output_docx))
# ----------------------------- CLI -----------------------------
if __name__ == "__main__":
    import sys
    from pathlib import Path

    # Expect exactly three path arguments after the script name.
    if len(sys.argv) != 4:
        print("Usage: python updated_word.py <json> <template.docx> <output.docx>")
        sys.exit(1)

    # The arguments are classified by file suffix, so their order on the
    # command line only matters when both .docx files already exist.
    files = [Path(arg) for arg in sys.argv[1:4]]
    json_path = next((p for p in files if p.suffix.lower() == ".json"), None)
    docx_paths = [p for p in files if p.suffix.lower() == ".docx"]
    if not json_path or len(docx_paths) < 2:
        print("Error: provide one .json and two .docx (template + output).")
        sys.exit(1)

    # Template = the first .docx that already exists on disk (falling back
    # to the first .docx given); output = the other .docx.
    template_docx = next((p for p in docx_paths if p.exists()), docx_paths[0])
    output_docx = docx_paths[1] if docx_paths[0] == template_docx else docx_paths[0]

    run(json_path, template_docx, output_docx)