PDF-Data_Extractor / space-pdf /updated_word.py
Wasim
Sync: robust vehicle parser + full project
2e237ce
#!/usr/bin/env python3
# update_docx_from_json.py
import sys, json, re
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from docx import Document
from docx.shared import RGBColor, Pt # add Pt
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from copy import deepcopy
from docx.oxml.ns import qn
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
BLACK = RGBColor(0, 0, 0)
RED = RGBColor(0xFF, 0x00, 0x00)
# ----------------------------- text helpers -----------------------------
def _find_table_with_headers(doc: Document, must_have: list[str]) -> Optional[Table]:
for t in doc.tables:
if not t.rows:
continue
head = canon(" ".join(cell_text(c) for c in t.rows[0].cells))
if all(canon_label(x) in head for x in must_have):
return t
return None
def ensure_auditor_decl_headers(doc: Document) -> bool:
"""
Second-last page table under 'NHVAS APPROVED AUDITOR DECLARATION'.
Force the HEADER row to read exactly:
[ Print Name | NHVR or Exemplar Global Auditor Registration Number ]
Never touch the bottom (values) row.
"""
changed = False
expected_left = "Print Name"
expected_right = "NHVR or Exemplar Global Auditor Registration Number"
for t in doc.tables:
if not t.rows or not t.rows[0].cells:
continue
# must look like the auditor table: header left says "Print Name", 2+ cols, 2+ rows
head_left = canon_label(cell_text(t.rows[0].cells[0]))
if head_left == "print name" and len(t.rows[0].cells) >= 2 and len(t.rows) >= 2:
# fix left header if needed
if canon_label(cell_text(t.rows[0].cells[0])) != canon_label(expected_left) or \
any(is_red_run(r) for p in t.rows[0].cells[0].paragraphs for r in p.runs):
_set_cell_text_black(t.rows[0].cells[0], expected_left)
changed = True
# unconditionally set the RIGHT header text (this is where "Peter Sheppard" was sitting)
if canon_label(cell_text(t.rows[0].cells[1])) != canon_label(expected_right) or \
any(is_red_run(r) for p in t.rows[0].cells[1].paragraphs for r in p.runs):
_set_cell_text_black(t.rows[0].cells[1], expected_right)
changed = True
# found and fixed the table; no need to continue
break
return changed
def fill_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
"""Last page table: write values ONLY into the bottom row (red placeholders)."""
t = _find_table_with_headers(doc, ["Print Name", "Position Title"])
if not t or len(t.rows) < 2 or len(t.rows[0].cells) < 2:
return False
bot_left = t.rows[1].cells[0]
bot_right = t.rows[1].cells[1]
# only replace if that cell has a red placeholder
if any(is_red_run(r) for p in bot_left.paragraphs for r in p.runs):
_set_cell_text_black(bot_left, print_name)
if any(is_red_run(r) for p in bot_right.paragraphs for r in p.runs):
_set_cell_text_black(bot_right, position_title)
return True
def find_heading_index_from_end(doc: Document, heading: str) -> Optional[int]:
key = canon(heading)
allp = iter_paragraphs(doc)
for i in range(len(allp) - 1, -1, -1):
if key in canon(para_text(allp[i])):
return i
return None
def set_date_by_heading_from_end(doc: Document, heading: str, date_text: str, max_scan: int = 60) -> bool:
"""Find the LAST occurrence of `heading`, then replace the FIRST red run in the next paragraphs."""
if not date_text:
return False
allp = iter_paragraphs(doc)
idx = find_heading_index_from_end(doc, heading)
if idx is None:
return False
for p in allp[idx + 1 : min(idx + 1 + max_scan, len(allp))]:
if replace_red_in_paragraph(p, date_text): # writes in black
return True
return False
def set_date_by_paragraph_from_end(doc: Document, paragraph_text: str, date_text: str, max_scan: int = 60) -> bool:
"""Find the LAST paragraph matching `paragraph_text`, then set the FIRST red run after it."""
if not date_text:
return False
key = canon(paragraph_text)
allp = iter_paragraphs(doc)
hit = None
for i in range(len(allp) - 1, -1, -1):
if key in canon(para_text(allp[i])):
hit = i
break
if hit is None:
return False
# date placeholder is on the LAST page, right after this long paragraph
for p in allp[hit + 1 : min(hit + 1 + max_scan, len(allp))]:
if replace_red_in_paragraph(p, date_text): # writes in black
return True
return False
def set_layer3_name_after_management_heading(doc: Document, mid_heading: str, allowed_prev_titles: List[str], name: str) -> bool:
if not name:
return False
allp = iter_paragraphs(doc)
wrote = False
mid = canon(mid_heading)
allowed_prev = {canon(t) for t in allowed_prev_titles}
for i, p in enumerate(allp):
if canon(para_text(p)) != mid:
continue
# previous non-empty must be one of the allowed titles
j = i - 1
while j >= 0 and not nz(para_text(allp[j])):
j -= 1
if j < 0 or canon(para_text(allp[j])) not in allowed_prev:
continue
# next non-empty is the 3rd line we overwrite
k = i + 1
while k < len(allp) and not nz(para_text(allp[k])):
k += 1
if k >= len(allp):
continue
# compute target size from the middle heading; fall back to a sensible bump
target_size = _para_effective_font_size(allp[i]) or Pt(16)
_clear_para_and_write_black(allp[k], name)
# apply size to all runs explicitly (overrides style)
for r in allp[k].runs:
r.font.size = target_size
wrote = True
return wrote
def _para_effective_font_size(p: Paragraph):
# try explicit run sizes first
for r in p.runs:
if r.font.size:
return r.font.size
# then the paragraph style
if p.style and p.style.font and p.style.font.size:
return p.style.font.size
return None
# --- helpers for summary tables ---
# --- helpers for summary overwrite ---
def _std_key(s: str) -> str:
"""
Normalize a label to match a 'Std N' key.
e.g. 'Std 7. Internal Review' -> 'std 7'
"""
t = canon_label(s)
m = re.match(r"(std\s+\d+)", t)
return m.group(1) if m else t
def _looks_like_summary_table(table: Table) -> Optional[Tuple[int, int]]:
"""
Return (label_col_idx, details_col_idx) if this is a Summary table
with a DETAILS column; otherwise None.
"""
if not table.rows:
return None
first = table.rows[0]
cols = len(first.cells)
if cols < 2:
return None
# header texts for first row
head = [canon(cell_text(c)) for c in first.cells]
# find DETAILS column
details_col = None
for j, t in enumerate(head):
if "detail" in t:
details_col = j
break
if details_col is None:
return None
# find the label column (left-hand standards column)
label_col = None
for j, t in enumerate(head):
if any(k in t for k in ["maintenance management", "mass management", "fatigue management"]):
label_col = j
break
if label_col is None:
# fallback: assume the first non-DETAILS column is the label column
label_col = 0 if details_col != 0 else 1
return (label_col, details_col)
def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
"""Heuristically count header rows (stop when first data row like '1.' appears)."""
for i, row in enumerate(table.rows[:scan_up_to]):
first = cell_text(row.cells[0]).strip()
if re.match(r"^\d+\.?$", first):
return i
return 1
def _header_col_texts(table: Table, scan_rows: int = 5) -> List[str]:
scan_rows = min(scan_rows, len(table.rows))
if scan_rows == 0:
return []
# pick the row with the most cells as base
base_row = max(range(scan_rows), key=lambda i: len(table.rows[i].cells))
base_cols = len(table.rows[base_row].cells)
cols = []
for j in range(base_cols):
parts = []
for i in range(scan_rows):
row = table.rows[i]
if j < len(row.cells):
parts.append(cell_text(row.cells[j]))
cols.append(canon(" ".join(parts)))
return cols
def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
"""Header ends right before the first row whose 1st cell looks like '1.'"""
limit = min(scan_up_to, len(table.rows))
for i in range(limit):
first = cell_text(table.rows[i].cells[0]).strip()
if re.match(r"^\d+\.?$", first):
return i
# fallback to 1 header row
return 1
def map_cols_mass_strict(table: Table) -> Dict[str, int]:
cols = _header_col_texts(table, 5)
def first_col(*needles):
for j, t in enumerate(cols):
if all(n in t for n in needles):
return j
return None
idx = {
"no": first_col("no"),
"reg": first_col("registration", "number") or first_col("registration"),
"wv": first_col("weight", "verification"),
"rfs": first_col("rfs", "cert") or first_col("rfs", "certification"),
"susp": first_col("suspension", "maintenance"),
"trip": first_col("trip", "record"),
"frs": first_col("fault", "suspension") or first_col("fault", "reporting", "suspension"),
}
return {k: v for k, v in idx.items() if v is not None}
def find_mass_vehicle_numbers_table(doc: Document) -> Optional[Table]:
"""Pick the Mass vehicle-number table by matching its column set (not the Summary table)."""
best = None
best_score = -1
for t in iter_tables(doc):
cols = _header_col_texts(t, 5)
allhdr = " ".join(cols)
# must look like the vehicle numbers table
hits = 0
hits += int(any("registration" in c and "number" in c for c in cols))
hits += int(any("weight" in c and "verification" in c for c in cols))
hits += int(any("rfs" in c and ("cert" in c or "certification" in c) for c in cols))
hits += int(any("suspension" in c and "maintenance" in c for c in cols))
hits += int(any("trip" in c and "record" in c for c in cols))
hits += int(any("fault" in c and "suspension" in c for c in cols))
# reject obvious Summary tables
if "details" in allhdr:
continue
# prefer tables with numbering column and many rows
score = hits + (0.5 if any("no" == c or c.startswith("no ") for c in cols) else 0) + (len(t.rows) / 100.0)
if hits >= 4 and score > best_score:
best, best_score = t, score
return best
def update_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
"""
First try strict table label mapping for 'Print Name' and 'Position Title'.
If not found, fallback to the first two red placeholders under the 'Operator Declaration' heading.
"""
changed = False
# 1) Table label approach
for lbl, val in (("Print Name", print_name), ("Position Title", position_title)):
if not val:
continue
loc = find_label_cell(doc, lbl)
if not loc:
# tolerate odd spacing/colon/camelcase
for alt in ("PrintName", "Print Name", "Print Name:", "PositionTitle", "Position Title", "Position Title:"):
loc = find_label_cell(doc, alt)
if loc:
break
if loc:
t, r, c = loc
cell = get_adjacent_value_cell(t, r, c)
if not replace_red_in_cell(cell, val):
_set_cell_text_black(cell, val)
changed = True
if changed:
return True
# 2) Fallback: heading-scoped red placeholders
head = "OPERATOR DECLARATION"
p = find_heading_paragraph(doc, head) or find_heading_paragraph(doc, head.title())
if not p:
return False
allp = iter_paragraphs(doc)
try:
i = allp.index(p)
except ValueError:
i = 0
red_targets = []
for q in allp[i+1:i+1+20]:
reds = [r for r in q.runs if is_red_run(r)]
if reds:
red_targets.extend(reds)
if len(red_targets) >= 2:
break
wrote = False
if print_name and red_targets:
_set_text_and_black(red_targets[0], print_name); wrote = True
if position_title and len(red_targets) >= 2:
_set_text_and_black(red_targets[1], position_title); wrote = True
return wrote
def fill_mass_vehicle_table_preserve_headers(table: Table, arrays: Dict[str, List[str]]):
colmap = map_cols_mass_strict(table)
if "reg" not in colmap:
return
hdr_rows = count_header_rows(table, 6)
regs = arrays.get("Registration Number", [])
n = len(regs)
# clear data rows only
while len(table.rows) > hdr_rows:
table._tbl.remove(table.rows[-1]._tr)
# ensure enough rows
while len(table.rows) < hdr_rows + n:
table.add_row()
def put(row, key, arr_key, i):
if key in colmap:
vals = arrays.get(arr_key, [])
val = nz(vals[i]) if i < len(vals) else ""
replace_red_in_cell(row.cells[colmap[key]], val)
for i in range(n):
row = table.rows[hdr_rows + i]
replace_red_in_cell(row.cells[colmap["reg"]], nz(regs[i]))
put(row, "wv", "Weight Verification Records", i)
put(row, "rfs", "RFS Suspension Certification #", i)
put(row, "susp", "Suspension System Maintenance", i)
put(row, "trip", "Trip Records", i)
put(row, "frs", "Fault Recording/ Reporting on Suspension System", i)
def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
"""For a Summary table (Maintenance/Mass/Fatigue), replace the entire DETAILS cell
for each Std N row with the JSON text (written in black)."""
# build desired texts
desired: Dict[str, str] = { _std_key(k): join_value(v) for k, v in section_dict.items() }
# pick which tables belong to this section by header sniff
wanted_prefix = canon_label(section_name.split()[0]) # "maintenance" | "mass" | "fatigue"
updated = 0
for t in doc.tables:
cols = _looks_like_summary_table(t)
if not cols:
continue
label_col, details_col = cols
head_txt = table_header_text(t, up_to_rows=2)
if wanted_prefix not in head_txt: # keep to the correct section
continue
# walk body rows
for i in range(1, len(t.rows)):
row = t.rows[i]
key = _std_key(cell_text(row.cells[label_col]))
# exact match or "std N" prefix match
cand = desired.get(key)
if not cand:
m = re.match(r"(std\s+\d+)", key)
if m:
for k2, v2 in desired.items():
if k2.startswith(m.group(1)):
cand = v2
break
if not cand:
continue
_set_cell_text_black(row.cells[details_col], cand) # full overwrite, black
updated += 1
return updated
SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")
ORDINAL_DATE_PAT = re.compile(r"\b(\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4})\b", re.I)
def split_sentences_keep(text: str) -> List[str]:
s = " ".join(str(text or "").split())
if not s:
return []
out = []
start = 0
for m in SPLIT_SENT_PAT.finditer(s):
out.append(s[start:m.start()].strip())
start = m.end()
last = s[start:].strip()
if last:
out.append(last)
return out
_sent_split = re.compile(r'(?<=[.!?])\s+|\n+')
_date_pat = re.compile(r'\b(?:\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4}|\d{1,2}/\d{1,2}/\d{2,4}|[A-Za-z]+\s+\d{1,2},\s*\d{4})\b')
def extract_summary_snippets(desired_text: str):
sents = _sentences(desired_text)
dates = [m.group(0) for m in _date_pat.finditer(desired_text)]
pick = lambda rx: next((s for s in sents if re.search(rx, s, re.I)), None)
return {
"sheet_sent": pick(r'\b(daily\s+check|sheet)\b'),
"sheet_phrase": _extract_sheet_phrase_from_desired(desired_text),
"review": pick(r'\binternal\s+review\b'),
"qcs": pick(r'\bquarterly\b.*\bcompliance\b') or pick(r'\bquarterly\b'),
"dates": dates,
"sents": sents,
}
def fill_management_summary_tables(doc: Document, section_key: str, section_data: Dict[str, List[str]]):
"""
Fill ALL summary tables for the given section_key ('maintenance'|'mass'|'fatigue')
by matching each row label (left column) against keys in section_data and
patching only the red text inside the DETAILS cell.
"""
targets = [x for x in find_all_summary_tables(doc) if x[0] == section_key]
if not targets:
return
# build list of (normalized label, original label, desired_text)
desired = []
for label, vals in section_data.items():
want = canon_label(label)
if not want:
continue
desired.append((want, label, join_value(vals)))
for _, table, lcol, dcol in targets:
# iterate data rows (skip header)
for i in range(1, len(table.rows)):
left_txt_norm = canon_label(cell_text(table.rows[i].cells[lcol]))
if not left_txt_norm:
continue
for want_norm, _orig_lbl, value in desired:
# loose contains match handles minor punctuation differences
if want_norm and want_norm in left_txt_norm:
patch_details_cell_from_json(table.rows[i].cells[dcol], value)
def _set_text_and_black(run, new_text: str):
"""Replace a run's text and force color to black (clears theme color too)."""
if new_text is None:
new_text = ""
run.text = str(new_text)
run.font.color.rgb = BLACK
try:
# clear any theme color so rgb sticks
run.font.color.theme_color = None
except Exception:
pass
def update_business_summary_once(doc: Document, value) -> bool:
"""Replace only the red summary paragraph; keep 'Accreditation Number' and 'Expiry Date' lines."""
loc = (find_label_cell(doc, "Nature of the Operators Business (Summary)")
or find_label_cell(doc, "Nature of the Operators Business (Summary):"))
if not loc:
return False
t, r, c = loc
cell = get_adjacent_value_cell(t, r, c)
if not cell.paragraphs:
cell.add_paragraph("")
txt = join_value(value)
# find paragraphs with any red runs (the placeholders for the summary)
red_paras = [p for p in cell.paragraphs if any(is_red_run(run) for run in p.runs)]
if red_paras:
# write the summary into the first red paragraph (in black)
_clear_para_and_write_black(red_paras[0], txt)
# clear any extra red placeholders
for p in red_paras[1:]:
_clear_para_and_write_black(p, "")
else:
# no red placeholder found: just put the summary into the first paragraph, leave others
_clear_para_and_write_black(cell.paragraphs[0], txt)
return True
def _nuke_cell_paragraphs(cell: _Cell):
"""Remove ALL paragraphs from a cell (true delete, not just emptying runs)."""
for p in list(cell.paragraphs):
p._element.getparent().remove(p._element)
def _clear_para_and_write_black(paragraph, text: str):
"""Clear a whole paragraph and write fresh black text."""
# wipe existing runs
for r in list(paragraph.runs):
r.text = ""
r = paragraph.add_run(str(text or ""))
r.font.color.rgb = BLACK
try:
r.font.color.theme_color = None
except Exception:
pass
def _set_cell_text_black(cell, text: str):
"""Clear a table cell and insert black text."""
# remove text from all runs in all paragraphs
for p in cell.paragraphs:
for r in p.runs:
r.text = ""
p = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
r = p.add_run(str(text or ""))
r.font.color.rgb = BLACK
try:
r.font.color.theme_color = None
except Exception:
pass
def nz(x: Optional[str]) -> str:
return (x or "").strip()
def canon(s: str) -> str:
s = re.sub(r"\s+", " ", str(s)).strip().lower()
s = s.replace("–", "-").replace("—", "-")
return re.sub(r"[^a-z0-9/#()+,.\- ]+", "", s)
def canon_label(s: str) -> str:
# labels often vary by punctuation/casing; keep digits/letters
s = re.sub(r"\s+", " ", str(s)).strip().lower()
s = s.replace("–", "-").replace("—", "-")
s = re.sub(r"[^a-z0-9 ]+", " ", s)
return re.sub(r"\s+", " ", s).strip()
def join_value(value) -> str:
if isinstance(value, list):
# Keep multi-line when list provided
return "\n".join([str(v) for v in value if nz(v)])
return str(value)
def split_digits(s: str) -> List[str]:
return re.findall(r"\d", s)
def para_text(p: Paragraph) -> str:
return "".join(run.text for run in p.runs)
def cell_text(c: _Cell) -> str:
return "\n".join(para_text(p) for p in c.paragraphs)
def is_red_run(run) -> bool:
col = run.font.color
if not col:
return False
if col.rgb is not None:
return col.rgb == RED
# Some templates use theme colors; treat explicit red text snippets only
return False
def replace_red_in_paragraph(p: Paragraph, new_text: str) -> bool:
replaced = False
red_runs = [r for r in p.runs if is_red_run(r)]
if not red_runs:
return False
# collapse all red runs into one and write value (in black)
first = red_runs[0]
_set_text_and_black(first, new_text)
for r in red_runs[1:]:
r.text = ""
replaced = True
return replaced
def replace_red_in_cell(cell: _Cell, new_text: str) -> bool:
# replace only red runs; if none, replace whole cell with a single run (fallback)
any_red = False
for p in cell.paragraphs:
if replace_red_in_paragraph(p, new_text):
any_red = True
if any_red:
return True
# fallback: clear cell, set single paragraph text in black
_set_cell_text_black(cell, new_text)
return True
def parse_attendance_lines(value) -> List[str]:
"""
Parse strings like:
"Peter Sheppard - Compliance Greg Dyer - Auditor"
into:
["Peter Sheppard - Compliance", "Greg Dyer - Auditor"]
Handles lists, newlines, semicolons, and pipes too.
"""
if isinstance(value, list):
s = " ".join(str(v) for v in value if v)
else:
s = str(value or "")
s = re.sub(r"\s+", " ", s).strip()
if not s:
return []
# First split on explicit separators; then within each chunk, extract Name - Title pairs.
chunks = re.split(r"\s*[\n;|]\s*", s)
items: List[str] = []
pair_pat = re.compile(
r"([A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3})\s*-\s*"
r"([^-\n]+?)(?=\s+[A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3}\s*-\s*|$)"
)
for chunk in chunks:
chunk = chunk.strip()
if not chunk:
continue
found = False
for m in pair_pat.finditer(chunk):
name = m.group(1).strip()
title = m.group(2).strip()
items.append(f"{name} - {title}")
found = True
if not found:
# Fallback: single "Name - Title"
if " - " in chunk:
a, b = chunk.split(" - ", 1)
items.append(f"{a.strip()} - {b.strip()}")
elif chunk:
items.append(chunk)
return items
def fill_attendance_block(doc: Document, value) -> bool:
items = parse_attendance_lines(value)
if not items:
return False
loc = find_label_cell(doc, "Attendance List (Names and Position Titles)")
if not loc:
return False
t, r, c = loc
# value cell: usually directly under the heading cell
target = (
t.rows[r + 1].cells[c]
if r + 1 < len(t.rows) and c < len(t.rows[r + 1].cells)
else get_adjacent_value_cell(t, r, c)
)
# ---- read ONLY the target cell (don’t touch the row)
def is_red_para(p): return any(is_red_run(run) for run in p.runs)
def looks_like_pair(s: str) -> bool:
if " - " not in s: return False
a, b = s.split(" - ", 1)
return bool(a.strip()) and bool(b.strip())
paras = list(target.paragraphs)
red_count = sum(1 for p in paras if is_red_para(p))
existing_black = [para_text(p).strip() for p in paras
if (not is_red_para(p)) and looks_like_pair(para_text(p))]
# compose final lines
out_lines: List[str] = []
out_lines.extend(items[:red_count]) # replace red placeholders
out_lines.extend(existing_black) # keep black lines
norm = lambda s: re.sub(r"\s+", " ", s.strip().lower())
seen = {norm(x) for x in out_lines}
for extra in items[red_count:]:
k = norm(extra)
if k not in seen:
out_lines.append(extra); seen.add(k)
# ---- hard clear target cell and write fresh (all black)
_nuke_cell_paragraphs(target)
# first line
p = target.add_paragraph()
_clear_para_and_write_black(p, out_lines[0] if out_lines else "")
# remaining lines
for line in out_lines[1:]:
p = target.add_paragraph()
_clear_para_and_write_black(p, line)
return True
# ----------------------------- document search -----------------------------
def iter_tables(doc: Document) -> List[Table]:
return list(doc.tables)
def iter_paragraphs(doc: Document) -> List[Paragraph]:
# paragraphs at doc level + inside tables
out = list(doc.paragraphs)
for t in doc.tables:
for row in t.rows:
for cell in row.cells:
out.extend(cell.paragraphs)
return out
def find_heading_paragraph(doc: Document, heading_text: str, window: int = 60) -> Optional[Paragraph]:
key = canon(heading_text)
for p in iter_paragraphs(doc):
if canon(para_text(p)).startswith(key):
return p
# fuzzy contains
for p in iter_paragraphs(doc):
if key in canon(para_text(p)):
return p
return None
def find_label_cell_in_table(table: Table, label: str) -> Optional[Tuple[int, int]]:
target = canon_label(label)
for r_i, row in enumerate(table.rows):
for c_i, cell in enumerate(row.cells):
if canon_label(cell_text(cell)) == target:
return (r_i, c_i)
# allow contains (safe-ish)
for r_i, row in enumerate(table.rows):
for c_i, cell in enumerate(row.cells):
if target and target in canon_label(cell_text(cell)):
return (r_i, c_i)
return None
def find_label_cell(doc: Document, label: str) -> Optional[Tuple[Table, int, int]]:
for t in iter_tables(doc):
pos = find_label_cell_in_table(t, label)
if pos:
return (t, pos[0], pos[1])
return None
def get_adjacent_value_cell(table: Table, r: int, c: int) -> _Cell:
# Prefer right cell, otherwise next row same col, otherwise this cell
cols = len(table.rows[0].cells)
if c + 1 < cols:
return table.rows[r].cells[c+1]
if r + 1 < len(table.rows):
return table.rows[r+1].cells[c]
return table.rows[r].cells[c]
# ----------------------------- label/value updates -----------------------------
def update_label_value_in_tables(doc: Document, label: str, value) -> bool:
tup = find_label_cell(doc, label)
val = join_value(value)
if not tup:
return False
t, r, c = tup
target_cell = get_adjacent_value_cell(t, r, c)
return replace_red_in_cell(target_cell, val)
def update_heading_followed_red(doc: Document, heading: str, value, max_scan: int = 12) -> bool:
"""Find heading paragraph, then replace the first red run found within next N paragraphs (including inside tables)"""
start = find_heading_paragraph(doc, heading)
if not start:
return False
# Build a linear list of paragraphs across whole doc to get an index
allp = iter_paragraphs(doc)
try:
idx = allp.index(start)
except ValueError:
idx = 0
new_text = join_value(value)
# Scan forward
for p in allp[idx+1: idx+1+max_scan]:
if replace_red_in_paragraph(p, new_text):
return True
# Also check any red in table cells inside this paragraph's parent (already covered via iter_paragraphs)
return False
# ----------------------------- ACN per-digit fill -----------------------------
def fill_acn_digits(doc: Document, acn_value: str) -> bool:
digits = split_digits(acn_value)
if not digits:
return False
loc = find_label_cell(doc, "Australian Company Number")
if not loc:
return False
t, r, c = loc
# Collect cells to the RIGHT in the same row first
targets: List[_Cell] = [t.rows[r].cells[j] for j in range(c + 1, len(t.rows[r].cells))]
# If not enough, continue row-by-row below (left→right)
rr = r + 1
while len(targets) < len(digits) and rr < len(t.rows):
targets.extend(list(t.rows[rr].cells))
rr += 1
targets = targets[:len(digits)]
if not targets:
return False
# Clear each target cell and write ONE digit in black
for d, cell in zip(digits, targets):
_set_cell_text_black(cell, d)
return True
# ----------------------------- vehicle tables -----------------------------
def table_header_text(table: Table, up_to_rows: int = 3) -> str:
heads = []
for i, row in enumerate(table.rows[:up_to_rows]):
for cell in row.cells:
heads.append(cell_text(cell))
return canon(" ".join(heads))
def find_vehicle_table(doc: Document, want: str) -> Optional[Table]:
"""
want = "maintenance" or "mass"
"""
MAINT_KEYS = ["registration number", "maintenance records", "daily checks", "fault recording", "fault repair"]
MASS_KEYS = ["registration number", "weight verification", "rfs suspension", "suspension system maintenance", "trip records", "reporting on suspension"]
candidates = []
for t in iter_tables(doc):
htxt = table_header_text(t)
if want == "maintenance":
if all(k in htxt for k in ["registration", "maintenance", "fault"]) and "suspension" not in htxt:
candidates.append(t)
elif want == "mass":
if "suspension" in htxt and "weight" in htxt:
candidates.append(t)
# Prefer the one with most rows
if not candidates:
return None
return max(candidates, key=lambda tb: len(tb.rows))
def map_cols(table: Table, want: str) -> Dict[str, int]:
# map header columns by keywords from the first 2 rows that contain headers
header_rows = table.rows[:2]
col_texts = []
cols = len(table.rows[0].cells)
for j in range(cols):
txt = " ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells))
col_texts.append(canon(txt))
idx = {}
def first_col(*needles) -> Optional[int]:
for j, t in enumerate(col_texts):
if all(n in t for n in needles):
return j
return None
if want == "maintenance":
idx["reg"] = first_col("registration")
idx["rw"] = first_col("roadworthiness")
idx["mr"] = first_col("maintenance", "records")
idx["daily"] = first_col("daily", "check")
idx["fr"] = first_col("fault", "recording")
idx["rep"] = first_col("fault", "repair")
else:
idx["reg"] = first_col("registration")
idx["wv"] = first_col("weight", "verification")
idx["rfs"] = first_col("rfs", "cert")
idx["susp"] = first_col("suspension", "maintenance")
idx["trip"] = first_col("trip", "record")
idx["frs"] = first_col("fault", "suspension")
return {k:v for k,v in idx.items() if v is not None}
def clear_data_rows_keep_headers(table: Table, header_rows: int = 1):
# Keep first header_rows, drop everything else
while len(table.rows) > header_rows:
table._tbl.remove(table.rows[-1]._tr)
def ensure_rows(table: Table, need_rows: int):
# assumes 1 header row; add rows to reach need_rows + 1 total
while len(table.rows) < need_rows + 1:
table.add_row()
def fill_vehicle_table(table: Table, want: str, arrays: Dict[str, List[str]]):
colmap = map_cols(table, want)
if "reg" not in colmap:
return
if want == "maintenance":
regs = arrays.get("Registration Number", [])
rw = arrays.get("Roadworthiness Certificates", [])
mr = arrays.get("Maintenance Records", [])
daily= arrays.get("Daily Checks", [])
fr = arrays.get("Fault Recording/ Reporting", [])
rep = arrays.get("Fault Repair", [])
n = len(regs)
# keep header row(s), then fill N rows
clear_data_rows_keep_headers(table, header_rows=1)
ensure_rows(table, n)
for i in range(n):
row = table.rows[i+1]
def put(col_key, vals):
if col_key not in colmap or i >= len(vals): return
c = row.cells[colmap[col_key]]
replace_red_in_cell(c, nz(vals[i]))
# write each col
c_reg = row.cells[colmap["reg"]]; replace_red_in_cell(c_reg, nz(regs[i]))
put("rw", rw)
put("mr", mr)
put("daily",daily)
put("fr", fr)
put("rep", rep)
else:
regs = arrays.get("Registration Number", [])
wv = arrays.get("Weight Verification Records", [])
rfs = arrays.get("RFS Suspension Certification #", [])
susp = arrays.get("Suspension System Maintenance", [])
trip = arrays.get("Trip Records", [])
frs = arrays.get("Fault Recording/ Reporting on Suspension System", [])
n = len(regs)
clear_data_rows_keep_headers(table, header_rows=1)
ensure_rows(table, n)
for i in range(n):
row = table.rows[i+1]
def put(col_key, vals):
if col_key not in colmap or i >= len(vals): return
c = row.cells[colmap[col_key]]
replace_red_in_cell(c, nz(vals[i]))
c_reg = row.cells[colmap["reg"]]; replace_red_in_cell(c_reg, nz(regs[i]))
put("wv", wv)
put("rfs", rfs)
put("susp", susp)
put("trip", trip)
put("frs", frs)
# ----------------------------- driver table -----------------------------
def find_driver_table(doc: Document) -> Optional[Table]:
for t in iter_tables(doc):
h = table_header_text(t)
if "driver / scheduler" in h and ("fit for duty" in h or "work diary" in h):
return t
return None
def map_driver_cols(table: Table) -> Dict[str,int]:
header_rows = table.rows[:2]
cols = len(table.rows[0].cells)
col_texts = []
for j in range(cols):
txt = " ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells))
col_texts.append(canon(txt))
idx = {}
def first_col(*needles):
for j, t in enumerate(col_texts):
if all(n in t for n in needles):
return j
return None
idx["name"] = first_col("driver", "name")
idx["roster"]= first_col("roster", "safe")
idx["fit"] = first_col("fit for duty")
# Work diary might be split across two headers; match "work diary" OR "electronic work diary"
wd = first_col("work diary") or first_col("electronic work diary")
if wd is not None: idx["wd"] = wd
return {k:v for k,v in idx.items() if v is not None}
def fill_driver_table(table: Table, arrays: Dict[str, List[str]]):
colmap = map_driver_cols(table)
if not colmap:
return
names = arrays.get("Driver / Scheduler Name", [])
rosters = arrays.get("Roster / Schedule / Safe Driving Plan (Date Range)", [])
fit = arrays.get("Fit for Duty Statement Completed (Yes/No)", [])
wd = arrays.get("Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)", [])
n = max(len(rosters), len(fit), len(wd), len(names))
clear_data_rows_keep_headers(table, header_rows=1)
ensure_rows(table, n)
has_any_name = any(str(x).strip() for x in names)
for i in range(n):
row = table.rows[i+1]
if "name" in colmap and has_any_name:
replace_red_in_cell(row.cells[colmap["name"]], names[i] if i < len(names) else "")
if "roster" in colmap:
replace_red_in_cell(row.cells[colmap["roster"]], rosters[i] if i < len(rosters) else "")
if "fit" in colmap:
replace_red_in_cell(row.cells[colmap["fit"]], fit[i] if i < len(fit) else "")
if "wd" in colmap:
replace_red_in_cell(row.cells[colmap["wd"]], wd[i] if i < len(wd) else "")
# ----------------------------- main mapping -----------------------------
def flatten_simple_sections(data: Dict) -> Dict[str, str]:
"""Collect simple label->single value mappings from top-level sections other than tables."""
out = {}
skip_sections = {
"Vehicle Registration Numbers Maintenance",
"Vehicle Registration Numbers Mass",
"Driver / Scheduler Records Examined",
"paragraphs",
"Attendance List (Names and Position Titles)",
"Nature of the Operators Business (Summary)",
"Maintenance Management Summary",
"Mass Management Summary",
"Fatigue Management Summary",
}
for sec, kv in data.items():
if sec in skip_sections: continue
if not isinstance(kv, dict): continue
for label, val in kv.items():
out[f"{sec}::{label}"] = join_value(val)
return out
def run(input_json: Path, template_docx: Path, output_docx: Path):
with open(input_json, "r", encoding="utf-8") as f:
data = json.load(f)
doc = Document(str(template_docx))
# 1) simple label/value tables
simple = flatten_simple_sections(data)
# Map by (section::label). We try: (a) find exact label cell somewhere and write in the adjacent cell;
# (b) if not found, search by heading then the next red run below the heading.
for k, v in simple.items():
# use the part after '::' as the label
label = k.split("::", 1)[1] if "::" in k else k
# SPECIAL: skip ACN here; we'll fill per-digit later
if canon_label(label) == "australian company number":
continue
ok = update_label_value_in_tables(doc, label, v)
if not ok:
sec = k.split("::", 1)[0] if "::" in k else k
update_heading_followed_red(doc, sec, v)
# 2) paragraphs block
paras = data.get("paragraphs", {})
# 2a) generic headings → replace next red (skip the 3 management headings here)
# third-line headings above the three tables
for head in ("MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"):
name_val = join_value(paras.get(head, ""))
if name_val:
update_heading_followed_red(doc, head, name_val, max_scan=6)
# 2b) the 3-layer headings → overwrite the 3rd line only
# second-last page: date under page heading
aud_head = "NHVAS APPROVED AUDITOR DECLARATION"
aud_date = join_value(paras.get(aud_head, ""))
if aud_date:
set_date_by_heading_from_end(doc, aud_head, aud_date, max_scan=40)
# last page: date under the long acknowledgement paragraph
ack_head = ("I hereby acknowledge and agree with the findings detailed in this NHVAS Audit Summary Report. "
"I have read and understand the conditions applicable to the Scheme, including the NHVAS Business Rules and Standards.")
ack_date = join_value(paras.get(ack_head, ""))
if ack_date:
set_date_by_paragraph_from_end(doc, ack_head, ack_date, max_scan=40)
maint_name = join_value(paras.get("MAINTENANCE MANAGEMENT", ""))
if maint_name:
set_layer3_name_after_management_heading(
doc,
"MAINTENANCE MANAGEMENT",
["Vehicle Registration Numbers of Records Examined"],
maint_name,
)
mass_name = join_value(paras.get("MASS MANAGEMENT", ""))
if mass_name:
set_layer3_name_after_management_heading(
doc,
"MASS MANAGEMENT",
["Vehicle Registration Numbers of Records Examined"],
mass_name,
)
fat_name = join_value(paras.get("FATIGUE MANAGEMENT", ""))
if fat_name:
set_layer3_name_after_management_heading(
doc,
"FATIGUE MANAGEMENT",
["Driver / Scheduler Records Examined"],
fat_name,
)
# 3) ACN digits
op_info = data.get("Operator Information", {})
acn_val = join_value(op_info.get("Australian Company Number", ""))
if acn_val:
fill_acn_digits(doc, acn_val)
# 4) Vehicle tables
maint = data.get("Vehicle Registration Numbers Maintenance", {})
mass = data.get("Vehicle Registration Numbers Mass", {})
t_m = find_vehicle_table(doc, "maintenance")
if t_m and maint:
fill_vehicle_table(t_m, "maintenance", maint)
t_ms = find_mass_vehicle_numbers_table(doc)
if t_ms and mass:
fill_mass_vehicle_table_preserve_headers(t_ms, mass)
# 5) Driver table
drivers = data.get("Driver / Scheduler Records Examined", {})
t_d = find_driver_table(doc)
if t_d and drivers:
fill_driver_table(t_d, drivers)
# 6) Special: Audit Declaration dates via heading
decl = data.get("Audit Declaration dates", {})
if decl.get("Audit was conducted on"):
update_heading_followed_red(doc, "Audit was conducted on", decl["Audit was conducted on"])
# 7) Operator Declaration (last page, bottom row only), and fix Auditor table header
op_decl = data.get("Operator Declaration", {})
if op_decl:
fill_operator_declaration(
doc,
join_value(op_decl.get("Print Name", "")),
join_value(op_decl.get("Position Title", "")),
)
# make sure the second-last page “NHVAS APPROVED AUDITOR DECLARATION” header row is labels
ensure_auditor_decl_headers(doc)
# 8) Attendance List
# Attendance: replace red lines only
atts = data.get("Attendance List (Names and Position Titles)", {})
att_val = atts.get("Attendance List (Names and Position Titles)")
if att_val:
fill_attendance_block(doc, att_val)
# 9) Nature of the Operators Business (Summary): write once (no duplicates)
biz = data.get("Nature of the Operators Business (Summary)", {})
if biz:
val = biz.get("Nature of the Operators Business (Summary):") or next(iter(biz.values()), "")
if val:
update_business_summary_once(doc, val)
# 10) Summary tables: FULL OVERWRITE of DETAILS from JSON
mm_sum = data.get("Maintenance Management Summary", {})
if mm_sum:
overwrite_summary_details_cells(doc, "Maintenance Management Summary", mm_sum)
mass_sum = data.get("Mass Management Summary", {})
if mass_sum:
overwrite_summary_details_cells(doc, "Mass Management Summary", mass_sum)
fat_sum = data.get("Fatigue Management Summary", {})
if fat_sum:
overwrite_summary_details_cells(doc, "Fatigue Management Summary", fat_sum)
doc.save(str(output_docx))
# ----------------------------- CLI -----------------------------
if __name__ == "__main__":
import sys
from pathlib import Path
if len(sys.argv) != 4:
print("Usage: python updated_word.py <json> <template.docx> <output.docx>")
sys.exit(1)
a, b, c = map(Path, sys.argv[1:4])
files = [a, b, c]
json_path = next((p for p in files if p.suffix.lower() == ".json"), None)
docx_paths = [p for p in files if p.suffix.lower() == ".docx"]
if not json_path or len(docx_paths) < 2:
print("Error: provide one .json and two .docx (template + output).")
sys.exit(1)
# Template = the .docx that already exists; Output = the other .docx
template_docx = next((p for p in docx_paths if p.exists()), docx_paths[0])
output_docx = docx_paths[1] if docx_paths[0] == template_docx else docx_paths[0]
run(json_path, template_docx, output_docx)