Spaces:

Shami96
/

PDF-Data_Extractor

Running

PDF-Data_Extractor / space-pdf /updated_word.py

Wasim

Sync: robust vehicle parser + full project

2e237ce 11 days ago

45 kB

	#!/usr/bin/env python3
	# update_docx_from_json.py
	import sys, json, re
	from pathlib import Path
	from typing import Dict, List, Tuple, Optional
	from docx import Document
	from docx.shared import RGBColor, Pt # add Pt
	from docx.table import _Cell, Table
	from docx.text.paragraph import Paragraph
	from copy import deepcopy
	from docx.oxml.ns import qn
	from docx.oxml.table import CT_Tbl
	from docx.oxml.text.paragraph import CT_P

	# Colour applied to every value this module writes into the document.
	BLACK = RGBColor(0, 0, 0)
	# Exact RGB that identifies a placeholder run (see is_red_run).
	RED = RGBColor(0xFF, 0x00, 0x00)

	# ----------------------------- text helpers -----------------------------
	def _find_table_with_headers(doc: Document, must_have: list[str]) -> Optional[Table]:
	    """Return the first table whose first row mentions every label in `must_have`.

	    The first row is flattened with `canon`; each wanted label is normalised
	    with `canon_label` and looked up as a substring. Returns None when no
	    table qualifies.
	    """
	    wanted = [canon_label(label) for label in must_have]
	    for table in doc.tables:
	        if not table.rows:
	            continue
	        header = canon(" ".join(cell_text(cell) for cell in table.rows[0].cells))
	        if all(label in header for label in wanted):
	            return table
	    return None

	def ensure_auditor_decl_headers(doc: Document) -> bool:
	    """Force the header row of the auditor declaration table to its fixed wording.

	    The table is recognised by a first-row left cell reading "Print Name",
	    at least two columns and at least two rows. Each of the two header cells
	    is rewritten in black to
	    [ Print Name | NHVR or Exemplar Global Auditor Registration Number ]
	    when its text differs or it contains a red run. The values row is never
	    touched. Only the first matching table is processed.

	    A table whose right header reads "Position Title" is skipped: that is the
	    shape `fill_operator_declaration` looks up, and rewriting it would
	    destroy that table's header.

	    Returns True if any header cell was rewritten.
	    """
	    expected_left = "Print Name"
	    expected_right = "NHVR or Exemplar Global Auditor Registration Number"
	    changed = False

	    for t in doc.tables:
	        if not t.rows or not t.rows[0].cells:
	            continue
	        header = t.rows[0].cells
	        if canon_label(cell_text(header[0])) != "print name":
	            continue
	        if len(header) < 2 or len(t.rows) < 2:
	            continue
	        if canon_label(cell_text(header[1])) == "position title":
	            # operator declaration table, not the auditor one
	            continue
	        for cell, expected in ((header[0], expected_left), (header[1], expected_right)):
	            has_red = any(is_red_run(r) for p in cell.paragraphs for r in p.runs)
	            if has_red or canon_label(cell_text(cell)) != canon_label(expected):
	                _set_cell_text_black(cell, expected)
	                changed = True
	        break  # found and fixed the table; no need to continue

	    return changed


	def fill_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
	    """Write name and title into the second row of the 'Print Name / Position Title' table.

	    A cell is overwritten (in black) only when it still holds a red run.
	    Returns False when the table is missing or smaller than 2x2, True
	    otherwise, whether or not anything was written.
	    """
	    table = _find_table_with_headers(doc, ["Print Name", "Position Title"])
	    if not table or len(table.rows) < 2 or len(table.rows[0].cells) < 2:
	        return False

	    values_row = table.rows[1]
	    for cell, value in ((values_row.cells[0], print_name), (values_row.cells[1], position_title)):
	        still_placeholder = any(is_red_run(run) for para in cell.paragraphs for run in para.runs)
	        if still_placeholder:
	            _set_cell_text_black(cell, value)
	    return True

	def find_heading_index_from_end(doc: Document, heading: str) -> Optional[int]:
	    """Return the index in `iter_paragraphs(doc)` of the last paragraph containing `heading`.

	    Both sides are compared after `canon` normalisation. Returns None when
	    no paragraph contains the heading.
	    """
	    needle = canon(heading)
	    paragraphs = iter_paragraphs(doc)
	    for pos in reversed(range(len(paragraphs))):
	        if needle in canon(para_text(paragraphs[pos])):
	            return pos
	    return None

	def set_date_by_heading_from_end(doc: Document, heading: str, date_text: str, max_scan: int = 60) -> bool:
	    """Write `date_text` over the first red run found after the LAST `heading`.

	    At most `max_scan` paragraphs following the heading are examined; the
	    replacement is written in black. Returns True once a paragraph was
	    patched, False for an empty date, a missing heading or no red run.
	    """
	    if not date_text:
	        return False
	    paragraphs = iter_paragraphs(doc)
	    heading_pos = find_heading_index_from_end(doc, heading)
	    if heading_pos is None:
	        return False
	    first = heading_pos + 1
	    window = paragraphs[first : min(first + max_scan, len(paragraphs))]
	    # any() stops at the first paragraph that reports a replacement
	    return any(replace_red_in_paragraph(para, date_text) for para in window)

	def set_date_by_paragraph_from_end(doc: Document, paragraph_text: str, date_text: str, max_scan: int = 60) -> bool:
	    """Write `date_text` over the first red run after the LAST paragraph containing `paragraph_text`.

	    Matching is a `canon`-normalised substring test. Up to `max_scan`
	    paragraphs after the hit are examined. Returns True once a paragraph was
	    patched, otherwise False.
	    """
	    if not date_text:
	        return False
	    needle = canon(paragraph_text)
	    paragraphs = iter_paragraphs(doc)
	    anchor = next(
	        (pos for pos in range(len(paragraphs) - 1, -1, -1)
	         if needle in canon(para_text(paragraphs[pos]))),
	        None,
	    )
	    if anchor is None:
	        return False
	    first = anchor + 1
	    window = paragraphs[first : min(first + max_scan, len(paragraphs))]
	    return any(replace_red_in_paragraph(para, date_text) for para in window)

	def set_layer3_name_after_management_heading(doc: Document, mid_heading: str, allowed_prev_titles: List[str], name: str) -> bool:
	    """Overwrite the line that follows a three-line title block with `name`.

	    For every paragraph equal (after `canon`) to `mid_heading` whose nearest
	    non-empty predecessor is one of `allowed_prev_titles`, the next non-empty
	    paragraph is cleared and rewritten in black with `name`. The new runs get
	    the font size of the middle heading, or 16 pt when none can be found.

	    Returns True if at least one paragraph was rewritten.
	    """
	    if not name:
	        return False

	    paragraphs = iter_paragraphs(doc)
	    total = len(paragraphs)
	    middle = canon(mid_heading)
	    permitted = {canon(title) for title in allowed_prev_titles}
	    wrote = False

	    for pos, para in enumerate(paragraphs):
	        if canon(para_text(para)) != middle:
	            continue

	        # nearest non-empty paragraph above must be an allowed title
	        above = pos - 1
	        while above >= 0 and not nz(para_text(paragraphs[above])):
	            above -= 1
	        if above < 0 or canon(para_text(paragraphs[above])) not in permitted:
	            continue

	        # nearest non-empty paragraph below is the one to overwrite
	        below = pos + 1
	        while below < total and not nz(para_text(paragraphs[below])):
	            below += 1
	        if below >= total:
	            continue

	        size = _para_effective_font_size(para) or Pt(16)
	        target = paragraphs[below]
	        _clear_para_and_write_black(target, name)
	        # explicit run size overrides whatever the style says
	        for run in target.runs:
	            run.font.size = size
	        wrote = True

	    return wrote

	def _para_effective_font_size(p: Paragraph):
	    """Return the first explicit run font size, else the paragraph style's size, else None."""
	    explicit = next((run.font.size for run in p.runs if run.font.size), None)
	    if explicit:
	        return explicit
	    if p.style and p.style.font and p.style.font.size:
	        return p.style.font.size
	    return None

	# --- helpers for summary tables ---
	# --- helpers for summary overwrite ---
	def _std_key(s: str) -> str:
	    """Reduce a label to its leading 'std N' key.

	    'Std 7. Internal Review' -> 'std 7'. Labels that do not start with
	    'std <number>' come back as their full `canon_label` form.
	    """
	    label = canon_label(s)
	    hit = re.match(r"(std\s+\d+)", label)
	    if hit is None:
	        return label
	    return hit.group(1)

	def _looks_like_summary_table(table: Table) -> Optional[Tuple[int, int]]:
	    """Classify `table` as a Summary table by its first row.

	    Returns (label_col_idx, details_col_idx) when the first row has at least
	    two cells and one of them contains 'detail'; otherwise None. The label
	    column is the first header naming a management section, falling back to
	    the first column that is not the DETAILS column.
	    """
	    if not table.rows:
	        return None
	    header_cells = table.rows[0].cells
	    if len(header_cells) < 2:
	        return None

	    head = [canon(cell_text(cell)) for cell in header_cells]

	    details_col = next((j for j, text in enumerate(head) if "detail" in text), None)
	    if details_col is None:
	        return None

	    section_words = ["maintenance management", "mass management", "fatigue management"]
	    label_col = next(
	        (j for j, text in enumerate(head) if any(word in text for word in section_words)),
	        None,
	    )
	    if label_col is None:
	        label_col = 1 if details_col == 0 else 0

	    return (label_col, details_col)
	def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
	    """Return the index of the first row whose first cell is a bare number ('1' or '1.').

	    Only the first `scan_up_to` rows are inspected; 1 is returned when none
	    of them looks like a data row.

	    NOTE(review): a second `count_header_rows` with the same signature is
	    defined later in this module and replaces this one at import time.
	    """
	    leading_rows = table.rows[:scan_up_to]
	    return next(
	        (pos for pos, row in enumerate(leading_rows)
	         if re.match(r"^\d+\.?$", cell_text(row.cells[0]).strip())),
	        1,
	    )
	def _header_col_texts(table: Table, scan_rows: int = 5) -> List[str]:
	    """Return one `canon`-normalised header string per column.

	    The first `scan_rows` rows are stacked column-wise; the widest of those
	    rows decides how many columns are reported. An empty table yields [].
	    """
	    depth = min(scan_rows, len(table.rows))
	    if depth == 0:
	        return []
	    stacked = [table.rows[i].cells for i in range(depth)]
	    width = max(len(cells) for cells in stacked)
	    return [
	        canon(" ".join(cell_text(cells[j]) for cells in stacked if j < len(cells)))
	        for j in range(width)
	    ]

	def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
	    """Return how many rows precede the first row whose first cell looks like '1' or '1.'.

	    At most `scan_up_to` rows are inspected; falls back to 1 header row.
	    """
	    rows_to_check = min(scan_up_to, len(table.rows))
	    return next(
	        (pos for pos in range(rows_to_check)
	         if re.match(r"^\d+\.?$", cell_text(table.rows[pos].cells[0]).strip())),
	        1,
	    )

	def map_cols_mass_strict(table: Table) -> Dict[str, int]:
	    """Map logical Mass-table column keys to column indexes.

	    Headers come from `_header_col_texts(table, 5)`. Each key lists one or
	    more needle sets tried in order; the first column containing every
	    needle of a set wins. Keys with no matching column are omitted.
	    """
	    cols = _header_col_texts(table, 5)

	    def first_col(*needle_sets):
	        # Explicit None handling: column index 0 is a valid hit and must not
	        # fall through to the next needle set the way `a or b` would.
	        for needles in needle_sets:
	            for j, text in enumerate(cols):
	                if all(n in text for n in needles):
	                    return j
	        return None

	    idx = {
	        "no": first_col(("no",)),
	        "reg": first_col(("registration", "number"), ("registration",)),
	        "wv": first_col(("weight", "verification")),
	        "rfs": first_col(("rfs", "cert"), ("rfs", "certification")),
	        "susp": first_col(("suspension", "maintenance")),
	        "trip": first_col(("trip", "record")),
	        "frs": first_col(("fault", "suspension"), ("fault", "reporting", "suspension")),
	    }
	    return {k: v for k, v in idx.items() if v is not None}

	def find_mass_vehicle_numbers_table(doc: Document) -> Optional[Table]:
	    """Pick the Mass vehicle-number table by its column set (never a Summary table).

	    A table needs at least four of the six expected column headers. Among
	    qualifying tables the best score wins: header hits, plus 0.5 for a
	    numbering column, plus row count / 100. Tables with 'details' in the
	    header are rejected. Returns None when nothing qualifies.
	    """
	    best, best_score = None, -1
	    for table in iter_tables(doc):
	        cols = _header_col_texts(table, 5)

	        def col_has(*needles):
	            return any(all(n in c for n in needles) for c in cols)

	        hits = sum((
	            int(col_has("registration", "number")),
	            int(col_has("weight", "verification")),
	            int(any("rfs" in c and ("cert" in c or "certification" in c) for c in cols)),
	            int(col_has("suspension", "maintenance")),
	            int(col_has("trip", "record")),
	            int(col_has("fault", "suspension")),
	        ))
	        # reject obvious Summary tables
	        if "details" in " ".join(cols):
	            continue
	        numbering_bonus = 0.5 if any(c == "no" or c.startswith("no ") for c in cols) else 0
	        score = hits + numbering_bonus + (len(table.rows) / 100.0)
	        if hits >= 4 and score > best_score:
	            best, best_score = table, score
	    return best

	def update_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
	    """Fill 'Print Name' and 'Position Title' of the operator declaration.

	    1) Table approach: locate each label cell and write the value into the
	       adjacent value cell (red runs are replaced, otherwise the cell is
	       overwritten in black).
	    2) Fallback, only when step 1 wrote nothing: take the first two red runs
	       within 20 paragraphs after the 'OPERATOR DECLARATION' heading.

	    Returns True if anything was written.
	    """
	    changed = False
	    # 1) Table label approach
	    for lbl, val in (("Print Name", print_name), ("Position Title", position_title)):
	        if not val:
	            continue
	        # find_label_cell compares canon_label forms, so case, colons and
	        # spacing are already tolerated; the only extra spelling worth trying
	        # is the run-together one. Alternates are restricted to THIS label so
	        # a name can never be written into the title cell.
	        loc = find_label_cell(doc, lbl) or find_label_cell(doc, lbl.replace(" ", ""))
	        if not loc:
	            continue
	        t, r, c = loc
	        cell = get_adjacent_value_cell(t, r, c)
	        if not replace_red_in_cell(cell, val):
	            _set_cell_text_black(cell, val)
	        changed = True

	    if changed:
	        return True

	    # 2) Fallback: heading-scoped red placeholders
	    p = find_heading_paragraph(doc, "OPERATOR DECLARATION")
	    if not p:
	        return False
	    allp = iter_paragraphs(doc)
	    # Paragraph proxies are rebuilt on every traversal, so list.index() would
	    # never find `p`; locate the heading by its underlying XML element.
	    i = next((k for k, q in enumerate(allp) if q._element is p._element), None)
	    if i is None:
	        return False

	    red_targets = []
	    for q in allp[i + 1 : i + 1 + 20]:
	        red_targets.extend(r for r in q.runs if is_red_run(r))
	        if len(red_targets) >= 2:
	            break

	    wrote = False
	    if print_name and red_targets:
	        _set_text_and_black(red_targets[0], print_name)
	        wrote = True
	    if position_title and len(red_targets) >= 2:
	        _set_text_and_black(red_targets[1], position_title)
	        wrote = True
	    return wrote


	def fill_mass_vehicle_table_preserve_headers(table: Table, arrays: Dict[str, List[str]]):
	    """Rebuild the data rows of the Mass vehicle table, leaving its header rows intact.

	    Header rows are counted with `count_header_rows`; every row below them is
	    removed and one row per registration number is added. Columns absent
	    from the header map are skipped; values missing from a shorter array are
	    written as ''. Does nothing when no registration column is found.
	    """
	    colmap = map_cols_mass_strict(table)
	    if "reg" not in colmap:
	        return
	    hdr_rows = count_header_rows(table, 6)
	    regs = arrays.get("Registration Number", [])
	    wanted_rows = hdr_rows + len(regs)

	    # drop existing data rows, then grow to the required size
	    while len(table.rows) > hdr_rows:
	        table._tbl.remove(table.rows[-1]._tr)
	    while len(table.rows) < wanted_rows:
	        table.add_row()

	    optional_columns = (
	        ("wv", "Weight Verification Records"),
	        ("rfs", "RFS Suspension Certification #"),
	        ("susp", "Suspension System Maintenance"),
	        ("trip", "Trip Records"),
	        ("frs", "Fault Recording/ Reporting on Suspension System"),
	    )

	    for i in range(len(regs)):
	        row = table.rows[hdr_rows + i]
	        replace_red_in_cell(row.cells[colmap["reg"]], nz(regs[i]))
	        for key, arr_key in optional_columns:
	            if key not in colmap:
	                continue
	            vals = arrays.get(arr_key, [])
	            text = nz(vals[i]) if i < len(vals) else ""
	            replace_red_in_cell(row.cells[colmap[key]], text)

	def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
	    """Replace the whole DETAILS cell of each 'Std N' row in a section's Summary tables.

	    `section_dict` maps row labels to the text to write; labels are reduced
	    to their 'std N' key with `_std_key`. Only Summary tables whose first two
	    rows mention the first word of `section_name` ("maintenance" | "mass" |
	    "fatigue") are touched. Text is written in black.

	    Rows are matched on the exact normalised key only: a prefix test would
	    let a 'std 1' row pick up the text meant for 'std 10'.

	    Returns the number of cells overwritten.
	    """
	    words = section_name.split()
	    if not words:
	        return 0
	    wanted_prefix = canon_label(words[0])

	    desired: Dict[str, str] = {_std_key(k): join_value(v) for k, v in section_dict.items()}

	    updated = 0
	    for t in doc.tables:
	        cols = _looks_like_summary_table(t)
	        if not cols:
	            continue
	        label_col, details_col = cols

	        if wanted_prefix not in table_header_text(t, up_to_rows=2):
	            continue  # belongs to another section

	        # walk body rows (row 0 is the header)
	        for i in range(1, len(t.rows)):
	            cells = t.rows[i].cells
	            if max(label_col, details_col) >= len(cells):
	                continue  # row narrower than the header
	            text = desired.get(_std_key(cell_text(cells[label_col])))
	            if not text:
	                continue
	            _set_cell_text_black(cells[details_col], text)  # full overwrite, black
	            updated += 1
	    return updated

	# Whitespace that directly follows sentence-ending punctuation.
	SPLIT_SENT_PAT = re.compile(r"(?<=[.?!])\s+")
	# Dates written with an ordinal day, e.g. "3rd March 2021".
	ORDINAL_DATE_PAT = re.compile(r"\b(\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4})\b", re.I)

	def split_sentences_keep(text: str) -> List[str]:
	    """Split `text` into sentences, keeping each sentence's closing punctuation.

	    Whitespace (including newlines) is collapsed to single spaces first.
	    None or blank input yields []. Text without '.', '?' or '!' comes back
	    as a single item.
	    """
	    s = " ".join(str(text or "").split())
	    if not s:
	        return []
	    # s holds only single spaces, so the pieces need no further stripping
	    return [piece for piece in SPLIT_SENT_PAT.split(s) if piece]

	# Sentence boundary: whitespace after '.', '!' or '?', or a run of newlines.
	_sent_split = re.compile(r'(?<=[.!?])\s+|\n+')
	# "3rd March 2021", "1/2/2023" or "March 3, 2021".
	_date_pat = re.compile(r'\b(?:\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4}|\d{1,2}/\d{1,2}/\d{2,4}|[A-Za-z]+\s+\d{1,2},\s*\d{4})\b')

	def extract_summary_snippets(desired_text: str):
	    """Pull the reusable pieces out of a summary text.

	    Returns a dict with:
	      sheet_sent   - first sentence mentioning 'daily check' or 'sheet'
	      sheet_phrase - result of `_extract_sheet_phrase_from_desired`
	      review       - first sentence mentioning 'internal review'
	      qcs          - first sentence with 'quarterly ... compliance', else 'quarterly'
	      dates        - every date found by `_date_pat`
	      sents        - all sentences from `_sentences`
	    Sentence entries are None when nothing matches.
	    """
	    sents = _sentences(desired_text)
	    dates = [m.group(0) for m in _date_pat.finditer(desired_text)]

	    def pick(rx):
	        # first sentence matching rx, case-insensitively
	        return next((s for s in sents if re.search(rx, s, re.I)), None)

	    return {
	        "sheet_sent": pick(r'\b(daily\s+check|sheet)\b'),
	        "sheet_phrase": _extract_sheet_phrase_from_desired(desired_text),
	        "review": pick(r'\binternal\s+review\b'),
	        "qcs": pick(r'\bquarterly\b.*\bcompliance\b') or pick(r'\bquarterly\b'),
	        "dates": dates,
	        "sents": sents,
	    }

	def fill_management_summary_tables(doc: Document, section_key: str, section_data: Dict[str, List[str]]):
	    """Patch the DETAILS cells of every summary table belonging to `section_key`.

	    `section_key` is 'maintenance', 'mass' or 'fatigue'. Each data row's
	    label (left column) is compared with the normalised keys of
	    `section_data`; when a key is contained in the row label, the DETAILS
	    cell is handed to `patch_details_cell_from_json` with the joined value.
	    Row 0 of each table is treated as the header.
	    """
	    targets = [entry for entry in find_all_summary_tables(doc) if entry[0] == section_key]
	    if not targets:
	        return

	    # (normalised label, text to write) for every usable key
	    wanted = [
	        (canon_label(label), join_value(vals))
	        for label, vals in section_data.items()
	        if canon_label(label)
	    ]

	    for _, table, lcol, dcol in targets:
	        for row_idx in range(1, len(table.rows)):
	            row_label = canon_label(cell_text(table.rows[row_idx].cells[lcol]))
	            if not row_label:
	                continue
	            for key, text in wanted:
	                # loose contains match tolerates minor punctuation differences
	                if key in row_label:
	                    patch_details_cell_from_json(table.rows[row_idx].cells[dcol], text)

	def _set_text_and_black(run, new_text: str):
	    """Replace a run's text and give it an explicit black colour.

	    None is written as ''. Assigning `rgb` replaces the run's colour
	    definition, so any theme colour is dropped with it. `theme_color` is
	    deliberately NOT reset to None afterwards: python-docx documents that
	    assignment as removing the colour definition, which would strip the
	    black that was just written.
	    """
	    if new_text is None:
	        new_text = ""
	    run.text = str(new_text)
	    run.font.color.rgb = BLACK

	def update_business_summary_once(doc: Document, value) -> bool:
	    """Write the business summary next to its label, touching only red paragraphs.

	    The summary goes into the first paragraph of the value cell that holds a
	    red run; further red paragraphs are emptied and all other paragraphs
	    (e.g. 'Accreditation Number', 'Expiry Date') are left alone. Without any
	    red paragraph the first paragraph is overwritten. Returns False when the
	    label cell cannot be found.
	    """
	    loc = find_label_cell(doc, "Nature of the Operators Business (Summary)")
	    if not loc:
	        loc = find_label_cell(doc, "Nature of the Operators Business (Summary):")
	    if not loc:
	        return False

	    table, row_idx, col_idx = loc
	    cell = get_adjacent_value_cell(table, row_idx, col_idx)
	    if not cell.paragraphs:
	        cell.add_paragraph("")

	    summary = join_value(value)
	    placeholders = [p for p in cell.paragraphs if any(is_red_run(run) for run in p.runs)]

	    if not placeholders:
	        _clear_para_and_write_black(cell.paragraphs[0], summary)
	        return True

	    first, *leftovers = placeholders
	    _clear_para_and_write_black(first, summary)
	    for para in leftovers:
	        _clear_para_and_write_black(para, "")
	    return True


	def _nuke_cell_paragraphs(cell: _Cell):
	    """Detach every paragraph element from `cell` (real removal, not just emptied runs)."""
	    # snapshot first: the paragraph list changes as elements are removed
	    for para in tuple(cell.paragraphs):
	        node = para._element
	        node.getparent().remove(node)

	def _clear_para_and_write_black(paragraph, text: str):
	    """Empty every run of `paragraph`, then append `text` as one black run.

	    Existing runs are kept (with empty text) so their formatting elements
	    stay in place. A falsy `text` is written as ''. The colour is set through
	    `rgb` only: resetting `theme_color` to None afterwards would remove the
	    colour definition that was just written.
	    """
	    for old_run in list(paragraph.runs):
	        old_run.text = ""
	    new_run = paragraph.add_run(str(text or ""))
	    new_run.font.color.rgb = BLACK

	def _set_cell_text_black(cell, text: str):
	    """Blank every run in `cell`, then write `text` as one black run in its first paragraph.

	    A paragraph is added when the cell has none. A falsy `text` is written
	    as ''. The colour is set through `rgb` only: resetting `theme_color` to
	    None afterwards would remove the colour definition that was just
	    written.
	    """
	    for para in cell.paragraphs:
	        for old_run in para.runs:
	            old_run.text = ""
	    first_para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
	    new_run = first_para.add_run(str(text or ""))
	    new_run.font.color.rgb = BLACK

	def nz(x: Optional[str]) -> str:
	    """Return `x` without surrounding whitespace; None becomes ''.

	    Non-string values (e.g. numbers loaded from JSON) are converted with
	    str() instead of raising AttributeError.
	    """
	    if x is None:
	        return ""
	    return str(x).strip()

	def canon(s: str) -> str:
	    """Normalise text for matching.

	    Collapses whitespace, lower-cases, turns en/em dashes into '-', and drops
	    every character other than a-z, 0-9, space and '/#()+,.-'.
	    """
	    collapsed = re.sub(r"\s+", " ", str(s))
	    lowered = collapsed.strip().lower()
	    dashed = lowered.translate(str.maketrans("–—", "--"))
	    return re.sub(r"[^a-z0-9/#()+,.\- ]+", "", dashed)

	def canon_label(s: str) -> str:
	    """Normalise a label down to lower-case letters, digits and single spaces.

	    Punctuation (including dashes) becomes a space, so 'Print Name:' and
	    'print  name' compare equal.
	    """
	    collapsed = re.sub(r"\s+", " ", str(s))
	    lowered = collapsed.strip().lower()
	    dashed = lowered.translate(str.maketrans("–—", "--"))
	    spaced = re.sub(r"[^a-z0-9 ]+", " ", dashed)
	    return re.sub(r"\s+", " ", spaced).strip()

	def join_value(value) -> str:
	    """Turn a JSON value into the text to write into the document.

	    A list becomes one line per item; None and blank items are dropped and
	    non-string items are converted with str(). None on its own yields ''
	    rather than the literal text 'None'. Anything else is returned as
	    str(value).
	    """
	    if value is None:
	        return ""
	    if isinstance(value, list):
	        # Keep multi-line when list provided
	        parts = [str(item) for item in value if item is not None]
	        return "\n".join(part for part in parts if part.strip())
	    return str(value)

	def split_digits(s: str) -> List[str]:
	    """Return every digit character of `s`, in order.

	    None yields []; non-string values (e.g. an ACN stored as a JSON number)
	    are converted with str() first.
	    """
	    if s is None:
	        return []
	    return re.findall(r"\d", str(s))

	def para_text(p: Paragraph) -> str:
	    """Return the paragraph's text as the concatenation of its runs' text."""
	    pieces = [run.text for run in p.runs]
	    return "".join(pieces)

	def cell_text(c: _Cell) -> str:
	    """Return the cell's text, one line per paragraph."""
	    lines = [para_text(para) for para in c.paragraphs]
	    return "\n".join(lines)

	def is_red_run(run) -> bool:
	    """Return True only when the run carries an explicit RGB colour equal to RED.

	    Runs without a colour, or coloured through a theme colour only, are not
	    treated as red.
	    """
	    colour = run.font.color
	    if colour and colour.rgb is not None:
	        return colour.rgb == RED
	    return False

	def replace_red_in_paragraph(p: Paragraph, new_text: str) -> bool:
	    """Collapse all red runs of `p` into the first one and write `new_text` there in black.

	    The remaining red runs are emptied. Returns False, changing nothing,
	    when the paragraph has no red run.
	    """
	    reds = [run for run in p.runs if is_red_run(run)]
	    if not reds:
	        return False
	    head, *tail = reds
	    _set_text_and_black(head, new_text)
	    for leftover in tail:
	        leftover.text = ""
	    return True

	def replace_red_in_cell(cell: _Cell, new_text: str) -> bool:
	    """Write `new_text` into `cell`, preferring its red placeholders.

	    Every paragraph with red runs gets the text; when no paragraph has any,
	    the whole cell is cleared and rewritten in black. Always returns True.
	    """
	    # build the full list so every paragraph is visited (no short-circuit)
	    patched = [replace_red_in_paragraph(para, new_text) for para in cell.paragraphs]
	    if not any(patched):
	        _set_cell_text_black(cell, new_text)
	    return True

	def parse_attendance_lines(value) -> List[str]:
	    """Split an attendance value into 'Name - Title' entries.

	    "Peter Sheppard - Compliance Greg Dyer - Auditor"
	    becomes
	    ["Peter Sheppard - Compliance", "Greg Dyer - Auditor"]

	    Lists are joined with spaces first and all whitespace (including
	    newlines) is collapsed. ';' and '|' act as explicit separators, with or
	    without surrounding spaces. A chunk with no recognisable pair is kept
	    as-is. Empty input yields [].
	    """
	    if isinstance(value, list):
	        s = " ".join(str(v) for v in value if v)
	    else:
	        s = str(value or "")
	    s = re.sub(r"\s+", " ", s).strip()
	    if not s:
	        return []

	    # A name is one to four capitalised words.
	    name = r"[A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3}"
	    # A title runs lazily up to the next "Name - " or the end of the chunk.
	    pair_pat = re.compile(
	        rf"({name})\s-\s"
	        rf"([^-\n]+?)(?=\s+{name}\s-\s|$)"
	    )

	    items: List[str] = []
	    for chunk in re.split(r"\s*[;|]\s*", s):
	        chunk = chunk.strip()
	        if not chunk:
	            continue
	        pairs = [
	            f"{m.group(1).strip()} - {m.group(2).strip()}"
	            for m in pair_pat.finditer(chunk)
	        ]
	        if pairs:
	            items.extend(pairs)
	        elif " - " in chunk:
	            # Fallback: single "Name - Title"
	            a, b = chunk.split(" - ", 1)
	            items.append(f"{a.strip()} - {b.strip()}")
	        else:
	            items.append(chunk)

	    return items

	def fill_attendance_block(doc: Document, value) -> bool:
	    """Rewrite the attendance cell with the parsed 'Name - Title' lines.

	    The target is the cell directly under the 'Attendance List' label when
	    one exists, otherwise the adjacent value cell. Red paragraphs are
	    replaced by the first parsed items, existing non-red 'Name - Title'
	    lines are kept, and remaining parsed items are appended unless already
	    present (case- and whitespace-insensitive). The cell is rebuilt from
	    scratch in black. Returns False when there is nothing to write or the
	    label is missing.
	    """
	    items = parse_attendance_lines(value)
	    if not items:
	        return False

	    loc = find_label_cell(doc, "Attendance List (Names and Position Titles)")
	    if not loc:
	        return False
	    t, r, c = loc

	    # value cell: usually directly under the heading cell
	    has_cell_below = r + 1 < len(t.rows) and c < len(t.rows[r + 1].cells)
	    target = t.rows[r + 1].cells[c] if has_cell_below else get_adjacent_value_cell(t, r, c)

	    def has_red(para) -> bool:
	        return any(is_red_run(run) for run in para.runs)

	    def is_pair(text: str) -> bool:
	        left, sep, right = text.partition(" - ")
	        return bool(sep) and bool(left.strip()) and bool(right.strip())

	    def norm(text: str) -> str:
	        return re.sub(r"\s+", " ", text.strip().lower())

	    # read ONLY the target cell
	    red_count = 0
	    kept_lines: List[str] = []
	    for para in list(target.paragraphs):
	        if has_red(para):
	            red_count += 1
	        elif is_pair(para_text(para)):
	            kept_lines.append(para_text(para).strip())

	    # placeholders first, then surviving lines, then any new extras
	    out_lines: List[str] = items[:red_count] + kept_lines
	    seen = {norm(line) for line in out_lines}
	    for extra in items[red_count:]:
	        key = norm(extra)
	        if key not in seen:
	            out_lines.append(extra)
	            seen.add(key)

	    # hard clear, then one fresh black paragraph per line (at least one)
	    _nuke_cell_paragraphs(target)
	    for line in (out_lines or [""]):
	        _clear_para_and_write_black(target.add_paragraph(), line)

	    return True

	# ----------------------------- document search -----------------------------
	def iter_tables(doc: Document) -> List[Table]:
	    """Return the tables exposed by `doc.tables` as a new list."""
	    return [table for table in doc.tables]

	def iter_paragraphs(doc: Document) -> List[Paragraph]:
	    """Return the document-level paragraphs followed by the paragraphs of every table cell.

	    Table paragraphs come after all body paragraphs, in table / row / cell
	    order, so the list is not in reading order.
	    """
	    collected = list(doc.paragraphs)
	    collected.extend(
	        para
	        for table in doc.tables
	        for row in table.rows
	        for cell in row.cells
	        for para in cell.paragraphs
	    )
	    return collected

	def find_heading_paragraph(doc: Document, heading_text: str, window: int = 60) -> Optional[Paragraph]:
	    """Return the first paragraph that starts with `heading_text`, else the first that contains it.

	    Comparison uses `canon` on both sides. `window` is accepted but not
	    used. Returns None when nothing matches.
	    """
	    key = canon(heading_text)
	    normalised = [(para, canon(para_text(para))) for para in iter_paragraphs(doc)]
	    # pass 1: prefix match
	    for para, text in normalised:
	        if text.startswith(key):
	            return para
	    # pass 2: fuzzy contains
	    for para, text in normalised:
	        if key in text:
	            return para
	    return None

	def find_label_cell_in_table(table: Table, label: str) -> Optional[Tuple[int, int]]:
	    """Return (row, col) of the cell holding `label`, or None.

	    An exact `canon_label` match anywhere in the table wins; otherwise the
	    first cell that contains the normalised label is returned. An empty
	    normalised label never matches by containment.
	    """
	    target = canon_label(label)
	    labelled = [
	        (r_i, c_i, canon_label(cell_text(cell)))
	        for r_i, row in enumerate(table.rows)
	        for c_i, cell in enumerate(row.cells)
	    ]
	    for r_i, c_i, text in labelled:
	        if text == target:
	            return (r_i, c_i)
	    if target:
	        for r_i, c_i, text in labelled:
	            if target in text:
	                return (r_i, c_i)
	    return None

	def find_label_cell(doc: Document, label: str) -> Optional[Tuple[Table, int, int]]:
	    """Return (table, row, col) for the first table containing `label`, or None."""
	    hits = (
	        (table, find_label_cell_in_table(table, label))
	        for table in iter_tables(doc)
	    )
	    for table, pos in hits:
	        if pos:
	            return (table, pos[0], pos[1])
	    return None

	def get_adjacent_value_cell(table: Table, r: int, c: int) -> _Cell:
	    """Return the cell that holds the value for the label at (r, c).

	    Preference order: the cell to the right, then the cell below in the same
	    column, then the label cell itself. Widths are taken from the rows
	    actually indexed (row `r` and row `r + 1`), not from row 0, so a row
	    with a different cell count cannot cause an IndexError.
	    """
	    row_cells = table.rows[r].cells
	    if c + 1 < len(row_cells):
	        return row_cells[c + 1]
	    if r + 1 < len(table.rows):
	        below = table.rows[r + 1].cells
	        if c < len(below):
	            return below[c]
	    return row_cells[c]

	# ----------------------------- label/value updates -----------------------------
	def update_label_value_in_tables(doc: Document, label: str, value) -> bool:
	    """Write `value` into the cell adjacent to the first cell matching `label`.

	    Returns False when the label is not found in any table; otherwise the
	    result of `replace_red_in_cell`.
	    """
	    location = find_label_cell(doc, label)
	    text = join_value(value)
	    if not location:
	        return False
	    table, row_idx, col_idx = location
	    return replace_red_in_cell(get_adjacent_value_cell(table, row_idx, col_idx), text)

	def update_heading_followed_red(doc: Document, heading: str, value, max_scan: int = 12) -> bool:
	    """Replace the first red run found within `max_scan` paragraphs after `heading`.

	    Paragraphs are taken from `iter_paragraphs`, so table cells are included.
	    Returns True once a paragraph was patched (text written in black), False
	    when the heading is missing or no red run follows it.
	    """
	    start = find_heading_paragraph(doc, heading)
	    if not start:
	        return False
	    allp = iter_paragraphs(doc)
	    # Paragraph proxies are rebuilt on every traversal, so list.index() would
	    # never find `start` and the scan would begin at the top of the document;
	    # locate the heading by its underlying XML element instead.
	    idx = next((k for k, p in enumerate(allp) if p._element is start._element), None)
	    if idx is None:
	        return False
	    new_text = join_value(value)
	    # Scan forward; any() stops at the first paragraph that was patched
	    return any(
	        replace_red_in_paragraph(p, new_text)
	        for p in allp[idx + 1 : idx + 1 + max_scan]
	    )

	# ----------------------------- ACN per-digit fill -----------------------------
	def fill_acn_digits(doc: Document, acn_value: str) -> bool:
	    """Write the ACN one digit per cell, starting right of the 'Australian Company Number' label.

	    Cells to the right of the label are used first; if they do not suffice,
	    whole rows below are appended left to right. Each used cell is cleared
	    and receives one digit in black. Returns False when there are no digits,
	    no label, or no target cell.
	    """
	    digits = split_digits(acn_value)
	    if not digits:
	        return False
	    loc = find_label_cell(doc, "Australian Company Number")
	    if not loc:
	        return False
	    table, row_idx, col_idx = loc

	    # cells to the RIGHT of the label in the same row
	    targets: List[_Cell] = list(table.rows[row_idx].cells[col_idx + 1:])

	    # not enough? continue with the rows below
	    next_row = row_idx + 1
	    while len(targets) < len(digits) and next_row < len(table.rows):
	        targets.extend(table.rows[next_row].cells)
	        next_row += 1

	    targets = targets[:len(digits)]
	    if not targets:
	        return False

	    for cell, digit in zip(targets, digits):
	        _set_cell_text_black(cell, digit)
	    return True


	# ----------------------------- vehicle tables -----------------------------
	def table_header_text(table: Table, up_to_rows: int = 3) -> str:
	    """Return the `canon`-normalised text of all cells in the first `up_to_rows` rows."""
	    texts = [
	        cell_text(cell)
	        for row in table.rows[:up_to_rows]
	        for cell in row.cells
	    ]
	    return canon(" ".join(texts))

	def find_vehicle_table(doc: Document, want: str) -> Optional[Table]:
	    """Return the vehicle table for `want` ("maintenance" or "mass"), or None.

	    A maintenance table mentions registration, maintenance and fault but not
	    suspension; a mass table mentions both suspension and weight. When
	    several tables qualify, the one with the most rows is returned.
	    """
	    def qualifies(header: str) -> bool:
	        if want == "maintenance":
	            mentions_all = all(k in header for k in ["registration", "maintenance", "fault"])
	            return mentions_all and "suspension" not in header
	        if want == "mass":
	            return "suspension" in header and "weight" in header
	        return False

	    candidates = [t for t in iter_tables(doc) if qualifies(table_header_text(t))]
	    if not candidates:
	        return None
	    # Prefer the one with most rows
	    return max(candidates, key=lambda tb: len(tb.rows))

	def map_cols(table: Table, want: str) -> Dict[str, int]:
	    """Map logical column keys to indexes using the first two rows as headers.

	    `want == "maintenance"` selects the maintenance key set; any other value
	    selects the mass key set. Keys whose needles match no column are
	    omitted.
	    """
	    header_rows = table.rows[:2]
	    col_texts = [
	        canon(" ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells)))
	        for j in range(len(table.rows[0].cells))
	    ]

	    def first_col(*needles) -> Optional[int]:
	        return next(
	            (j for j, text in enumerate(col_texts) if all(n in text for n in needles)),
	            None,
	        )

	    if want == "maintenance":
	        spec = {
	            "reg": ("registration",),
	            "rw": ("roadworthiness",),
	            "mr": ("maintenance", "records"),
	            "daily": ("daily", "check"),
	            "fr": ("fault", "recording"),
	            "rep": ("fault", "repair"),
	        }
	    else:
	        spec = {
	            "reg": ("registration",),
	            "wv": ("weight", "verification"),
	            "rfs": ("rfs", "cert"),
	            "susp": ("suspension", "maintenance"),
	            "trip": ("trip", "record"),
	            "frs": ("fault", "suspension"),
	        }

	    found = {key: first_col(*needles) for key, needles in spec.items()}
	    return {key: col for key, col in found.items() if col is not None}

	def clear_data_rows_keep_headers(table: Table, header_rows: int = 1):
	    """Remove rows from the bottom of `table` until only `header_rows` rows remain."""
	    surplus = len(table.rows) - header_rows
	    for _ in range(max(surplus, 0)):
	        table._tbl.remove(table.rows[-1]._tr)

	def ensure_rows(table: Table, need_rows: int):
	    """Append rows until the table has `need_rows` rows plus one header row."""
	    missing = (need_rows + 1) - len(table.rows)
	    for _ in range(missing):
	        table.add_row()

	def fill_vehicle_table(table: Table, want: str, arrays: Dict[str, List[str]]):
	    """Rebuild a vehicle table with one row per registration number.

	    `want == "maintenance"` fills the maintenance columns; any other value
	    fills the mass columns. Everything below the first row is dropped before
	    filling. A column is written only when it exists in the header map and
	    its array has a value for that row. Does nothing when no registration
	    column is found.
	    """
	    colmap = map_cols(table, want)
	    if "reg" not in colmap:
	        return

	    if want == "maintenance":
	        optional = (
	            ("rw", "Roadworthiness Certificates"),
	            ("mr", "Maintenance Records"),
	            ("daily", "Daily Checks"),
	            ("fr", "Fault Recording/ Reporting"),
	            ("rep", "Fault Repair"),
	        )
	    else:
	        optional = (
	            ("wv", "Weight Verification Records"),
	            ("rfs", "RFS Suspension Certification #"),
	            ("susp", "Suspension System Maintenance"),
	            ("trip", "Trip Records"),
	            ("frs", "Fault Recording/ Reporting on Suspension System"),
	        )

	    regs = arrays.get("Registration Number", [])
	    columns = [(key, arrays.get(arr_key, [])) for key, arr_key in optional]

	    # keep the header row, then provide one row per registration
	    clear_data_rows_keep_headers(table, header_rows=1)
	    ensure_rows(table, len(regs))

	    for i in range(len(regs)):
	        row = table.rows[i + 1]
	        replace_red_in_cell(row.cells[colmap["reg"]], nz(regs[i]))
	        for key, vals in columns:
	            if key in colmap and i < len(vals):
	                replace_red_in_cell(row.cells[colmap[key]], nz(vals[i]))

	# ----------------------------- driver table -----------------------------
	def find_driver_table(doc: Document) -> Optional[Table]:
	    """Return the first table whose header mentions 'driver / scheduler' plus 'fit for duty' or 'work diary'."""
	    def is_driver_header(header: str) -> bool:
	        return "driver / scheduler" in header and ("fit for duty" in header or "work diary" in header)

	    return next(
	        (table for table in iter_tables(doc) if is_driver_header(table_header_text(table))),
	        None,
	    )

	def map_driver_cols(table: Table) -> Dict[str, int]:
	    """Map the driver table's logical columns to indexes using the first two rows as headers.

	    Keys: 'name', 'roster', 'fit', 'wd'. Keys whose needles match no column
	    are omitted.
	    """
	    header_rows = table.rows[:2]
	    cols = len(table.rows[0].cells)
	    col_texts = []
	    for j in range(cols):
	        txt = " ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells))
	        col_texts.append(canon(txt))

	    def first_col(*needles):
	        for j, t in enumerate(col_texts):
	            if all(n in t for n in needles):
	                return j
	        return None

	    idx = {
	        "name": first_col("driver", "name"),
	        "roster": first_col("roster", "safe"),
	        "fit": first_col("fit for duty"),
	        # "work diary" also covers "electronic work diary" (substring), so one
	        # lookup suffices. Chaining a second lookup with `or` would discard a
	        # legitimate hit in column 0, because 0 is falsy.
	        "wd": first_col("work diary"),
	    }
	    return {k: v for k, v in idx.items() if v is not None}

	def fill_driver_table(table: Table, arrays: Dict[str, List[str]]):
	    """Rebuild the driver table with one row per record.

	    The row count is the longest of the four input arrays; shorter arrays
	    are padded with ''. The name column is written only when at least one
	    name is non-blank. Everything below the first row is dropped before
	    filling. Does nothing when no column could be mapped.
	    """
	    colmap = map_driver_cols(table)
	    if not colmap:
	        return

	    names = arrays.get("Driver / Scheduler Name", [])
	    rosters = arrays.get("Roster / Schedule / Safe Driving Plan (Date Range)", [])
	    fit = arrays.get("Fit for Duty Statement Completed (Yes/No)", [])
	    wd = arrays.get("Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)", [])

	    row_count = max(len(rosters), len(fit), len(wd), len(names))
	    clear_data_rows_keep_headers(table, header_rows=1)
	    ensure_rows(table, row_count)

	    has_any_name = any(str(x).strip() for x in names)
	    columns = (("name", names), ("roster", rosters), ("fit", fit), ("wd", wd))

	    for i in range(row_count):
	        row = table.rows[i + 1]
	        for key, vals in columns:
	            if key not in colmap:
	                continue
	            if key == "name" and not has_any_name:
	                continue
	            replace_red_in_cell(row.cells[colmap[key]], vals[i] if i < len(vals) else "")



	# ----------------------------- main mapping -----------------------------
	def flatten_simple_sections(data: Dict) -> Dict[str, str]:
	    """Flatten the simple top-level sections into ``"section::label" -> value`` pairs.

	    Sections that are filled by dedicated code paths (vehicle/driver tables,
	    paragraphs, attendance, business summary and the three management
	    summaries) are skipped, as is any section whose value is not a dict.
	    Each remaining value is passed through ``join_value``.
	    """
	    handled_elsewhere = frozenset((
	        "Vehicle Registration Numbers Maintenance",
	        "Vehicle Registration Numbers Mass",
	        "Driver / Scheduler Records Examined",
	        "paragraphs",
	        "Attendance List (Names and Position Titles)",
	        "Nature of the Operators Business (Summary)",
	        "Maintenance Management Summary",
	        "Mass Management Summary",
	        "Fatigue Management Summary",
	    ))
	    return {
	        f"{section}::{label}": join_value(value)
	        for section, fields in data.items()
	        if section not in handled_elsewhere and isinstance(fields, dict)
	        for label, value in fields.items()
	    }

	def run(input_json: Path, template_docx: Path, output_docx: Path):
	    """Fill the Word template with the values from the extracted JSON and save it.

	    Args:
	        input_json: Path of the UTF-8 JSON file holding the extracted sections.
	        template_docx: Path of the .docx template to open.
	        output_docx: Path the filled document is saved to.

	    The document is mutated in the fixed order of the numbered steps below.
	    Each fill step runs only when its JSON value is non-empty; the auditor
	    declaration header fix in step 7 always runs.
	    """
	    with open(input_json, "r", encoding="utf-8") as src:
	        data = json.load(src)

	    doc = Document(str(template_docx))

	    # 1) Simple label/value pairs: first try to write next to a matching label
	    #    cell; if no table accepts the label, fall back to the section heading.
	    for key, value in flatten_simple_sections(data).items():
	        section, sep, label = key.partition("::")
	        if not sep:
	            label = key

	        # The ACN is written digit by digit in step 3.
	        if canon_label(label) == "australian company number":
	            continue

	        if not update_label_value_in_tables(doc, label, value):
	            update_heading_followed_red(doc, section, value)

	    # 2) Values stored under "paragraphs".
	    paras = data.get("paragraphs", {})

	    # 2a) Management headings: update via the heading lookup.
	    for head in ("MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"):
	        name_val = join_value(paras.get(head, ""))
	        if name_val:
	            update_heading_followed_red(doc, head, name_val, max_scan=6)

	    # 2b) Dates: under the auditor declaration heading (second-last page) and
	    #     under the long acknowledgement paragraph (last page).
	    dated_blocks = (
	        (set_date_by_heading_from_end, "NHVAS APPROVED AUDITOR DECLARATION"),
	        (
	            set_date_by_paragraph_from_end,
	            "I hereby acknowledge and agree with the findings detailed in this NHVAS Audit Summary Report. "
	            "I have read and understand the conditions applicable to the Scheme, including the NHVAS Business Rules and Standards.",
	        ),
	    )
	    for set_date, anchor in dated_blocks:
	        date_text = join_value(paras.get(anchor, ""))
	        if date_text:
	            set_date(doc, anchor, date_text, max_scan=40)

	    # 2c) Third-line name for each management heading, located relative to
	    #     the table caption(s) listed next to it.
	    layer3_targets = (
	        ("MAINTENANCE MANAGEMENT", ["Vehicle Registration Numbers of Records Examined"]),
	        ("MASS MANAGEMENT", ["Vehicle Registration Numbers of Records Examined"]),
	        ("FATIGUE MANAGEMENT", ["Driver / Scheduler Records Examined"]),
	    )
	    for heading, captions in layer3_targets:
	        person = join_value(paras.get(heading, ""))
	        if person:
	            set_layer3_name_after_management_heading(doc, heading, captions, person)

	    # 3) ACN digits
	    acn_val = join_value(data.get("Operator Information", {}).get("Australian Company Number", ""))
	    if acn_val:
	        fill_acn_digits(doc, acn_val)

	    # 4) Vehicle tables (the table lookup runs even when the JSON section is empty)
	    maint_rows = data.get("Vehicle Registration Numbers Maintenance", {})
	    mass_rows = data.get("Vehicle Registration Numbers Mass", {})
	    maint_table = find_vehicle_table(doc, "maintenance")
	    if maint_table and maint_rows:
	        fill_vehicle_table(maint_table, "maintenance", maint_rows)
	    mass_table = find_mass_vehicle_numbers_table(doc)
	    if mass_table and mass_rows:
	        fill_mass_vehicle_table_preserve_headers(mass_table, mass_rows)

	    # 5) Driver table
	    driver_rows = data.get("Driver / Scheduler Records Examined", {})
	    driver_table = find_driver_table(doc)
	    if driver_table and driver_rows:
	        fill_driver_table(driver_table, driver_rows)

	    # 6) Audit declaration date, written via its heading
	    conducted_key = "Audit was conducted on"
	    conducted_on = data.get("Audit Declaration dates", {}).get(conducted_key)
	    if conducted_on:
	        update_heading_followed_red(doc, conducted_key, conducted_on)

	    # 7) Operator declaration, then force the auditor declaration header row to labels
	    op_decl = data.get("Operator Declaration", {})
	    if op_decl:
	        print_name = join_value(op_decl.get("Print Name", ""))
	        position_title = join_value(op_decl.get("Position Title", ""))
	        fill_operator_declaration(doc, print_name, position_title)

	    ensure_auditor_decl_headers(doc)

	    # 8) Attendance list
	    attendance_key = "Attendance List (Names and Position Titles)"
	    att_val = data.get(attendance_key, {}).get(attendance_key)
	    if att_val:
	        fill_attendance_block(doc, att_val)

	    # 9) Business summary: prefer the exact key, else the first value in the section
	    biz = data.get("Nature of the Operators Business (Summary)", {})
	    if biz:
	        summary = biz.get("Nature of the Operators Business (Summary):") or next(iter(biz.values()), "")
	        if summary:
	            update_business_summary_once(doc, summary)

	    # 10) Summary tables: overwrite the details cells from JSON
	    for title in (
	        "Maintenance Management Summary",
	        "Mass Management Summary",
	        "Fatigue Management Summary",
	    ):
	        details = data.get(title, {})
	        if details:
	            overwrite_summary_details_cells(doc, title, details)

	    doc.save(str(output_docx))

	# ----------------------------- CLI -----------------------------
	if __name__ == "__main__":
	    # Command-line entry point. Takes exactly three paths: one .json and two
	    # .docx files. Arguments are classified by extension, so they may be
	    # given in any order. All diagnostics go to stderr and exit with status 1.
	    if len(sys.argv) != 4:
	        print("Usage: python updated_word.py <json> <template.docx> <output.docx>", file=sys.stderr)
	        sys.exit(1)

	    files = [Path(arg) for arg in sys.argv[1:4]]

	    json_path = next((p for p in files if p.suffix.lower() == ".json"), None)
	    docx_paths = [p for p in files if p.suffix.lower() == ".docx"]

	    if json_path is None or len(docx_paths) < 2:
	        print("Error: provide one .json and two .docx (template + output).", file=sys.stderr)
	        sys.exit(1)

	    # Fail with a clear message instead of a traceback from inside run().
	    if not json_path.is_file():
	        print(f"Error: JSON file not found: {json_path}", file=sys.stderr)
	        sys.exit(1)

	    # Template = the first .docx that already exists; Output = the other .docx
	    existing = [p for p in docx_paths if p.exists()]
	    if not existing:
	        print("Error: neither .docx exists; one of them must be the template.", file=sys.stderr)
	        sys.exit(1)
	    template_docx = existing[0]
	    output_docx = docx_paths[1] if docx_paths[0] == template_docx else docx_paths[0]

	    run(json_path, template_docx, output_docx)