Shami96 committed on
Commit
d77de54
·
verified ·
1 Parent(s): 8bbc7e5

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +270 -517
updated_word.py CHANGED
@@ -1,28 +1,24 @@
1
  #!/usr/bin/env python3
2
  """
3
- Updated pipeline.py
4
- Merged improvements:
5
- - removed duplicate functions
6
- - table processed-marker to avoid multiple handlers clobbering the same table
7
- - stricter detection of print-accreditation/operator-declaration tables
8
- - safer force replacement (avoid short->long mapping)
9
- - prefer exact qualified keys for Print Name / Position Title lookups
10
- - preserved all other logic and prints/logging
11
- - ADDED: header normalization, context-aware vehicle JSON selection,
12
- management summary scoping, unmatched-headers logging
13
  """
14
 
15
  import json
16
  from docx import Document
17
  from docx.shared import RGBColor
18
  import re
19
- from typing import Any
20
- import os
21
 
22
  # ============================================================================
23
- # Configuration / Heading patterns for document structure detection
24
  # ============================================================================
25
-
26
  HEADING_PATTERNS = {
27
  "main": [
28
  r"NHVAS\s+Audit\s+Summary\s+Report",
@@ -47,19 +43,14 @@ HEADING_PATTERNS = {
47
  }
48
 
49
  # ============================================================================
50
- # State for unmatched headers (for iterative improvement)
51
  # ============================================================================
52
_unmatched_headers = {}

def record_unmatched_header(header: str):
    """Tally a header string that could not be mapped to a JSON key.

    Falsy (empty/None) headers are ignored.  Counts accumulate in the
    module-level ``_unmatched_headers`` dict so unmapped headers can be
    reported later for iterative improvement of the matching rules.
    """
    if not header:
        return
    seen_so_far = _unmatched_headers.get(header, 0)
    _unmatched_headers[header] = seen_so_far + 1
58
 
59
- # ============================================================================
60
- # UTILITY FUNCTIONS
61
- # ============================================================================
62
-
63
def load_json(filepath):
    """Read *filepath* as UTF-8 text and return the parsed JSON payload."""
    with open(filepath, encoding='utf-8') as fh:
        return json.load(fh)
@@ -89,10 +80,10 @@ def get_value_as_string(value, field_name=""):
89
  elif len(value) == 1:
90
  return str(value[0])
91
  else:
 
92
  if "australian company number" in field_name.lower() or "company number" in field_name.lower():
93
  return value
94
- else:
95
- return " ".join(str(v) for v in value)
96
  else:
97
  return str(value)
98
 
@@ -116,112 +107,124 @@ def has_red_text_in_paragraph(paragraph):
116
  return True
117
  return False
118
 
119
- # New helper: normalize header text (removes parentheticals, punctuation, etc.)
120
def normalize_header_text(s: str) -> str:
    """Canonicalize a table-header string for matching against JSON keys.

    Pipeline: strip parenthetical qualifiers, treat slashes as separators,
    drop punctuation (keeping '#' and '%'), collapse whitespace and
    lowercase, then apply domain-specific canonical spellings
    (registration-number and sub-contractor variants) and remove
    boilerplate phrases that carry no matching signal.

    Returns "" for empty/None input.
    """
    if not s:
        return ""
    # Drop parenthetical qualifiers, e.g. "(Date Range)".
    s = re.sub(r'\([^)]*\)', ' ', s)
    # Treat slashes as word separators.
    s = s.replace("/", " ")
    # Remove punctuation except '#' and '%' (kept because some headers use them).
    s = re.sub(r'[^\w\s\#\%]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip().lower()
    # Canonical spellings for common label variants (s is lowercase here).
    s = s.replace('registrationno', 'registration number')
    s = s.replace('registrationnumber', 'registration number')
    s = s.replace('sub contracted', 'sub contractor')
    # NOTE: the original also replaced the literal 'sub-contractor', but that
    # string can never occur at this point — the hyphen was already turned
    # into a space by the punctuation pass above — so the dead replacement
    # has been removed (hyphenated input still normalizes to 'sub contractor').
    # Drop boilerplate phrases that carry no matching signal.
    s = s.replace('date range', '')
    s = s.replace('applicable for entry audit', '')
    return s.strip()
139
 
140
  # ============================================================================
141
- # JSON MATCHING FUNCTIONS
 
 
 
142
  # ============================================================================
143
-
144
  def find_matching_json_value(field_name, flat_json):
145
- """Find matching value in JSON with multiple strategies"""
 
 
 
 
 
 
 
 
146
  field_name = (field_name or "").strip()
147
  if not field_name:
148
  return None
149
 
150
- # Try exact match first
151
  if field_name in flat_json:
152
  print(f" βœ… Direct match found for key '{field_name}'")
153
- return flat_json[field_name]
154
 
155
- # Case-insensitive exact match
156
  for key, value in flat_json.items():
157
  if key.lower() == field_name.lower():
158
- print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
159
- return value
160
 
161
- # Better Print Name detection for operator vs auditor
162
  if field_name.lower().strip() == "print name":
163
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
164
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
165
-
166
  if operator_keys:
167
  print(f" βœ… Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
168
- return flat_json[operator_keys[0]]
169
  elif auditor_keys:
170
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
171
- return flat_json[auditor_keys[0]]
172
 
173
- # Suffix matching for nested keys
174
  for key, value in flat_json.items():
175
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
176
- print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
177
- return value
178
 
179
- # Clean & exact match attempt
180
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
181
  clean_field = re.sub(r'\s+', ' ', clean_field)
182
  for key, value in flat_json.items():
183
  clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
184
  clean_key = re.sub(r'\s+', ' ', clean_key)
185
  if clean_field == clean_key:
186
- print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
187
- return value
188
 
189
- # Enhanced fuzzy matching with word-token scoring
190
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
191
  if not field_words:
192
  return None
193
 
194
- best_match = None
195
- best_score = 0
196
  best_key = None
 
 
197
 
198
  for key, value in flat_json.items():
199
  key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
200
  if not key_words:
201
  continue
202
 
203
- common_words = field_words.intersection(key_words)
204
- if common_words:
205
- similarity = len(common_words) / len(field_words.union(key_words))
206
- coverage = len(common_words) / len(field_words)
 
 
 
 
 
 
 
 
 
 
207
  final_score = (similarity * 0.6) + (coverage * 0.4)
208
 
209
- if final_score > best_score:
210
- best_score = final_score
211
- best_match = value
212
- best_key = key
213
 
214
- if best_match and best_score >= 0.25:
 
215
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
216
- return best_match
217
 
218
  print(f" ❌ No match found for '{field_name}'")
219
  return None
220
 
221
  # ============================================================================
222
- # RED TEXT PROCESSING FUNCTIONS
223
  # ============================================================================
224
-
225
  def extract_red_text_segments(cell):
226
  red_segments = []
227
  for para_idx, paragraph in enumerate(cell.paragraphs):
@@ -234,52 +237,37 @@ def extract_red_text_segments(cell):
234
  segment_runs.append((para_idx, run_idx, run))
235
  else:
236
  if segment_runs:
237
- red_segments.append({
238
- 'text': current_segment,
239
- 'runs': segment_runs.copy(),
240
- 'paragraph_idx': para_idx
241
- })
242
  current_segment = ""
243
  segment_runs = []
244
  if segment_runs:
245
- red_segments.append({
246
- 'text': current_segment,
247
- 'runs': segment_runs.copy(),
248
- 'paragraph_idx': para_idx
249
- })
250
  return red_segments
251
 
252
  def replace_all_red_segments(red_segments, replacement_text):
253
  if not red_segments:
254
  return 0
255
-
256
  if '\n' in replacement_text:
257
  replacement_lines = replacement_text.split('\n')
258
  else:
259
  replacement_lines = [replacement_text]
260
-
261
  replacements_made = 0
262
-
263
- if red_segments and replacement_lines:
264
- first_segment = red_segments[0]
265
- if first_segment['runs']:
266
- first_run = first_segment['runs'][0][2]
267
- first_run.text = replacement_lines[0]
268
- first_run.font.color.rgb = RGBColor(0, 0, 0)
269
- replacements_made = 1
270
- for _, _, run in first_segment['runs'][1:]:
271
- run.text = ''
272
-
273
  for segment in red_segments[1:]:
274
  for _, _, run in segment['runs']:
275
  run.text = ''
276
-
277
  if len(replacement_lines) > 1 and red_segments:
278
  try:
279
  first_run = red_segments[0]['runs'][0][2]
280
  paragraph = first_run.element.getparent()
281
  from docx.oxml import OxmlElement
282
- parent = first_run.element.getparent()
283
  for line in replacement_lines[1:]:
284
  if line.strip():
285
  br = OxmlElement('w:br')
@@ -291,7 +279,6 @@ def replace_all_red_segments(red_segments, replacement_text):
291
  first_run = red_segments[0]['runs'][0][2]
292
  first_run.text = ' '.join(replacement_lines)
293
  first_run.font.color.rgb = RGBColor(0, 0, 0)
294
-
295
  return replacements_made
296
 
297
  def replace_single_segment(segment, replacement_text):
@@ -311,7 +298,7 @@ def replace_red_text_in_cell(cell, replacement_text):
311
  return replace_all_red_segments(red_segments, replacement_text)
312
 
313
  # ============================================================================
314
- # SPECIALIZED TABLE HANDLERS
315
  # ============================================================================
316
 
317
  def handle_australian_company_number(row, company_numbers):
@@ -327,57 +314,27 @@ def handle_australian_company_number(row, company_numbers):
327
  return replacements_made
328
 
329
  def handle_vehicle_registration_table(table, flat_json):
330
- """Handle vehicle registration table data replacement (improved header normalization and context-aware selection)"""
 
 
 
331
  replacements_made = 0
332
 
333
- # build a table_text context (used to find mass/maintenance/fatigue)
334
- table_text = ""
335
- for r in table.rows[:3]:
336
- for c in r.cells:
337
- table_text += get_clean_text(c).lower() + " "
338
-
339
- # 1) Detect the most relevant vehicle-related JSON section using context tokens
340
  vehicle_section = None
341
- context_tokens = []
342
- if "mass" in table_text:
343
- context_tokens.append("mass")
344
- if "maintenance" in table_text:
345
- context_tokens.append("maintenance")
346
- if "fatigue" in table_text or "driver" in table_text or "scheduler" in table_text:
347
- context_tokens.append("fatigue")
348
-
349
- # candidate keys that mention 'registration' or 'vehicle'
350
- candidates = []
351
- for key, value in flat_json.items():
352
- k = key.lower()
353
- if "registration" in k or "vehicle registration" in k or "vehicle" in k:
354
- candidates.append((key, value))
355
-
356
- # prefer candidates whose key contains one of the context tokens
357
- if candidates and context_tokens:
358
- for token in context_tokens:
359
- for k, v in candidates:
360
- if token in k.lower():
361
- vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
362
- print(f" βœ… Found vehicle data by context token '{token}' in key '{k}'")
363
- break
364
- if vehicle_section:
365
- break
366
-
367
- # fallback: choose candidate containing 'registration' explicitly
368
- if vehicle_section is None and candidates:
369
- for k, v in candidates:
370
- if "registration" in k.lower():
371
- vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
372
- print(f" βœ… Fallback vehicle data chosen from '{k}'")
373
- break
374
 
375
  # fallback: collect flattened keys that look like vehicle columns
376
  if vehicle_section is None:
377
  potential_columns = {}
378
  for key, value in flat_json.items():
379
  lk = key.lower()
380
- if any(col_name in lk for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "suspension", "daily checks", "fault recording", "fault repair", "roadworthiness"]):
381
  if "." in key:
382
  column_name = key.split(".")[-1]
383
  else:
@@ -391,42 +348,36 @@ def handle_vehicle_registration_table(table, flat_json):
391
  print(f" ❌ Vehicle registration data not found in JSON")
392
  return 0
393
 
394
- # ensure vehicle_section is a dict mapping column_name -> list/value
395
  if isinstance(vehicle_section, list):
396
- # if a list of dicts, attempt to flatten into columns
397
  if vehicle_section and isinstance(vehicle_section[0], dict):
398
  flattened = {}
399
  for entry in vehicle_section:
400
  for k, v in entry.items():
401
  flattened.setdefault(k, []).append(v)
402
  vehicle_section = flattened
 
 
 
403
 
404
  if not isinstance(vehicle_section, dict):
405
- # convert single scalar to dict
406
  try:
407
  vehicle_section = dict(vehicle_section)
408
  except Exception:
409
- vehicle_section = {str(k): v for k, v in (vehicle_section.items() if isinstance(vehicle_section, dict) else [])}
410
 
411
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
412
 
413
- # Find header row index by searching for a row that contains 'registration' + 'number'
414
  header_row_idx = -1
415
  header_row = None
416
  for row_idx, row in enumerate(table.rows):
417
  row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
418
- if "registration" in row_text and "number" in row_text:
419
  header_row_idx = row_idx
420
  header_row = row
421
  break
422
- if header_row_idx == -1:
423
- # try alternative detection: a row with 'registration' or 'reg no'
424
- for row_idx, row in enumerate(table.rows):
425
- row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
426
- if "registration" in row_text or "reg no" in row_text or "regno" in row_text:
427
- header_row_idx = row_idx
428
- header_row = row
429
- break
430
 
431
  if header_row_idx == -1:
432
  print(f" ❌ Could not find header row in vehicle table")
@@ -434,34 +385,26 @@ def handle_vehicle_registration_table(table, flat_json):
434
 
435
  print(f" βœ… Found header row at index {header_row_idx}")
436
 
437
- # Enhanced column mapping: normalize both header and candidate keys, token overlap scoring
438
- column_mapping = {}
439
- # build normalized master map from vehicle_section keys
440
  master_labels = {}
441
  for orig_key in vehicle_section.keys():
442
  norm = normalize_header_text(str(orig_key))
443
  if norm:
444
- master_labels.setdefault(norm, orig_key)
445
-
446
- # add fallback synonyms for common labels (preserve existing)
447
- fallback_synonyms = [
448
- "no", "registration number", "reg no", "registration", "sub contractor", "sub-contractor",
449
- "sub contracted", "weight verification records", "rfs suspension certification", "suspension system maintenance",
450
- "trip records", "fault recording reporting", "daily checks", "roadworthiness certificates",
451
- "maintenance records", "fault repair"
452
- ]
453
- for syn in fallback_synonyms:
454
- norm = normalize_header_text(syn)
455
- if norm and norm not in master_labels:
456
- master_labels.setdefault(norm, syn)
457
 
458
- # map header cells
 
459
  for col_idx, cell in enumerate(header_row.cells):
460
  header_text = get_clean_text(cell).strip()
461
  if not header_text:
462
  continue
463
- # skip 'No.' column mapping attempts in many templates
464
- if header_text.strip().lower() in {"no", "no.", "#"}:
465
  continue
466
 
467
  norm_header = normalize_header_text(header_text)
@@ -473,7 +416,7 @@ def handle_vehicle_registration_table(table, flat_json):
473
  best_match = master_labels[norm_header]
474
  best_score = 1.0
475
  else:
476
- # token overlap scoring
477
  header_tokens = set(t for t in norm_header.split() if len(t) > 2)
478
  for norm_key, orig_label in master_labels.items():
479
  key_tokens = set(t for t in norm_key.split() if len(t) > 2)
@@ -483,7 +426,7 @@ def handle_vehicle_registration_table(table, flat_json):
483
  if common:
484
  score = len(common) / max(1, len(header_tokens.union(key_tokens)))
485
  else:
486
- # substring fallback
487
  if norm_header in norm_key or norm_key in norm_header:
488
  score = min(len(norm_header), len(norm_key)) / max(len(norm_header), len(norm_key))
489
  else:
@@ -492,18 +435,26 @@ def handle_vehicle_registration_table(table, flat_json):
492
  best_score = score
493
  best_match = orig_label
494
 
 
 
 
 
 
 
 
 
495
  if best_match and best_score >= 0.30:
496
  column_mapping[col_idx] = best_match
497
- print(f" πŸ“Œ Column {col_idx}: '{header_text}' -> '{best_match}' (norm: '{norm_header}', score: {best_score:.2f})")
498
  else:
499
- print(f" ⚠️ No mapping found for '{header_text}' (norm: '{norm_header}')")
500
  record_unmatched_header(header_text)
501
 
502
  if not column_mapping:
503
  print(f" ❌ No column mappings found")
504
  return 0
505
 
506
- # Determine number of rows to populate
507
  max_data_rows = 0
508
  for json_key, data in vehicle_section.items():
509
  if isinstance(data, list):
@@ -511,51 +462,39 @@ def handle_vehicle_registration_table(table, flat_json):
511
 
512
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
513
 
514
- # Fill or add rows as needed
515
  for data_row_index in range(max_data_rows):
516
  table_row_idx = header_row_idx + 1 + data_row_index
517
-
518
  if table_row_idx >= len(table.rows):
519
- print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
520
- print(f" βž• Adding new row for vehicle {data_row_index + 1}")
521
- new_row = table.add_row()
522
- print(f" βœ… Successfully added row {len(table.rows)} to the table")
523
 
524
  row = table.rows[table_row_idx]
525
  print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
526
-
527
  for col_idx, json_key in column_mapping.items():
528
  if col_idx < len(row.cells):
529
  cell = row.cells[col_idx]
530
-
531
  column_data = vehicle_section.get(json_key, [])
532
  if isinstance(column_data, list) and data_row_index < len(column_data):
533
  replacement_value = str(column_data[data_row_index])
534
-
535
  cell_text = get_clean_text(cell)
536
  if has_red_text(cell) or not cell_text.strip():
537
  if not cell_text.strip():
538
  cell.text = replacement_value
539
  replacements_made += 1
540
- print(f" -> Added '{replacement_value}' to empty cell (column '{json_key}')")
541
  else:
542
  cell_replacements = replace_red_text_in_cell(cell, replacement_value)
543
  replacements_made += cell_replacements
544
  if cell_replacements > 0:
545
- print(f" -> Replaced red text with '{replacement_value}' (column '{json_key}')")
546
 
547
  return replacements_made
548
 
549
  def handle_attendance_list_table_enhanced(table, flat_json):
550
- """Enhanced Attendance List processing with better detection"""
551
  replacements_made = 0
552
-
553
- attendance_patterns = [
554
- "attendance list",
555
- "names and position titles",
556
- "attendees"
557
- ]
558
-
559
  found_attendance_row = None
560
  for row_idx, row in enumerate(table.rows[:3]):
561
  for cell_idx, cell in enumerate(row.cells):
@@ -566,7 +505,6 @@ def handle_attendance_list_table_enhanced(table, flat_json):
566
  break
567
  if found_attendance_row is not None:
568
  break
569
-
570
  if found_attendance_row is None:
571
  return 0
572
 
@@ -577,42 +515,38 @@ def handle_attendance_list_table_enhanced(table, flat_json):
577
  "attendance list",
578
  "attendees"
579
  ]
580
-
581
  print(f" πŸ” Searching for attendance data in JSON...")
582
-
583
  for search_key in attendance_search_keys:
584
- attendance_value = find_matching_json_value(search_key, flat_json)
585
- if attendance_value is not None:
586
- print(f" βœ… Found attendance data with key: '{search_key}'")
 
587
  print(f" πŸ“Š Raw value: {attendance_value}")
588
  break
589
-
590
  if attendance_value is None:
591
  print(f" ❌ No attendance data found in JSON")
592
  return 0
593
 
 
594
  target_cell = None
595
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
596
-
597
  for row_idx, row in enumerate(table.rows):
598
  for cell_idx, cell in enumerate(row.cells):
599
  if has_red_text(cell):
600
- print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
601
-
602
  red_text = ""
603
  for paragraph in cell.paragraphs:
604
  for run in paragraph.runs:
605
  if is_red(run):
606
  red_text += run.text
607
-
608
- print(f" πŸ“‹ Red text content: '{red_text[:80]}...'")
609
-
610
- red_text_lower = red_text.lower()
611
- if any(indicator in red_text_lower for indicator in ['manager', '–', '-']):
612
- target_cell = cell
613
- print(f" βœ… This looks like attendance data - using this cell")
614
- break
615
- if target_cell is not None:
616
  break
617
 
618
  if target_cell is None:
@@ -621,60 +555,44 @@ def handle_attendance_list_table_enhanced(table, flat_json):
621
 
622
  if has_red_text(target_cell):
623
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
624
-
625
  if isinstance(attendance_value, list):
626
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
627
  else:
628
  attendance_list = [str(attendance_value).strip()]
629
-
630
  print(f" πŸ“ Attendance items to add:")
631
  for i, item in enumerate(attendance_list):
632
  print(f" {i+1}. {item}")
633
-
634
  replacement_text = "\n".join(attendance_list)
635
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
636
  replacements_made += cell_replacements
637
-
638
  print(f" βœ… Added {len(attendance_list)} attendance items")
639
  print(f" πŸ“Š Replacements made: {cell_replacements}")
640
-
641
  return replacements_made
642
 
643
  def fix_management_summary_details_column(table, flat_json):
644
- """Fix the DETAILS column in Management Summary table (multi-management aware)."""
645
  replacements_made = 0
646
-
647
  print(f" 🎯 FIX: Management Summary DETAILS column processing")
648
-
649
- # Build table text to detect management type(s)
650
  table_text = ""
651
  for row in table.rows[:3]:
652
  for cell in row.cells:
653
  table_text += get_clean_text(cell).lower() + " "
654
-
655
- # Identify which management types this table likely represents
656
  mgmt_types = []
657
  if "mass management" in table_text or "mass" in table_text:
658
  mgmt_types.append("Mass Management Summary")
659
  if "maintenance management" in table_text or "maintenance" in table_text:
660
  mgmt_types.append("Maintenance Management Summary")
661
- if "fatigue management" in table_text or "fatigue" in table_text or "driver" in table_text:
662
  mgmt_types.append("Fatigue Management Summary")
663
-
664
  if not mgmt_types:
665
- # fallback: try fuzzy detection through headings or presence of "Std 5." etc.
666
  if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
667
  mgmt_types.append("Mass Management Summary")
668
-
669
  if not mgmt_types:
670
  return 0
671
-
672
  for mgmt_type in mgmt_types:
673
  print(f" βœ… Confirmed {mgmt_type} table processing")
674
- # find data dict in flat_json for mgmt_type
675
  mgmt_data = flat_json.get(mgmt_type)
676
  if not isinstance(mgmt_data, dict):
677
- # attempt suffix based keys in flat_json
678
  for key in flat_json.keys():
679
  if mgmt_type.split()[0].lower() in key.lower() and "summary" in key.lower():
680
  mgmt_data = flat_json.get(key)
@@ -682,26 +600,18 @@ def fix_management_summary_details_column(table, flat_json):
682
  if not isinstance(mgmt_data, dict):
683
  print(f" ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
684
  continue
685
-
686
- # Process rows looking for Std 5. and Std 6.
687
  for row_idx, row in enumerate(table.rows):
688
  if len(row.cells) >= 2:
689
  standard_cell = row.cells[0]
690
  details_cell = row.cells[1]
691
  standard_text = get_clean_text(standard_cell).strip().lower()
692
-
693
- # Std 5.
694
  if "std 5" in standard_text or "verification" in standard_text:
695
  if has_red_text(details_cell):
696
- print(f" πŸ” Found Std 5/Verification with red text")
697
- # try to find the appropriate key in mgmt_data
698
  std_val = None
699
- # exact key variants
700
  for candidate in ("Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"):
701
  std_val = mgmt_data.get(candidate)
702
  if std_val is not None:
703
  break
704
- # fuzzy fallback
705
  if std_val is None:
706
  for k, v in mgmt_data.items():
707
  if 'std 5' in k.lower() or 'verification' in k.lower():
@@ -713,11 +623,8 @@ def fix_management_summary_details_column(table, flat_json):
713
  replacements_made += cell_replacements
714
  if cell_replacements:
715
  print(f" βœ… Replaced Std 5. Verification details for {mgmt_type}")
716
-
717
- # Std 6.
718
  if "std 6" in standard_text or "internal review" in standard_text:
719
  if has_red_text(details_cell):
720
- print(f" πŸ” Found Std 6/Internal Review with red text")
721
  std_val = None
722
  for candidate in ("Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"):
723
  std_val = mgmt_data.get(candidate)
@@ -734,23 +641,20 @@ def fix_management_summary_details_column(table, flat_json):
734
  replacements_made += cell_replacements
735
  if cell_replacements:
736
  print(f" βœ… Replaced Std 6. Internal Review details for {mgmt_type}")
737
-
738
  return replacements_made
739
 
740
- # Canonical operator declaration fixer (keeps original robust logic)
 
 
741
  def fix_operator_declaration_empty_values(table, flat_json):
742
  replacements_made = 0
743
-
744
  print(f" 🎯 FIX: Operator Declaration empty values processing")
745
-
746
  table_context = ""
747
  for row in table.rows:
748
  for cell in row.cells:
749
  table_context += get_clean_text(cell).lower() + " "
750
-
751
  if not ("print name" in table_context and "position title" in table_context):
752
  return 0
753
-
754
  print(f" βœ… Confirmed Operator Declaration table")
755
 
756
  def parse_name_and_position(value):
@@ -761,16 +665,15 @@ def fix_operator_declaration_empty_values(table, flat_json):
761
  return None, None
762
  if len(value) == 1:
763
  return str(value[0]).strip(), None
 
764
  first = str(value[0]).strip()
765
  second = str(value[1]).strip()
766
  if first and second:
767
  return first, second
768
  value = " ".join(str(v).strip() for v in value if str(v).strip())
769
-
770
  s = str(value).strip()
771
  if not s:
772
  return None, None
773
-
774
  parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
775
  if len(parts) >= 2:
776
  left = parts[0].strip()
@@ -782,7 +685,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
782
  if any(ind in left.lower() for ind in role_indicators) and not any(ind in right.lower() for ind in role_indicators):
783
  return right, left
784
  return left, right
785
-
786
  tokens = s.split()
787
  if len(tokens) >= 2:
788
  last = tokens[-1]
@@ -790,35 +692,35 @@ def fix_operator_declaration_empty_values(table, flat_json):
790
  'coordinator', 'driver', 'operator', 'representative', 'chief']
791
  if any(ind == last.lower() for ind in role_indicators):
792
  return " ".join(tokens[:-1]), last
793
-
794
  return s, None
795
 
796
  for row_idx, row in enumerate(table.rows):
797
  if len(row.cells) >= 2:
798
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
799
  cell2_text = get_clean_text(row.cells[1]).strip().lower()
800
-
801
  if "print name" in cell1_text and "position" in cell2_text:
802
  print(f" πŸ“Œ Found header row at {row_idx + 1}")
803
-
804
  if row_idx + 1 < len(table.rows):
805
  data_row = table.rows[row_idx + 1]
806
  if len(data_row.cells) >= 2:
807
  name_cell = data_row.cells[0]
808
  position_cell = data_row.cells[1]
809
-
810
  name_text = get_clean_text(name_cell).strip()
811
  position_text = get_clean_text(position_cell).strip()
812
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
813
 
814
- name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
815
- if name_value is None:
816
- name_value = find_matching_json_value("Print Name", flat_json)
 
 
 
817
 
818
- position_value = find_matching_json_value("Operator Declaration.Position Title", flat_json)
819
- if position_value is None:
820
- position_value = find_matching_json_value("Position Title", flat_json)
821
 
 
822
  parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
823
  parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
824
 
@@ -830,13 +732,43 @@ def fix_operator_declaration_empty_values(table, flat_json):
830
  elif name_value is not None:
831
  final_name = get_value_as_string(name_value)
832
 
833
- if parsed_pos_from_posval:
834
- final_pos = parsed_pos_from_posval
835
- elif position_value is not None:
836
- final_pos = get_value_as_string(position_value)
837
- elif parsed_pos_from_nameval:
838
- final_pos = parsed_pos_from_nameval
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
839
 
 
840
  if isinstance(final_name, list):
841
  final_name = " ".join(str(x) for x in final_name).strip()
842
  if isinstance(final_pos, list):
@@ -853,8 +785,9 @@ def fix_operator_declaration_empty_values(table, flat_json):
853
  low = name_str.lower()
854
  if any(bp in low for bp in bad_phrases):
855
  return False
856
- return len(name_str) > 1
857
 
 
858
  if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
859
  if has_red_text(name_cell):
860
  replace_red_text_in_cell(name_cell, final_name)
@@ -863,7 +796,8 @@ def fix_operator_declaration_empty_values(table, flat_json):
863
  replacements_made += 1
864
  print(f" βœ… Updated Print Name -> '{final_name}'")
865
 
866
- if (not position_text or has_red_text(position_cell)) and final_pos:
 
867
  if has_red_text(position_cell):
868
  replace_red_text_in_cell(position_cell, final_pos)
869
  else:
@@ -890,9 +824,9 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
890
  for i, segment in enumerate(red_segments):
891
  segment_text = segment['text'].strip()
892
  if segment_text:
893
- json_value = find_matching_json_value(segment_text, flat_json)
894
- if json_value is not None:
895
- replacement_text = get_value_as_string(json_value, segment_text)
896
  if replace_single_segment(segment, replacement_text):
897
  replacements_made += 1
898
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
@@ -910,9 +844,9 @@ def handle_nature_business_multiline_fix(cell, flat_json):
910
  return 0
911
  nature_indicators = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
912
  if any(indicator in red_text.lower() for indicator in nature_indicators):
913
- nature_value = find_matching_json_value("Nature of Business", flat_json)
914
- if nature_value is not None:
915
- replacement_text = get_value_as_string(nature_value, "Nature of Business")
916
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
917
  replacements_made += cell_replacements
918
  print(f" βœ… Fixed Nature of Business multiline content")
@@ -930,229 +864,85 @@ def handle_management_summary_fix(cell, flat_json):
930
  return 0
931
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
932
  for mgmt_type in management_types:
933
- if mgmt_type in flat_json:
934
  mgmt_data = flat_json[mgmt_type]
935
- if isinstance(mgmt_data, dict):
936
- for std_key, std_value in mgmt_data.items():
937
- if isinstance(std_value, list) and std_value:
938
- if len(red_text) > 10:
939
- for item in std_value:
940
- if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
941
- replacement_text = "\n".join(str(i) for i in std_value)
942
- cell_replacements = replace_red_text_in_cell(cell, replacement_text)
943
- replacements_made += cell_replacements
944
- print(f" βœ… Fixed {mgmt_type} - {std_key}")
945
- return replacements_made
946
- return replacements_made
947
-
948
- # ============================================================================
949
- # SMALL OPERATOR/AUDITOR TABLE HANDLER (skip if already processed)
950
- # ============================================================================
951
-
952
- def handle_operator_declaration_fix(table, flat_json):
953
- replacements_made = 0
954
-
955
- if getattr(table, "_processed_operator_declaration", False):
956
- print(f" ⏭️ Skipping - Operator Declaration table already processed")
957
- return 0
958
-
959
- if len(table.rows) > 4:
960
- return 0
961
-
962
- replaced = fix_operator_declaration_empty_values(table, flat_json)
963
- replacements_made += replaced
964
- if replaced:
965
- return replacements_made
966
-
967
- def is_date_like(s: str) -> bool:
968
- if not s:
969
- return False
970
- s = s.strip()
971
- month_names = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)"
972
- if re.search(r"\bDate\b", s, re.IGNORECASE):
973
- return True
974
- if re.search(r"\b\d{1,2}(?:st|nd|rd|th)?\b\s+" + month_names, s, re.IGNORECASE):
975
- return True
976
- if re.search(month_names + r".*\b\d{4}\b", s, re.IGNORECASE):
977
- return True
978
- if re.search(r"\b\d{1,2}[\/\.\-]\d{1,2}[\/\.\-]\d{2,4}\b", s):
979
- return True
980
- if re.search(r"\b\d{4}[\/\.\-]\d{1,2}[\/\.\-]\d{1,2}\b", s):
981
- return True
982
- if re.fullmatch(r"\d{4}", s):
983
- return True
984
- return False
985
-
986
- def looks_like_person_name(s: str) -> bool:
987
- if not s:
988
- return False
989
- low = s.lower().strip()
990
- bad_terms = ["pty ltd", "p/l", "plc", "company", "farming", "farm", "trust", "ltd"]
991
- if any(bt in low for bt in bad_terms):
992
- return False
993
- if len(low) < 3:
994
- return False
995
- return bool(re.search(r"[a-zA-Z]", low))
996
-
997
- def looks_like_position(s: str) -> bool:
998
- if not s:
999
- return False
1000
- low = s.lower()
1001
- roles = ["manager", "auditor", "owner", "director", "supervisor", "coordinator", "driver", "operator", "representative", "chief"]
1002
- return any(r in low for r in roles)
1003
-
1004
- print(f" 🎯 Processing other declaration table (fallback small-table behavior)")
1005
-
1006
- for row_idx, row in enumerate(table.rows):
1007
- for cell_idx, cell in enumerate(row.cells):
1008
- if not has_red_text(cell):
1009
- continue
1010
-
1011
- declaration_fields = [
1012
- "NHVAS Approved Auditor Declaration.Print Name",
1013
- "Auditor name",
1014
- "Signature",
1015
- "Date"
1016
- ]
1017
-
1018
- replaced_this_cell = False
1019
- for field in declaration_fields:
1020
- field_value = find_matching_json_value(field, flat_json)
1021
- if field_value is None:
1022
- continue
1023
-
1024
- replacement_text = get_value_as_string(field_value, field).strip()
1025
- if not replacement_text:
1026
- continue
1027
-
1028
- if is_date_like(replacement_text):
1029
- red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
1030
- if "date" not in red_text.lower():
1031
- print(f" ⚠️ Skipping date-like replacement for field '{field}' -> '{replacement_text[:30]}...'")
1032
- continue
1033
-
1034
- if (looks_like_person_name(replacement_text) or looks_like_position(replacement_text) or "signature" in field.lower() or "date" in field.lower()):
1035
- cell_replacements = replace_red_text_in_cell(cell, replacement_text)
1036
- if cell_replacements > 0:
1037
- replacements_made += cell_replacements
1038
- replaced_this_cell = True
1039
- print(f" βœ… Fixed declaration field: {field} -> '{replacement_text}'")
1040
- break
1041
- else:
1042
- print(f" ⚠️ Replacement for field '{field}' does not look like name/role, skipping: '{replacement_text[:30]}...'")
1043
- continue
1044
-
1045
- if not replaced_this_cell:
1046
- red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip().lower()
1047
- if "signature" in red_text:
1048
- cell_replacements = replace_red_text_in_cell(cell, "[Signature]")
1049
- if cell_replacements > 0:
1050
- replacements_made += cell_replacements
1051
- print(f" βœ… Inserted placeholder [Signature]")
1052
- elif "date" in red_text:
1053
- date_value = find_matching_json_value("Date", flat_json) or find_matching_json_value("Date of Audit", flat_json) or find_matching_json_value("Audit was conducted on", flat_json)
1054
- if date_value is not None:
1055
- date_text = get_value_as_string(date_value)
1056
- if not is_date_like(date_text):
1057
- print(f" ⚠️ Found date-value but not date-like, skipping: '{date_text}'")
1058
- else:
1059
- cell_replacements = replace_red_text_in_cell(cell, date_text)
1060
- if cell_replacements > 0:
1061
  replacements_made += cell_replacements
1062
- print(f" βœ… Inserted date value: '{date_text}'")
1063
-
1064
- if replacements_made > 0:
1065
- try:
1066
- setattr(table, "_processed_operator_declaration", True)
1067
- print(" πŸ”– Marked table as processed by operator declaration fallback")
1068
- except Exception:
1069
- pass
1070
-
1071
  return replacements_made
1072
 
1073
  def handle_print_accreditation_section(table, flat_json):
1074
  replacements_made = 0
1075
-
1076
  if getattr(table, "_processed_operator_declaration", False):
1077
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1078
  return 0
1079
-
1080
  table_context = ""
1081
  for row in table.rows:
1082
  for cell in row.cells:
1083
  table_context += get_clean_text(cell).lower() + " "
1084
-
1085
  if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
1086
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1087
  return 0
1088
-
1089
  print(f" πŸ“‹ Processing Print Accreditation section")
1090
-
1091
  for row_idx, row in enumerate(table.rows):
1092
  for cell_idx, cell in enumerate(row.cells):
1093
  if has_red_text(cell):
1094
  accreditation_fields = [
1095
  "(print accreditation name)",
1096
  "Operator name (Legal entity)",
1097
- "Print accreditation name",
1098
- "(print accreditation name)"
1099
  ]
1100
-
1101
  for field in accreditation_fields:
1102
- field_value = find_matching_json_value(field, flat_json)
1103
- if field_value is not None:
1104
- replacement_text = get_value_as_string(field_value, field)
1105
  if replacement_text.strip():
1106
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
1107
  replacements_made += cell_replacements
1108
  if cell_replacements > 0:
1109
- print(f" βœ… Fixed accreditation: {field}")
1110
  break
1111
-
1112
  return replacements_made
1113
 
1114
  def process_single_column_sections(cell, key_text, flat_json):
1115
  replacements_made = 0
1116
-
1117
  if has_red_text(cell):
1118
  red_text = ""
1119
  for paragraph in cell.paragraphs:
1120
  for run in paragraph.runs:
1121
  if is_red(run):
1122
  red_text += run.text
1123
-
1124
  if red_text.strip():
1125
- section_value = find_matching_json_value(red_text.strip(), flat_json)
1126
- if section_value is None:
1127
- section_value = find_matching_json_value(key_text, flat_json)
1128
-
1129
- if section_value is not None:
1130
- section_replacement = get_value_as_string(section_value, red_text.strip())
1131
  cell_replacements = replace_red_text_in_cell(cell, section_replacement)
1132
  replacements_made += cell_replacements
1133
  if cell_replacements > 0:
1134
  print(f" βœ… Fixed single column section: '{key_text}'")
1135
-
1136
  return replacements_made
1137
 
1138
  # ============================================================================
1139
- # MAIN TABLE/PARAGRAPH PROCESSING
1140
  # ============================================================================
1141
-
1142
  def process_tables(document, flat_json):
1143
- """Process all tables in the document with comprehensive fixes"""
1144
  replacements_made = 0
1145
-
1146
  for table_idx, table in enumerate(document.tables):
1147
  print(f"\nπŸ” Processing table {table_idx + 1}:")
1148
-
1149
- # collect brief context
1150
  table_text = ""
1151
  for row in table.rows[:3]:
1152
  for cell in row.cells:
1153
  table_text += get_clean_text(cell).lower() + " "
1154
 
1155
- # detect management summary & details column
1156
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
1157
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
1158
  has_details = "details" in table_text
@@ -1162,12 +952,10 @@ def process_tables(document, flat_json):
1162
  summary_fixes = fix_management_summary_details_column(table, flat_json)
1163
  replacements_made += summary_fixes
1164
 
1165
- # Process remaining red text in management summary
1166
  summary_replacements = 0
1167
  for row_idx, row in enumerate(table.rows):
1168
  for cell_idx, cell in enumerate(row.cells):
1169
  if has_red_text(cell):
1170
- # Try direct matching with new schema names first
1171
  for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
1172
  if mgmt_type.lower().replace(" summary", "") in table_text:
1173
  if mgmt_type in flat_json:
@@ -1192,7 +980,7 @@ def process_tables(document, flat_json):
1192
  replacements_made += summary_replacements
1193
  continue
1194
 
1195
- # Detect Vehicle Registration tables
1196
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension", "registration"]
1197
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
1198
  if indicator_count >= 2:
@@ -1201,50 +989,46 @@ def process_tables(document, flat_json):
1201
  replacements_made += vehicle_replacements
1202
  continue
1203
 
1204
- # Detect Attendance List tables
1205
  if "attendance list" in table_text and "names and position titles" in table_text:
1206
  print(f" πŸ‘₯ Detected Attendance List table")
1207
  attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
1208
  replacements_made += attendance_replacements
1209
  continue
1210
 
1211
- # Detect Print Accreditation / Operator Declaration tables
1212
  print_accreditation_indicators = ["print name", "position title"]
1213
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
1214
-
1215
  if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
1216
  print(f" πŸ“‹ Detected Print Accreditation/Operator Declaration table")
1217
  declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1218
  replacements_made += declaration_fixes
1219
-
1220
  if not getattr(table, "_processed_operator_declaration", False):
1221
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1222
  replacements_made += print_accreditation_replacements
1223
-
1224
  continue
1225
 
1226
- # Process regular table rows (original logic preserved)
1227
  for row_idx, row in enumerate(table.rows):
1228
  if len(row.cells) < 1:
1229
  continue
1230
-
1231
  key_cell = row.cells[0]
1232
  key_text = get_clean_text(key_cell)
1233
-
1234
  if not key_text:
1235
  continue
1236
-
1237
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
1238
-
1239
- json_value = find_matching_json_value(key_text, flat_json)
1240
 
1241
  if json_value is not None:
1242
  replacement_text = get_value_as_string(json_value, key_text)
1243
 
 
1244
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1245
  cell_replacements = handle_australian_company_number(row, json_value)
1246
  replacements_made += cell_replacements
1247
 
 
1248
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1249
  print(f" βœ… Section header detected, checking next row...")
1250
  next_row = table.rows[row_idx + 1]
@@ -1252,17 +1036,21 @@ def process_tables(document, flat_json):
1252
  if has_red_text(cell):
1253
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
1254
  if isinstance(json_value, list):
1255
- replacement_text = "\n".join(str(item) for item in json_value)
1256
- cell_replacements = replace_red_text_in_cell(cell, replacement_text)
 
 
1257
  replacements_made += cell_replacements
1258
  if cell_replacements > 0:
1259
  print(f" -> Replaced section content")
1260
 
 
1261
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1262
  if has_red_text(key_cell):
1263
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1264
  replacements_made += cell_replacements
1265
 
 
1266
  else:
1267
  for cell_idx in range(1, len(row.cells)):
1268
  value_cell = row.cells[cell_idx]
@@ -1272,6 +1060,7 @@ def process_tables(document, flat_json):
1272
  replacements_made += cell_replacements
1273
 
1274
  else:
 
1275
  if len(row.cells) == 1 and has_red_text(key_cell):
1276
  red_text = ""
1277
  for paragraph in key_cell.paragraphs:
@@ -1279,56 +1068,55 @@ def process_tables(document, flat_json):
1279
  if is_red(run):
1280
  red_text += run.text
1281
  if red_text.strip():
1282
- section_value = find_matching_json_value(red_text.strip(), flat_json)
1283
- if section_value is not None:
1284
- section_replacement = get_value_as_string(section_value, red_text.strip())
1285
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1286
  replacements_made += cell_replacements
1287
 
 
1288
  for cell_idx in range(len(row.cells)):
1289
  cell = row.cells[cell_idx]
1290
  if has_red_text(cell):
1291
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1292
  replacements_made += cell_replacements
1293
-
1294
  if cell_replacements == 0:
1295
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1296
  replacements_made += surgical_fix
1297
-
1298
  if cell_replacements == 0:
1299
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1300
  replacements_made += management_summary_fix
1301
 
1302
- # Final declaration checks on last few tables
1303
  print(f"\n🎯 Final check for Declaration tables...")
1304
  for table in document.tables[-3:]:
1305
  if len(table.rows) <= 4:
1306
  if getattr(table, "_processed_operator_declaration", False):
1307
  print(f" ⏭️ Skipping - already processed by operator declaration handler")
1308
  continue
1309
- declaration_fix = handle_operator_declaration_fix(table, flat_json)
1310
  replacements_made += declaration_fix
1311
 
1312
  return replacements_made
1313
 
1314
  def process_paragraphs(document, flat_json):
1315
- """Process all paragraphs in the document"""
1316
  replacements_made = 0
1317
  print(f"\nπŸ” Processing paragraphs:")
1318
-
1319
  for para_idx, paragraph in enumerate(document.paragraphs):
1320
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1321
  if red_runs:
1322
  red_text_only = "".join(run.text for run in red_runs).strip()
1323
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
1324
 
1325
- json_value = find_matching_json_value(red_text_only, flat_json)
 
1326
 
1327
  if json_value is None:
1328
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1329
- json_value = find_matching_json_value("auditor signature", flat_json)
1330
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
1331
- json_value = find_matching_json_value("operator signature", flat_json)
 
1332
 
1333
  if json_value is not None:
1334
  replacement_text = get_value_as_string(json_value)
@@ -1338,21 +1126,16 @@ def process_paragraphs(document, flat_json):
1338
  for run in red_runs[1:]:
1339
  run.text = ''
1340
  replacements_made += 1
1341
-
1342
  return replacements_made
1343
 
1344
  def process_headings(document, flat_json):
1345
- """Process headings and their related content"""
1346
  replacements_made = 0
1347
  print(f"\nπŸ” Processing headings:")
1348
-
1349
  paragraphs = document.paragraphs
1350
-
1351
  for para_idx, paragraph in enumerate(paragraphs):
1352
  paragraph_text = paragraph.text.strip()
1353
  if not paragraph_text:
1354
  continue
1355
-
1356
  matched_heading = None
1357
  for category, patterns in HEADING_PATTERNS.items():
1358
  for pattern in patterns:
@@ -1361,26 +1144,20 @@ def process_headings(document, flat_json):
1361
  break
1362
  if matched_heading:
1363
  break
1364
-
1365
  if matched_heading:
1366
  print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
1367
-
1368
  if has_red_text_in_paragraph(paragraph):
1369
  print(f" πŸ”΄ Found red text in heading itself")
1370
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1371
  replacements_made += heading_replacements
1372
-
1373
  for next_para_offset in range(1, 6):
1374
  next_para_idx = para_idx + next_para_offset
1375
  if next_para_idx >= len(paragraphs):
1376
  break
1377
-
1378
  next_paragraph = paragraphs[next_para_idx]
1379
  next_text = next_paragraph.text.strip()
1380
-
1381
  if not next_text:
1382
  continue
1383
-
1384
  is_another_heading = False
1385
  for category, patterns in HEADING_PATTERNS.items():
1386
  for pattern in patterns:
@@ -1389,10 +1166,8 @@ def process_headings(document, flat_json):
1389
  break
1390
  if is_another_heading:
1391
  break
1392
-
1393
  if is_another_heading:
1394
  break
1395
-
1396
  if has_red_text_in_paragraph(next_paragraph):
1397
  print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
1398
  context_replacements = process_red_text_in_paragraph(
@@ -1401,55 +1176,46 @@ def process_headings(document, flat_json):
1401
  flat_json
1402
  )
1403
  replacements_made += context_replacements
1404
-
1405
  return replacements_made
1406
 
1407
  def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1408
- """Process red text within a paragraph using context"""
1409
  replacements_made = 0
1410
-
1411
  red_text_segments = []
1412
  for run in paragraph.runs:
1413
  if is_red(run) and run.text.strip():
1414
  red_text_segments.append(run.text.strip())
1415
-
1416
  if not red_text_segments:
1417
  return 0
1418
-
1419
  combined_red_text = " ".join(red_text_segments).strip()
1420
  print(f" πŸ” Red text found: '{combined_red_text}'")
1421
-
1422
- json_value = None
1423
- json_value = find_matching_json_value(combined_red_text, flat_json)
1424
 
1425
  if json_value is None:
1426
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
1427
  auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
1428
  for field in auditor_fields:
1429
- json_value = find_matching_json_value(field, flat_json)
1430
- if json_value is not None:
1431
- print(f" βœ… Found auditor match with field: '{field}'")
 
1432
  break
1433
-
1434
  elif "OPERATOR DECLARATION" in context_text.upper():
1435
  operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
1436
  for field in operator_fields:
1437
- json_value = find_matching_json_value(field, flat_json)
1438
- if json_value is not None:
1439
- print(f" βœ… Found operator match with field: '{field}'")
 
1440
  break
1441
 
1442
  if json_value is None:
1443
- context_queries = [
1444
- f"{context_text} {combined_red_text}",
1445
- combined_red_text,
1446
- context_text
1447
- ]
1448
-
1449
  for query in context_queries:
1450
- json_value = find_matching_json_value(query, flat_json)
1451
- if json_value is not None:
1452
- print(f" βœ… Found match with combined query")
 
1453
  break
1454
 
1455
  if json_value is not None:
@@ -1468,13 +1234,10 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1468
  return replacements_made
1469
 
1470
  # ============================================================================
1471
- # Main process function
1472
  # ============================================================================
1473
-
1474
  def process_hf(json_file, docx_file, output_file):
1475
- """Main processing function with comprehensive error handling"""
1476
  try:
1477
- # Load JSON
1478
  if hasattr(json_file, "read"):
1479
  json_data = json.load(json_file)
1480
  else:
@@ -1488,19 +1251,15 @@ def process_hf(json_file, docx_file, output_file):
1488
  print(f" - {key}: {value}")
1489
  print(f" ... and {len(flat_json) - 10} more keys\n")
1490
 
1491
- # Load DOCX
1492
  if hasattr(docx_file, "read"):
1493
  doc = Document(docx_file)
1494
  else:
1495
  doc = Document(docx_file)
1496
 
1497
- # Process document with all fixes
1498
  print("πŸš€ Starting comprehensive document processing...")
1499
-
1500
  table_replacements = process_tables(doc, flat_json)
1501
  paragraph_replacements = process_paragraphs(doc, flat_json)
1502
  heading_replacements = process_headings(doc, flat_json)
1503
-
1504
  total_replacements = table_replacements + paragraph_replacements + heading_replacements
1505
 
1506
  # Save unmatched headers for iterative improvement
@@ -1513,11 +1272,9 @@ def process_hf(json_file, docx_file, output_file):
1513
  except Exception as e:
1514
  print(f"⚠️ Could not save unmatched headers: {e}")
1515
 
1516
- # Save output docx
1517
  if hasattr(output_file, "write"):
1518
  doc.save(output_file)
1519
  else:
1520
- # If output path is a file path string
1521
  doc.save(output_file)
1522
 
1523
  print(f"\nβœ… Document saved as: {output_file}")
@@ -1534,10 +1291,6 @@ def process_hf(json_file, docx_file, output_file):
1534
  import traceback
1535
  traceback.print_exc()
1536
 
1537
- # ============================================================================
1538
- # CLI entrypoint
1539
- # ============================================================================
1540
-
1541
  if __name__ == "__main__":
1542
  import sys
1543
  if len(sys.argv) != 4:
 
1
  #!/usr/bin/env python3
2
  """
3
+ pipeline.py β€” safer matching and operator-declaration protections
4
+
5
+ Key improvements:
6
+ - find_matching_json_key_and_value() returns (key, value) so callers can accept/reject by key.
7
+ - Higher fuzzy thresholds for risky substitutions.
8
+ - Operator Declaration: avoid using attendance lists / unrelated keys for Position Title.
9
+ - Vehicle header mapping: stronger normalized substring/ token matching for long headers.
10
+ - Preserves existing logging and all previous handlers/logic.
 
 
11
  """
12
 
13
  import json
14
  from docx import Document
15
  from docx.shared import RGBColor
16
  import re
17
+ from typing import Any, Tuple, Optional
 
18
 
19
  # ============================================================================
20
+ # Heading patterns for document structure detection (unchanged)
21
  # ============================================================================
 
22
  HEADING_PATTERNS = {
23
  "main": [
24
  r"NHVAS\s+Audit\s+Summary\s+Report",
 
43
  }
44
 
45
  # ============================================================================
46
+ # Utility helpers
47
  # ============================================================================
48
  _unmatched_headers = {}
 
49
  def record_unmatched_header(header: str):
50
  if not header:
51
  return
52
  _unmatched_headers[header] = _unmatched_headers.get(header, 0) + 1
53
 
 
 
 
 
54
  def load_json(filepath):
55
  with open(filepath, 'r', encoding='utf-8') as file:
56
  return json.load(file)
 
80
  elif len(value) == 1:
81
  return str(value[0])
82
  else:
83
+ # Keep lists intact for special patterns (e.g., ACN digits) but default to join
84
  if "australian company number" in field_name.lower() or "company number" in field_name.lower():
85
  return value
86
+ return " ".join(str(v) for v in value)
 
87
  else:
88
  return str(value)
89
 
 
107
  return True
108
  return False
109
 
 
110
  def normalize_header_text(s: str) -> str:
111
  if not s:
112
  return ""
113
+ s = re.sub(r'\([^)]*\)', ' ', s) # remove parenthetical content
 
 
114
  s = s.replace("/", " ")
 
115
  s = re.sub(r'[^\w\s\#\%]', ' ', s)
116
  s = re.sub(r'\s+', ' ', s).strip().lower()
117
+ # canonical tweaks
118
  s = s.replace('registrationno', 'registration number')
119
  s = s.replace('registrationnumber', 'registration number')
 
120
  s = s.replace('sub-contractor', 'sub contractor')
121
+ s = s.replace('sub contracted', 'sub contractor')
122
+ return s.strip()
 
 
123
 
124
  # ============================================================================
125
+ # JSON matching functions
126
+ # - find_matching_json_value: (keeps behavior used elsewhere)
127
+ # - find_matching_json_key_and_value: returns (key, value) so callers can
128
+ # decide whether to use an entry based on the matched key.
129
  # ============================================================================
 
130
  def find_matching_json_value(field_name, flat_json):
131
+ """Legacy API: return value only (preserves existing callers)."""
132
+ result = find_matching_json_key_and_value(field_name, flat_json)
133
+ return result[1] if result else None
134
+
135
+ def find_matching_json_key_and_value(field_name, flat_json) -> Optional[Tuple[str, Any]]:
136
+ """
137
+ Return (matched_key, matched_value) or None.
138
+ Safer thresholds: fuzzy matches require >=0.35 by default.
139
+ """
140
  field_name = (field_name or "").strip()
141
  if not field_name:
142
  return None
143
 
144
+ # Exact match
145
  if field_name in flat_json:
146
  print(f" βœ… Direct match found for key '{field_name}'")
147
+ return field_name, flat_json[field_name]
148
 
149
+ # Case-insensitive exact
150
  for key, value in flat_json.items():
151
  if key.lower() == field_name.lower():
152
+ print(f" βœ… Case-insensitive match found for key '{field_name}' -> '{key}'")
153
+ return key, value
154
 
155
+ # Special-case 'print name' preference for operator vs auditor (prefer fully-qualified)
156
  if field_name.lower().strip() == "print name":
157
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
158
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
 
159
  if operator_keys:
160
  print(f" βœ… Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
161
+ return operator_keys[0], flat_json[operator_keys[0]]
162
  elif auditor_keys:
163
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
164
+ return auditor_keys[0], flat_json[auditor_keys[0]]
165
 
166
+ # Suffix match for nested keys (e.g., 'section.field')
167
  for key, value in flat_json.items():
168
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
169
+ print(f" βœ… Suffix match found for key '{field_name}' -> '{key}'")
170
+ return key, value
171
 
172
+ # Clean and exact
173
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
174
  clean_field = re.sub(r'\s+', ' ', clean_field)
175
  for key, value in flat_json.items():
176
  clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
177
  clean_key = re.sub(r'\s+', ' ', clean_key)
178
  if clean_field == clean_key:
179
+ print(f" βœ… Clean match found for key '{field_name}' -> '{key}'")
180
+ return key, value
181
 
182
+ # Fuzzy matching with token scoring
183
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
184
  if not field_words:
185
  return None
186
 
 
 
187
  best_key = None
188
+ best_value = None
189
+ best_score = 0.0
190
 
191
  for key, value in flat_json.items():
192
  key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
193
  if not key_words:
194
  continue
195
 
196
+ common = field_words.intersection(key_words)
197
+ if not common:
198
+ # allow substring in normalized forms as a weaker fallback
199
+ norm_field = normalize_header_text(field_name)
200
+ norm_key = normalize_header_text(key)
201
+ if norm_field and norm_key and (norm_field in norm_key or norm_key in norm_field):
202
+ # substring score based on length ratio
203
+ substring_score = min(len(norm_field), len(norm_key)) / max(len(norm_field), len(norm_key))
204
+ final_score = 0.4 * substring_score
205
+ else:
206
+ final_score = 0.0
207
+ else:
208
+ similarity = len(common) / len(field_words.union(key_words))
209
+ coverage = len(common) / len(field_words)
210
  final_score = (similarity * 0.6) + (coverage * 0.4)
211
 
212
+ if final_score > best_score:
213
+ best_score = final_score
214
+ best_key = key
215
+ best_value = value
216
 
217
+ # Accept only reasonable fuzzy matches (threshold 0.35)
218
+ if best_key and best_score >= 0.35:
219
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
220
+ return best_key, best_value
221
 
222
  print(f" ❌ No match found for '{field_name}'")
223
  return None
224
 
225
  # ============================================================================
226
+ # Red text helpers (unchanged except kept robust)
227
  # ============================================================================
 
228
  def extract_red_text_segments(cell):
229
  red_segments = []
230
  for para_idx, paragraph in enumerate(cell.paragraphs):
 
237
  segment_runs.append((para_idx, run_idx, run))
238
  else:
239
  if segment_runs:
240
+ red_segments.append({'text': current_segment, 'runs': segment_runs.copy(), 'paragraph_idx': para_idx})
 
 
 
 
241
  current_segment = ""
242
  segment_runs = []
243
  if segment_runs:
244
+ red_segments.append({'text': current_segment, 'runs': segment_runs.copy(), 'paragraph_idx': para_idx})
 
 
 
 
245
  return red_segments
246
 
247
  def replace_all_red_segments(red_segments, replacement_text):
248
  if not red_segments:
249
  return 0
 
250
  if '\n' in replacement_text:
251
  replacement_lines = replacement_text.split('\n')
252
  else:
253
  replacement_lines = [replacement_text]
 
254
  replacements_made = 0
255
+ first_segment = red_segments[0]
256
+ if first_segment['runs']:
257
+ first_run = first_segment['runs'][0][2]
258
+ first_run.text = replacement_lines[0]
259
+ first_run.font.color.rgb = RGBColor(0, 0, 0)
260
+ replacements_made = 1
261
+ for _, _, run in first_segment['runs'][1:]:
262
+ run.text = ''
 
 
 
263
  for segment in red_segments[1:]:
264
  for _, _, run in segment['runs']:
265
  run.text = ''
 
266
  if len(replacement_lines) > 1 and red_segments:
267
  try:
268
  first_run = red_segments[0]['runs'][0][2]
269
  paragraph = first_run.element.getparent()
270
  from docx.oxml import OxmlElement
 
271
  for line in replacement_lines[1:]:
272
  if line.strip():
273
  br = OxmlElement('w:br')
 
279
  first_run = red_segments[0]['runs'][0][2]
280
  first_run.text = ' '.join(replacement_lines)
281
  first_run.font.color.rgb = RGBColor(0, 0, 0)
 
282
  return replacements_made
283
 
284
  def replace_single_segment(segment, replacement_text):
 
298
  return replace_all_red_segments(red_segments, replacement_text)
299
 
300
  # ============================================================================
301
+ # Specialized handlers (vehicle, attendance, management, operator) with fixes
302
  # ============================================================================
303
 
304
  def handle_australian_company_number(row, company_numbers):
 
314
  return replacements_made
315
 
316
  def handle_vehicle_registration_table(table, flat_json):
317
+ """
318
+ Stronger header normalization + substring matching for long headers.
319
+ Keeps existing behavior but reduces 'No mapping found' by using normalized substring matching.
320
+ """
321
  replacements_made = 0
322
 
323
+ # Build candidate vehicle_section similar to prior logic
 
 
 
 
 
 
324
  vehicle_section = None
325
+ # Prefer keys explicitly mentioning 'registration' or 'vehicle'
326
+ candidates = [(k, v) for k, v in flat_json.items() if 'registration' in k.lower() or 'vehicle' in k.lower()]
327
+ if candidates:
328
+ # prefer the one with longest key match (likely most specific)
329
+ candidates.sort(key=lambda kv: -len(kv[0]))
330
+ vehicle_section = candidates[0][1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
  # fallback: collect flattened keys that look like vehicle columns
333
  if vehicle_section is None:
334
  potential_columns = {}
335
  for key, value in flat_json.items():
336
  lk = key.lower()
337
+ if any(col_name in lk for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "fault recording", "fault repair", "daily checks", "roadworthiness"]):
338
  if "." in key:
339
  column_name = key.split(".")[-1]
340
  else:
 
348
  print(f" ❌ Vehicle registration data not found in JSON")
349
  return 0
350
 
351
+ # Normalize vehicle_section into dict of column_label -> list/value
352
  if isinstance(vehicle_section, list):
353
+ # if list of dicts, pivot
354
  if vehicle_section and isinstance(vehicle_section[0], dict):
355
  flattened = {}
356
  for entry in vehicle_section:
357
  for k, v in entry.items():
358
  flattened.setdefault(k, []).append(v)
359
  vehicle_section = flattened
360
+ else:
361
+ # can't interpret, bail
362
+ vehicle_section = {}
363
 
364
  if not isinstance(vehicle_section, dict):
 
365
  try:
366
  vehicle_section = dict(vehicle_section)
367
  except Exception:
368
+ vehicle_section = {}
369
 
370
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
371
 
372
+ # Find header row (look for registration + number or reg no)
373
  header_row_idx = -1
374
  header_row = None
375
  for row_idx, row in enumerate(table.rows):
376
  row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
377
+ if ("registration" in row_text and "number" in row_text) or "reg no" in row_text or "registration no" in row_text:
378
  header_row_idx = row_idx
379
  header_row = row
380
  break
 
 
 
 
 
 
 
 
381
 
382
  if header_row_idx == -1:
383
  print(f" ❌ Could not find header row in vehicle table")
 
385
 
386
  print(f" βœ… Found header row at index {header_row_idx}")
387
 
388
+ # Build master labels from vehicle_section keys
 
 
389
  master_labels = {}
390
  for orig_key in vehicle_section.keys():
391
  norm = normalize_header_text(str(orig_key))
392
  if norm:
393
+ # if there is collision, prefer longer orig_key (more specific)
394
+ if norm in master_labels:
395
+ if len(orig_key) > len(master_labels[norm]):
396
+ master_labels[norm] = orig_key
397
+ else:
398
+ master_labels[norm] = orig_key
 
 
 
 
 
 
 
399
 
400
+ # Map header cells using normalized token overlap + substring fallback
401
+ column_mapping = {}
402
  for col_idx, cell in enumerate(header_row.cells):
403
  header_text = get_clean_text(cell).strip()
404
  if not header_text:
405
  continue
406
+ header_key = header_text.strip().lower()
407
+ if header_key in {"no", "no.", "#"}:
408
  continue
409
 
410
  norm_header = normalize_header_text(header_text)
 
416
  best_match = master_labels[norm_header]
417
  best_score = 1.0
418
  else:
419
+ # token overlap
420
  header_tokens = set(t for t in norm_header.split() if len(t) > 2)
421
  for norm_key, orig_label in master_labels.items():
422
  key_tokens = set(t for t in norm_key.split() if len(t) > 2)
 
426
  if common:
427
  score = len(common) / max(1, len(header_tokens.union(key_tokens)))
428
  else:
429
+ # substring fallback on normalized strings
430
  if norm_header in norm_key or norm_key in norm_header:
431
  score = min(len(norm_header), len(norm_key)) / max(len(norm_header), len(norm_key))
432
  else:
 
435
  best_score = score
436
  best_match = orig_label
437
 
438
+ # additional heuristic: if header contains 'roadworthiness' and any master_labels key contains that token, accept
439
+ if not best_match:
440
+ for norm_key, orig_label in master_labels.items():
441
+ if 'roadworthiness' in norm_header and 'roadworthiness' in norm_key:
442
+ best_match = orig_label
443
+ best_score = 0.65
444
+ break
445
+
446
  if best_match and best_score >= 0.30:
447
  column_mapping[col_idx] = best_match
448
+ print(f" πŸ“Œ Column {col_idx}: '{header_text}' -> '{best_match}' (norm:'{norm_header}' score:{best_score:.2f})")
449
  else:
450
+ print(f" ⚠️ No mapping found for '{header_text}' (norm:'{norm_header}')")
451
  record_unmatched_header(header_text)
452
 
453
  if not column_mapping:
454
  print(f" ❌ No column mappings found")
455
  return 0
456
 
457
+ # Determine how many rows of data to populate
458
  max_data_rows = 0
459
  for json_key, data in vehicle_section.items():
460
  if isinstance(data, list):
 
462
 
463
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
464
 
465
+ # Populate or add rows
466
  for data_row_index in range(max_data_rows):
467
  table_row_idx = header_row_idx + 1 + data_row_index
 
468
  if table_row_idx >= len(table.rows):
469
+ print(f" ⚠️ Row {table_row_idx + 1} doesn't exist, adding one")
470
+ table.add_row()
 
 
471
 
472
  row = table.rows[table_row_idx]
473
  print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
 
474
  for col_idx, json_key in column_mapping.items():
475
  if col_idx < len(row.cells):
476
  cell = row.cells[col_idx]
 
477
  column_data = vehicle_section.get(json_key, [])
478
  if isinstance(column_data, list) and data_row_index < len(column_data):
479
  replacement_value = str(column_data[data_row_index])
 
480
  cell_text = get_clean_text(cell)
481
  if has_red_text(cell) or not cell_text.strip():
482
  if not cell_text.strip():
483
  cell.text = replacement_value
484
  replacements_made += 1
485
+ print(f" -> Added '{replacement_value}' to empty cell (col '{json_key}')")
486
  else:
487
  cell_replacements = replace_red_text_in_cell(cell, replacement_value)
488
  replacements_made += cell_replacements
489
  if cell_replacements > 0:
490
+ print(f" -> Replaced red text with '{replacement_value}' (col '{json_key}')")
491
 
492
  return replacements_made
493
 
494
  def handle_attendance_list_table_enhanced(table, flat_json):
495
+ """Same as before β€” preserved behavior."""
496
  replacements_made = 0
497
+ attendance_patterns = ["attendance list", "names and position titles", "attendees"]
 
 
 
 
 
 
498
  found_attendance_row = None
499
  for row_idx, row in enumerate(table.rows[:3]):
500
  for cell_idx, cell in enumerate(row.cells):
 
505
  break
506
  if found_attendance_row is not None:
507
  break
 
508
  if found_attendance_row is None:
509
  return 0
510
 
 
515
  "attendance list",
516
  "attendees"
517
  ]
 
518
  print(f" πŸ” Searching for attendance data in JSON...")
 
519
  for search_key in attendance_search_keys:
520
+ kv = find_matching_json_key_and_value(search_key, flat_json)
521
+ if kv:
522
+ attendance_value = kv[1]
523
+ print(f" βœ… Found attendance data with key: '{kv[0]}'")
524
  print(f" πŸ“Š Raw value: {attendance_value}")
525
  break
 
526
  if attendance_value is None:
527
  print(f" ❌ No attendance data found in JSON")
528
  return 0
529
 
530
+ # Find red text candidate cell
531
  target_cell = None
532
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
 
533
  for row_idx, row in enumerate(table.rows):
534
  for cell_idx, cell in enumerate(row.cells):
535
  if has_red_text(cell):
 
 
536
  red_text = ""
537
  for paragraph in cell.paragraphs:
538
  for run in paragraph.runs:
539
  if is_red(run):
540
  red_text += run.text
541
+ if red_text.strip():
542
+ print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
543
+ print(f" πŸ“‹ Red text content: '{red_text[:60]}...'")
544
+ red_lower = red_text.lower()
545
+ if any(ind in red_lower for ind in ['manager', 'director', 'auditor', '–', '-']):
546
+ target_cell = cell
547
+ print(f" βœ… This looks like attendance data - using this cell")
548
+ break
549
+ if target_cell:
550
  break
551
 
552
  if target_cell is None:
 
555
 
556
  if has_red_text(target_cell):
557
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
 
558
  if isinstance(attendance_value, list):
559
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
560
  else:
561
  attendance_list = [str(attendance_value).strip()]
 
562
  print(f" πŸ“ Attendance items to add:")
563
  for i, item in enumerate(attendance_list):
564
  print(f" {i+1}. {item}")
 
565
  replacement_text = "\n".join(attendance_list)
566
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
567
  replacements_made += cell_replacements
 
568
  print(f" βœ… Added {len(attendance_list)} attendance items")
569
  print(f" πŸ“Š Replacements made: {cell_replacements}")
 
570
  return replacements_made
571
 
572
  def fix_management_summary_details_column(table, flat_json):
573
+ """Preserve behavior but prefer scoped mgmt dicts."""
574
  replacements_made = 0
 
575
  print(f" 🎯 FIX: Management Summary DETAILS column processing")
 
 
576
  table_text = ""
577
  for row in table.rows[:3]:
578
  for cell in row.cells:
579
  table_text += get_clean_text(cell).lower() + " "
 
 
580
  mgmt_types = []
581
  if "mass management" in table_text or "mass" in table_text:
582
  mgmt_types.append("Mass Management Summary")
583
  if "maintenance management" in table_text or "maintenance" in table_text:
584
  mgmt_types.append("Maintenance Management Summary")
585
+ if "fatigue management" in table_text or "fatigue" in table_text:
586
  mgmt_types.append("Fatigue Management Summary")
 
587
  if not mgmt_types:
 
588
  if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
589
  mgmt_types.append("Mass Management Summary")
 
590
  if not mgmt_types:
591
  return 0
 
592
  for mgmt_type in mgmt_types:
593
  print(f" βœ… Confirmed {mgmt_type} table processing")
 
594
  mgmt_data = flat_json.get(mgmt_type)
595
  if not isinstance(mgmt_data, dict):
 
596
  for key in flat_json.keys():
597
  if mgmt_type.split()[0].lower() in key.lower() and "summary" in key.lower():
598
  mgmt_data = flat_json.get(key)
 
600
  if not isinstance(mgmt_data, dict):
601
  print(f" ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
602
  continue
 
 
603
  for row_idx, row in enumerate(table.rows):
604
  if len(row.cells) >= 2:
605
  standard_cell = row.cells[0]
606
  details_cell = row.cells[1]
607
  standard_text = get_clean_text(standard_cell).strip().lower()
 
 
608
  if "std 5" in standard_text or "verification" in standard_text:
609
  if has_red_text(details_cell):
 
 
610
  std_val = None
 
611
  for candidate in ("Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"):
612
  std_val = mgmt_data.get(candidate)
613
  if std_val is not None:
614
  break
 
615
  if std_val is None:
616
  for k, v in mgmt_data.items():
617
  if 'std 5' in k.lower() or 'verification' in k.lower():
 
623
  replacements_made += cell_replacements
624
  if cell_replacements:
625
  print(f" βœ… Replaced Std 5. Verification details for {mgmt_type}")
 
 
626
  if "std 6" in standard_text or "internal review" in standard_text:
627
  if has_red_text(details_cell):
 
628
  std_val = None
629
  for candidate in ("Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"):
630
  std_val = mgmt_data.get(candidate)
 
641
  replacements_made += cell_replacements
642
  if cell_replacements:
643
  print(f" βœ… Replaced Std 6. Internal Review details for {mgmt_type}")
 
644
  return replacements_made
645
 
646
+ # ============================================================================
647
+ # Canonical operator declaration fixer β€” SAFER
648
+ # ============================================================================
649
  def fix_operator_declaration_empty_values(table, flat_json):
650
  replacements_made = 0
 
651
  print(f" 🎯 FIX: Operator Declaration empty values processing")
 
652
  table_context = ""
653
  for row in table.rows:
654
  for cell in row.cells:
655
  table_context += get_clean_text(cell).lower() + " "
 
656
  if not ("print name" in table_context and "position title" in table_context):
657
  return 0
 
658
  print(f" βœ… Confirmed Operator Declaration table")
659
 
660
  def parse_name_and_position(value):
 
665
  return None, None
666
  if len(value) == 1:
667
  return str(value[0]).strip(), None
668
+ # common [name, position] pattern
669
  first = str(value[0]).strip()
670
  second = str(value[1]).strip()
671
  if first and second:
672
  return first, second
673
  value = " ".join(str(v).strip() for v in value if str(v).strip())
 
674
  s = str(value).strip()
675
  if not s:
676
  return None, None
 
677
  parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
678
  if len(parts) >= 2:
679
  left = parts[0].strip()
 
685
  if any(ind in left.lower() for ind in role_indicators) and not any(ind in right.lower() for ind in role_indicators):
686
  return right, left
687
  return left, right
 
688
  tokens = s.split()
689
  if len(tokens) >= 2:
690
  last = tokens[-1]
 
692
  'coordinator', 'driver', 'operator', 'representative', 'chief']
693
  if any(ind == last.lower() for ind in role_indicators):
694
  return " ".join(tokens[:-1]), last
 
695
  return s, None
696
 
697
  for row_idx, row in enumerate(table.rows):
698
  if len(row.cells) >= 2:
699
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
700
  cell2_text = get_clean_text(row.cells[1]).strip().lower()
701
+ # header detection
702
  if "print name" in cell1_text and "position" in cell2_text:
703
  print(f" πŸ“Œ Found header row at {row_idx + 1}")
 
704
  if row_idx + 1 < len(table.rows):
705
  data_row = table.rows[row_idx + 1]
706
  if len(data_row.cells) >= 2:
707
  name_cell = data_row.cells[0]
708
  position_cell = data_row.cells[1]
 
709
  name_text = get_clean_text(name_cell).strip()
710
  position_text = get_clean_text(position_cell).strip()
711
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
712
 
713
+ # Prefer exact qualified keys first (use key-aware lookup)
714
+ name_kv = find_matching_json_key_and_value("Operator Declaration.Print Name", flat_json) or find_matching_json_key_and_value("Print Name", flat_json)
715
+ position_kv = find_matching_json_key_and_value("Operator Declaration.Position Title", flat_json) or find_matching_json_key_and_value("Position Title", flat_json)
716
+
717
+ name_value = name_kv[1] if name_kv else None
718
+ name_key = name_kv[0] if name_kv else None
719
 
720
+ position_value = position_kv[1] if position_kv else None
721
+ position_key = position_kv[0] if position_kv else None
 
722
 
723
+ # parse combined cases
724
  parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
725
  parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
726
 
 
732
  elif name_value is not None:
733
  final_name = get_value_as_string(name_value)
734
 
735
+ # Position acceptance policy:
736
+ # - Accept position_value ONLY if matched key indicates position/title OR parsed value looks like a role
737
+ def looks_like_role(s: str) -> bool:
738
+ if not s:
739
+ return False
740
+ s = s.lower()
741
+ roles = ['manager', 'auditor', 'owner', 'director', 'supervisor', 'coordinator', 'driver', 'operator', 'representative', 'chief']
742
+ # short role descriptions or containing role token
743
+ if any(r in s for r in roles):
744
+ return True
745
+ # single/short token likely role (<=4 tokens)
746
+ if len(s.split()) <= 4 and any(c.isalpha() for c in s):
747
+ return True
748
+ return False
749
+
750
+ # Only use position_value if the matched key strongly indicates position/title
751
+ use_position = False
752
+ if position_kv:
753
+ k_lower = (position_key or "").lower()
754
+ if ("position" in k_lower or "title" in k_lower or "role" in k_lower):
755
+ use_position = True
756
+ # Avoid using attendance keys or attendance text as position source
757
+ if position_kv and ("attendance" in position_key.lower() or "attendance list" in position_key.lower() or "attendees" in position_key.lower()):
758
+ use_position = False
759
+
760
+ if use_position:
761
+ # choose parsed pos if available
762
+ if parsed_pos_from_posval:
763
+ final_pos = parsed_pos_from_posval
764
+ else:
765
+ final_pos = get_value_as_string(position_value) if position_value is not None else None
766
+ else:
767
+ # allow parsed position gleaned from name_value (if it looks like a role)
768
+ if parsed_pos_from_nameval and looks_like_role(parsed_pos_from_nameval):
769
+ final_pos = parsed_pos_from_nameval
770
 
771
+ # final normalization
772
  if isinstance(final_name, list):
773
  final_name = " ".join(str(x) for x in final_name).strip()
774
  if isinstance(final_pos, list):
 
785
  low = name_str.lower()
786
  if any(bp in low for bp in bad_phrases):
787
  return False
788
+ return len(name_str) > 1 and any(c.isalpha() for c in name_str)
789
 
790
+ # Write name if empty or red
791
  if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
792
  if has_red_text(name_cell):
793
  replace_red_text_in_cell(name_cell, final_name)
 
796
  replacements_made += 1
797
  print(f" βœ… Updated Print Name -> '{final_name}'")
798
 
799
+ # Write position if empty or red and final_pos appears role-like
800
+ if (not position_text or has_red_text(position_cell)) and final_pos and looks_like_role(final_pos):
801
  if has_red_text(position_cell):
802
  replace_red_text_in_cell(position_cell, final_pos)
803
  else:
 
824
  for i, segment in enumerate(red_segments):
825
  segment_text = segment['text'].strip()
826
  if segment_text:
827
+ kv = find_matching_json_key_and_value(segment_text, flat_json)
828
+ if kv:
829
+ replacement_text = get_value_as_string(kv[1], segment_text)
830
  if replace_single_segment(segment, replacement_text):
831
  replacements_made += 1
832
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
 
844
  return 0
845
  nature_indicators = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
846
  if any(indicator in red_text.lower() for indicator in nature_indicators):
847
+ kv = find_matching_json_key_and_value("Nature of Business", flat_json) or find_matching_json_key_and_value("Nature of the Operators Business (Summary)", flat_json)
848
+ if kv:
849
+ replacement_text = get_value_as_string(kv[1], "Nature of Business")
850
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
851
  replacements_made += cell_replacements
852
  print(f" βœ… Fixed Nature of Business multiline content")
 
864
  return 0
865
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
866
  for mgmt_type in management_types:
867
+ if mgmt_type in flat_json and isinstance(flat_json[mgmt_type], dict):
868
  mgmt_data = flat_json[mgmt_type]
869
+ for std_key, std_value in mgmt_data.items():
870
+ if isinstance(std_value, list) and std_value:
871
+ if len(red_text) > 10:
872
+ for item in std_value:
873
+ if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
874
+ replacement_text = "\n".join(str(i) for i in std_value)
875
+ cell_replacements = replace_red_text_in_cell(cell, replacement_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  replacements_made += cell_replacements
877
+ print(f" βœ… Fixed {mgmt_type} - {std_key}")
878
+ return replacements_made
 
 
 
 
 
 
 
879
  return replacements_made
880
 
881
def handle_print_accreditation_section(table, flat_json):
    """Fill red-text cells in a Print Accreditation table from flat_json.

    Skips tables already marked as (or recognised as) Operator Declaration
    tables so the two handlers do not clobber each other's cells.

    Returns:
        int: number of red-text replacements made.
    """
    replacements_made = 0

    # Another handler may already have claimed this table via a marker attribute.
    if getattr(table, "_processed_operator_declaration", False):
        print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
        return 0

    table_context = ""
    for row in table.rows:
        for cell in row.cells:
            table_context += get_clean_text(cell).lower() + " "

    # Operator Declaration tables share the same headers; detect and skip them here too.
    if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
        print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
        return 0

    print(f" πŸ“‹ Processing Print Accreditation section")

    # Loop-invariant candidate JSON field names, hoisted out of the per-cell loop.
    accreditation_fields = [
        "(print accreditation name)",
        "Operator name (Legal entity)",
        "Print accreditation name"
    ]

    for row_idx, row in enumerate(table.rows):
        for cell_idx, cell in enumerate(row.cells):
            if has_red_text(cell):
                for field in accreditation_fields:
                    kv = find_matching_json_key_and_value(field, flat_json)
                    if kv:
                        replacement_text = get_value_as_string(kv[1], field)
                        if replacement_text.strip():
                            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f" βœ… Fixed accreditation: {kv[0]}")
                                # Stop trying further fields once this cell is fixed.
                                break
    return replacements_made
913
 
914
def process_single_column_sections(cell, key_text, flat_json):
    """Resolve red text inside a single-column section cell.

    Looks up a replacement first by the cell's own red text, then by the
    section key text, and writes the match over the red runs.

    Returns:
        int: number of red-text replacements made.
    """
    if not has_red_text(cell):
        return 0

    # Gather the red-coloured run text from every paragraph in the cell.
    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    )

    stripped = red_text.strip()
    if not stripped:
        return 0

    # Prefer a match on the red text itself; fall back to the section key.
    kv = find_matching_json_key_and_value(stripped, flat_json)
    if not kv:
        kv = find_matching_json_key_and_value(key_text, flat_json)
    if not kv:
        return 0

    section_replacement = get_value_as_string(kv[1], stripped)
    cell_replacements = replace_red_text_in_cell(cell, section_replacement)
    if cell_replacements > 0:
        print(f" βœ… Fixed single column section: '{key_text}'")
    return cell_replacements
933
 
934
  # ============================================================================
935
+ # Main table/paragraph/heading processing (preserve logic + use new helpers)
936
  # ============================================================================
 
937
  def process_tables(document, flat_json):
 
938
  replacements_made = 0
 
939
  for table_idx, table in enumerate(document.tables):
940
  print(f"\nπŸ” Processing table {table_idx + 1}:")
 
 
941
  table_text = ""
942
  for row in table.rows[:3]:
943
  for cell in row.cells:
944
  table_text += get_clean_text(cell).lower() + " "
945
 
 
946
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
947
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
948
  has_details = "details" in table_text
 
952
  summary_fixes = fix_management_summary_details_column(table, flat_json)
953
  replacements_made += summary_fixes
954
 
 
955
  summary_replacements = 0
956
  for row_idx, row in enumerate(table.rows):
957
  for cell_idx, cell in enumerate(row.cells):
958
  if has_red_text(cell):
 
959
  for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
960
  if mgmt_type.lower().replace(" summary", "") in table_text:
961
  if mgmt_type in flat_json:
 
980
  replacements_made += summary_replacements
981
  continue
982
 
983
+ # Vehicle tables detection
984
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension", "registration"]
985
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
986
  if indicator_count >= 2:
 
989
  replacements_made += vehicle_replacements
990
  continue
991
 
992
+ # Attendance
993
  if "attendance list" in table_text and "names and position titles" in table_text:
994
  print(f" πŸ‘₯ Detected Attendance List table")
995
  attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
996
  replacements_made += attendance_replacements
997
  continue
998
 
999
+ # Print Accreditation / Operator Declaration
1000
  print_accreditation_indicators = ["print name", "position title"]
1001
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
 
1002
  if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
1003
  print(f" πŸ“‹ Detected Print Accreditation/Operator Declaration table")
1004
  declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1005
  replacements_made += declaration_fixes
 
1006
  if not getattr(table, "_processed_operator_declaration", False):
1007
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1008
  replacements_made += print_accreditation_replacements
 
1009
  continue
1010
 
1011
+ # Regular table rows handling (preserved)
1012
  for row_idx, row in enumerate(table.rows):
1013
  if len(row.cells) < 1:
1014
  continue
 
1015
  key_cell = row.cells[0]
1016
  key_text = get_clean_text(key_cell)
 
1017
  if not key_text:
1018
  continue
 
1019
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
1020
+ kv = find_matching_json_key_and_value(key_text, flat_json)
1021
+ json_value = kv[1] if kv else None
1022
 
1023
  if json_value is not None:
1024
  replacement_text = get_value_as_string(json_value, key_text)
1025
 
1026
+ # ACN handling
1027
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1028
  cell_replacements = handle_australian_company_number(row, json_value)
1029
  replacements_made += cell_replacements
1030
 
1031
+ # section headers
1032
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1033
  print(f" βœ… Section header detected, checking next row...")
1034
  next_row = table.rows[row_idx + 1]
 
1036
  if has_red_text(cell):
1037
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
1038
  if isinstance(json_value, list):
1039
+ section_text = "\n".join(str(item) for item in json_value)
1040
+ else:
1041
+ section_text = replacement_text
1042
+ cell_replacements = replace_red_text_in_cell(cell, section_text)
1043
  replacements_made += cell_replacements
1044
  if cell_replacements > 0:
1045
  print(f" -> Replaced section content")
1046
 
1047
+ # single column
1048
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1049
  if has_red_text(key_cell):
1050
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1051
  replacements_made += cell_replacements
1052
 
1053
+ # key-value pairs
1054
  else:
1055
  for cell_idx in range(1, len(row.cells)):
1056
  value_cell = row.cells[cell_idx]
 
1060
  replacements_made += cell_replacements
1061
 
1062
  else:
1063
+ # fallback single cell red-text key
1064
  if len(row.cells) == 1 and has_red_text(key_cell):
1065
  red_text = ""
1066
  for paragraph in key_cell.paragraphs:
 
1068
  if is_red(run):
1069
  red_text += run.text
1070
  if red_text.strip():
1071
+ kv2 = find_matching_json_key_and_value(red_text.strip(), flat_json)
1072
+ if kv2:
1073
+ section_replacement = get_value_as_string(kv2[1], red_text.strip())
1074
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1075
  replacements_made += cell_replacements
1076
 
1077
+ # attempt multiple red-segments or surgical fixes
1078
  for cell_idx in range(len(row.cells)):
1079
  cell = row.cells[cell_idx]
1080
  if has_red_text(cell):
1081
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1082
  replacements_made += cell_replacements
 
1083
  if cell_replacements == 0:
1084
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1085
  replacements_made += surgical_fix
 
1086
  if cell_replacements == 0:
1087
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1088
  replacements_made += management_summary_fix
1089
 
1090
+ # Final operator/auditor declaration check on last few tables
1091
  print(f"\n🎯 Final check for Declaration tables...")
1092
  for table in document.tables[-3:]:
1093
  if len(table.rows) <= 4:
1094
  if getattr(table, "_processed_operator_declaration", False):
1095
  print(f" ⏭️ Skipping - already processed by operator declaration handler")
1096
  continue
1097
+ declaration_fix = fix_operator_declaration_empty_values(table, flat_json)
1098
  replacements_made += declaration_fix
1099
 
1100
  return replacements_made
1101
 
1102
  def process_paragraphs(document, flat_json):
 
1103
  replacements_made = 0
1104
  print(f"\nπŸ” Processing paragraphs:")
 
1105
  for para_idx, paragraph in enumerate(document.paragraphs):
1106
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1107
  if red_runs:
1108
  red_text_only = "".join(run.text for run in red_runs).strip()
1109
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
1110
 
1111
+ kv = find_matching_json_key_and_value(red_text_only, flat_json)
1112
+ json_value = kv[1] if kv else None
1113
 
1114
  if json_value is None:
1115
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1116
+ kv = find_matching_json_key_and_value("auditor signature", flat_json)
1117
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
1118
+ kv = find_matching_json_key_and_value("operator signature", flat_json)
1119
+ json_value = kv[1] if kv else None
1120
 
1121
  if json_value is not None:
1122
  replacement_text = get_value_as_string(json_value)
 
1126
  for run in red_runs[1:]:
1127
  run.text = ''
1128
  replacements_made += 1
 
1129
  return replacements_made
1130
 
1131
def process_headings(document, flat_json):
    """Scan document paragraphs for known section headings and fill in red
    placeholder text found in the heading itself or in the paragraphs that
    immediately follow it.

    For each paragraph matching a pattern in HEADING_PATTERNS, red runs in
    the heading are resolved via the flattened JSON, and up to the next five
    non-empty paragraphs are scanned as well (stopping early when another
    heading begins) so that values placed just under a section title are
    matched using the heading as context.

    Args:
        document: python-docx Document, modified in place.
        flat_json: flattened {key: value} mapping used for lookups.

    Returns:
        int: number of red-text replacements performed.

    NOTE(review): reconstructed from a corrupted diff rendering; the regex
    matching lines were elided in the source and are restored from the
    surrounding control flow — confirm against the repository original.
    """
    replacements_made = 0
    print(f"\nπŸ” Processing headings:")

    paragraphs = document.paragraphs

    for para_idx, paragraph in enumerate(paragraphs):
        paragraph_text = paragraph.text.strip()
        if not paragraph_text:
            continue

        # Match this paragraph against every known heading pattern.
        matched_heading = None
        for category, patterns in HEADING_PATTERNS.items():
            for pattern in patterns:
                if re.search(pattern, paragraph_text, re.IGNORECASE):
                    matched_heading = (category, pattern)
                    break
            if matched_heading:
                break

        if matched_heading:
            print(f"   πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")

            # Red placeholder inside the heading line itself.
            if has_red_text_in_paragraph(paragraph):
                print(f"      πŸ”΄ Found red text in heading itself")
                heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
                replacements_made += heading_replacements

            # Scan up to five following paragraphs for related red text,
            # stopping as soon as another heading starts a new section.
            for next_para_offset in range(1, 6):
                next_para_idx = para_idx + next_para_offset
                if next_para_idx >= len(paragraphs):
                    break

                next_paragraph = paragraphs[next_para_idx]
                next_text = next_paragraph.text.strip()

                if not next_text:
                    continue

                is_another_heading = False
                for category, patterns in HEADING_PATTERNS.items():
                    for pattern in patterns:
                        if re.search(pattern, next_text, re.IGNORECASE):
                            is_another_heading = True
                            break
                    if is_another_heading:
                        break

                if is_another_heading:
                    break

                if has_red_text_in_paragraph(next_paragraph):
                    print(f"      πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
                    # Pass the heading text as context so context-aware
                    # lookups (auditor/operator fields) can fire.
                    context_replacements = process_red_text_in_paragraph(
                        next_paragraph,
                        paragraph_text,
                        flat_json
                    )
                    replacements_made += context_replacements

    return replacements_made
1180
 
1181
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
    """Replace the red placeholder runs of a single paragraph using the
    flattened JSON, trying progressively broader lookup strategies.

    Lookup order:
      1. the combined red text itself;
      2. context-specific fallback fields when the surrounding text names
         the NHVAS approved auditor or the operator declaration;
      3. combined context+red-text queries.

    Args:
        paragraph: python-docx Paragraph whose runs are edited in place.
        context_text: surrounding text (e.g. the section heading) used for
            context-aware fallbacks.
        flat_json: flattened {key: value} mapping used for lookups.

    Returns:
        int: 1 if a replacement was made, else 0.

    NOTE(review): reconstructed from a corrupted diff rendering; the
    replacement-application hunk was elided and is restored to mirror the
    sibling handler (first red run receives the value, remaining red runs
    are blanked) — confirm against the repository original.
    """
    replacements_made = 0

    # Collect the visible red fragments of this paragraph.
    red_text_segments = []
    for run in paragraph.runs:
        if is_red(run) and run.text.strip():
            red_text_segments.append(run.text.strip())

    if not red_text_segments:
        return 0

    combined_red_text = " ".join(red_text_segments).strip()
    print(f"      πŸ” Red text found: '{combined_red_text}'")
    kv = find_matching_json_key_and_value(combined_red_text, flat_json)
    json_value = kv[1] if kv else None

    # Context-aware fallbacks when a direct lookup fails.
    if json_value is None:
        if "NHVAS APPROVED AUDITOR" in context_text.upper():
            auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
            for field in auditor_fields:
                kv = find_matching_json_key_and_value(field, flat_json)
                if kv:
                    print(f"      βœ… Found auditor match with field: '{kv[0]}'")
                    json_value = kv[1]
                    break

        elif "OPERATOR DECLARATION" in context_text.upper():
            operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
            for field in operator_fields:
                kv = find_matching_json_key_and_value(field, flat_json)
                if kv:
                    print(f"      βœ… Found operator match with field: '{kv[0]}'")
                    json_value = kv[1]
                    break

    # Last resort: combine the context with the red text in several ways.
    if json_value is None:
        context_queries = [f"{context_text} {combined_red_text}", combined_red_text, context_text]
        for query in context_queries:
            kv = find_matching_json_key_and_value(query, flat_json)
            if kv:
                print(f"      βœ… Found match with combined query -> {kv[0]}")
                json_value = kv[1]
                break

    if json_value is not None:
        replacement_text = get_value_as_string(json_value)
        # Put the whole replacement into the first red run and blank the
        # rest, so the paragraph keeps its formatting.
        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
        if red_runs:
            red_runs[0].text = replacement_text
            for run in red_runs[1:]:
                run.text = ''
            print(f"      βœ… Replaced with: '{replacement_text}'")
            replacements_made += 1

    return replacements_made
1235
 
1236
  # ============================================================================
1237
+ # Orchestrator
1238
  # ============================================================================
 
1239
  def process_hf(json_file, docx_file, output_file):
 
1240
  try:
 
1241
  if hasattr(json_file, "read"):
1242
  json_data = json.load(json_file)
1243
  else:
 
1251
  print(f" - {key}: {value}")
1252
  print(f" ... and {len(flat_json) - 10} more keys\n")
1253
 
 
1254
  if hasattr(docx_file, "read"):
1255
  doc = Document(docx_file)
1256
  else:
1257
  doc = Document(docx_file)
1258
 
 
1259
  print("πŸš€ Starting comprehensive document processing...")
 
1260
  table_replacements = process_tables(doc, flat_json)
1261
  paragraph_replacements = process_paragraphs(doc, flat_json)
1262
  heading_replacements = process_headings(doc, flat_json)
 
1263
  total_replacements = table_replacements + paragraph_replacements + heading_replacements
1264
 
1265
  # Save unmatched headers for iterative improvement
 
1272
  except Exception as e:
1273
  print(f"⚠️ Could not save unmatched headers: {e}")
1274
 
 
1275
  if hasattr(output_file, "write"):
1276
  doc.save(output_file)
1277
  else:
 
1278
  doc.save(output_file)
1279
 
1280
  print(f"\nβœ… Document saved as: {output_file}")
 
1291
  import traceback
1292
  traceback.print_exc()
1293
 
 
 
 
 
1294
  if __name__ == "__main__":
1295
  import sys
1296
  if len(sys.argv) != 4: