Shami96 committed on
Commit
8bbc7e5
·
verified Β·
1 Parent(s): a4dde4c

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +289 -268
updated_word.py CHANGED
@@ -8,6 +8,8 @@ Merged improvements:
8
  - safer force replacement (avoid short->long mapping)
9
  - prefer exact qualified keys for Print Name / Position Title lookups
10
  - preserved all other logic and prints/logging
 
 
11
  """
12
 
13
  import json
@@ -15,8 +17,12 @@ from docx import Document
15
  from docx.shared import RGBColor
16
  import re
17
  from typing import Any
 
 
 
 
 
18
 
19
- # Heading patterns for document structure detection
20
  HEADING_PATTERNS = {
21
  "main": [
22
  r"NHVAS\s+Audit\s+Summary\s+Report",
@@ -40,6 +46,16 @@ HEADING_PATTERNS = {
40
  ]
41
  }
42
 
 
 
 
 
 
 
 
 
 
 
43
  # ============================================================================
44
  # UTILITY FUNCTIONS
45
  # ============================================================================
@@ -61,11 +77,9 @@ def flatten_json(y, prefix=''):
61
 
62
  def is_red(run):
63
  color = run.font.color
64
- # safe checks, handle theme_color fallback as before
65
  try:
66
- return color and (getattr(color, "rgb", None) and color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
67
  except Exception:
68
- # best-effort: If object doesn't match expected shape, return False
69
  return False
70
 
71
  def get_value_as_string(value, field_name=""):
@@ -102,6 +116,27 @@ def has_red_text_in_paragraph(paragraph):
102
  return True
103
  return False
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  # ============================================================================
106
  # JSON MATCHING FUNCTIONS
107
  # ============================================================================
@@ -117,13 +152,13 @@ def find_matching_json_value(field_name, flat_json):
117
  print(f" βœ… Direct match found for key '{field_name}'")
118
  return flat_json[field_name]
119
 
120
- # Try case-insensitive exact match
121
  for key, value in flat_json.items():
122
  if key.lower() == field_name.lower():
123
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
124
  return value
125
 
126
- # Better Print Name detection for operator vs auditor (prefer fully-qualified keys)
127
  if field_name.lower().strip() == "print name":
128
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
129
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
@@ -135,13 +170,13 @@ def find_matching_json_value(field_name, flat_json):
135
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
136
  return flat_json[auditor_keys[0]]
137
 
138
- # Try suffix matching (for nested keys like "section.field")
139
  for key, value in flat_json.items():
140
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
141
  print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
142
  return value
143
 
144
- # Clean and exact match attempt
145
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
146
  clean_field = re.sub(r'\s+', ' ', clean_field)
147
  for key, value in flat_json.items():
@@ -151,7 +186,7 @@ def find_matching_json_value(field_name, flat_json):
151
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
152
  return value
153
 
154
- # Enhanced fuzzy matching with better scoring
155
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
156
  if not field_words:
157
  return None
@@ -165,7 +200,6 @@ def find_matching_json_value(field_name, flat_json):
165
  if not key_words:
166
  continue
167
 
168
- # Calculate similarity score: Jaccard + coverage
169
  common_words = field_words.intersection(key_words)
170
  if common_words:
171
  similarity = len(common_words) / len(field_words.union(key_words))
@@ -189,20 +223,16 @@ def find_matching_json_value(field_name, flat_json):
189
  # ============================================================================
190
 
191
  def extract_red_text_segments(cell):
192
- """Extract red text segments from a cell"""
193
  red_segments = []
194
-
195
  for para_idx, paragraph in enumerate(cell.paragraphs):
196
  current_segment = ""
197
  segment_runs = []
198
-
199
  for run_idx, run in enumerate(paragraph.runs):
200
  if is_red(run):
201
  if run.text:
202
  current_segment += run.text
203
  segment_runs.append((para_idx, run_idx, run))
204
  else:
205
- # End of current red segment
206
  if segment_runs:
207
  red_segments.append({
208
  'text': current_segment,
@@ -211,19 +241,15 @@ def extract_red_text_segments(cell):
211
  })
212
  current_segment = ""
213
  segment_runs = []
214
-
215
- # Handle segment at end of paragraph
216
  if segment_runs:
217
  red_segments.append({
218
  'text': current_segment,
219
  'runs': segment_runs.copy(),
220
  'paragraph_idx': para_idx
221
  })
222
-
223
  return red_segments
224
 
225
  def replace_all_red_segments(red_segments, replacement_text):
226
- """Replace all red segments with replacement text"""
227
  if not red_segments:
228
  return 0
229
 
@@ -241,7 +267,6 @@ def replace_all_red_segments(red_segments, replacement_text):
241
  first_run.text = replacement_lines[0]
242
  first_run.font.color.rgb = RGBColor(0, 0, 0)
243
  replacements_made = 1
244
-
245
  for _, _, run in first_segment['runs'][1:]:
246
  run.text = ''
247
 
@@ -253,14 +278,12 @@ def replace_all_red_segments(red_segments, replacement_text):
253
  try:
254
  first_run = red_segments[0]['runs'][0][2]
255
  paragraph = first_run.element.getparent()
256
- # Add line breaks + new runs (best-effort)
257
  from docx.oxml import OxmlElement
258
  parent = first_run.element.getparent()
259
  for line in replacement_lines[1:]:
260
  if line.strip():
261
  br = OxmlElement('w:br')
262
  first_run.element.append(br)
263
- # create a new run in the same paragraph node (docx high-level API)
264
  new_run = paragraph.add_run(line.strip())
265
  new_run.font.color.rgb = RGBColor(0, 0, 0)
266
  except Exception:
@@ -272,26 +295,19 @@ def replace_all_red_segments(red_segments, replacement_text):
272
  return replacements_made
273
 
274
def replace_single_segment(segment, replacement_text):
    """Write *replacement_text* into one red-text segment.

    The segment's first run receives the entire replacement and is
    recolored black (RGB 0,0,0); every remaining run in the segment is
    blanked so no stale red text survives.

    Returns True when a replacement was performed, False when the
    segment holds no runs.
    """
    runs = segment['runs']
    if not runs:
        return False

    # Third element of each tuple is the docx run object itself.
    target = runs[0][2]
    target.text = replacement_text
    target.font.color.rgb = RGBColor(0, 0, 0)

    # Empty out the remaining runs of the segment.
    for entry in runs[1:]:
        entry[2].text = ''

    return True
287
 
288
def replace_red_text_in_cell(cell, replacement_text):
    """Replace every red text segment found in *cell* with *replacement_text*.

    Delegates detection to extract_red_text_segments() and the actual
    rewrite to replace_all_red_segments(). Returns the number of
    replacements made; 0 when the cell contains no red text.
    """
    segments = extract_red_text_segments(cell)
    return replace_all_red_segments(segments, replacement_text) if segments else 0
296
 
297
  # ============================================================================
@@ -299,7 +315,6 @@ def replace_red_text_in_cell(cell, replacement_text):
299
  # ============================================================================
300
 
301
  def handle_australian_company_number(row, company_numbers):
302
- """Handle Australian Company Number digit placement"""
303
  replacements_made = 0
304
  for i, digit in enumerate(company_numbers):
305
  cell_idx = i + 1
@@ -312,48 +327,106 @@ def handle_australian_company_number(row, company_numbers):
312
  return replacements_made
313
 
314
  def handle_vehicle_registration_table(table, flat_json):
315
- """Handle vehicle registration table data replacement"""
316
  replacements_made = 0
317
 
318
- # Try to find vehicle registration data
319
- vehicle_section = None
 
 
 
320
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  for key, value in flat_json.items():
322
- if "vehicle registration numbers of records examined" in key.lower():
323
- if isinstance(value, dict):
324
- vehicle_section = value
325
- print(f" βœ… Found vehicle data in key: '{key}'")
 
 
 
 
 
 
 
 
 
326
  break
327
 
328
- if not vehicle_section:
 
 
 
 
 
 
 
 
 
329
  potential_columns = {}
330
  for key, value in flat_json.items():
331
- if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "suspension"]):
 
332
  if "." in key:
333
  column_name = key.split(".")[-1]
334
  else:
335
  column_name = key
336
  potential_columns[column_name] = value
337
-
338
  if potential_columns:
339
  vehicle_section = potential_columns
340
  print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
341
- else:
342
- print(f" ❌ Vehicle registration data not found in JSON")
343
- return 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
346
 
347
- # Find header row
348
  header_row_idx = -1
349
  header_row = None
350
-
351
  for row_idx, row in enumerate(table.rows):
352
- row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
353
  if "registration" in row_text and "number" in row_text:
354
  header_row_idx = row_idx
355
  header_row = row
356
  break
 
 
 
 
 
 
 
 
357
 
358
  if header_row_idx == -1:
359
  print(f" ❌ Could not find header row in vehicle table")
@@ -361,56 +434,76 @@ def handle_vehicle_registration_table(table, flat_json):
361
 
362
  print(f" βœ… Found header row at index {header_row_idx}")
363
 
364
- # Enhanced column mapping (same method as before)
365
  column_mapping = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  for col_idx, cell in enumerate(header_row.cells):
367
  header_text = get_clean_text(cell).strip()
368
- if not header_text or header_text.lower() == "no.":
 
 
 
369
  continue
370
 
 
371
  best_match = None
372
- best_score = 0
373
-
374
- normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
375
-
376
- for json_key in vehicle_section.keys():
377
- normalized_json = json_key.lower().strip()
378
-
379
- if normalized_header == normalized_json:
380
- best_match = json_key
381
- best_score = 1.0
382
- break
383
-
384
- header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
385
- json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
386
-
387
- if header_words and json_words:
388
- common_words = header_words.intersection(json_words)
389
- score = len(common_words) / max(len(header_words), len(json_words))
390
 
391
- if score > best_score and score >= 0.3:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  best_score = score
393
- best_match = json_key
394
 
395
- header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
396
- json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
397
-
398
- if header_clean in json_clean or json_clean in header_clean:
399
- if len(header_clean) > 5 and len(json_clean) > 5:
400
- substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
401
- if substring_score > best_score and substring_score >= 0.6:
402
- best_score = substring_score
403
- best_match = json_key
404
-
405
- if best_match:
406
  column_mapping[col_idx] = best_match
407
- print(f" πŸ“Œ Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
 
 
 
408
 
409
  if not column_mapping:
410
  print(f" ❌ No column mappings found")
411
  return 0
412
 
413
- # Determine data rows needed
414
  max_data_rows = 0
415
  for json_key, data in vehicle_section.items():
416
  if isinstance(data, list):
@@ -418,14 +511,13 @@ def handle_vehicle_registration_table(table, flat_json):
418
 
419
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
420
 
421
- # Process data rows
422
  for data_row_index in range(max_data_rows):
423
  table_row_idx = header_row_idx + 1 + data_row_index
424
 
425
  if table_row_idx >= len(table.rows):
426
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
427
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
428
-
429
  new_row = table.add_row()
430
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
431
 
@@ -458,33 +550,26 @@ def handle_attendance_list_table_enhanced(table, flat_json):
458
  """Enhanced Attendance List processing with better detection"""
459
  replacements_made = 0
460
 
461
- # Check multiple patterns for attendance list
462
  attendance_patterns = [
463
  "attendance list",
464
  "names and position titles",
465
  "attendees"
466
  ]
467
 
468
- # Scan all cells in the first few rows for attendance list indicators
469
  found_attendance_row = None
470
-
471
- for row_idx, row in enumerate(table.rows[:3]): # Check first 3 rows
472
  for cell_idx, cell in enumerate(row.cells):
473
  cell_text = get_clean_text(cell).lower()
474
-
475
- # Check if this cell contains attendance list header
476
  if any(pattern in cell_text for pattern in attendance_patterns):
477
  found_attendance_row = row_idx
478
  print(f" 🎯 ENHANCED: Found Attendance List in row {row_idx + 1}, cell {cell_idx + 1}")
479
  break
480
-
481
  if found_attendance_row is not None:
482
  break
483
 
484
  if found_attendance_row is None:
485
  return 0
486
 
487
- # Look for attendance data in JSON
488
  attendance_value = None
489
  attendance_search_keys = [
490
  "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
@@ -506,9 +591,7 @@ def handle_attendance_list_table_enhanced(table, flat_json):
506
  print(f" ❌ No attendance data found in JSON")
507
  return 0
508
 
509
- # Look for red text in ALL cells of the table
510
  target_cell = None
511
-
512
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
513
 
514
  for row_idx, row in enumerate(table.rows):
@@ -516,35 +599,29 @@ def handle_attendance_list_table_enhanced(table, flat_json):
516
  if has_red_text(cell):
517
  print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
518
 
519
- # Get the red text to see if it looks like attendance data
520
  red_text = ""
521
  for paragraph in cell.paragraphs:
522
  for run in paragraph.runs:
523
  if is_red(run):
524
  red_text += run.text
525
 
526
- print(f" πŸ“‹ Red text content: '{red_text[:50]}...'")
527
 
528
- # Check if this red text looks like attendance data (contains names/manager/etc)
529
  red_text_lower = red_text.lower()
530
- if any(indicator in red_text_lower for indicator in ['manager', 'herbig', 'palin', '–', '-']):
531
  target_cell = cell
532
  print(f" βœ… This looks like attendance data - using this cell")
533
  break
534
-
535
  if target_cell is not None:
536
  break
537
 
538
- # If no red text found that looks like attendance data, return
539
  if target_cell is None:
540
  print(f" ⚠️ No red text found that looks like attendance data")
541
  return 0
542
 
543
- # Replace red text with properly formatted attendance list
544
  if has_red_text(target_cell):
545
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
546
 
547
- # Ensure attendance_value is a list
548
  if isinstance(attendance_value, list):
549
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
550
  else:
@@ -554,7 +631,6 @@ def handle_attendance_list_table_enhanced(table, flat_json):
554
  for i, item in enumerate(attendance_list):
555
  print(f" {i+1}. {item}")
556
 
557
- # Replace with line-separated attendance list
558
  replacement_text = "\n".join(attendance_list)
559
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
560
  replacements_made += cell_replacements
@@ -565,71 +641,108 @@ def handle_attendance_list_table_enhanced(table, flat_json):
565
  return replacements_made
566
 
567
  def fix_management_summary_details_column(table, flat_json):
568
- """Fix the DETAILS column in Management Summary table"""
569
  replacements_made = 0
570
 
571
  print(f" 🎯 FIX: Management Summary DETAILS column processing")
572
 
573
- # Check if this is a Management Summary table
574
  table_text = ""
575
- for row in table.rows[:2]:
576
  for cell in row.cells:
577
  table_text += get_clean_text(cell).lower() + " "
578
 
579
- if not ("mass management" in table_text and "details" in table_text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  return 0
581
 
582
- print(f" βœ… Confirmed Mass Management Summary table")
583
-
584
- # Process each row looking for Std 5. and Std 6. with red text
585
- for row_idx, row in enumerate(table.rows):
586
- if len(row.cells) >= 2:
587
- standard_cell = row.cells[0]
588
- details_cell = row.cells[1]
589
-
590
- standard_text = get_clean_text(standard_cell).strip()
591
-
592
- # Look for Std 5. Verification and Std 6. Internal Review specifically
593
- if "Std 5." in standard_text and "Verification" in standard_text:
594
- if has_red_text(details_cell):
595
- print(f" πŸ” Found Std 5. Verification with red text")
596
-
597
- json_value = find_matching_json_value("Std 5. Verification", flat_json)
598
- if json_value is not None:
599
- replacement_text = get_value_as_string(json_value, "Std 5. Verification")
600
- cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
601
- replacements_made += cell_replacements
602
- print(f" βœ… Replaced Std 5. Verification details")
603
-
604
- elif "Std 6." in standard_text and "Internal Review" in standard_text:
605
- if has_red_text(details_cell):
606
- print(f" πŸ” Found Std 6. Internal Review with red text")
607
 
608
- json_value = find_matching_json_value("Std 6. Internal Review", flat_json)
609
- if json_value is not None:
610
- replacement_text = get_value_as_string(json_value, "Std 6. Internal Review")
611
- cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
612
- replacements_made += cell_replacements
613
- print(f" βœ… Replaced Std 6. Internal Review details")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
  return replacements_made
616
 
617
- # ========================================================================
618
- # IMPORTANT: Single canonical definition for Operator Declaration fixer
619
- # ========================================================================
620
-
621
  def fix_operator_declaration_empty_values(table, flat_json):
622
- """Fix Operator Declaration table when values are empty or need updating.
623
- - Prefer exact qualified keys.
624
- - If JSON has combined 'Name - Position', split it safely.
625
- - Only write into cells that are empty or contain red text.
626
- - Mark table as processed on success.
627
- """
628
  replacements_made = 0
629
 
630
  print(f" 🎯 FIX: Operator Declaration empty values processing")
631
 
632
- # Check if this is an Operator Declaration table
633
  table_context = ""
634
  for row in table.rows:
635
  for cell in row.cells:
@@ -641,17 +754,13 @@ def fix_operator_declaration_empty_values(table, flat_json):
641
  print(f" βœ… Confirmed Operator Declaration table")
642
 
643
  def parse_name_and_position(value):
644
- """Try to split combined name/position values into (name, position)."""
645
  if value is None:
646
  return None, None
647
-
648
- # If it's a list: common pattern is [name, position]
649
  if isinstance(value, list):
650
  if len(value) == 0:
651
  return None, None
652
  if len(value) == 1:
653
  return str(value[0]).strip(), None
654
- # use first two sensible entries
655
  first = str(value[0]).strip()
656
  second = str(value[1]).strip()
657
  if first and second:
@@ -662,7 +771,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
662
  if not s:
663
  return None, None
664
 
665
- # Common separators: hyphen, en-dash, em-dash, comma, pipe
666
  parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
667
  if len(parts) >= 2:
668
  left = parts[0].strip()
@@ -675,7 +783,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
675
  return right, left
676
  return left, right
677
 
678
- # If no separator, check trailing role token
679
  tokens = s.split()
680
  if len(tokens) >= 2:
681
  last = tokens[-1]
@@ -684,10 +791,8 @@ def fix_operator_declaration_empty_values(table, flat_json):
684
  if any(ind == last.lower() for ind in role_indicators):
685
  return " ".join(tokens[:-1]), last
686
 
687
- # fallback: treat entire string as name
688
  return s, None
689
 
690
- # Locate header row + data row
691
  for row_idx, row in enumerate(table.rows):
692
  if len(row.cells) >= 2:
693
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
@@ -706,7 +811,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
706
  position_text = get_clean_text(position_cell).strip()
707
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
708
 
709
- # Prefer exact qualified keys first
710
  name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
711
  if name_value is None:
712
  name_value = find_matching_json_value("Print Name", flat_json)
@@ -715,11 +819,9 @@ def fix_operator_declaration_empty_values(table, flat_json):
715
  if position_value is None:
716
  position_value = find_matching_json_value("Position Title", flat_json)
717
 
718
- # parse combined cases
719
  parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
720
  parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
721
 
722
- # decide final candidates
723
  final_name = None
724
  final_pos = None
725
 
@@ -728,7 +830,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
728
  elif name_value is not None:
729
  final_name = get_value_as_string(name_value)
730
 
731
- # position preference: parsed_pos_from_posval > explicit position_value > parsed_pos_from_nameval
732
  if parsed_pos_from_posval:
733
  final_pos = parsed_pos_from_posval
734
  elif position_value is not None:
@@ -736,7 +837,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
736
  elif parsed_pos_from_nameval:
737
  final_pos = parsed_pos_from_nameval
738
 
739
- # normalize
740
  if isinstance(final_name, list):
741
  final_name = " ".join(str(x) for x in final_name).strip()
742
  if isinstance(final_pos, list):
@@ -755,7 +855,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
755
  return False
756
  return len(name_str) > 1
757
 
758
- # Write name if empty or red
759
  if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
760
  if has_red_text(name_cell):
761
  replace_red_text_in_cell(name_cell, final_name)
@@ -764,7 +863,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
764
  replacements_made += 1
765
  print(f" βœ… Updated Print Name -> '{final_name}'")
766
 
767
- # Write position if empty or red
768
  if (not position_text or has_red_text(position_cell)) and final_pos:
769
  if has_red_text(position_cell):
770
  replace_red_text_in_cell(position_cell, final_pos)
@@ -775,7 +873,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
775
 
776
  break
777
 
778
- # mark processed
779
  if replacements_made > 0:
780
  try:
781
  setattr(table, "_processed_operator_declaration", True)
@@ -786,14 +883,10 @@ def fix_operator_declaration_empty_values(table, flat_json):
786
  return replacements_made
787
 
788
  def handle_multiple_red_segments_in_cell(cell, flat_json):
789
- """Handle multiple red text segments within a single cell"""
790
  replacements_made = 0
791
-
792
  red_segments = extract_red_text_segments(cell)
793
  if not red_segments:
794
  return 0
795
-
796
- # Try to match each segment individually
797
  for i, segment in enumerate(red_segments):
798
  segment_text = segment['text'].strip()
799
  if segment_text:
@@ -803,63 +896,45 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
803
  if replace_single_segment(segment, replacement_text):
804
  replacements_made += 1
805
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
806
-
807
  return replacements_made
808
 
809
def handle_nature_business_multiline_fix(cell, flat_json):
    """Fix multiline red text that looks like a "Nature of Business" value.

    Collects the cell's red text, checks it against a list of business
    keywords, and if it matches, replaces it with the JSON's
    "Nature of Business" value. Returns the number of replacements made.
    """
    # Gather all red run text from the cell.
    pieces = []
    for para in cell.paragraphs:
        for run in para.runs:
            if is_red(run):
                pieces.append(run.text)

    red_text = "".join(pieces).strip()
    if not red_text:
        return 0

    # Only act when the red text plausibly describes a transport business.
    indicators = ("transport", "logistics", "freight", "delivery", "trucking", "haulage")
    lowered = red_text.lower()
    if not any(word in lowered for word in indicators):
        return 0

    nature_value = find_matching_json_value("Nature of Business", flat_json)
    if nature_value is None:
        return 0

    replacement = get_value_as_string(nature_value, "Nature of Business")
    made = replace_red_text_in_cell(cell, replacement)
    print(f" βœ… Fixed Nature of Business multiline content")
    return made
836
 
837
  def handle_management_summary_fix(cell, flat_json):
838
- """Handle Management Summary content fixes"""
839
  replacements_made = 0
840
-
841
- # Extract red text
842
  red_text = ""
843
  for paragraph in cell.paragraphs:
844
  for run in paragraph.runs:
845
  if is_red(run):
846
  red_text += run.text
847
-
848
  red_text = red_text.strip()
849
  if not red_text:
850
  return 0
851
-
852
- # Look for management summary data in new schema format
853
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
854
-
855
  for mgmt_type in management_types:
856
  if mgmt_type in flat_json:
857
  mgmt_data = flat_json[mgmt_type]
858
  if isinstance(mgmt_data, dict):
859
- # Try to match red text with any standard in this management type
860
  for std_key, std_value in mgmt_data.items():
861
  if isinstance(std_value, list) and std_value:
862
- # Check if red text matches this standard
863
  if len(red_text) > 10:
864
  for item in std_value:
865
  if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
@@ -868,44 +943,32 @@ def handle_management_summary_fix(cell, flat_json):
868
  replacements_made += cell_replacements
869
  print(f" βœ… Fixed {mgmt_type} - {std_key}")
870
  return replacements_made
871
-
872
  return replacements_made
873
 
874
- # ========================================================================
875
  # SMALL OPERATOR/AUDITOR TABLE HANDLER (skip if already processed)
876
- # ========================================================================
877
 
878
  def handle_operator_declaration_fix(table, flat_json):
879
- """Wrapper for small declaration tables. Delegate to canonical fix first.
880
- If canonical did not change anything, fall back to the small-table auditor handling.
881
- Safeguards: do not replace with date-like values; prefer person/role candidates.
882
- """
883
  replacements_made = 0
884
 
885
- # skip if already processed
886
  if getattr(table, "_processed_operator_declaration", False):
887
  print(f" ⏭️ Skipping - Operator Declaration table already processed")
888
  return 0
889
 
890
- # only intended for small tables; if large, skip
891
  if len(table.rows) > 4:
892
  return 0
893
 
894
- # First: try canonical operator declaration handler (covers primary case)
895
  replaced = fix_operator_declaration_empty_values(table, flat_json)
896
  replacements_made += replaced
897
  if replaced:
898
- # canonical handled it and set the processed flag
899
  return replacements_made
900
 
901
- # --- Helper validators (local, minimal, safe) ---
902
  def is_date_like(s: str) -> bool:
903
  if not s:
904
  return False
905
  s = s.strip()
906
- # common tokens that indicate a date string
907
  month_names = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)"
908
- # patterns: "2nd November 2023", "02/11/2023", "2023-11-02", "November 2023", "Date"
909
  if re.search(r"\bDate\b", s, re.IGNORECASE):
910
  return True
911
  if re.search(r"\b\d{1,2}(?:st|nd|rd|th)?\b\s+" + month_names, s, re.IGNORECASE):
@@ -916,7 +979,6 @@ def handle_operator_declaration_fix(table, flat_json):
916
  return True
917
  if re.search(r"\b\d{4}[\/\.\-]\d{1,2}[\/\.\-]\d{1,2}\b", s):
918
  return True
919
- # single 4-digit year alone
920
  if re.fullmatch(r"\d{4}", s):
921
  return True
922
  return False
@@ -925,11 +987,9 @@ def handle_operator_declaration_fix(table, flat_json):
925
  if not s:
926
  return False
927
  low = s.lower().strip()
928
- # reject org/company terms
929
  bad_terms = ["pty ltd", "p/l", "plc", "company", "farming", "farm", "trust", "ltd"]
930
  if any(bt in low for bt in bad_terms):
931
  return False
932
- # minimal length check and presence of alphabetic characters
933
  if len(low) < 3:
934
  return False
935
  return bool(re.search(r"[a-zA-Z]", low))
@@ -941,16 +1001,13 @@ def handle_operator_declaration_fix(table, flat_json):
941
  roles = ["manager", "auditor", "owner", "director", "supervisor", "coordinator", "driver", "operator", "representative", "chief"]
942
  return any(r in low for r in roles)
943
 
944
- # fallback: original small-table behaviour (auditor declaration etc.)
945
  print(f" 🎯 Processing other declaration table (fallback small-table behavior)")
946
 
947
  for row_idx, row in enumerate(table.rows):
948
  for cell_idx, cell in enumerate(row.cells):
949
  if not has_red_text(cell):
950
- # do not overwrite non-red content in fallback
951
  continue
952
 
953
- # Try auditor-specific fields first
954
  declaration_fields = [
955
  "NHVAS Approved Auditor Declaration.Print Name",
956
  "Auditor name",
@@ -968,19 +1025,13 @@ def handle_operator_declaration_fix(table, flat_json):
968
  if not replacement_text:
969
  continue
970
 
971
- # SAFEGUARD: do not replace with date-like text for name/position cells
972
  if is_date_like(replacement_text):
973
- # allow genuinely date-targeted cells (if red text explicitly contains 'date')
974
- # but skip using a date string to fill 'name' or 'position' slots
975
- # check the red text in the cell to see if it expects a date
976
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
977
  if "date" not in red_text.lower():
978
  print(f" ⚠️ Skipping date-like replacement for field '{field}' -> '{replacement_text[:30]}...'")
979
  continue
980
 
981
- # Further safeguard: if replacement looks like a person or role, only then write into name/position cells
982
  if (looks_like_person_name(replacement_text) or looks_like_position(replacement_text) or "signature" in field.lower() or "date" in field.lower()):
983
- # Replace only red runs (safe)
984
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
985
  if cell_replacements > 0:
986
  replacements_made += cell_replacements
@@ -988,11 +1039,9 @@ def handle_operator_declaration_fix(table, flat_json):
988
  print(f" βœ… Fixed declaration field: {field} -> '{replacement_text}'")
989
  break
990
  else:
991
- # Not a person or role-looking text β€” skip to avoid clobbering name/position with unrelated content
992
  print(f" ⚠️ Replacement for field '{field}' does not look like name/role, skipping: '{replacement_text[:30]}...'")
993
  continue
994
 
995
- # If not replaced by the declared fields, try to infer from the cell's red text (date/signature fallback)
996
  if not replaced_this_cell:
997
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip().lower()
998
  if "signature" in red_text:
@@ -1001,12 +1050,10 @@ def handle_operator_declaration_fix(table, flat_json):
1001
  replacements_made += cell_replacements
1002
  print(f" βœ… Inserted placeholder [Signature]")
1003
  elif "date" in red_text:
1004
- # Try to find a date value in JSON for an explicit date slot else skip
1005
  date_value = find_matching_json_value("Date", flat_json) or find_matching_json_value("Date of Audit", flat_json) or find_matching_json_value("Audit was conducted on", flat_json)
1006
  if date_value is not None:
1007
  date_text = get_value_as_string(date_value)
1008
  if not is_date_like(date_text):
1009
- # defensive: if the date value is not date-like, skip
1010
  print(f" ⚠️ Found date-value but not date-like, skipping: '{date_text}'")
1011
  else:
1012
  cell_replacements = replace_red_text_in_cell(cell, date_text)
@@ -1014,7 +1061,6 @@ def handle_operator_declaration_fix(table, flat_json):
1014
  replacements_made += cell_replacements
1015
  print(f" βœ… Inserted date value: '{date_text}'")
1016
 
1017
- # if any replacements made here, mark processed
1018
  if replacements_made > 0:
1019
  try:
1020
  setattr(table, "_processed_operator_declaration", True)
@@ -1025,22 +1071,17 @@ def handle_operator_declaration_fix(table, flat_json):
1025
  return replacements_made
1026
 
1027
  def handle_print_accreditation_section(table, flat_json):
1028
- """Handle Print Accreditation section - SKIP Operator Declaration tables"""
1029
  replacements_made = 0
1030
 
1031
- # <<< PATCH: skip if operator declaration already processed
1032
  if getattr(table, "_processed_operator_declaration", False):
1033
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1034
  return 0
1035
- # <<< END PATCH
1036
 
1037
- # Get table context to check what type of table this is
1038
  table_context = ""
1039
  for row in table.rows:
1040
  for cell in row.cells:
1041
  table_context += get_clean_text(cell).lower() + " "
1042
 
1043
- # SKIP if this is an Operator Declaration table
1044
  if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
1045
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1046
  return 0
@@ -1050,11 +1091,11 @@ def handle_print_accreditation_section(table, flat_json):
1050
  for row_idx, row in enumerate(table.rows):
1051
  for cell_idx, cell in enumerate(row.cells):
1052
  if has_red_text(cell):
1053
- # Try print accreditation fields
1054
  accreditation_fields = [
1055
  "(print accreditation name)",
1056
  "Operator name (Legal entity)",
1057
- "Print accreditation name"
 
1058
  ]
1059
 
1060
  for field in accreditation_fields:
@@ -1071,7 +1112,6 @@ def handle_print_accreditation_section(table, flat_json):
1071
  return replacements_made
1072
 
1073
  def process_single_column_sections(cell, key_text, flat_json):
1074
- """Process single column sections with red text"""
1075
  replacements_made = 0
1076
 
1077
  if has_red_text(cell):
@@ -1082,10 +1122,8 @@ def process_single_column_sections(cell, key_text, flat_json):
1082
  red_text += run.text
1083
 
1084
  if red_text.strip():
1085
- # Try direct matching first
1086
  section_value = find_matching_json_value(red_text.strip(), flat_json)
1087
  if section_value is None:
1088
- # Try key-based matching
1089
  section_value = find_matching_json_value(key_text, flat_json)
1090
 
1091
  if section_value is not None:
@@ -1108,13 +1146,13 @@ def process_tables(document, flat_json):
1108
  for table_idx, table in enumerate(document.tables):
1109
  print(f"\nπŸ” Processing table {table_idx + 1}:")
1110
 
1111
- # Get table context
1112
  table_text = ""
1113
  for row in table.rows[:3]:
1114
  for cell in row.cells:
1115
  table_text += get_clean_text(cell).lower() + " "
1116
 
1117
- # Detect Management Summary tables
1118
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
1119
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
1120
  has_details = "details" in table_text
@@ -1129,10 +1167,9 @@ def process_tables(document, flat_json):
1129
  for row_idx, row in enumerate(table.rows):
1130
  for cell_idx, cell in enumerate(row.cells):
1131
  if has_red_text(cell):
1132
- # Try direct matching with the new schema names first
1133
  for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
1134
  if mgmt_type.lower().replace(" summary", "") in table_text:
1135
- # Look for this standard in the JSON
1136
  if mgmt_type in flat_json:
1137
  mgmt_data = flat_json[mgmt_type]
1138
  if isinstance(mgmt_data, dict):
@@ -1156,7 +1193,7 @@ def process_tables(document, flat_json):
1156
  continue
1157
 
1158
  # Detect Vehicle Registration tables
1159
- vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
1160
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
1161
  if indicator_count >= 2:
1162
  print(f" πŸš— Detected Vehicle Registration table")
@@ -1175,22 +1212,18 @@ def process_tables(document, flat_json):
1175
  print_accreditation_indicators = ["print name", "position title"]
1176
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
1177
 
1178
- # <<< PATCH: require both indicators (or two matches) to reduce false positives
1179
  if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
1180
  print(f" πŸ“‹ Detected Print Accreditation/Operator Declaration table")
1181
-
1182
- # First, try strong operator declaration fix (exact keys)
1183
  declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1184
  replacements_made += declaration_fixes
1185
 
1186
- # Then only run print accreditation section if not marked processed
1187
  if not getattr(table, "_processed_operator_declaration", False):
1188
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1189
  replacements_made += print_accreditation_replacements
1190
 
1191
  continue
1192
 
1193
- # Process regular table rows (same as your original logic)
1194
  for row_idx, row in enumerate(table.rows):
1195
  if len(row.cells) < 1:
1196
  continue
@@ -1208,16 +1241,13 @@ def process_tables(document, flat_json):
1208
  if json_value is not None:
1209
  replacement_text = get_value_as_string(json_value, key_text)
1210
 
1211
- # Handle Australian Company Number
1212
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1213
  cell_replacements = handle_australian_company_number(row, json_value)
1214
  replacements_made += cell_replacements
1215
 
1216
- # Handle section headers
1217
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1218
  print(f" βœ… Section header detected, checking next row...")
1219
  next_row = table.rows[row_idx + 1]
1220
-
1221
  for cell_idx, cell in enumerate(next_row.cells):
1222
  if has_red_text(cell):
1223
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
@@ -1228,13 +1258,11 @@ def process_tables(document, flat_json):
1228
  if cell_replacements > 0:
1229
  print(f" -> Replaced section content")
1230
 
1231
- # Handle single column sections
1232
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1233
  if has_red_text(key_cell):
1234
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1235
  replacements_made += cell_replacements
1236
 
1237
- # Handle regular key-value pairs
1238
  else:
1239
  for cell_idx in range(1, len(row.cells)):
1240
  value_cell = row.cells[cell_idx]
@@ -1244,7 +1272,6 @@ def process_tables(document, flat_json):
1244
  replacements_made += cell_replacements
1245
 
1246
  else:
1247
- # Fallback processing for unmatched keys
1248
  if len(row.cells) == 1 and has_red_text(key_cell):
1249
  red_text = ""
1250
  for paragraph in key_cell.paragraphs:
@@ -1258,14 +1285,12 @@ def process_tables(document, flat_json):
1258
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1259
  replacements_made += cell_replacements
1260
 
1261
- # Process red text in all cells
1262
  for cell_idx in range(len(row.cells)):
1263
  cell = row.cells[cell_idx]
1264
  if has_red_text(cell):
1265
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1266
  replacements_made += cell_replacements
1267
 
1268
- # Apply fixes if no replacements made
1269
  if cell_replacements == 0:
1270
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1271
  replacements_made += surgical_fix
@@ -1274,7 +1299,7 @@ def process_tables(document, flat_json):
1274
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1275
  replacements_made += management_summary_fix
1276
 
1277
- # Handle Operator/Auditor Declaration tables (check last few tables)
1278
  print(f"\n🎯 Final check for Declaration tables...")
1279
  for table in document.tables[-3:]:
1280
  if len(table.rows) <= 4:
@@ -1300,7 +1325,6 @@ def process_paragraphs(document, flat_json):
1300
  json_value = find_matching_json_value(red_text_only, flat_json)
1301
 
1302
  if json_value is None:
1303
- # Enhanced pattern matching for signatures and dates
1304
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1305
  json_value = find_matching_json_value("auditor signature", flat_json)
1306
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
@@ -1326,11 +1350,9 @@ def process_headings(document, flat_json):
1326
 
1327
  for para_idx, paragraph in enumerate(paragraphs):
1328
  paragraph_text = paragraph.text.strip()
1329
-
1330
  if not paragraph_text:
1331
  continue
1332
 
1333
- # Check if this is a heading
1334
  matched_heading = None
1335
  for category, patterns in HEADING_PATTERNS.items():
1336
  for pattern in patterns:
@@ -1343,13 +1365,11 @@ def process_headings(document, flat_json):
1343
  if matched_heading:
1344
  print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
1345
 
1346
- # Check current heading paragraph
1347
  if has_red_text_in_paragraph(paragraph):
1348
  print(f" πŸ”΄ Found red text in heading itself")
1349
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1350
  replacements_made += heading_replacements
1351
 
1352
- # Look ahead for related content
1353
  for next_para_offset in range(1, 6):
1354
  next_para_idx = para_idx + next_para_offset
1355
  if next_para_idx >= len(paragraphs):
@@ -1361,7 +1381,6 @@ def process_headings(document, flat_json):
1361
  if not next_text:
1362
  continue
1363
 
1364
- # Stop if we hit another heading
1365
  is_another_heading = False
1366
  for category, patterns in HEADING_PATTERNS.items():
1367
  for pattern in patterns:
@@ -1374,10 +1393,8 @@ def process_headings(document, flat_json):
1374
  if is_another_heading:
1375
  break
1376
 
1377
- # Process red text with context
1378
  if has_red_text_in_paragraph(next_paragraph):
1379
  print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
1380
-
1381
  context_replacements = process_red_text_in_paragraph(
1382
  next_paragraph,
1383
  paragraph_text,
@@ -1403,11 +1420,8 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1403
  print(f" πŸ” Red text found: '{combined_red_text}'")
1404
 
1405
  json_value = None
1406
-
1407
- # Direct matching
1408
  json_value = find_matching_json_value(combined_red_text, flat_json)
1409
 
1410
- # Context-based matching
1411
  if json_value is None:
1412
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
1413
  auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
@@ -1425,7 +1439,6 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1425
  print(f" βœ… Found operator match with field: '{field}'")
1426
  break
1427
 
1428
- # Combined context queries
1429
  if json_value is None:
1430
  context_queries = [
1431
  f"{context_text} {combined_red_text}",
@@ -1439,18 +1452,14 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1439
  print(f" βœ… Found match with combined query")
1440
  break
1441
 
1442
- # Replace if match found
1443
  if json_value is not None:
1444
  replacement_text = get_value_as_string(json_value, combined_red_text)
1445
-
1446
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1447
  if red_runs:
1448
  red_runs[0].text = replacement_text
1449
  red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
1450
-
1451
  for run in red_runs[1:]:
1452
  run.text = ''
1453
-
1454
  replacements_made = 1
1455
  print(f" βœ… Replaced with: '{replacement_text}'")
1456
  else:
@@ -1458,7 +1467,9 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1458
 
1459
  return replacements_made
1460
 
1461
-
 
 
1462
 
1463
  def process_hf(json_file, docx_file, output_file):
1464
  """Main processing function with comprehensive error handling"""
@@ -1490,16 +1501,23 @@ def process_hf(json_file, docx_file, output_file):
1490
  paragraph_replacements = process_paragraphs(doc, flat_json)
1491
  heading_replacements = process_headings(doc, flat_json)
1492
 
1493
- # Final force fix for any remaining red text
1494
- #force_replacements = force_red_text_replacement(doc, flat_json)
1495
 
1496
- total_replacements = table_replacements + paragraph_replacements + heading_replacements
1497
- #+ force_replacements
 
 
 
 
 
 
 
1498
 
1499
- # Save output
1500
  if hasattr(output_file, "write"):
1501
  doc.save(output_file)
1502
  else:
 
1503
  doc.save(output_file)
1504
 
1505
  print(f"\nβœ… Document saved as: {output_file}")
@@ -1507,7 +1525,6 @@ def process_hf(json_file, docx_file, output_file):
1507
  print(f" πŸ“Š Tables: {table_replacements}")
1508
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1509
  print(f" πŸ“‹ Headings: {heading_replacements}")
1510
- #print(f" 🎯 Force fixes: {force_replacements}")
1511
  print(f"πŸŽ‰ Processing complete!")
1512
 
1513
  except FileNotFoundError as e:
@@ -1517,6 +1534,10 @@ def process_hf(json_file, docx_file, output_file):
1517
  import traceback
1518
  traceback.print_exc()
1519
 
 
 
 
 
1520
  if __name__ == "__main__":
1521
  import sys
1522
  if len(sys.argv) != 4:
 
8
  - safer force replacement (avoid short->long mapping)
9
  - prefer exact qualified keys for Print Name / Position Title lookups
10
  - preserved all other logic and prints/logging
11
+ - ADDED: header normalization, context-aware vehicle JSON selection,
12
+ management summary scoping, unmatched-headers logging
13
  """
14
 
15
  import json
 
17
  from docx.shared import RGBColor
18
  import re
19
  from typing import Any
20
+ import os
21
+
22
+ # ============================================================================
23
+ # Configuration / Heading patterns for document structure detection
24
+ # ============================================================================
25
 
 
26
  HEADING_PATTERNS = {
27
  "main": [
28
  r"NHVAS\s+Audit\s+Summary\s+Report",
 
46
  ]
47
  }
48
 
49
+ # ============================================================================
50
+ # State for unmatched headers (for iterative improvement)
51
+ # ============================================================================
52
+ _unmatched_headers = {}
53
+
54
+ def record_unmatched_header(header: str):
55
+ if not header:
56
+ return
57
+ _unmatched_headers[header] = _unmatched_headers.get(header, 0) + 1
58
+
59
  # ============================================================================
60
  # UTILITY FUNCTIONS
61
  # ============================================================================
 
77
 
78
def is_red(run):
    """Heuristically decide whether *run* is rendered in red.

    A run counts as red when its colour has an explicit RGB of FF0000 or
    uses theme colour 1. Any attribute-shape surprise from the colour
    object is swallowed and reported as "not red".
    """
    color = run.font.color
    try:
        if not color:
            return color  # preserve the original falsy return (e.g. None)
        rgb_is_red = getattr(color, "rgb", None) and color.rgb == RGBColor(255, 0, 0)
        return bool(rgb_is_red or getattr(color, "theme_color", None) == 1)
    except Exception:
        return False
84
 
85
  def get_value_as_string(value, field_name=""):
 
116
  return True
117
  return False
118
 
119
+ # New helper: normalize header text (removes parentheticals, punctuation, etc.)
120
def normalize_header_text(s: str) -> str:
    """Normalize a table-header string for fuzzy column matching.

    Lower-cases the text, strips parenthetical qualifiers, slashes and
    punctuation (except ``#`` and ``%``), collapses whitespace, and applies
    a few domain-specific canonicalizations (registration / sub-contractor
    variants, boilerplate phrases such as "date range").

    Args:
        s: Raw header text from a docx cell (may be empty or None).

    Returns:
        The normalized, lower-case header string ("" for empty input).
    """
    if not s:
        return ""
    # Remove parenthetical content, e.g. "(date range)".
    s = re.sub(r'\([^)]*\)', ' ', s)
    # Treat slashes as word separators.
    s = s.replace("/", " ")
    # Strip punctuation except '#' and '%', which carry meaning in headers.
    s = re.sub(r'[^\w\s\#\%]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip().lower()
    # Domain-specific canonicalizations. NOTE: the old
    # replace('sub-contractor', ...) was unreachable here because the
    # punctuation regex above has already removed every hyphen.
    s = s.replace('registrationno', 'registration number')
    s = s.replace('registrationnumber', 'registration number')
    s = s.replace('sub contracted', 'sub contractor')
    # Boilerplate phrases that never appear in the JSON keys.
    s = s.replace('date range', '')
    s = s.replace('applicable for entry audit', '')
    # Re-collapse whitespace: the phrase removals above can leave internal
    # double spaces, which would break exact normalized-string comparisons
    # (e.g. "roadworthiness  certificates" vs "roadworthiness certificates").
    s = re.sub(r'\s+', ' ', s).strip()
    return s
139
+
140
  # ============================================================================
141
  # JSON MATCHING FUNCTIONS
142
  # ============================================================================
 
152
  print(f" βœ… Direct match found for key '{field_name}'")
153
  return flat_json[field_name]
154
 
155
+ # Case-insensitive exact match
156
  for key, value in flat_json.items():
157
  if key.lower() == field_name.lower():
158
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
159
  return value
160
 
161
+ # Better Print Name detection for operator vs auditor
162
  if field_name.lower().strip() == "print name":
163
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
164
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
 
170
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
171
  return flat_json[auditor_keys[0]]
172
 
173
+ # Suffix matching for nested keys
174
  for key, value in flat_json.items():
175
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
176
  print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
177
  return value
178
 
179
+ # Clean & exact match attempt
180
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
181
  clean_field = re.sub(r'\s+', ' ', clean_field)
182
  for key, value in flat_json.items():
 
186
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
187
  return value
188
 
189
+ # Enhanced fuzzy matching with word-token scoring
190
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
191
  if not field_words:
192
  return None
 
200
  if not key_words:
201
  continue
202
 
 
203
  common_words = field_words.intersection(key_words)
204
  if common_words:
205
  similarity = len(common_words) / len(field_words.union(key_words))
 
223
  # ============================================================================
224
 
225
  def extract_red_text_segments(cell):
 
226
  red_segments = []
 
227
  for para_idx, paragraph in enumerate(cell.paragraphs):
228
  current_segment = ""
229
  segment_runs = []
 
230
  for run_idx, run in enumerate(paragraph.runs):
231
  if is_red(run):
232
  if run.text:
233
  current_segment += run.text
234
  segment_runs.append((para_idx, run_idx, run))
235
  else:
 
236
  if segment_runs:
237
  red_segments.append({
238
  'text': current_segment,
 
241
  })
242
  current_segment = ""
243
  segment_runs = []
 
 
244
  if segment_runs:
245
  red_segments.append({
246
  'text': current_segment,
247
  'runs': segment_runs.copy(),
248
  'paragraph_idx': para_idx
249
  })
 
250
  return red_segments
251
 
252
  def replace_all_red_segments(red_segments, replacement_text):
 
253
  if not red_segments:
254
  return 0
255
 
 
267
  first_run.text = replacement_lines[0]
268
  first_run.font.color.rgb = RGBColor(0, 0, 0)
269
  replacements_made = 1
 
270
  for _, _, run in first_segment['runs'][1:]:
271
  run.text = ''
272
 
 
278
  try:
279
  first_run = red_segments[0]['runs'][0][2]
280
  paragraph = first_run.element.getparent()
 
281
  from docx.oxml import OxmlElement
282
  parent = first_run.element.getparent()
283
  for line in replacement_lines[1:]:
284
  if line.strip():
285
  br = OxmlElement('w:br')
286
  first_run.element.append(br)
 
287
  new_run = paragraph.add_run(line.strip())
288
  new_run.font.color.rgb = RGBColor(0, 0, 0)
289
  except Exception:
 
295
  return replacements_made
296
 
297
def replace_single_segment(segment, replacement_text):
    """Write *replacement_text* into one red segment, recolouring it black.

    The first run of the segment receives the full replacement text; every
    remaining run in the segment is blanked so no stale red text survives.

    Returns:
        True when a replacement was written, False for a run-less segment.
    """
    runs = segment['runs']
    if not runs:
        return False
    lead_run = runs[0][2]
    lead_run.text = replacement_text
    lead_run.font.color.rgb = RGBColor(0, 0, 0)
    for entry in runs[1:]:
        entry[2].text = ''
    return True
306
 
307
def replace_red_text_in_cell(cell, replacement_text):
    """Replace every red text segment in *cell* with *replacement_text*.

    Returns:
        The number of replacements performed (0 when the cell contains
        no red text at all).
    """
    segments = extract_red_text_segments(cell)
    if segments:
        return replace_all_red_segments(segments, replacement_text)
    return 0
312
 
313
  # ============================================================================
 
315
  # ============================================================================
316
 
317
  def handle_australian_company_number(row, company_numbers):
 
318
  replacements_made = 0
319
  for i, digit in enumerate(company_numbers):
320
  cell_idx = i + 1
 
327
  return replacements_made
328
 
329
  def handle_vehicle_registration_table(table, flat_json):
330
+ """Handle vehicle registration table data replacement (improved header normalization and context-aware selection)"""
331
  replacements_made = 0
332
 
333
+ # build a table_text context (used to find mass/maintenance/fatigue)
334
+ table_text = ""
335
+ for r in table.rows[:3]:
336
+ for c in r.cells:
337
+ table_text += get_clean_text(c).lower() + " "
338
 
339
+ # 1) Detect the most relevant vehicle-related JSON section using context tokens
340
+ vehicle_section = None
341
+ context_tokens = []
342
+ if "mass" in table_text:
343
+ context_tokens.append("mass")
344
+ if "maintenance" in table_text:
345
+ context_tokens.append("maintenance")
346
+ if "fatigue" in table_text or "driver" in table_text or "scheduler" in table_text:
347
+ context_tokens.append("fatigue")
348
+
349
+ # candidate keys that mention 'registration' or 'vehicle'
350
+ candidates = []
351
  for key, value in flat_json.items():
352
+ k = key.lower()
353
+ if "registration" in k or "vehicle registration" in k or "vehicle" in k:
354
+ candidates.append((key, value))
355
+
356
+ # prefer candidates whose key contains one of the context tokens
357
+ if candidates and context_tokens:
358
+ for token in context_tokens:
359
+ for k, v in candidates:
360
+ if token in k.lower():
361
+ vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
362
+ print(f" βœ… Found vehicle data by context token '{token}' in key '{k}'")
363
+ break
364
+ if vehicle_section:
365
  break
366
 
367
+ # fallback: choose candidate containing 'registration' explicitly
368
+ if vehicle_section is None and candidates:
369
+ for k, v in candidates:
370
+ if "registration" in k.lower():
371
+ vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
372
+ print(f" βœ… Fallback vehicle data chosen from '{k}'")
373
+ break
374
+
375
+ # fallback: collect flattened keys that look like vehicle columns
376
+ if vehicle_section is None:
377
  potential_columns = {}
378
  for key, value in flat_json.items():
379
+ lk = key.lower()
380
+ if any(col_name in lk for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "suspension", "daily checks", "fault recording", "fault repair", "roadworthiness"]):
381
  if "." in key:
382
  column_name = key.split(".")[-1]
383
  else:
384
  column_name = key
385
  potential_columns[column_name] = value
 
386
  if potential_columns:
387
  vehicle_section = potential_columns
388
  print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
389
+
390
+ if not vehicle_section:
391
+ print(f" ❌ Vehicle registration data not found in JSON")
392
+ return 0
393
+
394
+ # ensure vehicle_section is a dict mapping column_name -> list/value
395
+ if isinstance(vehicle_section, list):
396
+ # if a list of dicts, attempt to flatten into columns
397
+ if vehicle_section and isinstance(vehicle_section[0], dict):
398
+ flattened = {}
399
+ for entry in vehicle_section:
400
+ for k, v in entry.items():
401
+ flattened.setdefault(k, []).append(v)
402
+ vehicle_section = flattened
403
+
404
+ if not isinstance(vehicle_section, dict):
405
+ # convert single scalar to dict
406
+ try:
407
+ vehicle_section = dict(vehicle_section)
408
+ except Exception:
409
+ vehicle_section = {str(k): v for k, v in (vehicle_section.items() if isinstance(vehicle_section, dict) else [])}
410
 
411
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
412
 
413
+ # Find header row index by searching for a row that contains 'registration' + 'number'
414
  header_row_idx = -1
415
  header_row = None
 
416
  for row_idx, row in enumerate(table.rows):
417
+ row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
418
  if "registration" in row_text and "number" in row_text:
419
  header_row_idx = row_idx
420
  header_row = row
421
  break
422
+ if header_row_idx == -1:
423
+ # try alternative detection: a row with 'registration' or 'reg no'
424
+ for row_idx, row in enumerate(table.rows):
425
+ row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
426
+ if "registration" in row_text or "reg no" in row_text or "regno" in row_text:
427
+ header_row_idx = row_idx
428
+ header_row = row
429
+ break
430
 
431
  if header_row_idx == -1:
432
  print(f" ❌ Could not find header row in vehicle table")
 
434
 
435
  print(f" βœ… Found header row at index {header_row_idx}")
436
 
437
+ # Enhanced column mapping: normalize both header and candidate keys, token overlap scoring
438
  column_mapping = {}
439
+ # build normalized master map from vehicle_section keys
440
+ master_labels = {}
441
+ for orig_key in vehicle_section.keys():
442
+ norm = normalize_header_text(str(orig_key))
443
+ if norm:
444
+ master_labels.setdefault(norm, orig_key)
445
+
446
+ # add fallback synonyms for common labels (preserve existing)
447
+ fallback_synonyms = [
448
+ "no", "registration number", "reg no", "registration", "sub contractor", "sub-contractor",
449
+ "sub contracted", "weight verification records", "rfs suspension certification", "suspension system maintenance",
450
+ "trip records", "fault recording reporting", "daily checks", "roadworthiness certificates",
451
+ "maintenance records", "fault repair"
452
+ ]
453
+ for syn in fallback_synonyms:
454
+ norm = normalize_header_text(syn)
455
+ if norm and norm not in master_labels:
456
+ master_labels.setdefault(norm, syn)
457
+
458
+ # map header cells
459
  for col_idx, cell in enumerate(header_row.cells):
460
  header_text = get_clean_text(cell).strip()
461
+ if not header_text:
462
+ continue
463
+ # skip 'No.' column mapping attempts in many templates
464
+ if header_text.strip().lower() in {"no", "no.", "#"}:
465
  continue
466
 
467
+ norm_header = normalize_header_text(header_text)
468
  best_match = None
469
+ best_score = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
+ # exact normalized match
472
+ if norm_header in master_labels:
473
+ best_match = master_labels[norm_header]
474
+ best_score = 1.0
475
+ else:
476
+ # token overlap scoring
477
+ header_tokens = set(t for t in norm_header.split() if len(t) > 2)
478
+ for norm_key, orig_label in master_labels.items():
479
+ key_tokens = set(t for t in norm_key.split() if len(t) > 2)
480
+ if not key_tokens:
481
+ continue
482
+ common = header_tokens.intersection(key_tokens)
483
+ if common:
484
+ score = len(common) / max(1, len(header_tokens.union(key_tokens)))
485
+ else:
486
+ # substring fallback
487
+ if norm_header in norm_key or norm_key in norm_header:
488
+ score = min(len(norm_header), len(norm_key)) / max(len(norm_header), len(norm_key))
489
+ else:
490
+ score = 0.0
491
+ if score > best_score:
492
  best_score = score
493
+ best_match = orig_label
494
 
495
+ if best_match and best_score >= 0.30:
 
 
 
 
 
 
 
 
 
 
496
  column_mapping[col_idx] = best_match
497
+ print(f" πŸ“Œ Column {col_idx}: '{header_text}' -> '{best_match}' (norm: '{norm_header}', score: {best_score:.2f})")
498
+ else:
499
+ print(f" ⚠️ No mapping found for '{header_text}' (norm: '{norm_header}')")
500
+ record_unmatched_header(header_text)
501
 
502
  if not column_mapping:
503
  print(f" ❌ No column mappings found")
504
  return 0
505
 
506
+ # Determine number of rows to populate
507
  max_data_rows = 0
508
  for json_key, data in vehicle_section.items():
509
  if isinstance(data, list):
 
511
 
512
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
513
 
514
+ # Fill or add rows as needed
515
  for data_row_index in range(max_data_rows):
516
  table_row_idx = header_row_idx + 1 + data_row_index
517
 
518
  if table_row_idx >= len(table.rows):
519
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
520
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
 
521
  new_row = table.add_row()
522
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
523
 
 
550
  """Enhanced Attendance List processing with better detection"""
551
  replacements_made = 0
552
 
 
553
  attendance_patterns = [
554
  "attendance list",
555
  "names and position titles",
556
  "attendees"
557
  ]
558
 
 
559
  found_attendance_row = None
560
+ for row_idx, row in enumerate(table.rows[:3]):
 
561
  for cell_idx, cell in enumerate(row.cells):
562
  cell_text = get_clean_text(cell).lower()
 
 
563
  if any(pattern in cell_text for pattern in attendance_patterns):
564
  found_attendance_row = row_idx
565
  print(f" 🎯 ENHANCED: Found Attendance List in row {row_idx + 1}, cell {cell_idx + 1}")
566
  break
 
567
  if found_attendance_row is not None:
568
  break
569
 
570
  if found_attendance_row is None:
571
  return 0
572
 
 
573
  attendance_value = None
574
  attendance_search_keys = [
575
  "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
 
591
  print(f" ❌ No attendance data found in JSON")
592
  return 0
593
 
 
594
  target_cell = None
 
595
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
596
 
597
  for row_idx, row in enumerate(table.rows):
 
599
  if has_red_text(cell):
600
  print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
601
 
 
602
  red_text = ""
603
  for paragraph in cell.paragraphs:
604
  for run in paragraph.runs:
605
  if is_red(run):
606
  red_text += run.text
607
 
608
+ print(f" πŸ“‹ Red text content: '{red_text[:80]}...'")
609
 
 
610
  red_text_lower = red_text.lower()
611
+ if any(indicator in red_text_lower for indicator in ['manager', '–', '-']):
612
  target_cell = cell
613
  print(f" βœ… This looks like attendance data - using this cell")
614
  break
 
615
  if target_cell is not None:
616
  break
617
 
 
618
  if target_cell is None:
619
  print(f" ⚠️ No red text found that looks like attendance data")
620
  return 0
621
 
 
622
  if has_red_text(target_cell):
623
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
624
 
 
625
  if isinstance(attendance_value, list):
626
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
627
  else:
 
631
  for i, item in enumerate(attendance_list):
632
  print(f" {i+1}. {item}")
633
 
 
634
  replacement_text = "\n".join(attendance_list)
635
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
636
  replacements_made += cell_replacements
 
641
  return replacements_made
642
 
643
def fix_management_summary_details_column(table, flat_json):
    """Fix the DETAILS column in Management Summary tables (multi-management aware).

    Detects which management summary type(s) the table represents
    (Mass / Maintenance / Fatigue) from its first rows, locates the matching
    summary dict in *flat_json*, and fills the red-text DETAILS cells of the
    "Std 5. Verification" and "Std 6. Internal Review" rows.

    Args:
        table: python-docx Table object (assumed; duck-typed via .rows/.cells).
        flat_json: flattened JSON dict produced by flatten_json().

    Returns:
        int: number of red-text replacements performed.
    """
    replacements_made = 0

    print(f" 🎯 FIX: Management Summary DETAILS column processing")

    # Build a lowercase text sample from the first rows to detect the type(s).
    table_text = ""
    for row in table.rows[:3]:
        for cell in row.cells:
            table_text += get_clean_text(cell).lower() + " "

    # Identify which management types this table likely represents.
    mgmt_types = []
    if "mass management" in table_text or "mass" in table_text:
        mgmt_types.append("Mass Management Summary")
    if "maintenance management" in table_text or "maintenance" in table_text:
        mgmt_types.append("Maintenance Management Summary")
    if "fatigue management" in table_text or "fatigue" in table_text or "driver" in table_text:
        mgmt_types.append("Fatigue Management Summary")

    if not mgmt_types:
        # Fallback: fuzzy detection via presence of "Std 5" anywhere in the table.
        if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
            mgmt_types.append("Mass Management Summary")

    if not mgmt_types:
        return 0

    def _resolve_std_value(mgmt_data, exact_keys, fuzzy_terms):
        """Look up a standard's value: exact key variants first, then fuzzy key match."""
        for candidate in exact_keys:
            value = mgmt_data.get(candidate)
            if value is not None:
                return value
        for key, value in mgmt_data.items():
            if any(term in key.lower() for term in fuzzy_terms):
                return value
        return None

    # One spec per standard row, replacing the previous copy-pasted Std 5 / Std 6
    # branches: (row indicator terms, exact JSON keys, fuzzy key terms,
    #            canonical label, display name for logging).
    standards = [
        (("std 5", "verification"),
         ("Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"),
         ("std 5", "verification"),
         "Std 5. Verification",
         "Std 5/Verification"),
        (("std 6", "internal review"),
         ("Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"),
         ("std 6", "internal review"),
         "Std 6. Internal Review",
         "Std 6/Internal Review"),
    ]

    for mgmt_type in mgmt_types:
        print(f" βœ… Confirmed {mgmt_type} table processing")
        # Find the management summary dict in flat_json for this type.
        mgmt_data = flat_json.get(mgmt_type)
        if not isinstance(mgmt_data, dict):
            # Attempt suffix-based key match (e.g. any key containing
            # "mass"/"maintenance"/"fatigue" together with "summary").
            for key in flat_json.keys():
                if mgmt_type.split()[0].lower() in key.lower() and "summary" in key.lower():
                    mgmt_data = flat_json.get(key)
                    break
        if not isinstance(mgmt_data, dict):
            print(f" ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
            continue

        # Process rows looking for Std 5. and Std 6. details cells.
        for row in table.rows:
            if len(row.cells) < 2:
                continue
            details_cell = row.cells[1]
            standard_text = get_clean_text(row.cells[0]).strip().lower()

            for row_terms, exact_keys, fuzzy_terms, label, display in standards:
                if not any(term in standard_text for term in row_terms):
                    continue
                if not has_red_text(details_cell):
                    continue
                print(f" πŸ” Found {display} with red text")
                std_val = _resolve_std_value(mgmt_data, exact_keys, fuzzy_terms)
                if std_val is not None:
                    replacement_text = get_value_as_string(std_val, label)
                    cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                    replacements_made += cell_replacements
                    if cell_replacements:
                        print(f" βœ… Replaced {label} details for {mgmt_type}")

    return replacements_made
739
 
740
+ # Canonical operator declaration fixer (keeps original robust logic)
 
 
 
741
  def fix_operator_declaration_empty_values(table, flat_json):
 
 
 
 
 
 
742
  replacements_made = 0
743
 
744
  print(f" 🎯 FIX: Operator Declaration empty values processing")
745
 
 
746
  table_context = ""
747
  for row in table.rows:
748
  for cell in row.cells:
 
754
  print(f" βœ… Confirmed Operator Declaration table")
755
 
756
  def parse_name_and_position(value):
 
757
  if value is None:
758
  return None, None
 
 
759
  if isinstance(value, list):
760
  if len(value) == 0:
761
  return None, None
762
  if len(value) == 1:
763
  return str(value[0]).strip(), None
 
764
  first = str(value[0]).strip()
765
  second = str(value[1]).strip()
766
  if first and second:
 
771
  if not s:
772
  return None, None
773
 
 
774
  parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
775
  if len(parts) >= 2:
776
  left = parts[0].strip()
 
783
  return right, left
784
  return left, right
785
 
 
786
  tokens = s.split()
787
  if len(tokens) >= 2:
788
  last = tokens[-1]
 
791
  if any(ind == last.lower() for ind in role_indicators):
792
  return " ".join(tokens[:-1]), last
793
 
 
794
  return s, None
795
 
 
796
  for row_idx, row in enumerate(table.rows):
797
  if len(row.cells) >= 2:
798
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
 
811
  position_text = get_clean_text(position_cell).strip()
812
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
813
 
 
814
  name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
815
  if name_value is None:
816
  name_value = find_matching_json_value("Print Name", flat_json)
 
819
  if position_value is None:
820
  position_value = find_matching_json_value("Position Title", flat_json)
821
 
 
822
  parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
823
  parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
824
 
 
825
  final_name = None
826
  final_pos = None
827
 
 
830
  elif name_value is not None:
831
  final_name = get_value_as_string(name_value)
832
 
 
833
  if parsed_pos_from_posval:
834
  final_pos = parsed_pos_from_posval
835
  elif position_value is not None:
 
837
  elif parsed_pos_from_nameval:
838
  final_pos = parsed_pos_from_nameval
839
 
 
840
  if isinstance(final_name, list):
841
  final_name = " ".join(str(x) for x in final_name).strip()
842
  if isinstance(final_pos, list):
 
855
  return False
856
  return len(name_str) > 1
857
 
 
858
  if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
859
  if has_red_text(name_cell):
860
  replace_red_text_in_cell(name_cell, final_name)
 
863
  replacements_made += 1
864
  print(f" βœ… Updated Print Name -> '{final_name}'")
865
 
 
866
  if (not position_text or has_red_text(position_cell)) and final_pos:
867
  if has_red_text(position_cell):
868
  replace_red_text_in_cell(position_cell, final_pos)
 
873
 
874
  break
875
 
 
876
  if replacements_made > 0:
877
  try:
878
  setattr(table, "_processed_operator_declaration", True)
 
883
  return replacements_made
884
 
885
  def handle_multiple_red_segments_in_cell(cell, flat_json):
 
886
  replacements_made = 0
 
887
  red_segments = extract_red_text_segments(cell)
888
  if not red_segments:
889
  return 0
 
 
890
  for i, segment in enumerate(red_segments):
891
  segment_text = segment['text'].strip()
892
  if segment_text:
 
896
  if replace_single_segment(segment, replacement_text):
897
  replacements_made += 1
898
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
 
899
  return replacements_made
900
 
901
def handle_nature_business_multiline_fix(cell, flat_json):
    """Repair a multi-line 'Nature of Business' cell rendered in red text.

    Gathers all red-run text in *cell*; when it mentions a business-nature
    keyword (transport, logistics, ...), the red text is replaced with the
    "Nature of Business" value from *flat_json*.

    Returns:
        int: number of replacements performed (0 when nothing matched).
    """
    fixes = 0

    # Concatenate every red-colored run's text across all paragraphs.
    pieces = []
    for para in cell.paragraphs:
        pieces.extend(run.text for run in para.runs if is_red(run))
    red_content = "".join(pieces).strip()

    if not red_content:
        return 0

    keywords = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
    lowered = red_content.lower()
    if any(word in lowered for word in keywords):
        nature_value = find_matching_json_value("Nature of Business", flat_json)
        if nature_value is not None:
            new_text = get_value_as_string(nature_value, "Nature of Business")
            fixes += replace_red_text_in_cell(cell, new_text)
            print(f" βœ… Fixed Nature of Business multiline content")

    return fixes
920
 
921
  def handle_management_summary_fix(cell, flat_json):
 
922
  replacements_made = 0
 
 
923
  red_text = ""
924
  for paragraph in cell.paragraphs:
925
  for run in paragraph.runs:
926
  if is_red(run):
927
  red_text += run.text
 
928
  red_text = red_text.strip()
929
  if not red_text:
930
  return 0
 
 
931
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
 
932
  for mgmt_type in management_types:
933
  if mgmt_type in flat_json:
934
  mgmt_data = flat_json[mgmt_type]
935
  if isinstance(mgmt_data, dict):
 
936
  for std_key, std_value in mgmt_data.items():
937
  if isinstance(std_value, list) and std_value:
 
938
  if len(red_text) > 10:
939
  for item in std_value:
940
  if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
 
943
  replacements_made += cell_replacements
944
  print(f" βœ… Fixed {mgmt_type} - {std_key}")
945
  return replacements_made
 
946
  return replacements_made
947
 
948
+ # ============================================================================
949
  # SMALL OPERATOR/AUDITOR TABLE HANDLER (skip if already processed)
950
+ # ============================================================================
951
 
952
def handle_operator_declaration_fix(table, flat_json):
    """Handle small Operator/Auditor Declaration tables (at most 4 rows).

    Skips tables already handled (flagged via the ad-hoc
    ``_processed_operator_declaration`` attribute) and tables too large to be
    a declaration block, then delegates to
    fix_operator_declaration_empty_values().

    Args:
        table: python-docx Table object (duck-typed via .rows).
        flat_json: flattened JSON dict of replacement values.

    Returns:
        int: number of replacements made. Always an int — the previous
        version guarded the return with ``if replaced:`` and fell through
        returning None on a zero-replacement run, which breaks callers that
        accumulate counts with ``+=``.
    """
    replacements_made = 0

    if getattr(table, "_processed_operator_declaration", False):
        print(f" ⏭️ Skipping - Operator Declaration table already processed")
        return 0

    # Declaration tables are small; anything bigger is some other table type.
    if len(table.rows) > 4:
        return 0

    replaced = fix_operator_declaration_empty_values(table, flat_json)
    replacements_made += replaced
    # BUGFIX: return the count unconditionally (no `if replaced:` guard).
    return replacements_made
966
 
 
967
  def is_date_like(s: str) -> bool:
968
  if not s:
969
  return False
970
  s = s.strip()
 
971
  month_names = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)"
 
972
  if re.search(r"\bDate\b", s, re.IGNORECASE):
973
  return True
974
  if re.search(r"\b\d{1,2}(?:st|nd|rd|th)?\b\s+" + month_names, s, re.IGNORECASE):
 
979
  return True
980
  if re.search(r"\b\d{4}[\/\.\-]\d{1,2}[\/\.\-]\d{1,2}\b", s):
981
  return True
 
982
  if re.fullmatch(r"\d{4}", s):
983
  return True
984
  return False
 
987
  if not s:
988
  return False
989
  low = s.lower().strip()
 
990
  bad_terms = ["pty ltd", "p/l", "plc", "company", "farming", "farm", "trust", "ltd"]
991
  if any(bt in low for bt in bad_terms):
992
  return False
 
993
  if len(low) < 3:
994
  return False
995
  return bool(re.search(r"[a-zA-Z]", low))
 
1001
  roles = ["manager", "auditor", "owner", "director", "supervisor", "coordinator", "driver", "operator", "representative", "chief"]
1002
  return any(r in low for r in roles)
1003
 
 
1004
  print(f" 🎯 Processing other declaration table (fallback small-table behavior)")
1005
 
1006
  for row_idx, row in enumerate(table.rows):
1007
  for cell_idx, cell in enumerate(row.cells):
1008
  if not has_red_text(cell):
 
1009
  continue
1010
 
 
1011
  declaration_fields = [
1012
  "NHVAS Approved Auditor Declaration.Print Name",
1013
  "Auditor name",
 
1025
  if not replacement_text:
1026
  continue
1027
 
 
1028
  if is_date_like(replacement_text):
 
 
 
1029
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
1030
  if "date" not in red_text.lower():
1031
  print(f" ⚠️ Skipping date-like replacement for field '{field}' -> '{replacement_text[:30]}...'")
1032
  continue
1033
 
 
1034
  if (looks_like_person_name(replacement_text) or looks_like_position(replacement_text) or "signature" in field.lower() or "date" in field.lower()):
 
1035
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
1036
  if cell_replacements > 0:
1037
  replacements_made += cell_replacements
 
1039
  print(f" βœ… Fixed declaration field: {field} -> '{replacement_text}'")
1040
  break
1041
  else:
 
1042
  print(f" ⚠️ Replacement for field '{field}' does not look like name/role, skipping: '{replacement_text[:30]}...'")
1043
  continue
1044
 
 
1045
  if not replaced_this_cell:
1046
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip().lower()
1047
  if "signature" in red_text:
 
1050
  replacements_made += cell_replacements
1051
  print(f" βœ… Inserted placeholder [Signature]")
1052
  elif "date" in red_text:
 
1053
  date_value = find_matching_json_value("Date", flat_json) or find_matching_json_value("Date of Audit", flat_json) or find_matching_json_value("Audit was conducted on", flat_json)
1054
  if date_value is not None:
1055
  date_text = get_value_as_string(date_value)
1056
  if not is_date_like(date_text):
 
1057
  print(f" ⚠️ Found date-value but not date-like, skipping: '{date_text}'")
1058
  else:
1059
  cell_replacements = replace_red_text_in_cell(cell, date_text)
 
1061
  replacements_made += cell_replacements
1062
  print(f" βœ… Inserted date value: '{date_text}'")
1063
 
 
1064
  if replacements_made > 0:
1065
  try:
1066
  setattr(table, "_processed_operator_declaration", True)
 
1071
  return replacements_made
1072
 
1073
  def handle_print_accreditation_section(table, flat_json):
 
1074
  replacements_made = 0
1075
 
 
1076
  if getattr(table, "_processed_operator_declaration", False):
1077
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1078
  return 0
 
1079
 
 
1080
  table_context = ""
1081
  for row in table.rows:
1082
  for cell in row.cells:
1083
  table_context += get_clean_text(cell).lower() + " "
1084
 
 
1085
  if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
1086
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1087
  return 0
 
1091
  for row_idx, row in enumerate(table.rows):
1092
  for cell_idx, cell in enumerate(row.cells):
1093
  if has_red_text(cell):
 
1094
  accreditation_fields = [
1095
  "(print accreditation name)",
1096
  "Operator name (Legal entity)",
1097
+ "Print accreditation name",
1098
+ "(print accreditation name)"
1099
  ]
1100
 
1101
  for field in accreditation_fields:
 
1112
  return replacements_made
1113
 
1114
  def process_single_column_sections(cell, key_text, flat_json):
 
1115
  replacements_made = 0
1116
 
1117
  if has_red_text(cell):
 
1122
  red_text += run.text
1123
 
1124
  if red_text.strip():
 
1125
  section_value = find_matching_json_value(red_text.strip(), flat_json)
1126
  if section_value is None:
 
1127
  section_value = find_matching_json_value(key_text, flat_json)
1128
 
1129
  if section_value is not None:
 
1146
  for table_idx, table in enumerate(document.tables):
1147
  print(f"\nπŸ” Processing table {table_idx + 1}:")
1148
 
1149
+ # collect brief context
1150
  table_text = ""
1151
  for row in table.rows[:3]:
1152
  for cell in row.cells:
1153
  table_text += get_clean_text(cell).lower() + " "
1154
 
1155
+ # detect management summary & details column
1156
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
1157
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
1158
  has_details = "details" in table_text
 
1167
  for row_idx, row in enumerate(table.rows):
1168
  for cell_idx, cell in enumerate(row.cells):
1169
  if has_red_text(cell):
1170
+ # Try direct matching with new schema names first
1171
  for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
1172
  if mgmt_type.lower().replace(" summary", "") in table_text:
 
1173
  if mgmt_type in flat_json:
1174
  mgmt_data = flat_json[mgmt_type]
1175
  if isinstance(mgmt_data, dict):
 
1193
  continue
1194
 
1195
  # Detect Vehicle Registration tables
1196
+ vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension", "registration"]
1197
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
1198
  if indicator_count >= 2:
1199
  print(f" πŸš— Detected Vehicle Registration table")
 
1212
  print_accreditation_indicators = ["print name", "position title"]
1213
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
1214
 
 
1215
  if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
1216
  print(f" πŸ“‹ Detected Print Accreditation/Operator Declaration table")
 
 
1217
  declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1218
  replacements_made += declaration_fixes
1219
 
 
1220
  if not getattr(table, "_processed_operator_declaration", False):
1221
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1222
  replacements_made += print_accreditation_replacements
1223
 
1224
  continue
1225
 
1226
+ # Process regular table rows (original logic preserved)
1227
  for row_idx, row in enumerate(table.rows):
1228
  if len(row.cells) < 1:
1229
  continue
 
1241
  if json_value is not None:
1242
  replacement_text = get_value_as_string(json_value, key_text)
1243
 
 
1244
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1245
  cell_replacements = handle_australian_company_number(row, json_value)
1246
  replacements_made += cell_replacements
1247
 
 
1248
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1249
  print(f" βœ… Section header detected, checking next row...")
1250
  next_row = table.rows[row_idx + 1]
 
1251
  for cell_idx, cell in enumerate(next_row.cells):
1252
  if has_red_text(cell):
1253
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
 
1258
  if cell_replacements > 0:
1259
  print(f" -> Replaced section content")
1260
 
 
1261
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1262
  if has_red_text(key_cell):
1263
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1264
  replacements_made += cell_replacements
1265
 
 
1266
  else:
1267
  for cell_idx in range(1, len(row.cells)):
1268
  value_cell = row.cells[cell_idx]
 
1272
  replacements_made += cell_replacements
1273
 
1274
  else:
 
1275
  if len(row.cells) == 1 and has_red_text(key_cell):
1276
  red_text = ""
1277
  for paragraph in key_cell.paragraphs:
 
1285
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1286
  replacements_made += cell_replacements
1287
 
 
1288
  for cell_idx in range(len(row.cells)):
1289
  cell = row.cells[cell_idx]
1290
  if has_red_text(cell):
1291
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1292
  replacements_made += cell_replacements
1293
 
 
1294
  if cell_replacements == 0:
1295
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1296
  replacements_made += surgical_fix
 
1299
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1300
  replacements_made += management_summary_fix
1301
 
1302
+ # Final declaration checks on last few tables
1303
  print(f"\n🎯 Final check for Declaration tables...")
1304
  for table in document.tables[-3:]:
1305
  if len(table.rows) <= 4:
 
1325
  json_value = find_matching_json_value(red_text_only, flat_json)
1326
 
1327
  if json_value is None:
 
1328
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1329
  json_value = find_matching_json_value("auditor signature", flat_json)
1330
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
 
1350
 
1351
  for para_idx, paragraph in enumerate(paragraphs):
1352
  paragraph_text = paragraph.text.strip()
 
1353
  if not paragraph_text:
1354
  continue
1355
 
 
1356
  matched_heading = None
1357
  for category, patterns in HEADING_PATTERNS.items():
1358
  for pattern in patterns:
 
1365
  if matched_heading:
1366
  print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
1367
 
 
1368
  if has_red_text_in_paragraph(paragraph):
1369
  print(f" πŸ”΄ Found red text in heading itself")
1370
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1371
  replacements_made += heading_replacements
1372
 
 
1373
  for next_para_offset in range(1, 6):
1374
  next_para_idx = para_idx + next_para_offset
1375
  if next_para_idx >= len(paragraphs):
 
1381
  if not next_text:
1382
  continue
1383
 
 
1384
  is_another_heading = False
1385
  for category, patterns in HEADING_PATTERNS.items():
1386
  for pattern in patterns:
 
1393
  if is_another_heading:
1394
  break
1395
 
 
1396
  if has_red_text_in_paragraph(next_paragraph):
1397
  print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
 
1398
  context_replacements = process_red_text_in_paragraph(
1399
  next_paragraph,
1400
  paragraph_text,
 
1420
  print(f" πŸ” Red text found: '{combined_red_text}'")
1421
 
1422
  json_value = None
 
 
1423
  json_value = find_matching_json_value(combined_red_text, flat_json)
1424
 
 
1425
  if json_value is None:
1426
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
1427
  auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
 
1439
  print(f" βœ… Found operator match with field: '{field}'")
1440
  break
1441
 
 
1442
  if json_value is None:
1443
  context_queries = [
1444
  f"{context_text} {combined_red_text}",
 
1452
  print(f" βœ… Found match with combined query")
1453
  break
1454
 
 
1455
  if json_value is not None:
1456
  replacement_text = get_value_as_string(json_value, combined_red_text)
 
1457
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1458
  if red_runs:
1459
  red_runs[0].text = replacement_text
1460
  red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
 
1461
  for run in red_runs[1:]:
1462
  run.text = ''
 
1463
  replacements_made = 1
1464
  print(f" βœ… Replaced with: '{replacement_text}'")
1465
  else:
 
1467
 
1468
  return replacements_made
1469
 
1470
+ # ============================================================================
1471
+ # Main process function
1472
+ # ============================================================================
1473
 
1474
  def process_hf(json_file, docx_file, output_file):
1475
  """Main processing function with comprehensive error handling"""
 
1501
  paragraph_replacements = process_paragraphs(doc, flat_json)
1502
  heading_replacements = process_headings(doc, flat_json)
1503
 
1504
+ total_replacements = table_replacements + paragraph_replacements + heading_replacements
 
1505
 
1506
+ # Save unmatched headers for iterative improvement
1507
+ if _unmatched_headers:
1508
+ try:
1509
+ tmp_path = "/tmp/unmatched_headers.json"
1510
+ with open(tmp_path, 'w', encoding='utf-8') as f:
1511
+ json.dump(_unmatched_headers, f, indent=2, ensure_ascii=False)
1512
+ print(f"βœ… Unmatched headers saved to {tmp_path}")
1513
+ except Exception as e:
1514
+ print(f"⚠️ Could not save unmatched headers: {e}")
1515
 
1516
+ # Save output docx
1517
  if hasattr(output_file, "write"):
1518
  doc.save(output_file)
1519
  else:
1520
+ # If output path is a file path string
1521
  doc.save(output_file)
1522
 
1523
  print(f"\nβœ… Document saved as: {output_file}")
 
1525
  print(f" πŸ“Š Tables: {table_replacements}")
1526
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1527
  print(f" πŸ“‹ Headings: {heading_replacements}")
 
1528
  print(f"πŸŽ‰ Processing complete!")
1529
 
1530
  except FileNotFoundError as e:
 
1534
  import traceback
1535
  traceback.print_exc()
1536
 
1537
+ # ============================================================================
1538
+ # CLI entrypoint
1539
+ # ============================================================================
1540
+
1541
  if __name__ == "__main__":
1542
  import sys
1543
  if len(sys.argv) != 4: