Shami96 committed on
Commit
8bbc7e5
·
verified Β·
1 Parent(s): a4dde4c

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +289 -268
updated_word.py CHANGED
@@ -8,6 +8,8 @@ Merged improvements:
8
  - safer force replacement (avoid short->long mapping)
9
  - prefer exact qualified keys for Print Name / Position Title lookups
10
  - preserved all other logic and prints/logging
 
 
11
  """
12
 
13
  import json
@@ -15,8 +17,12 @@ from docx import Document
15
  from docx.shared import RGBColor
16
  import re
17
  from typing import Any
 
 
 
 
 
18
 
19
- # Heading patterns for document structure detection
20
  HEADING_PATTERNS = {
21
  "main": [
22
  r"NHVAS\s+Audit\s+Summary\s+Report",
@@ -40,6 +46,16 @@ HEADING_PATTERNS = {
40
  ]
41
  }
42
 
 
 
 
 
 
 
 
 
 
 
43
  # ============================================================================
44
  # UTILITY FUNCTIONS
45
  # ============================================================================
@@ -61,11 +77,9 @@ def flatten_json(y, prefix=''):
61
 
62
  def is_red(run):
63
  color = run.font.color
64
- # safe checks, handle theme_color fallback as before
65
  try:
66
- return color and (getattr(color, "rgb", None) and color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
67
  except Exception:
68
- # best-effort: If object doesn't match expected shape, return False
69
  return False
70
 
71
  def get_value_as_string(value, field_name=""):
@@ -102,6 +116,27 @@ def has_red_text_in_paragraph(paragraph):
102
  return True
103
  return False
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  # ============================================================================
106
  # JSON MATCHING FUNCTIONS
107
  # ============================================================================
@@ -117,13 +152,13 @@ def find_matching_json_value(field_name, flat_json):
117
  print(f" βœ… Direct match found for key '{field_name}'")
118
  return flat_json[field_name]
119
 
120
- # Try case-insensitive exact match
121
  for key, value in flat_json.items():
122
  if key.lower() == field_name.lower():
123
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
124
  return value
125
 
126
- # Better Print Name detection for operator vs auditor (prefer fully-qualified keys)
127
  if field_name.lower().strip() == "print name":
128
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
129
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
@@ -135,13 +170,13 @@ def find_matching_json_value(field_name, flat_json):
135
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
136
  return flat_json[auditor_keys[0]]
137
 
138
- # Try suffix matching (for nested keys like "section.field")
139
  for key, value in flat_json.items():
140
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
141
  print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
142
  return value
143
 
144
- # Clean and exact match attempt
145
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
146
  clean_field = re.sub(r'\s+', ' ', clean_field)
147
  for key, value in flat_json.items():
@@ -151,7 +186,7 @@ def find_matching_json_value(field_name, flat_json):
151
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
152
  return value
153
 
154
- # Enhanced fuzzy matching with better scoring
155
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
156
  if not field_words:
157
  return None
@@ -165,7 +200,6 @@ def find_matching_json_value(field_name, flat_json):
165
  if not key_words:
166
  continue
167
 
168
- # Calculate similarity score: Jaccard + coverage
169
  common_words = field_words.intersection(key_words)
170
  if common_words:
171
  similarity = len(common_words) / len(field_words.union(key_words))
@@ -189,20 +223,16 @@ def find_matching_json_value(field_name, flat_json):
189
  # ============================================================================
190
 
191
  def extract_red_text_segments(cell):
192
- """Extract red text segments from a cell"""
193
  red_segments = []
194
-
195
  for para_idx, paragraph in enumerate(cell.paragraphs):
196
  current_segment = ""
197
  segment_runs = []
198
-
199
  for run_idx, run in enumerate(paragraph.runs):
200
  if is_red(run):
201
  if run.text:
202
  current_segment += run.text
203
  segment_runs.append((para_idx, run_idx, run))
204
  else:
205
- # End of current red segment
206
  if segment_runs:
207
  red_segments.append({
208
  'text': current_segment,
@@ -211,19 +241,15 @@ def extract_red_text_segments(cell):
211
  })
212
  current_segment = ""
213
  segment_runs = []
214
-
215
- # Handle segment at end of paragraph
216
  if segment_runs:
217
  red_segments.append({
218
  'text': current_segment,
219
  'runs': segment_runs.copy(),
220
  'paragraph_idx': para_idx
221
  })
222
-
223
  return red_segments
224
 
225
  def replace_all_red_segments(red_segments, replacement_text):
226
- """Replace all red segments with replacement text"""
227
  if not red_segments:
228
  return 0
229
 
@@ -241,7 +267,6 @@ def replace_all_red_segments(red_segments, replacement_text):
241
  first_run.text = replacement_lines[0]
242
  first_run.font.color.rgb = RGBColor(0, 0, 0)
243
  replacements_made = 1
244
-
245
  for _, _, run in first_segment['runs'][1:]:
246
  run.text = ''
247
 
@@ -253,14 +278,12 @@ def replace_all_red_segments(red_segments, replacement_text):
253
  try:
254
  first_run = red_segments[0]['runs'][0][2]
255
  paragraph = first_run.element.getparent()
256
- # Add line breaks + new runs (best-effort)
257
  from docx.oxml import OxmlElement
258
  parent = first_run.element.getparent()
259
  for line in replacement_lines[1:]:
260
  if line.strip():
261
  br = OxmlElement('w:br')
262
  first_run.element.append(br)
263
- # create a new run in the same paragraph node (docx high-level API)
264
  new_run = paragraph.add_run(line.strip())
265
  new_run.font.color.rgb = RGBColor(0, 0, 0)
266
  except Exception:
@@ -272,26 +295,19 @@ def replace_all_red_segments(red_segments, replacement_text):
272
  return replacements_made
273
 
274
def replace_single_segment(segment, replacement_text):
    """Write *replacement_text* into one red-text segment.

    The segment's first run receives the entire replacement and is
    recolored black (RGB 0,0,0); every remaining run in the segment is
    blanked so no stale red text survives.

    Returns True when a replacement was performed, False when the
    segment holds no runs.
    """
    runs = segment['runs']
    if not runs:
        return False

    # Third element of each tuple is the docx run object itself.
    target = runs[0][2]
    target.text = replacement_text
    target.font.color.rgb = RGBColor(0, 0, 0)

    # Empty out the remaining runs of the segment.
    for entry in runs[1:]:
        entry[2].text = ''

    return True
287
 
288
def replace_red_text_in_cell(cell, replacement_text):
    """Replace every red text segment found in *cell* with *replacement_text*.

    Delegates detection to extract_red_text_segments() and the actual
    rewrite to replace_all_red_segments(). Returns the number of
    replacements made; 0 when the cell contains no red text.
    """
    segments = extract_red_text_segments(cell)
    return replace_all_red_segments(segments, replacement_text) if segments else 0
296
 
297
  # ============================================================================
@@ -299,7 +315,6 @@ def replace_red_text_in_cell(cell, replacement_text):
299
  # ============================================================================
300
 
301
  def handle_australian_company_number(row, company_numbers):
302
- """Handle Australian Company Number digit placement"""
303
  replacements_made = 0
304
  for i, digit in enumerate(company_numbers):
305
  cell_idx = i + 1
@@ -312,48 +327,106 @@ def handle_australian_company_number(row, company_numbers):
312
  return replacements_made
313
 
314
  def handle_vehicle_registration_table(table, flat_json):
315
- """Handle vehicle registration table data replacement"""
316
  replacements_made = 0
317
 
318
- # Try to find vehicle registration data
319
- vehicle_section = None
 
 
 
320
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  for key, value in flat_json.items():
322
- if "vehicle registration numbers of records examined" in key.lower():
323
- if isinstance(value, dict):
324
- vehicle_section = value
325
- print(f" βœ… Found vehicle data in key: '{key}'")
 
 
 
 
 
 
 
 
 
326
  break
327
 
328
- if not vehicle_section:
 
 
 
 
 
 
 
 
 
329
  potential_columns = {}
330
  for key, value in flat_json.items():
331
- if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "suspension"]):
 
332
  if "." in key:
333
  column_name = key.split(".")[-1]
334
  else:
335
  column_name = key
336
  potential_columns[column_name] = value
337
-
338
  if potential_columns:
339
  vehicle_section = potential_columns
340
  print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
341
- else:
342
- print(f" ❌ Vehicle registration data not found in JSON")
343
- return 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
346
 
347
- # Find header row
348
  header_row_idx = -1
349
  header_row = None
350
-
351
  for row_idx, row in enumerate(table.rows):
352
- row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
353
  if "registration" in row_text and "number" in row_text:
354
  header_row_idx = row_idx
355
  header_row = row
356
  break
 
 
 
 
 
 
 
 
357
 
358
  if header_row_idx == -1:
359
  print(f" ❌ Could not find header row in vehicle table")
@@ -361,56 +434,76 @@ def handle_vehicle_registration_table(table, flat_json):
361
 
362
  print(f" βœ… Found header row at index {header_row_idx}")
363
 
364
- # Enhanced column mapping (same method as before)
365
  column_mapping = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  for col_idx, cell in enumerate(header_row.cells):
367
  header_text = get_clean_text(cell).strip()
368
- if not header_text or header_text.lower() == "no.":
 
 
 
369
  continue
370
 
 
371
  best_match = None
372
- best_score = 0
373
-
374
- normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
375
-
376
- for json_key in vehicle_section.keys():
377
- normalized_json = json_key.lower().strip()
378
-
379
- if normalized_header == normalized_json:
380
- best_match = json_key
381
- best_score = 1.0
382
- break
383
-
384
- header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
385
- json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
386
-
387
- if header_words and json_words:
388
- common_words = header_words.intersection(json_words)
389
- score = len(common_words) / max(len(header_words), len(json_words))
390
 
391
- if score > best_score and score >= 0.3:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  best_score = score
393
- best_match = json_key
394
 
395
- header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
396
- json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
397
-
398
- if header_clean in json_clean or json_clean in header_clean:
399
- if len(header_clean) > 5 and len(json_clean) > 5:
400
- substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
401
- if substring_score > best_score and substring_score >= 0.6:
402
- best_score = substring_score
403
- best_match = json_key
404
-
405
- if best_match:
406
  column_mapping[col_idx] = best_match
407
- print(f" πŸ“Œ Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
 
 
 
408
 
409
  if not column_mapping:
410
  print(f" ❌ No column mappings found")
411
  return 0
412
 
413
- # Determine data rows needed
414
  max_data_rows = 0
415
  for json_key, data in vehicle_section.items():
416
  if isinstance(data, list):
@@ -418,14 +511,13 @@ def handle_vehicle_registration_table(table, flat_json):
418
 
419
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
420
 
421
- # Process data rows
422
  for data_row_index in range(max_data_rows):
423
  table_row_idx = header_row_idx + 1 + data_row_index
424
 
425
  if table_row_idx >= len(table.rows):
426
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
427
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
428
-
429
  new_row = table.add_row()
430
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
431
 
@@ -458,33 +550,26 @@ def handle_attendance_list_table_enhanced(table, flat_json):
458
  """Enhanced Attendance List processing with better detection"""
459
  replacements_made = 0
460
 
461
- # Check multiple patterns for attendance list
462
  attendance_patterns = [
463
  "attendance list",
464
  "names and position titles",
465
  "attendees"
466
  ]
467
 
468
- # Scan all cells in the first few rows for attendance list indicators
469
  found_attendance_row = None
470
-
471
- for row_idx, row in enumerate(table.rows[:3]): # Check first 3 rows
472
  for cell_idx, cell in enumerate(row.cells):
473
  cell_text = get_clean_text(cell).lower()
474
-
475
- # Check if this cell contains attendance list header
476
  if any(pattern in cell_text for pattern in attendance_patterns):
477
  found_attendance_row = row_idx
478
  print(f" 🎯 ENHANCED: Found Attendance List in row {row_idx + 1}, cell {cell_idx + 1}")
479
  break
480
-
481
  if found_attendance_row is not None:
482
  break
483
 
484
  if found_attendance_row is None:
485
  return 0
486
 
487
- # Look for attendance data in JSON
488
  attendance_value = None
489
  attendance_search_keys = [
490
  "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
@@ -506,9 +591,7 @@ def handle_attendance_list_table_enhanced(table, flat_json):
506
  print(f" ❌ No attendance data found in JSON")
507
  return 0
508
 
509
- # Look for red text in ALL cells of the table
510
  target_cell = None
511
-
512
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
513
 
514
  for row_idx, row in enumerate(table.rows):
@@ -516,35 +599,29 @@ def handle_attendance_list_table_enhanced(table, flat_json):
516
  if has_red_text(cell):
517
  print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
518
 
519
- # Get the red text to see if it looks like attendance data
520
  red_text = ""
521
  for paragraph in cell.paragraphs:
522
  for run in paragraph.runs:
523
  if is_red(run):
524
  red_text += run.text
525
 
526
- print(f" πŸ“‹ Red text content: '{red_text[:50]}...'")
527
 
528
- # Check if this red text looks like attendance data (contains names/manager/etc)
529
  red_text_lower = red_text.lower()
530
- if any(indicator in red_text_lower for indicator in ['manager', 'herbig', 'palin', '–', '-']):
531
  target_cell = cell
532
  print(f" βœ… This looks like attendance data - using this cell")
533
  break
534
-
535
  if target_cell is not None:
536
  break
537
 
538
- # If no red text found that looks like attendance data, return
539
  if target_cell is None:
540
  print(f" ⚠️ No red text found that looks like attendance data")
541
  return 0
542
 
543
- # Replace red text with properly formatted attendance list
544
  if has_red_text(target_cell):
545
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
546
 
547
- # Ensure attendance_value is a list
548
  if isinstance(attendance_value, list):
549
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
550
  else:
@@ -554,7 +631,6 @@ def handle_attendance_list_table_enhanced(table, flat_json):
554
  for i, item in enumerate(attendance_list):
555
  print(f" {i+1}. {item}")
556
 
557
- # Replace with line-separated attendance list
558
  replacement_text = "\n".join(attendance_list)
559
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
560
  replacements_made += cell_replacements
@@ -565,71 +641,108 @@ def handle_attendance_list_table_enhanced(table, flat_json):
565
  return replacements_made
566
 
567
  def fix_management_summary_details_column(table, flat_json):
568
- """Fix the DETAILS column in Management Summary table"""
569
  replacements_made = 0
570
 
571
  print(f" 🎯 FIX: Management Summary DETAILS column processing")
572
 
573
- # Check if this is a Management Summary table
574
  table_text = ""
575
- for row in table.rows[:2]:
576
  for cell in row.cells:
577
  table_text += get_clean_text(cell).lower() + " "
578
 
579
- if not ("mass management" in table_text and "details" in table_text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  return 0
581
 
582
- print(f" βœ… Confirmed Mass Management Summary table")
583
-
584
- # Process each row looking for Std 5. and Std 6. with red text
585
- for row_idx, row in enumerate(table.rows):
586
- if len(row.cells) >= 2:
587
- standard_cell = row.cells[0]
588
- details_cell = row.cells[1]
589
-
590
- standard_text = get_clean_text(standard_cell).strip()
591
-
592
- # Look for Std 5. Verification and Std 6. Internal Review specifically
593
- if "Std 5." in standard_text and "Verification" in standard_text:
594
- if has_red_text(details_cell):
595
- print(f" πŸ” Found Std 5. Verification with red text")
596
-
597
- json_value = find_matching_json_value("Std 5. Verification", flat_json)
598
- if json_value is not None:
599
- replacement_text = get_value_as_string(json_value, "Std 5. Verification")
600
- cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
601
- replacements_made += cell_replacements
602
- print(f" βœ… Replaced Std 5. Verification details")
603
-
604
- elif "Std 6." in standard_text and "Internal Review" in standard_text:
605
- if has_red_text(details_cell):
606
- print(f" πŸ” Found Std 6. Internal Review with red text")
607
 
608
- json_value = find_matching_json_value("Std 6. Internal Review", flat_json)
609
- if json_value is not None:
610
- replacement_text = get_value_as_string(json_value, "Std 6. Internal Review")
611
- cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
612
- replacements_made += cell_replacements
613
- print(f" βœ… Replaced Std 6. Internal Review details")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
  return replacements_made
616
 
617
- # ========================================================================
618
- # IMPORTANT: Single canonical definition for Operator Declaration fixer
619
- # ========================================================================
620
-
621
  def fix_operator_declaration_empty_values(table, flat_json):
622
- """Fix Operator Declaration table when values are empty or need updating.
623
- - Prefer exact qualified keys.
624
- - If JSON has combined 'Name - Position', split it safely.
625
- - Only write into cells that are empty or contain red text.
626
- - Mark table as processed on success.
627
- """
628
  replacements_made = 0
629
 
630
  print(f" 🎯 FIX: Operator Declaration empty values processing")
631
 
632
- # Check if this is an Operator Declaration table
633
  table_context = ""
634
  for row in table.rows:
635
  for cell in row.cells:
@@ -641,17 +754,13 @@ def fix_operator_declaration_empty_values(table, flat_json):
641
  print(f" βœ… Confirmed Operator Declaration table")
642
 
643
  def parse_name_and_position(value):
644
- """Try to split combined name/position values into (name, position)."""
645
  if value is None:
646
  return None, None
647
-
648
- # If it's a list: common pattern is [name, position]
649
  if isinstance(value, list):
650
  if len(value) == 0:
651
  return None, None
652
  if len(value) == 1:
653
  return str(value[0]).strip(), None
654
- # use first two sensible entries
655
  first = str(value[0]).strip()
656
  second = str(value[1]).strip()
657
  if first and second:
@@ -662,7 +771,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
662
  if not s:
663
  return None, None
664
 
665
- # Common separators: hyphen, en-dash, em-dash, comma, pipe
666
  parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
667
  if len(parts) >= 2:
668
  left = parts[0].strip()
@@ -675,7 +783,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
675
  return right, left
676
  return left, right
677
 
678
- # If no separator, check trailing role token
679
  tokens = s.split()
680
  if len(tokens) >= 2:
681
  last = tokens[-1]
@@ -684,10 +791,8 @@ def fix_operator_declaration_empty_values(table, flat_json):
684
  if any(ind == last.lower() for ind in role_indicators):
685
  return " ".join(tokens[:-1]), last
686
 
687
- # fallback: treat entire string as name
688
  return s, None
689
 
690
- # Locate header row + data row
691
  for row_idx, row in enumerate(table.rows):
692
  if len(row.cells) >= 2:
693
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
@@ -706,7 +811,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
706
  position_text = get_clean_text(position_cell).strip()
707
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
708
 
709
- # Prefer exact qualified keys first
710
  name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
711
  if name_value is None:
712
  name_value = find_matching_json_value("Print Name", flat_json)
@@ -715,11 +819,9 @@ def fix_operator_declaration_empty_values(table, flat_json):
715
  if position_value is None:
716
  position_value = find_matching_json_value("Position Title", flat_json)
717
 
718
- # parse combined cases
719
  parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
720
  parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
721
 
722
- # decide final candidates
723
  final_name = None
724
  final_pos = None
725
 
@@ -728,7 +830,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
728
  elif name_value is not None:
729
  final_name = get_value_as_string(name_value)
730
 
731
- # position preference: parsed_pos_from_posval > explicit position_value > parsed_pos_from_nameval
732
  if parsed_pos_from_posval:
733
  final_pos = parsed_pos_from_posval
734
  elif position_value is not None:
@@ -736,7 +837,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
736
  elif parsed_pos_from_nameval:
737
  final_pos = parsed_pos_from_nameval
738
 
739
- # normalize
740
  if isinstance(final_name, list):
741
  final_name = " ".join(str(x) for x in final_name).strip()
742
  if isinstance(final_pos, list):
@@ -755,7 +855,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
755
  return False
756
  return len(name_str) > 1
757
 
758
- # Write name if empty or red
759
  if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
760
  if has_red_text(name_cell):
761
  replace_red_text_in_cell(name_cell, final_name)
@@ -764,7 +863,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
764
  replacements_made += 1
765
  print(f" βœ… Updated Print Name -> '{final_name}'")
766
 
767
- # Write position if empty or red
768
  if (not position_text or has_red_text(position_cell)) and final_pos:
769
  if has_red_text(position_cell):
770
  replace_red_text_in_cell(position_cell, final_pos)
@@ -775,7 +873,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
775
 
776
  break
777
 
778
- # mark processed
779
  if replacements_made > 0:
780
  try:
781
  setattr(table, "_processed_operator_declaration", True)
@@ -786,14 +883,10 @@ def fix_operator_declaration_empty_values(table, flat_json):
786
  return replacements_made
787
 
788
  def handle_multiple_red_segments_in_cell(cell, flat_json):
789
- """Handle multiple red text segments within a single cell"""
790
  replacements_made = 0
791
-
792
  red_segments = extract_red_text_segments(cell)
793
  if not red_segments:
794
  return 0
795
-
796
- # Try to match each segment individually
797
  for i, segment in enumerate(red_segments):
798
  segment_text = segment['text'].strip()
799
  if segment_text:
@@ -803,63 +896,45 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
803
  if replace_single_segment(segment, replacement_text):
804
  replacements_made += 1
805
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
806
-
807
  return replacements_made
808
 
809
def handle_nature_business_multiline_fix(cell, flat_json):
    """Fix multiline red text that looks like a "Nature of Business" value.

    Collects the cell's red text, checks it against a list of business
    keywords, and if it matches, replaces it with the JSON's
    "Nature of Business" value. Returns the number of replacements made.
    """
    # Gather all red run text from the cell.
    pieces = []
    for para in cell.paragraphs:
        for run in para.runs:
            if is_red(run):
                pieces.append(run.text)

    red_text = "".join(pieces).strip()
    if not red_text:
        return 0

    # Only act when the red text plausibly describes a transport business.
    indicators = ("transport", "logistics", "freight", "delivery", "trucking", "haulage")
    lowered = red_text.lower()
    if not any(word in lowered for word in indicators):
        return 0

    nature_value = find_matching_json_value("Nature of Business", flat_json)
    if nature_value is None:
        return 0

    replacement = get_value_as_string(nature_value, "Nature of Business")
    made = replace_red_text_in_cell(cell, replacement)
    print(f" βœ… Fixed Nature of Business multiline content")
    return made
836
 
837
  def handle_management_summary_fix(cell, flat_json):
838
- """Handle Management Summary content fixes"""
839
  replacements_made = 0
840
-
841
- # Extract red text
842
  red_text = ""
843
  for paragraph in cell.paragraphs:
844
  for run in paragraph.runs:
845
  if is_red(run):
846
  red_text += run.text
847
-
848
  red_text = red_text.strip()
849
  if not red_text:
850
  return 0
851
-
852
- # Look for management summary data in new schema format
853
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
854
-
855
  for mgmt_type in management_types:
856
  if mgmt_type in flat_json:
857
  mgmt_data = flat_json[mgmt_type]
858
  if isinstance(mgmt_data, dict):
859
- # Try to match red text with any standard in this management type
860
  for std_key, std_value in mgmt_data.items():
861
  if isinstance(std_value, list) and std_value:
862
- # Check if red text matches this standard
863
  if len(red_text) > 10:
864
  for item in std_value:
865
  if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
@@ -868,44 +943,32 @@ def handle_management_summary_fix(cell, flat_json):
868
  replacements_made += cell_replacements
869
  print(f" βœ… Fixed {mgmt_type} - {std_key}")
870
  return replacements_made
871
-
872
  return replacements_made
873
 
874
- # ========================================================================
875
  # SMALL OPERATOR/AUDITOR TABLE HANDLER (skip if already processed)
876
- # ========================================================================
877
 
878
  def handle_operator_declaration_fix(table, flat_json):
879
- """Wrapper for small declaration tables. Delegate to canonical fix first.
880
- If canonical did not change anything, fall back to the small-table auditor handling.
881
- Safeguards: do not replace with date-like values; prefer person/role candidates.
882
- """
883
  replacements_made = 0
884
 
885
- # skip if already processed
886
  if getattr(table, "_processed_operator_declaration", False):
887
  print(f" ⏭️ Skipping - Operator Declaration table already processed")
888
  return 0
889
 
890
- # only intended for small tables; if large, skip
891
  if len(table.rows) > 4:
892
  return 0
893
 
894
- # First: try canonical operator declaration handler (covers primary case)
895
  replaced = fix_operator_declaration_empty_values(table, flat_json)
896
  replacements_made += replaced
897
  if replaced:
898
- # canonical handled it and set the processed flag
899
  return replacements_made
900
 
901
- # --- Helper validators (local, minimal, safe) ---
902
  def is_date_like(s: str) -> bool:
903
  if not s:
904
  return False
905
  s = s.strip()
906
- # common tokens that indicate a date string
907
  month_names = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)"
908
- # patterns: "2nd November 2023", "02/11/2023", "2023-11-02", "November 2023", "Date"
909
  if re.search(r"\bDate\b", s, re.IGNORECASE):
910
  return True
911
  if re.search(r"\b\d{1,2}(?:st|nd|rd|th)?\b\s+" + month_names, s, re.IGNORECASE):
@@ -916,7 +979,6 @@ def handle_operator_declaration_fix(table, flat_json):
916
  return True
917
  if re.search(r"\b\d{4}[\/\.\-]\d{1,2}[\/\.\-]\d{1,2}\b", s):
918
  return True
919
- # single 4-digit year alone
920
  if re.fullmatch(r"\d{4}", s):
921
  return True
922
  return False
@@ -925,11 +987,9 @@ def handle_operator_declaration_fix(table, flat_json):
925
  if not s:
926
  return False
927
  low = s.lower().strip()
928
- # reject org/company terms
929
  bad_terms = ["pty ltd", "p/l", "plc", "company", "farming", "farm", "trust", "ltd"]
930
  if any(bt in low for bt in bad_terms):
931
  return False
932
- # minimal length check and presence of alphabetic characters
933
  if len(low) < 3:
934
  return False
935
  return bool(re.search(r"[a-zA-Z]", low))
@@ -941,16 +1001,13 @@ def handle_operator_declaration_fix(table, flat_json):
941
  roles = ["manager", "auditor", "owner", "director", "supervisor", "coordinator", "driver", "operator", "representative", "chief"]
942
  return any(r in low for r in roles)
943
 
944
- # fallback: original small-table behaviour (auditor declaration etc.)
945
  print(f" 🎯 Processing other declaration table (fallback small-table behavior)")
946
 
947
  for row_idx, row in enumerate(table.rows):
948
  for cell_idx, cell in enumerate(row.cells):
949
  if not has_red_text(cell):
950
- # do not overwrite non-red content in fallback
951
  continue
952
 
953
- # Try auditor-specific fields first
954
  declaration_fields = [
955
  "NHVAS Approved Auditor Declaration.Print Name",
956
  "Auditor name",
@@ -968,19 +1025,13 @@ def handle_operator_declaration_fix(table, flat_json):
968
  if not replacement_text:
969
  continue
970
 
971
- # SAFEGUARD: do not replace with date-like text for name/position cells
972
  if is_date_like(replacement_text):
973
- # allow genuinely date-targeted cells (if red text explicitly contains 'date')
974
- # but skip using a date string to fill 'name' or 'position' slots
975
- # check the red text in the cell to see if it expects a date
976
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
977
  if "date" not in red_text.lower():
978
  print(f" ⚠️ Skipping date-like replacement for field '{field}' -> '{replacement_text[:30]}...'")
979
  continue
980
 
981
- # Further safeguard: if replacement looks like a person or role, only then write into name/position cells
982
  if (looks_like_person_name(replacement_text) or looks_like_position(replacement_text) or "signature" in field.lower() or "date" in field.lower()):
983
- # Replace only red runs (safe)
984
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
985
  if cell_replacements > 0:
986
  replacements_made += cell_replacements
@@ -988,11 +1039,9 @@ def handle_operator_declaration_fix(table, flat_json):
988
  print(f" βœ… Fixed declaration field: {field} -> '{replacement_text}'")
989
  break
990
  else:
991
- # Not a person or role-looking text β€” skip to avoid clobbering name/position with unrelated content
992
  print(f" ⚠️ Replacement for field '{field}' does not look like name/role, skipping: '{replacement_text[:30]}...'")
993
  continue
994
 
995
- # If not replaced by the declared fields, try to infer from the cell's red text (date/signature fallback)
996
  if not replaced_this_cell:
997
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip().lower()
998
  if "signature" in red_text:
@@ -1001,12 +1050,10 @@ def handle_operator_declaration_fix(table, flat_json):
1001
  replacements_made += cell_replacements
1002
  print(f" βœ… Inserted placeholder [Signature]")
1003
  elif "date" in red_text:
1004
- # Try to find a date value in JSON for an explicit date slot else skip
1005
  date_value = find_matching_json_value("Date", flat_json) or find_matching_json_value("Date of Audit", flat_json) or find_matching_json_value("Audit was conducted on", flat_json)
1006
  if date_value is not None:
1007
  date_text = get_value_as_string(date_value)
1008
  if not is_date_like(date_text):
1009
- # defensive: if the date value is not date-like, skip
1010
  print(f" ⚠️ Found date-value but not date-like, skipping: '{date_text}'")
1011
  else:
1012
  cell_replacements = replace_red_text_in_cell(cell, date_text)
@@ -1014,7 +1061,6 @@ def handle_operator_declaration_fix(table, flat_json):
1014
  replacements_made += cell_replacements
1015
  print(f" βœ… Inserted date value: '{date_text}'")
1016
 
1017
- # if any replacements made here, mark processed
1018
  if replacements_made > 0:
1019
  try:
1020
  setattr(table, "_processed_operator_declaration", True)
@@ -1025,22 +1071,17 @@ def handle_operator_declaration_fix(table, flat_json):
1025
  return replacements_made
1026
 
1027
  def handle_print_accreditation_section(table, flat_json):
1028
- """Handle Print Accreditation section - SKIP Operator Declaration tables"""
1029
  replacements_made = 0
1030
 
1031
- # <<< PATCH: skip if operator declaration already processed
1032
  if getattr(table, "_processed_operator_declaration", False):
1033
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1034
  return 0
1035
- # <<< END PATCH
1036
 
1037
- # Get table context to check what type of table this is
1038
  table_context = ""
1039
  for row in table.rows:
1040
  for cell in row.cells:
1041
  table_context += get_clean_text(cell).lower() + " "
1042
 
1043
- # SKIP if this is an Operator Declaration table
1044
  if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
1045
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1046
  return 0
@@ -1050,11 +1091,11 @@ def handle_print_accreditation_section(table, flat_json):
1050
  for row_idx, row in enumerate(table.rows):
1051
  for cell_idx, cell in enumerate(row.cells):
1052
  if has_red_text(cell):
1053
- # Try print accreditation fields
1054
  accreditation_fields = [
1055
  "(print accreditation name)",
1056
  "Operator name (Legal entity)",
1057
- "Print accreditation name"
 
1058
  ]
1059
 
1060
  for field in accreditation_fields:
@@ -1071,7 +1112,6 @@ def handle_print_accreditation_section(table, flat_json):
1071
  return replacements_made
1072
 
1073
  def process_single_column_sections(cell, key_text, flat_json):
1074
- """Process single column sections with red text"""
1075
  replacements_made = 0
1076
 
1077
  if has_red_text(cell):
@@ -1082,10 +1122,8 @@ def process_single_column_sections(cell, key_text, flat_json):
1082
  red_text += run.text
1083
 
1084
  if red_text.strip():
1085
- # Try direct matching first
1086
  section_value = find_matching_json_value(red_text.strip(), flat_json)
1087
  if section_value is None:
1088
- # Try key-based matching
1089
  section_value = find_matching_json_value(key_text, flat_json)
1090
 
1091
  if section_value is not None:
@@ -1108,13 +1146,13 @@ def process_tables(document, flat_json):
1108
  for table_idx, table in enumerate(document.tables):
1109
  print(f"\nπŸ” Processing table {table_idx + 1}:")
1110
 
1111
- # Get table context
1112
  table_text = ""
1113
  for row in table.rows[:3]:
1114
  for cell in row.cells:
1115
  table_text += get_clean_text(cell).lower() + " "
1116
 
1117
- # Detect Management Summary tables
1118
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
1119
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
1120
  has_details = "details" in table_text
@@ -1129,10 +1167,9 @@ def process_tables(document, flat_json):
1129
  for row_idx, row in enumerate(table.rows):
1130
  for cell_idx, cell in enumerate(row.cells):
1131
  if has_red_text(cell):
1132
- # Try direct matching with the new schema names first
1133
  for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
1134
  if mgmt_type.lower().replace(" summary", "") in table_text:
1135
- # Look for this standard in the JSON
1136
  if mgmt_type in flat_json:
1137
  mgmt_data = flat_json[mgmt_type]
1138
  if isinstance(mgmt_data, dict):
@@ -1156,7 +1193,7 @@ def process_tables(document, flat_json):
1156
  continue
1157
 
1158
  # Detect Vehicle Registration tables
1159
- vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
1160
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
1161
  if indicator_count >= 2:
1162
  print(f" πŸš— Detected Vehicle Registration table")
@@ -1175,22 +1212,18 @@ def process_tables(document, flat_json):
1175
  print_accreditation_indicators = ["print name", "position title"]
1176
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
1177
 
1178
- # <<< PATCH: require both indicators (or two matches) to reduce false positives
1179
  if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
1180
  print(f" πŸ“‹ Detected Print Accreditation/Operator Declaration table")
1181
-
1182
- # First, try strong operator declaration fix (exact keys)
1183
  declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1184
  replacements_made += declaration_fixes
1185
 
1186
- # Then only run print accreditation section if not marked processed
1187
  if not getattr(table, "_processed_operator_declaration", False):
1188
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1189
  replacements_made += print_accreditation_replacements
1190
 
1191
  continue
1192
 
1193
- # Process regular table rows (same as your original logic)
1194
  for row_idx, row in enumerate(table.rows):
1195
  if len(row.cells) < 1:
1196
  continue
@@ -1208,16 +1241,13 @@ def process_tables(document, flat_json):
1208
  if json_value is not None:
1209
  replacement_text = get_value_as_string(json_value, key_text)
1210
 
1211
- # Handle Australian Company Number
1212
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1213
  cell_replacements = handle_australian_company_number(row, json_value)
1214
  replacements_made += cell_replacements
1215
 
1216
- # Handle section headers
1217
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1218
  print(f" βœ… Section header detected, checking next row...")
1219
  next_row = table.rows[row_idx + 1]
1220
-
1221
  for cell_idx, cell in enumerate(next_row.cells):
1222
  if has_red_text(cell):
1223
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
@@ -1228,13 +1258,11 @@ def process_tables(document, flat_json):
1228
  if cell_replacements > 0:
1229
  print(f" -> Replaced section content")
1230
 
1231
- # Handle single column sections
1232
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1233
  if has_red_text(key_cell):
1234
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1235
  replacements_made += cell_replacements
1236
 
1237
- # Handle regular key-value pairs
1238
  else:
1239
  for cell_idx in range(1, len(row.cells)):
1240
  value_cell = row.cells[cell_idx]
@@ -1244,7 +1272,6 @@ def process_tables(document, flat_json):
1244
  replacements_made += cell_replacements
1245
 
1246
  else:
1247
- # Fallback processing for unmatched keys
1248
  if len(row.cells) == 1 and has_red_text(key_cell):
1249
  red_text = ""
1250
  for paragraph in key_cell.paragraphs:
@@ -1258,14 +1285,12 @@ def process_tables(document, flat_json):
1258
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1259
  replacements_made += cell_replacements
1260
 
1261
- # Process red text in all cells
1262
  for cell_idx in range(len(row.cells)):
1263
  cell = row.cells[cell_idx]
1264
  if has_red_text(cell):
1265
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1266
  replacements_made += cell_replacements
1267
 
1268
- # Apply fixes if no replacements made
1269
  if cell_replacements == 0:
1270
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1271
  replacements_made += surgical_fix
@@ -1274,7 +1299,7 @@ def process_tables(document, flat_json):
1274
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1275
  replacements_made += management_summary_fix
1276
 
1277
- # Handle Operator/Auditor Declaration tables (check last few tables)
1278
  print(f"\n🎯 Final check for Declaration tables...")
1279
  for table in document.tables[-3:]:
1280
  if len(table.rows) <= 4:
@@ -1300,7 +1325,6 @@ def process_paragraphs(document, flat_json):
1300
  json_value = find_matching_json_value(red_text_only, flat_json)
1301
 
1302
  if json_value is None:
1303
- # Enhanced pattern matching for signatures and dates
1304
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1305
  json_value = find_matching_json_value("auditor signature", flat_json)
1306
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
@@ -1326,11 +1350,9 @@ def process_headings(document, flat_json):
1326
 
1327
  for para_idx, paragraph in enumerate(paragraphs):
1328
  paragraph_text = paragraph.text.strip()
1329
-
1330
  if not paragraph_text:
1331
  continue
1332
 
1333
- # Check if this is a heading
1334
  matched_heading = None
1335
  for category, patterns in HEADING_PATTERNS.items():
1336
  for pattern in patterns:
@@ -1343,13 +1365,11 @@ def process_headings(document, flat_json):
1343
  if matched_heading:
1344
  print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
1345
 
1346
- # Check current heading paragraph
1347
  if has_red_text_in_paragraph(paragraph):
1348
  print(f" πŸ”΄ Found red text in heading itself")
1349
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1350
  replacements_made += heading_replacements
1351
 
1352
- # Look ahead for related content
1353
  for next_para_offset in range(1, 6):
1354
  next_para_idx = para_idx + next_para_offset
1355
  if next_para_idx >= len(paragraphs):
@@ -1361,7 +1381,6 @@ def process_headings(document, flat_json):
1361
  if not next_text:
1362
  continue
1363
 
1364
- # Stop if we hit another heading
1365
  is_another_heading = False
1366
  for category, patterns in HEADING_PATTERNS.items():
1367
  for pattern in patterns:
@@ -1374,10 +1393,8 @@ def process_headings(document, flat_json):
1374
  if is_another_heading:
1375
  break
1376
 
1377
- # Process red text with context
1378
  if has_red_text_in_paragraph(next_paragraph):
1379
  print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
1380
-
1381
  context_replacements = process_red_text_in_paragraph(
1382
  next_paragraph,
1383
  paragraph_text,
@@ -1403,11 +1420,8 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1403
  print(f" πŸ” Red text found: '{combined_red_text}'")
1404
 
1405
  json_value = None
1406
-
1407
- # Direct matching
1408
  json_value = find_matching_json_value(combined_red_text, flat_json)
1409
 
1410
- # Context-based matching
1411
  if json_value is None:
1412
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
1413
  auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
@@ -1425,7 +1439,6 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1425
  print(f" βœ… Found operator match with field: '{field}'")
1426
  break
1427
 
1428
- # Combined context queries
1429
  if json_value is None:
1430
  context_queries = [
1431
  f"{context_text} {combined_red_text}",
@@ -1439,18 +1452,14 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1439
  print(f" βœ… Found match with combined query")
1440
  break
1441
 
1442
- # Replace if match found
1443
  if json_value is not None:
1444
  replacement_text = get_value_as_string(json_value, combined_red_text)
1445
-
1446
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1447
  if red_runs:
1448
  red_runs[0].text = replacement_text
1449
  red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
1450
-
1451
  for run in red_runs[1:]:
1452
  run.text = ''
1453
-
1454
  replacements_made = 1
1455
  print(f" βœ… Replaced with: '{replacement_text}'")
1456
  else:
@@ -1458,7 +1467,9 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1458
 
1459
  return replacements_made
1460
 
1461
-
 
 
1462
 
1463
  def process_hf(json_file, docx_file, output_file):
1464
  """Main processing function with comprehensive error handling"""
@@ -1490,16 +1501,23 @@ def process_hf(json_file, docx_file, output_file):
1490
  paragraph_replacements = process_paragraphs(doc, flat_json)
1491
  heading_replacements = process_headings(doc, flat_json)
1492
 
1493
- # Final force fix for any remaining red text
1494
- #force_replacements = force_red_text_replacement(doc, flat_json)
1495
 
1496
- total_replacements = table_replacements + paragraph_replacements + heading_replacements
1497
- #+ force_replacements
 
 
 
 
 
 
 
1498
 
1499
- # Save output
1500
  if hasattr(output_file, "write"):
1501
  doc.save(output_file)
1502
  else:
 
1503
  doc.save(output_file)
1504
 
1505
  print(f"\nβœ… Document saved as: {output_file}")
@@ -1507,7 +1525,6 @@ def process_hf(json_file, docx_file, output_file):
1507
  print(f" πŸ“Š Tables: {table_replacements}")
1508
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1509
  print(f" πŸ“‹ Headings: {heading_replacements}")
1510
- #print(f" 🎯 Force fixes: {force_replacements}")
1511
  print(f"πŸŽ‰ Processing complete!")
1512
 
1513
  except FileNotFoundError as e:
@@ -1517,6 +1534,10 @@ def process_hf(json_file, docx_file, output_file):
1517
  import traceback
1518
  traceback.print_exc()
1519
 
 
 
 
 
1520
  if __name__ == "__main__":
1521
  import sys
1522
  if len(sys.argv) != 4:
 
8
  - safer force replacement (avoid short->long mapping)
9
  - prefer exact qualified keys for Print Name / Position Title lookups
10
  - preserved all other logic and prints/logging
11
+ - ADDED: header normalization, context-aware vehicle JSON selection,
12
+ management summary scoping, unmatched-headers logging
13
  """
14
 
15
  import json
 
17
  from docx.shared import RGBColor
18
  import re
19
  from typing import Any
20
+ import os
21
+
22
+ # ============================================================================
23
+ # Configuration / Heading patterns for document structure detection
24
+ # ============================================================================
25
 
 
26
  HEADING_PATTERNS = {
27
  "main": [
28
  r"NHVAS\s+Audit\s+Summary\s+Report",
 
46
  ]
47
  }
48
 
49
+ # ============================================================================
50
+ # State for unmatched headers (for iterative improvement)
51
+ # ============================================================================
52
+ _unmatched_headers = {}
53
+
54
+ def record_unmatched_header(header: str):
55
+ if not header:
56
+ return
57
+ _unmatched_headers[header] = _unmatched_headers.get(header, 0) + 1
58
+
59
  # ============================================================================
60
  # UTILITY FUNCTIONS
61
  # ============================================================================
 
77
 
78
def is_red(run):
    """Heuristically decide whether *run* is rendered in red.

    A run counts as red when its colour has an explicit RGB of FF0000 or
    uses theme colour 1. Any attribute-shape surprise from the colour
    object is swallowed and reported as "not red".
    """
    color = run.font.color
    try:
        if not color:
            return color  # preserve the original falsy return (e.g. None)
        rgb_is_red = getattr(color, "rgb", None) and color.rgb == RGBColor(255, 0, 0)
        return bool(rgb_is_red or getattr(color, "theme_color", None) == 1)
    except Exception:
        return False
84
 
85
  def get_value_as_string(value, field_name=""):
 
116
  return True
117
  return False
118
 
119
+ # New helper: normalize header text (removes parentheticals, punctuation, etc.)
120
def normalize_header_text(s: str) -> str:
    """Normalize a table-header string for fuzzy column matching.

    Lower-cases the text, strips parenthetical qualifiers, slashes and
    punctuation (except ``#`` and ``%``), collapses whitespace, and applies
    a few domain-specific canonicalizations (registration / sub-contractor
    variants, boilerplate phrases such as "date range").

    Args:
        s: Raw header text from a docx cell (may be empty or None).

    Returns:
        The normalized, lower-case header string ("" for empty input).
    """
    if not s:
        return ""
    # Remove parenthetical content, e.g. "(date range)".
    s = re.sub(r'\([^)]*\)', ' ', s)
    # Treat slashes as word separators.
    s = s.replace("/", " ")
    # Strip punctuation except '#' and '%', which carry meaning in headers.
    s = re.sub(r'[^\w\s\#\%]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip().lower()
    # Domain-specific canonicalizations. NOTE: the old
    # replace('sub-contractor', ...) was unreachable here because the
    # punctuation regex above has already removed every hyphen.
    s = s.replace('registrationno', 'registration number')
    s = s.replace('registrationnumber', 'registration number')
    s = s.replace('sub contracted', 'sub contractor')
    # Boilerplate phrases that never appear in the JSON keys.
    s = s.replace('date range', '')
    s = s.replace('applicable for entry audit', '')
    # Re-collapse whitespace: the phrase removals above can leave internal
    # double spaces, which would break exact normalized-string comparisons
    # (e.g. "roadworthiness  certificates" vs "roadworthiness certificates").
    s = re.sub(r'\s+', ' ', s).strip()
    return s
139
+
140
  # ============================================================================
141
  # JSON MATCHING FUNCTIONS
142
  # ============================================================================
 
152
  print(f" βœ… Direct match found for key '{field_name}'")
153
  return flat_json[field_name]
154
 
155
+ # Case-insensitive exact match
156
  for key, value in flat_json.items():
157
  if key.lower() == field_name.lower():
158
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
159
  return value
160
 
161
+ # Better Print Name detection for operator vs auditor
162
  if field_name.lower().strip() == "print name":
163
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
164
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
 
170
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
171
  return flat_json[auditor_keys[0]]
172
 
173
+ # Suffix matching for nested keys
174
  for key, value in flat_json.items():
175
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
176
  print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
177
  return value
178
 
179
+ # Clean & exact match attempt
180
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
181
  clean_field = re.sub(r'\s+', ' ', clean_field)
182
  for key, value in flat_json.items():
 
186
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
187
  return value
188
 
189
+ # Enhanced fuzzy matching with word-token scoring
190
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
191
  if not field_words:
192
  return None
 
200
  if not key_words:
201
  continue
202
 
 
203
  common_words = field_words.intersection(key_words)
204
  if common_words:
205
  similarity = len(common_words) / len(field_words.union(key_words))
 
223
  # ============================================================================
224
 
225
  def extract_red_text_segments(cell):
 
226
  red_segments = []
 
227
  for para_idx, paragraph in enumerate(cell.paragraphs):
228
  current_segment = ""
229
  segment_runs = []
 
230
  for run_idx, run in enumerate(paragraph.runs):
231
  if is_red(run):
232
  if run.text:
233
  current_segment += run.text
234
  segment_runs.append((para_idx, run_idx, run))
235
  else:
 
236
  if segment_runs:
237
  red_segments.append({
238
  'text': current_segment,
 
241
  })
242
  current_segment = ""
243
  segment_runs = []
 
 
244
  if segment_runs:
245
  red_segments.append({
246
  'text': current_segment,
247
  'runs': segment_runs.copy(),
248
  'paragraph_idx': para_idx
249
  })
 
250
  return red_segments
251
 
252
  def replace_all_red_segments(red_segments, replacement_text):
 
253
  if not red_segments:
254
  return 0
255
 
 
267
  first_run.text = replacement_lines[0]
268
  first_run.font.color.rgb = RGBColor(0, 0, 0)
269
  replacements_made = 1
 
270
  for _, _, run in first_segment['runs'][1:]:
271
  run.text = ''
272
 
 
278
  try:
279
  first_run = red_segments[0]['runs'][0][2]
280
  paragraph = first_run.element.getparent()
 
281
  from docx.oxml import OxmlElement
282
  parent = first_run.element.getparent()
283
  for line in replacement_lines[1:]:
284
  if line.strip():
285
  br = OxmlElement('w:br')
286
  first_run.element.append(br)
 
287
  new_run = paragraph.add_run(line.strip())
288
  new_run.font.color.rgb = RGBColor(0, 0, 0)
289
  except Exception:
 
295
  return replacements_made
296
 
297
def replace_single_segment(segment, replacement_text):
    """Write *replacement_text* into one red segment, recolouring it black.

    The first run of the segment receives the full replacement text; every
    remaining run in the segment is blanked so no stale red text survives.

    Returns:
        True when a replacement was written, False for a run-less segment.
    """
    runs = segment['runs']
    if not runs:
        return False
    lead_run = runs[0][2]
    lead_run.text = replacement_text
    lead_run.font.color.rgb = RGBColor(0, 0, 0)
    for entry in runs[1:]:
        entry[2].text = ''
    return True
306
 
307
def replace_red_text_in_cell(cell, replacement_text):
    """Replace every red text segment in *cell* with *replacement_text*.

    Returns:
        The number of replacements performed (0 when the cell contains
        no red text at all).
    """
    segments = extract_red_text_segments(cell)
    if segments:
        return replace_all_red_segments(segments, replacement_text)
    return 0
312
 
313
  # ============================================================================
 
315
  # ============================================================================
316
 
317
  def handle_australian_company_number(row, company_numbers):
 
318
  replacements_made = 0
319
  for i, digit in enumerate(company_numbers):
320
  cell_idx = i + 1
 
327
  return replacements_made
328
 
329
  def handle_vehicle_registration_table(table, flat_json):
330
+ """Handle vehicle registration table data replacement (improved header normalization and context-aware selection)"""
331
  replacements_made = 0
332
 
333
+ # build a table_text context (used to find mass/maintenance/fatigue)
334
+ table_text = ""
335
+ for r in table.rows[:3]:
336
+ for c in r.cells:
337
+ table_text += get_clean_text(c).lower() + " "
338
 
339
+ # 1) Detect the most relevant vehicle-related JSON section using context tokens
340
+ vehicle_section = None
341
+ context_tokens = []
342
+ if "mass" in table_text:
343
+ context_tokens.append("mass")
344
+ if "maintenance" in table_text:
345
+ context_tokens.append("maintenance")
346
+ if "fatigue" in table_text or "driver" in table_text or "scheduler" in table_text:
347
+ context_tokens.append("fatigue")
348
+
349
+ # candidate keys that mention 'registration' or 'vehicle'
350
+ candidates = []
351
  for key, value in flat_json.items():
352
+ k = key.lower()
353
+ if "registration" in k or "vehicle registration" in k or "vehicle" in k:
354
+ candidates.append((key, value))
355
+
356
+ # prefer candidates whose key contains one of the context tokens
357
+ if candidates and context_tokens:
358
+ for token in context_tokens:
359
+ for k, v in candidates:
360
+ if token in k.lower():
361
+ vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
362
+ print(f" βœ… Found vehicle data by context token '{token}' in key '{k}'")
363
+ break
364
+ if vehicle_section:
365
  break
366
 
367
+ # fallback: choose candidate containing 'registration' explicitly
368
+ if vehicle_section is None and candidates:
369
+ for k, v in candidates:
370
+ if "registration" in k.lower():
371
+ vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
372
+ print(f" βœ… Fallback vehicle data chosen from '{k}'")
373
+ break
374
+
375
+ # fallback: collect flattened keys that look like vehicle columns
376
+ if vehicle_section is None:
377
  potential_columns = {}
378
  for key, value in flat_json.items():
379
+ lk = key.lower()
380
+ if any(col_name in lk for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "suspension", "daily checks", "fault recording", "fault repair", "roadworthiness"]):
381
  if "." in key:
382
  column_name = key.split(".")[-1]
383
  else:
384
  column_name = key
385
  potential_columns[column_name] = value
 
386
  if potential_columns:
387
  vehicle_section = potential_columns
388
  print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
389
+
390
+ if not vehicle_section:
391
+ print(f" ❌ Vehicle registration data not found in JSON")
392
+ return 0
393
+
394
+ # ensure vehicle_section is a dict mapping column_name -> list/value
395
+ if isinstance(vehicle_section, list):
396
+ # if a list of dicts, attempt to flatten into columns
397
+ if vehicle_section and isinstance(vehicle_section[0], dict):
398
+ flattened = {}
399
+ for entry in vehicle_section:
400
+ for k, v in entry.items():
401
+ flattened.setdefault(k, []).append(v)
402
+ vehicle_section = flattened
403
+
404
+ if not isinstance(vehicle_section, dict):
405
+ # convert single scalar to dict
406
+ try:
407
+ vehicle_section = dict(vehicle_section)
408
+ except Exception:
409
+ vehicle_section = {str(k): v for k, v in (vehicle_section.items() if isinstance(vehicle_section, dict) else [])}
410
 
411
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
412
 
413
+ # Find header row index by searching for a row that contains 'registration' + 'number'
414
  header_row_idx = -1
415
  header_row = None
 
416
  for row_idx, row in enumerate(table.rows):
417
+ row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
418
  if "registration" in row_text and "number" in row_text:
419
  header_row_idx = row_idx
420
  header_row = row
421
  break
422
+ if header_row_idx == -1:
423
+ # try alternative detection: a row with 'registration' or 'reg no'
424
+ for row_idx, row in enumerate(table.rows):
425
+ row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
426
+ if "registration" in row_text or "reg no" in row_text or "regno" in row_text:
427
+ header_row_idx = row_idx
428
+ header_row = row
429
+ break
430
 
431
  if header_row_idx == -1:
432
  print(f" ❌ Could not find header row in vehicle table")
 
434
 
435
  print(f" βœ… Found header row at index {header_row_idx}")
436
 
437
+ # Enhanced column mapping: normalize both header and candidate keys, token overlap scoring
438
  column_mapping = {}
439
+ # build normalized master map from vehicle_section keys
440
+ master_labels = {}
441
+ for orig_key in vehicle_section.keys():
442
+ norm = normalize_header_text(str(orig_key))
443
+ if norm:
444
+ master_labels.setdefault(norm, orig_key)
445
+
446
+ # add fallback synonyms for common labels (preserve existing)
447
+ fallback_synonyms = [
448
+ "no", "registration number", "reg no", "registration", "sub contractor", "sub-contractor",
449
+ "sub contracted", "weight verification records", "rfs suspension certification", "suspension system maintenance",
450
+ "trip records", "fault recording reporting", "daily checks", "roadworthiness certificates",
451
+ "maintenance records", "fault repair"
452
+ ]
453
+ for syn in fallback_synonyms:
454
+ norm = normalize_header_text(syn)
455
+ if norm and norm not in master_labels:
456
+ master_labels.setdefault(norm, syn)
457
+
458
+ # map header cells
459
  for col_idx, cell in enumerate(header_row.cells):
460
  header_text = get_clean_text(cell).strip()
461
+ if not header_text:
462
+ continue
463
+ # skip 'No.' column mapping attempts in many templates
464
+ if header_text.strip().lower() in {"no", "no.", "#"}:
465
  continue
466
 
467
+ norm_header = normalize_header_text(header_text)
468
  best_match = None
469
+ best_score = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
+ # exact normalized match
472
+ if norm_header in master_labels:
473
+ best_match = master_labels[norm_header]
474
+ best_score = 1.0
475
+ else:
476
+ # token overlap scoring
477
+ header_tokens = set(t for t in norm_header.split() if len(t) > 2)
478
+ for norm_key, orig_label in master_labels.items():
479
+ key_tokens = set(t for t in norm_key.split() if len(t) > 2)
480
+ if not key_tokens:
481
+ continue
482
+ common = header_tokens.intersection(key_tokens)
483
+ if common:
484
+ score = len(common) / max(1, len(header_tokens.union(key_tokens)))
485
+ else:
486
+ # substring fallback
487
+ if norm_header in norm_key or norm_key in norm_header:
488
+ score = min(len(norm_header), len(norm_key)) / max(len(norm_header), len(norm_key))
489
+ else:
490
+ score = 0.0
491
+ if score > best_score:
492
  best_score = score
493
+ best_match = orig_label
494
 
495
+ if best_match and best_score >= 0.30:
 
 
 
 
 
 
 
 
 
 
496
  column_mapping[col_idx] = best_match
497
+ print(f" πŸ“Œ Column {col_idx}: '{header_text}' -> '{best_match}' (norm: '{norm_header}', score: {best_score:.2f})")
498
+ else:
499
+ print(f" ⚠️ No mapping found for '{header_text}' (norm: '{norm_header}')")
500
+ record_unmatched_header(header_text)
501
 
502
  if not column_mapping:
503
  print(f" ❌ No column mappings found")
504
  return 0
505
 
506
+ # Determine number of rows to populate
507
  max_data_rows = 0
508
  for json_key, data in vehicle_section.items():
509
  if isinstance(data, list):
 
511
 
512
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
513
 
514
+ # Fill or add rows as needed
515
  for data_row_index in range(max_data_rows):
516
  table_row_idx = header_row_idx + 1 + data_row_index
517
 
518
  if table_row_idx >= len(table.rows):
519
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
520
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
 
521
  new_row = table.add_row()
522
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
523
 
 
550
  """Enhanced Attendance List processing with better detection"""
551
  replacements_made = 0
552
 
 
553
  attendance_patterns = [
554
  "attendance list",
555
  "names and position titles",
556
  "attendees"
557
  ]
558
 
 
559
  found_attendance_row = None
560
+ for row_idx, row in enumerate(table.rows[:3]):
 
561
  for cell_idx, cell in enumerate(row.cells):
562
  cell_text = get_clean_text(cell).lower()
 
 
563
  if any(pattern in cell_text for pattern in attendance_patterns):
564
  found_attendance_row = row_idx
565
  print(f" 🎯 ENHANCED: Found Attendance List in row {row_idx + 1}, cell {cell_idx + 1}")
566
  break
 
567
  if found_attendance_row is not None:
568
  break
569
 
570
  if found_attendance_row is None:
571
  return 0
572
 
 
573
  attendance_value = None
574
  attendance_search_keys = [
575
  "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
 
591
  print(f" ❌ No attendance data found in JSON")
592
  return 0
593
 
 
594
  target_cell = None
 
595
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
596
 
597
  for row_idx, row in enumerate(table.rows):
 
599
  if has_red_text(cell):
600
  print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
601
 
 
602
  red_text = ""
603
  for paragraph in cell.paragraphs:
604
  for run in paragraph.runs:
605
  if is_red(run):
606
  red_text += run.text
607
 
608
+ print(f" πŸ“‹ Red text content: '{red_text[:80]}...'")
609
 
 
610
  red_text_lower = red_text.lower()
611
+ if any(indicator in red_text_lower for indicator in ['manager', '–', '-']):
612
  target_cell = cell
613
  print(f" βœ… This looks like attendance data - using this cell")
614
  break
 
615
  if target_cell is not None:
616
  break
617
 
 
618
  if target_cell is None:
619
  print(f" ⚠️ No red text found that looks like attendance data")
620
  return 0
621
 
 
622
  if has_red_text(target_cell):
623
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
624
 
 
625
  if isinstance(attendance_value, list):
626
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
627
  else:
 
631
  for i, item in enumerate(attendance_list):
632
  print(f" {i+1}. {item}")
633
 
 
634
  replacement_text = "\n".join(attendance_list)
635
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
636
  replacements_made += cell_replacements
 
641
  return replacements_made
642
 
643
def fix_management_summary_details_column(table, flat_json):
    """Fix the DETAILS column in Management Summary tables (multi-management aware).

    Detects which management summary type(s) the table represents
    (Mass / Maintenance / Fatigue) from its first rows, locates the matching
    summary dict in *flat_json*, and fills the red-text DETAILS cells of the
    "Std 5. Verification" and "Std 6. Internal Review" rows.

    Args:
        table: python-docx Table object (assumed; duck-typed via .rows/.cells).
        flat_json: flattened JSON dict produced by flatten_json().

    Returns:
        int: number of red-text replacements performed.
    """
    replacements_made = 0

    print(f" 🎯 FIX: Management Summary DETAILS column processing")

    # Build a lowercase text sample from the first rows to detect the type(s).
    table_text = ""
    for row in table.rows[:3]:
        for cell in row.cells:
            table_text += get_clean_text(cell).lower() + " "

    # Identify which management types this table likely represents.
    mgmt_types = []
    if "mass management" in table_text or "mass" in table_text:
        mgmt_types.append("Mass Management Summary")
    if "maintenance management" in table_text or "maintenance" in table_text:
        mgmt_types.append("Maintenance Management Summary")
    if "fatigue management" in table_text or "fatigue" in table_text or "driver" in table_text:
        mgmt_types.append("Fatigue Management Summary")

    if not mgmt_types:
        # Fallback: fuzzy detection via presence of "Std 5" anywhere in the table.
        if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
            mgmt_types.append("Mass Management Summary")

    if not mgmt_types:
        return 0

    def _resolve_std_value(mgmt_data, exact_keys, fuzzy_terms):
        """Look up a standard's value: exact key variants first, then fuzzy key match."""
        for candidate in exact_keys:
            value = mgmt_data.get(candidate)
            if value is not None:
                return value
        for key, value in mgmt_data.items():
            if any(term in key.lower() for term in fuzzy_terms):
                return value
        return None

    # One spec per standard row, replacing the previous copy-pasted Std 5 / Std 6
    # branches: (row indicator terms, exact JSON keys, fuzzy key terms,
    #            canonical label, display name for logging).
    standards = [
        (("std 5", "verification"),
         ("Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"),
         ("std 5", "verification"),
         "Std 5. Verification",
         "Std 5/Verification"),
        (("std 6", "internal review"),
         ("Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"),
         ("std 6", "internal review"),
         "Std 6. Internal Review",
         "Std 6/Internal Review"),
    ]

    for mgmt_type in mgmt_types:
        print(f" βœ… Confirmed {mgmt_type} table processing")
        # Find the management summary dict in flat_json for this type.
        mgmt_data = flat_json.get(mgmt_type)
        if not isinstance(mgmt_data, dict):
            # Attempt suffix-based key match (e.g. any key containing
            # "mass"/"maintenance"/"fatigue" together with "summary").
            for key in flat_json.keys():
                if mgmt_type.split()[0].lower() in key.lower() and "summary" in key.lower():
                    mgmt_data = flat_json.get(key)
                    break
        if not isinstance(mgmt_data, dict):
            print(f" ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
            continue

        # Process rows looking for Std 5. and Std 6. details cells.
        for row in table.rows:
            if len(row.cells) < 2:
                continue
            details_cell = row.cells[1]
            standard_text = get_clean_text(row.cells[0]).strip().lower()

            for row_terms, exact_keys, fuzzy_terms, label, display in standards:
                if not any(term in standard_text for term in row_terms):
                    continue
                if not has_red_text(details_cell):
                    continue
                print(f" πŸ” Found {display} with red text")
                std_val = _resolve_std_value(mgmt_data, exact_keys, fuzzy_terms)
                if std_val is not None:
                    replacement_text = get_value_as_string(std_val, label)
                    cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                    replacements_made += cell_replacements
                    if cell_replacements:
                        print(f" βœ… Replaced {label} details for {mgmt_type}")

    return replacements_made
739
 
740
+ # Canonical operator declaration fixer (keeps original robust logic)
 
 
 
741
  def fix_operator_declaration_empty_values(table, flat_json):
 
 
 
 
 
 
742
  replacements_made = 0
743
 
744
  print(f" 🎯 FIX: Operator Declaration empty values processing")
745
 
 
746
  table_context = ""
747
  for row in table.rows:
748
  for cell in row.cells:
 
754
  print(f" βœ… Confirmed Operator Declaration table")
755
 
756
  def parse_name_and_position(value):
 
757
  if value is None:
758
  return None, None
 
 
759
  if isinstance(value, list):
760
  if len(value) == 0:
761
  return None, None
762
  if len(value) == 1:
763
  return str(value[0]).strip(), None
 
764
  first = str(value[0]).strip()
765
  second = str(value[1]).strip()
766
  if first and second:
 
771
  if not s:
772
  return None, None
773
 
 
774
  parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
775
  if len(parts) >= 2:
776
  left = parts[0].strip()
 
783
  return right, left
784
  return left, right
785
 
 
786
  tokens = s.split()
787
  if len(tokens) >= 2:
788
  last = tokens[-1]
 
791
  if any(ind == last.lower() for ind in role_indicators):
792
  return " ".join(tokens[:-1]), last
793
 
 
794
  return s, None
795
 
 
796
  for row_idx, row in enumerate(table.rows):
797
  if len(row.cells) >= 2:
798
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
 
811
  position_text = get_clean_text(position_cell).strip()
812
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
813
 
 
814
  name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
815
  if name_value is None:
816
  name_value = find_matching_json_value("Print Name", flat_json)
 
819
  if position_value is None:
820
  position_value = find_matching_json_value("Position Title", flat_json)
821
 
 
822
  parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
823
  parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
824
 
 
825
  final_name = None
826
  final_pos = None
827
 
 
830
  elif name_value is not None:
831
  final_name = get_value_as_string(name_value)
832
 
 
833
  if parsed_pos_from_posval:
834
  final_pos = parsed_pos_from_posval
835
  elif position_value is not None:
 
837
  elif parsed_pos_from_nameval:
838
  final_pos = parsed_pos_from_nameval
839
 
 
840
  if isinstance(final_name, list):
841
  final_name = " ".join(str(x) for x in final_name).strip()
842
  if isinstance(final_pos, list):
 
855
  return False
856
  return len(name_str) > 1
857
 
 
858
  if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
859
  if has_red_text(name_cell):
860
  replace_red_text_in_cell(name_cell, final_name)
 
863
  replacements_made += 1
864
  print(f" βœ… Updated Print Name -> '{final_name}'")
865
 
 
866
  if (not position_text or has_red_text(position_cell)) and final_pos:
867
  if has_red_text(position_cell):
868
  replace_red_text_in_cell(position_cell, final_pos)
 
873
 
874
  break
875
 
 
876
  if replacements_made > 0:
877
  try:
878
  setattr(table, "_processed_operator_declaration", True)
 
883
  return replacements_made
884
 
885
  def handle_multiple_red_segments_in_cell(cell, flat_json):
 
886
  replacements_made = 0
 
887
  red_segments = extract_red_text_segments(cell)
888
  if not red_segments:
889
  return 0
 
 
890
  for i, segment in enumerate(red_segments):
891
  segment_text = segment['text'].strip()
892
  if segment_text:
 
896
  if replace_single_segment(segment, replacement_text):
897
  replacements_made += 1
898
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
 
899
  return replacements_made
900
 
901
def handle_nature_business_multiline_fix(cell, flat_json):
    """Repair a multi-line 'Nature of Business' cell rendered in red text.

    Gathers all red-run text in *cell*; when it mentions a business-nature
    keyword (transport, logistics, ...), the red text is replaced with the
    "Nature of Business" value from *flat_json*.

    Returns:
        int: number of replacements performed (0 when nothing matched).
    """
    fixes = 0

    # Concatenate every red-colored run's text across all paragraphs.
    pieces = []
    for para in cell.paragraphs:
        pieces.extend(run.text for run in para.runs if is_red(run))
    red_content = "".join(pieces).strip()

    if not red_content:
        return 0

    keywords = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
    lowered = red_content.lower()
    if any(word in lowered for word in keywords):
        nature_value = find_matching_json_value("Nature of Business", flat_json)
        if nature_value is not None:
            new_text = get_value_as_string(nature_value, "Nature of Business")
            fixes += replace_red_text_in_cell(cell, new_text)
            print(f" βœ… Fixed Nature of Business multiline content")

    return fixes
920
 
921
  def handle_management_summary_fix(cell, flat_json):
 
922
  replacements_made = 0
 
 
923
  red_text = ""
924
  for paragraph in cell.paragraphs:
925
  for run in paragraph.runs:
926
  if is_red(run):
927
  red_text += run.text
 
928
  red_text = red_text.strip()
929
  if not red_text:
930
  return 0
 
 
931
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
 
932
  for mgmt_type in management_types:
933
  if mgmt_type in flat_json:
934
  mgmt_data = flat_json[mgmt_type]
935
  if isinstance(mgmt_data, dict):
 
936
  for std_key, std_value in mgmt_data.items():
937
  if isinstance(std_value, list) and std_value:
 
938
  if len(red_text) > 10:
939
  for item in std_value:
940
  if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
 
943
  replacements_made += cell_replacements
944
  print(f" βœ… Fixed {mgmt_type} - {std_key}")
945
  return replacements_made
 
946
  return replacements_made
947
 
948
+ # ============================================================================
949
  # SMALL OPERATOR/AUDITOR TABLE HANDLER (skip if already processed)
950
+ # ============================================================================
951
 
952
def handle_operator_declaration_fix(table, flat_json):
    """Handle small Operator/Auditor Declaration tables (at most 4 rows).

    Skips tables already handled (flagged via the ad-hoc
    ``_processed_operator_declaration`` attribute) and tables too large to be
    a declaration block, then delegates to
    fix_operator_declaration_empty_values().

    Args:
        table: python-docx Table object (duck-typed via .rows).
        flat_json: flattened JSON dict of replacement values.

    Returns:
        int: number of replacements made. Always an int — the previous
        version guarded the return with ``if replaced:`` and fell through
        returning None on a zero-replacement run, which breaks callers that
        accumulate counts with ``+=``.
    """
    replacements_made = 0

    if getattr(table, "_processed_operator_declaration", False):
        print(f" ⏭️ Skipping - Operator Declaration table already processed")
        return 0

    # Declaration tables are small; anything bigger is some other table type.
    if len(table.rows) > 4:
        return 0

    replaced = fix_operator_declaration_empty_values(table, flat_json)
    replacements_made += replaced
    # BUGFIX: return the count unconditionally (no `if replaced:` guard).
    return replacements_made
966
 
 
967
  def is_date_like(s: str) -> bool:
968
  if not s:
969
  return False
970
  s = s.strip()
 
971
  month_names = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)"
 
972
  if re.search(r"\bDate\b", s, re.IGNORECASE):
973
  return True
974
  if re.search(r"\b\d{1,2}(?:st|nd|rd|th)?\b\s+" + month_names, s, re.IGNORECASE):
 
979
  return True
980
  if re.search(r"\b\d{4}[\/\.\-]\d{1,2}[\/\.\-]\d{1,2}\b", s):
981
  return True
 
982
  if re.fullmatch(r"\d{4}", s):
983
  return True
984
  return False
 
987
  if not s:
988
  return False
989
  low = s.lower().strip()
 
990
  bad_terms = ["pty ltd", "p/l", "plc", "company", "farming", "farm", "trust", "ltd"]
991
  if any(bt in low for bt in bad_terms):
992
  return False
 
993
  if len(low) < 3:
994
  return False
995
  return bool(re.search(r"[a-zA-Z]", low))
 
1001
  roles = ["manager", "auditor", "owner", "director", "supervisor", "coordinator", "driver", "operator", "representative", "chief"]
1002
  return any(r in low for r in roles)
1003
 
 
1004
  print(f" 🎯 Processing other declaration table (fallback small-table behavior)")
1005
 
1006
  for row_idx, row in enumerate(table.rows):
1007
  for cell_idx, cell in enumerate(row.cells):
1008
  if not has_red_text(cell):
 
1009
  continue
1010
 
 
1011
  declaration_fields = [
1012
  "NHVAS Approved Auditor Declaration.Print Name",
1013
  "Auditor name",
 
1025
  if not replacement_text:
1026
  continue
1027
 
 
1028
  if is_date_like(replacement_text):
 
 
 
1029
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
1030
  if "date" not in red_text.lower():
1031
  print(f" ⚠️ Skipping date-like replacement for field '{field}' -> '{replacement_text[:30]}...'")
1032
  continue
1033
 
 
1034
  if (looks_like_person_name(replacement_text) or looks_like_position(replacement_text) or "signature" in field.lower() or "date" in field.lower()):
 
1035
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
1036
  if cell_replacements > 0:
1037
  replacements_made += cell_replacements
 
1039
  print(f" βœ… Fixed declaration field: {field} -> '{replacement_text}'")
1040
  break
1041
  else:
 
1042
  print(f" ⚠️ Replacement for field '{field}' does not look like name/role, skipping: '{replacement_text[:30]}...'")
1043
  continue
1044
 
 
1045
  if not replaced_this_cell:
1046
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip().lower()
1047
  if "signature" in red_text:
 
1050
  replacements_made += cell_replacements
1051
  print(f" βœ… Inserted placeholder [Signature]")
1052
  elif "date" in red_text:
 
1053
  date_value = find_matching_json_value("Date", flat_json) or find_matching_json_value("Date of Audit", flat_json) or find_matching_json_value("Audit was conducted on", flat_json)
1054
  if date_value is not None:
1055
  date_text = get_value_as_string(date_value)
1056
  if not is_date_like(date_text):
 
1057
  print(f" ⚠️ Found date-value but not date-like, skipping: '{date_text}'")
1058
  else:
1059
  cell_replacements = replace_red_text_in_cell(cell, date_text)
 
1061
  replacements_made += cell_replacements
1062
  print(f" βœ… Inserted date value: '{date_text}'")
1063
 
 
1064
  if replacements_made > 0:
1065
  try:
1066
  setattr(table, "_processed_operator_declaration", True)
 
1071
  return replacements_made
1072
 
1073
  def handle_print_accreditation_section(table, flat_json):
 
1074
  replacements_made = 0
1075
 
 
1076
  if getattr(table, "_processed_operator_declaration", False):
1077
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1078
  return 0
 
1079
 
 
1080
  table_context = ""
1081
  for row in table.rows:
1082
  for cell in row.cells:
1083
  table_context += get_clean_text(cell).lower() + " "
1084
 
 
1085
  if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
1086
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
1087
  return 0
 
1091
  for row_idx, row in enumerate(table.rows):
1092
  for cell_idx, cell in enumerate(row.cells):
1093
  if has_red_text(cell):
 
1094
  accreditation_fields = [
1095
  "(print accreditation name)",
1096
  "Operator name (Legal entity)",
1097
+ "Print accreditation name",
1098
+ "(print accreditation name)"
1099
  ]
1100
 
1101
  for field in accreditation_fields:
 
1112
  return replacements_made
1113
 
1114
  def process_single_column_sections(cell, key_text, flat_json):
 
1115
  replacements_made = 0
1116
 
1117
  if has_red_text(cell):
 
1122
  red_text += run.text
1123
 
1124
  if red_text.strip():
 
1125
  section_value = find_matching_json_value(red_text.strip(), flat_json)
1126
  if section_value is None:
 
1127
  section_value = find_matching_json_value(key_text, flat_json)
1128
 
1129
  if section_value is not None:
 
1146
  for table_idx, table in enumerate(document.tables):
1147
  print(f"\nπŸ” Processing table {table_idx + 1}:")
1148
 
1149
+ # collect brief context
1150
  table_text = ""
1151
  for row in table.rows[:3]:
1152
  for cell in row.cells:
1153
  table_text += get_clean_text(cell).lower() + " "
1154
 
1155
+ # detect management summary & details column
1156
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
1157
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
1158
  has_details = "details" in table_text
 
1167
  for row_idx, row in enumerate(table.rows):
1168
  for cell_idx, cell in enumerate(row.cells):
1169
  if has_red_text(cell):
1170
+ # Try direct matching with new schema names first
1171
  for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
1172
  if mgmt_type.lower().replace(" summary", "") in table_text:
 
1173
  if mgmt_type in flat_json:
1174
  mgmt_data = flat_json[mgmt_type]
1175
  if isinstance(mgmt_data, dict):
 
1193
  continue
1194
 
1195
  # Detect Vehicle Registration tables
1196
+ vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension", "registration"]
1197
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
1198
  if indicator_count >= 2:
1199
  print(f" πŸš— Detected Vehicle Registration table")
 
1212
  print_accreditation_indicators = ["print name", "position title"]
1213
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
1214
 
 
1215
  if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
1216
  print(f" πŸ“‹ Detected Print Accreditation/Operator Declaration table")
 
 
1217
  declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1218
  replacements_made += declaration_fixes
1219
 
 
1220
  if not getattr(table, "_processed_operator_declaration", False):
1221
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1222
  replacements_made += print_accreditation_replacements
1223
 
1224
  continue
1225
 
1226
+ # Process regular table rows (original logic preserved)
1227
  for row_idx, row in enumerate(table.rows):
1228
  if len(row.cells) < 1:
1229
  continue
 
1241
  if json_value is not None:
1242
  replacement_text = get_value_as_string(json_value, key_text)
1243
 
 
1244
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1245
  cell_replacements = handle_australian_company_number(row, json_value)
1246
  replacements_made += cell_replacements
1247
 
 
1248
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1249
  print(f" βœ… Section header detected, checking next row...")
1250
  next_row = table.rows[row_idx + 1]
 
1251
  for cell_idx, cell in enumerate(next_row.cells):
1252
  if has_red_text(cell):
1253
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
 
1258
  if cell_replacements > 0:
1259
  print(f" -> Replaced section content")
1260
 
 
1261
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1262
  if has_red_text(key_cell):
1263
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1264
  replacements_made += cell_replacements
1265
 
 
1266
  else:
1267
  for cell_idx in range(1, len(row.cells)):
1268
  value_cell = row.cells[cell_idx]
 
1272
  replacements_made += cell_replacements
1273
 
1274
  else:
 
1275
  if len(row.cells) == 1 and has_red_text(key_cell):
1276
  red_text = ""
1277
  for paragraph in key_cell.paragraphs:
 
1285
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1286
  replacements_made += cell_replacements
1287
 
 
1288
  for cell_idx in range(len(row.cells)):
1289
  cell = row.cells[cell_idx]
1290
  if has_red_text(cell):
1291
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1292
  replacements_made += cell_replacements
1293
 
 
1294
  if cell_replacements == 0:
1295
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1296
  replacements_made += surgical_fix
 
1299
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1300
  replacements_made += management_summary_fix
1301
 
1302
+ # Final declaration checks on last few tables
1303
  print(f"\n🎯 Final check for Declaration tables...")
1304
  for table in document.tables[-3:]:
1305
  if len(table.rows) <= 4:
 
1325
  json_value = find_matching_json_value(red_text_only, flat_json)
1326
 
1327
  if json_value is None:
 
1328
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1329
  json_value = find_matching_json_value("auditor signature", flat_json)
1330
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
 
1350
 
1351
  for para_idx, paragraph in enumerate(paragraphs):
1352
  paragraph_text = paragraph.text.strip()
 
1353
  if not paragraph_text:
1354
  continue
1355
 
 
1356
  matched_heading = None
1357
  for category, patterns in HEADING_PATTERNS.items():
1358
  for pattern in patterns:
 
1365
  if matched_heading:
1366
  print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
1367
 
 
1368
  if has_red_text_in_paragraph(paragraph):
1369
  print(f" πŸ”΄ Found red text in heading itself")
1370
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1371
  replacements_made += heading_replacements
1372
 
 
1373
  for next_para_offset in range(1, 6):
1374
  next_para_idx = para_idx + next_para_offset
1375
  if next_para_idx >= len(paragraphs):
 
1381
  if not next_text:
1382
  continue
1383
 
 
1384
  is_another_heading = False
1385
  for category, patterns in HEADING_PATTERNS.items():
1386
  for pattern in patterns:
 
1393
  if is_another_heading:
1394
  break
1395
 
 
1396
  if has_red_text_in_paragraph(next_paragraph):
1397
  print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
 
1398
  context_replacements = process_red_text_in_paragraph(
1399
  next_paragraph,
1400
  paragraph_text,
 
1420
  print(f" πŸ” Red text found: '{combined_red_text}'")
1421
 
1422
  json_value = None
 
 
1423
  json_value = find_matching_json_value(combined_red_text, flat_json)
1424
 
 
1425
  if json_value is None:
1426
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
1427
  auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
 
1439
  print(f" βœ… Found operator match with field: '{field}'")
1440
  break
1441
 
 
1442
  if json_value is None:
1443
  context_queries = [
1444
  f"{context_text} {combined_red_text}",
 
1452
  print(f" βœ… Found match with combined query")
1453
  break
1454
 
 
1455
  if json_value is not None:
1456
  replacement_text = get_value_as_string(json_value, combined_red_text)
 
1457
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1458
  if red_runs:
1459
  red_runs[0].text = replacement_text
1460
  red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
 
1461
  for run in red_runs[1:]:
1462
  run.text = ''
 
1463
  replacements_made = 1
1464
  print(f" βœ… Replaced with: '{replacement_text}'")
1465
  else:
 
1467
 
1468
  return replacements_made
1469
 
1470
+ # ============================================================================
1471
+ # Main process function
1472
+ # ============================================================================
1473
 
1474
  def process_hf(json_file, docx_file, output_file):
1475
  """Main processing function with comprehensive error handling"""
 
1501
  paragraph_replacements = process_paragraphs(doc, flat_json)
1502
  heading_replacements = process_headings(doc, flat_json)
1503
 
1504
+ total_replacements = table_replacements + paragraph_replacements + heading_replacements
 
1505
 
1506
+ # Save unmatched headers for iterative improvement
1507
+ if _unmatched_headers:
1508
+ try:
1509
+ tmp_path = "/tmp/unmatched_headers.json"
1510
+ with open(tmp_path, 'w', encoding='utf-8') as f:
1511
+ json.dump(_unmatched_headers, f, indent=2, ensure_ascii=False)
1512
+ print(f"βœ… Unmatched headers saved to {tmp_path}")
1513
+ except Exception as e:
1514
+ print(f"⚠️ Could not save unmatched headers: {e}")
1515
 
1516
+ # Save output docx
1517
  if hasattr(output_file, "write"):
1518
  doc.save(output_file)
1519
  else:
1520
+ # If output path is a file path string
1521
  doc.save(output_file)
1522
 
1523
  print(f"\nβœ… Document saved as: {output_file}")
 
1525
  print(f" πŸ“Š Tables: {table_replacements}")
1526
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1527
  print(f" πŸ“‹ Headings: {heading_replacements}")
 
1528
  print(f"πŸŽ‰ Processing complete!")
1529
 
1530
  except FileNotFoundError as e:
 
1534
  import traceback
1535
  traceback.print_exc()
1536
 
1537
+ # ============================================================================
1538
+ # CLI entrypoint
1539
+ # ============================================================================
1540
+
1541
  if __name__ == "__main__":
1542
  import sys
1543
  if len(sys.argv) != 4: