Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 commited on Aug 22

Commit

d77de54

verified ·

1 Parent(s): 8bbc7e5

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +270 -517

updated_word.py CHANGED Viewed

@@ -1,28 +1,24 @@
 #!/usr/bin/env python3
 """
-Updated pipeline.py
-Merged improvements:
- - removed duplicate functions
- - table processed-marker to avoid multiple handlers clobbering the same table
- - stricter detection of print-accreditation/operator-declaration tables
- - safer force replacement (avoid short->long mapping)
- - prefer exact qualified keys for Print Name / Position Title lookups
- - preserved all other logic and prints/logging
- - ADDED: header normalization, context-aware vehicle JSON selection,
-          management summary scoping, unmatched-headers logging
 """
 import json
 from docx import Document
 from docx.shared import RGBColor
 import re
-from typing import Any
-import os
 # ============================================================================
-# Configuration / Heading patterns for document structure detection
 # ============================================================================
 HEADING_PATTERNS = {
     "main": [
         r"NHVAS\s+Audit\s+Summary\s+Report",
@@ -47,19 +43,14 @@ HEADING_PATTERNS = {
 }
 # ============================================================================
-# State for unmatched headers (for iterative improvement)
 # ============================================================================
 _unmatched_headers = {}
 def record_unmatched_header(header: str):
     if not header:
         return
     _unmatched_headers[header] = _unmatched_headers.get(header, 0) + 1
-# ============================================================================
-# UTILITY FUNCTIONS
-# ============================================================================
 def load_json(filepath):
     with open(filepath, 'r', encoding='utf-8') as file:
         return json.load(file)
@@ -89,10 +80,10 @@ def get_value_as_string(value, field_name=""):
         elif len(value) == 1:
             return str(value[0])
         else:
             if "australian company number" in field_name.lower() or "company number" in field_name.lower():
                 return value
-            else:
-                return " ".join(str(v) for v in value)
     else:
         return str(value)
@@ -116,112 +107,124 @@ def has_red_text_in_paragraph(paragraph):
             return True
     return False
-# New helper: normalize header text (removes parentheticals, punctuation, etc.)
 def normalize_header_text(s: str) -> str:
     if not s:
         return ""
-    # remove parenthetical content
-    s = re.sub(r'\([^)]*\)', ' ', s)
-    # replace slashes
     s = s.replace("/", " ")
-    # remove punctuation except # and %
     s = re.sub(r'[^\w\s\#\%]', ' ', s)
     s = re.sub(r'\s+', ' ', s).strip().lower()
-    # common canonicalizations
     s = s.replace('registrationno', 'registration number')
     s = s.replace('registrationnumber', 'registration number')
-    s = s.replace('sub contracted', 'sub contractor')
     s = s.replace('sub-contractor', 'sub contractor')
-    s = s.replace('date range', '')
-    s = s.replace('applicable for entry audit', '')
-    s = s.strip()
-    return s
 # ============================================================================
-# JSON MATCHING FUNCTIONS
 # ============================================================================
 def find_matching_json_value(field_name, flat_json):
-    """Find matching value in JSON with multiple strategies"""
     field_name = (field_name or "").strip()
     if not field_name:
         return None
-    # Try exact match first
     if field_name in flat_json:
         print(f"    ✅ Direct match found for key '{field_name}'")
-        return flat_json[field_name]
-    # Case-insensitive exact match
     for key, value in flat_json.items():
         if key.lower() == field_name.lower():
-            print(f"    ✅ Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
-            return value
-    # Better Print Name detection for operator vs auditor
     if field_name.lower().strip() == "print name":
         operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
         auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
         if operator_keys:
             print(f"    ✅ Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
-            return flat_json[operator_keys[0]]
         elif auditor_keys:
             print(f"    ✅ Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
-            return flat_json[auditor_keys[0]]
-    # Suffix matching for nested keys
     for key, value in flat_json.items():
         if '.' in key and key.split('.')[-1].lower() == field_name.lower():
-            print(f"    ✅ Suffix match found for key '{field_name}' with JSON key '{key}'")
-            return value
-    # Clean & exact match attempt
     clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
     clean_field = re.sub(r'\s+', ' ', clean_field)
     for key, value in flat_json.items():
         clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
         clean_key = re.sub(r'\s+', ' ', clean_key)
         if clean_field == clean_key:
-            print(f"    ✅ Clean match found for key '{field_name}' with JSON key '{key}'")
-            return value
-    # Enhanced fuzzy matching with word-token scoring
     field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
     if not field_words:
         return None
-    best_match = None
-    best_score = 0
     best_key = None
     for key, value in flat_json.items():
         key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
         if not key_words:
             continue
-        common_words = field_words.intersection(key_words)
-        if common_words:
-            similarity = len(common_words) / len(field_words.union(key_words))
-            coverage = len(common_words) / len(field_words)
             final_score = (similarity * 0.6) + (coverage * 0.4)
-            if final_score > best_score:
-                best_score = final_score
-                best_match = value
-                best_key = key
-    if best_match and best_score >= 0.25:
         print(f"    ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
-        return best_match
     print(f"    ❌ No match found for '{field_name}'")
     return None
 # ============================================================================
-# RED TEXT PROCESSING FUNCTIONS
 # ============================================================================
 def extract_red_text_segments(cell):
     red_segments = []
     for para_idx, paragraph in enumerate(cell.paragraphs):
@@ -234,52 +237,37 @@ def extract_red_text_segments(cell):
                 segment_runs.append((para_idx, run_idx, run))
             else:
                 if segment_runs:
-                    red_segments.append({
-                        'text': current_segment,
-                        'runs': segment_runs.copy(),
-                        'paragraph_idx': para_idx
-                    })
                     current_segment = ""
                     segment_runs = []
         if segment_runs:
-            red_segments.append({
-                'text': current_segment,
-                'runs': segment_runs.copy(),
-                'paragraph_idx': para_idx
-            })
     return red_segments
 def replace_all_red_segments(red_segments, replacement_text):
     if not red_segments:
         return 0
     if '\n' in replacement_text:
         replacement_lines = replacement_text.split('\n')
     else:
         replacement_lines = [replacement_text]
     replacements_made = 0
-    if red_segments and replacement_lines:
-        first_segment = red_segments[0]
-        if first_segment['runs']:
-            first_run = first_segment['runs'][0][2]
-            first_run.text = replacement_lines[0]
-            first_run.font.color.rgb = RGBColor(0, 0, 0)
-            replacements_made = 1
-            for _, _, run in first_segment['runs'][1:]:
-                run.text = ''
     for segment in red_segments[1:]:
         for _, _, run in segment['runs']:
             run.text = ''
     if len(replacement_lines) > 1 and red_segments:
         try:
             first_run = red_segments[0]['runs'][0][2]
             paragraph = first_run.element.getparent()
             from docx.oxml import OxmlElement
-            parent = first_run.element.getparent()
             for line in replacement_lines[1:]:
                 if line.strip():
                     br = OxmlElement('w:br')
@@ -291,7 +279,6 @@ def replace_all_red_segments(red_segments, replacement_text):
                 first_run = red_segments[0]['runs'][0][2]
                 first_run.text = ' '.join(replacement_lines)
                 first_run.font.color.rgb = RGBColor(0, 0, 0)
     return replacements_made
 def replace_single_segment(segment, replacement_text):
@@ -311,7 +298,7 @@ def replace_red_text_in_cell(cell, replacement_text):
     return replace_all_red_segments(red_segments, replacement_text)
 # ============================================================================
-# SPECIALIZED TABLE HANDLERS
 # ============================================================================
 def handle_australian_company_number(row, company_numbers):
@@ -327,57 +314,27 @@ def handle_australian_company_number(row, company_numbers):
     return replacements_made
 def handle_vehicle_registration_table(table, flat_json):
-    """Handle vehicle registration table data replacement (improved header normalization and context-aware selection)"""
     replacements_made = 0
-    # build a table_text context (used to find mass/maintenance/fatigue)
-    table_text = ""
-    for r in table.rows[:3]:
-        for c in r.cells:
-            table_text += get_clean_text(c).lower() + " "
-    # 1) Detect the most relevant vehicle-related JSON section using context tokens
     vehicle_section = None
-    context_tokens = []
-    if "mass" in table_text:
-        context_tokens.append("mass")
-    if "maintenance" in table_text:
-        context_tokens.append("maintenance")
-    if "fatigue" in table_text or "driver" in table_text or "scheduler" in table_text:
-        context_tokens.append("fatigue")
-    # candidate keys that mention 'registration' or 'vehicle'
-    candidates = []
-    for key, value in flat_json.items():
-        k = key.lower()
-        if "registration" in k or "vehicle registration" in k or "vehicle" in k:
-            candidates.append((key, value))
-    # prefer candidates whose key contains one of the context tokens
-    if candidates and context_tokens:
-        for token in context_tokens:
-            for k, v in candidates:
-                if token in k.lower():
-                    vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
-                    print(f"    ✅ Found vehicle data by context token '{token}' in key '{k}'")
-                    break
-            if vehicle_section:
-                break
-    # fallback: choose candidate containing 'registration' explicitly
-    if vehicle_section is None and candidates:
-        for k, v in candidates:
-            if "registration" in k.lower():
-                vehicle_section = v if isinstance(v, (list, dict)) else {k: v}
-                print(f"    ✅ Fallback vehicle data chosen from '{k}'")
-                break
     # fallback: collect flattened keys that look like vehicle columns
     if vehicle_section is None:
         potential_columns = {}
         for key, value in flat_json.items():
             lk = key.lower()
-            if any(col_name in lk for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "suspension", "daily checks", "fault recording", "fault repair", "roadworthiness"]):
                 if "." in key:
                     column_name = key.split(".")[-1]
                 else:
@@ -391,42 +348,36 @@ def handle_vehicle_registration_table(table, flat_json):
         print(f"    ❌ Vehicle registration data not found in JSON")
         return 0
-    # ensure vehicle_section is a dict mapping column_name -> list/value
     if isinstance(vehicle_section, list):
-        # if a list of dicts, attempt to flatten into columns
         if vehicle_section and isinstance(vehicle_section[0], dict):
             flattened = {}
             for entry in vehicle_section:
                 for k, v in entry.items():
                     flattened.setdefault(k, []).append(v)
             vehicle_section = flattened
     if not isinstance(vehicle_section, dict):
-        # convert single scalar to dict
         try:
             vehicle_section = dict(vehicle_section)
         except Exception:
-            vehicle_section = {str(k): v for k, v in (vehicle_section.items() if isinstance(vehicle_section, dict) else [])}
     print(f"    ✅ Found vehicle registration data with {len(vehicle_section)} columns")
-    # Find header row index by searching for a row that contains 'registration' + 'number'
     header_row_idx = -1
     header_row = None
     for row_idx, row in enumerate(table.rows):
         row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
-        if "registration" in row_text and "number" in row_text:
             header_row_idx = row_idx
             header_row = row
             break
-    if header_row_idx == -1:
-        # try alternative detection: a row with 'registration' or 'reg no'
-        for row_idx, row in enumerate(table.rows):
-            row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
-            if "registration" in row_text or "reg no" in row_text or "regno" in row_text:
-                header_row_idx = row_idx
-                header_row = row
-                break
     if header_row_idx == -1:
         print(f"    ❌ Could not find header row in vehicle table")
@@ -434,34 +385,26 @@ def handle_vehicle_registration_table(table, flat_json):
     print(f"    ✅ Found header row at index {header_row_idx}")
-    # Enhanced column mapping: normalize both header and candidate keys, token overlap scoring
-    column_mapping = {}
-    # build normalized master map from vehicle_section keys
     master_labels = {}
     for orig_key in vehicle_section.keys():
         norm = normalize_header_text(str(orig_key))
         if norm:
-            master_labels.setdefault(norm, orig_key)
-    # add fallback synonyms for common labels (preserve existing)
-    fallback_synonyms = [
-        "no", "registration number", "reg no", "registration", "sub contractor", "sub-contractor",
-        "sub contracted", "weight verification records", "rfs suspension certification", "suspension system maintenance",
-        "trip records", "fault recording reporting", "daily checks", "roadworthiness certificates",
-        "maintenance records", "fault repair"
-    ]
-    for syn in fallback_synonyms:
-        norm = normalize_header_text(syn)
-        if norm and norm not in master_labels:
-            master_labels.setdefault(norm, syn)
-    # map header cells
     for col_idx, cell in enumerate(header_row.cells):
         header_text = get_clean_text(cell).strip()
         if not header_text:
             continue
-        # skip 'No.' column mapping attempts in many templates
-        if header_text.strip().lower() in {"no", "no.", "#"}:
             continue
         norm_header = normalize_header_text(header_text)
@@ -473,7 +416,7 @@ def handle_vehicle_registration_table(table, flat_json):
             best_match = master_labels[norm_header]
             best_score = 1.0
         else:
-            # token overlap scoring
             header_tokens = set(t for t in norm_header.split() if len(t) > 2)
             for norm_key, orig_label in master_labels.items():
                 key_tokens = set(t for t in norm_key.split() if len(t) > 2)
@@ -483,7 +426,7 @@ def handle_vehicle_registration_table(table, flat_json):
                 if common:
                     score = len(common) / max(1, len(header_tokens.union(key_tokens)))
                 else:
-                    # substring fallback
                     if norm_header in norm_key or norm_key in norm_header:
                         score = min(len(norm_header), len(norm_key)) / max(len(norm_header), len(norm_key))
                     else:
@@ -492,18 +435,26 @@ def handle_vehicle_registration_table(table, flat_json):
                     best_score = score
                     best_match = orig_label
         if best_match and best_score >= 0.30:
             column_mapping[col_idx] = best_match
-            print(f"      📌 Column {col_idx}: '{header_text}' -> '{best_match}' (norm: '{norm_header}', score: {best_score:.2f})")
         else:
-            print(f"      ⚠️ No mapping found for '{header_text}' (norm: '{norm_header}')")
             record_unmatched_header(header_text)
     if not column_mapping:
         print(f"    ❌ No column mappings found")
         return 0
-    # Determine number of rows to populate
     max_data_rows = 0
     for json_key, data in vehicle_section.items():
         if isinstance(data, list):
@@ -511,51 +462,39 @@ def handle_vehicle_registration_table(table, flat_json):
     print(f"    📌 Need to populate {max_data_rows} data rows")
-    # Fill or add rows as needed
     for data_row_index in range(max_data_rows):
         table_row_idx = header_row_idx + 1 + data_row_index
         if table_row_idx >= len(table.rows):
-            print(f"    ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
-            print(f"    ➕ Adding new row for vehicle {data_row_index + 1}")
-            new_row = table.add_row()
-            print(f"    ✅ Successfully added row {len(table.rows)} to the table")
         row = table.rows[table_row_idx]
         print(f"    📌 Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
         for col_idx, json_key in column_mapping.items():
             if col_idx < len(row.cells):
                 cell = row.cells[col_idx]
                 column_data = vehicle_section.get(json_key, [])
                 if isinstance(column_data, list) and data_row_index < len(column_data):
                     replacement_value = str(column_data[data_row_index])
                     cell_text = get_clean_text(cell)
                     if has_red_text(cell) or not cell_text.strip():
                         if not cell_text.strip():
                             cell.text = replacement_value
                             replacements_made += 1
-                            print(f"      -> Added '{replacement_value}' to empty cell (column '{json_key}')")
                         else:
                             cell_replacements = replace_red_text_in_cell(cell, replacement_value)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
-                                print(f"      -> Replaced red text with '{replacement_value}' (column '{json_key}')")
     return replacements_made
 def handle_attendance_list_table_enhanced(table, flat_json):
-    """Enhanced Attendance List processing with better detection"""
     replacements_made = 0
-    attendance_patterns = [
-        "attendance list",
-        "names and position titles",
-        "attendees"
-    ]
     found_attendance_row = None
     for row_idx, row in enumerate(table.rows[:3]):
         for cell_idx, cell in enumerate(row.cells):
@@ -566,7 +505,6 @@ def handle_attendance_list_table_enhanced(table, flat_json):
                 break
         if found_attendance_row is not None:
             break
     if found_attendance_row is None:
         return 0
@@ -577,42 +515,38 @@ def handle_attendance_list_table_enhanced(table, flat_json):
         "attendance list",
         "attendees"
     ]
     print(f"    🔍 Searching for attendance data in JSON...")
     for search_key in attendance_search_keys:
-        attendance_value = find_matching_json_value(search_key, flat_json)
-        if attendance_value is not None:
-            print(f"    ✅ Found attendance data with key: '{search_key}'")
             print(f"    📊 Raw value: {attendance_value}")
             break
     if attendance_value is None:
         print(f"    ❌ No attendance data found in JSON")
         return 0
     target_cell = None
     print(f"    🔍 Scanning ALL cells in attendance table for red text...")
     for row_idx, row in enumerate(table.rows):
         for cell_idx, cell in enumerate(row.cells):
             if has_red_text(cell):
-                print(f"        🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
                 red_text = ""
                 for paragraph in cell.paragraphs:
                     for run in paragraph.runs:
                         if is_red(run):
                             red_text += run.text
-                print(f"        📋 Red text content: '{red_text[:80]}...'")
-                red_text_lower = red_text.lower()
-                if any(indicator in red_text_lower for indicator in ['manager', '–', '-']):
-                    target_cell = cell
-                    print(f"        ✅ This looks like attendance data - using this cell")
-                    break
-        if target_cell is not None:
             break
     if target_cell is None:
@@ -621,60 +555,44 @@ def handle_attendance_list_table_enhanced(table, flat_json):
     if has_red_text(target_cell):
         print(f"    🔧 Replacing red text with properly formatted attendance list...")
         if isinstance(attendance_value, list):
             attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
         else:
             attendance_list = [str(attendance_value).strip()]
         print(f"    📝 Attendance items to add:")
         for i, item in enumerate(attendance_list):
             print(f"        {i+1}. {item}")
         replacement_text = "\n".join(attendance_list)
         cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
         replacements_made += cell_replacements
         print(f"    ✅ Added {len(attendance_list)} attendance items")
         print(f"    📊 Replacements made: {cell_replacements}")
     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
-    """Fix the DETAILS column in Management Summary table (multi-management aware)."""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
-    # Build table text to detect management type(s)
     table_text = ""
     for row in table.rows[:3]:
         for cell in row.cells:
             table_text += get_clean_text(cell).lower() + " "
-    # Identify which management types this table likely represents
     mgmt_types = []
     if "mass management" in table_text or "mass" in table_text:
         mgmt_types.append("Mass Management Summary")
     if "maintenance management" in table_text or "maintenance" in table_text:
         mgmt_types.append("Maintenance Management Summary")
-    if "fatigue management" in table_text or "fatigue" in table_text or "driver" in table_text:
         mgmt_types.append("Fatigue Management Summary")
     if not mgmt_types:
-        # fallback: try fuzzy detection through headings or presence of "Std 5." etc.
         if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
             mgmt_types.append("Mass Management Summary")
     if not mgmt_types:
         return 0
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
-        # find data dict in flat_json for mgmt_type
         mgmt_data = flat_json.get(mgmt_type)
         if not isinstance(mgmt_data, dict):
-            # attempt suffix based keys in flat_json
             for key in flat_json.keys():
                 if mgmt_type.split()[0].lower() in key.lower() and "summary" in key.lower():
                     mgmt_data = flat_json.get(key)
@@ -682,26 +600,18 @@ def fix_management_summary_details_column(table, flat_json):
         if not isinstance(mgmt_data, dict):
             print(f"    ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
             continue
-        # Process rows looking for Std 5. and Std 6.
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) >= 2:
                 standard_cell = row.cells[0]
                 details_cell = row.cells[1]
                 standard_text = get_clean_text(standard_cell).strip().lower()
-                # Std 5.
                 if "std 5" in standard_text or "verification" in standard_text:
                     if has_red_text(details_cell):
-                        print(f"      🔍 Found Std 5/Verification with red text")
-                        # try to find the appropriate key in mgmt_data
                         std_val = None
-                        # exact key variants
                         for candidate in ("Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"):
                             std_val = mgmt_data.get(candidate)
                             if std_val is not None:
                                 break
-                        # fuzzy fallback
                         if std_val is None:
                             for k, v in mgmt_data.items():
                                 if 'std 5' in k.lower() or 'verification' in k.lower():
@@ -713,11 +623,8 @@ def fix_management_summary_details_column(table, flat_json):
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 5. Verification details for {mgmt_type}")
-                # Std 6.
                 if "std 6" in standard_text or "internal review" in standard_text:
                     if has_red_text(details_cell):
-                        print(f"      🔍 Found Std 6/Internal Review with red text")
                         std_val = None
                         for candidate in ("Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"):
                             std_val = mgmt_data.get(candidate)
@@ -734,23 +641,20 @@ def fix_management_summary_details_column(table, flat_json):
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 6. Internal Review details for {mgmt_type}")
     return replacements_made
-# Canonical operator declaration fixer (keeps original robust logic)
 def fix_operator_declaration_empty_values(table, flat_json):
     replacements_made = 0
     print(f"    🎯 FIX: Operator Declaration empty values processing")
     table_context = ""
     for row in table.rows:
         for cell in row.cells:
             table_context += get_clean_text(cell).lower() + " "
     if not ("print name" in table_context and "position title" in table_context):
         return 0
     print(f"    ✅ Confirmed Operator Declaration table")
     def parse_name_and_position(value):
@@ -761,16 +665,15 @@ def fix_operator_declaration_empty_values(table, flat_json):
                 return None, None
             if len(value) == 1:
                 return str(value[0]).strip(), None
             first = str(value[0]).strip()
             second = str(value[1]).strip()
             if first and second:
                 return first, second
             value = " ".join(str(v).strip() for v in value if str(v).strip())
         s = str(value).strip()
         if not s:
             return None, None
         parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
         if len(parts) >= 2:
             left = parts[0].strip()
@@ -782,7 +685,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
             if any(ind in left.lower() for ind in role_indicators) and not any(ind in right.lower() for ind in role_indicators):
                 return right, left
             return left, right
         tokens = s.split()
         if len(tokens) >= 2:
             last = tokens[-1]
@@ -790,35 +692,35 @@ def fix_operator_declaration_empty_values(table, flat_json):
                                'coordinator', 'driver', 'operator', 'representative', 'chief']
             if any(ind == last.lower() for ind in role_indicators):
                 return " ".join(tokens[:-1]), last
         return s, None
     for row_idx, row in enumerate(table.rows):
         if len(row.cells) >= 2:
             cell1_text = get_clean_text(row.cells[0]).strip().lower()
             cell2_text = get_clean_text(row.cells[1]).strip().lower()
             if "print name" in cell1_text and "position" in cell2_text:
                 print(f"      📌 Found header row at {row_idx + 1}")
                 if row_idx + 1 < len(table.rows):
                     data_row = table.rows[row_idx + 1]
                     if len(data_row.cells) >= 2:
                         name_cell = data_row.cells[0]
                         position_cell = data_row.cells[1]
                         name_text = get_clean_text(name_cell).strip()
                         position_text = get_clean_text(position_cell).strip()
                         print(f"      📋 Current values: Name='{name_text}', Position='{position_text}'")
-                        name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
-                        if name_value is None:
-                            name_value = find_matching_json_value("Print Name", flat_json)
-                        position_value = find_matching_json_value("Operator Declaration.Position Title", flat_json)
-                        if position_value is None:
-                            position_value = find_matching_json_value("Position Title", flat_json)
                         parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
                         parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
@@ -830,13 +732,43 @@ def fix_operator_declaration_empty_values(table, flat_json):
                         elif name_value is not None:
                             final_name = get_value_as_string(name_value)
-                        if parsed_pos_from_posval:
-                            final_pos = parsed_pos_from_posval
-                        elif position_value is not None:
-                            final_pos = get_value_as_string(position_value)
-                        elif parsed_pos_from_nameval:
-                            final_pos = parsed_pos_from_nameval
                         if isinstance(final_name, list):
                             final_name = " ".join(str(x) for x in final_name).strip()
                         if isinstance(final_pos, list):
@@ -853,8 +785,9 @@ def fix_operator_declaration_empty_values(table, flat_json):
                             low = name_str.lower()
                             if any(bp in low for bp in bad_phrases):
                                 return False
-                            return len(name_str) > 1
                         if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
                             if has_red_text(name_cell):
                                 replace_red_text_in_cell(name_cell, final_name)
@@ -863,7 +796,8 @@ def fix_operator_declaration_empty_values(table, flat_json):
                             replacements_made += 1
                             print(f"      ✅ Updated Print Name -> '{final_name}'")
-                        if (not position_text or has_red_text(position_cell)) and final_pos:
                             if has_red_text(position_cell):
                                 replace_red_text_in_cell(position_cell, final_pos)
                             else:
@@ -890,9 +824,9 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
     for i, segment in enumerate(red_segments):
         segment_text = segment['text'].strip()
         if segment_text:
-            json_value = find_matching_json_value(segment_text, flat_json)
-            if json_value is not None:
-                replacement_text = get_value_as_string(json_value, segment_text)
                 if replace_single_segment(segment, replacement_text):
                     replacements_made += 1
                     print(f"      ✅ Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
@@ -910,9 +844,9 @@ def handle_nature_business_multiline_fix(cell, flat_json):
         return 0
     nature_indicators = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
     if any(indicator in red_text.lower() for indicator in nature_indicators):
-        nature_value = find_matching_json_value("Nature of Business", flat_json)
-        if nature_value is not None:
-            replacement_text = get_value_as_string(nature_value, "Nature of Business")
             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
             replacements_made += cell_replacements
             print(f"      ✅ Fixed Nature of Business multiline content")
@@ -930,229 +864,85 @@ def handle_management_summary_fix(cell, flat_json):
         return 0
     management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
     for mgmt_type in management_types:
-        if mgmt_type in flat_json:
             mgmt_data = flat_json[mgmt_type]
-            if isinstance(mgmt_data, dict):
-                for std_key, std_value in mgmt_data.items():
-                    if isinstance(std_value, list) and std_value:
-                        if len(red_text) > 10:
-                            for item in std_value:
-                                if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
-                                    replacement_text = "\n".join(str(i) for i in std_value)
-                                    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
-                                    replacements_made += cell_replacements
-                                    print(f"      ✅ Fixed {mgmt_type} - {std_key}")
-                                    return replacements_made
-    return replacements_made
-# ============================================================================
-# SMALL OPERATOR/AUDITOR TABLE HANDLER (skip if already processed)
-# ============================================================================
-def handle_operator_declaration_fix(table, flat_json):
-    replacements_made = 0
-    if getattr(table, "_processed_operator_declaration", False):
-        print(f"    ⏭️ Skipping - Operator Declaration table already processed")
-        return 0
-    if len(table.rows) > 4:
-        return 0
-    replaced = fix_operator_declaration_empty_values(table, flat_json)
-    replacements_made += replaced
-    if replaced:
-        return replacements_made
-    def is_date_like(s: str) -> bool:
-        if not s:
-            return False
-        s = s.strip()
-        month_names = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)"
-        if re.search(r"\bDate\b", s, re.IGNORECASE):
-            return True
-        if re.search(r"\b\d{1,2}(?:st|nd|rd|th)?\b\s+" + month_names, s, re.IGNORECASE):
-            return True
-        if re.search(month_names + r".*\b\d{4}\b", s, re.IGNORECASE):
-            return True
-        if re.search(r"\b\d{1,2}[\/\.\-]\d{1,2}[\/\.\-]\d{2,4}\b", s):
-            return True
-        if re.search(r"\b\d{4}[\/\.\-]\d{1,2}[\/\.\-]\d{1,2}\b", s):
-            return True
-        if re.fullmatch(r"\d{4}", s):
-            return True
-        return False
-    def looks_like_person_name(s: str) -> bool:
-        if not s:
-            return False
-        low = s.lower().strip()
-        bad_terms = ["pty ltd", "p/l", "plc", "company", "farming", "farm", "trust", "ltd"]
-        if any(bt in low for bt in bad_terms):
-            return False
-        if len(low) < 3:
-            return False
-        return bool(re.search(r"[a-zA-Z]", low))
-    def looks_like_position(s: str) -> bool:
-        if not s:
-            return False
-        low = s.lower()
-        roles = ["manager", "auditor", "owner", "director", "supervisor", "coordinator", "driver", "operator", "representative", "chief"]
-        return any(r in low for r in roles)
-    print(f"    🎯 Processing other declaration table (fallback small-table behavior)")
-    for row_idx, row in enumerate(table.rows):
-        for cell_idx, cell in enumerate(row.cells):
-            if not has_red_text(cell):
-                continue
-            declaration_fields = [
-                "NHVAS Approved Auditor Declaration.Print Name",
-                "Auditor name",
-                "Signature",
-                "Date"
-            ]
-            replaced_this_cell = False
-            for field in declaration_fields:
-                field_value = find_matching_json_value(field, flat_json)
-                if field_value is None:
-                    continue
-                replacement_text = get_value_as_string(field_value, field).strip()
-                if not replacement_text:
-                    continue
-                if is_date_like(replacement_text):
-                    red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
-                    if "date" not in red_text.lower():
-                        print(f"      ⚠️ Skipping date-like replacement for field '{field}' -> '{replacement_text[:30]}...'")
-                        continue
-                if (looks_like_person_name(replacement_text) or looks_like_position(replacement_text) or "signature" in field.lower() or "date" in field.lower()):
-                    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
-                    if cell_replacements > 0:
-                        replacements_made += cell_replacements
-                        replaced_this_cell = True
-                        print(f"      ✅ Fixed declaration field: {field} -> '{replacement_text}'")
-                        break
-                else:
-                    print(f"      ⚠️ Replacement for field '{field}' does not look like name/role, skipping: '{replacement_text[:30]}...'")
-                    continue
-            if not replaced_this_cell:
-                red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip().lower()
-                if "signature" in red_text:
-                    cell_replacements = replace_red_text_in_cell(cell, "[Signature]")
-                    if cell_replacements > 0:
-                        replacements_made += cell_replacements
-                        print(f"      ✅ Inserted placeholder [Signature]")
-                elif "date" in red_text:
-                    date_value = find_matching_json_value("Date", flat_json) or find_matching_json_value("Date of Audit", flat_json) or find_matching_json_value("Audit was conducted on", flat_json)
-                    if date_value is not None:
-                        date_text = get_value_as_string(date_value)
-                        if not is_date_like(date_text):
-                            print(f"      ⚠️ Found date-value but not date-like, skipping: '{date_text}'")
-                        else:
-                            cell_replacements = replace_red_text_in_cell(cell, date_text)
-                            if cell_replacements > 0:
                                 replacements_made += cell_replacements
-                                print(f"      ✅ Inserted date value: '{date_text}'")
-    if replacements_made > 0:
-        try:
-            setattr(table, "_processed_operator_declaration", True)
-            print("    🔖 Marked table as processed by operator declaration fallback")
-        except Exception:
-            pass
     return replacements_made
 def handle_print_accreditation_section(table, flat_json):
     replacements_made = 0
     if getattr(table, "_processed_operator_declaration", False):
         print(f"    ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
         return 0
     table_context = ""
     for row in table.rows:
         for cell in row.cells:
             table_context += get_clean_text(cell).lower() + " "
     if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
         print(f"    ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
         return 0
     print(f"    📋 Processing Print Accreditation section")
     for row_idx, row in enumerate(table.rows):
         for cell_idx, cell in enumerate(row.cells):
             if has_red_text(cell):
                 accreditation_fields = [
                     "(print accreditation name)",
                     "Operator name (Legal entity)",
-                    "Print accreditation name",
-                    "(print accreditation name)"
                 ]
                 for field in accreditation_fields:
-                    field_value = find_matching_json_value(field, flat_json)
-                    if field_value is not None:
-                        replacement_text = get_value_as_string(field_value, field)
                         if replacement_text.strip():
                             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
-                                print(f"      ✅ Fixed accreditation: {field}")
                             break
     return replacements_made
 def process_single_column_sections(cell, key_text, flat_json):
     replacements_made = 0
     if has_red_text(cell):
         red_text = ""
         for paragraph in cell.paragraphs:
             for run in paragraph.runs:
                 if is_red(run):
                     red_text += run.text
         if red_text.strip():
-            section_value = find_matching_json_value(red_text.strip(), flat_json)
-            if section_value is None:
-                section_value = find_matching_json_value(key_text, flat_json)
-            if section_value is not None:
-                section_replacement = get_value_as_string(section_value, red_text.strip())
                 cell_replacements = replace_red_text_in_cell(cell, section_replacement)
                 replacements_made += cell_replacements
                 if cell_replacements > 0:
                     print(f"      ✅ Fixed single column section: '{key_text}'")
     return replacements_made
 # ============================================================================
-# MAIN TABLE/PARAGRAPH PROCESSING
 # ============================================================================
 def process_tables(document, flat_json):
-    """Process all tables in the document with comprehensive fixes"""
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
-        # collect brief context
         table_text = ""
         for row in table.rows[:3]:
             for cell in row.cells:
                 table_text += get_clean_text(cell).lower() + " "
-        # detect management summary & details column
         management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
         has_management = any(indicator in table_text for indicator in management_summary_indicators)
         has_details = "details" in table_text
@@ -1162,12 +952,10 @@ def process_tables(document, flat_json):
             summary_fixes = fix_management_summary_details_column(table, flat_json)
             replacements_made += summary_fixes
-            # Process remaining red text in management summary
             summary_replacements = 0
             for row_idx, row in enumerate(table.rows):
                 for cell_idx, cell in enumerate(row.cells):
                     if has_red_text(cell):
-                        # Try direct matching with new schema names first
                         for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
                             if mgmt_type.lower().replace(" summary", "") in table_text:
                                 if mgmt_type in flat_json:
@@ -1192,7 +980,7 @@ def process_tables(document, flat_json):
             replacements_made += summary_replacements
             continue
-        # Detect Vehicle Registration tables
         vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension", "registration"]
         indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
         if indicator_count >= 2:
@@ -1201,50 +989,46 @@ def process_tables(document, flat_json):
             replacements_made += vehicle_replacements
             continue
-        # Detect Attendance List tables
         if "attendance list" in table_text and "names and position titles" in table_text:
             print(f"    👥 Detected Attendance List table")
             attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
             replacements_made += attendance_replacements
             continue
-        # Detect Print Accreditation / Operator Declaration tables
         print_accreditation_indicators = ["print name", "position title"]
         indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
         if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
             print(f"    📋 Detected Print Accreditation/Operator Declaration table")
             declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
             replacements_made += declaration_fixes
             if not getattr(table, "_processed_operator_declaration", False):
                 print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
                 replacements_made += print_accreditation_replacements
             continue
-        # Process regular table rows (original logic preserved)
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
             key_cell = row.cells[0]
             key_text = get_clean_text(key_cell)
             if not key_text:
                 continue
             print(f"  📌 Row {row_idx + 1}: Key = '{key_text}'")
-            json_value = find_matching_json_value(key_text, flat_json)
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value, key_text)
                 if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                     cell_replacements = handle_australian_company_number(row, json_value)
                     replacements_made += cell_replacements
                 elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
                     print(f"    ✅ Section header detected, checking next row...")
                     next_row = table.rows[row_idx + 1]
@@ -1252,17 +1036,21 @@ def process_tables(document, flat_json):
                         if has_red_text(cell):
                             print(f"    ✅ Found red text in next row, cell {cell_idx + 1}")
                             if isinstance(json_value, list):
-                                replacement_text = "\n".join(str(item) for item in json_value)
-                            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
                                 print(f"    -> Replaced section content")
                 elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
                     if has_red_text(key_cell):
                         cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
                         replacements_made += cell_replacements
                 else:
                     for cell_idx in range(1, len(row.cells)):
                         value_cell = row.cells[cell_idx]
@@ -1272,6 +1060,7 @@ def process_tables(document, flat_json):
                             replacements_made += cell_replacements
             else:
                 if len(row.cells) == 1 and has_red_text(key_cell):
                     red_text = ""
                     for paragraph in key_cell.paragraphs:
@@ -1279,56 +1068,55 @@ def process_tables(document, flat_json):
                             if is_red(run):
                                 red_text += run.text
                     if red_text.strip():
-                        section_value = find_matching_json_value(red_text.strip(), flat_json)
-                        if section_value is not None:
-                            section_replacement = get_value_as_string(section_value, red_text.strip())
                             cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
                             replacements_made += cell_replacements
                 for cell_idx in range(len(row.cells)):
                     cell = row.cells[cell_idx]
                     if has_red_text(cell):
                         cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                         replacements_made += cell_replacements
                         if cell_replacements == 0:
                             surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
                             replacements_made += surgical_fix
                         if cell_replacements == 0:
                             management_summary_fix = handle_management_summary_fix(cell, flat_json)
                             replacements_made += management_summary_fix
-    # Final declaration checks on last few tables
     print(f"\n🎯 Final check for Declaration tables...")
     for table in document.tables[-3:]:
         if len(table.rows) <= 4:
             if getattr(table, "_processed_operator_declaration", False):
                 print(f"    ⏭️ Skipping - already processed by operator declaration handler")
                 continue
-            declaration_fix = handle_operator_declaration_fix(table, flat_json)
             replacements_made += declaration_fix
     return replacements_made
 def process_paragraphs(document, flat_json):
-    """Process all paragraphs in the document"""
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
     for para_idx, paragraph in enumerate(document.paragraphs):
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
             red_text_only = "".join(run.text for run in red_runs).strip()
             print(f"  📌 Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
-            json_value = find_matching_json_value(red_text_only, flat_json)
             if json_value is None:
                 if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
-                    json_value = find_matching_json_value("auditor signature", flat_json)
                 elif "OPERATOR SIGNATURE" in red_text_only.upper():
-                    json_value = find_matching_json_value("operator signature", flat_json)
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value)
@@ -1338,21 +1126,16 @@ def process_paragraphs(document, flat_json):
                 for run in red_runs[1:]:
                     run.text = ''
                 replacements_made += 1
     return replacements_made
 def process_headings(document, flat_json):
-    """Process headings and their related content"""
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
     paragraphs = document.paragraphs
     for para_idx, paragraph in enumerate(paragraphs):
         paragraph_text = paragraph.text.strip()
         if not paragraph_text:
             continue
         matched_heading = None
         for category, patterns in HEADING_PATTERNS.items():
             for pattern in patterns:
@@ -1361,26 +1144,20 @@ def process_headings(document, flat_json):
                     break
             if matched_heading:
                 break
         if matched_heading:
             print(f"  📌 Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
             if has_red_text_in_paragraph(paragraph):
                 print(f"    🔴 Found red text in heading itself")
                 heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
                 replacements_made += heading_replacements
             for next_para_offset in range(1, 6):
                 next_para_idx = para_idx + next_para_offset
                 if next_para_idx >= len(paragraphs):
                     break
                 next_paragraph = paragraphs[next_para_idx]
                 next_text = next_paragraph.text.strip()
                 if not next_text:
                     continue
                 is_another_heading = False
                 for category, patterns in HEADING_PATTERNS.items():
                     for pattern in patterns:
@@ -1389,10 +1166,8 @@ def process_headings(document, flat_json):
                             break
                     if is_another_heading:
                         break
                 if is_another_heading:
                     break
                 if has_red_text_in_paragraph(next_paragraph):
                     print(f"    🔴 Found red text in paragraph {next_para_idx + 1} after heading")
                     context_replacements = process_red_text_in_paragraph(
@@ -1401,55 +1176,46 @@ def process_headings(document, flat_json):
                         flat_json
                     )
                     replacements_made += context_replacements
     return replacements_made
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
-    """Process red text within a paragraph using context"""
     replacements_made = 0
     red_text_segments = []
     for run in paragraph.runs:
         if is_red(run) and run.text.strip():
             red_text_segments.append(run.text.strip())
     if not red_text_segments:
         return 0
     combined_red_text = " ".join(red_text_segments).strip()
     print(f"      🔍 Red text found: '{combined_red_text}'")
-    json_value = None
-    json_value = find_matching_json_value(combined_red_text, flat_json)
     if json_value is None:
         if "NHVAS APPROVED AUDITOR" in context_text.upper():
             auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
             for field in auditor_fields:
-                json_value = find_matching_json_value(field, flat_json)
-                if json_value is not None:
-                    print(f"      ✅ Found auditor match with field: '{field}'")
                     break
         elif "OPERATOR DECLARATION" in context_text.upper():
             operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
             for field in operator_fields:
-                json_value = find_matching_json_value(field, flat_json)
-                if json_value is not None:
-                    print(f"      ✅ Found operator match with field: '{field}'")
                     break
     if json_value is None:
-        context_queries = [
-            f"{context_text} {combined_red_text}",
-            combined_red_text,
-            context_text
-        ]
         for query in context_queries:
-            json_value = find_matching_json_value(query, flat_json)
-            if json_value is not None:
-                print(f"      ✅ Found match with combined query")
                 break
     if json_value is not None:
@@ -1468,13 +1234,10 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     return replacements_made
 # ============================================================================
-# Main process function
 # ============================================================================
 def process_hf(json_file, docx_file, output_file):
-    """Main processing function with comprehensive error handling"""
     try:
-        # Load JSON
         if hasattr(json_file, "read"):
             json_data = json.load(json_file)
         else:
@@ -1488,19 +1251,15 @@ def process_hf(json_file, docx_file, output_file):
                 print(f"  - {key}: {value}")
         print(f"  ... and {len(flat_json) - 10} more keys\n")
-        # Load DOCX
         if hasattr(docx_file, "read"):
             doc = Document(docx_file)
         else:
             doc = Document(docx_file)
-        # Process document with all fixes
         print("🚀 Starting comprehensive document processing...")
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements + heading_replacements
         # Save unmatched headers for iterative improvement
@@ -1513,11 +1272,9 @@ def process_hf(json_file, docx_file, output_file):
             except Exception as e:
                 print(f"⚠️ Could not save unmatched headers: {e}")
-        # Save output docx
         if hasattr(output_file, "write"):
             doc.save(output_file)
         else:
-            # If output path is a file path string
             doc.save(output_file)
         print(f"\n✅ Document saved as: {output_file}")
@@ -1534,10 +1291,6 @@ def process_hf(json_file, docx_file, output_file):
         import traceback
         traceback.print_exc()
-# ============================================================================
-# CLI entrypoint
-# ============================================================================
 if __name__ == "__main__":
     import sys
     if len(sys.argv) != 4:

 #!/usr/bin/env python3
 """
+pipeline.py — safer matching and operator-declaration protections
+Key improvements:
+ - find_matching_json_key_and_value() returns (key, value) so callers can accept/reject by key.
+ - Higher fuzzy thresholds for risky substitutions.
+ - Operator Declaration: avoid using attendance lists / unrelated keys for Position Title.
+ - Vehicle header mapping: stronger normalized substring/ token matching for long headers.
+ - Preserves existing logging and all previous handlers/logic.
 """
 import json
 from docx import Document
 from docx.shared import RGBColor
 import re
+from typing import Any, Tuple, Optional
 # ============================================================================
+# Heading patterns for document structure detection (unchanged)
 # ============================================================================
 HEADING_PATTERNS = {
     "main": [
         r"NHVAS\s+Audit\s+Summary\s+Report",
 }
 # ============================================================================
+# Utility helpers
 # ============================================================================
 _unmatched_headers = {}
 def record_unmatched_header(header: str):
     if not header:
         return
     _unmatched_headers[header] = _unmatched_headers.get(header, 0) + 1
 def load_json(filepath):
     with open(filepath, 'r', encoding='utf-8') as file:
         return json.load(file)
         elif len(value) == 1:
             return str(value[0])
         else:
+            # Keep lists intact for special patterns (e.g., ACN digits) but default to join
             if "australian company number" in field_name.lower() or "company number" in field_name.lower():
                 return value
+            return " ".join(str(v) for v in value)
     else:
         return str(value)
             return True
     return False
 def normalize_header_text(s: str) -> str:
     if not s:
         return ""
+    s = re.sub(r'\([^)]*\)', ' ', s)   # remove parenthetical content
     s = s.replace("/", " ")
     s = re.sub(r'[^\w\s\#\%]', ' ', s)
     s = re.sub(r'\s+', ' ', s).strip().lower()
+    # canonical tweaks
     s = s.replace('registrationno', 'registration number')
     s = s.replace('registrationnumber', 'registration number')
     s = s.replace('sub-contractor', 'sub contractor')
+    s = s.replace('sub contracted', 'sub contractor')
+    return s.strip()
 # ============================================================================
+# JSON matching functions
+#  - find_matching_json_value: (keeps behavior used elsewhere)
+#  - find_matching_json_key_and_value: returns (key, value) so callers can
+#    decide whether to use an entry based on the matched key.
 # ============================================================================
 def find_matching_json_value(field_name, flat_json):
+    """Legacy API: return value only (preserves existing callers)."""
+    result = find_matching_json_key_and_value(field_name, flat_json)
+    return result[1] if result else None
+def find_matching_json_key_and_value(field_name, flat_json) -> Optional[Tuple[str, Any]]:
+    """
+    Return (matched_key, matched_value) or None.
+    Safer thresholds: fuzzy matches require >=0.35 by default.
+    """
     field_name = (field_name or "").strip()
     if not field_name:
         return None
+    # Exact match
     if field_name in flat_json:
         print(f"    ✅ Direct match found for key '{field_name}'")
+        return field_name, flat_json[field_name]
+    # Case-insensitive exact
     for key, value in flat_json.items():
         if key.lower() == field_name.lower():
+            print(f"    ✅ Case-insensitive match found for key '{field_name}' -> '{key}'")
+            return key, value
+    # Special-case 'print name' preference for operator vs auditor (prefer fully-qualified)
     if field_name.lower().strip() == "print name":
         operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
         auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
         if operator_keys:
             print(f"    ✅ Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
+            return operator_keys[0], flat_json[operator_keys[0]]
         elif auditor_keys:
             print(f"    ✅ Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
+            return auditor_keys[0], flat_json[auditor_keys[0]]
+    # Suffix match for nested keys (e.g., 'section.field')
     for key, value in flat_json.items():
         if '.' in key and key.split('.')[-1].lower() == field_name.lower():
+            print(f"    ✅ Suffix match found for key '{field_name}' -> '{key}'")
+            return key, value
+    # Clean and exact
     clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
     clean_field = re.sub(r'\s+', ' ', clean_field)
     for key, value in flat_json.items():
         clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
         clean_key = re.sub(r'\s+', ' ', clean_key)
         if clean_field == clean_key:
+            print(f"    ✅ Clean match found for key '{field_name}' -> '{key}'")
+            return key, value
+    # Fuzzy matching with token scoring
     field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
     if not field_words:
         return None
     best_key = None
+    best_value = None
+    best_score = 0.0
     for key, value in flat_json.items():
         key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
         if not key_words:
             continue
+        common = field_words.intersection(key_words)
+        if not common:
+            # allow substring in normalized forms as a weaker fallback
+            norm_field = normalize_header_text(field_name)
+            norm_key = normalize_header_text(key)
+            if norm_field and norm_key and (norm_field in norm_key or norm_key in norm_field):
+                # substring score based on length ratio
+                substring_score = min(len(norm_field), len(norm_key)) / max(len(norm_field), len(norm_key))
+                final_score = 0.4 * substring_score
+            else:
+                final_score = 0.0
+        else:
+            similarity = len(common) / len(field_words.union(key_words))
+            coverage = len(common) / len(field_words)
             final_score = (similarity * 0.6) + (coverage * 0.4)
+        if final_score > best_score:
+            best_score = final_score
+            best_key = key
+            best_value = value
+    # Accept only reasonable fuzzy matches (threshold 0.35)
+    if best_key and best_score >= 0.35:
         print(f"    ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
+        return best_key, best_value
     print(f"    ❌ No match found for '{field_name}'")
     return None
 # ============================================================================
+# Red text helpers (unchanged except kept robust)
 # ============================================================================
 def extract_red_text_segments(cell):
     red_segments = []
     for para_idx, paragraph in enumerate(cell.paragraphs):
                 segment_runs.append((para_idx, run_idx, run))
             else:
                 if segment_runs:
+                    red_segments.append({'text': current_segment, 'runs': segment_runs.copy(), 'paragraph_idx': para_idx})
                     current_segment = ""
                     segment_runs = []
         if segment_runs:
+            red_segments.append({'text': current_segment, 'runs': segment_runs.copy(), 'paragraph_idx': para_idx})
     return red_segments
 def replace_all_red_segments(red_segments, replacement_text):
     if not red_segments:
         return 0
     if '\n' in replacement_text:
         replacement_lines = replacement_text.split('\n')
     else:
         replacement_lines = [replacement_text]
     replacements_made = 0
+    first_segment = red_segments[0]
+    if first_segment['runs']:
+        first_run = first_segment['runs'][0][2]
+        first_run.text = replacement_lines[0]
+        first_run.font.color.rgb = RGBColor(0, 0, 0)
+        replacements_made = 1
+        for _, _, run in first_segment['runs'][1:]:
+            run.text = ''
     for segment in red_segments[1:]:
         for _, _, run in segment['runs']:
             run.text = ''
     if len(replacement_lines) > 1 and red_segments:
         try:
             first_run = red_segments[0]['runs'][0][2]
             paragraph = first_run.element.getparent()
             from docx.oxml import OxmlElement
             for line in replacement_lines[1:]:
                 if line.strip():
                     br = OxmlElement('w:br')
                 first_run = red_segments[0]['runs'][0][2]
                 first_run.text = ' '.join(replacement_lines)
                 first_run.font.color.rgb = RGBColor(0, 0, 0)
     return replacements_made
 def replace_single_segment(segment, replacement_text):
     return replace_all_red_segments(red_segments, replacement_text)
 # ============================================================================
+# Specialized handlers (vehicle, attendance, management, operator) with fixes
 # ============================================================================
 def handle_australian_company_number(row, company_numbers):
     return replacements_made
 def handle_vehicle_registration_table(table, flat_json):
+    """
+    Stronger header normalization + substring matching for long headers.
+    Keeps existing behavior but reduces 'No mapping found' by using normalized substring matching.
+    """
     replacements_made = 0
+    # Build candidate vehicle_section similar to prior logic
     vehicle_section = None
+    # Prefer keys explicitly mentioning 'registration' or 'vehicle'
+    candidates = [(k, v) for k, v in flat_json.items() if 'registration' in k.lower() or 'vehicle' in k.lower()]
+    if candidates:
+        # prefer the one with longest key match (likely most specific)
+        candidates.sort(key=lambda kv: -len(kv[0]))
+        vehicle_section = candidates[0][1]
     # fallback: collect flattened keys that look like vehicle columns
     if vehicle_section is None:
         potential_columns = {}
         for key, value in flat_json.items():
             lk = key.lower()
+            if any(col_name in lk for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "fault recording", "fault repair", "daily checks", "roadworthiness"]):
                 if "." in key:
                     column_name = key.split(".")[-1]
                 else:
         print(f"    ❌ Vehicle registration data not found in JSON")
         return 0
+    # Normalize vehicle_section into dict of column_label -> list/value
     if isinstance(vehicle_section, list):
+        # if list of dicts, pivot
         if vehicle_section and isinstance(vehicle_section[0], dict):
             flattened = {}
             for entry in vehicle_section:
                 for k, v in entry.items():
                     flattened.setdefault(k, []).append(v)
             vehicle_section = flattened
+        else:
+            # can't interpret, bail
+            vehicle_section = {}
     if not isinstance(vehicle_section, dict):
         try:
             vehicle_section = dict(vehicle_section)
         except Exception:
+            vehicle_section = {}
     print(f"    ✅ Found vehicle registration data with {len(vehicle_section)} columns")
+    # Find header row (look for registration + number or reg no)
     header_row_idx = -1
     header_row = None
     for row_idx, row in enumerate(table.rows):
         row_text = " ".join(get_clean_text(cell).lower() for cell in row.cells)
+        if ("registration" in row_text and "number" in row_text) or "reg no" in row_text or "registration no" in row_text:
             header_row_idx = row_idx
             header_row = row
             break
     if header_row_idx == -1:
         print(f"    ❌ Could not find header row in vehicle table")
     print(f"    ✅ Found header row at index {header_row_idx}")
+    # Build master labels from vehicle_section keys
     master_labels = {}
     for orig_key in vehicle_section.keys():
         norm = normalize_header_text(str(orig_key))
         if norm:
+            # if there is collision, prefer longer orig_key (more specific)
+            if norm in master_labels:
+                if len(orig_key) > len(master_labels[norm]):
+                    master_labels[norm] = orig_key
+            else:
+                master_labels[norm] = orig_key
+    # Map header cells using normalized token overlap + substring fallback
+    column_mapping = {}
     for col_idx, cell in enumerate(header_row.cells):
         header_text = get_clean_text(cell).strip()
         if not header_text:
             continue
+        header_key = header_text.strip().lower()
+        if header_key in {"no", "no.", "#"}:
             continue
         norm_header = normalize_header_text(header_text)
             best_match = master_labels[norm_header]
             best_score = 1.0
         else:
+            # token overlap
             header_tokens = set(t for t in norm_header.split() if len(t) > 2)
             for norm_key, orig_label in master_labels.items():
                 key_tokens = set(t for t in norm_key.split() if len(t) > 2)
                 if common:
                     score = len(common) / max(1, len(header_tokens.union(key_tokens)))
                 else:
+                    # substring fallback on normalized strings
                     if norm_header in norm_key or norm_key in norm_header:
                         score = min(len(norm_header), len(norm_key)) / max(len(norm_header), len(norm_key))
                     else:
                     best_score = score
                     best_match = orig_label
+            # additional heuristic: if header contains 'roadworthiness' and any master_labels key contains that token, accept
+            if not best_match:
+                for norm_key, orig_label in master_labels.items():
+                    if 'roadworthiness' in norm_header and 'roadworthiness' in norm_key:
+                        best_match = orig_label
+                        best_score = 0.65
+                        break
         if best_match and best_score >= 0.30:
             column_mapping[col_idx] = best_match
+            print(f"      📌 Column {col_idx}: '{header_text}' -> '{best_match}' (norm:'{norm_header}' score:{best_score:.2f})")
         else:
+            print(f"      ⚠️ No mapping found for '{header_text}' (norm:'{norm_header}')")
             record_unmatched_header(header_text)
     if not column_mapping:
         print(f"    ❌ No column mappings found")
         return 0
+    # Determine how many rows of data to populate
     max_data_rows = 0
     for json_key, data in vehicle_section.items():
         if isinstance(data, list):
     print(f"    📌 Need to populate {max_data_rows} data rows")
+    # Populate or add rows
     for data_row_index in range(max_data_rows):
         table_row_idx = header_row_idx + 1 + data_row_index
         if table_row_idx >= len(table.rows):
+            print(f"    ⚠️ Row {table_row_idx + 1} doesn't exist, adding one")
+            table.add_row()
         row = table.rows[table_row_idx]
         print(f"    📌 Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
         for col_idx, json_key in column_mapping.items():
             if col_idx < len(row.cells):
                 cell = row.cells[col_idx]
                 column_data = vehicle_section.get(json_key, [])
                 if isinstance(column_data, list) and data_row_index < len(column_data):
                     replacement_value = str(column_data[data_row_index])
                     cell_text = get_clean_text(cell)
                     if has_red_text(cell) or not cell_text.strip():
                         if not cell_text.strip():
                             cell.text = replacement_value
                             replacements_made += 1
+                            print(f"      -> Added '{replacement_value}' to empty cell (col '{json_key}')")
                         else:
                             cell_replacements = replace_red_text_in_cell(cell, replacement_value)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
+                                print(f"      -> Replaced red text with '{replacement_value}' (col '{json_key}')")
     return replacements_made
 def handle_attendance_list_table_enhanced(table, flat_json):
+    """Same as before — preserved behavior."""
     replacements_made = 0
+    attendance_patterns = ["attendance list", "names and position titles", "attendees"]
     found_attendance_row = None
     for row_idx, row in enumerate(table.rows[:3]):
         for cell_idx, cell in enumerate(row.cells):
                 break
         if found_attendance_row is not None:
             break
     if found_attendance_row is None:
         return 0
         "attendance list",
         "attendees"
     ]
     print(f"    🔍 Searching for attendance data in JSON...")
     for search_key in attendance_search_keys:
+        kv = find_matching_json_key_and_value(search_key, flat_json)
+        if kv:
+            attendance_value = kv[1]
+            print(f"    ✅ Found attendance data with key: '{kv[0]}'")
             print(f"    📊 Raw value: {attendance_value}")
             break
     if attendance_value is None:
         print(f"    ❌ No attendance data found in JSON")
         return 0
+    # Find red text candidate cell
     target_cell = None
     print(f"    🔍 Scanning ALL cells in attendance table for red text...")
     for row_idx, row in enumerate(table.rows):
         for cell_idx, cell in enumerate(row.cells):
             if has_red_text(cell):
                 red_text = ""
                 for paragraph in cell.paragraphs:
                     for run in paragraph.runs:
                         if is_red(run):
                             red_text += run.text
+                if red_text.strip():
+                    print(f"        🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
+                    print(f"        📋 Red text content: '{red_text[:60]}...'")
+                    red_lower = red_text.lower()
+                    if any(ind in red_lower for ind in ['manager', 'director', 'auditor', '–', '-']):
+                        target_cell = cell
+                        print(f"        ✅ This looks like attendance data - using this cell")
+                        break
+        if target_cell:
             break
     if target_cell is None:
     if has_red_text(target_cell):
         print(f"    🔧 Replacing red text with properly formatted attendance list...")
         if isinstance(attendance_value, list):
             attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
         else:
             attendance_list = [str(attendance_value).strip()]
         print(f"    📝 Attendance items to add:")
         for i, item in enumerate(attendance_list):
             print(f"        {i+1}. {item}")
         replacement_text = "\n".join(attendance_list)
         cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
         replacements_made += cell_replacements
         print(f"    ✅ Added {len(attendance_list)} attendance items")
         print(f"    📊 Replacements made: {cell_replacements}")
     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
+    """Preserve behavior but prefer scoped mgmt dicts."""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
     table_text = ""
     for row in table.rows[:3]:
         for cell in row.cells:
             table_text += get_clean_text(cell).lower() + " "
     mgmt_types = []
     if "mass management" in table_text or "mass" in table_text:
         mgmt_types.append("Mass Management Summary")
     if "maintenance management" in table_text or "maintenance" in table_text:
         mgmt_types.append("Maintenance Management Summary")
+    if "fatigue management" in table_text or "fatigue" in table_text:
         mgmt_types.append("Fatigue Management Summary")
     if not mgmt_types:
         if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
             mgmt_types.append("Mass Management Summary")
     if not mgmt_types:
         return 0
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
         mgmt_data = flat_json.get(mgmt_type)
         if not isinstance(mgmt_data, dict):
             for key in flat_json.keys():
                 if mgmt_type.split()[0].lower() in key.lower() and "summary" in key.lower():
                     mgmt_data = flat_json.get(key)
         if not isinstance(mgmt_data, dict):
             print(f"    ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
             continue
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) >= 2:
                 standard_cell = row.cells[0]
                 details_cell = row.cells[1]
                 standard_text = get_clean_text(standard_cell).strip().lower()
                 if "std 5" in standard_text or "verification" in standard_text:
                     if has_red_text(details_cell):
                         std_val = None
                         for candidate in ("Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"):
                             std_val = mgmt_data.get(candidate)
                             if std_val is not None:
                                 break
                         if std_val is None:
                             for k, v in mgmt_data.items():
                                 if 'std 5' in k.lower() or 'verification' in k.lower():
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 5. Verification details for {mgmt_type}")
                 if "std 6" in standard_text or "internal review" in standard_text:
                     if has_red_text(details_cell):
                         std_val = None
                         for candidate in ("Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"):
                             std_val = mgmt_data.get(candidate)
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 6. Internal Review details for {mgmt_type}")
     return replacements_made
+# ============================================================================
+# Canonical operator declaration fixer — SAFER
+# ============================================================================
 def fix_operator_declaration_empty_values(table, flat_json):
     replacements_made = 0
     print(f"    🎯 FIX: Operator Declaration empty values processing")
     table_context = ""
     for row in table.rows:
         for cell in row.cells:
             table_context += get_clean_text(cell).lower() + " "
     if not ("print name" in table_context and "position title" in table_context):
         return 0
     print(f"    ✅ Confirmed Operator Declaration table")
     def parse_name_and_position(value):
                 return None, None
             if len(value) == 1:
                 return str(value[0]).strip(), None
+            # common [name, position] pattern
             first = str(value[0]).strip()
             second = str(value[1]).strip()
             if first and second:
                 return first, second
             value = " ".join(str(v).strip() for v in value if str(v).strip())
         s = str(value).strip()
         if not s:
             return None, None
         parts = re.split(r'\s+[-–—]\s+|\s*,\s*|\s*\|\s*', s)
         if len(parts) >= 2:
             left = parts[0].strip()
             if any(ind in left.lower() for ind in role_indicators) and not any(ind in right.lower() for ind in role_indicators):
                 return right, left
             return left, right
         tokens = s.split()
         if len(tokens) >= 2:
             last = tokens[-1]
                                'coordinator', 'driver', 'operator', 'representative', 'chief']
             if any(ind == last.lower() for ind in role_indicators):
                 return " ".join(tokens[:-1]), last
         return s, None
     for row_idx, row in enumerate(table.rows):
         if len(row.cells) >= 2:
             cell1_text = get_clean_text(row.cells[0]).strip().lower()
             cell2_text = get_clean_text(row.cells[1]).strip().lower()
+            # header detection
             if "print name" in cell1_text and "position" in cell2_text:
                 print(f"      📌 Found header row at {row_idx + 1}")
                 if row_idx + 1 < len(table.rows):
                     data_row = table.rows[row_idx + 1]
                     if len(data_row.cells) >= 2:
                         name_cell = data_row.cells[0]
                         position_cell = data_row.cells[1]
                         name_text = get_clean_text(name_cell).strip()
                         position_text = get_clean_text(position_cell).strip()
                         print(f"      📋 Current values: Name='{name_text}', Position='{position_text}'")
+                        # Prefer exact qualified keys first (use key-aware lookup)
+                        name_kv = find_matching_json_key_and_value("Operator Declaration.Print Name", flat_json) or find_matching_json_key_and_value("Print Name", flat_json)
+                        position_kv = find_matching_json_key_and_value("Operator Declaration.Position Title", flat_json) or find_matching_json_key_and_value("Position Title", flat_json)
+                        name_value = name_kv[1] if name_kv else None
+                        name_key = name_kv[0] if name_kv else None
+                        position_value = position_kv[1] if position_kv else None
+                        position_key = position_kv[0] if position_kv else None
+                        # parse combined cases
                         parsed_name_from_nameval, parsed_pos_from_nameval = parse_name_and_position(name_value) if name_value is not None else (None, None)
                         parsed_name_from_posval, parsed_pos_from_posval = parse_name_and_position(position_value) if position_value is not None else (None, None)
                         elif name_value is not None:
                             final_name = get_value_as_string(name_value)
+                        # Position acceptance policy:
+                        # - Accept position_value ONLY if matched key indicates position/title OR parsed value looks like a role
+                        def looks_like_role(s: str) -> bool:
+                            if not s:
+                                return False
+                            s = s.lower()
+                            roles = ['manager', 'auditor', 'owner', 'director', 'supervisor', 'coordinator', 'driver', 'operator', 'representative', 'chief']
+                            # short role descriptions or containing role token
+                            if any(r in s for r in roles):
+                                return True
+                            # single/short token likely role (<=4 tokens)
+                            if len(s.split()) <= 4 and any(c.isalpha() for c in s):
+                                return True
+                            return False
+                        # Only use position_value if the matched key strongly indicates position/title
+                        use_position = False
+                        if position_kv:
+                            k_lower = (position_key or "").lower()
+                            if ("position" in k_lower or "title" in k_lower or "role" in k_lower):
+                                use_position = True
+                        # Avoid using attendance keys or attendance text as position source
+                        if position_kv and ("attendance" in position_key.lower() or "attendance list" in position_key.lower() or "attendees" in position_key.lower()):
+                            use_position = False
+                        if use_position:
+                            # choose parsed pos if available
+                            if parsed_pos_from_posval:
+                                final_pos = parsed_pos_from_posval
+                            else:
+                                final_pos = get_value_as_string(position_value) if position_value is not None else None
+                        else:
+                            # allow parsed position gleaned from name_value (if it looks like a role)
+                            if parsed_pos_from_nameval and looks_like_role(parsed_pos_from_nameval):
+                                final_pos = parsed_pos_from_nameval
+                        # final normalization
                         if isinstance(final_name, list):
                             final_name = " ".join(str(x) for x in final_name).strip()
                         if isinstance(final_pos, list):
                             low = name_str.lower()
                             if any(bp in low for bp in bad_phrases):
                                 return False
+                            return len(name_str) > 1 and any(c.isalpha() for c in name_str)
+                        # Write name if empty or red
                         if (not name_text or has_red_text(name_cell)) and final_name and looks_like_person(final_name):
                             if has_red_text(name_cell):
                                 replace_red_text_in_cell(name_cell, final_name)
                             replacements_made += 1
                             print(f"      ✅ Updated Print Name -> '{final_name}'")
+                        # Write position if empty or red and final_pos appears role-like
+                        if (not position_text or has_red_text(position_cell)) and final_pos and looks_like_role(final_pos):
                             if has_red_text(position_cell):
                                 replace_red_text_in_cell(position_cell, final_pos)
                             else:
     for i, segment in enumerate(red_segments):
         segment_text = segment['text'].strip()
         if segment_text:
+            kv = find_matching_json_key_and_value(segment_text, flat_json)
+            if kv:
+                replacement_text = get_value_as_string(kv[1], segment_text)
                 if replace_single_segment(segment, replacement_text):
                     replacements_made += 1
                     print(f"      ✅ Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
         return 0
     nature_indicators = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
     if any(indicator in red_text.lower() for indicator in nature_indicators):
+        kv = find_matching_json_key_and_value("Nature of Business", flat_json) or find_matching_json_key_and_value("Nature of the Operators Business (Summary)", flat_json)
+        if kv:
+            replacement_text = get_value_as_string(kv[1], "Nature of Business")
             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
             replacements_made += cell_replacements
             print(f"      ✅ Fixed Nature of Business multiline content")
         return 0
     management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
     for mgmt_type in management_types:
+        if mgmt_type in flat_json and isinstance(flat_json[mgmt_type], dict):
             mgmt_data = flat_json[mgmt_type]
+            for std_key, std_value in mgmt_data.items():
+                if isinstance(std_value, list) and std_value:
+                    if len(red_text) > 10:
+                        for item in std_value:
+                            if red_text.lower() in str(item).lower() or str(item).lower() in red_text.lower():
+                                replacement_text = "\n".join(str(i) for i in std_value)
+                                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                                 replacements_made += cell_replacements
+                                print(f"      ✅ Fixed {mgmt_type} - {std_key}")
+                                return replacements_made
     return replacements_made
 def handle_print_accreditation_section(table, flat_json):
     replacements_made = 0
     if getattr(table, "_processed_operator_declaration", False):
         print(f"    ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
         return 0
     table_context = ""
     for row in table.rows:
         for cell in row.cells:
             table_context += get_clean_text(cell).lower() + " "
     if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
         print(f"    ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
         return 0
     print(f"    📋 Processing Print Accreditation section")
     for row_idx, row in enumerate(table.rows):
         for cell_idx, cell in enumerate(row.cells):
             if has_red_text(cell):
                 accreditation_fields = [
                     "(print accreditation name)",
                     "Operator name (Legal entity)",
+                    "Print accreditation name"
                 ]
                 for field in accreditation_fields:
+                    kv = find_matching_json_key_and_value(field, flat_json)
+                    if kv:
+                        replacement_text = get_value_as_string(kv[1], field)
                         if replacement_text.strip():
                             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
+                                print(f"      ✅ Fixed accreditation: {kv[0]}")
                             break
     return replacements_made
 def process_single_column_sections(cell, key_text, flat_json):
     replacements_made = 0
     if has_red_text(cell):
         red_text = ""
         for paragraph in cell.paragraphs:
             for run in paragraph.runs:
                 if is_red(run):
                     red_text += run.text
         if red_text.strip():
+            kv = find_matching_json_key_and_value(red_text.strip(), flat_json)
+            if not kv:
+                kv = find_matching_json_key_and_value(key_text, flat_json)
+            if kv:
+                section_replacement = get_value_as_string(kv[1], red_text.strip())
                 cell_replacements = replace_red_text_in_cell(cell, section_replacement)
                 replacements_made += cell_replacements
                 if cell_replacements > 0:
                     print(f"      ✅ Fixed single column section: '{key_text}'")
     return replacements_made
 # ============================================================================
+# Main table/paragraph/heading processing (preserve logic + use new helpers)
 # ============================================================================
 def process_tables(document, flat_json):
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
         table_text = ""
         for row in table.rows[:3]:
             for cell in row.cells:
                 table_text += get_clean_text(cell).lower() + " "
         management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
         has_management = any(indicator in table_text for indicator in management_summary_indicators)
         has_details = "details" in table_text
             summary_fixes = fix_management_summary_details_column(table, flat_json)
             replacements_made += summary_fixes
             summary_replacements = 0
             for row_idx, row in enumerate(table.rows):
                 for cell_idx, cell in enumerate(row.cells):
                     if has_red_text(cell):
                         for mgmt_type in ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]:
                             if mgmt_type.lower().replace(" summary", "") in table_text:
                                 if mgmt_type in flat_json:
             replacements_made += summary_replacements
             continue
+        # Vehicle tables detection
         vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension", "registration"]
         indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
         if indicator_count >= 2:
             replacements_made += vehicle_replacements
             continue
+        # Attendance
         if "attendance list" in table_text and "names and position titles" in table_text:
             print(f"    👥 Detected Attendance List table")
             attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
             replacements_made += attendance_replacements
             continue
+        # Print Accreditation / Operator Declaration
         print_accreditation_indicators = ["print name", "position title"]
         indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
         if indicator_count >= 2 or ("print name" in table_text and "position title" in table_text):
             print(f"    📋 Detected Print Accreditation/Operator Declaration table")
             declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
             replacements_made += declaration_fixes
             if not getattr(table, "_processed_operator_declaration", False):
                 print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
                 replacements_made += print_accreditation_replacements
             continue
+        # Regular table rows handling (preserved)
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
             key_cell = row.cells[0]
             key_text = get_clean_text(key_cell)
             if not key_text:
                 continue
             print(f"  📌 Row {row_idx + 1}: Key = '{key_text}'")
+            kv = find_matching_json_key_and_value(key_text, flat_json)
+            json_value = kv[1] if kv else None
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value, key_text)
+                # ACN handling
                 if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                     cell_replacements = handle_australian_company_number(row, json_value)
                     replacements_made += cell_replacements
+                # section headers
                 elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
                     print(f"    ✅ Section header detected, checking next row...")
                     next_row = table.rows[row_idx + 1]
                         if has_red_text(cell):
                             print(f"    ✅ Found red text in next row, cell {cell_idx + 1}")
                             if isinstance(json_value, list):
+                                section_text = "\n".join(str(item) for item in json_value)
+                            else:
+                                section_text = replacement_text
+                            cell_replacements = replace_red_text_in_cell(cell, section_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
                                 print(f"    -> Replaced section content")
+                # single column
                 elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
                     if has_red_text(key_cell):
                         cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
                         replacements_made += cell_replacements
+                # key-value pairs
                 else:
                     for cell_idx in range(1, len(row.cells)):
                         value_cell = row.cells[cell_idx]
                             replacements_made += cell_replacements
             else:
+                # fallback single cell red-text key
                 if len(row.cells) == 1 and has_red_text(key_cell):
                     red_text = ""
                     for paragraph in key_cell.paragraphs:
                             if is_red(run):
                                 red_text += run.text
                     if red_text.strip():
+                        kv2 = find_matching_json_key_and_value(red_text.strip(), flat_json)
+                        if kv2:
+                            section_replacement = get_value_as_string(kv2[1], red_text.strip())
                             cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
                             replacements_made += cell_replacements
+                # attempt multiple red-segments or surgical fixes
                 for cell_idx in range(len(row.cells)):
                     cell = row.cells[cell_idx]
                     if has_red_text(cell):
                         cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                         replacements_made += cell_replacements
                         if cell_replacements == 0:
                             surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
                             replacements_made += surgical_fix
                         if cell_replacements == 0:
                             management_summary_fix = handle_management_summary_fix(cell, flat_json)
                             replacements_made += management_summary_fix
+    # Final operator/auditor declaration check on last few tables
     print(f"\n🎯 Final check for Declaration tables...")
     for table in document.tables[-3:]:
         if len(table.rows) <= 4:
             if getattr(table, "_processed_operator_declaration", False):
                 print(f"    ⏭️ Skipping - already processed by operator declaration handler")
                 continue
+            declaration_fix = fix_operator_declaration_empty_values(table, flat_json)
             replacements_made += declaration_fix
     return replacements_made
 def process_paragraphs(document, flat_json):
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
     for para_idx, paragraph in enumerate(document.paragraphs):
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
             red_text_only = "".join(run.text for run in red_runs).strip()
             print(f"  📌 Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
+            kv = find_matching_json_key_and_value(red_text_only, flat_json)
+            json_value = kv[1] if kv else None
             if json_value is None:
                 if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
+                    kv = find_matching_json_key_and_value("auditor signature", flat_json)
                 elif "OPERATOR SIGNATURE" in red_text_only.upper():
+                    kv = find_matching_json_key_and_value("operator signature", flat_json)
+                json_value = kv[1] if kv else None
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value)
                 for run in red_runs[1:]:
                     run.text = ''
                 replacements_made += 1
     return replacements_made
 def process_headings(document, flat_json):
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
     paragraphs = document.paragraphs
     for para_idx, paragraph in enumerate(paragraphs):
         paragraph_text = paragraph.text.strip()
         if not paragraph_text:
             continue
         matched_heading = None
         for category, patterns in HEADING_PATTERNS.items():
             for pattern in patterns:
                     break
             if matched_heading:
                 break
         if matched_heading:
             print(f"  📌 Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
             if has_red_text_in_paragraph(paragraph):
                 print(f"    🔴 Found red text in heading itself")
                 heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
                 replacements_made += heading_replacements
             for next_para_offset in range(1, 6):
                 next_para_idx = para_idx + next_para_offset
                 if next_para_idx >= len(paragraphs):
                     break
                 next_paragraph = paragraphs[next_para_idx]
                 next_text = next_paragraph.text.strip()
                 if not next_text:
                     continue
                 is_another_heading = False
                 for category, patterns in HEADING_PATTERNS.items():
                     for pattern in patterns:
                             break
                     if is_another_heading:
                         break
                 if is_another_heading:
                     break
                 if has_red_text_in_paragraph(next_paragraph):
                     print(f"    🔴 Found red text in paragraph {next_para_idx + 1} after heading")
                     context_replacements = process_red_text_in_paragraph(
                         flat_json
                     )
                     replacements_made += context_replacements
     return replacements_made
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     replacements_made = 0
     red_text_segments = []
     for run in paragraph.runs:
         if is_red(run) and run.text.strip():
             red_text_segments.append(run.text.strip())
     if not red_text_segments:
         return 0
     combined_red_text = " ".join(red_text_segments).strip()
     print(f"      🔍 Red text found: '{combined_red_text}'")
+    kv = find_matching_json_key_and_value(combined_red_text, flat_json)
+    json_value = kv[1] if kv else None
     if json_value is None:
         if "NHVAS APPROVED AUDITOR" in context_text.upper():
             auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
             for field in auditor_fields:
+                kv = find_matching_json_key_and_value(field, flat_json)
+                if kv:
+                    print(f"      ✅ Found auditor match with field: '{kv[0]}'")
+                    json_value = kv[1]
                     break
         elif "OPERATOR DECLARATION" in context_text.upper():
             operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
             for field in operator_fields:
+                kv = find_matching_json_key_and_value(field, flat_json)
+                if kv:
+                    print(f"      ✅ Found operator match with field: '{kv[0]}'")
+                    json_value = kv[1]
                     break
     if json_value is None:
+        context_queries = [f"{context_text} {combined_red_text}", combined_red_text, context_text]
         for query in context_queries:
+            kv = find_matching_json_key_and_value(query, flat_json)
+            if kv:
+                print(f"      ✅ Found match with combined query -> {kv[0]}")
+                json_value = kv[1]
                 break
     if json_value is not None:
     return replacements_made
 # ============================================================================
+# Orchestrator
 # ============================================================================
 def process_hf(json_file, docx_file, output_file):
     try:
         if hasattr(json_file, "read"):
             json_data = json.load(json_file)
         else:
                 print(f"  - {key}: {value}")
         print(f"  ... and {len(flat_json) - 10} more keys\n")
         if hasattr(docx_file, "read"):
             doc = Document(docx_file)
         else:
             doc = Document(docx_file)
         print("🚀 Starting comprehensive document processing...")
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements + heading_replacements
         # Save unmatched headers for iterative improvement
             except Exception as e:
                 print(f"⚠️ Could not save unmatched headers: {e}")
         if hasattr(output_file, "write"):
             doc.save(output_file)
         else:
             doc.save(output_file)
         print(f"\n✅ Document saved as: {output_file}")
         import traceback
         traceback.print_exc()
 if __name__ == "__main__":
     import sys
     if len(sys.argv) != 4: