Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 21

Commit

c0e794c

verified ·

1 Parent(s): b93e8d9

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +261 -71

updated_word.py CHANGED Viewed

@@ -602,7 +602,7 @@ def fix_operator_declaration_empty_values(table, flat_json):
     """Fix Operator Declaration table when values are empty"""
     replacements_made = 0
-    print(f"    🎯 FIX 2: Operator Declaration empty values processing")
     # Check if this is an Operator Declaration table
     table_context = ""
@@ -657,13 +657,11 @@ def fix_operator_declaration_empty_values(table, flat_json):
                                     if name_replacement.strip():
                                         # Extract just the name if it's a company name
                                         if "Pty Ltd" in name_replacement or "Company" in name_replacement:
-                                            # Try to get individual name instead
                                             continue
                                         if has_red_text(name_cell):
                                             cell_replacements = replace_red_text_in_cell(name_cell, name_replacement)
                                         else:
-                                            # Cell is empty, add text directly
                                             name_cell.text = name_replacement
                                             cell_replacements = 1
@@ -675,7 +673,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
                         if not position_text or has_red_text(position_cell):
                             print(f"      🔧 Fixing empty/red Position Title")
-                            # Try multiple sources for position
                             position_sources = [
                                 "Operator Declaration.Position Title",
                                 "Position Title"
@@ -690,7 +687,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
                                         if has_red_text(position_cell):
                                             cell_replacements = replace_red_text_in_cell(position_cell, position_replacement)
                                         else:
-                                            # Cell is empty, add text directly
                                             position_cell.text = position_replacement
                                             cell_replacements = 1
@@ -703,25 +699,226 @@ def fix_operator_declaration_empty_values(table, flat_json):
                                 position_cell.text = "Manager"
                                 replacements_made += 1
                                 print(f"      ✅ Used fallback Position Title: 'Manager'")
-                break  # Found the table, stop looking
     return replacements_made
 def process_tables(document, flat_json):
-    """Your original function with ALL surgical fixes added"""
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
-        # Your original logic
         table_text = ""
         for row in table.rows[:3]:
             for cell in row.cells:
                 table_text += get_clean_text(cell).lower() + " "
-        # 🎯 NEW: Detect Management Summary tables (with DETAILS column)
         management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
         has_management = any(indicator in table_text for indicator in management_summary_indicators)
         has_details = "details" in table_text
@@ -730,7 +927,8 @@ def process_tables(document, flat_json):
             print(f"    📋 Detected Management Summary table")
             summary_fixes = fix_management_summary_details_column(table, flat_json)
             replacements_made += summary_fixes
-            # Process each cell in the table to find red text and apply the existing fix
             summary_replacements = 0
             for row_idx, row in enumerate(table.rows):
                 for cell_idx, cell in enumerate(row.cells):
@@ -764,7 +962,7 @@ def process_tables(document, flat_json):
             replacements_made += summary_replacements
             continue
-        # Enhanced vehicle registration detection
         vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
         indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
         if indicator_count >= 2:
@@ -773,23 +971,29 @@ def process_tables(document, flat_json):
             replacements_made += vehicle_replacements
             continue
-        # 🎯 FINAL FIX 1: Enhanced attendance list detection
         if "attendance list" in table_text and "names and position titles" in table_text:
             print(f"    👥 Detected Attendance List table")
             attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
             replacements_made += attendance_replacements
             continue
-        # Enhanced print accreditation detection
         print_accreditation_indicators = ["print name", "position title"]
         indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
         if indicator_count >= 1:
             print(f"    📋 Detected Print Accreditation table")
             print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
             replacements_made += print_accreditation_replacements
             continue
-        # Your existing row processing
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
@@ -807,14 +1011,14 @@ def process_tables(document, flat_json):
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value, key_text)
-                # Enhanced ACN handling
                 if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                     cell_replacements = handle_australian_company_number(row, json_value)
                     replacements_made += cell_replacements
-                # Enhanced section header handling
                 elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
-                    print(f"    ✅ Section header detected, checking next row for content...")
                     next_row = table.rows[row_idx + 1]
                     for cell_idx, cell in enumerate(next_row.cells):
@@ -825,12 +1029,15 @@ def process_tables(document, flat_json):
                             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
-                                print(f"    -> Replaced section content with: '{replacement_text[:100]}...'")
                 elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
                     if has_red_text(key_cell):
                         cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
                         replacements_made += cell_replacements
                 else:
                     for cell_idx in range(1, len(row.cells)):
                         value_cell = row.cells[cell_idx]
@@ -838,8 +1045,9 @@ def process_tables(document, flat_json):
                             print(f"    ✅ Found red text in column {cell_idx + 1}")
                             cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
                             replacements_made += cell_replacements
             else:
-                # Enhanced fallback processing for unmatched keys
                 if len(row.cells) == 1 and has_red_text(key_cell):
                     red_text = ""
                     for paragraph in key_cell.paragraphs:
@@ -853,49 +1061,42 @@ def process_tables(document, flat_json):
                             cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
                             replacements_made += cell_replacements
-                # Enhanced red text processing for all cells
                 for cell_idx in range(len(row.cells)):
                     cell = row.cells[cell_idx]
                     if has_red_text(cell):
                         cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                         replacements_made += cell_replacements
-                        # 🎯 SURGICAL FIX 1: Only if no replacements were made
                         if cell_replacements == 0:
                             surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
                             replacements_made += surgical_fix
-                        # 🎯 FINAL FIX 2: Only if still no replacements were made, try ANY Management Summary fix
-                        if cell_replacements == 0 and surgical_fix == 0:
                             management_summary_fix = handle_management_summary_fix(cell, flat_json)
                             replacements_made += management_summary_fix
-                # 🎯 SURGICAL FIX 3: Handle Operator Declaration tables (only check last few tables)
-                print(f"\n🎯 SURGICAL FIX: Checking for Operator/Auditor Declaration tables...")
-                for table in document.tables[-3:]:  # Only check last 3 tables
-                    if len(table.rows) <= 4:  # Only small tables
-                        declaration_fix = handle_operator_declaration_fix(table, flat_json)
-                        replacements_made += declaration_fix
-                # Check for declaration tables that need fixing
-                if "print name" in table_text and "position" in table_text:
-                    declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
-                    replacements_made += declaration_fixes
-return replacements_made
 def process_paragraphs(document, flat_json):
-    """Your original function (unchanged)"""
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
     for para_idx, paragraph in enumerate(document.paragraphs):
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
-            full_text = paragraph.text.strip()
             red_text_only = "".join(run.text for run in red_runs).strip()
             print(f"  📌 Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
-            # Your existing matching logic
             json_value = find_matching_json_value(red_text_only, flat_json)
             if json_value is None:
@@ -917,7 +1118,7 @@ def process_paragraphs(document, flat_json):
     return replacements_made
 def process_headings(document, flat_json):
-    """Your original function (unchanged)"""
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
@@ -929,7 +1130,7 @@ def process_headings(document, flat_json):
         if not paragraph_text:
             continue
-        # Enhanced heading detection
         matched_heading = None
         for category, patterns in HEADING_PATTERNS.items():
             for pattern in patterns:
@@ -948,8 +1149,8 @@ def process_headings(document, flat_json):
                 heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
                 replacements_made += heading_replacements
-            # Enhanced: Look further ahead for related content
-            for next_para_offset in range(1, 6):  # Extended range
                 next_para_idx = para_idx + next_para_offset
                 if next_para_idx >= len(paragraphs):
                     break
@@ -973,9 +1174,9 @@ def process_headings(document, flat_json):
                 if is_another_heading:
                     break
-                # Process red text with enhanced context
                 if has_red_text_in_paragraph(next_paragraph):
-                    print(f"    🔴 Found red text in paragraph {next_para_idx + 1} after heading: '{next_text[:50]}...'")
                     context_replacements = process_red_text_in_paragraph(
                         next_paragraph,
@@ -986,15 +1187,8 @@ def process_headings(document, flat_json):
     return replacements_made
-def has_red_text_in_paragraph(paragraph):
-    """Your original function (unchanged)"""
-    for run in paragraph.runs:
-        if is_red(run) and run.text.strip():
-            return True
-    return False
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
-    """Your original function (unchanged)"""
     replacements_made = 0
     red_text_segments = []
@@ -1010,10 +1204,10 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     json_value = None
-    # Strategy 1: Direct matching
     json_value = find_matching_json_value(combined_red_text, flat_json)
-    # Strategy 2: Enhanced context-based matching
     if json_value is None:
         if "NHVAS APPROVED AUDITOR" in context_text.upper():
             auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
@@ -1031,7 +1225,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
                     print(f"      ✅ Found operator match with field: '{field}'")
                     break
-    # Strategy 3: Enhanced context combination
     if json_value is None:
         context_queries = [
             f"{context_text} {combined_red_text}",
@@ -1042,7 +1236,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
         for query in context_queries:
             json_value = find_matching_json_value(query, flat_json)
             if json_value is not None:
-                print(f"      ✅ Found match with combined query: '{query[:50]}...'")
                 break
     # Replace if match found
@@ -1065,22 +1259,20 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     return replacements_made
 def force_red_text_replacement(document, flat_json):
-    """Force replacement of any remaining red text by trying ALL JSON values - FIXED"""
     replacements_made = 0
     print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
-    # Collect ALL possible replacement values from JSON - FIXED to handle lists properly
     all_values = {}
     for key, value in flat_json.items():
         if value:
-            # Convert value to string properly
             value_str = get_value_as_string(value, key)
-            # Only add if we have a valid string
             if value_str and isinstance(value_str, str) and value_str.strip():
                 all_values[key] = value_str.strip()
-                # Also store individual items from lists for partial matching
                 if isinstance(value, list):
                     for i, item in enumerate(value):
                         item_str = str(item).strip() if item else ""
@@ -1106,28 +1298,27 @@ def force_red_text_replacement(document, flat_json):
                     combined_red_text = " ".join(red_text_parts).strip()
                     print(f"        Red text: '{combined_red_text}'")
-                    # Try to find a match
                     best_match = None
                     best_key = None
-                    # First try exact matching
                     for key, value in all_values.items():
                         if combined_red_text.lower() == value.lower():
                             best_match = value
                             best_key = key
                             break
-                    # If no exact match, try partial matching
                     if not best_match:
                         for key, value in all_values.items():
-                            # Try if red text contains this value or vice versa
                             if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
                                (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
                                 best_match = value
                                 best_key = key
                                 break
-                    # If still no match, try word-by-word matching for names/dates
                     if not best_match:
                         red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
                         best_score = 0
@@ -1216,9 +1407,8 @@ def force_red_text_replacement(document, flat_json):
     return replacements_made
 def process_hf(json_file, docx_file, output_file):
-    """Your original main function with force fix added at the end"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
@@ -1240,14 +1430,14 @@ def process_hf(json_file, docx_file, output_file):
         else:
             doc = Document(docx_file)
-        # Your original processing with surgical fixes
-        print("🚀 Starting processing with minimal surgical fixes...")
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
-        # 🎯 ADD THIS: Force fix for any remaining red text
         force_replacements = force_red_text_replacement(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements

     """Fix Operator Declaration table when values are empty"""
     replacements_made = 0
+    print(f"    🎯 FIX: Operator Declaration empty values processing")
     # Check if this is an Operator Declaration table
     table_context = ""
                                     if name_replacement.strip():
                                         # Extract just the name if it's a company name
                                         if "Pty Ltd" in name_replacement or "Company" in name_replacement:
                                             continue
                                         if has_red_text(name_cell):
                                             cell_replacements = replace_red_text_in_cell(name_cell, name_replacement)
                                         else:
                                             name_cell.text = name_replacement
                                             cell_replacements = 1
                         if not position_text or has_red_text(position_cell):
                             print(f"      🔧 Fixing empty/red Position Title")
                             position_sources = [
                                 "Operator Declaration.Position Title",
                                 "Position Title"
                                         if has_red_text(position_cell):
                                             cell_replacements = replace_red_text_in_cell(position_cell, position_replacement)
                                         else:
                                             position_cell.text = position_replacement
                                             cell_replacements = 1
                                 position_cell.text = "Manager"
                                 replacements_made += 1
                                 print(f"      ✅ Used fallback Position Title: 'Manager'")
+                break
+    return replacements_made
+def handle_multiple_red_segments_in_cell(cell, flat_json):
+    """Replace each red text segment of a cell with its own JSON match.
+
+    The segments come from ``extract_red_text_segments(cell)``. Each one is
+    looked up on its own via ``find_matching_json_value``; blank segments and
+    segments without a match are left untouched.
+
+    Returns the number of segments for which ``replace_single_segment``
+    reported success (0 when the cell yields no segments).
+    """
+    segments = extract_red_text_segments(cell)
+    if not segments:
+        return 0
+    replaced_count = 0
+    for i, segment in enumerate(segments):
+        segment_text = segment['text'].strip()
+        if not segment_text:
+            continue
+        matched_value = find_matching_json_value(segment_text, flat_json)
+        if matched_value is None:
+            continue
+        replacement_text = get_value_as_string(matched_value, segment_text)
+        if not replace_single_segment(segment, replacement_text):
+            continue
+        replaced_count += 1
+        print(f"      ✅ Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
+    return replaced_count
+def handle_nature_business_multiline_fix(cell, flat_json):
+    """Replace red text that looks like a Nature of Business description.
+
+    The red runs of every paragraph in ``cell`` are concatenated. If that
+    text mentions one of the transport-related indicator words, the value
+    matched for "Nature of Business" in ``flat_json`` replaces the red text.
+
+    Returns the count reported by ``replace_red_text_in_cell``, or 0 when the
+    cell has no red text, no indicator word is present, or no JSON value is
+    found.
+    """
+    red_text = "".join(
+        run.text
+        for paragraph in cell.paragraphs
+        for run in paragraph.runs
+        if is_red(run)
+    ).strip()
+    if not red_text:
+        return 0
+    # Only treat the cell as Nature of Business when it mentions the domain
+    nature_indicators = ("transport", "logistics", "freight", "delivery", "trucking", "haulage")
+    red_text_lower = red_text.lower()
+    if not any(indicator in red_text_lower for indicator in nature_indicators):
+        return 0
+    nature_value = find_matching_json_value("Nature of Business", flat_json)
+    if nature_value is None:
+        return 0
+    replacement_text = get_value_as_string(nature_value, "Nature of Business")
+    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
+    # Report success only when something was actually replaced, matching the
+    # guarded messages used by the other handlers in this file
+    if cell_replacements > 0:
+        print(f"      ✅ Fixed Nature of Business multiline content")
+    return cell_replacements
+def handle_management_summary_fix(cell, flat_json):
+    """Replace a cell's red text with a matching Management Summary standard.
+
+    Looks under the "Mass", "Maintenance" and "Fatigue Management Summary"
+    keys of ``flat_json``. Each is used only when its value is a dict, and
+    only entries whose value is a non-empty list are considered. Red text
+    longer than 10 characters is compared, case-insensitively and by
+    substring in either direction, against every item of each list. On the
+    first hit the whole list, joined with newlines, replaces the red text
+    and the function returns.
+
+    Returns the count reported by ``replace_red_text_in_cell``, or 0 when
+    nothing matched.
+    """
+    red_text = "".join(
+        run.text
+        for paragraph in cell.paragraphs
+        for run in paragraph.runs
+        if is_red(run)
+    ).strip()
+    # Short red text is never matched, so there is nothing to search for
+    if len(red_text) <= 10:
+        return 0
+    red_text_lower = red_text.lower()
+    management_types = ("Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary")
+    for mgmt_type in management_types:
+        mgmt_data = flat_json.get(mgmt_type)
+        if not isinstance(mgmt_data, dict):
+            continue
+        for std_key, std_value in mgmt_data.items():
+            if not (isinstance(std_value, list) and std_value):
+                continue
+            for item in std_value:
+                item_text = str(item).lower()
+                # A blank item is a substring of every string; skipping it
+                # prevents unrelated red text from being overwritten
+                if not item_text.strip():
+                    continue
+                if red_text_lower in item_text or item_text in red_text_lower:
+                    replacement_text = "\n".join(str(i) for i in std_value)
+                    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
+                    print(f"      ✅ Fixed {mgmt_type} - {std_key}")
+                    return cell_replacements
+    return 0
+def handle_operator_declaration_fix(table, flat_json):
+    """Fill red cells of a small Operator/Auditor Declaration table.
+
+    Tables with more than four rows are ignored, as are tables whose
+    lower-cased text contains none of "print name", "signature" or "date".
+    For every red cell the declaration fields are tried in a fixed order,
+    and the first one whose JSON value yields a successful replacement is
+    used. If none succeeds, red text mentioning "signature" or "date" is
+    replaced with a "[Signature]" / "[Date]" placeholder.
+
+    Returns the total number of replacements made.
+    """
+    if len(table.rows) > 4:  # Only process small tables
+        return 0
+    # Lower-cased text of the whole table, used to recognise declarations
+    table_text = "".join(
+        get_clean_text(cell).lower() + " "
+        for row in table.rows
+        for cell in row.cells
+    )
+    if "print name" not in table_text and "signature" not in table_text and "date" not in table_text:
+        return 0
+    print(f"    🎯 Processing declaration table")
+    # Candidate fields, tried in this order for every red cell
+    declaration_fields = (
+        "Print Name", "Position Title", "Signature", "Date",
+        "Operator Declaration.Print Name", "Operator Declaration.Position Title",
+        "NHVAS Approved Auditor Declaration.Print Name",
+    )
+    total = 0
+    for row in table.rows:
+        for cell in row.cells:
+            if not has_red_text(cell):
+                continue
+            for field in declaration_fields:
+                field_value = find_matching_json_value(field, flat_json)
+                if field_value is None:
+                    continue
+                replacement_text = get_value_as_string(field_value, field)
+                if not replacement_text.strip():
+                    continue
+                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
+                if cell_replacements > 0:
+                    total += cell_replacements
+                    print(f"      ✅ Fixed declaration field: {field}")
+                    break
+            else:
+                # No field produced a replacement: fall back to placeholders
+                red_text = "".join(
+                    run.text
+                    for paragraph in cell.paragraphs
+                    for run in paragraph.runs
+                    if is_red(run)
+                )
+                red_lower = red_text.lower()
+                if "signature" in red_lower:
+                    total += replace_red_text_in_cell(cell, "[Signature]")
+                elif "date" in red_lower:
+                    total += replace_red_text_in_cell(cell, "[Date]")
+    return total
+def handle_print_accreditation_section(table, flat_json):
+    """Fill red cells of a Print Accreditation table.
+
+    For each red cell the accreditation fields are tried in order. The first
+    field whose JSON value converts to non-blank text is used for the
+    replacement, and no further fields are tried for that cell even if the
+    replacement count is zero.
+
+    Returns the total number of replacements made.
+    """
+    print(f"    📋 Processing Print Accreditation section")
+    accreditation_fields = (
+        "(print accreditation name)",
+        "Print Name",
+        "Operator name (Legal entity)",
+    )
+    total = 0
+    for row in table.rows:
+        for cell in row.cells:
+            if not has_red_text(cell):
+                continue
+            for field in accreditation_fields:
+                field_value = find_matching_json_value(field, flat_json)
+                if field_value is None:
+                    continue
+                replacement_text = get_value_as_string(field_value, field)
+                if not replacement_text.strip():
+                    continue
+                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
+                total += cell_replacements
+                if cell_replacements > 0:
+                    print(f"      ✅ Fixed accreditation: {field}")
+                # First usable field wins, whether or not anything changed
+                break
+    return total
+def process_single_column_sections(cell, key_text, flat_json):
+    """Replace the red text of a single-column section cell.
+
+    The concatenated red text is looked up in ``flat_json`` first; if that
+    finds nothing, ``key_text`` is looked up instead. Whichever value is
+    found is converted to a string and written over the red text.
+
+    Returns the count reported by ``replace_red_text_in_cell``, or 0 when
+    the cell has no red text or no value matched.
+    """
+    replacements_made = 0
+    if not has_red_text(cell):
+        return replacements_made
+    red_label = "".join(
+        run.text
+        for paragraph in cell.paragraphs
+        for run in paragraph.runs
+        if is_red(run)
+    ).strip()
+    if not red_label:
+        return replacements_made
+    # The red text itself is the preferred lookup key; the row key is the fallback
+    section_value = find_matching_json_value(red_label, flat_json)
+    if section_value is None:
+        section_value = find_matching_json_value(key_text, flat_json)
+    if section_value is not None:
+        section_replacement = get_value_as_string(section_value, red_label)
+        cell_replacements = replace_red_text_in_cell(cell, section_replacement)
+        replacements_made += cell_replacements
+        if cell_replacements > 0:
+            print(f"      ✅ Fixed single column section: '{key_text}'")
     return replacements_made
 def process_tables(document, flat_json):
+    """Process all tables in the document with comprehensive fixes"""
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
+        # Get table context
         table_text = ""
         for row in table.rows[:3]:
             for cell in row.cells:
                 table_text += get_clean_text(cell).lower() + " "
+        # Detect Management Summary tables
         management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
         has_management = any(indicator in table_text for indicator in management_summary_indicators)
         has_details = "details" in table_text
             print(f"    📋 Detected Management Summary table")
             summary_fixes = fix_management_summary_details_column(table, flat_json)
             replacements_made += summary_fixes
+            # Process remaining red text in management summary
             summary_replacements = 0
             for row_idx, row in enumerate(table.rows):
                 for cell_idx, cell in enumerate(row.cells):
             replacements_made += summary_replacements
             continue
+        # Detect Vehicle Registration tables
         vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
         indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
         if indicator_count >= 2:
             replacements_made += vehicle_replacements
             continue
+        # Detect Attendance List tables
         if "attendance list" in table_text and "names and position titles" in table_text:
             print(f"    👥 Detected Attendance List table")
             attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
             replacements_made += attendance_replacements
             continue
+        # Detect Print Accreditation tables
         print_accreditation_indicators = ["print name", "position title"]
         indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
         if indicator_count >= 1:
             print(f"    📋 Detected Print Accreditation table")
+            # Check for declaration tables that need fixing
+            if "print name" in table_text and "position" in table_text:
+                declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
+                replacements_made += declaration_fixes
             print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
             replacements_made += print_accreditation_replacements
             continue
+        # Process regular table rows
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value, key_text)
+                # Handle Australian Company Number
                 if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                     cell_replacements = handle_australian_company_number(row, json_value)
                     replacements_made += cell_replacements
+                # Handle section headers
                 elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
+                    print(f"    ✅ Section header detected, checking next row...")
                     next_row = table.rows[row_idx + 1]
                     for cell_idx, cell in enumerate(next_row.cells):
                             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
+                                print(f"    -> Replaced section content")
+                # Handle single column sections
                 elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
                     if has_red_text(key_cell):
                         cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
                         replacements_made += cell_replacements
+                # Handle regular key-value pairs
                 else:
                     for cell_idx in range(1, len(row.cells)):
                         value_cell = row.cells[cell_idx]
                             print(f"    ✅ Found red text in column {cell_idx + 1}")
                             cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
                             replacements_made += cell_replacements
             else:
+                # Fallback processing for unmatched keys
                 if len(row.cells) == 1 and has_red_text(key_cell):
                     red_text = ""
                     for paragraph in key_cell.paragraphs:
                             cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
                             replacements_made += cell_replacements
+                # Process red text in all cells
                 for cell_idx in range(len(row.cells)):
                     cell = row.cells[cell_idx]
                     if has_red_text(cell):
                         cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                         replacements_made += cell_replacements
+                        # Apply fixes if no replacements made
                         if cell_replacements == 0:
                             surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
                             replacements_made += surgical_fix
+                        if cell_replacements == 0:
                             management_summary_fix = handle_management_summary_fix(cell, flat_json)
                             replacements_made += management_summary_fix
+    # Handle Operator/Auditor Declaration tables (check last few tables)
+    print(f"\n🎯 Final check for Declaration tables...")
+    for table in document.tables[-3:]:
+        if len(table.rows) <= 4:
+            declaration_fix = handle_operator_declaration_fix(table, flat_json)
+            replacements_made += declaration_fix
+    return replacements_made
 def process_paragraphs(document, flat_json):
+    """Process all paragraphs in the document"""
     # Running count of replacements; this is the value returned at the end.
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
     for para_idx, paragraph in enumerate(document.paragraphs):
         # Keep only runs that is_red() accepts and whose text is not just whitespace.
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
             # Concatenate the red runs (no separator) and trim the ends to form the lookup text.
             red_text_only = "".join(run.text for run in red_runs).strip()
             print(f"  📌 Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
             json_value = find_matching_json_value(red_text_only, flat_json)
             # NOTE(review): what happens when no match is found is not visible in this view; confirm against the full file.
             if json_value is None:
     return replacements_made
 def process_headings(document, flat_json):
+    """Process headings and their related content"""
     # Running count of replacements; this is the value returned at the end.
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
         # Paragraphs whose text is empty are skipped.
         if not paragraph_text:
             continue
+        # Check if this is a heading
         matched_heading = None
         # HEADING_PATTERNS is iterated as category -> iterable of patterns.
         for category, patterns in HEADING_PATTERNS.items():
             for pattern in patterns:
                 # The heading paragraph's own text is passed as the context argument.
                 heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
                 replacements_made += heading_replacements
+            # Look ahead for related content
             # Offsets 1..5: at most five paragraphs after the heading are inspected.
+            for next_para_offset in range(1, 6):
                 next_para_idx = para_idx + next_para_offset
                 # Stop at the end of the paragraph list.
                 if next_para_idx >= len(paragraphs):
                     break
                 # Stop the look-ahead when is_another_heading is truthy.
                 # NOTE(review): is_another_heading is computed in lines not visible here; confirm how it is derived.
                 if is_another_heading:
                     break
+                # Process red text with context
                 if has_red_text_in_paragraph(next_paragraph):
+                    print(f"    🔴 Found red text in paragraph {next_para_idx + 1} after heading")
                     context_replacements = process_red_text_in_paragraph(
                         next_paragraph,
     return replacements_made
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
+    """Process red text within a paragraph using context"""
     # Running count of replacements; this is the value returned at the end.
     replacements_made = 0
     red_text_segments = []
     # Matching runs in stages; each later stage runs only while json_value is still None.
     json_value = None
+    # Direct matching
     # NOTE(review): combined_red_text is built in lines not visible here; presumably from red_text_segments, confirm.
     json_value = find_matching_json_value(combined_red_text, flat_json)
+    # Context-based matching
     if json_value is None:
         # Case-insensitive test: the context is upper-cased before the substring check.
         if "NHVAS APPROVED AUDITOR" in context_text.upper():
             # Candidate field names for an auditor context (the loop that uses them is not visible here).
             auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
                     print(f"      ✅ Found operator match with field: '{field}'")
                     break
+    # Combined context queries
     if json_value is None:
         context_queries = [
             f"{context_text} {combined_red_text}",
         # Queries are tried in list order; the first one yielding a non-None value wins.
         for query in context_queries:
             json_value = find_matching_json_value(query, flat_json)
             if json_value is not None:
+                print(f"      ✅ Found match with combined query")
                 break
     # Replace if match found
     return replacements_made
 def force_red_text_replacement(document, flat_json):
+    """Force replacement of any remaining red text by trying ALL JSON values"""
     # Running count of replacements; this is the value returned at the end.
     replacements_made = 0
     print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
+    # Collect all possible replacement values from JSON
     all_values = {}
     for key, value in flat_json.items():
         # Falsy JSON values (None, empty string/list, 0, False) are skipped entirely.
         if value:
             value_str = get_value_as_string(value, key)
             # Only non-blank string renderings are kept, stored stripped under their JSON key.
             if value_str and isinstance(value_str, str) and value_str.strip():
                 all_values[key] = value_str.strip()
+                # Store individual items from lists for partial matching
                 if isinstance(value, list):
                     for i, item in enumerate(value):
                         # Falsy items become '' instead of being stringified.
                         item_str = str(item).strip() if item else ""
                     # Red fragments are joined with single spaces and trimmed before matching.
                     combined_red_text = " ".join(red_text_parts).strip()
                     print(f"        Red text: '{combined_red_text}'")
+                    # Find best match
                     best_match = None
                     best_key = None
+                    # Exact matching
                     # Case-insensitive equality; the first hit in all_values iteration order wins.
                     for key, value in all_values.items():
                         if combined_red_text.lower() == value.lower():
                             best_match = value
                             best_key = key
                             break
+                    # Partial matching
                     if not best_match:
                         for key, value in all_values.items():
                             # Case-insensitive containment in either direction; the contained string
                             # must be longer than 3 characters.
                             # NOTE(review): this takes the first hit, not the longest or closest one,
                             # so the result depends on all_values ordering.
                             if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
                                (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
                                 best_match = value
                                 best_key = key
                                 break
+                    # Word-by-word matching for names/dates
                     if not best_match:
                         # Only words longer than 2 characters are used, lower-cased and de-duplicated.
                         red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
                         best_score = 0
     return replacements_made
 def process_hf(json_file, docx_file, output_file):
+    """Main processing function with comprehensive error handling"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
         else:
             doc = Document(docx_file)
+        # Process document with all fixes
+        print("🚀 Starting comprehensive document processing...")
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
+        # Final force fix for any remaining red text
         force_replacements = force_red_text_replacement(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements