Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 20

Commit

412e2ed

verified ·

1 Parent(s): 5efc8a5

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +115 -372

updated_word.py CHANGED Viewed

@@ -3,7 +3,7 @@ from docx import Document
 from docx.shared import RGBColor
 import re
-# Enhanced heading patterns (ADDITIVE - keeps your existing ones)
 HEADING_PATTERNS = {
     "main": [
         r"NHVAS\s+Audit\s+Summary\s+Report",
@@ -61,7 +61,7 @@ def get_value_as_string(value, field_name=""):
         return str(value)
 def find_matching_json_value(field_name, flat_json):
-    """Enhanced dynamic matching without manual mappings"""
     field_name = field_name.strip()
     # Try exact match first
@@ -122,7 +122,7 @@ def find_matching_json_value(field_name, flat_json):
                 best_match = value
                 best_key = key
-    if best_match and best_score >= 0.25:  # Lowered threshold for better coverage
         print(f"    ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
         return best_match
@@ -144,7 +144,7 @@ def has_red_text(cell):
     return False
 def extract_red_text_segments(cell):
-    """Enhanced red text extraction with better multi-line handling"""
     red_segments = []
     for para_idx, paragraph in enumerate(cell.paragraphs):
@@ -178,7 +178,7 @@ def extract_red_text_segments(cell):
     return red_segments
 def replace_red_text_in_cell(cell, replacement_text):
-    """Enhanced cell replacement with improved multi-line handling"""
     red_segments = extract_red_text_segments(cell)
     if not red_segments:
@@ -197,7 +197,7 @@ def replace_red_text_in_cell(cell, replacement_text):
     return replace_all_red_segments(red_segments, replacement_text)
 def replace_all_red_segments(red_segments, replacement_text):
-    """Enhanced replacement with better line handling"""
     if not red_segments:
         return 0
@@ -244,55 +244,22 @@ def replace_all_red_segments(red_segments, replacement_text):
     return replacements_made
-def analyze_table_structure(table):
-    """NEW: Dynamic table structure analysis"""
-    structure = {
-        'type': 'unknown',
-        'orientation': 'unknown',
-        'has_headers': False,
-        'column_count': 0,
-        'row_count': 0,
-        'red_text_locations': []
-    }
-    if not table.rows:
-        return structure
-    structure['row_count'] = len(table.rows)
-    structure['column_count'] = len(table.rows[0].cells) if table.rows else 0
-    # Analyze first row for headers
-    first_row_text = []
-    for cell in table.rows[0].cells:
-        cell_text = get_clean_text(cell).strip()
-        first_row_text.append(cell_text)
-    # Detect table type based on content patterns
-    combined_text = " ".join(first_row_text).lower()
-    if any(indicator in combined_text for indicator in ["registration", "vehicle", "maintenance", "mass"]):
-        structure['type'] = 'vehicle_registration'
-    elif any(indicator in combined_text for indicator in ["print name", "position", "auditor", "operator"]):
-        structure['type'] = 'declaration'
-    elif any(indicator in combined_text for indicator in ["std", "standard", "compliance"]):
-        structure['type'] = 'compliance_matrix'
-    elif len(table.rows[0].cells) == 2 and not any(indicator in combined_text for indicator in ["no.", "number"]):
-        structure['type'] = 'key_value'
-    else:
-        structure['type'] = 'data_grid'
-    # Find red text locations
-    for row_idx, row in enumerate(table.rows):
-        for cell_idx, cell in enumerate(row.cells):
-            if has_red_text(cell):
-                structure['red_text_locations'].append((row_idx, cell_idx))
-    structure['has_headers'] = len(structure['red_text_locations']) > 0 and (0, 0) not in structure['red_text_locations']
-    return structure
 def handle_multiple_red_segments_in_cell(cell, flat_json):
-    """Enhanced multi-segment handling"""
     red_segments = extract_red_text_segments(cell)
     if not red_segments:
@@ -340,251 +307,110 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
     return replacements_made
-def replace_single_segment(segment, replacement_text):
-    """Enhanced single segment replacement"""
-    if not segment['runs']:
-        return False
-    first_run = segment['runs'][0][2]
-    first_run.text = replacement_text
-    first_run.font.color.rgb = RGBColor(0, 0, 0)
-    for _, _, run in segment['runs'][1:]:
-        run.text = ''
-    return True
-def detect_table_type(table):
-    """Enhanced table type detection"""
-    structure = analyze_table_structure(table)
-    return structure['type']
-def try_context_based_replacement(cell, row, table, flat_json):
-    """Enhanced context-based replacement"""
     replacements_made = 0
-    row_context = ""
-    if len(row.cells) > 1:
-        first_cell_text = get_clean_text(row.cells[0]).strip()
-        if first_cell_text and not has_red_text(row.cells[0]):
-            row_context = first_cell_text
-    red_segments = extract_red_text_segments(cell)
     for segment in red_segments:
-        red_text = segment['text'].strip()
-        if not red_text:
             continue
-        if row_context:
-            context_queries = [
-                f"{row_context} {red_text}",
-                f"{row_context}",
-                red_text
-            ]
-            for query in context_queries:
-                json_value = find_matching_json_value(query, flat_json)
-                if json_value is not None:
-                    replacement_text = get_value_as_string(json_value, query)
-                    success = replace_single_segment(segment, replacement_text)
-                    if success:
-                        replacements_made += 1
-                        print(f"      ✅ Context-based replacement: '{query}' -> '{replacement_text[:30]}...'")
-                        break
-    return replacements_made
-def smart_fallback_processor(element, flat_json):
-    """NEW: Smart fallback for missed red text"""
-    replacements_made = 0
-    # Check if element has red text that wasn't processed
-    if hasattr(element, 'paragraphs'):
-        for paragraph in element.paragraphs:
-            for run in paragraph.runs:
-                if is_red(run) and run.text.strip():
-                    # Try advanced pattern matching
-                    red_text = run.text.strip()
-                    # Try semantic matching
-                    json_value = semantic_text_matching(red_text, flat_json)
-                    if json_value:
-                        replacement_text = get_value_as_string(json_value, red_text)
-                        run.text = replacement_text
-                        run.font.color.rgb = RGBColor(0, 0, 0)
-                        replacements_made += 1
-                        print(f"      🎯 Fallback match: '{red_text}' -> '{replacement_text[:30]}...'")
-    return replacements_made
-def semantic_text_matching(text, flat_json):
-    """NEW: Advanced semantic matching for edge cases"""
-    text_lower = text.lower().strip()
-    # Common semantic patterns
-    semantic_patterns = {
-        'name': ['name', 'manager', 'operator', 'auditor', 'driver'],
-        'date': ['date', 'expiry', 'conducted', 'completed'],
-        'address': ['address', 'location', 'road', 'street'],
-        'number': ['number', 'registration', 'phone', 'telephone'],
-        'email': ['email', 'mail'],
-        'position': ['position', 'title', 'role']
-    }
-    # Find semantic category
-    for category, keywords in semantic_patterns.items():
-        if any(keyword in text_lower for keyword in keywords):
-            # Look for JSON keys in this semantic category
-            for key, value in flat_json.items():
-                key_lower = key.lower()
-                if any(keyword in key_lower for keyword in keywords):
-                    return value
-    return None
-def handle_nature_of_business_section(table, flat_json):
-    """TARGETED FIX for Issue 1: Nature of Business multi-line and sub-fields"""
-    replacements_made = 0
-    for row_idx, row in enumerate(table.rows):
-        if len(row.cells) >= 1:
-            cell = row.cells[0]
-            cell_text = get_clean_text(cell).lower()
-            # Check if this is the "Nature of the Operators Business" section
-            if "nature of the operators business" in cell_text or "nature of the operator business" in cell_text:
-                print(f"    🎯 Found Nature of Business section in row {row_idx + 1}")
-                # Handle main business description (multi-line red text)
-                if has_red_text(cell):
-                    # Try to find business description in JSON
-                    business_desc_keys = [
-                        "nature of the operators business",
-                        "business description",
-                        "operator business summary",
-                        "business summary"
-                    ]
-                    business_value = None
-                    for key in business_desc_keys:
-                        business_value = find_matching_json_value(key, flat_json)
-                        if business_value:
-                            break
-                    if business_value:
-                        business_text = get_value_as_string(business_value)
-                        cell_replacements = replace_red_text_in_cell(cell, business_text)
-                        replacements_made += cell_replacements
-                        print(f"      ✅ Updated main business description")
-                # Look for sub-fields in the next few rows
-                for sub_row_idx in range(row_idx + 1, min(row_idx + 4, len(table.rows))):
-                    sub_row = table.rows[sub_row_idx]
-                    if len(sub_row.cells) >= 1:
-                        sub_cell = sub_row.cells[0]
-                        sub_text = get_clean_text(sub_cell).lower()
-                        # Handle Accreditation Number
-                        if "accreditation number" in sub_text and has_red_text(sub_cell):
-                            accred_value = find_matching_json_value("accreditation number", flat_json)
-                            if not accred_value:
-                                accred_value = find_matching_json_value("nhvas accreditation no", flat_json)
-                            if accred_value:
-                                accred_text = get_value_as_string(accred_value)
-                                cell_replacements = replace_red_text_in_cell(sub_cell, accred_text)
-                                replacements_made += cell_replacements
-                                print(f"      ✅ Updated Accreditation Number: {accred_text}")
-                        # Handle Expiry Date
-                        elif "expiry date" in sub_text and has_red_text(sub_cell):
-                            expiry_value = find_matching_json_value("expiry date", flat_json)
-                            if not expiry_value:
-                                expiry_value = find_matching_json_value("accreditation expiry", flat_json)
-                            if expiry_value:
-                                expiry_text = get_value_as_string(expiry_value)
-                                cell_replacements = replace_red_text_in_cell(sub_cell, expiry_text)
-                                replacements_made += cell_replacements
-                                print(f"      ✅ Updated Expiry Date: {expiry_text}")
-                break  # Found the section, no need to continue
     return replacements_made
-def handle_operator_declaration_table(table, flat_json):
-    """TARGETED FIX for Issue 2: Operator Declaration Print Name and Position Title"""
     replacements_made = 0
     for row_idx, row in enumerate(table.rows):
         if len(row.cells) >= 2:
-            cell1_text = get_clean_text(row.cells[0]).lower()
-            cell2_text = get_clean_text(row.cells[1]).lower()
-            # Check if this is the header row with "Print Name" and "Position Title"
-            if "print name" in cell1_text and ("position title" in cell2_text or "position" in cell2_text):
-                print(f"    🎯 Found Operator Declaration header row {row_idx + 1}")
-                # Look for the data row (next row with red text)
                 if row_idx + 1 < len(table.rows):
                     data_row = table.rows[row_idx + 1]
                     if len(data_row.cells) >= 2:
                         name_cell = data_row.cells[0]
                         position_cell = data_row.cells[1]
-                        # Handle Print Name (first column)
                         if has_red_text(name_cell):
-                            # Try to find operator name
-                            name_keys = [
-                                "operator name",
-                                "print name",
-                                "legal entity",
-                                "operator"
-                            ]
-                            name_value = None
-                            for key in name_keys:
-                                name_value = find_matching_json_value(key, flat_json)
-                                if name_value:
-                                    break
-                            if name_value:
-                                name_text = get_value_as_string(name_value)
-                                cell_replacements = replace_red_text_in_cell(name_cell, name_text)
-                                replacements_made += cell_replacements
-                                print(f"      ✅ Updated Print Name: {name_text}")
-                        # Handle Position Title (second column)
                         if has_red_text(position_cell):
-                            # Try to find position/title
-                            position_keys = [
-                                "position title",
-                                "position",
-                                "title",
-                                "job title",
-                                "role"
-                            ]
-                            position_value = None
-                            for key in position_keys:
-                                position_value = find_matching_json_value(key, flat_json)
-                                if position_value:
-                                    break
-                            # If no specific position found, default to "Manager"
-                            if not position_value:
-                                position_value = "Manager"
-                            position_text = get_value_as_string(position_value)
-                            cell_replacements = replace_red_text_in_cell(position_cell, position_text)
-                            replacements_made += cell_replacements
-                            print(f"      ✅ Updated Position Title: {position_text}")
-                break  # Found the section, no need to continue
     return replacements_made
 def handle_australian_company_number(row, company_numbers):
-    """Enhanced ACN handling"""
     replacements_made = 0
     for i, digit in enumerate(company_numbers):
         cell_idx = i + 1
@@ -597,7 +423,7 @@ def handle_australian_company_number(row, company_numbers):
     return replacements_made
 def handle_vehicle_registration_table(table, flat_json):
-    """Enhanced vehicle registration table handling"""
     replacements_made = 0
     # Try to find vehicle registration data
@@ -740,7 +566,7 @@ def handle_vehicle_registration_table(table, flat_json):
     return replacements_made
 def handle_print_accreditation_section(table, flat_json):
-    """Enhanced print accreditation handling"""
     replacements_made = 0
     print_data = flat_json.get("print accreditation name.print accreditation name", [])
@@ -780,7 +606,7 @@ def handle_print_accreditation_section(table, flat_json):
     return replacements_made
 def process_single_column_sections(cell, field_name, flat_json):
-    """Enhanced single column processing"""
     json_value = find_matching_json_value(field_name, flat_json)
     if json_value is not None:
         replacement_text = get_value_as_string(json_value, field_name)
@@ -796,17 +622,13 @@ def process_single_column_sections(cell, field_name, flat_json):
     return 0
 def process_tables(document, flat_json):
-    """ENHANCED: Your existing function + smart enhancements"""
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
-        # ENHANCED: Dynamic table analysis
-        table_structure = analyze_table_structure(table)
-        print(f"    📊 Table structure: {table_structure['type']} ({table_structure['row_count']}x{table_structure['column_count']})")
-        # Your existing logic with enhancements
         table_text = ""
         for row in table.rows[:3]:
             for cell in row.cells:
@@ -815,7 +637,7 @@ def process_tables(document, flat_json):
         # Enhanced vehicle registration detection
         vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
         indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
-        if indicator_count >= 2 or table_structure['type'] == 'vehicle_registration':  # Lowered threshold
             print(f"    🚗 Detected Vehicle Registration table")
             vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
             replacements_made += vehicle_replacements
@@ -824,28 +646,13 @@ def process_tables(document, flat_json):
         # Enhanced print accreditation detection
         print_accreditation_indicators = ["print name", "position title"]
         indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
-        if indicator_count >= 1 or table_structure['type'] == 'declaration':  # Lowered threshold
             print(f"    📋 Detected Print Accreditation table")
             print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
             replacements_made += print_accreditation_replacements
             continue
-        # 🎯 NEW: TARGETED FIX 1 - Nature of Business section
-        if "nature of the operator" in table_text:
-            print(f"    🎯 Detected Nature of Business section")
-            nature_replacements = handle_nature_of_business_section(table, flat_json)
-            replacements_made += nature_replacements
-            # Don't continue - let it fall through to regular processing too
-        # 🎯 NEW: TARGETED FIX 2 - Operator Declaration table
-        if "print name" in table_text and "position title" in table_text:
-            print(f"    🎯 Detected Operator Declaration table")
-            declaration_replacements = handle_operator_declaration_table(table, flat_json)
-            replacements_made += declaration_replacements
-            # Don't continue - let it fall through to regular processing too
-        # Your existing row processing with enhancements
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
@@ -916,20 +723,22 @@ def process_tables(document, flat_json):
                         cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                         replacements_made += cell_replacements
-                        # ENHANCED: Fallback for still unmatched red text
                         if cell_replacements == 0:
-                            context_replacements = try_context_based_replacement(cell, row, table, flat_json)
-                            replacements_made += context_replacements
-                            # ENHANCED: Smart fallback processor
-                            if context_replacements == 0:
-                                fallback_replacements = smart_fallback_processor(cell, flat_json)
-                                replacements_made += fallback_replacements
     return replacements_made
 def process_paragraphs(document, flat_json):
-    """ENHANCED: Your existing function + smart fallbacks"""
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
@@ -949,9 +758,6 @@ def process_paragraphs(document, flat_json):
                     json_value = find_matching_json_value("auditor signature", flat_json)
                 elif "OPERATOR SIGNATURE" in red_text_only.upper():
                     json_value = find_matching_json_value("operator signature", flat_json)
-                # ENHANCED: Try semantic matching
-                elif json_value is None:
-                    json_value = semantic_text_matching(red_text_only, flat_json)
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value)
@@ -961,15 +767,11 @@ def process_paragraphs(document, flat_json):
                 for run in red_runs[1:]:
                     run.text = ''
                 replacements_made += 1
-            else:
-                # ENHANCED: Try smart fallback
-                fallback_replacements = smart_fallback_processor(paragraph, flat_json)
-                replacements_made += fallback_replacements
     return replacements_made
 def process_headings(document, flat_json):
-    """ENHANCED: Your existing function + comprehensive coverage"""
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
@@ -1035,23 +837,18 @@ def process_headings(document, flat_json):
                         flat_json
                     )
                     replacements_made += context_replacements
-                    # ENHANCED: Smart fallback if still no match
-                    if context_replacements == 0:
-                        fallback_replacements = smart_fallback_processor(next_paragraph, flat_json)
-                        replacements_made += fallback_replacements
     return replacements_made
 def has_red_text_in_paragraph(paragraph):
-    """Enhanced paragraph red text detection"""
     for run in paragraph.runs:
         if is_red(run) and run.text.strip():
             return True
     return False
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
-    """ENHANCED: Your existing function + smarter matching"""
     replacements_made = 0
     red_text_segments = []
@@ -1102,12 +899,6 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
                 print(f"      ✅ Found match with combined query: '{query[:50]}...'")
                 break
-    # ENHANCED: Strategy 4: Semantic matching
-    if json_value is None:
-        json_value = semantic_text_matching(combined_red_text, flat_json)
-        if json_value:
-            print(f"      ✅ Found semantic match for: '{combined_red_text}'")
     # Replace if match found
     if json_value is not None:
         replacement_text = get_value_as_string(json_value, combined_red_text)
@@ -1127,51 +918,8 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     return replacements_made
-def comprehensive_document_scan(document, flat_json):
-    """NEW: Final comprehensive scan for any missed red text"""
-    print(f"\n🔍 Comprehensive final scan for missed red text:")
-    replacements_made = 0
-    # Scan all elements in document
-    for element in document.element.body:
-        # Check tables
-        if element.tag.endswith('tbl'):
-            table_obj = None
-            for table in document.tables:
-                if table._element == element:
-                    table_obj = table
-                    break
-            if table_obj:
-                for row in table_obj.rows:
-                    for cell in row.cells:
-                        if has_red_text(cell):
-                            # Try one more time with enhanced fallback
-                            cell_replacements = smart_fallback_processor(cell, flat_json)
-                            replacements_made += cell_replacements
-        # Check paragraphs
-        elif element.tag.endswith('p'):
-            paragraph_obj = None
-            for para in document.paragraphs:
-                if para._element == element:
-                    paragraph_obj = para
-                    break
-            if paragraph_obj and has_red_text_in_paragraph(paragraph_obj):
-                # Try enhanced fallback
-                para_replacements = smart_fallback_processor(paragraph_obj, flat_json)
-                replacements_made += para_replacements
-    if replacements_made > 0:
-        print(f"    ✅ Final scan caught {replacements_made} additional replacements!")
-    else:
-        print(f"    ✅ No additional red text found - document fully processed!")
-    return replacements_made
 def process_hf(json_file, docx_file, output_file):
-    """ENHANCED: Your existing main function + comprehensive processing"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
@@ -1193,18 +941,14 @@ def process_hf(json_file, docx_file, output_file):
         else:
             doc = Document(docx_file)
-        # ENHANCED: Multi-pass processing for 100% coverage
-        print("🚀 Starting enhanced multi-pass processing...")
-        # Pass 1: Your existing processors (enhanced)
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
-        # Pass 2: NEW - Comprehensive final scan
-        final_scan_replacements = comprehensive_document_scan(doc, flat_json)
-        total_replacements = table_replacements + paragraph_replacements + heading_replacements + final_scan_replacements
         # Save output
         if hasattr(output_file, "write"):
@@ -1217,8 +961,7 @@ def process_hf(json_file, docx_file, output_file):
         print(f"   📊 Tables: {table_replacements}")
         print(f"   📝 Paragraphs: {paragraph_replacements}")
         print(f"   📋 Headings: {heading_replacements}")
-        print(f"   🎯 Final scan: {final_scan_replacements}")
-        print(f"🎉 Processing complete with enhanced coverage!")
     except FileNotFoundError as e:
         print(f"❌ File not found: {e}")
@@ -1230,7 +973,7 @@ def process_hf(json_file, docx_file, output_file):
 if __name__ == "__main__":
     import sys
     if len(sys.argv) != 4:
-        print("Usage: python enhanced_pipeline.py <input_docx> <updated_json> <output_docx>")
         exit(1)
     docx_path = sys.argv[1]
     json_path = sys.argv[2]

 from docx.shared import RGBColor
 import re
+# Regex patterns used to recognise report headings, keyed by category (e.g. "main").
 HEADING_PATTERNS = {
     "main": [
         r"NHVAS\s+Audit\s+Summary\s+Report",
         return str(value)
 def find_matching_json_value(field_name, flat_json):
+    """Your original matching function (unchanged)"""
     field_name = field_name.strip()
     # Try exact match first
                 best_match = value
                 best_key = key
+    if best_match and best_score >= 0.25:
         print(f"    ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
         return best_match
     return False
 def extract_red_text_segments(cell):
+    """Your original red text extraction (unchanged)"""
     red_segments = []
     for para_idx, paragraph in enumerate(cell.paragraphs):
     return red_segments
 def replace_red_text_in_cell(cell, replacement_text):
+    """Your original replacement function (unchanged)"""
     red_segments = extract_red_text_segments(cell)
     if not red_segments:
     return replace_all_red_segments(red_segments, replacement_text)
 def replace_all_red_segments(red_segments, replacement_text):
+    """Your original function (unchanged)"""
     if not red_segments:
         return 0
     return replacements_made
def replace_single_segment(segment, replacement_text):
    """Write ``replacement_text`` into a single red-text segment.

    The first run of the segment receives the replacement text and is
    recoloured to black (RGB 0, 0, 0); every remaining run of the segment
    is blanked out.

    Args:
        segment: Mapping with a ``'runs'`` sequence. Each entry is a
            three-item sequence whose last item is the run object (the
            first two items are not used here).
        replacement_text: Text to store in the segment's first run.

    Returns:
        ``False`` when the segment has no runs, ``True`` otherwise.
    """
    run_entries = segment['runs']
    if not run_entries:
        return False

    # The leading run carries the whole replacement and loses its red colour.
    head_run = run_entries[0][2]
    head_run.text = replacement_text
    head_run.font.color.rgb = RGBColor(0, 0, 0)

    # All following runs are emptied so the old text does not linger.
    for entry in run_entries[1:]:
        _, _, leftover_run = entry
        leftover_run.text = ''

    return True
 def handle_multiple_red_segments_in_cell(cell, flat_json):
+    """Your original function (unchanged)"""
     red_segments = extract_red_text_segments(cell)
     if not red_segments:
     return replacements_made
+# 🎯 SURGICAL FIX 1: Handle Nature of Business multi-line red text
def handle_nature_business_multiline_fix(cell, flat_json):
    """Replace multi-line red text in a "Nature of the Operators Business" cell.

    The cell is only processed when it contains red text and its cleaned,
    lower-cased text mentions one of the two accepted label spellings.
    Each red segment is first matched against ``flat_json`` on its own;
    if none of them could be replaced, the texts of all segments are joined
    with spaces and matched once as a whole.

    Args:
        cell: Table cell to inspect.
        flat_json: Flattened JSON data passed through to
            ``find_matching_json_value``.

    Returns:
        Number of replacements made: the count of individually replaced
        segments, or, for the combined fallback, whatever
        ``replace_all_red_segments`` returns. ``0`` if the cell does not
        qualify or nothing matched.
    """
    if not has_red_text(cell):
        return 0

    # Accept both the plural and singular spelling of the label.
    lowered = get_clean_text(cell).lower()
    label_variants = (
        "nature of the operators business",
        "nature of the operator business",
    )
    if not any(variant in lowered for variant in label_variants):
        return 0

    print("    🎯 SURGICAL FIX: Nature of Business multi-line processing")

    segments = extract_red_text_segments(cell)
    fixed = 0

    # Pass 1: try to resolve every red segment independently.
    for seg in segments:
        needle = seg['text'].strip()
        if not needle:
            continue
        match = find_matching_json_value(needle, flat_json)
        if match is None:
            continue
        if replace_single_segment(seg, get_value_as_string(match, needle)):
            fixed += 1
            print(f"        ✅ Fixed segment: '{needle[:30]}...'")

    # The combined fallback only runs when pass 1 replaced nothing
    # and there is at least one segment to combine.
    if fixed or not segments:
        return fixed

    # Pass 2: treat all red segments as one piece of text.
    joined = " ".join(s['text'] for s in segments).strip()
    match = find_matching_json_value(joined, flat_json)
    if match is not None:
        fixed = replace_all_red_segments(segments, get_value_as_string(match, joined))
        print("        ✅ Fixed combined text")

    return fixed
+# 🎯 SURGICAL FIX 2: Handle Operator Declaration table
def _replace_declaration_cell(cell, label, flat_json):
    """Replace the red text of one declaration data cell; return the replacement count."""
    if not has_red_text(cell):
        return 0

    # Collect the text of every red run in the cell, in document order.
    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    ).strip()
    if not red_text:
        return 0

    json_value = find_matching_json_value(red_text, flat_json)
    if json_value is None:
        return 0

    replacement_text = get_value_as_string(json_value)
    replaced = replace_red_text_in_cell(cell, replacement_text)
    print(f"        ✅ Fixed {label}: '{replacement_text}'")
    return replaced


def handle_operator_declaration_fix(table, flat_json):
    """Fill in Print Name and Position Title in an Operator Declaration table.

    A table qualifies when it has at most four rows and contains a row whose
    first cell text includes "print name" and whose second cell text includes
    "position title" (case-insensitive substring match). The row directly
    below that header row is treated as the data row; red text in its first
    two cells is looked up in ``flat_json`` and replaced.

    Args:
        table: Table object exposing ``rows``; each row exposes ``cells``.
        flat_json: Flattened JSON data passed through to
            ``find_matching_json_value``.

    Returns:
        Total number of replacements reported by ``replace_red_text_in_cell``
        for the two data cells, or ``0`` if the table does not qualify.
    """
    rows = table.rows

    # Only small tables can be a declaration table, so larger ones are
    # rejected once up front instead of re-checking the size on every row.
    if len(rows) > 4:
        return 0

    replacements_made = 0
    for row_idx, row in enumerate(rows):
        if len(row.cells) < 2:
            continue

        header_left = get_clean_text(row.cells[0]).strip().lower()
        header_right = get_clean_text(row.cells[1]).strip().lower()
        if "print name" not in header_left or "position title" not in header_right:
            continue

        print("    🎯 SURGICAL FIX: Operator Declaration table detected")

        # The values live in the row immediately after the header row.
        if row_idx + 1 < len(rows):
            data_cells = rows[row_idx + 1].cells
            if len(data_cells) >= 2:
                name_cell, position_cell = data_cells[0], data_cells[1]
                replacements_made += _replace_declaration_cell(
                    name_cell, "Print Name", flat_json
                )
                replacements_made += _replace_declaration_cell(
                    position_cell, "Position Title", flat_json
                )

        break  # Header row found; no further rows need checking.

    return replacements_made
 def handle_australian_company_number(row, company_numbers):
+    """Your original function (unchanged)"""
     replacements_made = 0
     for i, digit in enumerate(company_numbers):
         cell_idx = i + 1
     return replacements_made
 def handle_vehicle_registration_table(table, flat_json):
+    """Your original function (unchanged)"""
     replacements_made = 0
     # Try to find vehicle registration data
     return replacements_made
 def handle_print_accreditation_section(table, flat_json):
+    """Your original function (unchanged)"""
     replacements_made = 0
     print_data = flat_json.get("print accreditation name.print accreditation name", [])
     return replacements_made
 def process_single_column_sections(cell, field_name, flat_json):
+    """Your original function (unchanged)"""
     json_value = find_matching_json_value(field_name, flat_json)
     if json_value is not None:
         replacement_text = get_value_as_string(json_value, field_name)
     return 0
 def process_tables(document, flat_json):
+    """Your original function with minimal surgical fixes added"""
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
+        # Your original logic
         table_text = ""
         for row in table.rows[:3]:
             for cell in row.cells:
         # Enhanced vehicle registration detection
         vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
         indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
+        if indicator_count >= 2:
             print(f"    🚗 Detected Vehicle Registration table")
             vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
             replacements_made += vehicle_replacements
         # Enhanced print accreditation detection
         print_accreditation_indicators = ["print name", "position title"]
         indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
+        if indicator_count >= 1:
             print(f"    📋 Detected Print Accreditation table")
             print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
             replacements_made += print_accreditation_replacements
             continue
+        # Your existing row processing
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
                         cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                         replacements_made += cell_replacements
+                        # 🎯 SURGICAL FIX 1: Only if no replacements were made
                         if cell_replacements == 0:
+                            surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
+                            replacements_made += surgical_fix
+    # 🎯 SURGICAL FIX 2: Handle Operator Declaration tables (only check last few tables)
+    print(f"\n🎯 SURGICAL FIX: Checking for Operator Declaration tables...")
+    for table in document.tables[-3:]:  # Only check last 3 tables
+        if len(table.rows) <= 4:  # Only small tables
+            declaration_fix = handle_operator_declaration_fix(table, flat_json)
+            replacements_made += declaration_fix
     return replacements_made
 def process_paragraphs(document, flat_json):
+    """Your original function (unchanged)"""
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
                     json_value = find_matching_json_value("auditor signature", flat_json)
                 elif "OPERATOR SIGNATURE" in red_text_only.upper():
                     json_value = find_matching_json_value("operator signature", flat_json)
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value)
                 for run in red_runs[1:]:
                     run.text = ''
                 replacements_made += 1
     return replacements_made
 def process_headings(document, flat_json):
+    """Your original function (unchanged)"""
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
                         flat_json
                     )
                     replacements_made += context_replacements
     return replacements_made
 def has_red_text_in_paragraph(paragraph):
     """Report whether the paragraph has at least one red run with visible text.

     A run counts when ``is_red`` accepts it and its text is non-empty after
     stripping whitespace.

     Args:
         paragraph: Object exposing an iterable ``runs`` attribute.

     Returns:
         ``True`` if such a run exists, ``False`` otherwise.
     """
     return any(is_red(run) and run.text.strip() for run in paragraph.runs)
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
+    """Your original function (unchanged)"""
     replacements_made = 0
     red_text_segments = []
                 print(f"      ✅ Found match with combined query: '{query[:50]}...'")
                 break
     # Replace if match found
     if json_value is not None:
         replacement_text = get_value_as_string(json_value, combined_red_text)
     return replacements_made
 def process_hf(json_file, docx_file, output_file):
+    """Your original main function (unchanged)"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
         else:
             doc = Document(docx_file)
+        # Your original processing
+        print("🚀 Starting processing with surgical fixes...")
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
+        total_replacements = table_replacements + paragraph_replacements + heading_replacements
         # Save output
         if hasattr(output_file, "write"):
         print(f"   📊 Tables: {table_replacements}")
         print(f"   📝 Paragraphs: {paragraph_replacements}")
         print(f"   📋 Headings: {heading_replacements}")
+        print(f"🎉 Processing complete!")
     except FileNotFoundError as e:
         print(f"❌ File not found: {e}")
 if __name__ == "__main__":
     import sys
     if len(sys.argv) != 4:
+        print("Usage: python pipeline.py <input_docx> <updated_json> <output_docx>")
         exit(1)
     docx_path = sys.argv[1]
     json_path = sys.argv[2]