Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 52,775 Bytes

import json
from docx import Document
from docx.shared import RGBColor
import re

# Regular-expression fragments for report headings, grouped into two lists:
# "main" (report title variants) and "sub" (section heading variants).
# Words are separated by \s+ so any run of whitespace between them matches.
# Some entries are written in upper case and others in mixed case.
# NOTE(review): no matching flags are set here; whether these are applied
# case-insensitively depends on the code that compiles/uses them - confirm there.
HEADING_PATTERNS = {
    "main": [
        r"NHVAS\s+Audit\s+Summary\s+Report",
        r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
        r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT"
    ],
    "sub": [
        r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
        r"MAINTENANCE\s+MANAGEMENT",
        r"MASS\s+MANAGEMENT",
        r"FATIGUE\s+MANAGEMENT",
        r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
        r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
        r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
        r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
        r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
        r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
        r"Operator\s+Declaration",
        r"Operator\s+Information",
        # Optional whitespace around the slash: "Driver/Scheduler" or "Driver / Scheduler".
        r"Driver\s*/\s*Scheduler\s+Records\s+Examined"
    ]
}

def load_json(filepath):
    """Read a JSON document from disk and return the parsed object.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII content parses the same way on every
    operating system.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def flatten_json(y, prefix=''):
    """Flatten a nested dict into a single-level dict.

    Every non-dict value is stored twice: once under its full dotted path
    (``"section.field"``) and once under its bare key (``"field"``). Nested
    dicts are expanded recursively and are not stored themselves. When two
    bare keys collide, the one visited later overwrites the earlier value.
    """
    flat = {}
    for name, item in y.items():
        dotted = name if not prefix else f"{prefix}.{name}"
        if not isinstance(item, dict):
            flat[dotted] = item
            flat[name] = item
            continue
        flat.update(flatten_json(item, dotted))
    return flat

def is_red(run):
    """Tell whether a run's font colour counts as red.

    A run is red when its RGB colour equals (255, 0, 0) or when its colour
    object exposes a ``theme_color`` equal to 1. If the run has no colour
    object (falsy), that falsy object is returned unchanged.
    """
    color = run.font.color
    if not color:
        return color
    if color.rgb == RGBColor(255, 0, 0):
        return True
    # NOTE(review): 1 is compared against theme_color as a raw number; which
    # theme colour that denotes is not visible here - confirm.
    return getattr(color, "theme_color", None) == 1

def get_value_as_string(value, field_name=""):
    """Render a JSON value for insertion into the document.

    Non-list values are passed through ``str``. An empty list becomes ``""``
    and a one-item list becomes that item as a string. A longer list is
    joined with single spaces, except when ``field_name`` mentions a company
    number: then the list itself is returned (not a string).
    """
    if not isinstance(value, list):
        return str(value)
    if not value:
        return ""
    if len(value) == 1:
        return str(value[0])

    label = field_name.lower()
    if any(marker in label for marker in ("australian company number", "company number")):
        return value
    return " ".join(map(str, value))

def find_matching_json_value(field_name, flat_json):
    """Find the JSON value that best corresponds to a field label.

    Strategies are tried in this order and the first hit is returned:
    exact key, case-insensitive key, the special "print name" lookup
    (operator keys before auditor keys), last dotted component of a key,
    punctuation/whitespace-normalised equality, and finally a word-overlap
    score. Returns ``None`` when nothing qualifies. Each outcome except the
    "no usable words" early exit is reported with ``print``.
    """

    def _squash(text):
        # Lower-case, replace punctuation with spaces, collapse whitespace runs.
        spaced = re.sub(r'[^\w\s]', ' ', text.lower()).strip()
        return re.sub(r'\s+', ' ', spaced)

    def _long_words(text):
        # Lower-cased words of more than two characters.
        return {token.lower() for token in re.findall(r'\b\w+\b', text) if len(token) > 2}

    field_name = field_name.strip()
    wanted = field_name.lower()

    # 1) Exact key.
    if field_name in flat_json:
        print(f"    ✅ Direct match found for key '{field_name}'")
        return flat_json[field_name]

    # 2) Same key ignoring case.
    for json_key, json_value in flat_json.items():
        if json_key.lower() == wanted:
            print(f"    ✅ Case-insensitive match found for key '{field_name}' with JSON key '{json_key}'")
            return json_value

    # 3) "Print Name" is ambiguous between operator and auditor sections;
    #    operator-specific keys win when present.
    if wanted.strip() == "print name":
        lowered = [(k, k.lower()) for k in flat_json.keys()]
        operator_keys = [k for k, low in lowered if "operator" in low and "print name" in low]
        auditor_keys = [k for k, low in lowered if "auditor" in low and ("print name" in low or "name" in low)]

        if operator_keys:
            print(f"    ✅ Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
            return flat_json[operator_keys[0]]
        if auditor_keys:
            print(f"    ✅ Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
            return flat_json[auditor_keys[0]]

    # 4) Last component of a dotted key.
    for json_key, json_value in flat_json.items():
        if '.' not in json_key:
            continue
        if json_key.split('.')[-1].lower() == wanted:
            print(f"    ✅ Suffix match found for key '{field_name}' with JSON key '{json_key}'")
            return json_value

    # 5) Equality after stripping punctuation and extra whitespace.
    squashed_field = _squash(field_name)
    for json_key, json_value in flat_json.items():
        if _squash(json_key) == squashed_field:
            print(f"    ✅ Clean match found for key '{field_name}' with JSON key '{json_key}'")
            return json_value

    # 6) Word-overlap score; silent exit when the label has no usable words.
    field_words = _long_words(field_name)
    if not field_words:
        return None

    best_match, best_score, best_key = None, 0, None
    for json_key, json_value in flat_json.items():
        key_words = _long_words(json_key)
        shared = field_words & key_words
        if not shared:
            continue

        # Jaccard similarity blended with how much of the label is covered.
        similarity = len(shared) / len(field_words | key_words)
        coverage = len(shared) / len(field_words)
        final_score = (similarity * 0.6) + (coverage * 0.4)

        # Strict comparison: on a tie the earlier key is kept.
        if final_score > best_score:
            best_match, best_score, best_key = json_value, final_score, json_key

    # A falsy best value (e.g. "") is treated as no match.
    if best_match and best_score >= 0.25:
        print(f"    ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
        return best_match

    print(f"    ❌ No match found for '{field_name}'")
    return None

def get_clean_text(cell):
    """Return the text of every run in the cell, concatenated and stripped.

    Runs are joined in document order with no separator, including across
    paragraph boundaries; only the outer whitespace is removed.
    """
    pieces = [run.text for paragraph in cell.paragraphs for run in paragraph.runs]
    return "".join(pieces).strip()

def has_red_text(cell):
    """Return True if any run in the cell is red and has non-blank text."""
    return any(
        is_red(run) and run.text.strip()
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    )

def extract_red_text_segments(cell):
    """Group consecutive red runs of each paragraph into segments.

    Returns a list of dicts with keys ``'text'`` (the concatenated run text),
    ``'runs'`` (a list of ``(paragraph_index, run_index, run)`` tuples) and
    ``'paragraph_idx'``. A segment never spans two paragraphs, and a non-red
    run closes the segment in progress.
    """
    segments = []

    def _close(text, runs, para_idx):
        segments.append({
            'text': text,
            'runs': runs,
            'paragraph_idx': para_idx
        })

    for para_idx, paragraph in enumerate(cell.paragraphs):
        pending_text = ""
        pending_runs = []

        for run_idx, run in enumerate(paragraph.runs):
            if not is_red(run):
                # A non-red run terminates whatever red stretch preceded it.
                if pending_runs:
                    _close(pending_text, pending_runs, para_idx)
                    pending_text, pending_runs = "", []
                continue

            if run.text:
                pending_text += run.text
            pending_runs.append((para_idx, run_idx, run))

        # A red stretch that reaches the end of the paragraph.
        if pending_runs:
            _close(pending_text, pending_runs, para_idx)

    return segments

def replace_red_text_in_cell(cell, replacement_text):
    """Replace all red text in a cell with ``replacement_text``.

    The red segments of the cell are collected and handed to
    ``replace_all_red_segments`` as a group. (An earlier per-segment loop
    did nothing and always fell through to this same call; it has been
    removed without changing the result.)

    Args:
        cell: Table cell whose red runs should be replaced.
        replacement_text: Text to write in place of the red text.

    Returns:
        The value returned by ``replace_all_red_segments``, or 0 when the
        cell contains no red segments.
    """
    red_segments = extract_red_text_segments(cell)
    if not red_segments:
        return 0
    return replace_all_red_segments(red_segments, replacement_text)

def replace_all_red_segments(red_segments, replacement_text):
    """Collapse all red segments into a single black replacement.

    The first run of the first segment receives the first line of
    ``replacement_text`` and is recoloured black; every other run in every
    segment is emptied. Further non-blank lines of a multi-line replacement
    are appended to that same run, each preceded by a line break.

    Previously the extra lines were added through
    ``first_run.element.getparent().add_run(...)``. That parent is the raw
    XML element, which has no ``add_run``, so the call always raised and a
    bare ``except`` silently flattened the text onto one line. The lines are
    now written with the run's own ``add_break``/``add_text`` methods. The
    space-joined fallback is kept as a best effort if that still fails.

    Args:
        red_segments: Segments as produced by ``extract_red_text_segments``.
        replacement_text: Replacement string; ``"\\n"`` separates lines.

    Returns:
        1 if text was written into a run, otherwise 0.
    """
    if not red_segments:
        return 0

    # split() yields a one-element list when there is no newline.
    replacement_lines = replacement_text.split('\n')
    replacements_made = 0

    first_runs = red_segments[0]['runs']
    first_run = first_runs[0][2] if first_runs else None

    if first_run is not None:
        first_run.text = replacement_lines[0]
        first_run.font.color.rgb = RGBColor(0, 0, 0)
        replacements_made = 1

        for _, _, run in first_runs[1:]:
            run.text = ''

    # Everything after the first segment is blanked out.
    for segment in red_segments[1:]:
        for _, _, run in segment['runs']:
            run.text = ''

    extra_lines = [line.strip() for line in replacement_lines[1:] if line.strip()]
    if first_run is not None and extra_lines:
        try:
            for line in extra_lines:
                first_run.add_break()
                first_run.add_text(line)
        except Exception:
            # Best effort: keep the content on one line rather than lose it.
            first_run.text = ' '.join(replacement_lines)
            first_run.font.color.rgb = RGBColor(0, 0, 0)

    return replacements_made

def replace_single_segment(segment, replacement_text):
    """Write ``replacement_text`` into one red segment.

    The segment's first run takes the text and turns black; its remaining
    runs are emptied. Returns False when the segment has no runs, True
    otherwise.
    """
    run_entries = segment['runs']
    if not run_entries:
        return False

    head = run_entries[0][2]
    head.text = replacement_text
    head.font.color.rgb = RGBColor(0, 0, 0)

    for _, _, trailing in run_entries[1:]:
        trailing.text = ''

    return True

def handle_multiple_red_segments_in_cell(cell, flat_json):
    """Replace each red segment of a cell with its own JSON match.

    Every non-blank red segment is looked up individually. If none could be
    replaced and at least one had no match, the texts of all segments are
    joined with spaces and looked up as one label; a hit replaces all
    segments together. Returns the number of replacements made.
    """

    def _render(value, label):
        # Multi-item lists become one non-blank item per line; everything
        # else uses the generic string conversion.
        rendered = get_value_as_string(value, label)
        if isinstance(value, list) and len(value) > 1:
            rendered = "\n".join(str(item) for item in value if str(item).strip())
        return rendered

    red_segments = extract_red_text_segments(cell)
    if not red_segments:
        return 0

    print(f"      🔍 Found {len(red_segments)} red text segments in cell")
    replacements_made = 0
    unmatched_segments = []

    for position, segment in enumerate(red_segments, start=1):
        segment_text = segment['text'].strip()
        if not segment_text:
            continue

        print(f"        Segment {position}: '{segment_text[:50]}...'")

        json_value = find_matching_json_value(segment_text, flat_json)
        if json_value is None:
            unmatched_segments.append(segment)
            print(f"        ⏳ No individual match for segment '{segment_text[:30]}...'")
            continue

        replacement_text = _render(json_value, segment_text)
        if replace_single_segment(segment, replacement_text):
            replacements_made += 1
            print(f"        ✅ Replaced segment '{segment_text[:30]}...' with '{replacement_text[:30]}...'")

    # Fallback: treat the whole red content as a single label.
    if unmatched_segments and replacements_made == 0:
        combined_text = " ".join(seg['text'] for seg in red_segments).strip()
        print(f"      🔄 Trying combined text match: '{combined_text[:50]}...'")

        json_value = find_matching_json_value(combined_text, flat_json)
        if json_value is not None:
            replacement_text = _render(json_value, combined_text)
            replacements_made = replace_all_red_segments(red_segments, replacement_text)
            print(f"      ✅ Replaced combined text with '{replacement_text[:50]}...'")

    return replacements_made

# 🎯 SURGICAL FIX 1: Handle Nature of Business multi-line red text
def handle_nature_business_multiline_fix(cell, flat_json):
    """Fill red text inside a "Nature of the Operator(s) Business" cell.

    Does nothing (returns 0) unless the cell has red text and its text
    contains one of the two accepted spellings of the heading. Each red
    segment is looked up on its own first; if that replaces nothing, the
    space-joined text of all segments is looked up instead. Returns the
    number of replacements made.
    """
    if not has_red_text(cell):
        return 0

    cell_text = get_clean_text(cell).lower()
    accepted_headings = ("nature of the operators business", "nature of the operator business")
    if not any(heading in cell_text for heading in accepted_headings):
        return 0

    print("    🎯 SURGICAL FIX: Nature of Business multi-line processing")

    red_segments = extract_red_text_segments(cell)
    replacements_made = 0

    # First pass: one lookup per segment.
    for segment in red_segments:
        segment_text = segment['text'].strip()
        if not segment_text:
            continue

        json_value = find_matching_json_value(segment_text, flat_json)
        if json_value is None:
            continue

        replacement_text = get_value_as_string(json_value, segment_text)
        if replace_single_segment(segment, replacement_text):
            replacements_made += 1
            print(f"        ✅ Fixed segment: '{segment_text[:30]}...'")

    if replacements_made != 0 or not red_segments:
        return replacements_made

    # Second pass: all red text as one label.
    combined_text = " ".join(seg['text'] for seg in red_segments).strip()
    json_value = find_matching_json_value(combined_text, flat_json)
    if json_value is not None:
        replacement_text = get_value_as_string(json_value, combined_text)
        replacements_made = replace_all_red_segments(red_segments, replacement_text)
        print("        ✅ Fixed combined text")

    return replacements_made

# 🎯 SURGICAL FIX 2: Handle Operator Declaration table with context awareness
def handle_operator_declaration_fix(table, flat_json):
    """Fill the Print Name / Position Title row of a declaration table.

    The table's full text is scanned for marker phrases to classify it as an
    operator declaration, an auditor declaration, or unknown (operator wins
    if both match). The first row whose first two cells read "print name"
    and "position title", in a table of at most four rows, is taken as the
    header; the row after it is filled from the JSON using field names chosen
    by the classification. Returns the number of replacements made.
    """

    def _first_hit(candidates):
        # First non-None lookup among the candidate field names, else None.
        for candidate in candidates:
            found = find_matching_json_value(candidate, flat_json)
            if found is not None:
                return found
        return None

    # Whole-table text used only to classify the declaration type.
    table_context = "".join(
        get_clean_text(cell).lower() + " "
        for row in table.rows
        for cell in row.cells
    )

    operator_markers = [
        "hereby acknowledge", "findings detailed", "management system",
        "accreditation to be shared", "operator signature"
    ]
    auditor_markers = [
        "nhvas approved auditor", "auditor registration", "hereby certify",
        "auditor signature"
    ]
    is_operator_declaration = any(marker in table_context for marker in operator_markers)
    is_auditor_declaration = any(marker in table_context for marker in auditor_markers)

    if is_operator_declaration:
        context_type = "Operator"
        name_fields = ["Operator Declaration.Print Name", "operator.print name", "Print Name"]
        position_fields = ["Operator Declaration.Position Title", "operator.position title", "Position Title"]
    elif is_auditor_declaration:
        context_type = "Auditor"
        name_fields = ["NHVAS Approved Auditor Declaration.Print Name", "auditor name", "auditor", "Print Name"]
        # For auditors the second column is filled from registration-number fields.
        position_fields = ["NHVR or Exemplar Global Auditor Registration Number", "auditor registration", "registration number"]
    else:
        context_type = "Unknown"
        name_fields = ["Print Name"]
        position_fields = ["Position Title"]

    replacements_made = 0

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 2:
            continue

        cell1_text = get_clean_text(row.cells[0]).strip()
        cell2_text = get_clean_text(row.cells[1]).strip()

        is_header_row = (
            "print name" in cell1_text.lower()
            and "position title" in cell2_text.lower()
            and len(table.rows) <= 4  # small tables only
        )
        if not is_header_row:
            continue

        print(f"    🎯 SURGICAL FIX: {context_type} Declaration table detected")

        # The data row is expected directly below the header.
        if row_idx + 1 < len(table.rows):
            data_row = table.rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                name_cell = data_row.cells[0]
                position_cell = data_row.cells[1]

                if has_red_text(name_cell):
                    name_value = _first_hit(name_fields)
                    if name_value is not None:
                        name_text = get_value_as_string(name_value)
                        replacements_made += replace_red_text_in_cell(name_cell, name_text)
                        print(f"        ✅ Fixed {context_type} Print Name: '{name_text}'")

                if has_red_text(position_cell):
                    position_value = _first_hit(position_fields)
                    if position_value is not None:
                        position_text = get_value_as_string(position_value)
                        replacements_made += replace_red_text_in_cell(position_cell, position_text)
                        print(f"        ✅ Fixed {context_type} Position/Registration: '{position_text}'")

        break  # only the first header row is handled

    return replacements_made

def handle_australian_company_number(row, company_numbers):
    """Write company-number digits into consecutive cells of a row.

    Item ``i`` of ``company_numbers`` targets cell ``i + 1`` (the first cell
    of the row is skipped). A cell is only written when it exists and
    contains red text. Returns the total number of replacements made.
    """
    replacements_made = 0
    for cell_idx, digit in enumerate(company_numbers, start=1):
        if cell_idx >= len(row.cells):
            continue
        target = row.cells[cell_idx]
        if not has_red_text(target):
            continue
        replacements_made += replace_red_text_in_cell(target, str(digit))
        print(f"      -> Placed digit '{digit}' in cell {cell_idx + 1}")
    return replacements_made

def handle_vehicle_registration_table(table, flat_json):
    """Fill a vehicle-registration table column by column from the JSON data.

    Steps: locate the vehicle data in ``flat_json``, find the table's header
    row, map header cells to JSON columns, then write one table row per list
    index, appending rows when the table is too short.

    Args:
        table: Table object exposing ``rows`` and ``add_row()``.
        flat_json: Dict of JSON keys to values to search for column data.

    Returns:
        Number of cells written or replaced; 0 when the data, the header row
        or a column mapping cannot be found.
    """
    replacements_made = 0
    
    # Try to find vehicle registration data
    vehicle_section = None
    
    # First attempt: a key naming the section whose value is itself a dict
    # of columns. Only dict values are accepted here.
    for key, value in flat_json.items():
        if "vehicle registration numbers of records examined" in key.lower():
            if isinstance(value, dict):
                vehicle_section = value
                print(f"    ✅ Found vehicle data in key: '{key}'")
                break
    
    # Second attempt: gather individual keys that look like column names.
    # For dotted keys only the last component is used as the column name, so
    # a later key with the same last component overwrites an earlier one.
    if not vehicle_section:
        potential_columns = {}
        for key, value in flat_json.items():
            if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
                if "." in key:
                    column_name = key.split(".")[-1]
                else:
                    column_name = key
                potential_columns[column_name] = value
        
        if potential_columns:
            vehicle_section = potential_columns
            print(f"    ✅ Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
        else:
            print(f"    ❌ Vehicle registration data not found in JSON")
            return 0
    
    print(f"    ✅ Found vehicle registration data with {len(vehicle_section)} columns")
    
    # Find header row: the first row whose concatenated, lower-cased cell
    # text contains both "registration" and "number".
    header_row_idx = -1
    header_row = None
    
    for row_idx, row in enumerate(table.rows):
        row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
        if "registration" in row_text and "number" in row_text:
            header_row_idx = row_idx
            header_row = row
            break
    
    if header_row_idx == -1:
        print(f"    ❌ Could not find header row in vehicle table")
        return 0
    
    print(f"    ✅ Found header row at index {header_row_idx}")
    
    # Enhanced column mapping: table column index -> JSON column key.
    column_mapping = {}
    for col_idx, cell in enumerate(header_row.cells):
        header_text = get_clean_text(cell).strip()
        # Blank headers and the "No." counter column are not mapped.
        if not header_text or header_text.lower() == "no.":
            continue
            
        best_match = None
        best_score = 0
        
        # Pad parentheses with spaces so they split into separate words below.
        normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
        
        for json_key in vehicle_section.keys():
            normalized_json = json_key.lower().strip()
            
            # An exact match wins outright and ends the search for this column.
            if normalized_header == normalized_json:
                best_match = json_key
                best_score = 1.0
                break
            
            # Word-overlap score: shared words (longer than two characters)
            # divided by the size of the larger word set; needs at least 0.3.
            header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
            json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
            
            if header_words and json_words:
                common_words = header_words.intersection(json_words)
                score = len(common_words) / max(len(header_words), len(json_words))
                
                if score > best_score and score >= 0.3:
                    best_score = score
                    best_match = json_key
            
            # Substring score: compare with spaces, hyphens and parentheses
            # removed; the ratio of the shorter to the longer length must be
            # at least 0.6 and both strings longer than five characters.
            header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
            json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
            
            if header_clean in json_clean or json_clean in header_clean:
                if len(header_clean) > 5 and len(json_clean) > 5:
                    substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
                    if substring_score > best_score and substring_score >= 0.6:
                        best_score = substring_score
                        best_match = json_key
        
        if best_match:
            column_mapping[col_idx] = best_match
            print(f"      📌 Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
    
    if not column_mapping:
        print(f"    ❌ No column mappings found")
        return 0
    
    # Determine data rows needed: the length of the longest list-valued column.
    max_data_rows = 0
    for json_key, data in vehicle_section.items():
        if isinstance(data, list):
            max_data_rows = max(max_data_rows, len(data))
    
    print(f"    📌 Need to populate {max_data_rows} data rows")
    
    # Process data rows; data row i goes into the i-th row below the header.
    for data_row_index in range(max_data_rows):
        table_row_idx = header_row_idx + 1 + data_row_index
        
        if table_row_idx >= len(table.rows):
            print(f"    ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
            print(f"    ➕ Adding new row for vehicle {data_row_index + 1}")
            
            # The returned row is not used; the row is re-read by index below.
            new_row = table.add_row()
            print(f"    ✅ Successfully added row {len(table.rows)} to the table")
            
        row = table.rows[table_row_idx]
        print(f"    📌 Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
        
        for col_idx, json_key in column_mapping.items():
            if col_idx < len(row.cells):
                cell = row.cells[col_idx]
                
                # Only list-valued columns with an entry at this index are written.
                column_data = vehicle_section.get(json_key, [])
                if isinstance(column_data, list) and data_row_index < len(column_data):
                    replacement_value = str(column_data[data_row_index])
                    
                    cell_text = get_clean_text(cell)
                    # Cells holding non-red text are left untouched.
                    if has_red_text(cell) or not cell_text.strip():
                        if not cell_text.strip():
                            # Empty cell: set the text directly and count it.
                            cell.text = replacement_value
                            replacements_made += 1
                            print(f"      -> Added '{replacement_value}' to empty cell (column '{json_key}')")
                        else:
                            cell_replacements = replace_red_text_in_cell(cell, replacement_value)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f"      -> Replaced red text with '{replacement_value}' (column '{json_key}')")
    
    return replacements_made

def handle_print_accreditation_section(table, flat_json):
    """Fill the Print Name / Position Title row from the accreditation entry.

    Reads the list stored under
    ``"print accreditation name.print accreditation name"``; its first item
    is used as the name and its second as the position. Returns 0 without
    touching the table when that entry is not a list of at least two items.
    Otherwise the row below the first "print name" / "position title" header
    is filled wherever it contains red text. Returns the number of
    replacements made.
    """
    print_data = flat_json.get("print accreditation name.print accreditation name", [])
    if not (isinstance(print_data, list) and len(print_data) >= 2):
        return 0

    name_value, position_value = print_data[0], print_data[1]
    print(f"    📋 Print accreditation data: Name='{name_value}', Position='{position_value}'")

    replacements_made = 0
    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 2:
            continue

        cell1_text = get_clean_text(row.cells[0]).lower()
        cell2_text = get_clean_text(row.cells[1]).lower()
        if not ("print name" in cell1_text and "position title" in cell2_text):
            continue

        print(f"    📍 Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")

        if row_idx + 1 < len(table.rows):
            data_row = table.rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                # (cell index, value to write, message printed on success)
                plan = (
                    (0, name_value, f"      ✅ Replaced Print Name: '{name_value}'"),
                    (1, position_value, f"      ✅ Replaced Position Title: '{position_value}'"),
                )
                for cell_index, new_value, success_message in plan:
                    target = data_row.cells[cell_index]
                    if not has_red_text(target):
                        continue
                    cell_replacements = replace_red_text_in_cell(target, new_value)
                    replacements_made += cell_replacements
                    if cell_replacements > 0:
                        print(success_message)

        break  # only the first header row is handled

    return replacements_made

def process_single_column_sections(cell, field_name, flat_json):
    """Replace the red text of a single-column section cell.

    ``field_name`` is looked up in ``flat_json``. Lists with more than one
    item are written one item per line. The replacement only happens when a
    value was found and the cell contains red text. Returns the number of
    replacements made, or 0.
    """
    json_value = find_matching_json_value(field_name, flat_json)
    if json_value is None:
        return 0

    replacement_text = get_value_as_string(json_value, field_name)
    if isinstance(json_value, list) and len(json_value) > 1:
        replacement_text = "\n".join(map(str, json_value))

    if not has_red_text(cell):
        return 0

    print(f"    ✅ Replacing red text in single-column section: '{field_name}'")
    print(f"    ✅ Replacement text:\n{replacement_text}")
    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
    if cell_replacements > 0:
        print(f"    -> Replaced with: '{replacement_text[:100]}...'")
        return cell_replacements
    return 0

# 🎯 FINAL FIX 1: Handle the Attendance List table
def handle_attendance_list_fix(table, flat_json):
    """Fill the red text of an Attendance List cell.

    The first row whose first cell mentions both "attendance list" and
    "names and position titles" is treated as the attendance cell. If it
    contains red text, several spellings of the field name are looked up in
    turn and the first value found replaces the red text; list values are
    written one item per line. Returns the number of replacements made.
    """
    field_attempts = ["Attendance List (Names and Position Titles)", "attendance list", "Attendance List"]
    replacements_made = 0

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 1:
            continue

        cell_text = get_clean_text(row.cells[0]).lower()
        if not ("attendance list" in cell_text and "names and position titles" in cell_text):
            continue

        print(f"    🎯 FINAL FIX: Attendance List table detected at row {row_idx + 1}")

        # Heading and content share the same cell.
        if has_red_text(row.cells[0]):
            # Lazy lookups: stop at the first field name that yields a value.
            lookups = (find_matching_json_value(attempt, flat_json) for attempt in field_attempts)
            attendance_value = next((hit for hit in lookups if hit is not None), None)

            if attendance_value is not None:
                attendance_text = get_value_as_string(attendance_value)
                if isinstance(attendance_value, list):
                    attendance_text = "\n".join(str(item) for item in attendance_value)

                replacements_made += replace_red_text_in_cell(row.cells[0], attendance_text)
                print(f"        ✅ Fixed Attendance List: '{attendance_text[:50]}...'")

        break  # only the first matching row is handled

    return replacements_made

# 🎯 FINAL FIX 2: Generic Management Summary fix for ALL types (Mass, Fatigue, Maintenance)
def handle_management_summary_fix(cell, flat_json):
    """Replace red text in a Mass/Fatigue/Maintenance Management Summary cell.

    The cell must contain red text and mention both "summary" and one of the
    three management types; otherwise 0 is returned. Each non-empty red
    segment is looked up under a list of candidate field names (the segment's
    own text included) and replaced individually. If no segment was replaced,
    the lookup is retried once using the combined red text and every red
    segment is replaced in one call.

    Returns:
        The number of segments replaced individually, or the value returned
        by ``replace_all_red_segments`` when the combined fallback matched.
    """
    if not has_red_text(cell):
        return 0

    lowered_cell_text = get_clean_text(cell).lower()

    # First management type mentioned alongside "summary" wins.
    management_type = None
    if "summary" in lowered_cell_text:
        for candidate_type in ("Mass Management", "Fatigue Management", "Maintenance Management"):
            if candidate_type.lower() in lowered_cell_text:
                management_type = candidate_type
                break

    if management_type is None:
        return 0

    print(f"    🎯 FINAL FIX: {management_type} Summary processing - RED TEXT ONLY")

    def render(value, hint):
        """Stringify a matched value; lists become newline-joined non-blank items."""
        text = get_value_as_string(value, hint)
        if isinstance(value, list):
            text = "\n".join(str(item) for item in value if str(item).strip())
        return text

    # Only the red segments are candidates for replacement, never the whole cell.
    red_segments = extract_red_text_segments(cell)
    type_lower = management_type.lower()
    base_type = management_type.replace(" Management", "")
    replaced = 0

    for segment in red_segments:
        segment_text = segment['text'].strip()
        if not segment_text:
            continue

        print(f"        🔍 Processing red text segment: '{segment_text[:50]}...'")

        # Candidate field names, tried in this exact order.
        candidates = [
            f"{management_type} Summary of Audit findings",
            f"{management_type} Summary",
            f"{type_lower} summary",
            type_lower,
            segment_text,
            f"{base_type} Management Summary of Audit findings",
            f"{base_type} Summary of Audit findings",
            f"{base_type} Summary",
            f"{base_type.lower()} summary",
        ]

        summary_value = None
        for candidate in candidates:
            summary_value = find_matching_json_value(candidate, flat_json)
            if summary_value is not None:
                print(f"        ✅ Found match with field: '{candidate}'")
                break

        if summary_value is None:
            print(f"        ❌ No match found for red text: '{segment_text[:30]}...'")
            continue

        replacement_text = render(summary_value, segment_text)
        if replace_single_segment(segment, replacement_text):
            replaced += 1
            print(f"        ✅ Fixed {management_type} Summary segment: '{segment_text[:30]}...' -> '{replacement_text[:30]}...'")

    # Fallback: nothing replaced per segment, so match on the combined red text.
    if replaced == 0 and red_segments:
        combined_red_text = " ".join(seg['text'] for seg in red_segments).strip()
        print(f"        🔄 Trying combined red text match: '{combined_red_text[:50]}...'")

        fallback_candidates = [
            f"{management_type} Summary of Audit findings",
            f"{management_type} Summary",
            f"{type_lower} summary",
            combined_red_text,
            f"{base_type} Management Summary of Audit findings",
            f"{base_type} Summary of Audit findings",
            f"{base_type} Summary",
        ]

        for candidate in fallback_candidates:
            summary_value = find_matching_json_value(candidate, flat_json)
            if summary_value is None:
                continue

            replacement_text = render(summary_value, combined_red_text)
            replaced = replace_all_red_segments(red_segments, replacement_text)
            print(f"        ✅ Fixed {management_type} Summary combined red text with field: '{candidate}'")
            break

    return replaced

def process_tables(document, flat_json):
    """Replace red placeholder text in every table of the document.

    Each table is classified from the lower-cased text of its first three
    rows and routed to a specialised handler (vehicle registration,
    attendance list, print accreditation). Tables matching none of these are
    processed row by row, using the first cell of each row as the lookup key
    passed to ``find_matching_json_value``. After all tables are handled, the
    last three tables are checked again for small declaration tables.

    Args:
        document: Object exposing a ``tables`` sequence; ``process_hf`` passes
            the result of ``Document(...)``.
        flat_json: Flattened JSON mapping handed to the lookup helpers.

    Returns:
        The sum of the replacement counts reported by the handlers.
    """
    replacements_made = 0
    
    for table_idx, table in enumerate(document.tables):
        print(f"\n🔍 Processing table {table_idx + 1}:")
        
        # Build a lower-cased text sample from the first three rows; it is
        # used below to classify the table.
        table_text = ""
        for row in table.rows[:3]:
            for cell in row.cells:
                table_text += get_clean_text(cell).lower() + " "
        
        # Vehicle registration table: at least two of the indicator phrases
        # must appear in the sample. Handled entirely by the dedicated helper.
        vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
        indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
        if indicator_count >= 2:
            print(f"    🚗 Detected Vehicle Registration table")
            vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
            replacements_made += vehicle_replacements
            continue
        
        # Attendance list table: both phrases must appear in the sample.
        if "attendance list" in table_text and "names and position titles" in table_text:
            print(f"    👥 Detected Attendance List table")
            attendance_replacements = handle_attendance_list_fix(table, flat_json)
            replacements_made += attendance_replacements
            continue
        
        # Print accreditation table: a single indicator phrase is enough.
        print_accreditation_indicators = ["print name", "position title"]
        indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
        if indicator_count >= 1:
            print(f"    📋 Detected Print Accreditation table")
            print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
            replacements_made += print_accreditation_replacements
            continue
        
        # Generic table: treat the first cell of each row as the field key.
        for row_idx, row in enumerate(table.rows):
            if len(row.cells) < 1:
                continue
                
            key_cell = row.cells[0]
            key_text = get_clean_text(key_cell)
            
            # Rows whose first cell has no text carry no key to look up.
            if not key_text:
                continue
            
            print(f"  📌 Row {row_idx + 1}: Key = '{key_text}'")
            
            json_value = find_matching_json_value(key_text, flat_json)
            
            if json_value is not None:
                replacement_text = get_value_as_string(json_value, key_text)
                
                # Company-number rows with a list value go to a dedicated
                # helper that receives the whole row.
                if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                    cell_replacements = handle_australian_company_number(row, json_value)
                    replacements_made += cell_replacements
                    
                # Section header rows ("attendance list" / "nature of"): the
                # content to replace is looked for in the NEXT row, so this
                # branch requires that a next row exists.
                elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
                    print(f"    ✅ Section header detected, checking next row for content...")
                    next_row = table.rows[row_idx + 1]
                    
                    for cell_idx, cell in enumerate(next_row.cells):
                        if has_red_text(cell):
                            print(f"    ✅ Found red text in next row, cell {cell_idx + 1}")
                            # List values are written one item per line.
                            if isinstance(json_value, list):
                                replacement_text = "\n".join(str(item) for item in json_value)
                            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f"    -> Replaced section content with: '{replacement_text[:100]}...'")
                                
                # Single-cell row, or a multi-cell row where no cell after the
                # first has red text: any red text must be in the key cell.
                elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
                    if has_red_text(key_cell):
                        cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
                        replacements_made += cell_replacements
                else:
                    # Key/value row: write the same replacement text into
                    # every cell after the first that has red text.
                    for cell_idx in range(1, len(row.cells)):
                        value_cell = row.cells[cell_idx]
                        if has_red_text(value_cell):
                            print(f"    ✅ Found red text in column {cell_idx + 1}")
                            cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
                            replacements_made += cell_replacements
            else:
                # The key text matched nothing. For a single-cell row, retry
                # the lookup using only the concatenated red run text.
                if len(row.cells) == 1 and has_red_text(key_cell):
                    red_text = ""
                    for paragraph in key_cell.paragraphs:
                        for run in paragraph.runs:
                            if is_red(run):
                                red_text += run.text
                    if red_text.strip():
                        section_value = find_matching_json_value(red_text.strip(), flat_json)
                        if section_value is not None:
                            section_replacement = get_value_as_string(section_value, red_text.strip())
                            cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
                            replacements_made += cell_replacements
                
                # Then visit every cell of the row (the key cell included)
                # that has red text and try the segment-level handlers.
                for cell_idx in range(len(row.cells)):
                    cell = row.cells[cell_idx]
                    if has_red_text(cell):
                        cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                        replacements_made += cell_replacements
                        
                        # First fallback, tried only when the segment handler
                        # replaced nothing.
                        if cell_replacements == 0:
                            surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
                            replacements_made += surgical_fix
                        
                        # Second fallback, tried only when both earlier
                        # attempts replaced nothing. `surgical_fix` is bound
                        # only in the branch above; the short-circuiting `and`
                        # keeps it from being read when cell_replacements != 0.
                        if cell_replacements == 0 and surgical_fix == 0:
                            management_summary_fix = handle_management_summary_fix(cell, flat_json)
                            replacements_made += management_summary_fix
    
    # Final pass: among the last three tables, those with at most four rows
    # are handed to the declaration helper. This runs in addition to the
    # per-table processing above.
    print(f"\n🎯 SURGICAL FIX: Checking for Operator/Auditor Declaration tables...")
    for table in document.tables[-3:]:  # Only check last 3 tables
        if len(table.rows) <= 4:  # Only small tables
            declaration_fix = handle_operator_declaration_fix(table, flat_json)
            replacements_made += declaration_fix
    
    return replacements_made

def process_paragraphs(document, flat_json):
    """Replace red text in body paragraphs with matching JSON values.

    For every paragraph containing non-blank red runs, the concatenated red
    text is looked up via ``find_matching_json_value``. When that fails, red
    text mentioning an auditor signature or a date falls back to the
    "auditor signature" field, and text mentioning an operator signature
    falls back to "operator signature". On a match the first red run receives
    the replacement (recoloured black) and the remaining red runs are emptied.

    Returns:
        The number of paragraphs in which red text was replaced.
    """
    total = 0
    print(f"\n🔍 Processing paragraphs:")

    for number, paragraph in enumerate(document.paragraphs, start=1):
        red_runs = [r for r in paragraph.runs if is_red(r) and r.text.strip()]
        if not red_runs:
            continue

        red_text_only = "".join(r.text for r in red_runs).strip()
        print(f"  📌 Paragraph {number}: Found red text: '{red_text_only}'")

        json_value = find_matching_json_value(red_text_only, flat_json)

        if json_value is None:
            # Signature/date placeholders are matched by keyword instead.
            shouted = red_text_only.upper()
            if "AUDITOR SIGNATURE" in shouted or "DATE" in shouted:
                json_value = find_matching_json_value("auditor signature", flat_json)
            elif "OPERATOR SIGNATURE" in shouted:
                json_value = find_matching_json_value("operator signature", flat_json)

        if json_value is None:
            continue

        replacement_text = get_value_as_string(json_value)
        print(f"    ✅ Replacing red text with: '{replacement_text}'")

        # The first red run carries the whole replacement; the rest are blanked.
        first_run, *leftover_runs = red_runs
        first_run.text = replacement_text
        first_run.font.color.rgb = RGBColor(0, 0, 0)
        for stale_run in leftover_runs:
            stale_run.text = ''
        total += 1

    return total

def process_headings(document, flat_json):
    """Replace red text in heading paragraphs and the paragraphs after them.

    A paragraph counts as a heading when its stripped text matches any regex
    in ``HEADING_PATTERNS`` (case-insensitive). Red text in the heading
    itself is processed first. Then up to five following paragraphs are
    examined: blank ones are skipped, the scan stops at the next heading, and
    red text in the others is processed with the heading text as context.

    Returns:
        The sum of the counts returned by ``process_red_text_in_paragraph``.
    """
    def first_heading_pattern(text):
        """Return the first pattern in HEADING_PATTERNS matching *text*, else None."""
        return next(
            (
                pattern
                for patterns in HEADING_PATTERNS.values()
                for pattern in patterns
                if re.search(pattern, text, re.IGNORECASE)
            ),
            None,
        )

    total = 0
    print(f"\n🔍 Processing headings:")

    paragraphs = document.paragraphs
    paragraph_count = len(paragraphs)

    for para_idx, paragraph in enumerate(paragraphs):
        heading_text = paragraph.text.strip()
        if not heading_text or not first_heading_pattern(heading_text):
            continue

        print(f"  📌 Found heading at paragraph {para_idx + 1}: '{heading_text}'")

        # The heading paragraph may itself hold a red placeholder.
        if has_red_text_in_paragraph(paragraph):
            print(f"    🔴 Found red text in heading itself")
            total += process_red_text_in_paragraph(paragraph, heading_text, flat_json)

        # Look at most five paragraphs ahead, never past the end of the list.
        lookahead_end = min(para_idx + 6, paragraph_count)
        for follow_idx in range(para_idx + 1, lookahead_end):
            follower = paragraphs[follow_idx]
            follow_text = follower.text.strip()

            if not follow_text:
                continue

            # The next heading ends this heading's section.
            if first_heading_pattern(follow_text):
                break

            if has_red_text_in_paragraph(follower):
                print(f"    🔴 Found red text in paragraph {follow_idx + 1} after heading: '{follow_text[:50]}...'")
                total += process_red_text_in_paragraph(follower, heading_text, flat_json)

    return total

def has_red_text_in_paragraph(paragraph):
    """Return True if any run of *paragraph* is red and has non-blank text."""
    return any(is_red(run) and run.text.strip() for run in paragraph.runs)

def process_red_text_in_paragraph(paragraph, context_text, flat_json):
    """Replace the red text of one paragraph using heading context.

    Lookup strategies, tried in order until one yields a value:
      1. the combined red text itself;
      2. role-specific field names, chosen by whether *context_text* mentions
         the NHVAS approved auditor or the operator declaration;
      3. "<context> <red text>", the red text again, then the context alone.

    On a match the first red run receives the replacement (recoloured black)
    and the remaining red runs are emptied.

    Returns:
        1 if the red text was replaced, otherwise 0.
    """
    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
    if not red_runs:
        return 0

    combined_red_text = " ".join(run.text.strip() for run in red_runs).strip()
    print(f"      🔍 Red text found: '{combined_red_text}'")

    def first_match(candidates):
        """Return (candidate, value) for the first candidate that matches, else (None, None)."""
        for candidate in candidates:
            found = find_matching_json_value(candidate, flat_json)
            if found is not None:
                return candidate, found
        return None, None

    # Strategy 1: the red text is itself the field name.
    json_value = find_matching_json_value(combined_red_text, flat_json)

    # Strategy 2: the heading tells us which party's fields to try.
    if json_value is None:
        context_upper = context_text.upper()
        if "NHVAS APPROVED AUDITOR" in context_upper:
            field, json_value = first_match(
                ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
            )
            if json_value is not None:
                print(f"      ✅ Found auditor match with field: '{field}'")
        elif "OPERATOR DECLARATION" in context_upper:
            field, json_value = first_match(
                ["operator name", "operator", "company name", "organisation name", "print name"]
            )
            if json_value is not None:
                print(f"      ✅ Found operator match with field: '{field}'")

    # Strategy 3: combine heading context and red text.
    if json_value is None:
        query, json_value = first_match(
            [
                f"{context_text} {combined_red_text}",
                combined_red_text,
                context_text,
            ]
        )
        if json_value is not None:
            print(f"      ✅ Found match with combined query: '{query[:50]}...'")

    if json_value is None:
        print(f"      ❌ No match found for red text: '{combined_red_text}'")
        return 0

    replacement_text = get_value_as_string(json_value, combined_red_text)

    # The first red run carries the whole replacement; the rest are blanked.
    lead_run, *trailing_runs = red_runs
    lead_run.text = replacement_text
    lead_run.font.color.rgb = RGBColor(0, 0, 0)
    for trailing_run in trailing_runs:
        trailing_run.text = ''

    print(f"      ✅ Replaced with: '{replacement_text}'")
    return 1

def process_hf(json_file, docx_file, output_file):
    """Fill a DOCX template's red placeholder text from JSON data and save it.

    Loads and flattens the JSON, prints a sample of the available keys, runs
    the table, paragraph and heading passes over the document, saves the
    result and prints a summary of the replacement counts.

    Args:
        json_file: Path to a JSON file, or an open file-like object (anything
            with a ``read`` attribute).
        docx_file: Path or file-like object handed to ``Document``.
        output_file: Path or file-like object handed to ``Document.save``.

    Returns:
        None. Errors are not re-raised: a missing file is reported with a
        one-line message, any other exception with a message plus traceback.
    """
    preview_limit = 10  # number of JSON keys shown in the sample listing

    try:
        # Load JSON from an open stream or from a path.
        if hasattr(json_file, "read"):
            json_data = json.load(json_file)
        else:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

        flat_json = flatten_json(json_data)
        print("📄 Available JSON keys (sample):")
        for key, value in sorted(flat_json.items())[:preview_limit]:
            print(f"  - {key}: {value}")

        # Only report a remainder when one exists; with fewer keys than the
        # preview limit the old message showed a negative count.
        remaining = len(flat_json) - preview_limit
        if remaining > 0:
            print(f"  ... and {remaining} more keys\n")
        else:
            print()

        # Document() takes a path or a file-like object, so one call covers both.
        doc = Document(docx_file)

        print("🚀 Starting processing with minimal surgical fixes...")

        # Order matters: each pass sees the red text left by the previous one.
        table_replacements = process_tables(doc, flat_json)
        paragraph_replacements = process_paragraphs(doc, flat_json)
        heading_replacements = process_headings(doc, flat_json)

        total_replacements = table_replacements + paragraph_replacements + heading_replacements

        # save() likewise takes a path or a writable stream.
        doc.save(output_file)

        print(f"\n✅ Document saved as: {output_file}")
        print(f"✅ Total replacements: {total_replacements}")
        print(f"   📊 Tables: {table_replacements}")
        print(f"   📝 Paragraphs: {paragraph_replacements}")
        print(f"   📋 Headings: {heading_replacements}")
        print(f"🎉 Processing complete!")

    except FileNotFoundError as e:
        print(f"❌ File not found: {e}")
    except Exception as e:
        # Top-level boundary: report the failure with a traceback, don't crash.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    import sys

    # Command-line entry point: exactly three positional arguments are required.
    if len(sys.argv) != 4:
        print("Usage: python pipeline.py <input_docx> <updated_json> <output_docx>")
        # sys.exit() rather than the bare exit(): the latter is injected by
        # the site module for interactive use and is not always available.
        sys.exit(1)

    # CLI order is docx, json, output; process_hf expects the JSON first.
    docx_path, json_path, output_path = sys.argv[1:4]
    process_hf(json_path, docx_path, output_path)