Spaces:
Running
Running
Update updated_word.py
Browse files- updated_word.py +261 -71
updated_word.py
CHANGED
|
@@ -602,7 +602,7 @@ def fix_operator_declaration_empty_values(table, flat_json):
|
|
| 602 |
"""Fix Operator Declaration table when values are empty"""
|
| 603 |
replacements_made = 0
|
| 604 |
|
| 605 |
-
print(f" π― FIX
|
| 606 |
|
| 607 |
# Check if this is an Operator Declaration table
|
| 608 |
table_context = ""
|
|
@@ -657,13 +657,11 @@ def fix_operator_declaration_empty_values(table, flat_json):
|
|
| 657 |
if name_replacement.strip():
|
| 658 |
# Extract just the name if it's a company name
|
| 659 |
if "Pty Ltd" in name_replacement or "Company" in name_replacement:
|
| 660 |
-
# Try to get individual name instead
|
| 661 |
continue
|
| 662 |
|
| 663 |
if has_red_text(name_cell):
|
| 664 |
cell_replacements = replace_red_text_in_cell(name_cell, name_replacement)
|
| 665 |
else:
|
| 666 |
-
# Cell is empty, add text directly
|
| 667 |
name_cell.text = name_replacement
|
| 668 |
cell_replacements = 1
|
| 669 |
|
|
@@ -675,7 +673,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
|
|
| 675 |
if not position_text or has_red_text(position_cell):
|
| 676 |
print(f" π§ Fixing empty/red Position Title")
|
| 677 |
|
| 678 |
-
# Try multiple sources for position
|
| 679 |
position_sources = [
|
| 680 |
"Operator Declaration.Position Title",
|
| 681 |
"Position Title"
|
|
@@ -690,7 +687,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
|
|
| 690 |
if has_red_text(position_cell):
|
| 691 |
cell_replacements = replace_red_text_in_cell(position_cell, position_replacement)
|
| 692 |
else:
|
| 693 |
-
# Cell is empty, add text directly
|
| 694 |
position_cell.text = position_replacement
|
| 695 |
cell_replacements = 1
|
| 696 |
|
|
@@ -703,25 +699,226 @@ def fix_operator_declaration_empty_values(table, flat_json):
|
|
| 703 |
position_cell.text = "Manager"
|
| 704 |
replacements_made += 1
|
| 705 |
print(f" β
Used fallback Position Title: 'Manager'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 706 |
|
| 707 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 708 |
|
| 709 |
return replacements_made
|
| 710 |
|
| 711 |
def process_tables(document, flat_json):
|
| 712 |
-
"""
|
| 713 |
replacements_made = 0
|
| 714 |
|
| 715 |
for table_idx, table in enumerate(document.tables):
|
| 716 |
print(f"\nπ Processing table {table_idx + 1}:")
|
| 717 |
|
| 718 |
-
#
|
| 719 |
table_text = ""
|
| 720 |
for row in table.rows[:3]:
|
| 721 |
for cell in row.cells:
|
| 722 |
table_text += get_clean_text(cell).lower() + " "
|
| 723 |
|
| 724 |
-
#
|
| 725 |
management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
|
| 726 |
has_management = any(indicator in table_text for indicator in management_summary_indicators)
|
| 727 |
has_details = "details" in table_text
|
|
@@ -730,7 +927,8 @@ def process_tables(document, flat_json):
|
|
| 730 |
print(f" π Detected Management Summary table")
|
| 731 |
summary_fixes = fix_management_summary_details_column(table, flat_json)
|
| 732 |
replacements_made += summary_fixes
|
| 733 |
-
|
|
|
|
| 734 |
summary_replacements = 0
|
| 735 |
for row_idx, row in enumerate(table.rows):
|
| 736 |
for cell_idx, cell in enumerate(row.cells):
|
|
@@ -764,7 +962,7 @@ def process_tables(document, flat_json):
|
|
| 764 |
replacements_made += summary_replacements
|
| 765 |
continue
|
| 766 |
|
| 767 |
-
#
|
| 768 |
vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
|
| 769 |
indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
|
| 770 |
if indicator_count >= 2:
|
|
@@ -773,23 +971,29 @@ def process_tables(document, flat_json):
|
|
| 773 |
replacements_made += vehicle_replacements
|
| 774 |
continue
|
| 775 |
|
| 776 |
-
#
|
| 777 |
if "attendance list" in table_text and "names and position titles" in table_text:
|
| 778 |
print(f" π₯ Detected Attendance List table")
|
| 779 |
attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
|
| 780 |
replacements_made += attendance_replacements
|
| 781 |
continue
|
| 782 |
|
| 783 |
-
#
|
| 784 |
print_accreditation_indicators = ["print name", "position title"]
|
| 785 |
indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
|
| 786 |
if indicator_count >= 1:
|
| 787 |
print(f" π Detected Print Accreditation table")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 788 |
print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
|
| 789 |
replacements_made += print_accreditation_replacements
|
| 790 |
continue
|
| 791 |
|
| 792 |
-
#
|
| 793 |
for row_idx, row in enumerate(table.rows):
|
| 794 |
if len(row.cells) < 1:
|
| 795 |
continue
|
|
@@ -807,14 +1011,14 @@ def process_tables(document, flat_json):
|
|
| 807 |
if json_value is not None:
|
| 808 |
replacement_text = get_value_as_string(json_value, key_text)
|
| 809 |
|
| 810 |
-
#
|
| 811 |
if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
|
| 812 |
cell_replacements = handle_australian_company_number(row, json_value)
|
| 813 |
replacements_made += cell_replacements
|
| 814 |
|
| 815 |
-
#
|
| 816 |
elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
|
| 817 |
-
print(f" β
Section header detected, checking next row
|
| 818 |
next_row = table.rows[row_idx + 1]
|
| 819 |
|
| 820 |
for cell_idx, cell in enumerate(next_row.cells):
|
|
@@ -825,12 +1029,15 @@ def process_tables(document, flat_json):
|
|
| 825 |
cell_replacements = replace_red_text_in_cell(cell, replacement_text)
|
| 826 |
replacements_made += cell_replacements
|
| 827 |
if cell_replacements > 0:
|
| 828 |
-
print(f" -> Replaced section content
|
| 829 |
|
|
|
|
| 830 |
elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
|
| 831 |
if has_red_text(key_cell):
|
| 832 |
cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
|
| 833 |
replacements_made += cell_replacements
|
|
|
|
|
|
|
| 834 |
else:
|
| 835 |
for cell_idx in range(1, len(row.cells)):
|
| 836 |
value_cell = row.cells[cell_idx]
|
|
@@ -838,8 +1045,9 @@ def process_tables(document, flat_json):
|
|
| 838 |
print(f" β
Found red text in column {cell_idx + 1}")
|
| 839 |
cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
|
| 840 |
replacements_made += cell_replacements
|
|
|
|
| 841 |
else:
|
| 842 |
-
#
|
| 843 |
if len(row.cells) == 1 and has_red_text(key_cell):
|
| 844 |
red_text = ""
|
| 845 |
for paragraph in key_cell.paragraphs:
|
|
@@ -853,49 +1061,42 @@ def process_tables(document, flat_json):
|
|
| 853 |
cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
|
| 854 |
replacements_made += cell_replacements
|
| 855 |
|
| 856 |
-
#
|
| 857 |
for cell_idx in range(len(row.cells)):
|
| 858 |
cell = row.cells[cell_idx]
|
| 859 |
if has_red_text(cell):
|
| 860 |
cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
|
| 861 |
replacements_made += cell_replacements
|
| 862 |
|
| 863 |
-
#
|
| 864 |
if cell_replacements == 0:
|
| 865 |
surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
|
| 866 |
replacements_made += surgical_fix
|
| 867 |
|
| 868 |
-
|
| 869 |
-
if cell_replacements == 0 and surgical_fix == 0:
|
| 870 |
management_summary_fix = handle_management_summary_fix(cell, flat_json)
|
| 871 |
replacements_made += management_summary_fix
|
| 872 |
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
|
| 882 |
-
replacements_made += declaration_fixes
|
| 883 |
-
|
| 884 |
-
return replacements_made
|
| 885 |
|
| 886 |
def process_paragraphs(document, flat_json):
|
| 887 |
-
"""
|
| 888 |
replacements_made = 0
|
| 889 |
print(f"\nπ Processing paragraphs:")
|
| 890 |
|
| 891 |
for para_idx, paragraph in enumerate(document.paragraphs):
|
| 892 |
red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
|
| 893 |
if red_runs:
|
| 894 |
-
full_text = paragraph.text.strip()
|
| 895 |
red_text_only = "".join(run.text for run in red_runs).strip()
|
| 896 |
print(f" π Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
|
| 897 |
|
| 898 |
-
# Your existing matching logic
|
| 899 |
json_value = find_matching_json_value(red_text_only, flat_json)
|
| 900 |
|
| 901 |
if json_value is None:
|
|
@@ -917,7 +1118,7 @@ def process_paragraphs(document, flat_json):
|
|
| 917 |
return replacements_made
|
| 918 |
|
| 919 |
def process_headings(document, flat_json):
|
| 920 |
-
"""
|
| 921 |
replacements_made = 0
|
| 922 |
print(f"\nπ Processing headings:")
|
| 923 |
|
|
@@ -929,7 +1130,7 @@ def process_headings(document, flat_json):
|
|
| 929 |
if not paragraph_text:
|
| 930 |
continue
|
| 931 |
|
| 932 |
-
#
|
| 933 |
matched_heading = None
|
| 934 |
for category, patterns in HEADING_PATTERNS.items():
|
| 935 |
for pattern in patterns:
|
|
@@ -948,8 +1149,8 @@ def process_headings(document, flat_json):
|
|
| 948 |
heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
|
| 949 |
replacements_made += heading_replacements
|
| 950 |
|
| 951 |
-
#
|
| 952 |
-
for next_para_offset in range(1, 6):
|
| 953 |
next_para_idx = para_idx + next_para_offset
|
| 954 |
if next_para_idx >= len(paragraphs):
|
| 955 |
break
|
|
@@ -973,9 +1174,9 @@ def process_headings(document, flat_json):
|
|
| 973 |
if is_another_heading:
|
| 974 |
break
|
| 975 |
|
| 976 |
-
# Process red text with
|
| 977 |
if has_red_text_in_paragraph(next_paragraph):
|
| 978 |
-
print(f" π΄ Found red text in paragraph {next_para_idx + 1} after heading
|
| 979 |
|
| 980 |
context_replacements = process_red_text_in_paragraph(
|
| 981 |
next_paragraph,
|
|
@@ -986,15 +1187,8 @@ def process_headings(document, flat_json):
|
|
| 986 |
|
| 987 |
return replacements_made
|
| 988 |
|
| 989 |
-
def has_red_text_in_paragraph(paragraph):
    """Report whether the paragraph holds at least one red, non-blank run.

    Args:
        paragraph: Object exposing a ``runs`` iterable; each run is checked
            with ``is_red`` and must carry text that is not only whitespace.

    Returns:
        bool: True as soon as one qualifying run is found, otherwise False.
    """
    return any(is_red(run) and run.text.strip() for run in paragraph.runs)
|
| 995 |
-
|
| 996 |
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
| 997 |
-
"""
|
| 998 |
replacements_made = 0
|
| 999 |
|
| 1000 |
red_text_segments = []
|
|
@@ -1010,10 +1204,10 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
|
| 1010 |
|
| 1011 |
json_value = None
|
| 1012 |
|
| 1013 |
-
#
|
| 1014 |
json_value = find_matching_json_value(combined_red_text, flat_json)
|
| 1015 |
|
| 1016 |
-
#
|
| 1017 |
if json_value is None:
|
| 1018 |
if "NHVAS APPROVED AUDITOR" in context_text.upper():
|
| 1019 |
auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
|
|
@@ -1031,7 +1225,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
|
| 1031 |
print(f" β
Found operator match with field: '{field}'")
|
| 1032 |
break
|
| 1033 |
|
| 1034 |
-
#
|
| 1035 |
if json_value is None:
|
| 1036 |
context_queries = [
|
| 1037 |
f"{context_text} {combined_red_text}",
|
|
@@ -1042,7 +1236,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
|
| 1042 |
for query in context_queries:
|
| 1043 |
json_value = find_matching_json_value(query, flat_json)
|
| 1044 |
if json_value is not None:
|
| 1045 |
-
print(f" β
Found match with combined query
|
| 1046 |
break
|
| 1047 |
|
| 1048 |
# Replace if match found
|
|
@@ -1065,22 +1259,20 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
|
| 1065 |
return replacements_made
|
| 1066 |
|
| 1067 |
def force_red_text_replacement(document, flat_json):
|
| 1068 |
-
"""Force replacement of any remaining red text by trying ALL JSON values
|
| 1069 |
replacements_made = 0
|
| 1070 |
print(f"\nπ― FORCE FIX: Scanning for any remaining red text...")
|
| 1071 |
|
| 1072 |
-
# Collect
|
| 1073 |
all_values = {}
|
| 1074 |
for key, value in flat_json.items():
|
| 1075 |
if value:
|
| 1076 |
-
# Convert value to string properly
|
| 1077 |
value_str = get_value_as_string(value, key)
|
| 1078 |
|
| 1079 |
-
# Only add if we have a valid string
|
| 1080 |
if value_str and isinstance(value_str, str) and value_str.strip():
|
| 1081 |
all_values[key] = value_str.strip()
|
| 1082 |
|
| 1083 |
-
#
|
| 1084 |
if isinstance(value, list):
|
| 1085 |
for i, item in enumerate(value):
|
| 1086 |
item_str = str(item).strip() if item else ""
|
|
@@ -1106,28 +1298,27 @@ def force_red_text_replacement(document, flat_json):
|
|
| 1106 |
combined_red_text = " ".join(red_text_parts).strip()
|
| 1107 |
print(f" Red text: '{combined_red_text}'")
|
| 1108 |
|
| 1109 |
-
#
|
| 1110 |
best_match = None
|
| 1111 |
best_key = None
|
| 1112 |
|
| 1113 |
-
#
|
| 1114 |
for key, value in all_values.items():
|
| 1115 |
if combined_red_text.lower() == value.lower():
|
| 1116 |
best_match = value
|
| 1117 |
best_key = key
|
| 1118 |
break
|
| 1119 |
|
| 1120 |
-
#
|
| 1121 |
if not best_match:
|
| 1122 |
for key, value in all_values.items():
|
| 1123 |
-
# Try if red text contains this value or vice versa
|
| 1124 |
if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
|
| 1125 |
(len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
|
| 1126 |
best_match = value
|
| 1127 |
best_key = key
|
| 1128 |
break
|
| 1129 |
|
| 1130 |
-
#
|
| 1131 |
if not best_match:
|
| 1132 |
red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
|
| 1133 |
best_score = 0
|
|
@@ -1216,9 +1407,8 @@ def force_red_text_replacement(document, flat_json):
|
|
| 1216 |
|
| 1217 |
return replacements_made
|
| 1218 |
|
| 1219 |
-
|
| 1220 |
def process_hf(json_file, docx_file, output_file):
|
| 1221 |
-
"""
|
| 1222 |
try:
|
| 1223 |
# Load JSON
|
| 1224 |
if hasattr(json_file, "read"):
|
|
@@ -1240,14 +1430,14 @@ def process_hf(json_file, docx_file, output_file):
|
|
| 1240 |
else:
|
| 1241 |
doc = Document(docx_file)
|
| 1242 |
|
| 1243 |
-
#
|
| 1244 |
-
print("π Starting
|
| 1245 |
|
| 1246 |
table_replacements = process_tables(doc, flat_json)
|
| 1247 |
paragraph_replacements = process_paragraphs(doc, flat_json)
|
| 1248 |
heading_replacements = process_headings(doc, flat_json)
|
| 1249 |
|
| 1250 |
-
#
|
| 1251 |
force_replacements = force_red_text_replacement(doc, flat_json)
|
| 1252 |
|
| 1253 |
total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements
|
|
|
|
| 602 |
"""Fix Operator Declaration table when values are empty"""
|
| 603 |
replacements_made = 0
|
| 604 |
|
| 605 |
+
print(f" π― FIX: Operator Declaration empty values processing")
|
| 606 |
|
| 607 |
# Check if this is an Operator Declaration table
|
| 608 |
table_context = ""
|
|
|
|
| 657 |
if name_replacement.strip():
|
| 658 |
# Extract just the name if it's a company name
|
| 659 |
if "Pty Ltd" in name_replacement or "Company" in name_replacement:
|
|
|
|
| 660 |
continue
|
| 661 |
|
| 662 |
if has_red_text(name_cell):
|
| 663 |
cell_replacements = replace_red_text_in_cell(name_cell, name_replacement)
|
| 664 |
else:
|
|
|
|
| 665 |
name_cell.text = name_replacement
|
| 666 |
cell_replacements = 1
|
| 667 |
|
|
|
|
| 673 |
if not position_text or has_red_text(position_cell):
|
| 674 |
print(f" π§ Fixing empty/red Position Title")
|
| 675 |
|
|
|
|
| 676 |
position_sources = [
|
| 677 |
"Operator Declaration.Position Title",
|
| 678 |
"Position Title"
|
|
|
|
| 687 |
if has_red_text(position_cell):
|
| 688 |
cell_replacements = replace_red_text_in_cell(position_cell, position_replacement)
|
| 689 |
else:
|
|
|
|
| 690 |
position_cell.text = position_replacement
|
| 691 |
cell_replacements = 1
|
| 692 |
|
|
|
|
| 699 |
position_cell.text = "Manager"
|
| 700 |
replacements_made += 1
|
| 701 |
print(f" β
Used fallback Position Title: 'Manager'")
|
| 702 |
+
break
|
| 703 |
+
|
| 704 |
+
return replacements_made
|
| 705 |
+
|
| 706 |
+
def handle_multiple_red_segments_in_cell(cell, flat_json):
    """Replace each red text segment of a cell with its matching JSON value.

    Every segment returned by ``extract_red_text_segments(cell)`` is looked
    up on its own through ``find_matching_json_value``. Segments whose text
    is blank, or for which no value is found, are left untouched.

    Args:
        cell: Table cell handed to ``extract_red_text_segments``.
        flat_json: Data passed through to ``find_matching_json_value``.

    Returns:
        int: Number of segments for which ``replace_single_segment``
        reported success.
    """
    red_segments = extract_red_text_segments(cell)
    if not red_segments:
        return 0

    replacements_made = 0
    # Segment numbers in the log are 1-based.
    for position, segment in enumerate(red_segments, start=1):
        segment_text = segment['text'].strip()
        if not segment_text:
            continue

        json_value = find_matching_json_value(segment_text, flat_json)
        if json_value is None:
            continue

        replacement_text = get_value_as_string(json_value, segment_text)
        if replace_single_segment(segment, replacement_text):
            replacements_made += 1
            print(f" ✅ Replaced segment {position}: '{segment_text}' -> '{replacement_text}'")

    return replacements_made
|
| 726 |
+
|
| 727 |
+
def handle_nature_business_multiline_fix(cell, flat_json):
    """Replace red "Nature of Business" text in a cell with the JSON value.

    The red runs of every paragraph in the cell are concatenated. If that
    text mentions one of a few transport-related keywords, the value found
    for "Nature of Business" replaces the red text of the whole cell.

    Args:
        cell: Object exposing ``paragraphs``, each with ``runs``.
        flat_json: Data passed through to ``find_matching_json_value``.

    Returns:
        int: The count returned by ``replace_red_text_in_cell``, or 0 when
        the cell has no red text, the text does not look like a nature of
        business description, or no JSON value is found.
    """
    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    ).strip()
    if not red_text:
        return 0

    # Keywords that mark the red text as a nature-of-business description.
    nature_indicators = ("transport", "logistics", "freight", "delivery", "trucking", "haulage")
    lowered = red_text.lower()
    if not any(indicator in lowered for indicator in nature_indicators):
        return 0

    nature_value = find_matching_json_value("Nature of Business", flat_json)
    if nature_value is None:
        return 0

    replacement_text = get_value_as_string(nature_value, "Nature of Business")
    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
    # Only report success when something was actually replaced.
    if cell_replacements > 0:
        print(" ✅ Fixed Nature of Business multiline content")

    return cell_replacements
|
| 754 |
+
|
| 755 |
+
def handle_management_summary_fix(cell, flat_json):
    """Replace red management-summary text in a cell with the full standard.

    The red runs of the cell are concatenated and compared, case
    insensitively, with every list item stored under the three management
    summary keys of ``flat_json``. On the first item that contains the red
    text (or is contained in it), the red text of the cell is replaced by
    all items of that list joined with newlines.

    Args:
        cell: Object exposing ``paragraphs``, each with ``runs``.
        flat_json: Mapping that may hold "Mass/Maintenance/Fatigue
            Management Summary" entries whose values are dicts of lists.

    Returns:
        int: The count returned by ``replace_red_text_in_cell`` for the
        first match, or 0 when nothing matched.
    """
    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    ).strip()

    # Red text of 10 characters or fewer is never matched, so stop early
    # instead of re-checking the length for every standard.
    if len(red_text) <= 10:
        return 0

    red_lower = red_text.lower()
    management_types = (
        "Mass Management Summary",
        "Maintenance Management Summary",
        "Fatigue Management Summary",
    )

    for mgmt_type in management_types:
        mgmt_data = flat_json.get(mgmt_type)
        if not isinstance(mgmt_data, dict):
            continue

        for std_key, std_value in mgmt_data.items():
            if not (isinstance(std_value, list) and std_value):
                continue

            for item in std_value:
                item_lower = str(item).lower()
                # A blank item is a substring of every string and would
                # make any red text "match" this standard; skip it.
                if not item_lower.strip():
                    continue

                if red_lower in item_lower or item_lower in red_lower:
                    replacement_text = "\n".join(str(entry) for entry in std_value)
                    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                    if cell_replacements > 0:
                        print(f" ✅ Fixed {mgmt_type} - {std_key}")
                    # Stop at the first matching standard.
                    return cell_replacements

    return 0
|
| 791 |
+
|
| 792 |
+
def handle_operator_declaration_fix(table, flat_json):
    """Fill red cells of a small Operator/Auditor Declaration table.

    Tables with more than four rows are ignored, as are tables whose text
    mentions none of "print name", "signature" or "date". For every cell
    with red text, the declaration fields are tried in order and the first
    one that yields a non-blank value and a successful replacement wins.
    If none does, red text containing "signature" or "date" is replaced
    with a "[Signature]" / "[Date]" placeholder.

    Args:
        table: Object exposing ``rows``, each with ``cells``.
        flat_json: Data passed through to ``find_matching_json_value``.

    Returns:
        int: Total of the counts returned by ``replace_red_text_in_cell``.
    """
    if len(table.rows) > 4:  # Only process small tables
        return 0

    # Lower-cased text of the whole table, used to recognise a declaration.
    table_text = " ".join(
        get_clean_text(cell).lower()
        for row in table.rows
        for cell in row.cells
    )
    # NOTE(review): these are plain substring tests, so "date" also matches
    # words such as "update" - confirm that is acceptable.
    if not any(marker in table_text for marker in ("print name", "signature", "date")):
        return 0

    print(" 🎯 Processing declaration table")

    # Candidate JSON fields, tried in this order for every red cell.
    # NOTE(review): the order is the same for every cell, so each red cell
    # receives the first field that resolves - confirm this is intended.
    declaration_fields = (
        "Print Name", "Position Title", "Signature", "Date",
        "Operator Declaration.Print Name", "Operator Declaration.Position Title",
        "NHVAS Approved Auditor Declaration.Print Name",
    )

    replacements_made = 0
    for row in table.rows:
        for cell in row.cells:
            if not has_red_text(cell):
                continue

            replaced = False
            for field in declaration_fields:
                field_value = find_matching_json_value(field, flat_json)
                if field_value is None:
                    continue

                replacement_text = get_value_as_string(field_value, field)
                if not replacement_text.strip():
                    continue

                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                if cell_replacements > 0:
                    replacements_made += cell_replacements
                    print(f" ✅ Fixed declaration field: {field}")
                    replaced = True
                    break

            if replaced:
                continue

            # No field produced a replacement: fall back to placeholders
            # based on what the red text itself says.
            red_text = "".join(
                run.text
                for paragraph in cell.paragraphs
                for run in paragraph.runs
                if is_red(run)
            ).lower()

            if "signature" in red_text:
                replacements_made += replace_red_text_in_cell(cell, "[Signature]")
            elif "date" in red_text:
                replacements_made += replace_red_text_in_cell(cell, "[Date]")

    return replacements_made
|
| 851 |
+
|
| 852 |
+
def handle_print_accreditation_section(table, flat_json):
    """Fill red cells of a Print Accreditation table from the JSON data.

    For every cell with red text, the accreditation fields are tried in
    order; each non-blank value is written with
    ``replace_red_text_in_cell`` until one call reports a replacement.

    Args:
        table: Object exposing ``rows``, each with ``cells``.
        flat_json: Data passed through to ``find_matching_json_value``.

    Returns:
        int: Total of the counts returned by ``replace_red_text_in_cell``.
    """
    # NOTE(review): the leading glyph of this message looks mis-encoded and
    # the original character could not be recovered - confirm and restore.
    print(" π Processing Print Accreditation section")

    # Candidate JSON fields, tried in this order for every red cell.
    accreditation_fields = (
        "(print accreditation name)",
        "Print Name",
        "Operator name (Legal entity)",
    )

    replacements_made = 0
    for row in table.rows:
        for cell in row.cells:
            if not has_red_text(cell):
                continue

            for field in accreditation_fields:
                field_value = find_matching_json_value(field, flat_json)
                if field_value is None:
                    continue

                replacement_text = get_value_as_string(field_value, field)
                if not replacement_text.strip():
                    continue

                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                replacements_made += cell_replacements
                if cell_replacements > 0:
                    print(f" ✅ Fixed accreditation: {field}")
                    # The cell is filled; do not try the remaining fields.
                    break

    return replacements_made
|
| 880 |
+
|
| 881 |
+
def process_single_column_sections(cell, key_text, flat_json):
    """Replace the red text of a single-column section cell.

    The concatenated red text of the cell is looked up first; if that finds
    nothing, ``key_text`` is looked up instead. A found value replaces the
    red text of the cell.

    Args:
        cell: Object exposing ``paragraphs``, each with ``runs``.
        key_text: Fallback lookup key, also used in the log message.
        flat_json: Data passed through to ``find_matching_json_value``.

    Returns:
        int: The count returned by ``replace_red_text_in_cell``, or 0 when
        the cell has no usable red text or no value is found.
    """
    if not has_red_text(cell):
        return 0

    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    ).strip()
    if not red_text:
        return 0

    # Try direct matching first, then fall back to key-based matching.
    section_value = find_matching_json_value(red_text, flat_json)
    if section_value is None:
        section_value = find_matching_json_value(key_text, flat_json)
    if section_value is None:
        return 0

    section_replacement = get_value_as_string(section_value, red_text)
    cell_replacements = replace_red_text_in_cell(cell, section_replacement)
    if cell_replacements > 0:
        print(f" ✅ Fixed single column section: '{key_text}'")

    return cell_replacements
|
| 907 |
|
| 908 |
def process_tables(document, flat_json):
|
| 909 |
+
"""Process all tables in the document with comprehensive fixes"""
|
| 910 |
replacements_made = 0
|
| 911 |
|
| 912 |
for table_idx, table in enumerate(document.tables):
|
| 913 |
print(f"\nπ Processing table {table_idx + 1}:")
|
| 914 |
|
| 915 |
+
# Get table context
|
| 916 |
table_text = ""
|
| 917 |
for row in table.rows[:3]:
|
| 918 |
for cell in row.cells:
|
| 919 |
table_text += get_clean_text(cell).lower() + " "
|
| 920 |
|
| 921 |
+
# Detect Management Summary tables
|
| 922 |
management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
|
| 923 |
has_management = any(indicator in table_text for indicator in management_summary_indicators)
|
| 924 |
has_details = "details" in table_text
|
|
|
|
| 927 |
print(f" π Detected Management Summary table")
|
| 928 |
summary_fixes = fix_management_summary_details_column(table, flat_json)
|
| 929 |
replacements_made += summary_fixes
|
| 930 |
+
|
| 931 |
+
# Process remaining red text in management summary
|
| 932 |
summary_replacements = 0
|
| 933 |
for row_idx, row in enumerate(table.rows):
|
| 934 |
for cell_idx, cell in enumerate(row.cells):
|
|
|
|
| 962 |
replacements_made += summary_replacements
|
| 963 |
continue
|
| 964 |
|
| 965 |
+
# Detect Vehicle Registration tables
|
| 966 |
vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
|
| 967 |
indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
|
| 968 |
if indicator_count >= 2:
|
|
|
|
| 971 |
replacements_made += vehicle_replacements
|
| 972 |
continue
|
| 973 |
|
| 974 |
+
# Detect Attendance List tables
|
| 975 |
if "attendance list" in table_text and "names and position titles" in table_text:
|
| 976 |
print(f" π₯ Detected Attendance List table")
|
| 977 |
attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
|
| 978 |
replacements_made += attendance_replacements
|
| 979 |
continue
|
| 980 |
|
| 981 |
+
# Detect Print Accreditation tables
|
| 982 |
print_accreditation_indicators = ["print name", "position title"]
|
| 983 |
indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
|
| 984 |
if indicator_count >= 1:
|
| 985 |
print(f" π Detected Print Accreditation table")
|
| 986 |
+
|
| 987 |
+
# Check for declaration tables that need fixing
|
| 988 |
+
if "print name" in table_text and "position" in table_text:
|
| 989 |
+
declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
|
| 990 |
+
replacements_made += declaration_fixes
|
| 991 |
+
|
| 992 |
print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
|
| 993 |
replacements_made += print_accreditation_replacements
|
| 994 |
continue
|
| 995 |
|
| 996 |
+
# Process regular table rows
|
| 997 |
for row_idx, row in enumerate(table.rows):
|
| 998 |
if len(row.cells) < 1:
|
| 999 |
continue
|
|
|
|
| 1011 |
if json_value is not None:
|
| 1012 |
replacement_text = get_value_as_string(json_value, key_text)
|
| 1013 |
|
| 1014 |
+
# Handle Australian Company Number
|
| 1015 |
if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
|
| 1016 |
cell_replacements = handle_australian_company_number(row, json_value)
|
| 1017 |
replacements_made += cell_replacements
|
| 1018 |
|
| 1019 |
+
# Handle section headers
|
| 1020 |
elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
|
| 1021 |
+
print(f" β
Section header detected, checking next row...")
|
| 1022 |
next_row = table.rows[row_idx + 1]
|
| 1023 |
|
| 1024 |
for cell_idx, cell in enumerate(next_row.cells):
|
|
|
|
| 1029 |
cell_replacements = replace_red_text_in_cell(cell, replacement_text)
|
| 1030 |
replacements_made += cell_replacements
|
| 1031 |
if cell_replacements > 0:
|
| 1032 |
+
print(f" -> Replaced section content")
|
| 1033 |
|
| 1034 |
+
# Handle single column sections
|
| 1035 |
elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
|
| 1036 |
if has_red_text(key_cell):
|
| 1037 |
cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
|
| 1038 |
replacements_made += cell_replacements
|
| 1039 |
+
|
| 1040 |
+
# Handle regular key-value pairs
|
| 1041 |
else:
|
| 1042 |
for cell_idx in range(1, len(row.cells)):
|
| 1043 |
value_cell = row.cells[cell_idx]
|
|
|
|
| 1045 |
print(f" β
Found red text in column {cell_idx + 1}")
|
| 1046 |
cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
|
| 1047 |
replacements_made += cell_replacements
|
| 1048 |
+
|
| 1049 |
else:
|
| 1050 |
+
# Fallback processing for unmatched keys
|
| 1051 |
if len(row.cells) == 1 and has_red_text(key_cell):
|
| 1052 |
red_text = ""
|
| 1053 |
for paragraph in key_cell.paragraphs:
|
|
|
|
| 1061 |
cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
|
| 1062 |
replacements_made += cell_replacements
|
| 1063 |
|
| 1064 |
+
# Process red text in all cells
|
| 1065 |
for cell_idx in range(len(row.cells)):
|
| 1066 |
cell = row.cells[cell_idx]
|
| 1067 |
if has_red_text(cell):
|
| 1068 |
cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
|
| 1069 |
replacements_made += cell_replacements
|
| 1070 |
|
| 1071 |
+
# Apply fixes if no replacements made
|
| 1072 |
if cell_replacements == 0:
|
| 1073 |
surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
|
| 1074 |
replacements_made += surgical_fix
|
| 1075 |
|
| 1076 |
+
if cell_replacements == 0:
|
|
|
|
| 1077 |
management_summary_fix = handle_management_summary_fix(cell, flat_json)
|
| 1078 |
replacements_made += management_summary_fix
|
| 1079 |
|
| 1080 |
+
# Handle Operator/Auditor Declaration tables (check last few tables)
|
| 1081 |
+
print(f"\nπ― Final check for Declaration tables...")
|
| 1082 |
+
for table in document.tables[-3:]:
|
| 1083 |
+
if len(table.rows) <= 4:
|
| 1084 |
+
declaration_fix = handle_operator_declaration_fix(table, flat_json)
|
| 1085 |
+
replacements_made += declaration_fix
|
| 1086 |
+
|
| 1087 |
+
return replacements_made
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1088 |
|
| 1089 |
def process_paragraphs(document, flat_json):
|
| 1090 |
+
"""Process all paragraphs in the document"""
|
| 1091 |
replacements_made = 0
|
| 1092 |
print(f"\nπ Processing paragraphs:")
|
| 1093 |
|
| 1094 |
for para_idx, paragraph in enumerate(document.paragraphs):
|
| 1095 |
red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
|
| 1096 |
if red_runs:
|
|
|
|
| 1097 |
red_text_only = "".join(run.text for run in red_runs).strip()
|
| 1098 |
print(f" π Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
|
| 1099 |
|
|
|
|
| 1100 |
json_value = find_matching_json_value(red_text_only, flat_json)
|
| 1101 |
|
| 1102 |
if json_value is None:
|
|
|
|
| 1118 |
return replacements_made
|
| 1119 |
|
| 1120 |
def process_headings(document, flat_json):
|
| 1121 |
+
"""Process headings and their related content"""
|
| 1122 |
replacements_made = 0
|
| 1123 |
print(f"\nπ Processing headings:")
|
| 1124 |
|
|
|
|
| 1130 |
if not paragraph_text:
|
| 1131 |
continue
|
| 1132 |
|
| 1133 |
+
# Check if this is a heading
|
| 1134 |
matched_heading = None
|
| 1135 |
for category, patterns in HEADING_PATTERNS.items():
|
| 1136 |
for pattern in patterns:
|
|
|
|
| 1149 |
heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
|
| 1150 |
replacements_made += heading_replacements
|
| 1151 |
|
| 1152 |
+
# Look ahead for related content
|
| 1153 |
+
for next_para_offset in range(1, 6):
|
| 1154 |
next_para_idx = para_idx + next_para_offset
|
| 1155 |
if next_para_idx >= len(paragraphs):
|
| 1156 |
break
|
|
|
|
| 1174 |
if is_another_heading:
|
| 1175 |
break
|
| 1176 |
|
| 1177 |
+
# Process red text with context
|
| 1178 |
if has_red_text_in_paragraph(next_paragraph):
|
| 1179 |
+
print(f" π΄ Found red text in paragraph {next_para_idx + 1} after heading")
|
| 1180 |
|
| 1181 |
context_replacements = process_red_text_in_paragraph(
|
| 1182 |
next_paragraph,
|
|
|
|
| 1187 |
|
| 1188 |
return replacements_made
|
| 1189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1190 |
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
| 1191 |
+
"""Process red text within a paragraph using context"""
|
| 1192 |
replacements_made = 0
|
| 1193 |
|
| 1194 |
red_text_segments = []
|
|
|
|
| 1204 |
|
| 1205 |
json_value = None
|
| 1206 |
|
| 1207 |
+
# Direct matching
|
| 1208 |
json_value = find_matching_json_value(combined_red_text, flat_json)
|
| 1209 |
|
| 1210 |
+
# Context-based matching
|
| 1211 |
if json_value is None:
|
| 1212 |
if "NHVAS APPROVED AUDITOR" in context_text.upper():
|
| 1213 |
auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
|
|
|
|
| 1225 |
print(f" β
Found operator match with field: '{field}'")
|
| 1226 |
break
|
| 1227 |
|
| 1228 |
+
# Combined context queries
|
| 1229 |
if json_value is None:
|
| 1230 |
context_queries = [
|
| 1231 |
f"{context_text} {combined_red_text}",
|
|
|
|
| 1236 |
for query in context_queries:
|
| 1237 |
json_value = find_matching_json_value(query, flat_json)
|
| 1238 |
if json_value is not None:
|
| 1239 |
+
print(f" β
Found match with combined query")
|
| 1240 |
break
|
| 1241 |
|
| 1242 |
# Replace if match found
|
|
|
|
| 1259 |
return replacements_made
|
| 1260 |
|
| 1261 |
def force_red_text_replacement(document, flat_json):
|
| 1262 |
+
"""Force replacement of any remaining red text by trying ALL JSON values"""
|
| 1263 |
replacements_made = 0
|
| 1264 |
print(f"\nπ― FORCE FIX: Scanning for any remaining red text...")
|
| 1265 |
|
| 1266 |
+
# Collect all possible replacement values from JSON
|
| 1267 |
all_values = {}
|
| 1268 |
for key, value in flat_json.items():
|
| 1269 |
if value:
|
|
|
|
| 1270 |
value_str = get_value_as_string(value, key)
|
| 1271 |
|
|
|
|
| 1272 |
if value_str and isinstance(value_str, str) and value_str.strip():
|
| 1273 |
all_values[key] = value_str.strip()
|
| 1274 |
|
| 1275 |
+
# Store individual items from lists for partial matching
|
| 1276 |
if isinstance(value, list):
|
| 1277 |
for i, item in enumerate(value):
|
| 1278 |
item_str = str(item).strip() if item else ""
|
|
|
|
| 1298 |
combined_red_text = " ".join(red_text_parts).strip()
|
| 1299 |
print(f" Red text: '{combined_red_text}'")
|
| 1300 |
|
| 1301 |
+
# Find best match
|
| 1302 |
best_match = None
|
| 1303 |
best_key = None
|
| 1304 |
|
| 1305 |
+
# Exact matching
|
| 1306 |
for key, value in all_values.items():
|
| 1307 |
if combined_red_text.lower() == value.lower():
|
| 1308 |
best_match = value
|
| 1309 |
best_key = key
|
| 1310 |
break
|
| 1311 |
|
| 1312 |
+
# Partial matching
|
| 1313 |
if not best_match:
|
| 1314 |
for key, value in all_values.items():
|
|
|
|
| 1315 |
if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
|
| 1316 |
(len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
|
| 1317 |
best_match = value
|
| 1318 |
best_key = key
|
| 1319 |
break
|
| 1320 |
|
| 1321 |
+
# Word-by-word matching for names/dates
|
| 1322 |
if not best_match:
|
| 1323 |
red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
|
| 1324 |
best_score = 0
|
|
|
|
| 1407 |
|
| 1408 |
return replacements_made
|
| 1409 |
|
|
|
|
| 1410 |
def process_hf(json_file, docx_file, output_file):
|
| 1411 |
+
"""Main processing function with comprehensive error handling"""
|
| 1412 |
try:
|
| 1413 |
# Load JSON
|
| 1414 |
if hasattr(json_file, "read"):
|
|
|
|
| 1430 |
else:
|
| 1431 |
doc = Document(docx_file)
|
| 1432 |
|
| 1433 |
+
# Process document with all fixes
|
| 1434 |
+
print("π Starting comprehensive document processing...")
|
| 1435 |
|
| 1436 |
table_replacements = process_tables(doc, flat_json)
|
| 1437 |
paragraph_replacements = process_paragraphs(doc, flat_json)
|
| 1438 |
heading_replacements = process_headings(doc, flat_json)
|
| 1439 |
|
| 1440 |
+
# Final force fix for any remaining red text
|
| 1441 |
force_replacements = force_red_text_replacement(doc, flat_json)
|
| 1442 |
|
| 1443 |
total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements
|