Shami96 committed on
Commit
c0e794c
·
verified ·
1 Parent(s): b93e8d9

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +261 -71
updated_word.py CHANGED
@@ -602,7 +602,7 @@ def fix_operator_declaration_empty_values(table, flat_json):
602
  """Fix Operator Declaration table when values are empty"""
603
  replacements_made = 0
604
 
605
- print(f" 🎯 FIX 2: Operator Declaration empty values processing")
606
 
607
  # Check if this is an Operator Declaration table
608
  table_context = ""
@@ -657,13 +657,11 @@ def fix_operator_declaration_empty_values(table, flat_json):
657
  if name_replacement.strip():
658
  # Extract just the name if it's a company name
659
  if "Pty Ltd" in name_replacement or "Company" in name_replacement:
660
- # Try to get individual name instead
661
  continue
662
 
663
  if has_red_text(name_cell):
664
  cell_replacements = replace_red_text_in_cell(name_cell, name_replacement)
665
  else:
666
- # Cell is empty, add text directly
667
  name_cell.text = name_replacement
668
  cell_replacements = 1
669
 
@@ -675,7 +673,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
675
  if not position_text or has_red_text(position_cell):
676
  print(f" πŸ”§ Fixing empty/red Position Title")
677
 
678
- # Try multiple sources for position
679
  position_sources = [
680
  "Operator Declaration.Position Title",
681
  "Position Title"
@@ -690,7 +687,6 @@ def fix_operator_declaration_empty_values(table, flat_json):
690
  if has_red_text(position_cell):
691
  cell_replacements = replace_red_text_in_cell(position_cell, position_replacement)
692
  else:
693
- # Cell is empty, add text directly
694
  position_cell.text = position_replacement
695
  cell_replacements = 1
696
 
@@ -703,25 +699,226 @@ def fix_operator_declaration_empty_values(table, flat_json):
703
  position_cell.text = "Manager"
704
  replacements_made += 1
705
  print(f" βœ… Used fallback Position Title: 'Manager'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
 
707
- break # Found the table, stop looking
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
708
 
709
  return replacements_made
710
 
711
  def process_tables(document, flat_json):
712
- """Your original function with ALL surgical fixes added"""
713
  replacements_made = 0
714
 
715
  for table_idx, table in enumerate(document.tables):
716
  print(f"\nπŸ” Processing table {table_idx + 1}:")
717
 
718
- # Your original logic
719
  table_text = ""
720
  for row in table.rows[:3]:
721
  for cell in row.cells:
722
  table_text += get_clean_text(cell).lower() + " "
723
 
724
- # 🎯 NEW: Detect Management Summary tables (with DETAILS column)
725
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
726
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
727
  has_details = "details" in table_text
@@ -730,7 +927,8 @@ def process_tables(document, flat_json):
730
  print(f" πŸ“‹ Detected Management Summary table")
731
  summary_fixes = fix_management_summary_details_column(table, flat_json)
732
  replacements_made += summary_fixes
733
- # Process each cell in the table to find red text and apply the existing fix
 
734
  summary_replacements = 0
735
  for row_idx, row in enumerate(table.rows):
736
  for cell_idx, cell in enumerate(row.cells):
@@ -764,7 +962,7 @@ def process_tables(document, flat_json):
764
  replacements_made += summary_replacements
765
  continue
766
 
767
- # Enhanced vehicle registration detection
768
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
769
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
770
  if indicator_count >= 2:
@@ -773,23 +971,29 @@ def process_tables(document, flat_json):
773
  replacements_made += vehicle_replacements
774
  continue
775
 
776
- # 🎯 FINAL FIX 1: Enhanced attendance list detection
777
  if "attendance list" in table_text and "names and position titles" in table_text:
778
  print(f" πŸ‘₯ Detected Attendance List table")
779
  attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
780
  replacements_made += attendance_replacements
781
  continue
782
 
783
- # Enhanced print accreditation detection
784
  print_accreditation_indicators = ["print name", "position title"]
785
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
786
  if indicator_count >= 1:
787
  print(f" πŸ“‹ Detected Print Accreditation table")
 
 
 
 
 
 
788
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
789
  replacements_made += print_accreditation_replacements
790
  continue
791
 
792
- # Your existing row processing
793
  for row_idx, row in enumerate(table.rows):
794
  if len(row.cells) < 1:
795
  continue
@@ -807,14 +1011,14 @@ def process_tables(document, flat_json):
807
  if json_value is not None:
808
  replacement_text = get_value_as_string(json_value, key_text)
809
 
810
- # Enhanced ACN handling
811
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
812
  cell_replacements = handle_australian_company_number(row, json_value)
813
  replacements_made += cell_replacements
814
 
815
- # Enhanced section header handling
816
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
817
- print(f" βœ… Section header detected, checking next row for content...")
818
  next_row = table.rows[row_idx + 1]
819
 
820
  for cell_idx, cell in enumerate(next_row.cells):
@@ -825,12 +1029,15 @@ def process_tables(document, flat_json):
825
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
826
  replacements_made += cell_replacements
827
  if cell_replacements > 0:
828
- print(f" -> Replaced section content with: '{replacement_text[:100]}...'")
829
 
 
830
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
831
  if has_red_text(key_cell):
832
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
833
  replacements_made += cell_replacements
 
 
834
  else:
835
  for cell_idx in range(1, len(row.cells)):
836
  value_cell = row.cells[cell_idx]
@@ -838,8 +1045,9 @@ def process_tables(document, flat_json):
838
  print(f" βœ… Found red text in column {cell_idx + 1}")
839
  cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
840
  replacements_made += cell_replacements
 
841
  else:
842
- # Enhanced fallback processing for unmatched keys
843
  if len(row.cells) == 1 and has_red_text(key_cell):
844
  red_text = ""
845
  for paragraph in key_cell.paragraphs:
@@ -853,49 +1061,42 @@ def process_tables(document, flat_json):
853
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
854
  replacements_made += cell_replacements
855
 
856
- # Enhanced red text processing for all cells
857
  for cell_idx in range(len(row.cells)):
858
  cell = row.cells[cell_idx]
859
  if has_red_text(cell):
860
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
861
  replacements_made += cell_replacements
862
 
863
- # 🎯 SURGICAL FIX 1: Only if no replacements were made
864
  if cell_replacements == 0:
865
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
866
  replacements_made += surgical_fix
867
 
868
- # 🎯 FINAL FIX 2: Only if still no replacements were made, try ANY Management Summary fix
869
- if cell_replacements == 0 and surgical_fix == 0:
870
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
871
  replacements_made += management_summary_fix
872
 
873
- # 🎯 SURGICAL FIX 3: Handle Operator Declaration tables (only check last few tables)
874
- print(f"\n🎯 SURGICAL FIX: Checking for Operator/Auditor Declaration tables...")
875
- for table in document.tables[-3:]: # Only check last 3 tables
876
- if len(table.rows) <= 4: # Only small tables
877
- declaration_fix = handle_operator_declaration_fix(table, flat_json)
878
- replacements_made += declaration_fix
879
- # Check for declaration tables that need fixing
880
- if "print name" in table_text and "position" in table_text:
881
- declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
882
- replacements_made += declaration_fixes
883
-
884
- return replacements_made
885
 
886
  def process_paragraphs(document, flat_json):
887
- """Your original function (unchanged)"""
888
  replacements_made = 0
889
  print(f"\nπŸ” Processing paragraphs:")
890
 
891
  for para_idx, paragraph in enumerate(document.paragraphs):
892
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
893
  if red_runs:
894
- full_text = paragraph.text.strip()
895
  red_text_only = "".join(run.text for run in red_runs).strip()
896
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
897
 
898
- # Your existing matching logic
899
  json_value = find_matching_json_value(red_text_only, flat_json)
900
 
901
  if json_value is None:
@@ -917,7 +1118,7 @@ def process_paragraphs(document, flat_json):
917
  return replacements_made
918
 
919
  def process_headings(document, flat_json):
920
- """Your original function (unchanged)"""
921
  replacements_made = 0
922
  print(f"\nπŸ” Processing headings:")
923
 
@@ -929,7 +1130,7 @@ def process_headings(document, flat_json):
929
  if not paragraph_text:
930
  continue
931
 
932
- # Enhanced heading detection
933
  matched_heading = None
934
  for category, patterns in HEADING_PATTERNS.items():
935
  for pattern in patterns:
@@ -948,8 +1149,8 @@ def process_headings(document, flat_json):
948
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
949
  replacements_made += heading_replacements
950
 
951
- # Enhanced: Look further ahead for related content
952
- for next_para_offset in range(1, 6): # Extended range
953
  next_para_idx = para_idx + next_para_offset
954
  if next_para_idx >= len(paragraphs):
955
  break
@@ -973,9 +1174,9 @@ def process_headings(document, flat_json):
973
  if is_another_heading:
974
  break
975
 
976
- # Process red text with enhanced context
977
  if has_red_text_in_paragraph(next_paragraph):
978
- print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading: '{next_text[:50]}...'")
979
 
980
  context_replacements = process_red_text_in_paragraph(
981
  next_paragraph,
@@ -986,15 +1187,8 @@ def process_headings(document, flat_json):
986
 
987
  return replacements_made
988
 
989
- def has_red_text_in_paragraph(paragraph):
990
- """Your original function (unchanged)"""
991
- for run in paragraph.runs:
992
- if is_red(run) and run.text.strip():
993
- return True
994
- return False
995
-
996
  def process_red_text_in_paragraph(paragraph, context_text, flat_json):
997
- """Your original function (unchanged)"""
998
  replacements_made = 0
999
 
1000
  red_text_segments = []
@@ -1010,10 +1204,10 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1010
 
1011
  json_value = None
1012
 
1013
- # Strategy 1: Direct matching
1014
  json_value = find_matching_json_value(combined_red_text, flat_json)
1015
 
1016
- # Strategy 2: Enhanced context-based matching
1017
  if json_value is None:
1018
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
1019
  auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
@@ -1031,7 +1225,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1031
  print(f" βœ… Found operator match with field: '{field}'")
1032
  break
1033
 
1034
- # Strategy 3: Enhanced context combination
1035
  if json_value is None:
1036
  context_queries = [
1037
  f"{context_text} {combined_red_text}",
@@ -1042,7 +1236,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1042
  for query in context_queries:
1043
  json_value = find_matching_json_value(query, flat_json)
1044
  if json_value is not None:
1045
- print(f" βœ… Found match with combined query: '{query[:50]}...'")
1046
  break
1047
 
1048
  # Replace if match found
@@ -1065,22 +1259,20 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1065
  return replacements_made
1066
 
1067
  def force_red_text_replacement(document, flat_json):
1068
- """Force replacement of any remaining red text by trying ALL JSON values - FIXED"""
1069
  replacements_made = 0
1070
  print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
1071
 
1072
- # Collect ALL possible replacement values from JSON - FIXED to handle lists properly
1073
  all_values = {}
1074
  for key, value in flat_json.items():
1075
  if value:
1076
- # Convert value to string properly
1077
  value_str = get_value_as_string(value, key)
1078
 
1079
- # Only add if we have a valid string
1080
  if value_str and isinstance(value_str, str) and value_str.strip():
1081
  all_values[key] = value_str.strip()
1082
 
1083
- # Also store individual items from lists for partial matching
1084
  if isinstance(value, list):
1085
  for i, item in enumerate(value):
1086
  item_str = str(item).strip() if item else ""
@@ -1106,28 +1298,27 @@ def force_red_text_replacement(document, flat_json):
1106
  combined_red_text = " ".join(red_text_parts).strip()
1107
  print(f" Red text: '{combined_red_text}'")
1108
 
1109
- # Try to find a match
1110
  best_match = None
1111
  best_key = None
1112
 
1113
- # First try exact matching
1114
  for key, value in all_values.items():
1115
  if combined_red_text.lower() == value.lower():
1116
  best_match = value
1117
  best_key = key
1118
  break
1119
 
1120
- # If no exact match, try partial matching
1121
  if not best_match:
1122
  for key, value in all_values.items():
1123
- # Try if red text contains this value or vice versa
1124
  if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1125
  (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1126
  best_match = value
1127
  best_key = key
1128
  break
1129
 
1130
- # If still no match, try word-by-word matching for names/dates
1131
  if not best_match:
1132
  red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1133
  best_score = 0
@@ -1216,9 +1407,8 @@ def force_red_text_replacement(document, flat_json):
1216
 
1217
  return replacements_made
1218
 
1219
-
1220
  def process_hf(json_file, docx_file, output_file):
1221
- """Your original main function with force fix added at the end"""
1222
  try:
1223
  # Load JSON
1224
  if hasattr(json_file, "read"):
@@ -1240,14 +1430,14 @@ def process_hf(json_file, docx_file, output_file):
1240
  else:
1241
  doc = Document(docx_file)
1242
 
1243
- # Your original processing with surgical fixes
1244
- print("πŸš€ Starting processing with minimal surgical fixes...")
1245
 
1246
  table_replacements = process_tables(doc, flat_json)
1247
  paragraph_replacements = process_paragraphs(doc, flat_json)
1248
  heading_replacements = process_headings(doc, flat_json)
1249
 
1250
- # 🎯 ADD THIS: Force fix for any remaining red text
1251
  force_replacements = force_red_text_replacement(doc, flat_json)
1252
 
1253
  total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements
 
602
  """Fix Operator Declaration table when values are empty"""
603
  replacements_made = 0
604
 
605
+ print(f" 🎯 FIX: Operator Declaration empty values processing")
606
 
607
  # Check if this is an Operator Declaration table
608
  table_context = ""
 
657
  if name_replacement.strip():
658
  # Extract just the name if it's a company name
659
  if "Pty Ltd" in name_replacement or "Company" in name_replacement:
 
660
  continue
661
 
662
  if has_red_text(name_cell):
663
  cell_replacements = replace_red_text_in_cell(name_cell, name_replacement)
664
  else:
 
665
  name_cell.text = name_replacement
666
  cell_replacements = 1
667
 
 
673
  if not position_text or has_red_text(position_cell):
674
  print(f" πŸ”§ Fixing empty/red Position Title")
675
 
 
676
  position_sources = [
677
  "Operator Declaration.Position Title",
678
  "Position Title"
 
687
  if has_red_text(position_cell):
688
  cell_replacements = replace_red_text_in_cell(position_cell, position_replacement)
689
  else:
 
690
  position_cell.text = position_replacement
691
  cell_replacements = 1
692
 
 
699
  position_cell.text = "Manager"
700
  replacements_made += 1
701
  print(f" βœ… Used fallback Position Title: 'Manager'")
702
+ break
703
+
704
+ return replacements_made
705
+
def handle_multiple_red_segments_in_cell(cell, flat_json):
    """Replace every red text segment of a cell that has its own JSON match.

    The segments are obtained from ``extract_red_text_segments``. Each
    non-blank segment is looked up on its own with
    ``find_matching_json_value``; on a hit the value is converted with
    ``get_value_as_string`` and written back via ``replace_single_segment``.

    Args:
        cell: Table cell to scan for red segments.
        flat_json: Data searched for a value matching each segment's text.

    Returns:
        The number of segments for which ``replace_single_segment``
        reported success (0 when the cell has no red segments).
    """
    segments = extract_red_text_segments(cell)
    if not segments:
        return 0

    replaced_count = 0
    for position, segment in enumerate(segments, start=1):
        segment_text = segment['text'].strip()
        if not segment_text:
            continue

        matched_value = find_matching_json_value(segment_text, flat_json)
        if matched_value is None:
            continue

        replacement_text = get_value_as_string(matched_value, segment_text)
        if replace_single_segment(segment, replacement_text):
            replaced_count += 1
            print(f" βœ… Replaced segment {position}: '{segment_text}' -> '{replacement_text}'")

    return replaced_count
726
+
def handle_nature_business_multiline_fix(cell, flat_json):
    """Replace red text that looks like a "Nature of Business" description.

    The text of all red runs in the cell is concatenated. If it mentions one
    of a few transport-related keywords, the value found for
    "Nature of Business" in ``flat_json`` replaces the red text.

    Args:
        cell: Table cell exposing ``paragraphs`` whose items expose ``runs``.
        flat_json: Data searched through ``find_matching_json_value``.

    Returns:
        The count reported by ``replace_red_text_in_cell``, or 0 when the
        cell has no red text, the text does not look like a nature-of-business
        entry, or no JSON value is found.
    """
    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    ).strip()
    if not red_text:
        return 0

    # Keyword heuristic: only touch cells whose red text mentions one of these.
    nature_indicators = ("transport", "logistics", "freight", "delivery", "trucking", "haulage")
    red_lower = red_text.lower()
    if not any(indicator in red_lower for indicator in nature_indicators):
        return 0

    nature_value = find_matching_json_value("Nature of Business", flat_json)
    if nature_value is None:
        return 0

    replacement_text = get_value_as_string(nature_value, "Nature of Business")
    cell_replacements = replace_red_text_in_cell(cell, replacement_text)

    # Report success only when something was actually replaced.
    if cell_replacements > 0:
        print(f" βœ… Fixed Nature of Business multiline content")

    return cell_replacements
754
+
def handle_management_summary_fix(cell, flat_json):
    """Replace a cell's red text with a matching management-summary standard.

    The red text of the cell is compared, case-insensitively and by substring
    in either direction, against every item of every list found under the
    "Mass/Maintenance/Fatigue Management Summary" entries of ``flat_json``.
    On the first hit the whole list (joined with newlines) replaces the red
    text and the function returns.

    Args:
        cell: Table cell exposing ``paragraphs`` whose items expose ``runs``.
        flat_json: Mapping that may hold the management summary dictionaries.

    Returns:
        The count reported by ``replace_red_text_in_cell`` for the first
        match, or 0 when nothing matched.
    """
    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    ).strip()
    if not red_text:
        return 0

    # Only red text longer than 10 characters is matched (presumably to avoid
    # spurious substring hits on short fragments); checked once up front.
    if len(red_text) <= 10:
        return 0

    red_lower = red_text.lower()
    management_types = (
        "Mass Management Summary",
        "Maintenance Management Summary",
        "Fatigue Management Summary",
    )

    for mgmt_type in management_types:
        mgmt_data = flat_json.get(mgmt_type)
        if not isinstance(mgmt_data, dict):
            continue

        for std_key, std_value in mgmt_data.items():
            if not (isinstance(std_value, list) and std_value):
                continue

            for item in std_value:
                item_lower = str(item).lower()
                # A blank item is a substring of every string and would make
                # any red text "match" this standard, so skip it.
                if not item_lower.strip():
                    continue

                if red_lower in item_lower or item_lower in red_lower:
                    replacement_text = "\n".join(str(i) for i in std_value)
                    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                    # Report success only when something was replaced.
                    if cell_replacements > 0:
                        print(f" βœ… Fixed {mgmt_type} - {std_key}")
                    return cell_replacements

    return 0
791
+
def handle_operator_declaration_fix(table, flat_json):
    """Fill red cells of a small Operator/Auditor declaration table.

    Tables with more than four rows are ignored, as are tables whose text
    contains none of "print name", "signature" or "date". For each cell with
    red text the declaration fields are tried in order until one yields a
    non-blank value that ``replace_red_text_in_cell`` actually applies. If
    none does, red text mentioning "signature" or "date" is replaced with the
    placeholder "[Signature]" or "[Date]".

    Args:
        table: Table exposing ``rows`` -> ``cells`` -> ``paragraphs`` -> ``runs``.
        flat_json: Data searched through ``find_matching_json_value``.

    Returns:
        Sum of the counts reported by ``replace_red_text_in_cell``.
    """
    if len(table.rows) > 4:  # Only process small tables
        return 0

    # Lower-cased text of the whole table, used to recognise declarations.
    table_text = " ".join(
        get_clean_text(cell).lower()
        for row in table.rows
        for cell in row.cells
    )
    if not any(marker in table_text for marker in ("print name", "signature", "date")):
        return 0

    print(f" 🎯 Processing declaration table")

    # Candidate JSON fields, tried in this order for every red cell.
    declaration_fields = (
        "Print Name", "Position Title", "Signature", "Date",
        "Operator Declaration.Print Name", "Operator Declaration.Position Title",
        "NHVAS Approved Auditor Declaration.Print Name",
    )

    replacements_made = 0
    for row in table.rows:
        for cell in row.cells:
            if not has_red_text(cell):
                continue

            # NOTE(review): the first field that resolves wins regardless of
            # what the cell is for, so several red cells may receive the same
            # value - confirm this is intended.
            replaced = False
            for field in declaration_fields:
                field_value = find_matching_json_value(field, flat_json)
                if field_value is None:
                    continue

                replacement_text = get_value_as_string(field_value, field)
                if not replacement_text.strip():
                    continue

                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                if cell_replacements > 0:
                    replacements_made += cell_replacements
                    print(f" βœ… Fixed declaration field: {field}")
                    replaced = True
                    break

            if replaced:
                continue

            # No field applied: fall back to generic placeholders keyed on
            # the red text itself.
            red_lower = "".join(
                run.text
                for paragraph in cell.paragraphs
                for run in paragraph.runs
                if is_red(run)
            ).lower()

            if "signature" in red_lower:
                replacements_made += replace_red_text_in_cell(cell, "[Signature]")
            elif "date" in red_lower:
                replacements_made += replace_red_text_in_cell(cell, "[Date]")

    return replacements_made
851
+
def handle_print_accreditation_section(table, flat_json):
    """Replace red text in a Print Accreditation table.

    For every cell with red text the accreditation fields are tried in order;
    the first one that yields a non-blank value which
    ``replace_red_text_in_cell`` actually applies ends the search for that
    cell.

    Args:
        table: Table exposing ``rows`` whose items expose ``cells``.
        flat_json: Data searched through ``find_matching_json_value``.

    Returns:
        Sum of the counts reported by ``replace_red_text_in_cell``.
    """
    print(f" πŸ“‹ Processing Print Accreditation section")

    # Candidate JSON fields, tried in this order for every red cell.
    accreditation_fields = (
        "(print accreditation name)",
        "Print Name",
        "Operator name (Legal entity)",
    )

    replacements_made = 0
    for row in table.rows:
        for cell in row.cells:
            if not has_red_text(cell):
                continue

            for field in accreditation_fields:
                field_value = find_matching_json_value(field, flat_json)
                if field_value is None:
                    continue

                replacement_text = get_value_as_string(field_value, field)
                if not replacement_text.strip():
                    continue

                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                replacements_made += cell_replacements
                if cell_replacements > 0:
                    print(f" βœ… Fixed accreditation: {field}")
                    break

    return replacements_made
880
+
def process_single_column_sections(cell, key_text, flat_json):
    """Replace the red text of a single-column section cell.

    The concatenated red text is looked up first; if that finds nothing,
    ``key_text`` is used as the lookup instead. A found value is converted
    with ``get_value_as_string`` and written with ``replace_red_text_in_cell``.

    Args:
        cell: Cell to process.
        key_text: Text of the row's key cell, used as the fallback lookup.
        flat_json: Data searched through ``find_matching_json_value``.

    Returns:
        The count reported by ``replace_red_text_in_cell``, or 0 when the
        cell has no red text or no value was found.
    """
    if not has_red_text(cell):
        return 0

    collected = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    )
    lookup_text = collected.strip()
    if not lookup_text:
        return 0

    # Direct match on the red text, then fall back to the row key.
    section_value = find_matching_json_value(lookup_text, flat_json)
    if section_value is None:
        section_value = find_matching_json_value(key_text, flat_json)
    if section_value is None:
        return 0

    section_replacement = get_value_as_string(section_value, lookup_text)
    cell_replacements = replace_red_text_in_cell(cell, section_replacement)
    if cell_replacements > 0:
        print(f" βœ… Fixed single column section: '{key_text}'")

    return cell_replacements
907
 
908
  def process_tables(document, flat_json):
909
+ """Process all tables in the document with comprehensive fixes"""
910
  replacements_made = 0
911
 
912
  for table_idx, table in enumerate(document.tables):
913
  print(f"\nπŸ” Processing table {table_idx + 1}:")
914
 
915
+ # Get table context
916
  table_text = ""
917
  for row in table.rows[:3]:
918
  for cell in row.cells:
919
  table_text += get_clean_text(cell).lower() + " "
920
 
921
+ # Detect Management Summary tables
922
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
923
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
924
  has_details = "details" in table_text
 
927
  print(f" πŸ“‹ Detected Management Summary table")
928
  summary_fixes = fix_management_summary_details_column(table, flat_json)
929
  replacements_made += summary_fixes
930
+
931
+ # Process remaining red text in management summary
932
  summary_replacements = 0
933
  for row_idx, row in enumerate(table.rows):
934
  for cell_idx, cell in enumerate(row.cells):
 
962
  replacements_made += summary_replacements
963
  continue
964
 
965
+ # Detect Vehicle Registration tables
966
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
967
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
968
  if indicator_count >= 2:
 
971
  replacements_made += vehicle_replacements
972
  continue
973
 
974
+ # Detect Attendance List tables
975
  if "attendance list" in table_text and "names and position titles" in table_text:
976
  print(f" πŸ‘₯ Detected Attendance List table")
977
  attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
978
  replacements_made += attendance_replacements
979
  continue
980
 
981
+ # Detect Print Accreditation tables
982
  print_accreditation_indicators = ["print name", "position title"]
983
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
984
  if indicator_count >= 1:
985
  print(f" πŸ“‹ Detected Print Accreditation table")
986
+
987
+ # Check for declaration tables that need fixing
988
+ if "print name" in table_text and "position" in table_text:
989
+ declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
990
+ replacements_made += declaration_fixes
991
+
992
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
993
  replacements_made += print_accreditation_replacements
994
  continue
995
 
996
+ # Process regular table rows
997
  for row_idx, row in enumerate(table.rows):
998
  if len(row.cells) < 1:
999
  continue
 
1011
  if json_value is not None:
1012
  replacement_text = get_value_as_string(json_value, key_text)
1013
 
1014
+ # Handle Australian Company Number
1015
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1016
  cell_replacements = handle_australian_company_number(row, json_value)
1017
  replacements_made += cell_replacements
1018
 
1019
+ # Handle section headers
1020
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1021
+ print(f" βœ… Section header detected, checking next row...")
1022
  next_row = table.rows[row_idx + 1]
1023
 
1024
  for cell_idx, cell in enumerate(next_row.cells):
 
1029
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
1030
  replacements_made += cell_replacements
1031
  if cell_replacements > 0:
1032
+ print(f" -> Replaced section content")
1033
 
1034
+ # Handle single column sections
1035
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1036
  if has_red_text(key_cell):
1037
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1038
  replacements_made += cell_replacements
1039
+
1040
+ # Handle regular key-value pairs
1041
  else:
1042
  for cell_idx in range(1, len(row.cells)):
1043
  value_cell = row.cells[cell_idx]
 
1045
  print(f" βœ… Found red text in column {cell_idx + 1}")
1046
  cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
1047
  replacements_made += cell_replacements
1048
+
1049
  else:
1050
+ # Fallback processing for unmatched keys
1051
  if len(row.cells) == 1 and has_red_text(key_cell):
1052
  red_text = ""
1053
  for paragraph in key_cell.paragraphs:
 
1061
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1062
  replacements_made += cell_replacements
1063
 
1064
+ # Process red text in all cells
1065
  for cell_idx in range(len(row.cells)):
1066
  cell = row.cells[cell_idx]
1067
  if has_red_text(cell):
1068
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1069
  replacements_made += cell_replacements
1070
 
1071
+ # Apply fixes if no replacements made
1072
  if cell_replacements == 0:
1073
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1074
  replacements_made += surgical_fix
1075
 
1076
+ if cell_replacements == 0:
 
1077
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1078
  replacements_made += management_summary_fix
1079
 
1080
+ # Handle Operator/Auditor Declaration tables (check last few tables)
1081
+ print(f"\n🎯 Final check for Declaration tables...")
1082
+ for table in document.tables[-3:]:
1083
+ if len(table.rows) <= 4:
1084
+ declaration_fix = handle_operator_declaration_fix(table, flat_json)
1085
+ replacements_made += declaration_fix
1086
+
1087
+ return replacements_made
 
 
 
 
1088
 
1089
  def process_paragraphs(document, flat_json):
1090
+ """Process all paragraphs in the document"""
1091
  replacements_made = 0
1092
  print(f"\nπŸ” Processing paragraphs:")
1093
 
1094
  for para_idx, paragraph in enumerate(document.paragraphs):
1095
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1096
  if red_runs:
 
1097
  red_text_only = "".join(run.text for run in red_runs).strip()
1098
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
1099
 
 
1100
  json_value = find_matching_json_value(red_text_only, flat_json)
1101
 
1102
  if json_value is None:
 
1118
  return replacements_made
1119
 
1120
  def process_headings(document, flat_json):
1121
+ """Process headings and their related content"""
1122
  replacements_made = 0
1123
  print(f"\nπŸ” Processing headings:")
1124
 
 
1130
  if not paragraph_text:
1131
  continue
1132
 
1133
+ # Check if this is a heading
1134
  matched_heading = None
1135
  for category, patterns in HEADING_PATTERNS.items():
1136
  for pattern in patterns:
 
1149
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1150
  replacements_made += heading_replacements
1151
 
1152
+ # Look ahead for related content
1153
+ for next_para_offset in range(1, 6):
1154
  next_para_idx = para_idx + next_para_offset
1155
  if next_para_idx >= len(paragraphs):
1156
  break
 
1174
  if is_another_heading:
1175
  break
1176
 
1177
+ # Process red text with context
1178
  if has_red_text_in_paragraph(next_paragraph):
1179
+ print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
1180
 
1181
  context_replacements = process_red_text_in_paragraph(
1182
  next_paragraph,
 
1187
 
1188
  return replacements_made
1189
 
 
 
 
 
 
 
 
1190
  def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1191
+ """Process red text within a paragraph using context"""
1192
  replacements_made = 0
1193
 
1194
  red_text_segments = []
 
1204
 
1205
  json_value = None
1206
 
1207
+ # Direct matching
1208
  json_value = find_matching_json_value(combined_red_text, flat_json)
1209
 
1210
+ # Context-based matching
1211
  if json_value is None:
1212
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
1213
  auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
 
1225
  print(f" βœ… Found operator match with field: '{field}'")
1226
  break
1227
 
1228
+ # Combined context queries
1229
  if json_value is None:
1230
  context_queries = [
1231
  f"{context_text} {combined_red_text}",
 
1236
  for query in context_queries:
1237
  json_value = find_matching_json_value(query, flat_json)
1238
  if json_value is not None:
1239
+ print(f" βœ… Found match with combined query")
1240
  break
1241
 
1242
  # Replace if match found
 
1259
  return replacements_made
1260
 
1261
  def force_red_text_replacement(document, flat_json):
1262
+ """Force replacement of any remaining red text by trying ALL JSON values"""
1263
  replacements_made = 0
1264
  print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
1265
 
1266
+ # Collect all possible replacement values from JSON
1267
  all_values = {}
1268
  for key, value in flat_json.items():
1269
  if value:
 
1270
  value_str = get_value_as_string(value, key)
1271
 
 
1272
  if value_str and isinstance(value_str, str) and value_str.strip():
1273
  all_values[key] = value_str.strip()
1274
 
1275
+ # Store individual items from lists for partial matching
1276
  if isinstance(value, list):
1277
  for i, item in enumerate(value):
1278
  item_str = str(item).strip() if item else ""
 
1298
  combined_red_text = " ".join(red_text_parts).strip()
1299
  print(f" Red text: '{combined_red_text}'")
1300
 
1301
+ # Find best match
1302
  best_match = None
1303
  best_key = None
1304
 
1305
+ # Exact matching
1306
  for key, value in all_values.items():
1307
  if combined_red_text.lower() == value.lower():
1308
  best_match = value
1309
  best_key = key
1310
  break
1311
 
1312
+ # Partial matching
1313
  if not best_match:
1314
  for key, value in all_values.items():
 
1315
  if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1316
  (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1317
  best_match = value
1318
  best_key = key
1319
  break
1320
 
1321
+ # Word-by-word matching for names/dates
1322
  if not best_match:
1323
  red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1324
  best_score = 0
 
1407
 
1408
  return replacements_made
1409
 
 
1410
  def process_hf(json_file, docx_file, output_file):
1411
+ """Main processing function with comprehensive error handling"""
1412
  try:
1413
  # Load JSON
1414
  if hasattr(json_file, "read"):
 
1430
  else:
1431
  doc = Document(docx_file)
1432
 
1433
+ # Process document with all fixes
1434
+ print("πŸš€ Starting comprehensive document processing...")
1435
 
1436
  table_replacements = process_tables(doc, flat_json)
1437
  paragraph_replacements = process_paragraphs(doc, flat_json)
1438
  heading_replacements = process_headings(doc, flat_json)
1439
 
1440
+ # Final force fix for any remaining red text
1441
  force_replacements = force_red_text_replacement(doc, flat_json)
1442
 
1443
  total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements