Shami96 committed on
Commit
412e2ed
·
verified ·
1 Parent(s): 5efc8a5

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +115 -372
updated_word.py CHANGED
@@ -3,7 +3,7 @@ from docx import Document
3
  from docx.shared import RGBColor
4
  import re
5
 
6
- # Enhanced heading patterns (ADDITIVE - keeps your existing ones)
7
  HEADING_PATTERNS = {
8
  "main": [
9
  r"NHVAS\s+Audit\s+Summary\s+Report",
@@ -61,7 +61,7 @@ def get_value_as_string(value, field_name=""):
61
  return str(value)
62
 
63
  def find_matching_json_value(field_name, flat_json):
64
- """Enhanced dynamic matching without manual mappings"""
65
  field_name = field_name.strip()
66
 
67
  # Try exact match first
@@ -122,7 +122,7 @@ def find_matching_json_value(field_name, flat_json):
122
  best_match = value
123
  best_key = key
124
 
125
- if best_match and best_score >= 0.25: # Lowered threshold for better coverage
126
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
127
  return best_match
128
 
@@ -144,7 +144,7 @@ def has_red_text(cell):
144
  return False
145
 
146
  def extract_red_text_segments(cell):
147
- """Enhanced red text extraction with better multi-line handling"""
148
  red_segments = []
149
 
150
  for para_idx, paragraph in enumerate(cell.paragraphs):
@@ -178,7 +178,7 @@ def extract_red_text_segments(cell):
178
  return red_segments
179
 
180
  def replace_red_text_in_cell(cell, replacement_text):
181
- """Enhanced cell replacement with improved multi-line handling"""
182
  red_segments = extract_red_text_segments(cell)
183
 
184
  if not red_segments:
@@ -197,7 +197,7 @@ def replace_red_text_in_cell(cell, replacement_text):
197
  return replace_all_red_segments(red_segments, replacement_text)
198
 
199
  def replace_all_red_segments(red_segments, replacement_text):
200
- """Enhanced replacement with better line handling"""
201
  if not red_segments:
202
  return 0
203
 
@@ -244,55 +244,22 @@ def replace_all_red_segments(red_segments, replacement_text):
244
 
245
  return replacements_made
246
 
247
- def analyze_table_structure(table):
248
- """NEW: Dynamic table structure analysis"""
249
- structure = {
250
- 'type': 'unknown',
251
- 'orientation': 'unknown',
252
- 'has_headers': False,
253
- 'column_count': 0,
254
- 'row_count': 0,
255
- 'red_text_locations': []
256
- }
257
-
258
- if not table.rows:
259
- return structure
260
-
261
- structure['row_count'] = len(table.rows)
262
- structure['column_count'] = len(table.rows[0].cells) if table.rows else 0
263
-
264
- # Analyze first row for headers
265
- first_row_text = []
266
- for cell in table.rows[0].cells:
267
- cell_text = get_clean_text(cell).strip()
268
- first_row_text.append(cell_text)
269
-
270
- # Detect table type based on content patterns
271
- combined_text = " ".join(first_row_text).lower()
272
-
273
- if any(indicator in combined_text for indicator in ["registration", "vehicle", "maintenance", "mass"]):
274
- structure['type'] = 'vehicle_registration'
275
- elif any(indicator in combined_text for indicator in ["print name", "position", "auditor", "operator"]):
276
- structure['type'] = 'declaration'
277
- elif any(indicator in combined_text for indicator in ["std", "standard", "compliance"]):
278
- structure['type'] = 'compliance_matrix'
279
- elif len(table.rows[0].cells) == 2 and not any(indicator in combined_text for indicator in ["no.", "number"]):
280
- structure['type'] = 'key_value'
281
- else:
282
- structure['type'] = 'data_grid'
283
 
284
- # Find red text locations
285
- for row_idx, row in enumerate(table.rows):
286
- for cell_idx, cell in enumerate(row.cells):
287
- if has_red_text(cell):
288
- structure['red_text_locations'].append((row_idx, cell_idx))
289
 
290
- structure['has_headers'] = len(structure['red_text_locations']) > 0 and (0, 0) not in structure['red_text_locations']
 
291
 
292
- return structure
293
 
294
  def handle_multiple_red_segments_in_cell(cell, flat_json):
295
- """Enhanced multi-segment handling"""
296
  red_segments = extract_red_text_segments(cell)
297
 
298
  if not red_segments:
@@ -340,251 +307,110 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
340
 
341
  return replacements_made
342
 
343
- def replace_single_segment(segment, replacement_text):
344
- """Enhanced single segment replacement"""
345
- if not segment['runs']:
346
- return False
 
347
 
348
- first_run = segment['runs'][0][2]
349
- first_run.text = replacement_text
350
- first_run.font.color.rgb = RGBColor(0, 0, 0)
 
351
 
352
- for _, _, run in segment['runs'][1:]:
353
- run.text = ''
354
 
355
- return True
356
-
357
- def detect_table_type(table):
358
- """Enhanced table type detection"""
359
- structure = analyze_table_structure(table)
360
- return structure['type']
361
-
362
- def try_context_based_replacement(cell, row, table, flat_json):
363
- """Enhanced context-based replacement"""
364
  replacements_made = 0
365
 
366
- row_context = ""
367
- if len(row.cells) > 1:
368
- first_cell_text = get_clean_text(row.cells[0]).strip()
369
- if first_cell_text and not has_red_text(row.cells[0]):
370
- row_context = first_cell_text
371
-
372
- red_segments = extract_red_text_segments(cell)
373
  for segment in red_segments:
374
- red_text = segment['text'].strip()
375
- if not red_text:
376
  continue
377
-
378
- if row_context:
379
- context_queries = [
380
- f"{row_context} {red_text}",
381
- f"{row_context}",
382
- red_text
383
- ]
384
 
385
- for query in context_queries:
386
- json_value = find_matching_json_value(query, flat_json)
387
- if json_value is not None:
388
- replacement_text = get_value_as_string(json_value, query)
389
- success = replace_single_segment(segment, replacement_text)
390
- if success:
391
- replacements_made += 1
392
- print(f" βœ… Context-based replacement: '{query}' -> '{replacement_text[:30]}...'")
393
- break
394
-
395
- return replacements_made
396
-
397
- def smart_fallback_processor(element, flat_json):
398
- """NEW: Smart fallback for missed red text"""
399
- replacements_made = 0
400
-
401
- # Check if element has red text that wasn't processed
402
- if hasattr(element, 'paragraphs'):
403
- for paragraph in element.paragraphs:
404
- for run in paragraph.runs:
405
- if is_red(run) and run.text.strip():
406
- # Try advanced pattern matching
407
- red_text = run.text.strip()
408
-
409
- # Try semantic matching
410
- json_value = semantic_text_matching(red_text, flat_json)
411
- if json_value:
412
- replacement_text = get_value_as_string(json_value, red_text)
413
- run.text = replacement_text
414
- run.font.color.rgb = RGBColor(0, 0, 0)
415
- replacements_made += 1
416
- print(f" 🎯 Fallback match: '{red_text}' -> '{replacement_text[:30]}...'")
417
-
418
- return replacements_made
419
-
420
- def semantic_text_matching(text, flat_json):
421
- """NEW: Advanced semantic matching for edge cases"""
422
- text_lower = text.lower().strip()
423
-
424
- # Common semantic patterns
425
- semantic_patterns = {
426
- 'name': ['name', 'manager', 'operator', 'auditor', 'driver'],
427
- 'date': ['date', 'expiry', 'conducted', 'completed'],
428
- 'address': ['address', 'location', 'road', 'street'],
429
- 'number': ['number', 'registration', 'phone', 'telephone'],
430
- 'email': ['email', 'mail'],
431
- 'position': ['position', 'title', 'role']
432
- }
433
-
434
- # Find semantic category
435
- for category, keywords in semantic_patterns.items():
436
- if any(keyword in text_lower for keyword in keywords):
437
- # Look for JSON keys in this semantic category
438
- for key, value in flat_json.items():
439
- key_lower = key.lower()
440
- if any(keyword in key_lower for keyword in keywords):
441
- return value
442
-
443
- return None
444
-
445
- def handle_nature_of_business_section(table, flat_json):
446
- """TARGETED FIX for Issue 1: Nature of Business multi-line and sub-fields"""
447
- replacements_made = 0
448
 
449
- for row_idx, row in enumerate(table.rows):
450
- if len(row.cells) >= 1:
451
- cell = row.cells[0]
452
- cell_text = get_clean_text(cell).lower()
453
-
454
- # Check if this is the "Nature of the Operators Business" section
455
- if "nature of the operators business" in cell_text or "nature of the operator business" in cell_text:
456
- print(f" 🎯 Found Nature of Business section in row {row_idx + 1}")
457
-
458
- # Handle main business description (multi-line red text)
459
- if has_red_text(cell):
460
- # Try to find business description in JSON
461
- business_desc_keys = [
462
- "nature of the operators business",
463
- "business description",
464
- "operator business summary",
465
- "business summary"
466
- ]
467
-
468
- business_value = None
469
- for key in business_desc_keys:
470
- business_value = find_matching_json_value(key, flat_json)
471
- if business_value:
472
- break
473
-
474
- if business_value:
475
- business_text = get_value_as_string(business_value)
476
- cell_replacements = replace_red_text_in_cell(cell, business_text)
477
- replacements_made += cell_replacements
478
- print(f" βœ… Updated main business description")
479
-
480
- # Look for sub-fields in the next few rows
481
- for sub_row_idx in range(row_idx + 1, min(row_idx + 4, len(table.rows))):
482
- sub_row = table.rows[sub_row_idx]
483
- if len(sub_row.cells) >= 1:
484
- sub_cell = sub_row.cells[0]
485
- sub_text = get_clean_text(sub_cell).lower()
486
-
487
- # Handle Accreditation Number
488
- if "accreditation number" in sub_text and has_red_text(sub_cell):
489
- accred_value = find_matching_json_value("accreditation number", flat_json)
490
- if not accred_value:
491
- accred_value = find_matching_json_value("nhvas accreditation no", flat_json)
492
- if accred_value:
493
- accred_text = get_value_as_string(accred_value)
494
- cell_replacements = replace_red_text_in_cell(sub_cell, accred_text)
495
- replacements_made += cell_replacements
496
- print(f" βœ… Updated Accreditation Number: {accred_text}")
497
-
498
- # Handle Expiry Date
499
- elif "expiry date" in sub_text and has_red_text(sub_cell):
500
- expiry_value = find_matching_json_value("expiry date", flat_json)
501
- if not expiry_value:
502
- expiry_value = find_matching_json_value("accreditation expiry", flat_json)
503
- if expiry_value:
504
- expiry_text = get_value_as_string(expiry_value)
505
- cell_replacements = replace_red_text_in_cell(sub_cell, expiry_text)
506
- replacements_made += cell_replacements
507
- print(f" βœ… Updated Expiry Date: {expiry_text}")
508
-
509
- break # Found the section, no need to continue
510
 
511
  return replacements_made
512
 
513
-
514
- def handle_operator_declaration_table(table, flat_json):
515
- """TARGETED FIX for Issue 2: Operator Declaration Print Name and Position Title"""
516
  replacements_made = 0
517
 
 
518
  for row_idx, row in enumerate(table.rows):
519
  if len(row.cells) >= 2:
520
- cell1_text = get_clean_text(row.cells[0]).lower()
521
- cell2_text = get_clean_text(row.cells[1]).lower()
522
 
523
- # Check if this is the header row with "Print Name" and "Position Title"
524
- if "print name" in cell1_text and ("position title" in cell2_text or "position" in cell2_text):
525
- print(f" 🎯 Found Operator Declaration header row {row_idx + 1}")
 
 
526
 
527
- # Look for the data row (next row with red text)
528
  if row_idx + 1 < len(table.rows):
529
  data_row = table.rows[row_idx + 1]
530
  if len(data_row.cells) >= 2:
531
  name_cell = data_row.cells[0]
532
  position_cell = data_row.cells[1]
533
 
534
- # Handle Print Name (first column)
535
  if has_red_text(name_cell):
536
- # Try to find operator name
537
- name_keys = [
538
- "operator name",
539
- "print name",
540
- "legal entity",
541
- "operator"
542
- ]
543
-
544
- name_value = None
545
- for key in name_keys:
546
- name_value = find_matching_json_value(key, flat_json)
547
- if name_value:
548
- break
549
 
550
- if name_value:
551
- name_text = get_value_as_string(name_value)
552
- cell_replacements = replace_red_text_in_cell(name_cell, name_text)
553
- replacements_made += cell_replacements
554
- print(f" βœ… Updated Print Name: {name_text}")
 
 
555
 
556
- # Handle Position Title (second column)
557
  if has_red_text(position_cell):
558
- # Try to find position/title
559
- position_keys = [
560
- "position title",
561
- "position",
562
- "title",
563
- "job title",
564
- "role"
565
- ]
566
 
567
- position_value = None
568
- for key in position_keys:
569
- position_value = find_matching_json_value(key, flat_json)
570
- if position_value:
571
- break
572
-
573
- # If no specific position found, default to "Manager"
574
- if not position_value:
575
- position_value = "Manager"
576
-
577
- position_text = get_value_as_string(position_value)
578
- cell_replacements = replace_red_text_in_cell(position_cell, position_text)
579
- replacements_made += cell_replacements
580
- print(f" βœ… Updated Position Title: {position_text}")
581
 
582
- break # Found the section, no need to continue
583
 
584
  return replacements_made
585
 
586
  def handle_australian_company_number(row, company_numbers):
587
- """Enhanced ACN handling"""
588
  replacements_made = 0
589
  for i, digit in enumerate(company_numbers):
590
  cell_idx = i + 1
@@ -597,7 +423,7 @@ def handle_australian_company_number(row, company_numbers):
597
  return replacements_made
598
 
599
  def handle_vehicle_registration_table(table, flat_json):
600
- """Enhanced vehicle registration table handling"""
601
  replacements_made = 0
602
 
603
  # Try to find vehicle registration data
@@ -740,7 +566,7 @@ def handle_vehicle_registration_table(table, flat_json):
740
  return replacements_made
741
 
742
  def handle_print_accreditation_section(table, flat_json):
743
- """Enhanced print accreditation handling"""
744
  replacements_made = 0
745
 
746
  print_data = flat_json.get("print accreditation name.print accreditation name", [])
@@ -780,7 +606,7 @@ def handle_print_accreditation_section(table, flat_json):
780
  return replacements_made
781
 
782
  def process_single_column_sections(cell, field_name, flat_json):
783
- """Enhanced single column processing"""
784
  json_value = find_matching_json_value(field_name, flat_json)
785
  if json_value is not None:
786
  replacement_text = get_value_as_string(json_value, field_name)
@@ -796,17 +622,13 @@ def process_single_column_sections(cell, field_name, flat_json):
796
  return 0
797
 
798
  def process_tables(document, flat_json):
799
- """ENHANCED: Your existing function + smart enhancements"""
800
  replacements_made = 0
801
 
802
  for table_idx, table in enumerate(document.tables):
803
  print(f"\nπŸ” Processing table {table_idx + 1}:")
804
 
805
- # ENHANCED: Dynamic table analysis
806
- table_structure = analyze_table_structure(table)
807
- print(f" πŸ“Š Table structure: {table_structure['type']} ({table_structure['row_count']}x{table_structure['column_count']})")
808
-
809
- # Your existing logic with enhancements
810
  table_text = ""
811
  for row in table.rows[:3]:
812
  for cell in row.cells:
@@ -815,7 +637,7 @@ def process_tables(document, flat_json):
815
  # Enhanced vehicle registration detection
816
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
817
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
818
- if indicator_count >= 2 or table_structure['type'] == 'vehicle_registration': # Lowered threshold
819
  print(f" πŸš— Detected Vehicle Registration table")
820
  vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
821
  replacements_made += vehicle_replacements
@@ -824,28 +646,13 @@ def process_tables(document, flat_json):
824
  # Enhanced print accreditation detection
825
  print_accreditation_indicators = ["print name", "position title"]
826
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
827
- if indicator_count >= 1 or table_structure['type'] == 'declaration': # Lowered threshold
828
  print(f" πŸ“‹ Detected Print Accreditation table")
829
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
830
  replacements_made += print_accreditation_replacements
831
  continue
832
-
833
- # 🎯 NEW: TARGETED FIX 1 - Nature of Business section
834
- if "nature of the operator" in table_text:
835
- print(f" 🎯 Detected Nature of Business section")
836
- nature_replacements = handle_nature_of_business_section(table, flat_json)
837
- replacements_made += nature_replacements
838
- # Don't continue - let it fall through to regular processing too
839
-
840
- # 🎯 NEW: TARGETED FIX 2 - Operator Declaration table
841
- if "print name" in table_text and "position title" in table_text:
842
- print(f" 🎯 Detected Operator Declaration table")
843
- declaration_replacements = handle_operator_declaration_table(table, flat_json)
844
- replacements_made += declaration_replacements
845
- # Don't continue - let it fall through to regular processing too
846
-
847
 
848
- # Your existing row processing with enhancements
849
  for row_idx, row in enumerate(table.rows):
850
  if len(row.cells) < 1:
851
  continue
@@ -916,20 +723,22 @@ def process_tables(document, flat_json):
916
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
917
  replacements_made += cell_replacements
918
 
919
- # ENHANCED: Fallback for still unmatched red text
920
  if cell_replacements == 0:
921
- context_replacements = try_context_based_replacement(cell, row, table, flat_json)
922
- replacements_made += context_replacements
923
-
924
- # ENHANCED: Smart fallback processor
925
- if context_replacements == 0:
926
- fallback_replacements = smart_fallback_processor(cell, flat_json)
927
- replacements_made += fallback_replacements
 
 
928
 
929
  return replacements_made
930
 
931
  def process_paragraphs(document, flat_json):
932
- """ENHANCED: Your existing function + smart fallbacks"""
933
  replacements_made = 0
934
  print(f"\nπŸ” Processing paragraphs:")
935
 
@@ -949,9 +758,6 @@ def process_paragraphs(document, flat_json):
949
  json_value = find_matching_json_value("auditor signature", flat_json)
950
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
951
  json_value = find_matching_json_value("operator signature", flat_json)
952
- # ENHANCED: Try semantic matching
953
- elif json_value is None:
954
- json_value = semantic_text_matching(red_text_only, flat_json)
955
 
956
  if json_value is not None:
957
  replacement_text = get_value_as_string(json_value)
@@ -961,15 +767,11 @@ def process_paragraphs(document, flat_json):
961
  for run in red_runs[1:]:
962
  run.text = ''
963
  replacements_made += 1
964
- else:
965
- # ENHANCED: Try smart fallback
966
- fallback_replacements = smart_fallback_processor(paragraph, flat_json)
967
- replacements_made += fallback_replacements
968
 
969
  return replacements_made
970
 
971
  def process_headings(document, flat_json):
972
- """ENHANCED: Your existing function + comprehensive coverage"""
973
  replacements_made = 0
974
  print(f"\nπŸ” Processing headings:")
975
 
@@ -1035,23 +837,18 @@ def process_headings(document, flat_json):
1035
  flat_json
1036
  )
1037
  replacements_made += context_replacements
1038
-
1039
- # ENHANCED: Smart fallback if still no match
1040
- if context_replacements == 0:
1041
- fallback_replacements = smart_fallback_processor(next_paragraph, flat_json)
1042
- replacements_made += fallback_replacements
1043
 
1044
  return replacements_made
1045
 
1046
  def has_red_text_in_paragraph(paragraph):
1047
- """Enhanced paragraph red text detection"""
1048
  for run in paragraph.runs:
1049
  if is_red(run) and run.text.strip():
1050
  return True
1051
  return False
1052
 
1053
  def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1054
- """ENHANCED: Your existing function + smarter matching"""
1055
  replacements_made = 0
1056
 
1057
  red_text_segments = []
@@ -1102,12 +899,6 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1102
  print(f" βœ… Found match with combined query: '{query[:50]}...'")
1103
  break
1104
 
1105
- # ENHANCED: Strategy 4: Semantic matching
1106
- if json_value is None:
1107
- json_value = semantic_text_matching(combined_red_text, flat_json)
1108
- if json_value:
1109
- print(f" βœ… Found semantic match for: '{combined_red_text}'")
1110
-
1111
  # Replace if match found
1112
  if json_value is not None:
1113
  replacement_text = get_value_as_string(json_value, combined_red_text)
@@ -1127,51 +918,8 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1127
 
1128
  return replacements_made
1129
 
1130
- def comprehensive_document_scan(document, flat_json):
1131
- """NEW: Final comprehensive scan for any missed red text"""
1132
- print(f"\nπŸ” Comprehensive final scan for missed red text:")
1133
- replacements_made = 0
1134
-
1135
- # Scan all elements in document
1136
- for element in document.element.body:
1137
- # Check tables
1138
- if element.tag.endswith('tbl'):
1139
- table_obj = None
1140
- for table in document.tables:
1141
- if table._element == element:
1142
- table_obj = table
1143
- break
1144
-
1145
- if table_obj:
1146
- for row in table_obj.rows:
1147
- for cell in row.cells:
1148
- if has_red_text(cell):
1149
- # Try one more time with enhanced fallback
1150
- cell_replacements = smart_fallback_processor(cell, flat_json)
1151
- replacements_made += cell_replacements
1152
-
1153
- # Check paragraphs
1154
- elif element.tag.endswith('p'):
1155
- paragraph_obj = None
1156
- for para in document.paragraphs:
1157
- if para._element == element:
1158
- paragraph_obj = para
1159
- break
1160
-
1161
- if paragraph_obj and has_red_text_in_paragraph(paragraph_obj):
1162
- # Try enhanced fallback
1163
- para_replacements = smart_fallback_processor(paragraph_obj, flat_json)
1164
- replacements_made += para_replacements
1165
-
1166
- if replacements_made > 0:
1167
- print(f" βœ… Final scan caught {replacements_made} additional replacements!")
1168
- else:
1169
- print(f" βœ… No additional red text found - document fully processed!")
1170
-
1171
- return replacements_made
1172
-
1173
  def process_hf(json_file, docx_file, output_file):
1174
- """ENHANCED: Your existing main function + comprehensive processing"""
1175
  try:
1176
  # Load JSON
1177
  if hasattr(json_file, "read"):
@@ -1193,18 +941,14 @@ def process_hf(json_file, docx_file, output_file):
1193
  else:
1194
  doc = Document(docx_file)
1195
 
1196
- # ENHANCED: Multi-pass processing for 100% coverage
1197
- print("πŸš€ Starting enhanced multi-pass processing...")
1198
 
1199
- # Pass 1: Your existing processors (enhanced)
1200
  table_replacements = process_tables(doc, flat_json)
1201
  paragraph_replacements = process_paragraphs(doc, flat_json)
1202
  heading_replacements = process_headings(doc, flat_json)
1203
 
1204
- # Pass 2: NEW - Comprehensive final scan
1205
- final_scan_replacements = comprehensive_document_scan(doc, flat_json)
1206
-
1207
- total_replacements = table_replacements + paragraph_replacements + heading_replacements + final_scan_replacements
1208
 
1209
  # Save output
1210
  if hasattr(output_file, "write"):
@@ -1217,8 +961,7 @@ def process_hf(json_file, docx_file, output_file):
1217
  print(f" πŸ“Š Tables: {table_replacements}")
1218
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1219
  print(f" πŸ“‹ Headings: {heading_replacements}")
1220
- print(f" 🎯 Final scan: {final_scan_replacements}")
1221
- print(f"πŸŽ‰ Processing complete with enhanced coverage!")
1222
 
1223
  except FileNotFoundError as e:
1224
  print(f"❌ File not found: {e}")
@@ -1230,7 +973,7 @@ def process_hf(json_file, docx_file, output_file):
1230
  if __name__ == "__main__":
1231
  import sys
1232
  if len(sys.argv) != 4:
1233
- print("Usage: python enhanced_pipeline.py <input_docx> <updated_json> <output_docx>")
1234
  exit(1)
1235
  docx_path = sys.argv[1]
1236
  json_path = sys.argv[2]
 
3
  from docx.shared import RGBColor
4
  import re
5
 
6
+ # Your original heading patterns (unchanged)
7
  HEADING_PATTERNS = {
8
  "main": [
9
  r"NHVAS\s+Audit\s+Summary\s+Report",
 
61
  return str(value)
62
 
63
  def find_matching_json_value(field_name, flat_json):
64
+ """Your original matching function (unchanged)"""
65
  field_name = field_name.strip()
66
 
67
  # Try exact match first
 
122
  best_match = value
123
  best_key = key
124
 
125
+ if best_match and best_score >= 0.25:
126
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
127
  return best_match
128
 
 
144
  return False
145
 
146
  def extract_red_text_segments(cell):
147
+ """Your original red text extraction (unchanged)"""
148
  red_segments = []
149
 
150
  for para_idx, paragraph in enumerate(cell.paragraphs):
 
178
  return red_segments
179
 
180
  def replace_red_text_in_cell(cell, replacement_text):
181
+ """Your original replacement function (unchanged)"""
182
  red_segments = extract_red_text_segments(cell)
183
 
184
  if not red_segments:
 
197
  return replace_all_red_segments(red_segments, replacement_text)
198
 
199
  def replace_all_red_segments(red_segments, replacement_text):
200
+ """Your original function (unchanged)"""
201
  if not red_segments:
202
  return 0
203
 
 
244
 
245
  return replacements_made
246
 
247
def replace_single_segment(segment, replacement_text):
    """Write ``replacement_text`` into one red-text segment and recolor it black.

    Args:
        segment: Mapping with a ``'runs'`` sequence. Each entry is a 3-item
            sequence whose third item is the run object to edit; the first
            two items are not used here.
        replacement_text: Text assigned to the first run of the segment.

    Returns:
        ``False`` when the segment has no runs (nothing is modified),
        otherwise ``True``.
    """
    runs = segment['runs']
    if not runs:
        return False

    # The entire replacement goes into the first run, which is recolored
    # from red to black.
    first_run = runs[0][2]
    first_run.text = replacement_text
    first_run.font.color.rgb = RGBColor(0, 0, 0)

    # Remaining runs of the segment are blanked so the old text does not
    # survive next to the replacement.
    for _, _, run in runs[1:]:
        run.text = ''

    return True
260
 
261
  def handle_multiple_red_segments_in_cell(cell, flat_json):
262
+ """Your original function (unchanged)"""
263
  red_segments = extract_red_text_segments(cell)
264
 
265
  if not red_segments:
 
307
 
308
  return replacements_made
309
 
310
+ # 🎯 SURGICAL FIX 1: Handle Nature of Business multi-line red text
311
def handle_nature_business_multiline_fix(cell, flat_json):
    """Replace red text in a "Nature of the Operators Business" cell.

    The cell is processed only when it contains red text and its cleaned,
    lower-cased text includes "nature of the operators business" or
    "nature of the operator business". Each red segment is first looked up
    on its own; if none of them matched, the segments' texts are joined with
    spaces and looked up once as a whole.

    Args:
        cell: Table cell to inspect and update in place.
        flat_json: Flattened JSON mapping passed to
            ``find_matching_json_value``.

    Returns:
        Number of replacements made (0 when the cell does not qualify or
        nothing matched).
    """
    if not has_red_text(cell):
        return 0

    cell_text = get_clean_text(cell).lower()
    section_labels = (
        "nature of the operators business",
        "nature of the operator business",
    )
    if not any(label in cell_text for label in section_labels):
        return 0

    print(" 🎯 SURGICAL FIX: Nature of Business multi-line processing")

    red_segments = extract_red_text_segments(cell)
    replacements_made = 0

    # Pass 1: try to match every red segment individually.
    for segment in red_segments:
        segment_text = segment['text'].strip()
        if not segment_text:
            continue

        json_value = find_matching_json_value(segment_text, flat_json)
        if json_value is None:
            continue

        replacement_text = get_value_as_string(json_value, segment_text)
        if replace_single_segment(segment, replacement_text):
            replacements_made += 1
            print(f" ✅ Fixed segment: '{segment_text[:30]}...'")

    if replacements_made or not red_segments:
        return replacements_made

    # Pass 2: nothing matched on its own, so look up all red text as one query.
    combined_text = " ".join(seg['text'] for seg in red_segments).strip()
    json_value = find_matching_json_value(combined_text, flat_json)
    if json_value is not None:
        replacement_text = get_value_as_string(json_value, combined_text)
        replacements_made = replace_all_red_segments(red_segments, replacement_text)
        # Report success only when the replacement actually changed something.
        if replacements_made:
            print(" ✅ Fixed combined text")

    return replacements_made
351
 
352
+ # 🎯 SURGICAL FIX 2: Handle Operator Declaration table
353
def _fill_declaration_cell(cell, label, flat_json):
    """Replace the red text of one declaration data cell; return the replacement count."""
    if not has_red_text(cell):
        return 0

    # The concatenated red text of the cell is itself the lookup query.
    red_text = "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    ).strip()
    if not red_text:
        return 0

    json_value = find_matching_json_value(red_text, flat_json)
    if json_value is None:
        return 0

    replacement_text = get_value_as_string(json_value)
    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
    print(f" ✅ Fixed {label}: '{replacement_text}'")
    return cell_replacements


def handle_operator_declaration_fix(table, flat_json):
    """Fill the Print Name / Position Title row of an Operator Declaration table.

    A table qualifies only when it has at most 4 rows and contains a row
    whose first cell text includes "print name" and whose second cell text
    includes "position title" (case-insensitive). The row directly after
    that header row is treated as the data row; red text in its first two
    cells is replaced with the matching JSON values.

    Args:
        table: Table object exposing ``rows``, each row exposing ``cells``.
        flat_json: Flattened JSON mapping passed to
            ``find_matching_json_value``.

    Returns:
        Total number of replacements made in the data row, or 0 when the
        table does not qualify.
    """
    rows = table.rows

    # The size limit does not depend on the row being inspected, so larger
    # tables are rejected once, up front.
    if len(rows) > 4:
        return 0

    for row_idx, row in enumerate(rows):
        if len(row.cells) < 2:
            continue

        first_header = get_clean_text(row.cells[0]).strip().lower()
        second_header = get_clean_text(row.cells[1]).strip().lower()
        if "print name" not in first_header or "position title" not in second_header:
            continue

        print(" 🎯 SURGICAL FIX: Operator Declaration table detected")

        replacements_made = 0
        if row_idx + 1 < len(rows):
            data_row = rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                replacements_made += _fill_declaration_cell(
                    data_row.cells[0], "Print Name", flat_json
                )
                replacements_made += _fill_declaration_cell(
                    data_row.cells[1], "Position Title", flat_json
                )

        # Only the first matching header row is handled.
        return replacements_made

    return 0
411
 
412
  def handle_australian_company_number(row, company_numbers):
413
+ """Your original function (unchanged)"""
414
  replacements_made = 0
415
  for i, digit in enumerate(company_numbers):
416
  cell_idx = i + 1
 
423
  return replacements_made
424
 
425
  def handle_vehicle_registration_table(table, flat_json):
426
+ """Your original function (unchanged)"""
427
  replacements_made = 0
428
 
429
  # Try to find vehicle registration data
 
566
  return replacements_made
567
 
568
  def handle_print_accreditation_section(table, flat_json):
569
+ """Your original function (unchanged)"""
570
  replacements_made = 0
571
 
572
  print_data = flat_json.get("print accreditation name.print accreditation name", [])
 
606
  return replacements_made
607
 
608
  def process_single_column_sections(cell, field_name, flat_json):
609
+ """Your original function (unchanged)"""
610
  json_value = find_matching_json_value(field_name, flat_json)
611
  if json_value is not None:
612
  replacement_text = get_value_as_string(json_value, field_name)
 
622
  return 0
623
 
624
  def process_tables(document, flat_json):
625
+ """Your original function with minimal surgical fixes added"""
626
  replacements_made = 0
627
 
628
  for table_idx, table in enumerate(document.tables):
629
  print(f"\nπŸ” Processing table {table_idx + 1}:")
630
 
631
+ # Your original logic
 
 
 
 
632
  table_text = ""
633
  for row in table.rows[:3]:
634
  for cell in row.cells:
 
637
  # Enhanced vehicle registration detection
638
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
639
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
640
+ if indicator_count >= 2:
641
  print(f" 🚗 Detected Vehicle Registration table")
642
  vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
643
  replacements_made += vehicle_replacements
 
646
  # Enhanced print accreditation detection
647
  print_accreditation_indicators = ["print name", "position title"]
648
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
649
+ if indicator_count >= 1:
650
  print(f" 📋 Detected Print Accreditation table")
651
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
652
  replacements_made += print_accreditation_replacements
653
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
+ # Your existing row processing
656
  for row_idx, row in enumerate(table.rows):
657
  if len(row.cells) < 1:
658
  continue
 
723
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
724
  replacements_made += cell_replacements
725
 
726
+ # 🎯 SURGICAL FIX 1: Only if no replacements were made
727
  if cell_replacements == 0:
728
+ surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
729
+ replacements_made += surgical_fix
730
+
731
+ # 🎯 SURGICAL FIX 2: Handle Operator Declaration tables (only check last few tables)
732
+ print(f"\n🎯 SURGICAL FIX: Checking for Operator Declaration tables...")
733
+ for table in document.tables[-3:]: # Only check last 3 tables
734
+ if len(table.rows) <= 4: # Only small tables
735
+ declaration_fix = handle_operator_declaration_fix(table, flat_json)
736
+ replacements_made += declaration_fix
737
 
738
  return replacements_made
739
 
740
  def process_paragraphs(document, flat_json):
741
+ """Your original function (unchanged)"""
742
  replacements_made = 0
743
  print(f"\n🔍 Processing paragraphs:")
744
 
 
758
  json_value = find_matching_json_value("auditor signature", flat_json)
759
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
760
  json_value = find_matching_json_value("operator signature", flat_json)
 
 
 
761
 
762
  if json_value is not None:
763
  replacement_text = get_value_as_string(json_value)
 
767
  for run in red_runs[1:]:
768
  run.text = ''
769
  replacements_made += 1
 
 
 
 
770
 
771
  return replacements_made
772
 
773
  def process_headings(document, flat_json):
774
+ """Your original function (unchanged)"""
775
  replacements_made = 0
776
  print(f"\n🔍 Processing headings:")
777
 
 
837
  flat_json
838
  )
839
  replacements_made += context_replacements
 
 
 
 
 
840
 
841
  return replacements_made
842
 
843
  def has_red_text_in_paragraph(paragraph):
844
+ """Your original function (unchanged)"""
845
  for run in paragraph.runs:
846
  if is_red(run) and run.text.strip():
847
  return True
848
  return False
849
 
850
  def process_red_text_in_paragraph(paragraph, context_text, flat_json):
851
+ """Your original function (unchanged)"""
852
  replacements_made = 0
853
 
854
  red_text_segments = []
 
899
  print(f" ✅ Found match with combined query: '{query[:50]}...'")
900
  break
901
 
 
 
 
 
 
 
902
  # Replace if match found
903
  if json_value is not None:
904
  replacement_text = get_value_as_string(json_value, combined_red_text)
 
918
 
919
  return replacements_made
920
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
921
  def process_hf(json_file, docx_file, output_file):
922
+ """Your original main function (unchanged)"""
923
  try:
924
  # Load JSON
925
  if hasattr(json_file, "read"):
 
941
  else:
942
  doc = Document(docx_file)
943
 
944
+ # Your original processing
945
+ print("🚀 Starting processing with surgical fixes...")
946
 
 
947
  table_replacements = process_tables(doc, flat_json)
948
  paragraph_replacements = process_paragraphs(doc, flat_json)
949
  heading_replacements = process_headings(doc, flat_json)
950
 
951
+ total_replacements = table_replacements + paragraph_replacements + heading_replacements
 
 
 
952
 
953
  # Save output
954
  if hasattr(output_file, "write"):
 
961
  print(f" 📊 Tables: {table_replacements}")
962
  print(f" 📝 Paragraphs: {paragraph_replacements}")
963
  print(f" 📋 Headings: {heading_replacements}")
964
+ print(f"🎉 Processing complete!")
 
965
 
966
  except FileNotFoundError as e:
967
  print(f"❌ File not found: {e}")
 
973
  if __name__ == "__main__":
974
  import sys
975
  if len(sys.argv) != 4:
976
+ print("Usage: python pipeline.py <input_docx> <updated_json> <output_docx>")
977
  exit(1)
978
  docx_path = sys.argv[1]
979
  json_path = sys.argv[2]