Shami96 committed on
Commit
5efc8a5
·
verified ·
1 Parent(s): 7755a4a

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +156 -0
updated_word.py CHANGED
@@ -442,6 +442,147 @@ def semantic_text_matching(text, flat_json):
442
 
443
  return None
444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  def handle_australian_company_number(row, company_numbers):
446
  """Enhanced ACN handling"""
447
  replacements_made = 0
@@ -688,6 +829,21 @@ def process_tables(document, flat_json):
688
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
689
  replacements_made += print_accreditation_replacements
690
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
 
692
  # Your existing row processing with enhancements
693
  for row_idx, row in enumerate(table.rows):
 
442
 
443
  return None
444
 
445
def handle_nature_of_business_section(table, flat_json):
    """Fill red placeholder text in the "Nature of the Operators Business" section.

    Column 0 of ``table`` is scanned for the section heading. For the first
    row whose heading matches:

    * if the heading cell has red text, it is replaced with the first
      business-description value found in ``flat_json``;
    * column 0 of up to three following rows is checked for
      "Accreditation Number" and "Expiry Date" sub-fields, and any red text
      there is replaced with the matching value.

    Scanning stops after the first matching heading row.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells``
            (presumably a python-docx table -- confirm against the caller).
        flat_json: Lookup data handed unchanged to
            ``find_matching_json_value``.

    Returns:
        The accumulated total of the values returned by
        ``replace_red_text_in_cell``.
    """

    def first_match(keys):
        # Try each key in priority order; the first truthy lookup wins.
        for key in keys:
            value = find_matching_json_value(key, flat_json)
            if value:
                return value
        return None

    business_desc_keys = (
        "nature of the operators business",
        "business description",
        "operator business summary",
        "business summary",
    )
    # (text that identifies the sub-field, label for the log line, lookup keys)
    sub_fields = (
        (
            "accreditation number",
            "Accreditation Number",
            ("accreditation number", "nhvas accreditation no"),
        ),
        (
            "expiry date",
            "Expiry Date",
            ("expiry date", "accreditation expiry"),
        ),
    )

    replacements_made = 0
    rows = table.rows

    for row_idx, row in enumerate(rows):
        cells = row.cells
        if len(cells) < 1:
            continue

        cell = cells[0]
        cell_text = get_clean_text(cell).lower()

        # Drop apostrophes so "operator's business" (straight or curly quote)
        # is recognised too; the table-level dispatcher already accepts any
        # "nature of the operator..." table, so the heading test must not be
        # stricter than that for the possessive spelling.
        heading_text = cell_text.replace("'", "").replace("\u2019", "")
        if ("nature of the operators business" not in heading_text
                and "nature of the operator business" not in heading_text):
            continue

        print(f" 🎯 Found Nature of Business section in row {row_idx + 1}")

        # Main business description (multi-line red text in the heading cell).
        if has_red_text(cell):
            business_value = first_match(business_desc_keys)
            if business_value:
                business_text = get_value_as_string(business_value)
                replacements_made += replace_red_text_in_cell(cell, business_text)
                print(" ✅ Updated main business description")

        # Sub-fields live in the (at most) three rows after the heading.
        for sub_row_idx in range(row_idx + 1, min(row_idx + 4, len(rows))):
            sub_cells = rows[sub_row_idx].cells
            if len(sub_cells) < 1:
                continue

            sub_cell = sub_cells[0]
            sub_text = get_clean_text(sub_cell).lower()

            # Only the first sub-field whose marker appears in the cell is
            # considered, mirroring an if/elif chain.
            for marker, label, keys in sub_fields:
                if marker not in sub_text:
                    continue
                if has_red_text(sub_cell):
                    value = first_match(keys)
                    if value:
                        value_text = get_value_as_string(value)
                        replacements_made += replace_red_text_in_cell(sub_cell, value_text)
                        print(f" ✅ Updated {label}: {value_text}")
                break

        break  # Section handled; later rows are not inspected.

    return replacements_made
512
+
513
+
514
def handle_operator_declaration_table(table, flat_json, default_position="Manager"):
    """Fill the Print Name / Position Title cells of the Operator Declaration table.

    The table is scanned for a header row whose first cell contains
    "print name" and whose second cell contains "position". Only the row
    immediately after that header is treated as the data row; later rows are
    not searched. In the data row:

    * if the first cell has red text, it is replaced with the first operator
      name value found in ``flat_json``;
    * if the second cell has red text, it is replaced with the first position
      value found in ``flat_json``, falling back to ``default_position``.

    Scanning stops after the first matching header row.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells``
            (presumably a python-docx table -- confirm against the caller).
        flat_json: Lookup data handed unchanged to
            ``find_matching_json_value``.
        default_position: Value written to the position cell when no position
            key matches. Defaults to ``"Manager"`` (the previously hard-coded
            fallback). Pass ``None`` or ``""`` to leave the cell untouched
            instead of writing a made-up title.

    Returns:
        The accumulated total of the values returned by
        ``replace_red_text_in_cell``.
    """

    def first_match(keys):
        # Try each key in priority order; the first truthy lookup wins.
        for key in keys:
            value = find_matching_json_value(key, flat_json)
            if value:
                return value
        return None

    name_keys = (
        "operator name",
        "print name",
        "legal entity",
        "operator",
    )
    position_keys = (
        "position title",
        "position",
        "title",
        "job title",
        "role",
    )

    replacements_made = 0
    rows = table.rows

    for row_idx, row in enumerate(rows):
        cells = row.cells
        if len(cells) < 2:
            continue

        cell1_text = get_clean_text(cells[0]).lower()
        cell2_text = get_clean_text(cells[1]).lower()

        # "position" also covers "position title", so one substring test is
        # enough for the second column.
        if "print name" not in cell1_text or "position" not in cell2_text:
            continue

        print(f" 🎯 Found Operator Declaration header row {row_idx + 1}")

        # The data row is the row directly below the header, if there is one.
        if row_idx + 1 < len(rows):
            data_cells = rows[row_idx + 1].cells
            if len(data_cells) >= 2:
                name_cell = data_cells[0]
                position_cell = data_cells[1]

                # Print Name (first column).
                if has_red_text(name_cell):
                    name_value = first_match(name_keys)
                    if name_value:
                        name_text = get_value_as_string(name_value)
                        replacements_made += replace_red_text_in_cell(name_cell, name_text)
                        print(f" ✅ Updated Print Name: {name_text}")

                # Position Title (second column).
                if has_red_text(position_cell):
                    position_value = first_match(position_keys) or default_position
                    if position_value:
                        position_text = get_value_as_string(position_value)
                        replacements_made += replace_red_text_in_cell(position_cell, position_text)
                        print(f" ✅ Updated Position Title: {position_text}")

        break  # Header handled; later rows are not inspected.

    return replacements_made
585
+
586
  def handle_australian_company_number(row, company_numbers):
587
  """Enhanced ACN handling"""
588
  replacements_made = 0
 
829
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
830
  replacements_made += print_accreditation_replacements
831
  continue
832
+
833
+ # 🎯 NEW: TARGETED FIX 1 - Nature of Business section
834
+ if "nature of the operator" in table_text:
835
+ print(f" 🎯 Detected Nature of Business section")
836
+ nature_replacements = handle_nature_of_business_section(table, flat_json)
837
+ replacements_made += nature_replacements
838
+ # Don't continue - let it fall through to regular processing too
839
+
840
+ # 🎯 NEW: TARGETED FIX 2 - Operator Declaration table
841
+ if "print name" in table_text and "position title" in table_text:
842
+ print(f" 🎯 Detected Operator Declaration table")
843
+ declaration_replacements = handle_operator_declaration_table(table, flat_json)
844
+ replacements_made += declaration_replacements
845
+ # Don't continue - let it fall through to regular processing too
846
+
847
 
848
  # Your existing row processing with enhancements
849
  for row_idx, row in enumerate(table.rows):