Spaces:
Running
Running
Update updated_word.py
Browse files- updated_word.py +156 -0
updated_word.py
CHANGED
|
@@ -442,6 +442,147 @@ def semantic_text_matching(text, flat_json):
|
|
| 442 |
|
| 443 |
return None
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
def handle_australian_company_number(row, company_numbers):
|
| 446 |
"""Enhanced ACN handling"""
|
| 447 |
replacements_made = 0
|
|
@@ -688,6 +829,21 @@ def process_tables(document, flat_json):
|
|
| 688 |
print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
|
| 689 |
replacements_made += print_accreditation_replacements
|
| 690 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
|
| 692 |
# Your existing row processing with enhancements
|
| 693 |
for row_idx, row in enumerate(table.rows):
|
|
|
|
| 442 |
|
| 443 |
return None
|
| 444 |
|
| 445 |
+
def handle_nature_of_business_section(table, flat_json):
    """TARGETED FIX for Issue 1: Nature of Business multi-line and sub-fields.

    Scans the first cell of each row of ``table`` for the "Nature of the
    Operators Business" heading. For the first matching row it:

    1. replaces the red text of that cell with the business description
       looked up in ``flat_json`` (several candidate keys are tried in order);
    2. inspects up to three following rows for "Accreditation Number" and
       "Expiry Date" sub-fields and replaces their red text as well.

    Args:
        table: Table object exposing ``rows``; each row exposes ``cells``
            (looks like a python-docx table -- TODO confirm).
        flat_json: Lookup structure handed unchanged to
            ``find_matching_json_value``.

    Returns:
        Sum of the counts returned by ``replace_red_text_in_cell`` for every
        cell updated here (0 when the section is not found).
    """
    # Spellings of the heading that are accepted. The apostrophe forms are
    # included because the dispatcher in process_tables matches on the
    # shorter "nature of the operator", so tables using "Operator's" reach
    # this handler too.
    header_variants = (
        "nature of the operators business",
        "nature of the operator business",
        "nature of the operator's business",
        "nature of the operator\u2019s business",
    )
    # Candidate JSON keys for the main description, in priority order.
    business_desc_keys = [
        "nature of the operators business",
        "business description",
        "operator business summary",
        "business summary",
    ]

    replacements_made = 0
    total_rows = len(table.rows)

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 1:
            continue

        cell = row.cells[0]
        cell_text = get_clean_text(cell).lower()
        if not any(variant in cell_text for variant in header_variants):
            continue

        print(f" 🎯 Found Nature of Business section in row {row_idx + 1}")

        # Main business description (multi-line red text in the heading cell).
        if has_red_text(cell):
            business_value = None
            for key in business_desc_keys:
                business_value = find_matching_json_value(key, flat_json)
                if business_value:
                    break

            if business_value:
                business_text = get_value_as_string(business_value)
                replacements_made += replace_red_text_in_cell(cell, business_text)
                print(" ✅ Updated main business description")

        # Sub-fields live in the (at most) three rows after the heading row.
        for sub_row_idx in range(row_idx + 1, min(row_idx + 4, total_rows)):
            sub_row = table.rows[sub_row_idx]
            if len(sub_row.cells) < 1:
                continue

            sub_cell = sub_row.cells[0]
            sub_text = get_clean_text(sub_cell).lower()

            if "accreditation number" in sub_text and has_red_text(sub_cell):
                # Fall back to the alternative key only when the first lookup
                # yields nothing.
                accred_value = (
                    find_matching_json_value("accreditation number", flat_json)
                    or find_matching_json_value("nhvas accreditation no", flat_json)
                )
                if accred_value:
                    accred_text = get_value_as_string(accred_value)
                    replacements_made += replace_red_text_in_cell(sub_cell, accred_text)
                    print(f" ✅ Updated Accreditation Number: {accred_text}")

            elif "expiry date" in sub_text and has_red_text(sub_cell):
                expiry_value = (
                    find_matching_json_value("expiry date", flat_json)
                    or find_matching_json_value("accreditation expiry", flat_json)
                )
                if expiry_value:
                    expiry_text = get_value_as_string(expiry_value)
                    replacements_made += replace_red_text_in_cell(sub_cell, expiry_text)
                    print(f" ✅ Updated Expiry Date: {expiry_text}")

        break  # Found the section, no need to continue

    return replacements_made
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
def handle_operator_declaration_table(table, flat_json):
    """TARGETED FIX for Issue 2: Operator Declaration Print Name and Position Title.

    Finds the header row whose first cell contains "print name" and whose
    second cell contains "position", then fills the red text of the row
    directly below it: column 0 with the operator name and column 1 with the
    position title, both looked up in ``flat_json``.

    Args:
        table: Table object exposing ``rows``; each row exposes ``cells``
            (looks like a python-docx table -- TODO confirm).
        flat_json: Lookup structure handed unchanged to
            ``find_matching_json_value``.

    Returns:
        Sum of the counts returned by ``replace_red_text_in_cell`` for the
        cells updated here (0 when no header row is found).
    """
    # Candidate JSON keys, tried in order until one yields a truthy value.
    name_keys = [
        "operator name",
        "print name",
        "legal entity",
        "operator",
    ]
    position_keys = [
        "position title",
        "position",
        "title",
        "job title",
        "role",
    ]

    replacements_made = 0
    total_rows = len(table.rows)

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 2:
            continue

        cell1_text = get_clean_text(row.cells[0]).lower()
        cell2_text = get_clean_text(row.cells[1]).lower()

        # "position" also covers "position title", so one substring test is
        # enough for the second column.
        if not ("print name" in cell1_text and "position" in cell2_text):
            continue

        print(f" 🎯 Found Operator Declaration header row {row_idx + 1}")

        # The data row is the row immediately after the header, if any.
        if row_idx + 1 < total_rows:
            data_row = table.rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                name_cell = data_row.cells[0]
                position_cell = data_row.cells[1]

                # Print Name (first column)
                if has_red_text(name_cell):
                    name_value = None
                    for key in name_keys:
                        name_value = find_matching_json_value(key, flat_json)
                        if name_value:
                            break

                    if name_value:
                        name_text = get_value_as_string(name_value)
                        replacements_made += replace_red_text_in_cell(name_cell, name_text)
                        print(f" ✅ Updated Print Name: {name_text}")

                # Position Title (second column)
                if has_red_text(position_cell):
                    position_value = None
                    for key in position_keys:
                        position_value = find_matching_json_value(key, flat_json)
                        if position_value:
                            break

                    # NOTE(review): when nothing matches, the literal
                    # "Manager" is written into the document -- confirm this
                    # default is wanted rather than leaving the cell as is.
                    if not position_value:
                        position_value = "Manager"

                    position_text = get_value_as_string(position_value)
                    replacements_made += replace_red_text_in_cell(position_cell, position_text)
                    print(f" ✅ Updated Position Title: {position_text}")

        break  # Found the section, no need to continue

    return replacements_made
|
| 585 |
+
|
| 586 |
def handle_australian_company_number(row, company_numbers):
|
| 587 |
"""Enhanced ACN handling"""
|
| 588 |
replacements_made = 0
|
|
|
|
| 829 |
print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
|
| 830 |
replacements_made += print_accreditation_replacements
|
| 831 |
continue
|
| 832 |
+
|
| 833 |
+
# π― NEW: TARGETED FIX 1 - Nature of Business section
|
| 834 |
+
if "nature of the operator" in table_text:
|
| 835 |
+
print(f" π― Detected Nature of Business section")
|
| 836 |
+
nature_replacements = handle_nature_of_business_section(table, flat_json)
|
| 837 |
+
replacements_made += nature_replacements
|
| 838 |
+
# Don't continue - let it fall through to regular processing too
|
| 839 |
+
|
| 840 |
+
# π― NEW: TARGETED FIX 2 - Operator Declaration table
|
| 841 |
+
if "print name" in table_text and "position title" in table_text:
|
| 842 |
+
print(f" π― Detected Operator Declaration table")
|
| 843 |
+
declaration_replacements = handle_operator_declaration_table(table, flat_json)
|
| 844 |
+
replacements_made += declaration_replacements
|
| 845 |
+
# Don't continue - let it fall through to regular processing too
|
| 846 |
+
|
| 847 |
|
| 848 |
# Your existing row processing with enhancements
|
| 849 |
for row_idx, row in enumerate(table.rows):
|