Spaces:
Running
Running
Update updated_word.py
Browse files- updated_word.py +136 -32
updated_word.py
CHANGED
|
@@ -570,13 +570,16 @@ def handle_attendance_list_table_enhanced(table, flat_json):
|
|
| 570 |
return replacements_made
|
| 571 |
|
| 572 |
def fix_management_summary_details_column(table, flat_json):
|
| 573 |
-
"""
|
| 574 |
replacements_made = 0
|
| 575 |
print(f" π― FIX: Management Summary DETAILS column processing")
|
|
|
|
|
|
|
| 576 |
table_text = ""
|
| 577 |
for row in table.rows[:3]:
|
| 578 |
for cell in row.cells:
|
| 579 |
table_text += get_clean_text(cell).lower() + " "
|
|
|
|
| 580 |
mgmt_types = []
|
| 581 |
if "mass management" in table_text or "mass" in table_text:
|
| 582 |
mgmt_types.append("Mass Management Summary")
|
|
@@ -584,65 +587,125 @@ def fix_management_summary_details_column(table, flat_json):
|
|
| 584 |
mgmt_types.append("Maintenance Management Summary")
|
| 585 |
if "fatigue management" in table_text or "fatigue" in table_text:
|
| 586 |
mgmt_types.append("Fatigue Management Summary")
|
|
|
|
|
|
|
| 587 |
if not mgmt_types:
|
| 588 |
if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
|
| 589 |
mgmt_types.append("Mass Management Summary")
|
|
|
|
| 590 |
if not mgmt_types:
|
|
|
|
| 591 |
return 0
|
|
|
|
| 592 |
for mgmt_type in mgmt_types:
|
| 593 |
print(f" β
Confirmed {mgmt_type} table processing")
|
| 594 |
-
|
| 595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
for key in flat_json.keys():
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
| 599 |
break
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
print(f" β οΈ No JSON management dict found for {mgmt_type}, skipping this type")
|
| 602 |
continue
|
|
|
|
|
|
|
| 603 |
for row_idx, row in enumerate(table.rows):
|
| 604 |
if len(row.cells) >= 2:
|
| 605 |
standard_cell = row.cells[0]
|
| 606 |
details_cell = row.cells[1]
|
| 607 |
standard_text = get_clean_text(standard_cell).strip().lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
if "std 5" in standard_text or "verification" in standard_text:
|
| 609 |
if has_red_text(details_cell):
|
| 610 |
-
std_val =
|
| 611 |
-
|
| 612 |
-
std_val = mgmt_data.get(candidate)
|
| 613 |
-
if std_val is not None:
|
| 614 |
-
break
|
| 615 |
-
if std_val is None:
|
| 616 |
-
for k, v in mgmt_data.items():
|
| 617 |
-
if 'std 5' in k.lower() or 'verification' in k.lower():
|
| 618 |
-
std_val = v
|
| 619 |
-
break
|
| 620 |
-
if std_val is not None:
|
| 621 |
replacement_text = get_value_as_string(std_val, "Std 5. Verification")
|
| 622 |
cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
|
| 623 |
replacements_made += cell_replacements
|
| 624 |
if cell_replacements:
|
| 625 |
print(f" β
Replaced Std 5. Verification details for {mgmt_type}")
|
| 626 |
-
|
|
|
|
| 627 |
if has_red_text(details_cell):
|
| 628 |
-
std_val =
|
| 629 |
-
|
| 630 |
-
std_val = mgmt_data.get(candidate)
|
| 631 |
-
if std_val is not None:
|
| 632 |
-
break
|
| 633 |
-
if std_val is None:
|
| 634 |
-
for k, v in mgmt_data.items():
|
| 635 |
-
if 'std 6' in k.lower() or 'internal review' in k.lower():
|
| 636 |
-
std_val = v
|
| 637 |
-
break
|
| 638 |
-
if std_val is not None:
|
| 639 |
replacement_text = get_value_as_string(std_val, "Std 6. Internal Review")
|
| 640 |
cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
|
| 641 |
replacements_made += cell_replacements
|
| 642 |
if cell_replacements:
|
| 643 |
print(f" β
Replaced Std 6. Internal Review details for {mgmt_type}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
return replacements_made
|
| 645 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 646 |
# ============================================================================
|
| 647 |
# Canonical operator declaration fixer — SAFER
|
| 648 |
# ============================================================================
|
|
@@ -1282,13 +1345,40 @@ def process_paragraphs(document, flat_json):
|
|
| 1282 |
return replacements_made
|
| 1283 |
|
| 1284 |
def process_headings(document, flat_json):
|
|
|
|
|
|
|
|
|
|
| 1285 |
replacements_made = 0
|
| 1286 |
print(f"\nπ Processing headings:")
|
| 1287 |
paragraphs = document.paragraphs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1288 |
for para_idx, paragraph in enumerate(paragraphs):
|
| 1289 |
paragraph_text = paragraph.text.strip()
|
| 1290 |
if not paragraph_text:
|
| 1291 |
continue
|
|
|
|
| 1292 |
matched_heading = None
|
| 1293 |
for category, patterns in HEADING_PATTERNS.items():
|
| 1294 |
for pattern in patterns:
|
|
@@ -1297,20 +1387,29 @@ def process_headings(document, flat_json):
|
|
| 1297 |
break
|
| 1298 |
if matched_heading:
|
| 1299 |
break
|
|
|
|
| 1300 |
if matched_heading:
|
| 1301 |
print(f" π Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
|
|
|
|
|
|
|
| 1302 |
if has_red_text_in_paragraph(paragraph):
|
| 1303 |
print(f" π΄ Found red text in heading itself")
|
| 1304 |
-
heading_replacements =
|
| 1305 |
replacements_made += heading_replacements
|
|
|
|
|
|
|
| 1306 |
for next_para_offset in range(1, 6):
|
| 1307 |
next_para_idx = para_idx + next_para_offset
|
| 1308 |
if next_para_idx >= len(paragraphs):
|
| 1309 |
break
|
|
|
|
| 1310 |
next_paragraph = paragraphs[next_para_idx]
|
| 1311 |
next_text = next_paragraph.text.strip()
|
|
|
|
| 1312 |
if not next_text:
|
| 1313 |
continue
|
|
|
|
|
|
|
| 1314 |
is_another_heading = False
|
| 1315 |
for category, patterns in HEADING_PATTERNS.items():
|
| 1316 |
for pattern in patterns:
|
|
@@ -1319,18 +1418,23 @@ def process_headings(document, flat_json):
|
|
| 1319 |
break
|
| 1320 |
if is_another_heading:
|
| 1321 |
break
|
|
|
|
| 1322 |
if is_another_heading:
|
| 1323 |
break
|
|
|
|
| 1324 |
if has_red_text_in_paragraph(next_paragraph):
|
| 1325 |
print(f" π΄ Found red text in paragraph {next_para_idx + 1} after heading")
|
| 1326 |
-
context_replacements =
|
| 1327 |
next_paragraph,
|
| 1328 |
paragraph_text,
|
| 1329 |
-
flat_json
|
|
|
|
| 1330 |
)
|
| 1331 |
replacements_made += context_replacements
|
|
|
|
| 1332 |
return replacements_made
|
| 1333 |
|
|
|
|
| 1334 |
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
| 1335 |
replacements_made = 0
|
| 1336 |
red_text_segments = []
|
|
|
|
| 570 |
return replacements_made
|
| 571 |
|
| 572 |
def fix_management_summary_details_column(table, flat_json):
|
| 573 |
+
"""Enhanced management summary processing with better data matching"""
|
| 574 |
replacements_made = 0
|
| 575 |
print(f" π― FIX: Management Summary DETAILS column processing")
|
| 576 |
+
|
| 577 |
+
# Determine which type of management summary this is
|
| 578 |
table_text = ""
|
| 579 |
for row in table.rows[:3]:
|
| 580 |
for cell in row.cells:
|
| 581 |
table_text += get_clean_text(cell).lower() + " "
|
| 582 |
+
|
| 583 |
mgmt_types = []
|
| 584 |
if "mass management" in table_text or "mass" in table_text:
|
| 585 |
mgmt_types.append("Mass Management Summary")
|
|
|
|
| 587 |
mgmt_types.append("Maintenance Management Summary")
|
| 588 |
if "fatigue management" in table_text or "fatigue" in table_text:
|
| 589 |
mgmt_types.append("Fatigue Management Summary")
|
| 590 |
+
|
| 591 |
+
# Fallback detection
|
| 592 |
if not mgmt_types:
|
| 593 |
if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
|
| 594 |
mgmt_types.append("Mass Management Summary")
|
| 595 |
+
|
| 596 |
if not mgmt_types:
|
| 597 |
+
print(f" β οΈ Could not determine management summary type")
|
| 598 |
return 0
|
| 599 |
+
|
| 600 |
for mgmt_type in mgmt_types:
|
| 601 |
print(f" β
Confirmed {mgmt_type} table processing")
|
| 602 |
+
|
| 603 |
+
# Look for management data in the JSON
|
| 604 |
+
mgmt_data = None
|
| 605 |
+
|
| 606 |
+
# Try direct key match first
|
| 607 |
+
if mgmt_type in flat_json:
|
| 608 |
+
mgmt_data = flat_json[mgmt_type]
|
| 609 |
+
|
| 610 |
+
# Try variations of the key
|
| 611 |
+
if not mgmt_data:
|
| 612 |
for key in flat_json.keys():
|
| 613 |
+
key_lower = key.lower()
|
| 614 |
+
mgmt_lower = mgmt_type.lower()
|
| 615 |
+
if mgmt_lower in key_lower or key_lower in mgmt_lower:
|
| 616 |
+
mgmt_data = flat_json[key]
|
| 617 |
+
print(f" β
Found data using key variation: '{key}'")
|
| 618 |
break
|
| 619 |
+
|
| 620 |
+
# If still no data, look for individual standard data
|
| 621 |
+
if not mgmt_data:
|
| 622 |
+
# Collect individual standard entries
|
| 623 |
+
mgmt_data = {}
|
| 624 |
+
for key, value in flat_json.items():
|
| 625 |
+
key_lower = key.lower()
|
| 626 |
+
# Look for standard entries related to this management type
|
| 627 |
+
if ("std " in key_lower and
|
| 628 |
+
(("mass" in mgmt_type.lower() and any(term in key_lower for term in ["verification", "internal review"])) or
|
| 629 |
+
("maintenance" in mgmt_type.lower() and any(term in key_lower for term in ["daily check", "internal review"])) or
|
| 630 |
+
("fatigue" in mgmt_type.lower() and any(term in key_lower for term in ["internal review"])))):
|
| 631 |
+
mgmt_data[key] = value
|
| 632 |
+
|
| 633 |
+
if mgmt_data:
|
| 634 |
+
print(f" β
Collected individual standard data: {list(mgmt_data.keys())}")
|
| 635 |
+
|
| 636 |
+
if not mgmt_data or not isinstance(mgmt_data, dict):
|
| 637 |
print(f" β οΈ No JSON management dict found for {mgmt_type}, skipping this type")
|
| 638 |
continue
|
| 639 |
+
|
| 640 |
+
# Process the table rows
|
| 641 |
for row_idx, row in enumerate(table.rows):
|
| 642 |
if len(row.cells) >= 2:
|
| 643 |
standard_cell = row.cells[0]
|
| 644 |
details_cell = row.cells[1]
|
| 645 |
standard_text = get_clean_text(standard_cell).strip().lower()
|
| 646 |
+
|
| 647 |
+
# Skip header rows
|
| 648 |
+
if "standard" in standard_text or "requirement" in standard_text or "details" in standard_text:
|
| 649 |
+
continue
|
| 650 |
+
|
| 651 |
+
# Look for specific standards
|
| 652 |
if "std 5" in standard_text or "verification" in standard_text:
|
| 653 |
if has_red_text(details_cell):
|
| 654 |
+
std_val = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"])
|
| 655 |
+
if std_val:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
replacement_text = get_value_as_string(std_val, "Std 5. Verification")
|
| 657 |
cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
|
| 658 |
replacements_made += cell_replacements
|
| 659 |
if cell_replacements:
|
| 660 |
print(f" β
Replaced Std 5. Verification details for {mgmt_type}")
|
| 661 |
+
|
| 662 |
+
elif "std 6" in standard_text or "internal review" in standard_text:
|
| 663 |
if has_red_text(details_cell):
|
| 664 |
+
std_val = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"])
|
| 665 |
+
if std_val:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
replacement_text = get_value_as_string(std_val, "Std 6. Internal Review")
|
| 667 |
cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
|
| 668 |
replacements_made += cell_replacements
|
| 669 |
if cell_replacements:
|
| 670 |
print(f" β
Replaced Std 6. Internal Review details for {mgmt_type}")
|
| 671 |
+
|
| 672 |
+
elif "std 1" in standard_text or "daily check" in standard_text:
|
| 673 |
+
if has_red_text(details_cell):
|
| 674 |
+
std_val = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Std 1 Daily Check", "Std 1", "Daily Check"])
|
| 675 |
+
if std_val:
|
| 676 |
+
replacement_text = get_value_as_string(std_val, "Std 1. Daily Check")
|
| 677 |
+
cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
|
| 678 |
+
replacements_made += cell_replacements
|
| 679 |
+
if cell_replacements:
|
| 680 |
+
print(f" β
Replaced Std 1. Daily Check details for {mgmt_type}")
|
| 681 |
+
|
| 682 |
+
elif "std 7" in standard_text:
|
| 683 |
+
if has_red_text(details_cell):
|
| 684 |
+
std_val = find_best_standard_value(mgmt_data, ["Std 7. Internal Review", "Std 7 Internal Review", "Std 7"])
|
| 685 |
+
if std_val:
|
| 686 |
+
replacement_text = get_value_as_string(std_val, "Std 7. Internal Review")
|
| 687 |
+
cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
|
| 688 |
+
replacements_made += cell_replacements
|
| 689 |
+
if cell_replacements:
|
| 690 |
+
print(f" β
Replaced Std 7. Internal Review details for {mgmt_type}")
|
| 691 |
+
|
| 692 |
return replacements_made
|
| 693 |
|
| 694 |
+
|
| 695 |
+
def find_best_standard_value(mgmt_data, candidate_keys):
    """Return the value in ``mgmt_data`` that best matches ``candidate_keys``.

    The lookup runs in two passes:

    1. Exact match: the first candidate that is literally a key of
       ``mgmt_data`` wins.
    2. Fuzzy match: case-insensitive substring comparison, in either
       direction, between a candidate and a key.

    Both passes honour the order of ``candidate_keys``, so a specific
    candidate listed first (e.g. "Std 6. Internal Review") is preferred over
    a generic one listed later (e.g. "Internal Review"), regardless of the
    order in which keys are stored in ``mgmt_data``.

    Args:
        mgmt_data: Mapping of standard names to their values.
        candidate_keys: Candidate names, ordered from most to least preferred.

    Returns:
        The matched value, or ``None`` when nothing matches (including when
        ``mgmt_data`` is empty or ``None``).
    """
    if not mgmt_data:
        return None

    # Pass 1: exact key match, in candidate priority order.
    for candidate in candidate_keys:
        if candidate in mgmt_data:
            return mgmt_data[candidate]

    # Pass 2: fuzzy matching. Lowercase every usable key once up front.
    # Non-string keys cannot be compared as text, and blank keys are skipped
    # because an empty string is a substring of every candidate and would
    # otherwise act as a wildcard.
    searchable = [
        (key.lower(), value)
        for key, value in mgmt_data.items()
        if isinstance(key, str) and key.strip()
    ]

    for candidate in candidate_keys:
        # An empty candidate would likewise match every key.
        if not isinstance(candidate, str) or not candidate.strip():
            continue
        wanted = candidate.lower()
        for key_lower, value in searchable:
            if wanted in key_lower or key_lower in wanted:
                return value

    return None
|
| 708 |
+
|
| 709 |
# ============================================================================
|
| 710 |
# Canonical operator declaration fixer — SAFER
|
| 711 |
# ============================================================================
|
|
|
|
| 1345 |
return replacements_made
|
| 1346 |
|
| 1347 |
def process_headings(document, flat_json):
|
| 1348 |
+
"""
|
| 1349 |
+
IMPROVED: Better heading processing that avoids mixing company data
|
| 1350 |
+
"""
|
| 1351 |
replacements_made = 0
|
| 1352 |
print(f"\nπ Processing headings:")
|
| 1353 |
paragraphs = document.paragraphs
|
| 1354 |
+
|
| 1355 |
+
# Extract the correct operator name from the JSON data
|
| 1356 |
+
operator_name = None
|
| 1357 |
+
for key, value in flat_json.items():
|
| 1358 |
+
if "operator name" in key.lower() and "legal entity" in key.lower():
|
| 1359 |
+
if isinstance(value, list) and value:
|
| 1360 |
+
operator_name = str(value[0]).strip()
|
| 1361 |
+
else:
|
| 1362 |
+
operator_name = str(value).strip()
|
| 1363 |
+
break
|
| 1364 |
+
|
| 1365 |
+
if not operator_name:
|
| 1366 |
+
# Fallback - try other operator name keys
|
| 1367 |
+
for key, value in flat_json.items():
|
| 1368 |
+
if ("operator" in key.lower() and "name" in key.lower()) or key.lower() == "operator name":
|
| 1369 |
+
if isinstance(value, list) and value:
|
| 1370 |
+
operator_name = str(value[0]).strip()
|
| 1371 |
+
elif value:
|
| 1372 |
+
operator_name = str(value).strip()
|
| 1373 |
+
break
|
| 1374 |
+
|
| 1375 |
+
print(f" π Using operator name: '{operator_name}'")
|
| 1376 |
+
|
| 1377 |
for para_idx, paragraph in enumerate(paragraphs):
|
| 1378 |
paragraph_text = paragraph.text.strip()
|
| 1379 |
if not paragraph_text:
|
| 1380 |
continue
|
| 1381 |
+
|
| 1382 |
matched_heading = None
|
| 1383 |
for category, patterns in HEADING_PATTERNS.items():
|
| 1384 |
for pattern in patterns:
|
|
|
|
| 1387 |
break
|
| 1388 |
if matched_heading:
|
| 1389 |
break
|
| 1390 |
+
|
| 1391 |
if matched_heading:
|
| 1392 |
print(f" π Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
|
| 1393 |
+
|
| 1394 |
+
# Check if the heading itself has red text
|
| 1395 |
if has_red_text_in_paragraph(paragraph):
|
| 1396 |
print(f" π΄ Found red text in heading itself")
|
| 1397 |
+
heading_replacements = process_red_text_in_heading_paragraph(paragraph, paragraph_text, flat_json, operator_name)
|
| 1398 |
replacements_made += heading_replacements
|
| 1399 |
+
|
| 1400 |
+
# Look for red text in paragraphs immediately following this heading
|
| 1401 |
for next_para_offset in range(1, 6):
|
| 1402 |
next_para_idx = para_idx + next_para_offset
|
| 1403 |
if next_para_idx >= len(paragraphs):
|
| 1404 |
break
|
| 1405 |
+
|
| 1406 |
next_paragraph = paragraphs[next_para_idx]
|
| 1407 |
next_text = next_paragraph.text.strip()
|
| 1408 |
+
|
| 1409 |
if not next_text:
|
| 1410 |
continue
|
| 1411 |
+
|
| 1412 |
+
# Stop if we hit another heading
|
| 1413 |
is_another_heading = False
|
| 1414 |
for category, patterns in HEADING_PATTERNS.items():
|
| 1415 |
for pattern in patterns:
|
|
|
|
| 1418 |
break
|
| 1419 |
if is_another_heading:
|
| 1420 |
break
|
| 1421 |
+
|
| 1422 |
if is_another_heading:
|
| 1423 |
break
|
| 1424 |
+
|
| 1425 |
if has_red_text_in_paragraph(next_paragraph):
|
| 1426 |
print(f" π΄ Found red text in paragraph {next_para_idx + 1} after heading")
|
| 1427 |
+
context_replacements = process_red_text_in_context_paragraph(
|
| 1428 |
next_paragraph,
|
| 1429 |
paragraph_text,
|
| 1430 |
+
flat_json,
|
| 1431 |
+
operator_name
|
| 1432 |
)
|
| 1433 |
replacements_made += context_replacements
|
| 1434 |
+
|
| 1435 |
return replacements_made
|
| 1436 |
|
| 1437 |
+
|
| 1438 |
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
|
| 1439 |
replacements_made = 0
|
| 1440 |
red_text_segments = []
|