Spaces:
Running
Running
Update updated_word.py
Browse files- updated_word.py +139 -25
updated_word.py
CHANGED
|
@@ -428,45 +428,159 @@ def fill_mass_vehicle_table_preserve_headers(table: Table, arrays: Dict[str, Lis
|
|
| 428 |
|
| 429 |
# Modified function for Management Summary tables only
|
| 430 |
def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
|
| 431 |
-
"""
|
| 432 |
-
|
| 433 |
-
|
|
|
|
| 434 |
desired: Dict[str, str] = { _std_key(k): join_value(v) for k, v in section_dict.items() }
|
|
|
|
|
|
|
| 435 |
|
| 436 |
-
|
| 437 |
-
wanted_prefix = canon_label(section_name.split()[0]) # "maintenance" | "mass" | "fatigue"
|
| 438 |
-
|
| 439 |
updated = 0
|
| 440 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
cols = _looks_like_summary_table(t)
|
| 442 |
if not cols:
|
| 443 |
continue
|
| 444 |
label_col, details_col = cols
|
| 445 |
-
|
| 446 |
head_txt = table_header_text(t, up_to_rows=2)
|
| 447 |
-
if wanted_prefix not in head_txt:
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
for i in range(1, len(t.rows)):
|
| 452 |
-
row = t.rows[i]
|
| 453 |
-
key = _std_key(cell_text(row.cells[label_col]))
|
| 454 |
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
if m:
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
|
|
|
| 463 |
break
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
continue
|
| 466 |
|
| 467 |
-
#
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
updated += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
return updated
|
| 471 |
|
| 472 |
# Matches the run of whitespace that immediately follows a '.', '?' or '!'
# (the punctuation itself is only looked behind at, not consumed).
SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")
|
|
|
|
| 428 |
|
| 429 |
# Modified function for Management Summary tables only
|
| 430 |
def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
    """
    Overwrite the DETAILS cells of Summary tables with the values of *section_dict*.

    The work is done in two passes:

    1. Every table accepted by ``_looks_like_summary_table`` is walked row by
       row, starting after the rows reported by ``count_header_rows``. The
       label cell is matched against the wanted keys by, in order: "std N"
       number, exact normalized label, "std N" prefix, and finally substring
       containment. On a match the row's details cell is rewritten.
    2. For each key not matched in pass 1, every cell of every table is tried
       as a possible label. Candidates are scored (length of the neighbouring
       details text, a large bonus for sentence-like text, a bonus for tables
       whose header contains the section prefix, a large penalty for tiny
       marker-only details such as 'V' / 'NC'); the best candidate is written
       only if its score is positive.

    A cell is never treated as its own details cell, so label text is not
    overwritten when a row has no separate details column.

    Args:
        doc: Document whose ``tables`` are scanned and modified in place.
        section_name: Section title; its first whitespace-separated word
            (normalized with ``canon_label``) is the header prefix used for
            the scoring bonus. An empty name disables that bonus.
        section_dict: Mapping of label -> list of text fragments. Keys are
            normalized with ``_std_key``; values are joined with ``join_value``.

    Returns:
        Number of cell writes performed (a key written into several rows is
        counted once per row).
    """
    desired: Dict[str, str] = {_std_key(k): join_value(v) for k, v in section_dict.items()}
    desired_orig: Dict[str, str] = {_std_key(k): canon_label(k) for k in section_dict}

    # An empty / whitespace-only section name has no first word; fall back to
    # "" instead of raising IndexError.
    name_parts = section_name.split()
    wanted_prefix = canon_label(name_parts[0]) if name_parts else ""

    std_number_pat = re.compile(r"\bstd[\s\.]*?(\d{1,2})\b", flags=re.I)
    std_prefix_pat = re.compile(r"(std\s+\d+)")
    # Up to two non-word characters followed by up to two word characters:
    # covers '', 'V', 'NC', '-', '- V' ... i.e. cells holding only a tiny marker.
    marker_pat = re.compile(r"[^\w]{0,2}\w{0,2}")

    tables = list(doc.tables)
    updated = 0
    matched_keys = set()
    # Indices of summary tables whose header text contains the section prefix;
    # filled during pass 1 and reused for scoring in pass 2.
    prefix_tables = set()

    def is_sentencey(s: str) -> bool:
        """Return True for text of at least 20 chars containing an ASCII letter."""
        s = re.sub(r"\s+", " ", (s or "").strip())
        return len(s) >= 20 and re.search(r"[A-Za-z]", s) is not None

    def contains_either_way(label_norm: str, orig: str) -> bool:
        """Return True when one non-empty string is a substring of the other."""
        # Both must be non-empty: "" is a substring of every string and would
        # otherwise match any label.
        if not label_norm or not orig:
            return False
        return orig in label_norm or label_norm in orig

    # 1) Headered summary tables. The section prefix is only recorded here
    #    (it does not filter tables in this pass, because some documents have
    #    inconsistent headers).
    for t_index, t in enumerate(tables):
        cols = _looks_like_summary_table(t)
        if not cols:
            continue
        label_col, details_col = cols
        head_txt = table_header_text(t, up_to_rows=2)
        if wanted_prefix and wanted_prefix in head_txt:
            prefix_tables.add(t_index)

        hdr_rows = count_header_rows(t, scan_up_to=6)
        for row_idx in range(hdr_rows, len(t.rows)):
            cells = t.rows[row_idx].cells  # read once per row
            n_cells = len(cells)
            if label_col >= n_cells:
                continue
            left_text = cell_text(cells[label_col]).strip()
            if not left_text:
                continue
            left_norm = canon_label(left_text)

            cand_key = None
            # exact std number match
            mstd = std_number_pat.search(left_norm)
            if mstd:
                k = f"std {int(mstd.group(1))}"
                if k in desired:
                    cand_key = k
            # exact normalized label match
            if not cand_key and left_norm in desired:
                cand_key = left_norm
            # prefix match (std N prefix)
            if not cand_key:
                m = std_prefix_pat.match(left_norm)
                if m:
                    pre = m.group(1)
                    cand_key = next((k2 for k2 in desired if k2.startswith(pre)), None)
            # containment / orig label fuzzy
            if not cand_key:
                cand_key = next(
                    (k2 for k2, orig in desired_orig.items() if contains_either_way(left_norm, orig)),
                    None,
                )

            if not cand_key:
                # debug
                print(f"[DEBUG] table#{t_index} row#{row_idx} left='{left_text}' -> NO CANDIDATE")
                continue

            # ensure details_col exists, fallback to next cell, then last cell
            if details_col < n_cells:
                use_details = details_col
            elif label_col + 1 < n_cells:
                use_details = label_col + 1
            else:
                use_details = n_cells - 1
            if use_details == label_col:
                # No separate details cell in this row: writing would destroy
                # the label, so leave the key for the fallback pass.
                print(f"[DEBUG] table#{t_index} row#{row_idx} left='{left_text}' matched_key={cand_key} -> no separate details cell; skipping")
                continue

            print(f"[DEBUG] table#{t_index} row#{row_idx} left='{left_text}' matched_key={cand_key} -> updating details_col={use_details}")
            _set_cell_text_black_with_line_breaks(cells[use_details], desired[cand_key])
            matched_keys.add(cand_key)
            updated += 1

    # 2) Strict fragment fallback: for any still-missing key, find the best row across ALL tables
    missing = [k for k in desired if k not in matched_keys]
    if missing:
        print(f"[DEBUG] Strict fallback for missing keys: {missing}")

    for k in missing:
        best_candidate = None
        best_score = -1
        orig_label = desired_orig.get(k, k)

        for t_index, t in enumerate(tables):
            # prefer tables whose header contains the wanted prefix
            table_bonus = 500 if t_index in prefix_tables else 0
            for row_idx, row in enumerate(t.rows):
                cells = row.cells  # read once per row
                last_idx = len(cells) - 1
                # candidate may have label in any column (some fragments are odd)
                for c_idx, cell in enumerate(cells):
                    left_cell_text = cell_text(cell).strip()
                    if not left_cell_text:
                        continue
                    left_norm = canon_label(left_cell_text)

                    # numeric std match, then normalized containment
                    mstd = std_number_pat.search(left_norm)
                    found_label = bool(mstd) and f"std {int(mstd.group(1))}" == k
                    if not found_label:
                        found_label = contains_either_way(left_norm, orig_label)
                    if not found_label:
                        continue

                    # details cell: prefer next cell, otherwise last cell
                    details_idx = min(c_idx + 1, last_idx)
                    if details_idx == c_idx:
                        # The label is the last cell of its row; there is no
                        # details cell to write into.
                        continue
                    details_text = cell_text(cells[details_idx]).strip()

                    score = len(details_text) + table_bonus
                    # boost sentencey rows heavily
                    if is_sentencey(details_text) or is_sentencey(left_cell_text):
                        score += 10000
                    # avoid writing into rows where the details are tiny markers only
                    if marker_pat.fullmatch(details_text):
                        score -= 5000

                    # strict '>' keeps the first of equally scored candidates
                    if score > best_score:
                        best_score = score
                        best_candidate = (t_index, row_idx, details_idx, left_cell_text, details_text)

        if best_candidate and best_score > 0:
            t_index, row_idx, details_idx, ltxt, dtxt = best_candidate
            print(f"[DEBUG-FB] matched missing key {k} -> table#{t_index} row#{row_idx} left='{ltxt}' details_len={len(dtxt)}")
            target_row = tables[t_index].rows[row_idx]
            _set_cell_text_black_with_line_breaks(target_row.cells[details_idx], desired[k])
            updated += 1
            matched_keys.add(k)
        else:
            print(f"[DEBUG-FB] no suitable sentencey candidate found for {k}; skipping.")

    print(f"[DEBUG] overwrite_summary_details_cells: total updated = {updated}")
    return updated
|
| 585 |
|
| 586 |
# Matches the run of whitespace that immediately follows a '.', '?' or '!'
# (the punctuation itself is only looked behind at, not consumed).
SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")
|