Shami96 committed on
Commit
2f0b7d3
·
verified ·
1 Parent(s): 955539a

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +139 -25
updated_word.py CHANGED
@@ -428,45 +428,159 @@ def fill_mass_vehicle_table_preserve_headers(table: Table, arrays: Dict[str, Lis
428
 
429
  # Modified function for Management Summary tables only
430
  def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
431
- """For a Summary table (Maintenance/Mass/Fatigue), replace the entire DETAILS cell
432
- for each Std N row with the JSON text (written in black with line breaks after periods)."""
433
- # build desired texts
 
434
  desired: Dict[str, str] = { _std_key(k): join_value(v) for k, v in section_dict.items() }
 
 
435
 
436
- # pick which tables belong to this section by header sniff
437
- wanted_prefix = canon_label(section_name.split()[0]) # "maintenance" | "mass" | "fatigue"
438
-
439
  updated = 0
440
- for t in doc.tables:
 
 
 
 
 
 
 
 
 
441
  cols = _looks_like_summary_table(t)
442
  if not cols:
443
  continue
444
  label_col, details_col = cols
445
-
446
  head_txt = table_header_text(t, up_to_rows=2)
447
- if wanted_prefix not in head_txt: # keep to the correct section
448
- continue
449
-
450
- # walk body rows
451
- for i in range(1, len(t.rows)):
452
- row = t.rows[i]
453
- key = _std_key(cell_text(row.cells[label_col]))
454
 
455
- # exact match or "std N" prefix match
456
- cand = desired.get(key)
457
- if not cand:
458
- m = re.match(r"(std\s+\d+)", key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  if m:
460
- for k2, v2 in desired.items():
461
- if k2.startswith(m.group(1)):
462
- cand = v2
 
463
  break
464
- if not cand:
 
 
 
 
 
 
 
 
 
465
  continue
466
 
467
- # Use the special function with line breaks for Management Summary tables
468
- _set_cell_text_black_with_line_breaks(row.cells[details_col], cand)
 
 
 
 
 
 
469
  updated += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  return updated
471
 
472
  SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")
 
428
 
429
  # Modified function for Management Summary tables only
430
  def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
431
+ """
432
+ Overwrite Summary table DETAILS cells robustly, with a strict fallback that
433
+ prefers rows whose DETAILS cell looks like a real sentence (not 'V'/'NC' markers).
434
+ """
435
  desired: Dict[str, str] = { _std_key(k): join_value(v) for k, v in section_dict.items() }
436
+ desired_orig = { _std_key(k): canon_label(k) for k in section_dict.keys() }
437
+ wanted_prefix = canon_label(section_name.split()[0])
438
 
439
+ tables = list(doc.tables)
 
 
440
  updated = 0
441
+ matched_keys = set()
442
+ matched_positions = {}
443
+
444
+ def is_sentencey(s: str) -> bool:
445
+ s = re.sub(r"\s+", " ", (s or "").strip())
446
+ # short guard: require some letters and reasonable length
447
+ return bool(s) and len(s) >= 20 and re.search(r"[A-Za-z]", s)
448
+
449
+ # 1) Prefer headered summary tables that match the section prefix
450
+ for t_index, t in enumerate(tables):
451
  cols = _looks_like_summary_table(t)
452
  if not cols:
453
  continue
454
  label_col, details_col = cols
 
455
  head_txt = table_header_text(t, up_to_rows=2)
456
+ if wanted_prefix not in head_txt:
457
+ # still allow headered tables, but prefer ones with section prefix
458
+ # (we do not skip entirely because some docs are inconsistent)
459
+ pass
 
 
 
460
 
461
+ hdr_rows = count_header_rows(t, scan_up_to=6)
462
+ for row_idx in range(hdr_rows, len(t.rows)):
463
+ row = t.rows[row_idx]
464
+ if label_col >= len(row.cells):
465
+ continue
466
+ left_text = cell_text(row.cells[label_col]).strip()
467
+ if not left_text:
468
+ continue
469
+ left_norm = canon_label(left_text)
470
+
471
+ # exact std number match
472
+ mstd = re.search(r"\bstd[\s\.]*?(\d{1,2})\b", left_norm, flags=re.I)
473
+ cand_key = None
474
+ if mstd:
475
+ k = f"std {int(mstd.group(1))}"
476
+ if k in desired:
477
+ cand_key = k
478
+ # exact normalized label match
479
+ if not cand_key and left_norm in desired:
480
+ cand_key = left_norm
481
+ # prefix match (std N prefix)
482
+ if not cand_key:
483
+ m = re.match(r"(std\s+\d+)", left_norm)
484
  if m:
485
+ pre = m.group(1)
486
+ for k2 in desired.keys():
487
+ if k2.startswith(pre):
488
+ cand_key = k2
489
  break
490
+ # containment / orig label fuzzy
491
+ if not cand_key:
492
+ for k2, orig in desired_orig.items():
493
+ if orig and (orig in left_norm or left_norm in orig):
494
+ cand_key = k2
495
+ break
496
+
497
+ if not cand_key:
498
+ # debug
499
+ print(f"[DEBUG] table#{t_index} row#{row_idx} left='{left_text}' -> NO CANDIDATE")
500
  continue
501
 
502
+ # ensure details_col exists, fallback to next cell
503
+ use_details = details_col if details_col < len(row.cells) else (label_col+1 if label_col+1 < len(row.cells) else len(row.cells)-1)
504
+ existing_details = cell_text(row.cells[use_details]).strip() if use_details < len(row.cells) else ""
505
+ # write regardless, but mark matched
506
+ print(f"[DEBUG] table#{t_index} row#{row_idx} left='{left_text}' matched_key={cand_key} -> updating details_col={use_details}")
507
+ _set_cell_text_black_with_line_breaks(row.cells[use_details], desired[cand_key])
508
+ matched_keys.add(cand_key)
509
+ matched_positions[cand_key] = (t_index, row_idx)
510
  updated += 1
511
+
512
+ # 2) Strict fragment fallback: for any still-missing std, find the best row across ALL tables
513
+ missing = [k for k in desired.keys() if k not in matched_keys]
514
+ if missing:
515
+ print(f"[DEBUG] Strict fallback for missing keys: {missing}")
516
+
517
+ for k in missing:
518
+ best_candidate = None
519
+ best_score = -1
520
+ orig_label = desired_orig.get(k, k)
521
+
522
+ # search all rows in all tables for a row whose left cell contains the label/std and whose
523
+ # details cell contains sentence-length text. choose best by longest details length.
524
+ for t_index, t in enumerate(tables):
525
+ # candidate may have label in any column (some fragments are odd)
526
+ for row_idx, row in enumerate(t.rows):
527
+ for c_idx, cell in enumerate(row.cells):
528
+ left_cell_text = cell_text(cell).strip()
529
+ if not left_cell_text:
530
+ continue
531
+ left_norm = canon_label(left_cell_text)
532
+
533
+ found_label = False
534
+ # numeric std match
535
+ mstd = re.search(r"\bstd[\s\.]*?(\d{1,2})\b", left_norm, flags=re.I)
536
+ if mstd:
537
+ if f"std {int(mstd.group(1))}" == k:
538
+ found_label = True
539
+ # normalized containment
540
+ if not found_label and orig_label and (orig_label in left_norm or left_norm in orig_label):
541
+ found_label = True
542
+
543
+ if not found_label:
544
+ continue
545
+
546
+ # determine details cell index: prefer next cell, otherwise last cell
547
+ details_idx = c_idx + 1 if (c_idx + 1) < len(row.cells) else (len(row.cells) - 1)
548
+ details_text = cell_text(row.cells[details_idx]).strip() if details_idx < len(row.cells) else ""
549
+ score = len(details_text)
550
+ sentencey = is_sentencey(details_text) or is_sentencey(left_cell_text)
551
+
552
+ # boost sentencey rows heavily
553
+ if sentencey:
554
+ score += 10000
555
+
556
+ # prefer tables whose header contains the wanted_prefix (if header present)
557
+ cols = _looks_like_summary_table(t)
558
+ if cols:
559
+ head_txt = table_header_text(t, up_to_rows=2)
560
+ if wanted_prefix in head_txt:
561
+ score += 500
562
+
563
+ # avoid writing into rows where the details are tiny markers only
564
+ if re.fullmatch(r"^[^\w]{0,2}\w?$", details_text):
565
+ # penalize strongly
566
+ score -= 5000
567
+
568
+ if score > best_score:
569
+ best_score = score
570
+ best_candidate = (t_index, row_idx, details_idx, left_cell_text, details_text)
571
+
572
+ if best_candidate and best_score > 0:
573
+ t_index, row_idx, details_idx, ltxt, dtxt = best_candidate
574
+ print(f"[DEBUG-FB] matched missing key {k} -> table#{t_index} row#{row_idx} left='{ltxt}' details_len={len(dtxt)}")
575
+ t = tables[t_index]
576
+ _set_cell_text_black_with_line_breaks(t.rows[row_idx].cells[details_idx], desired[k])
577
+ updated += 1
578
+ matched_keys.add(k)
579
+ matched_positions[k] = (t_index, row_idx)
580
+ else:
581
+ print(f"[DEBUG-FB] no suitable sentencey candidate found for {k}; skipping.")
582
+
583
+ print(f"[DEBUG] overwrite_summary_details_cells: total updated = {updated}")
584
  return updated
585
 
586
  SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")