Shami96 committed on
Commit
b6f7c7f
·
verified ·
1 Parent(s): 0781251

Update extract_red_text.py

Browse files
Files changed (1) hide show
  1. extract_red_text.py +146 -39
extract_red_text.py CHANGED
@@ -11,9 +11,9 @@ MONTHS = r"(January|February|March|April|May|June|July|August|September|October|
11
  DATE_RE = re.compile(rf"\b(\d{{1,2}})\s*(st|nd|rd|th)?\s+{MONTHS}\s+\d{{4}}\b", re.I)
12
  DATE_NUM_RE = re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b")
13
 
14
- # Inline sub-label regexes for Nature paragraph
15
- ACCRED_RE = re.compile(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s/-]{2,})", re.I)
16
- EXPIRY_RE = re.compile(r"\bExpiry\s*Date[:\s-]*([A-Za-z0-9\s,/-]{2,})", re.I)
17
 
18
  # Parent name aliases to prevent Mass Management vs Mass Management Summary mismatches
19
  AMBIGUOUS_PARENTS = [
@@ -903,35 +903,51 @@ def extract_red_text(input_doc):
903
  if op_dec:
904
  out["Operator Declaration"] = op_dec
905
 
906
- # —— Handle ambiguous parents without creating unwanted duplicates ——
907
- # Only create aliases for legitimately different content, not summary tables
908
- summary_sections = {k for k in out.keys() if "Summary" in k}
909
-
910
- processed_pairs = set()
911
- for a, b in AMBIGUOUS_PARENTS:
912
- pair_key = tuple(sorted([a, b]))
913
- if pair_key in processed_pairs:
914
- continue
915
- processed_pairs.add(pair_key)
916
-
917
- # Skip if one is a Summary table and the other isn't - these should remain separate
918
- if ("Summary" in a) != ("Summary" in b):
919
- continue
920
-
921
- if a in out and b in out:
922
- # Both exist - check if they have identical content
923
- if out[a] == out[b]:
924
- # Remove the duplicate (prefer Summary version if available)
925
- to_remove = b if "Summary" in a else a
926
- del out[to_remove]
927
- elif a in out and b not in out:
928
- # Only create alias if not a summary table and names are different
929
- if a != b and "Summary" not in a and len(out[a]) > 1:
930
- out[b] = out[a]
931
- elif b in out and a not in out:
932
- # Only create alias if not a summary table and names are different
933
- if a != b and "Summary" not in b and len(out[b]) > 1:
934
- out[a] = out[b]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
935
 
936
  # —— add Accreditation Number and Expiry Date from Nature paragraph (do NOT edit the paragraph) ——
937
  for sec_key, section in list(out.items()):
@@ -966,33 +982,124 @@ def extract_red_text(input_doc):
966
  if v:
967
  section.setdefault("Expiry Date", []).append(v)
968
 
969
- # fallback when labels are missing but values appear at the end
970
  acc_missing = not section.get("Accreditation Number")
971
  exp_missing = not section.get("Expiry Date")
972
 
973
  if acc_missing or exp_missing:
974
- # find the last date-like token (wordy month or numeric)
975
  last_date_match = None
976
- for md in DATE_RE.finditer(para):
 
 
977
  last_date_match = md
 
 
 
 
978
  if not last_date_match:
979
  for md in DATE_NUM_RE.finditer(para):
980
  last_date_match = md
981
 
 
982
  if last_date_match:
 
 
 
 
 
983
  if exp_missing:
984
- date_txt = _fix_ordinal_space(last_date_match.group(0))
985
  section.setdefault("Expiry Date", []).append(normalize_text(date_txt))
986
 
 
987
  if acc_missing:
988
- # take digits immediately before the date
989
- before = para[: last_date_match.start()]
990
- m_num = re.search(r"(\d[\d\s]{3,12})\s*$", before)
991
  if m_num:
992
  num_txt = _compact_digits(normalize_text(m_num.group(1)))
993
  if num_txt:
994
  section.setdefault("Accreditation Number", []).append(num_txt)
995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
996
  return out
997
 
998
  def extract_red_text_filelike(input_file, output_file):
 
11
  DATE_RE = re.compile(rf"\b(\d{{1,2}})\s*(st|nd|rd|th)?\s+{MONTHS}\s+\d{{4}}\b", re.I)
12
  DATE_NUM_RE = re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b")
13
 
14
+ # permissive inline label regexes — allow OCR-space noise and varied punctuation
15
+ ACCRED_RE = re.compile(r"\bAccreditation\s*Number[:\s\-–:]*([A-Za-z0-9\s\.\-/,]{2,})", re.I)
16
+ EXPIRY_RE = re.compile(r"\bExpiry\s*Date[:\s\-–:]*([A-Za-z0-9\s\.\-/,]{2,})", re.I)
17
 
18
  # Parent name aliases to prevent Mass Management vs Mass Management Summary mismatches
19
  AMBIGUOUS_PARENTS = [
 
903
  if op_dec:
904
  out["Operator Declaration"] = op_dec
905
 
906
+ # —— Handle ambiguous parents without creating unwanted duplicates ——
907
+ # Prefer the "Summary" variant when both keys derive from the same Std-style content.
908
+ summary_pairs = [
909
+ ("Mass Management Summary", "Mass Management"),
910
+ ("Maintenance Management Summary", "Maintenance Management"),
911
+ ("Fatigue Management Summary", "Fatigue Management"),
912
+ ]
913
+
914
+ for summary_key, alt_key in summary_pairs:
915
+ # if only alt exists, consider promoting it to the summary name
916
+ if alt_key in out and summary_key not in out:
917
+ # only promote if the alt content looks like a standards/details map
918
+ alt_section = out.get(alt_key)
919
+ if isinstance(alt_section, dict) and any(k.strip().startswith("Std") for k in alt_section.keys()):
920
+ out[summary_key] = alt_section
921
+ del out[alt_key]
922
+ continue
923
+
924
+ # if both exist, merge alt into summary (avoiding duplicates)
925
+ if summary_key in out and alt_key in out:
926
+ s = out[summary_key] or {}
927
+ a = out[alt_key] or {}
928
+ # Only auto-merge when both are dicts and look like Std mappings (safe heuristic)
929
+ if isinstance(s, dict) and isinstance(a, dict) and \
930
+ (any(k.strip().startswith("Std") for k in s.keys()) or any(k.strip().startswith("Std") for k in a.keys())):
931
+ for k, v in a.items():
932
+ if not v:
933
+ continue
934
+ if k in s:
935
+ # append unique items
936
+ if isinstance(s[k], list) and isinstance(v, list):
937
+ for item in v:
938
+ if item not in s[k]:
939
+ s[k].append(item)
940
+ else:
941
+ # fallback: convert to lists
942
+ s.setdefault(k, [])
943
+ for item in (v if isinstance(v, list) else [v]):
944
+ if item not in s[k]:
945
+ s[k].append(item)
946
+ else:
947
+ s[k] = v if isinstance(v, list) else [v]
948
+ out[summary_key] = s
949
+ # remove the alt key to avoid duplicate sections
950
+ del out[alt_key]
951
 
952
  # —— add Accreditation Number and Expiry Date from Nature paragraph (do NOT edit the paragraph) ——
953
  for sec_key, section in list(out.items()):
 
982
  if v:
983
  section.setdefault("Expiry Date", []).append(v)
984
 
985
+ # fallback when labels are missing but values appear at the end
986
  acc_missing = not section.get("Accreditation Number")
987
  exp_missing = not section.get("Expiry Date")
988
 
989
  if acc_missing or exp_missing:
990
+ # 1) Try to find the last date-like token (wordy month or numeric)
991
  last_date_match = None
992
+ # prefer textual month matches (allowing OCR noise like "22 nd September 2023" or "202 3")
993
+ month_rx = re.compile(rf"\b\d{{1,2}}\s*(?:st|nd|rd|th)?\s+{MONTHS}\s+\d{{2,4}}\b", re.I)
994
+ for md in month_rx.finditer(para):
995
  last_date_match = md
996
+ # fallback: module-level DATE_RE (textual month, 4-digit year); numeric forms (dd/mm/yyyy or dd-mm-yyyy) are tried after it
997
+ if not last_date_match:
998
+ for md in DATE_RE.finditer(para):
999
+ last_date_match = md
1000
  if not last_date_match:
1001
  for md in DATE_NUM_RE.finditer(para):
1002
  last_date_match = md
1003
 
1004
+ # 2) If we found a candidate expiry date, normalise and use it
1005
  if last_date_match:
1006
+ date_txt = last_date_match.group(0)
1007
+ # fix noisy ordinals/spacing and collapsed digit noise (e.g., "202 3" -> "2023")
1008
+ date_txt = _fix_ordinal_space(date_txt)
1009
+ date_txt = re.sub(r"\b(20)\s?(\d{2})\b", r"\1\2", date_txt)
1010
+ date_txt = re.sub(r"\b(19)\s?(\d{2})\b", r"\1\2", date_txt)
1011
  if exp_missing:
 
1012
  section.setdefault("Expiry Date", []).append(normalize_text(date_txt))
1013
 
1014
+ # 3) If accreditation is missing, try to extract digits immediately *before* the date
1015
  if acc_missing:
1016
+ before = para[: last_date_match.start()].strip()
1017
+ # look for long digit run (allow spaces between digits)
1018
+ m_num = re.search(r"(\d[\d\s]{3,16}\d)\s*$", before)
1019
  if m_num:
1020
  num_txt = _compact_digits(normalize_text(m_num.group(1)))
1021
  if num_txt:
1022
  section.setdefault("Accreditation Number", []).append(num_txt)
1023
 
1024
+ # 4) If we still didn't find an accreditation number, try scanning entire paragraph for the longest digit run
1025
+ if acc_missing:
1026
+ # collect digit-like tokens, collapse internal spaces and pick the longest
1027
+ digit_tokens = [ _compact_digits(t) for t in re.findall(r"[\d\s]{4,}", para) ]
1028
+ digit_tokens = [d for d in digit_tokens if len(re.sub(r'\D','',d)) >= 5] # require >=5 digits
1029
+ if digit_tokens:
1030
+ # choose the longest / most plausible digits (deterministic)
1031
+ digit_tokens.sort(key=lambda s: (-len(re.sub(r'\D','',s)), s))
1032
+ section.setdefault("Accreditation Number", []).append(digit_tokens[0])
1033
+
1034
+ # 5) If expiry still missing, do a broad textual month search anywhere in the paragraph
1035
+ if exp_missing:
1036
+ broad_month_rx = re.compile(rf"\b\d{{1,2}}\s*(?:st|nd|rd|th)?\s+{MONTHS}\s+\d{{2,4}}\b|\b{MONTHS}\s+\d{{2,4}}\b", re.I)
1037
+ md_any = list(broad_month_rx.finditer(para))
1038
+ if md_any:
1039
+ candidate = md_any[-1].group(0)
1040
+ candidate = _fix_ordinal_space(candidate)
1041
+ candidate = re.sub(r"\b(20)\s?(\d{2})\b", r"\1\2", candidate)
1042
+ if candidate:
1043
+ section.setdefault("Expiry Date", []).append(normalize_text(candidate))
1044
+
1045
+
1046
+ # —— STRONGER: canonicalise & merge "X Summary" <-> "X" variants (case-insensitive) ——
1047
+ def _base_name(k: str) -> str:
1048
+ # remove trailing "summary" and punctuation, normalise spaces
1049
+ if not isinstance(k, str):
1050
+ return ""
1051
+ b = re.sub(r"[\(\)\[\]\:]+", " ", k)
1052
+ b = re.sub(r"\bsummary\b\s*[:\-]*", "", b, flags=re.I)
1053
+ b = re.sub(r"\s+", " ", b).strip().lower()
1054
+ return b
1055
+
1056
+ # Build index: base -> list of original keys
1057
+ base_index = {}
1058
+ for key in list(out.keys()):
1059
+ base = _base_name(key)
1060
+ if not base:
1061
+ continue
1062
+ base_index.setdefault(base, []).append(key)
1063
+
1064
+ # For each base that maps to >1 key, merge into the Summary-preferring canonical key
1065
+ for base, keys in base_index.items():
1066
+ if len(keys) < 2:
1067
+ continue
1068
+ # prefer a key that explicitly contains 'summary' (case-insensitive)
1069
+ canonical = None
1070
+ for k in keys:
1071
+ if re.search(r"\bsummary\b", k, re.I):
1072
+ canonical = k
1073
+ break
1074
+ # else pick the lexicographically first (deterministic)
1075
+ canonical = canonical or sorted(keys, key=lambda s: s.lower())[0]
1076
+
1077
+ # merge everything else into canonical
1078
+ for k in keys:
1079
+ if k == canonical:
1080
+ continue
1081
+ src = out.get(k)
1082
+ dst = out.get(canonical)
1083
+ # only merge dict-like Std mappings (safe-guard)
1084
+ if isinstance(dst, dict) and isinstance(src, dict):
1085
+ for std_key, vals in src.items():
1086
+ if not vals:
1087
+ continue
1088
+ if std_key in dst:
1089
+ # append unique items preserving order
1090
+ for v in vals if isinstance(vals, list) else [vals]:
1091
+ if v not in dst[std_key]:
1092
+ dst[std_key].append(v)
1093
+ else:
1094
+ dst[std_key] = list(vals) if isinstance(vals, list) else [vals]
1095
+ out[canonical] = dst
1096
+ # remove source key
1097
+ del out[k]
1098
+ else:
1099
+ # If not both dicts, prefer keeping canonical and drop duplicates conservatively
1100
+ if k in out:
1101
+ del out[k]
1102
+
1103
  return out
1104
 
1105
  def extract_red_text_filelike(input_file, output_file):