Update extract_red_text.py

extract_red_text.py  CHANGED  (+146 -39)
@@ -11,9 +11,9 @@ MONTHS = r"(January|February|March|April|May|June|July|August|September|October|
 DATE_RE = re.compile(rf"\b(\d{{1,2}})\s*(st|nd|rd|th)?\s+{MONTHS}\s+\d{{4}}\b", re.I)
 DATE_NUM_RE = re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b")

-#
-ACCRED_RE = re.compile(r"\bAccreditation\s*Number[:\s
-EXPIRY_RE = re.compile(r"\bExpiry\s*Date[:\s
+# permissive inline label regexes – allow OCR-space noise and varied punctuation
+ACCRED_RE = re.compile(r"\bAccreditation\s*Number[:\s\-–:]*([A-Za-z0-9\s\.\-/,]{2,})", re.I)
+EXPIRY_RE = re.compile(r"\bExpiry\s*Date[:\s\-–:]*([A-Za-z0-9\s\.\-/,]{2,})", re.I)

 # Parent name aliases to prevent Mass Management vs Mass Management Summary mismatches
 AMBIGUOUS_PARENTS = [
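A quick sanity check of what the more permissive label patterns tolerate. The sample line below is invented (it is not taken from any real audit document), and the snippet only probes the two regexes; the capture group is greedy, so the captured value still needs trimming downstream:

    import re

    ACCRED_RE = re.compile(r"\bAccreditation\s*Number[:\s\-–:]*([A-Za-z0-9\s\.\-/,]{2,})", re.I)
    EXPIRY_RE = re.compile(r"\bExpiry\s*Date[:\s\-–:]*([A-Za-z0-9\s\.\-/,]{2,})", re.I)

    sample = "Accreditation Number - 51 2345 Expiry Date: 22 nd September 2023"  # invented OCR-style text
    print(ACCRED_RE.search(sample).group(1))  # "51 2345 Expiry Date" (greedy capture, stops at the colon)
    print(EXPIRY_RE.search(sample).group(1))  # "22 nd September 2023"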
@@ -903,35 +903,51 @@ def extract_red_text(input_doc):
     if op_dec:
         out["Operator Declaration"] = op_dec

+    # ── Handle ambiguous parents without creating unwanted duplicates ──
+    # Prefer the "Summary" variant when both keys derive from the same Std-style content.
+    summary_pairs = [
+        ("Mass Management Summary", "Mass Management"),
+        ("Maintenance Management Summary", "Maintenance Management"),
+        ("Fatigue Management Summary", "Fatigue Management"),
+    ]
+
+    for summary_key, alt_key in summary_pairs:
+        # if only alt exists, consider promoting it to the summary name
+        if alt_key in out and summary_key not in out:
+            # only promote if the alt content looks like a standards/details map
+            alt_section = out.get(alt_key)
+            if isinstance(alt_section, dict) and any(k.strip().startswith("Std") for k in alt_section.keys()):
+                out[summary_key] = alt_section
+                del out[alt_key]
+            continue
+
+        # if both exist, merge alt into summary (avoiding duplicates)
+        if summary_key in out and alt_key in out:
+            s = out[summary_key] or {}
+            a = out[alt_key] or {}
+            # Only auto-merge when both are dicts and look like Std mappings (safe heuristic)
+            if isinstance(s, dict) and isinstance(a, dict) and \
+               (any(k.strip().startswith("Std") for k in s.keys()) or any(k.strip().startswith("Std") for k in a.keys())):
+                for k, v in a.items():
+                    if not v:
+                        continue
+                    if k in s:
+                        # append unique items
+                        if isinstance(s[k], list) and isinstance(v, list):
+                            for item in v:
+                                if item not in s[k]:
+                                    s[k].append(item)
+                        else:
+                            # fallback: convert to lists
+                            s.setdefault(k, [])
+                            for item in (v if isinstance(v, list) else [v]):
+                                if item not in s[k]:
+                                    s[k].append(item)
+                    else:
+                        s[k] = v if isinstance(v, list) else [v]
+                out[summary_key] = s
+                # remove the alt key to avoid duplicate sections
+                del out[alt_key]

     # ── add Accreditation Number and Expiry Date from Nature paragraph (do NOT edit the paragraph) ──
     for sec_key, section in list(out.items()):
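To illustrate the promotion heuristic with made-up data (the section name and Std labels below are invented, and this is a simplified restatement of the check above, not the exact code path), a bare "Mass Management" section whose keys look like Std entries ends up under the "Summary" name:

    out = {"Mass Management": {"Std 1. Daily Amounts": ["160 t"], "Std 2. Records": []}}  # invented example data

    alt = out.get("Mass Management")
    if isinstance(alt, dict) and any(k.strip().startswith("Std") for k in alt.keys()):
        if "Mass Management Summary" not in out:
            out["Mass Management Summary"] = out.pop("Mass Management")

    print(list(out))  # ['Mass Management Summary']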
@@ -966,33 +982,124 @@ def extract_red_text(input_doc):
             if v:
                 section.setdefault("Expiry Date", []).append(v)

+        # fallback when labels are missing but values appear at the end
         acc_missing = not section.get("Accreditation Number")
         exp_missing = not section.get("Expiry Date")

         if acc_missing or exp_missing:
-            # find the last date-like token (wordy month or numeric)
+            # 1) Try to find the last date-like token (wordy month or numeric)
             last_date_match = None
+            # prefer textual month matches (allowing OCR noise like "22 nd September 2023" or "202 3")
+            month_rx = re.compile(rf"\b\d{{1,2}}\s*(?:st|nd|rd|th)?\s+{MONTHS}\s+\d{{2,4}}\b", re.I)
+            for md in month_rx.finditer(para):
                 last_date_match = md
+            # fallback numeric date forms (dd/mm/yyyy or dd-mm-yyyy)
+            if not last_date_match:
+                for md in DATE_RE.finditer(para):
+                    last_date_match = md
             if not last_date_match:
                 for md in DATE_NUM_RE.finditer(para):
                     last_date_match = md

+            # 2) If we found a candidate expiry date, normalise and use it
             if last_date_match:
+                date_txt = last_date_match.group(0)
+                # fix noisy ordinals/spacing and collapsed digit noise (e.g., "202 3" -> "2023")
+                date_txt = _fix_ordinal_space(date_txt)
+                date_txt = re.sub(r"\b(20)\s?(\d{2})\b", r"\1\2", date_txt)
+                date_txt = re.sub(r"\b(19)\s?(\d{2})\b", r"\1\2", date_txt)
                 if exp_missing:
-                    date_txt = _fix_ordinal_space(last_date_match.group(0))
                     section.setdefault("Expiry Date", []).append(normalize_text(date_txt))

+                # 3) If accreditation is missing, try to extract digits immediately *before* the date
                 if acc_missing:
-                    m_num = re.search(r"(\d[\d\s]{3,
+                    before = para[: last_date_match.start()].strip()
+                    # look for long digit run (allow spaces between digits)
+                    m_num = re.search(r"(\d[\d\s]{3,16}\d)\s*$", before)
                     if m_num:
                         num_txt = _compact_digits(normalize_text(m_num.group(1)))
                         if num_txt:
                             section.setdefault("Accreditation Number", []).append(num_txt)

+            # 4) If we still didn't find an accreditation number, try scanning entire paragraph for the longest digit run
+            if acc_missing:
+                # collect digit-like tokens, collapse internal spaces and pick the longest
+                digit_tokens = [ _compact_digits(t) for t in re.findall(r"[\d\s]{4,}", para) ]
+                digit_tokens = [d for d in digit_tokens if len(re.sub(r'\D','',d)) >= 5]  # require >=5 digits
+                if digit_tokens:
+                    # choose the longest / most plausible digits (deterministic)
+                    digit_tokens.sort(key=lambda s: (-len(re.sub(r'\D','',s)), s))
+                    section.setdefault("Accreditation Number", []).append(digit_tokens[0])
+
+            # 5) If expiry still missing, do a broad textual month search anywhere in the paragraph
+            if exp_missing:
+                broad_month_rx = re.compile(rf"\b\d{{1,2}}\s*(?:st|nd|rd|th)?\s+{MONTHS}\s+\d{{2,4}}\b|\b{MONTHS}\s+\d{{2,4}}\b", re.I)
+                md_any = list(broad_month_rx.finditer(para))
+                if md_any:
+                    candidate = md_any[-1].group(0)
+                    candidate = _fix_ordinal_space(candidate)
+                    candidate = re.sub(r"\b(20)\s?(\d{2})\b", r"\1\2", candidate)
+                    if candidate:
+                        section.setdefault("Expiry Date", []).append(normalize_text(candidate))
+
+
+    # ── STRONGER: canonicalise & merge "X Summary" <-> "X" variants (case-insensitive) ──
+    def _base_name(k: str) -> str:
+        # remove trailing "summary" and punctuation, normalise spaces
+        if not isinstance(k, str):
+            return ""
+        b = re.sub(r"[\(\)\[\]\:]+", " ", k)
+        b = re.sub(r"\bsummary\b\s*[:\-]*", "", b, flags=re.I)
+        b = re.sub(r"\s+", " ", b).strip().lower()
+        return b
+
+    # Build index: base -> list of original keys
+    base_index = {}
+    for key in list(out.keys()):
+        base = _base_name(key)
+        if not base:
+            continue
+        base_index.setdefault(base, []).append(key)
+
+    # For each base that maps to >1 key, merge into the Summary-preferring canonical key
+    for base, keys in base_index.items():
+        if len(keys) < 2:
+            continue
+        # prefer a key that explicitly contains 'summary' (case-insensitive)
+        canonical = None
+        for k in keys:
+            if re.search(r"\bsummary\b", k, re.I):
+                canonical = k
+                break
+        # else pick the lexicographically first (deterministic)
+        canonical = canonical or sorted(keys, key=lambda s: s.lower())[0]
+
+        # merge everything else into canonical
+        for k in keys:
+            if k == canonical:
+                continue
+            src = out.get(k)
+            dst = out.get(canonical)
+            # only merge dict-like Std mappings (safe-guard)
+            if isinstance(dst, dict) and isinstance(src, dict):
+                for std_key, vals in src.items():
+                    if not vals:
+                        continue
+                    if std_key in dst:
+                        # append unique items preserving order
+                        for v in vals if isinstance(vals, list) else [vals]:
+                            if v not in dst[std_key]:
+                                dst[std_key].append(v)
+                    else:
+                        dst[std_key] = list(vals) if isinstance(vals, list) else [vals]
+                out[canonical] = dst
+                # remove source key
+                del out[k]
+            else:
+                # If not both dicts, prefer keeping canonical and drop duplicates conservatively
+                if k in out:
+                    del out[k]
+
     return out

 def extract_red_text_filelike(input_file, output_file):
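A minimal sketch of what the fallback path recovers from a label-free paragraph. The paragraph text is invented, and _fix_ordinal_space / _compact_digits are helpers that do not appear in this diff, so simple regex stand-ins are used for them here:

    import re

    MONTHS = r"(January|February|March|April|May|June|July|August|September|October|November|December)"
    month_rx = re.compile(rf"\b\d{{1,2}}\s*(?:st|nd|rd|th)?\s+{MONTHS}\s+\d{{2,4}}\b", re.I)

    para = "Vehicles operate under accreditation 51 2345 22 nd September 2023"  # invented text

    last = None
    for md in month_rx.finditer(para):
        last = md  # keep the last date-like match

    # stand-in for _fix_ordinal_space: glue "22 nd" back into "22nd"
    date_txt = re.sub(r"(\d)\s+(st|nd|rd|th)\b", r"\1\2", last.group(0))
    print(date_txt)  # 22nd September 2023

    # digits immediately before the date become the accreditation number candidate
    before = para[: last.start()].strip()
    m_num = re.search(r"(\d[\d\s]{3,16}\d)\s*$", before)
    print(re.sub(r"\s+", "", m_num.group(1)))  # 512345 (stand-in for _compact_digits)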
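And a short check of the new key canonicalisation on a few variant headings (the heading strings are invented examples; _base_name is copied verbatim from the diff above):

    import re

    def _base_name(k: str) -> str:
        if not isinstance(k, str):
            return ""
        b = re.sub(r"[\(\)\[\]\:]+", " ", k)
        b = re.sub(r"\bsummary\b\s*[:\-]*", "", b, flags=re.I)
        b = re.sub(r"\s+", " ", b).strip().lower()
        return b

    for key in ["Mass Management Summary", "Mass Management", "MASS MANAGEMENT (Summary):"]:
        print(_base_name(key))  # each prints "mass management", so the three keys merge into one section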