Spaces:
Running
Running
Update update_docx_with_pdf.py
Browse files- update_docx_with_pdf.py +79 -20
update_docx_with_pdf.py
CHANGED
|
@@ -1139,25 +1139,81 @@ class NHVASMerger:
|
|
| 1139 |
elif "mass" in txt: extracted["mass_compliance"] = comp
|
| 1140 |
elif "fatigue" in txt: extracted["fatigue_compliance"] = comp
|
| 1141 |
|
| 1142 |
-
def _extract_text_content(self, text_pages: List[Dict], extracted: Dict):
|
| 1143 |
all_text = " ".join(page.get("text", "") for page in text_pages)
|
| 1144 |
all_text = _smart_space(all_text)
|
| 1145 |
|
| 1146 |
-
# business
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
-
|
| 1157 |
-
|
| 1158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1159 |
|
| 1160 |
-
# audit conducted date
|
| 1161 |
for p in [
|
| 1162 |
r"Audit was conducted on\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
|
| 1163 |
r"DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
|
|
@@ -1168,7 +1224,7 @@ class NHVASMerger:
|
|
| 1168 |
extracted["audit_conducted_date"] = _smart_space(m.group(1).strip())
|
| 1169 |
break
|
| 1170 |
|
| 1171 |
-
# print accreditation name
|
| 1172 |
for p in [
|
| 1173 |
r"\(print accreditation name\)\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)",
|
| 1174 |
r"print accreditation name.*?\n\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)"
|
|
@@ -1178,7 +1234,7 @@ class NHVASMerger:
|
|
| 1178 |
extracted["print_accreditation_name"] = _smart_space(m.group(1).strip())
|
| 1179 |
break
|
| 1180 |
|
| 1181 |
-
#
|
| 1182 |
for p in [
|
| 1183 |
r"Number of powered vehicles\s+(\d+)",
|
| 1184 |
r"powered vehicles\s+(\d+)",
|
|
@@ -1190,9 +1246,12 @@ class NHVASMerger:
|
|
| 1190 |
m = re.search(p, all_text, re.IGNORECASE)
|
| 1191 |
if m:
|
| 1192 |
val = m.group(1)
|
| 1193 |
-
if "powered" in p:
|
| 1194 |
-
|
| 1195 |
-
elif "
|
|
|
|
|
|
|
|
|
|
| 1196 |
|
| 1197 |
def _extract_detailed_management_data(self, extracted_data: Dict, extracted: Dict):
|
| 1198 |
all_tables = extracted_data.get("all_tables", [])
|
|
|
|
| 1139 |
elif "mass" in txt: extracted["mass_compliance"] = comp
|
| 1140 |
elif "fatigue" in txt: extracted["fatigue_compliance"] = comp
|
| 1141 |
|
| 1142 |
+
def _extract_text_content(self, text_pages: List[Dict], extracted: Dict) -> None:
|
| 1143 |
all_text = " ".join(page.get("text", "") for page in text_pages)
|
| 1144 |
all_text = _smart_space(all_text)
|
| 1145 |
|
| 1146 |
+
# ------- Nature of business (positional, robust to collapsed newlines) ----------
|
| 1147 |
+
heading_rx = re.compile(r"(Nature of the Operators? Business(?:\s*\(Summary\))?\s*[:\-]?)", flags=re.I)
|
| 1148 |
+
start_m = heading_rx.search(all_text)
|
| 1149 |
+
if start_m:
|
| 1150 |
+
start_idx = start_m.end()
|
| 1151 |
+
|
| 1152 |
+
# Stop phrases (no newlines). Use word boundaries where appropriate so we don't
|
| 1153 |
+
# accidentally cut on substrings inside words.
|
| 1154 |
+
stop_phrases = [
|
| 1155 |
+
r"\bAccreditation Vehicle Summary\b",
|
| 1156 |
+
r"\bACCREDITATION VEHICLE SUMMARY\b",
|
| 1157 |
+
r"\bAUDIT OBSERVATIONS\b",
|
| 1158 |
+
r"\bNHVAS AUDIT SUMMARY REPORT\b",
|
| 1159 |
+
r"\bPage\s+\d+\s+of\s+\d+\b",
|
| 1160 |
+
r"\bVehicle Registration Numbers\b",
|
| 1161 |
+
r"\bVehicle Registration Numbers of Records Examined\b",
|
| 1162 |
+
r"\bAUDIT SUMMARY REPORT\b",
|
| 1163 |
+
]
|
| 1164 |
+
|
| 1165 |
+
# Find earliest occurrence of any stop phrase after the heading
|
| 1166 |
+
next_idx = None
|
| 1167 |
+
for sp in stop_phrases:
|
| 1168 |
+
m = re.search(sp, all_text[start_idx:], flags=re.I)
|
| 1169 |
+
if m:
|
| 1170 |
+
idx = start_idx + m.start()
|
| 1171 |
+
if next_idx is None or idx < next_idx:
|
| 1172 |
+
next_idx = idx
|
| 1173 |
+
|
| 1174 |
+
# Slice from end of heading to earliest stop phrase (or limit to a reasonable length)
|
| 1175 |
+
end_idx = next_idx if next_idx is not None else min(len(all_text), start_idx + 4000)
|
| 1176 |
+
candidate = all_text[start_idx:end_idx].strip()
|
| 1177 |
+
|
| 1178 |
+
# Defensive trimming of trailing uppercase boilerplate or table header noise
|
| 1179 |
+
candidate = re.sub(
|
| 1180 |
+
r"(ACCREDITATION VEHICLE SUMMARY|AUDIT OBSERVATIONS|NHVAS AUDIT SUMMARY REPORT|STD\s+\d+\.).*$",
|
| 1181 |
+
"",
|
| 1182 |
+
candidate,
|
| 1183 |
+
flags=re.I | re.DOTALL,
|
| 1184 |
+
)
|
| 1185 |
+
candidate = re.sub(r"\s+", " ", candidate).strip()
|
| 1186 |
+
|
| 1187 |
+
if 20 < len(candidate) < 5000:
|
| 1188 |
+
extracted["business_summary"] = candidate
|
| 1189 |
+
|
| 1190 |
+
# Extract Accreditation Number / Expiry only if they appear inline in this small block
|
| 1191 |
+
m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", candidate, flags=re.I)
|
| 1192 |
+
m_exp = re.search(r"\bExpiry\s*Date[:\s-]*([A-Za-z0-9\s,\/\-]+)", candidate, flags=re.I)
|
| 1193 |
+
if m_acc:
|
| 1194 |
+
acc = re.sub(r"\s+", " ", m_acc.group(1)).strip()
|
| 1195 |
+
acc = re.sub(r"[^\d]", "", acc) or acc
|
| 1196 |
+
extracted.setdefault("business_summary_extras", {})["accreditation_number"] = acc
|
| 1197 |
+
if m_exp:
|
| 1198 |
+
exp = _fix_ocr_date_noise(m_exp.group(1).strip())
|
| 1199 |
+
extracted.setdefault("business_summary_extras", {})["expiry_date"] = _smart_space(exp)
|
| 1200 |
+
|
| 1201 |
+
# --- fallback (preserve previous behaviour for other templates) ---
|
| 1202 |
+
if "business_summary" not in extracted:
|
| 1203 |
+
patt = [
|
| 1204 |
+
r"Nature of the Operators? Business.*?:\s*(.*?)(?:Accreditation Number|Expiry Date|$)",
|
| 1205 |
+
r"Nature of.*?Business.*?Summary.*?:\s*(.*?)(?:Accreditation|$)"
|
| 1206 |
+
]
|
| 1207 |
+
for p in patt:
|
| 1208 |
+
m = re.search(p, all_text, re.IGNORECASE | re.DOTALL)
|
| 1209 |
+
if m:
|
| 1210 |
+
txt = re.sub(r'\s+', ' ', m.group(1).strip())
|
| 1211 |
+
txt = re.sub(r'\s*(Accreditation Number.*|Expiry Date.*)', '', txt, flags=re.IGNORECASE)
|
| 1212 |
+
if len(txt) > 50:
|
| 1213 |
+
extracted["business_summary"] = txt
|
| 1214 |
+
break
|
| 1215 |
|
| 1216 |
+
# --- audit conducted date (unchanged) ---
|
| 1217 |
for p in [
|
| 1218 |
r"Audit was conducted on\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
|
| 1219 |
r"DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
|
|
|
|
| 1224 |
extracted["audit_conducted_date"] = _smart_space(m.group(1).strip())
|
| 1225 |
break
|
| 1226 |
|
| 1227 |
+
# --- print accreditation name (unchanged) ---
|
| 1228 |
for p in [
|
| 1229 |
r"\(print accreditation name\)\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)",
|
| 1230 |
r"print accreditation name.*?\n\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)"
|
|
|
|
| 1234 |
extracted["print_accreditation_name"] = _smart_space(m.group(1).strip())
|
| 1235 |
break
|
| 1236 |
|
| 1237 |
+
# --- Vehicle/driver simple numbers (unchanged) ---
|
| 1238 |
for p in [
|
| 1239 |
r"Number of powered vehicles\s+(\d+)",
|
| 1240 |
r"powered vehicles\s+(\d+)",
|
|
|
|
| 1246 |
m = re.search(p, all_text, re.IGNORECASE)
|
| 1247 |
if m:
|
| 1248 |
val = m.group(1)
|
| 1249 |
+
if "powered" in p:
|
| 1250 |
+
extracted.setdefault("vehicle_summary", {})["powered_vehicles"] = val
|
| 1251 |
+
elif "trailing" in p:
|
| 1252 |
+
extracted.setdefault("vehicle_summary", {})["trailing_vehicles"] = val
|
| 1253 |
+
elif "bfm" in p.lower():
|
| 1254 |
+
extracted.setdefault("vehicle_summary", {})["drivers_bfm"] = val
|
| 1255 |
|
| 1256 |
def _extract_detailed_management_data(self, extracted_data: Dict, extracted: Dict):
|
| 1257 |
all_tables = extracted_data.get("all_tables", [])
|