Shami96 committed on
Commit
955539a
·
verified ·
1 Parent(s): b6f7c7f

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +79 -20
update_docx_with_pdf.py CHANGED
@@ -1139,25 +1139,81 @@ class NHVASMerger:
1139
  elif "mass" in txt: extracted["mass_compliance"] = comp
1140
  elif "fatigue" in txt: extracted["fatigue_compliance"] = comp
1141
 
1142
- def _extract_text_content(self, text_pages: List[Dict], extracted: Dict):
1143
  all_text = " ".join(page.get("text", "") for page in text_pages)
1144
  all_text = _smart_space(all_text)
1145
 
1146
- # business summary
1147
- patt = [
1148
- r"Nature of the Operators? Business.*?:\s*(.*?)(?:Accreditation Number|Expiry Date|$)",
1149
- r"Nature of.*?Business.*?Summary.*?:\s*(.*?)(?:Accreditation|$)"
1150
- ]
1151
- for p in patt:
1152
- m = re.search(p, all_text, re.IGNORECASE | re.DOTALL)
1153
- if m:
1154
- txt = re.sub(r'\s+', ' ', m.group(1).strip())
1155
- txt = re.sub(r'\s*(Accreditation Number.*|Expiry Date.*)', '', txt, flags=re.IGNORECASE)
1156
- if len(txt) > 50:
1157
- extracted["business_summary"] = txt
1158
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1159
 
1160
- # audit conducted date
1161
  for p in [
1162
  r"Audit was conducted on\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
1163
  r"DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
@@ -1168,7 +1224,7 @@ class NHVASMerger:
1168
  extracted["audit_conducted_date"] = _smart_space(m.group(1).strip())
1169
  break
1170
 
1171
- # print accreditation name
1172
  for p in [
1173
  r"\(print accreditation name\)\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)",
1174
  r"print accreditation name.*?\n\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)"
@@ -1178,7 +1234,7 @@ class NHVASMerger:
1178
  extracted["print_accreditation_name"] = _smart_space(m.group(1).strip())
1179
  break
1180
 
1181
- # numbers in text (optional)
1182
  for p in [
1183
  r"Number of powered vehicles\s+(\d+)",
1184
  r"powered vehicles\s+(\d+)",
@@ -1190,9 +1246,12 @@ class NHVASMerger:
1190
  m = re.search(p, all_text, re.IGNORECASE)
1191
  if m:
1192
  val = m.group(1)
1193
- if "powered" in p: extracted.setdefault("vehicle_summary", {})["powered_vehicles"] = val
1194
- elif "trailing" in p: extracted.setdefault("vehicle_summary", {})["trailing_vehicles"] = val
1195
- elif "bfm" in p.lower(): extracted.setdefault("vehicle_summary", {})["drivers_bfm"] = val
 
 
 
1196
 
1197
  def _extract_detailed_management_data(self, extracted_data: Dict, extracted: Dict):
1198
  all_tables = extracted_data.get("all_tables", [])
 
1139
  elif "mass" in txt: extracted["mass_compliance"] = comp
1140
  elif "fatigue" in txt: extracted["fatigue_compliance"] = comp
1141
 
1142
+ def _extract_text_content(self, text_pages: List[Dict], extracted: Dict) -> None:
1143
  all_text = " ".join(page.get("text", "") for page in text_pages)
1144
  all_text = _smart_space(all_text)
1145
 
1146
+ # ------- Nature of business (positional, robust to collapsed newlines) ----------
1147
+ heading_rx = re.compile(r"(Nature of the Operators? Business(?:\s*\(Summary\))?\s*[:\-]?)", flags=re.I)
1148
+ start_m = heading_rx.search(all_text)
1149
+ if start_m:
1150
+ start_idx = start_m.end()
1151
+
1152
+ # Stop phrases (no newlines). Use word boundaries where appropriate so we don't
1153
+ # accidentally cut on substrings inside words.
1154
+ stop_phrases = [
1155
+ r"\bAccreditation Vehicle Summary\b",
1156
+ r"\bACCREDITATION VEHICLE SUMMARY\b",
1157
+ r"\bAUDIT OBSERVATIONS\b",
1158
+ r"\bNHVAS AUDIT SUMMARY REPORT\b",
1159
+ r"\bPage\s+\d+\s+of\s+\d+\b",
1160
+ r"\bVehicle Registration Numbers\b",
1161
+ r"\bVehicle Registration Numbers of Records Examined\b",
1162
+ r"\bAUDIT SUMMARY REPORT\b",
1163
+ ]
1164
+
1165
+ # Find earliest occurrence of any stop phrase after the heading
1166
+ next_idx = None
1167
+ for sp in stop_phrases:
1168
+ m = re.search(sp, all_text[start_idx:], flags=re.I)
1169
+ if m:
1170
+ idx = start_idx + m.start()
1171
+ if next_idx is None or idx < next_idx:
1172
+ next_idx = idx
1173
+
1174
+ # Slice from end of heading to earliest stop phrase (or limit to a reasonable length)
1175
+ end_idx = next_idx if next_idx is not None else min(len(all_text), start_idx + 4000)
1176
+ candidate = all_text[start_idx:end_idx].strip()
1177
+
1178
+ # Defensive trimming of trailing uppercase boilerplate or table header noise
1179
+ candidate = re.sub(
1180
+ r"(ACCREDITATION VEHICLE SUMMARY|AUDIT OBSERVATIONS|NHVAS AUDIT SUMMARY REPORT|STD\s+\d+\.).*$",
1181
+ "",
1182
+ candidate,
1183
+ flags=re.I | re.DOTALL,
1184
+ )
1185
+ candidate = re.sub(r"\s+", " ", candidate).strip()
1186
+
1187
+ if 20 < len(candidate) < 5000:
1188
+ extracted["business_summary"] = candidate
1189
+
1190
+ # Extract Accreditation Number / Expiry only if they appear inline in this small block
1191
+ m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", candidate, flags=re.I)
1192
+ m_exp = re.search(r"\bExpiry\s*Date[:\s-]*([A-Za-z0-9\s,\/\-]+)", candidate, flags=re.I)
1193
+ if m_acc:
1194
+ acc = re.sub(r"\s+", " ", m_acc.group(1)).strip()
1195
+ acc = re.sub(r"[^\d]", "", acc) or acc
1196
+ extracted.setdefault("business_summary_extras", {})["accreditation_number"] = acc
1197
+ if m_exp:
1198
+ exp = _fix_ocr_date_noise(m_exp.group(1).strip())
1199
+ extracted.setdefault("business_summary_extras", {})["expiry_date"] = _smart_space(exp)
1200
+
1201
+ # --- fallback (preserve previous behaviour for other templates) ---
1202
+ if "business_summary" not in extracted:
1203
+ patt = [
1204
+ r"Nature of the Operators? Business.*?:\s*(.*?)(?:Accreditation Number|Expiry Date|$)",
1205
+ r"Nature of.*?Business.*?Summary.*?:\s*(.*?)(?:Accreditation|$)"
1206
+ ]
1207
+ for p in patt:
1208
+ m = re.search(p, all_text, re.IGNORECASE | re.DOTALL)
1209
+ if m:
1210
+ txt = re.sub(r'\s+', ' ', m.group(1).strip())
1211
+ txt = re.sub(r'\s*(Accreditation Number.*|Expiry Date.*)', '', txt, flags=re.IGNORECASE)
1212
+ if len(txt) > 50:
1213
+ extracted["business_summary"] = txt
1214
+ break
1215
 
1216
+ # --- audit conducted date (unchanged) ---
1217
  for p in [
1218
  r"Audit was conducted on\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
1219
  r"DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
 
1224
  extracted["audit_conducted_date"] = _smart_space(m.group(1).strip())
1225
  break
1226
 
1227
+ # --- print accreditation name (unchanged) ---
1228
  for p in [
1229
  r"\(print accreditation name\)\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)",
1230
  r"print accreditation name.*?\n\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)"
 
1234
  extracted["print_accreditation_name"] = _smart_space(m.group(1).strip())
1235
  break
1236
 
1237
+ # --- Vehicle/driver simple numbers (unchanged) ---
1238
  for p in [
1239
  r"Number of powered vehicles\s+(\d+)",
1240
  r"powered vehicles\s+(\d+)",
 
1246
  m = re.search(p, all_text, re.IGNORECASE)
1247
  if m:
1248
  val = m.group(1)
1249
+ if "powered" in p:
1250
+ extracted.setdefault("vehicle_summary", {})["powered_vehicles"] = val
1251
+ elif "trailing" in p:
1252
+ extracted.setdefault("vehicle_summary", {})["trailing_vehicles"] = val
1253
+ elif "bfm" in p.lower():
1254
+ extracted.setdefault("vehicle_summary", {})["drivers_bfm"] = val
1255
 
1256
  def _extract_detailed_management_data(self, extracted_data: Dict, extracted: Dict):
1257
  all_tables = extracted_data.get("all_tables", [])