Spaces:
Running
Running
Update update_docx_with_pdf.py
Browse files- update_docx_with_pdf.py +173 -68
update_docx_with_pdf.py
CHANGED
|
@@ -15,6 +15,28 @@ from collections import OrderedDict # <-- add this
|
|
| 15 |
def _nz(x):
|
| 16 |
return x if isinstance(x, str) and x.strip() else ""
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
SUMMARY_SECTIONS = {
|
| 19 |
"MAINTENANCE MANAGEMENT": "Maintenance Management Summary",
|
| 20 |
"MASS MANAGEMENT": "Mass Management Summary",
|
|
@@ -770,6 +792,34 @@ class NHVASMerger:
|
|
| 770 |
page_num = table.get("page", 0)
|
| 771 |
self.log_debug(f"Processing table on page {page_num} with headers: {headers[:3]}...")
|
| 772 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
# 🔧 NEW: collapse possible multi-line headers once up front
|
| 774 |
collapsed_headers, collapsed_rows = self._collapse_multiline_headers(headers, data_rows)
|
| 775 |
|
|
@@ -1052,78 +1102,122 @@ class NHVASMerger:
|
|
| 1052 |
return added
|
| 1053 |
|
| 1054 |
def _extract_driver_table(self, headers: List[str], data_rows: List[List], extracted: Dict):
|
| 1055 |
-
"""
|
| 1056 |
drivers = []
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
r"\bdriver\s+name\b", r"\bscheduler\s+name\b", r"\bname\b"])
|
| 1074 |
-
tlif_d_idx = find_col(["driver tlif"])
|
| 1075 |
-
tlif_s_idx = find_col(["scheduler tlif"])
|
| 1076 |
-
medical_idx= find_col(["medical", "expiry"])
|
| 1077 |
-
roster_idx = find_col_rx([r"\broster\b", r"\bsafe\s+driving\s+plan\b", r"\bschedule\b(?!r\b)"])
|
| 1078 |
-
fit_idx = find_col(["fit for duty"])
|
| 1079 |
-
diary_idx = find_col(["work diary", "electronic work diary", "page numbers"])
|
| 1080 |
-
|
| 1081 |
-
for row in data_rows:
|
| 1082 |
if not row:
|
| 1083 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1084 |
|
|
|
|
| 1085 |
name = None
|
| 1086 |
-
|
| 1087 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1088 |
if not name:
|
|
|
|
| 1089 |
continue
|
| 1090 |
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
-
|
| 1106 |
-
#
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
-
|
| 1112 |
-
|
| 1113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1114 |
break
|
| 1115 |
|
| 1116 |
-
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1121 |
drivers.append(d)
|
|
|
|
| 1122 |
|
| 1123 |
if drivers:
|
| 1124 |
extracted["drivers_detailed"] = drivers
|
| 1125 |
-
self.log_debug(f"Driver rows extracted
|
| 1126 |
-
|
|
|
|
| 1127 |
|
| 1128 |
def _extract_management_table(self, data_rows: List[List], extracted: Dict, headers: List[str]):
|
| 1129 |
txt = " ".join(str(h) for h in headers).lower()
|
|
@@ -1177,7 +1271,7 @@ class NHVASMerger:
|
|
| 1177 |
|
| 1178 |
# Defensive trimming of trailing uppercase boilerplate or table header noise
|
| 1179 |
candidate = re.sub(
|
| 1180 |
-
r"(ACCREDITATION VEHICLE SUMMARY|AUDIT OBSERVATIONS|NHVAS AUDIT SUMMARY REPORT|STD\s+\d+\.).*$",
|
| 1181 |
"",
|
| 1182 |
candidate,
|
| 1183 |
flags=re.I | re.DOTALL,
|
|
@@ -1189,7 +1283,7 @@ class NHVASMerger:
|
|
| 1189 |
|
| 1190 |
# Extract Accreditation Number / Expiry only if they appear inline in this small block
|
| 1191 |
m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", candidate, flags=re.I)
|
| 1192 |
-
m_exp = re.search(r"\
|
| 1193 |
if m_acc:
|
| 1194 |
acc = re.sub(r"\s+", " ", m_acc.group(1)).strip()
|
| 1195 |
acc = re.sub(r"[^\d]", "", acc) or acc
|
|
@@ -1483,7 +1577,17 @@ class NHVASMerger:
|
|
| 1483 |
|
| 1484 |
# Business summary
|
| 1485 |
if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
|
| 1486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1487 |
|
| 1488 |
# Vehicle summary
|
| 1489 |
if "vehicle_summary" in pdf_extracted:
|
|
@@ -1526,18 +1630,19 @@ class NHVASMerger:
|
|
| 1526 |
)
|
| 1527 |
|
| 1528 |
|
| 1529 |
-
|
| 1530 |
if "drivers_detailed" in pdf_extracted and "Driver / Scheduler Records Examined" in merged:
|
| 1531 |
drivers = pdf_extracted["drivers_detailed"]
|
| 1532 |
-
|
| 1533 |
-
|
| 1534 |
-
|
| 1535 |
-
|
|
|
|
|
|
|
| 1536 |
merged["Driver / Scheduler Records Examined"]["Roster / Schedule / Safe Driving Plan (Date Range)"] = [d.get("roster_schedule","") for d in drivers]
|
| 1537 |
-
merged["Driver / Scheduler Records Examined"]["Fit for Duty Statement Completed (Yes/No)"]
|
| 1538 |
merged["Driver / Scheduler Records Examined"]["Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"] = [d.get("work_diary","") for d in drivers]
|
| 1539 |
|
| 1540 |
-
|
| 1541 |
# --- Print accreditation name (robust, no UnboundLocalError) ---
|
| 1542 |
if "Print accreditation name" in merged:
|
| 1543 |
acc_name = "" # init
|
|
|
|
| 15 |
def _nz(x):
|
| 16 |
return x if isinstance(x, str) and x.strip() else ""
|
| 17 |
|
| 18 |
+
def _fix_ocr_date_noise(date_str: str) -> str:
|
| 19 |
+
"""Clean up OCR date noise and standardize date format."""
|
| 20 |
+
if not date_str:
|
| 21 |
+
return ""
|
| 22 |
+
|
| 23 |
+
# Remove common OCR artifacts
|
| 24 |
+
cleaned = re.sub(r'\s+', ' ', date_str.strip())
|
| 25 |
+
cleaned = re.sub(r'[^\w\s/\-]', '', cleaned)
|
| 26 |
+
|
| 27 |
+
# Try to extract month/year patterns
|
| 28 |
+
month_year_match = re.search(r'([A-Za-z]+)\s+(\d{4})', cleaned)
|
| 29 |
+
if month_year_match:
|
| 30 |
+
return f"{month_year_match.group(1)} {month_year_match.group(2)}"
|
| 31 |
+
|
| 32 |
+
# Try to extract date patterns like "21st October 2022"
|
| 33 |
+
date_match = re.search(r'(\d{1,2})(?:st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})', cleaned)
|
| 34 |
+
if date_match:
|
| 35 |
+
return f"{date_match.group(1)} {date_match.group(2)} {date_match.group(3)}"
|
| 36 |
+
|
| 37 |
+
# Return cleaned version if no specific pattern found
|
| 38 |
+
return cleaned
|
| 39 |
+
|
| 40 |
SUMMARY_SECTIONS = {
|
| 41 |
"MAINTENANCE MANAGEMENT": "Maintenance Management Summary",
|
| 42 |
"MASS MANAGEMENT": "Mass Management Summary",
|
|
|
|
| 792 |
page_num = table.get("page", 0)
|
| 793 |
self.log_debug(f"Processing table on page {page_num} with headers: {headers[:3]}...")
|
| 794 |
|
| 795 |
+
# NEW: Check for single-column Nature of Business table
|
| 796 |
+
if (len(headers) == 1 and
|
| 797 |
+
"nature of the operators business" in str(headers[0]).lower() and
|
| 798 |
+
len(data_rows) > 0 and len(data_rows[0]) > 0):
|
| 799 |
+
|
| 800 |
+
text = str(data_rows[0][0])
|
| 801 |
+
self.log_debug(f"Found Nature of Business table with text: {text[:100]}...")
|
| 802 |
+
|
| 803 |
+
# Extract inline expiry date and accreditation number
|
| 804 |
+
m_exp = re.search(r"\b(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*([0-9\.\/\-]+)", text, flags=re.I)
|
| 805 |
+
m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", text, flags=re.I)
|
| 806 |
+
|
| 807 |
+
if m_exp:
|
| 808 |
+
exp_date = m_exp.group(1).strip()
|
| 809 |
+
extracted.setdefault("business_summary_extras", {})["expiry_date"] = exp_date
|
| 810 |
+
self.log_debug(f"Extracted expiry date: {exp_date}")
|
| 811 |
+
|
| 812 |
+
if m_acc:
|
| 813 |
+
acc_num = m_acc.group(1).strip()
|
| 814 |
+
extracted.setdefault("business_summary_extras", {})["accreditation_number"] = acc_num
|
| 815 |
+
self.log_debug(f"Extracted accreditation number: {acc_num}")
|
| 816 |
+
|
| 817 |
+
# Store the clean text (without the inline date/number)
|
| 818 |
+
clean_text = re.sub(r"\s*(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*[0-9\.\/\-]+", "", text, flags=re.I)
|
| 819 |
+
clean_text = re.sub(r"\s*Accreditation\s*Number[:\s-]*[A-Za-z0-9\s\-\/]+", "", clean_text, flags=re.I)
|
| 820 |
+
extracted["business_summary"] = clean_text.strip()
|
| 821 |
+
continue
|
| 822 |
+
|
| 823 |
# 🔧 NEW: collapse possible multi-line headers once up front
|
| 824 |
collapsed_headers, collapsed_rows = self._collapse_multiline_headers(headers, data_rows)
|
| 825 |
|
|
|
|
| 1102 |
return added
|
| 1103 |
|
| 1104 |
    def _extract_driver_table(self, headers: List[str], data_rows: List[List], extracted: Dict) -> None:
        """Extract driver/scheduler records from a raw PDF table into ``extracted``.

        Rows are located positionally rather than by header matching: the first
        row whose leading cell starts with "1." / "1" marks where real data
        begins; every numbered row ("1.", "2.", ...) after that is scanned
        column-by-column inside fixed windows for each field.  Results are
        stored in place under ``extracted["drivers_detailed"]`` as a list of
        dicts with keys: name, driver_tlif, scheduler_tlif, medical_expiry,
        roster_schedule, fit_for_duty, work_diary.

        NOTE(review): the column windows below (name in 2-5, driver TLIF in
        5-7, scheduler TLIF in 8-11, medical in 11-14, roster in 14-17, fit
        for duty in 17-20, work diary in 20-23) assume a specific flattened
        table layout from the PDF extractor — confirm against representative
        source PDFs.

        Args:
            headers: Table header cells (not consulted; extraction is positional).
            data_rows: Raw table rows, each a list of cell values.
            extracted: Accumulator dict, updated in place when drivers are found.
        """
        drivers = []

        self.log_debug(f"Driver table has {len(data_rows)} rows")

        # Skip header continuation rows - look for the first row that starts with a number
        actual_data_start = 0
        for i, row in enumerate(data_rows):
            if row and str(row[0]).strip().startswith(('1.', '1')):
                actual_data_start = i
                self.log_debug(f"Found actual data starting at row {i}")
                break

        if actual_data_start == 0:
            # Either the data genuinely starts at row 0 or no numbered row was
            # found; the per-row numbered-cell check below filters either way.
            self.log_debug("Warning: Could not find numbered data rows")

        # Process only the actual data rows (skip header continuation rows)
        for row_idx, row in enumerate(data_rows[actual_data_start:], start=actual_data_start):
            if not row:
                continue

            self.log_debug(f"Processing data row {row_idx}: {row}")

            # Check if this is a numbered row (1., 2., etc.)
            first_cell = str(row[0]).strip()
            if not (first_cell.endswith('.') and first_cell[:-1].isdigit()):
                self.log_debug(f"Skipping row {row_idx} - not a numbered data row")
                continue

            # Based on the raw data structure, extract from fixed positions
            name = None
            driver_tlif = ""
            scheduler_tlif = ""
            medical = ""
            roster = ""
            fit_duty = ""
            work_diary = ""

            # Look for name in columns around index 3-4
            for i in range(2, min(6, len(row))):
                candidate = _smart_space(str(row[i]).strip())
                # Reject cells that are clearly not a person's name: short
                # fragments, pure digits, boilerplate words, or numbering cells.
                if (candidate and
                    len(candidate) > 3 and
                    any(c.isalpha() for c in candidate) and
                    candidate.lower() not in ['entry', 'n/a', 'yes', 'no', 'name'] and
                    not candidate.isdigit() and
                    not candidate.endswith('.')):
                    name = candidate
                    self.log_debug(f"Found name at column {i}: {name}")
                    break

            if not name:
                self.log_debug(f"Skipping row {row_idx} - no valid name found")
                continue

            # Extract other fields from approximate positions based on raw data
            # Driver TLIF around column 6
            for i in range(5, min(8, len(row))):
                val = str(row[i]).strip()
                if val and val.lower() in ['yes', 'no']:
                    driver_tlif = val.title()
                    break

            # Scheduler TLIF around column 9
            for i in range(8, min(12, len(row))):
                val = str(row[i]).strip()
                if val and val.lower() in ['yes', 'no']:
                    scheduler_tlif = val.title()
                    break

            # Medical around column 12
            for i in range(11, min(15, len(row))):
                val = _smart_space(str(row[i]).strip())
                if val and val.lower() not in ['', 'entry']:
                    medical = val
                    break

            # Roster around column 15
            for i in range(14, min(18, len(row))):
                val = _smart_space(str(row[i]).strip())
                if val:
                    roster = val
                    break

            # Fit for Duty around column 18
            for i in range(17, min(21, len(row))):
                val = str(row[i]).strip()
                if val and val.lower() in ['yes', 'no']:
                    fit_duty = val.title()
                    break

            # Work Diary around column 21
            for i in range(20, min(len(row), 24)):
                val = _smart_space(str(row[i]).strip())
                if val:
                    work_diary = val
                    break

            d = {
                "name": name,
                "driver_tlif": driver_tlif,
                "scheduler_tlif": scheduler_tlif,
                "medical_expiry": medical,
                "roster_schedule": roster,
                "fit_for_duty": fit_duty,
                "work_diary": work_diary
            }

            drivers.append(d)
            self.log_debug(f"Added driver: {d}")

        if drivers:
            extracted["drivers_detailed"] = drivers
            self.log_debug(f"Driver rows extracted: {len(drivers)}")
        else:
            self.log_debug("No drivers extracted")
|
| 1221 |
|
| 1222 |
def _extract_management_table(self, data_rows: List[List], extracted: Dict, headers: List[str]):
|
| 1223 |
txt = " ".join(str(h) for h in headers).lower()
|
|
|
|
| 1271 |
|
| 1272 |
# Defensive trimming of trailing uppercase boilerplate or table header noise
|
| 1273 |
candidate = re.sub(
|
| 1274 |
+
r"(Mass and Maintenance Expiry Date:|ACCREDITATION DRIVER SUMMARY|ACCREDITATION VEHICLE SUMMARY|AUDIT OBSERVATIONS|NHVAS AUDIT SUMMARY REPORT|STD\s+\d+\.).*$",
|
| 1275 |
"",
|
| 1276 |
candidate,
|
| 1277 |
flags=re.I | re.DOTALL,
|
|
|
|
| 1283 |
|
| 1284 |
# Extract Accreditation Number / Expiry only if they appear inline in this small block
|
| 1285 |
m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", candidate, flags=re.I)
|
| 1286 |
+
m_exp = re.search(r"\b(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*([A-Za-z0-9\s,\/\-\.]+)", candidate, flags=re.I)
|
| 1287 |
if m_acc:
|
| 1288 |
acc = re.sub(r"\s+", " ", m_acc.group(1)).strip()
|
| 1289 |
acc = re.sub(r"[^\d]", "", acc) or acc
|
|
|
|
| 1577 |
|
| 1578 |
# Business summary
|
| 1579 |
if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
|
| 1580 |
+
# Clean the main text by removing either inline date pattern
|
| 1581 |
+
business_text = pdf_extracted["business_summary"]
|
| 1582 |
+
clean_text = re.sub(r"\s*(?:Mass and Maintenance\s+)?Expiry Date[:\s]*[A-Za-z0-9\s,\/\-\.]+", "", business_text, flags=re.I)
|
| 1583 |
+
merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(clean_text)]
|
| 1584 |
+
|
| 1585 |
+
# Override with extracted inline values
|
| 1586 |
+
extras = pdf_extracted.get("business_summary_extras", {})
|
| 1587 |
+
if extras.get("expiry_date"):
|
| 1588 |
+
merged["Nature of the Operators Business (Summary)"]["Expiry Date"] = [extras["expiry_date"]]
|
| 1589 |
+
if extras.get("accreditation_number"):
|
| 1590 |
+
merged["Nature of the Operators Business (Summary)"]["Accreditation Number"] = [extras["accreditation_number"]]
|
| 1591 |
|
| 1592 |
# Vehicle summary
|
| 1593 |
if "vehicle_summary" in pdf_extracted:
|
|
|
|
| 1630 |
)
|
| 1631 |
|
| 1632 |
|
| 1633 |
+
# Complete driver mapping - add these lines:
|
| 1634 |
if "drivers_detailed" in pdf_extracted and "Driver / Scheduler Records Examined" in merged:
|
| 1635 |
drivers = pdf_extracted["drivers_detailed"]
|
| 1636 |
+
|
| 1637 |
+
# Map ALL the driver fields
|
| 1638 |
+
merged["Driver / Scheduler Records Examined"]["Driver / Scheduler Name"] = [d.get("name","") for d in drivers]
|
| 1639 |
+
merged["Driver / Scheduler Records Examined"]["Driver TLIF Course # Completed"] = [d.get("driver_tlif","") for d in drivers]
|
| 1640 |
+
merged["Driver / Scheduler Records Examined"]["Scheduler TLIF Course # Completed"] = [d.get("scheduler_tlif","") for d in drivers]
|
| 1641 |
+
merged["Driver / Scheduler Records Examined"]["Medical Certificates (Current Yes/No) Date of expiry"] = [d.get("medical_expiry","") for d in drivers]
|
| 1642 |
merged["Driver / Scheduler Records Examined"]["Roster / Schedule / Safe Driving Plan (Date Range)"] = [d.get("roster_schedule","") for d in drivers]
|
| 1643 |
+
merged["Driver / Scheduler Records Examined"]["Fit for Duty Statement Completed (Yes/No)"] = [d.get("fit_for_duty","") for d in drivers]
|
| 1644 |
merged["Driver / Scheduler Records Examined"]["Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"] = [d.get("work_diary","") for d in drivers]
|
| 1645 |
|
|
|
|
| 1646 |
# --- Print accreditation name (robust, no UnboundLocalError) ---
|
| 1647 |
if "Print accreditation name" in merged:
|
| 1648 |
acc_name = "" # init
|