Shami96 committed
Commit d053da2 · verified · 1 Parent(s): 8df3e10

Update update_docx_with_pdf.py

Files changed (1)
update_docx_with_pdf.py  +173 -68
update_docx_with_pdf.py CHANGED
@@ -15,6 +15,28 @@ from collections import OrderedDict # <-- add this
 def _nz(x):
     return x if isinstance(x, str) and x.strip() else ""
 
+def _fix_ocr_date_noise(date_str: str) -> str:
+    """Clean up OCR date noise and standardize date format."""
+    if not date_str:
+        return ""
+
+    # Remove common OCR artifacts
+    cleaned = re.sub(r'\s+', ' ', date_str.strip())
+    cleaned = re.sub(r'[^\w\s/\-]', '', cleaned)
+
+    # Try to extract month/year patterns
+    month_year_match = re.search(r'([A-Za-z]+)\s+(\d{4})', cleaned)
+    if month_year_match:
+        return f"{month_year_match.group(1)} {month_year_match.group(2)}"
+
+    # Try to extract date patterns like "21st October 2022"
+    date_match = re.search(r'(\d{1,2})(?:st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})', cleaned)
+    if date_match:
+        return f"{date_match.group(1)} {date_match.group(2)} {date_match.group(3)}"
+
+    # Return cleaned version if no specific pattern found
+    return cleaned
+
 SUMMARY_SECTIONS = {
     "MAINTENANCE MANAGEMENT": "Maintenance Management Summary",
     "MASS MANAGEMENT": "Mass Management Summary",
@@ -770,6 +792,34 @@ class NHVASMerger:
             page_num = table.get("page", 0)
             self.log_debug(f"Processing table on page {page_num} with headers: {headers[:3]}...")
 
+            # NEW: Check for single-column Nature of Business table
+            if (len(headers) == 1 and
+                    "nature of the operators business" in str(headers[0]).lower() and
+                    len(data_rows) > 0 and len(data_rows[0]) > 0):
+
+                text = str(data_rows[0][0])
+                self.log_debug(f"Found Nature of Business table with text: {text[:100]}...")
+
+                # Extract inline expiry date and accreditation number
+                m_exp = re.search(r"\b(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*([0-9\.\/\-]+)", text, flags=re.I)
+                m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", text, flags=re.I)
+
+                if m_exp:
+                    exp_date = m_exp.group(1).strip()
+                    extracted.setdefault("business_summary_extras", {})["expiry_date"] = exp_date
+                    self.log_debug(f"Extracted expiry date: {exp_date}")
+
+                if m_acc:
+                    acc_num = m_acc.group(1).strip()
+                    extracted.setdefault("business_summary_extras", {})["accreditation_number"] = acc_num
+                    self.log_debug(f"Extracted accreditation number: {acc_num}")
+
+                # Store the clean text (without the inline date/number)
+                clean_text = re.sub(r"\s*(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*[0-9\.\/\-]+", "", text, flags=re.I)
+                clean_text = re.sub(r"\s*Accreditation\s*Number[:\s-]*[A-Za-z0-9\s\-\/]+", "", clean_text, flags=re.I)
+                extracted["business_summary"] = clean_text.strip()
+                continue
+
             # 🔧 NEW: collapse possible multi-line headers once up front
             collapsed_headers, collapsed_rows = self._collapse_multiline_headers(headers, data_rows)
 
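For reference, the two inline patterns in the new single-column branch behave like this on a made-up cell value (the operator text below is illustrative, not taken from any real audit report):

    import re

    text = ("General freight operator running B-doubles between regional depots. "
            "Mass and Maintenance Expiry Date: 21.10.2025 Accreditation Number: 512345")

    m_exp = re.search(r"\b(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*([0-9\.\/\-]+)", text, flags=re.I)
    m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", text, flags=re.I)

    print(m_exp.group(1))  # -> "21.10.2025"
    print(m_acc.group(1))  # -> "512345"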
 
@@ -1052,78 +1102,122 @@ class NHVASMerger:
         return added
 
     def _extract_driver_table(self, headers: List[str], data_rows: List[List], extracted: Dict):
-        """Header-driven extraction for Driver / Scheduler Records."""
+        """Enhanced header-driven extraction for Driver / Scheduler Records."""
         drivers = []
-        ch = [_canon_header(h) for h in headers or []]
-
-        # helpers
-        def find_col(needles: list[str]) -> Optional[int]:
-            for i, h in enumerate(ch):
-                if any(n in h for n in needles):
-                    return i
-            return None
-
-        def find_col_rx(patterns: list[str]) -> Optional[int]:
-            for i, h in enumerate(ch):
-                if any(re.search(p, h) for p in patterns):
-                    return i
-            return None
-
-        name_idx   = find_col_rx([r"\bdriver\s*/\s*scheduler\s*name\b",
-                                  r"\bdriver\s+name\b", r"\bscheduler\s+name\b", r"\bname\b"])
-        tlif_d_idx = find_col(["driver tlif"])
-        tlif_s_idx = find_col(["scheduler tlif"])
-        medical_idx= find_col(["medical", "expiry"])
-        roster_idx = find_col_rx([r"\broster\b", r"\bsafe\s+driving\s+plan\b", r"\bschedule\b(?!r\b)"])
-        fit_idx    = find_col(["fit for duty"])
-        diary_idx  = find_col(["work diary", "electronic work diary", "page numbers"])
-
-        for row in data_rows:
+
+        self.log_debug(f"Driver table has {len(data_rows)} rows")
+
+        # Skip header continuation rows - look for the first row that starts with a number
+        actual_data_start = 0
+        for i, row in enumerate(data_rows):
+            if row and str(row[0]).strip().startswith(('1.', '1')):
+                actual_data_start = i
+                self.log_debug(f"Found actual data starting at row {i}")
+                break
+
+        if actual_data_start == 0:
+            self.log_debug("Warning: Could not find numbered data rows")
+
+        # Process only the actual data rows (skip header continuation rows)
+        for row_idx, row in enumerate(data_rows[actual_data_start:], start=actual_data_start):
             if not row:
                 continue
+
+            self.log_debug(f"Processing data row {row_idx}: {row}")
+
+            # Check if this is a numbered row (1., 2., etc.)
+            first_cell = str(row[0]).strip()
+            if not (first_cell.endswith('.') and first_cell[:-1].isdigit()):
+                self.log_debug(f"Skipping row {row_idx} - not a numbered data row")
+                continue
 
+            # Based on the raw data structure, extract from fixed positions
             name = None
-            if name_idx is not None and name_idx < len(row):
-                name = _smart_space(str(row[name_idx]).strip())
+            driver_tlif = ""
+            scheduler_tlif = ""
+            medical = ""
+            roster = ""
+            fit_duty = ""
+            work_diary = ""
+
+            # Look for name in columns around index 3-4
+            for i in range(2, min(6, len(row))):
+                candidate = _smart_space(str(row[i]).strip())
+                if (candidate and
+                        len(candidate) > 3 and
+                        any(c.isalpha() for c in candidate) and
+                        candidate.lower() not in ['entry', 'n/a', 'yes', 'no', 'name'] and
+                        not candidate.isdigit() and
+                        not candidate.endswith('.')):
+                    name = candidate
+                    self.log_debug(f"Found name at column {i}: {name}")
+                    break
+
             if not name:
+                self.log_debug(f"Skipping row {row_idx} - no valid name found")
                 continue
 
-            d = {"name": name}
-
-            if tlif_d_idx is not None and tlif_d_idx < len(row):
-                d["driver_tlif"] = _smart_space(str(row[tlif_d_idx]).strip())
-            if tlif_s_idx is not None and tlif_s_idx < len(row):
-                d["scheduler_tlif"] = _smart_space(str(row[tlif_s_idx]).strip())
-            if medical_idx is not None and medical_idx < len(row):
-                d["medical_expiry"] = _smart_space(str(row[medical_idx]).strip())
-
-            # Roster/Schedule/SDP: prefer the detected column; accept only date/range-like, not the name
-            if roster_idx is not None and roster_idx < len(row):
-                raw_roster = _smart_space(str(row[roster_idx]).strip())
-                if raw_roster and re.search(r"[0-9/–-]", raw_roster) and raw_roster.lower() != name.lower():
-                    d["roster_schedule"] = raw_roster
-
-            # Fallback: scan the row for the first date/range-like cell that's not the name cell
-            if "roster_schedule" not in d:
-                for j, cell in enumerate(row):
-                    if j == name_idx:
-                        continue
-                    s = _smart_space(str(cell).strip())
-                    if s and re.search(r"[0-9/–-]", s) and s.lower() != name.lower():
-                        d["roster_schedule"] = s
+            # Extract other fields from approximate positions based on raw data
+            # Driver TLIF around column 6
+            for i in range(5, min(8, len(row))):
+                val = str(row[i]).strip()
+                if val and val.lower() in ['yes', 'no']:
+                    driver_tlif = val.title()
+                    break
+
+            # Scheduler TLIF around column 9
+            for i in range(8, min(12, len(row))):
+                val = str(row[i]).strip()
+                if val and val.lower() in ['yes', 'no']:
+                    scheduler_tlif = val.title()
+                    break
+
+            # Medical around column 12
+            for i in range(11, min(15, len(row))):
+                val = _smart_space(str(row[i]).strip())
+                if val and val.lower() not in ['', 'entry']:
+                    medical = val
+                    break
+
+            # Roster around column 15
+            for i in range(14, min(18, len(row))):
+                val = _smart_space(str(row[i]).strip())
+                if val:
+                    roster = val
+                    break
+
+            # Fit for Duty around column 18
+            for i in range(17, min(21, len(row))):
+                val = str(row[i]).strip()
+                if val and val.lower() in ['yes', 'no']:
+                    fit_duty = val.title()
+                    break
+
+            # Work Diary around column 21
+            for i in range(20, min(len(row), 24)):
+                val = _smart_space(str(row[i]).strip())
+                if val:
+                    work_diary = val
                     break
 
-            if fit_idx is not None and fit_idx < len(row):
-                d["fit_for_duty"] = _smart_space(str(row[fit_idx]).strip())
-            if diary_idx is not None and diary_idx < len(row):
-                d["work_diary"] = _smart_space(str(row[diary_idx]).strip())
-
+            d = {
+                "name": name,
+                "driver_tlif": driver_tlif,
+                "scheduler_tlif": scheduler_tlif,
+                "medical_expiry": medical,
+                "roster_schedule": roster,
+                "fit_for_duty": fit_duty,
+                "work_diary": work_diary
+            }
+
             drivers.append(d)
+            self.log_debug(f"Added driver: {d}")
 
         if drivers:
             extracted["drivers_detailed"] = drivers
-            self.log_debug(f"Driver rows extracted (header-based): {len(drivers)}")
-
+            self.log_debug(f"Driver rows extracted: {len(drivers)}")
+        else:
+            self.log_debug("No drivers extracted")
 
     def _extract_management_table(self, data_rows: List[List], extracted: Dict, headers: List[str]):
         txt = " ".join(str(h) for h in headers).lower()
@@ -1177,7 +1271,7 @@ class NHVASMerger:
 
         # Defensive trimming of trailing uppercase boilerplate or table header noise
         candidate = re.sub(
-            r"(ACCREDITATION VEHICLE SUMMARY|AUDIT OBSERVATIONS|NHVAS AUDIT SUMMARY REPORT|STD\s+\d+\.).*$",
+            r"(Mass and Maintenance Expiry Date:|ACCREDITATION DRIVER SUMMARY|ACCREDITATION VEHICLE SUMMARY|AUDIT OBSERVATIONS|NHVAS AUDIT SUMMARY REPORT|STD\s+\d+\.).*$",
            "",
            candidate,
            flags=re.I | re.DOTALL,
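
The only change here is the trimming pattern, which now also cuts at the driver-summary heading and at an inline "Mass and Maintenance Expiry Date:" marker. A minimal check on an illustrative string:

    import re

    candidate = ("Operator runs regional livestock and general freight transport. "
                 "Mass and Maintenance Expiry Date: 21.10.2025 ACCREDITATION DRIVER SUMMARY ...")

    trimmed = re.sub(
        r"(Mass and Maintenance Expiry Date:|ACCREDITATION DRIVER SUMMARY|ACCREDITATION VEHICLE SUMMARY|AUDIT OBSERVATIONS|NHVAS AUDIT SUMMARY REPORT|STD\s+\d+\.).*$",
        "",
        candidate,
        flags=re.I | re.DOTALL,
    )
    print(trimmed)  # -> "Operator runs regional livestock and general freight transport. "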
@@ -1189,7 +1283,7 @@ class NHVASMerger:
 
         # Extract Accreditation Number / Expiry only if they appear inline in this small block
         m_acc = re.search(r"\bAccreditation\s*Number[:\s-]*([A-Za-z0-9\s\-\/]+)", candidate, flags=re.I)
-        m_exp = re.search(r"\bExpiry\s*Date[:\s-]*([A-Za-z0-9\s,\/\-]+)", candidate, flags=re.I)
+        m_exp = re.search(r"\b(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*([A-Za-z0-9\s,\/\-\.]+)", candidate, flags=re.I)
         if m_acc:
             acc = re.sub(r"\s+", " ", m_acc.group(1)).strip()
             acc = re.sub(r"[^\d]", "", acc) or acc
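
The widened expiry pattern matters mostly for dotted dates: the old character class stopped at the first ".". A small before/after on an illustrative value:

    import re

    candidate = "Mass and Maintenance Expiry Date: 21.10.2025"

    old_match = re.search(r"\bExpiry\s*Date[:\s-]*([A-Za-z0-9\s,\/\-]+)", candidate, flags=re.I)
    new_match = re.search(r"\b(?:Mass and Maintenance\s+)?Expiry\s*Date[:\s-]*([A-Za-z0-9\s,\/\-\.]+)", candidate, flags=re.I)

    print(old_match.group(1))  # -> "21"          (stops at the first dot)
    print(new_match.group(1))  # -> "21.10.2025"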
@@ -1483,7 +1577,17 @@ class NHVASMerger:
 
         # Business summary
         if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
-            merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(pdf_extracted["business_summary"])]
+            # Clean the main text by removing either inline date pattern
+            business_text = pdf_extracted["business_summary"]
+            clean_text = re.sub(r"\s*(?:Mass and Maintenance\s+)?Expiry Date[:\s]*[A-Za-z0-9\s,\/\-\.]+", "", business_text, flags=re.I)
+            merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(clean_text)]
+
+            # Override with extracted inline values
+            extras = pdf_extracted.get("business_summary_extras", {})
+            if extras.get("expiry_date"):
+                merged["Nature of the Operators Business (Summary)"]["Expiry Date"] = [extras["expiry_date"]]
+            if extras.get("accreditation_number"):
+                merged["Nature of the Operators Business (Summary)"]["Accreditation Number"] = [extras["accreditation_number"]]
 
         # Vehicle summary
         if "vehicle_summary" in pdf_extracted:
@@ -1526,18 +1630,19 @@ class NHVASMerger:
             )
 
 
-        # replace the whole Drivers/Scheduler block with:
+        # Complete driver mapping - add these lines:
         if "drivers_detailed" in pdf_extracted and "Driver / Scheduler Records Examined" in merged:
             drivers = pdf_extracted["drivers_detailed"]
-
-            def _looks_like_range(s):
-                return bool(re.search(r"[0-9]{1,2}[/-]", s or ""))
-
+
+            # Map ALL the driver fields
+            merged["Driver / Scheduler Records Examined"]["Driver / Scheduler Name"] = [d.get("name","") for d in drivers]
+            merged["Driver / Scheduler Records Examined"]["Driver TLIF Course # Completed"] = [d.get("driver_tlif","") for d in drivers]
+            merged["Driver / Scheduler Records Examined"]["Scheduler TLIF Course # Completed"] = [d.get("scheduler_tlif","") for d in drivers]
+            merged["Driver / Scheduler Records Examined"]["Medical Certificates (Current Yes/No) Date of expiry"] = [d.get("medical_expiry","") for d in drivers]
             merged["Driver / Scheduler Records Examined"]["Roster / Schedule / Safe Driving Plan (Date Range)"] = [d.get("roster_schedule","") for d in drivers]
-            merged["Driver / Scheduler Records Examined"]["Fit for Duty Statement Completed (Yes/No)"] = [d.get("fit_for_duty","") for d in drivers]
+            merged["Driver / Scheduler Records Examined"]["Fit for Duty Statement Completed (Yes/No)"] = [d.get("fit_for_duty","") for d in drivers]
             merged["Driver / Scheduler Records Examined"]["Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"] = [d.get("work_diary","") for d in drivers]
 
-
         # --- Print accreditation name (robust, no UnboundLocalError) ---
         if "Print accreditation name" in merged:
             acc_name = "" # init
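
With the extended mapping, each per-driver dict feeds one value into every column list, so the lists stay index-aligned row by row. A small sketch with two illustrative drivers:

    drivers = [
        {"name": "Sample Driver", "driver_tlif": "Yes", "fit_for_duty": "Yes"},
        {"name": "Another Driver", "driver_tlif": "No"},   # missing keys fall back to ""
    ]

    names = [d.get("name", "") for d in drivers]          # ["Sample Driver", "Another Driver"]
    tlif  = [d.get("driver_tlif", "") for d in drivers]   # ["Yes", "No"]
    fit   = [d.get("fit_for_duty", "") for d in drivers]  # ["Yes", ""]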
 