Spaces:
Running
Running
Update master_key.py
Browse files- master_key.py +29 -78
master_key.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
# master_key.py
|
| 2 |
"""
|
| 3 |
Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
|
|
|
|
| 4 |
Usage:
|
| 5 |
- Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
|
| 6 |
- The pipeline should:
|
|
@@ -45,83 +46,6 @@ GLOBAL_SETTINGS = {
|
|
| 45 |
]
|
| 46 |
}
|
| 47 |
|
| 48 |
-
# ---------------------------
|
| 49 |
-
# Extra header synonyms for common OCR variants and noisy long phrases
|
| 50 |
-
# ---------------------------
|
| 51 |
-
# NOTE: keys in this mapping should be applied to a *normalized* header key
|
| 52 |
-
# (lowercased, punctuation removed, whitespace collapsed) before lookup.
|
| 53 |
-
# Example normalized keys: "registrationnumber", "subcontractoryesno", "rfssuspensioncertificationna"
|
| 54 |
-
EXTRA_HEADER_SYNONYMS = {
|
| 55 |
-
# Registration / common short variants
|
| 56 |
-
"registrationnumber": "Registration Number",
|
| 57 |
-
"registration number": "Registration Number",
|
| 58 |
-
"registrationno": "Registration Number",
|
| 59 |
-
"reg no": "Registration Number",
|
| 60 |
-
"regno": "Registration Number",
|
| 61 |
-
"registration": "Registration Number",
|
| 62 |
-
"no": "No.",
|
| 63 |
-
|
| 64 |
-
# Roadworthiness / maintenance
|
| 65 |
-
"roadworthinesscertificates": "Roadworthiness Certificates",
|
| 66 |
-
"roadworthiness certificate": "Roadworthiness Certificates",
|
| 67 |
-
"roadworthiness certificates": "Roadworthiness Certificates",
|
| 68 |
-
|
| 69 |
-
"maintenancerecords": "Maintenance Records",
|
| 70 |
-
"maintenance records": "Maintenance Records",
|
| 71 |
-
"triprecords": "Maintenance Records",
|
| 72 |
-
"trip records": "Maintenance Records",
|
| 73 |
-
|
| 74 |
-
"dailychecks": "Daily Checks",
|
| 75 |
-
"daily check": "Daily Checks",
|
| 76 |
-
"daily checks": "Daily Checks",
|
| 77 |
-
|
| 78 |
-
# Faults
|
| 79 |
-
"faultrecordingreporting": "Fault Recording/ Reporting",
|
| 80 |
-
"fault recording reporting": "Fault Recording/ Reporting",
|
| 81 |
-
"fault recording/reporting": "Fault Recording/ Reporting",
|
| 82 |
-
"faultrecording/reportingonsuspensionsystem": "Fault Recording/ Reporting",
|
| 83 |
-
|
| 84 |
-
"faultrepair": "Fault Repair",
|
| 85 |
-
"fault repair": "Fault Repair",
|
| 86 |
-
|
| 87 |
-
# Sub-contractor / compliance long forms
|
| 88 |
-
"subcontractoryesno": "Sub-contractor (Yes/No)",
|
| 89 |
-
"sub-contractor(yesno)": "Sub-contractor (Yes/No)",
|
| 90 |
-
"sub-contractedvehiclesstatementofcompliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
|
| 91 |
-
"sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
|
| 92 |
-
"sub contracted": "Sub-contractor (Yes/No)",
|
| 93 |
-
"sub-contracted": "Sub-contractor (Yes/No)",
|
| 94 |
-
"sub contractor": "Sub-contractor (Yes/No)",
|
| 95 |
-
|
| 96 |
-
# RFS / suspension / weight verification
|
| 97 |
-
"rfs suspension certification": "RFS Suspension Certification # (N/A if not applicable)",
|
| 98 |
-
"rfs suspension certification # (n/a if not applicable)": "RFS Suspension Certification # (N/A if not applicable)",
|
| 99 |
-
"weightverificationrecords": "Weight Verification Records (Date Range)",
|
| 100 |
-
"weight verification records": "Weight Verification Records (Date Range)",
|
| 101 |
-
"suspensionsystemmaintenance": "Suspension System Maintenance (Date Range)",
|
| 102 |
-
"suspension system maintenance": "Suspension System Maintenance (Date Range)",
|
| 103 |
-
|
| 104 |
-
# NHVR / auditor registration
|
| 105 |
-
"nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
|
| 106 |
-
"nhvr auditor registration number": "NHVR or Exemplar Global Auditor Registration Number",
|
| 107 |
-
"nhvr auditor reg no": "NHVR or Exemplar Global Auditor Registration Number",
|
| 108 |
-
|
| 109 |
-
# Print/Accreditation
|
| 110 |
-
"printname": "Print Name",
|
| 111 |
-
"print accreditation name": "(print accreditation name)",
|
| 112 |
-
|
| 113 |
-
# Operator Declaration/Position short forms
|
| 114 |
-
"positiontitle": "Position Title",
|
| 115 |
-
"position": "Position Title",
|
| 116 |
-
|
| 117 |
-
# Misc helpful fallbacks
|
| 118 |
-
"details": "DETAILS",
|
| 119 |
-
"management": "Management",
|
| 120 |
-
"maintenance management": "MAINTENANCE MANAGEMENT",
|
| 121 |
-
"mass management": "MASS MANAGEMENT",
|
| 122 |
-
"fatigue management": "FATIGUE MANAGEMENT"
|
| 123 |
-
}
|
| 124 |
-
|
| 125 |
# ---------------------------
|
| 126 |
# Table schemas
|
| 127 |
# ---------------------------
|
|
@@ -576,4 +500,31 @@ PARAGRAPH_PATTERNS = {
|
|
| 576 |
"date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
|
| 577 |
# extra patterns to help skip/identify date-like strings in name fields
|
| 578 |
"date_like": GLOBAL_SETTINGS["date_like_pattern"]
|
| 579 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# master_key.py
|
| 2 |
"""
|
| 3 |
Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
|
| 4 |
+
|
| 5 |
Usage:
|
| 6 |
- Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
|
| 7 |
- The pipeline should:
|
|
|
|
| 46 |
]
|
| 47 |
}
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
# ---------------------------
|
| 50 |
# Table schemas
|
| 51 |
# ---------------------------
|
|
|
|
| 500 |
"date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
|
| 501 |
# extra patterns to help skip/identify date-like strings in name fields
|
| 502 |
"date_like": GLOBAL_SETTINGS["date_like_pattern"]
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
# ---------------------------
|
| 506 |
+
# Helpful hints for pipeline implementer (non-executable comments)
|
| 507 |
+
# ---------------------------
|
| 508 |
+
# 1) Matching order recommendation (pipeline):
|
| 509 |
+
# - Try exact qualified keys in schema['preferred_json_keys']
|
| 510 |
+
# - Try exact label match against schema['labels'] and schema['label_synonyms']
|
| 511 |
+
# - Apply normalization rules from GLOBAL_SETTINGS then try clean match
|
| 512 |
+
# - Fuzzy match using GLOBAL_SETTINGS['fuzzy_thresholds'] keyed by schema priority
|
| 513 |
+
# - If schema has 'fallback_keys', try them last
|
| 514 |
+
#
|
| 515 |
+
# 2) Date-safety: for any schema field listed under 'skip_if_date_like', verify that the JSON value
|
| 516 |
+
# or cell content is not date-like (use PARAGRAPH_PATTERNS['date_like']) before replacing.
|
| 517 |
+
#
|
| 518 |
+
# 3) Multi-value cells: If schema contains 'split_on', join JSON list items using '\n'
|
| 519 |
+
# or split a single JSON string on these delimiters to create separate lines.
|
| 520 |
+
#
|
| 521 |
+
# 4) OCR repairs: run GLOBAL_SETTINGS['ocr_repair_rules'] on extracted header text
|
| 522 |
+
# before attempting matching (fix common OCR confusion).
|
| 523 |
+
#
|
| 524 |
+
# 5) Processed flags: keep a per-table attribute (e.g. table._processed_operator_declaration = True)
|
| 525 |
+
# to avoid running two different handlers on the same Operator Declaration table.
|
| 526 |
+
#
|
| 527 |
+
# 6) Tuning: If a schema is consistently mis-matched, add the exact observed header string
|
| 528 |
+
# into schema['label_synonyms'] for deterministic matching.
|
| 529 |
+
#
|
| 530 |
+
# End of master key
|