Spaces:
Running
Running
Update master_key.py
Browse files - master_key.py +78 -29
master_key.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# master_key.py
|
| 2 |
"""
|
| 3 |
Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
|
| 4 |
-
|
| 5 |
Usage:
|
| 6 |
- Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
|
| 7 |
- The pipeline should:
|
|
@@ -46,6 +45,83 @@ GLOBAL_SETTINGS = {
|
|
| 46 |
]
|
| 47 |
}
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
# ---------------------------
|
| 50 |
# Table schemas
|
| 51 |
# ---------------------------
|
|
@@ -500,31 +576,4 @@ PARAGRAPH_PATTERNS = {
|
|
| 500 |
"date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
|
| 501 |
# extra patterns to help skip/identify date-like strings in name fields
|
| 502 |
"date_like": GLOBAL_SETTINGS["date_like_pattern"]
|
| 503 |
-
}
|
| 504 |
-
|
| 505 |
-
# ---------------------------
|
| 506 |
-
# Helpful hints for pipeline implementer (non-executable comments)
|
| 507 |
-
# ---------------------------
|
| 508 |
-
# 1) Matching order recommendation (pipeline):
|
| 509 |
-
# - Try exact qualified keys in schema['preferred_json_keys']
|
| 510 |
-
# - Try exact label match against schema['labels'] and schema['label_synonyms']
|
| 511 |
-
# - Apply normalization rules from GLOBAL_SETTINGS then try clean match
|
| 512 |
-
# - Fuzzy match using GLOBAL_SETTINGS['fuzzy_thresholds'] keyed by schema priority
|
| 513 |
-
# - If schema has 'fallback_keys', try them last
|
| 514 |
-
#
|
| 515 |
-
# 2) Date-safety: for any schema field listed under 'skip_if_date_like', verify JSON value
|
| 516 |
-
# or cell content is not date-like (use PARAGRAPH_PATTERNS['date_like']) before replacing.
|
| 517 |
-
#
|
| 518 |
-
# 3) Multi-value cells: If schema contains 'split_on', join JSON list items using '\n'
|
| 519 |
-
# or split a single JSON string on these delimiters to create separate lines.
|
| 520 |
-
#
|
| 521 |
-
# 4) OCR repairs: run GLOBAL_SETTINGS['ocr_repair_rules'] on extracted header text
|
| 522 |
-
# before attempting matching (fix common OCR confusion).
|
| 523 |
-
#
|
| 524 |
-
# 5) Processed flags: keep per-table attribute (e.g. table._processed_operator_declaration = True)
|
| 525 |
-
# to avoid running two different handlers on the same Operator Declaration table.
|
| 526 |
-
#
|
| 527 |
-
# 6) Tuning: If a schema is consistently mis-matched, add the exact observed header string
|
| 528 |
-
# into schema['label_synonyms'] for deterministic matching.
|
| 529 |
-
#
|
| 530 |
-
# End of master key
|
|
|
|
| 1 |
# master_key.py
|
| 2 |
"""
|
| 3 |
Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
|
|
|
|
| 4 |
Usage:
|
| 5 |
- Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
|
| 6 |
- The pipeline should:
|
|
|
|
| 45 |
]
|
| 46 |
}
|
| 47 |
|
| 48 |
+
# ---------------------------
|
| 49 |
+
# Extra header synonyms for common OCR variants and noisy long phrases
|
| 50 |
+
# ---------------------------
|
| 51 |
+
# NOTE: keys in this mapping should be applied to a *normalized* header key
|
| 52 |
+
# (lowercased, punctuation removed, whitespace collapsed) before lookup.
|
| 53 |
+
# Example normalized keys: "registrationnumber", "subcontractoryesno", "rfssuspensioncertificationn/a"
# (the mapping below also lists variants that keep spaces, "/", "-" or parentheses).
|
| 54 |
+
EXTRA_HEADER_SYNONYMS = {
|
| 55 |
+
# Registration / common short variants
|
| 56 |
+
"registrationnumber": "Registration Number",
|
| 57 |
+
"registration number": "Registration Number",
|
| 58 |
+
"registrationno": "Registration Number",
|
| 59 |
+
"reg no": "Registration Number",
|
| 60 |
+
"regno": "Registration Number",
|
| 61 |
+
"registration": "Registration Number",
|
| 62 |
+
"no": "No.",
|
| 63 |
+
|
| 64 |
+
# Roadworthiness / maintenance
|
| 65 |
+
"roadworthinesscertificates": "Roadworthiness Certificates",
|
| 66 |
+
"roadworthiness certificate": "Roadworthiness Certificates",
|
| 67 |
+
"roadworthiness certificates": "Roadworthiness Certificates",
|
| 68 |
+
|
| 69 |
+
"maintenancerecords": "Maintenance Records",
|
| 70 |
+
"maintenance records": "Maintenance Records",
|
| 71 |
+
"triprecords": "Maintenance Records",
|
| 72 |
+
"trip records": "Maintenance Records",
|
| 73 |
+
|
| 74 |
+
"dailychecks": "Daily Checks",
|
| 75 |
+
"daily check": "Daily Checks",
|
| 76 |
+
"daily checks": "Daily Checks",
|
| 77 |
+
|
| 78 |
+
# Faults
|
| 79 |
+
"faultrecordingreporting": "Fault Recording/ Reporting",
|
| 80 |
+
"fault recording reporting": "Fault Recording/ Reporting",
|
| 81 |
+
"fault recording/reporting": "Fault Recording/ Reporting",
|
| 82 |
+
"faultrecording/reportingonsuspensionsystem": "Fault Recording/ Reporting",
|
| 83 |
+
|
| 84 |
+
"faultrepair": "Fault Repair",
|
| 85 |
+
"fault repair": "Fault Repair",
|
| 86 |
+
|
| 87 |
+
# Sub-contractor / compliance long forms
|
| 88 |
+
"subcontractoryesno": "Sub-contractor (Yes/No)",
|
| 89 |
+
"sub-contractor(yesno)": "Sub-contractor (Yes/No)",
|
| 90 |
+
"sub-contractedvehiclesstatementofcompliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
|
| 91 |
+
"sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
|
| 92 |
+
"sub contracted": "Sub-contractor (Yes/No)",
|
| 93 |
+
"sub-contracted": "Sub-contractor (Yes/No)",
|
| 94 |
+
"sub contractor": "Sub-contractor (Yes/No)",
|
| 95 |
+
|
| 96 |
+
# RFS / suspension / weight verification
|
| 97 |
+
"rfs suspension certification": "RFS Suspension Certification # (N/A if not applicable)",
|
| 98 |
+
"rfs suspension certification # (n/a if not applicable)": "RFS Suspension Certification # (N/A if not applicable)",
|
| 99 |
+
"weightverificationrecords": "Weight Verification Records (Date Range)",
|
| 100 |
+
"weight verification records": "Weight Verification Records (Date Range)",
|
| 101 |
+
"suspensionsystemmaintenance": "Suspension System Maintenance (Date Range)",
|
| 102 |
+
"suspension system maintenance": "Suspension System Maintenance (Date Range)",
|
| 103 |
+
|
| 104 |
+
# NHVR / auditor registration
|
| 105 |
+
"nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
|
| 106 |
+
"nhvr auditor registration number": "NHVR or Exemplar Global Auditor Registration Number",
|
| 107 |
+
"nhvr auditor reg no": "NHVR or Exemplar Global Auditor Registration Number",
|
| 108 |
+
|
| 109 |
+
# Print/Accreditation
|
| 110 |
+
"printname": "Print Name",
|
| 111 |
+
"print accreditation name": "(print accreditation name)",
|
| 112 |
+
|
| 113 |
+
# Operator Declaration/Position short forms
|
| 114 |
+
"positiontitle": "Position Title",
|
| 115 |
+
"position": "Position Title",
|
| 116 |
+
|
| 117 |
+
# Misc helpful fallbacks
|
| 118 |
+
"details": "DETAILS",
|
| 119 |
+
"management": "Management",
|
| 120 |
+
"maintenance management": "MAINTENANCE MANAGEMENT",
|
| 121 |
+
"mass management": "MASS MANAGEMENT",
|
| 122 |
+
"fatigue management": "FATIGUE MANAGEMENT"
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
# ---------------------------
|
| 126 |
# Table schemas
|
| 127 |
# ---------------------------
|
|
|
|
| 576 |
"date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
|
| 577 |
# extra patterns to help skip/identify date-like strings in name fields
|
| 578 |
"date_like": GLOBAL_SETTINGS["date_like_pattern"]
|
| 579 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|