Shami96 committed on
Commit
9880bcc
·
verified ·
1 Parent(s): f4b6b63

Update master_key.py

Browse files
Files changed (1) hide show
  1. master_key.py +78 -29
master_key.py CHANGED
@@ -1,7 +1,6 @@
1
  # master_key.py
2
  """
3
  Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
4
-
5
  Usage:
6
  - Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
7
  - The pipeline should:
@@ -46,6 +45,83 @@ GLOBAL_SETTINGS = {
46
  ]
47
  }
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # ---------------------------
50
  # Table schemas
51
  # ---------------------------
@@ -500,31 +576,4 @@ PARAGRAPH_PATTERNS = {
500
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
501
  # extra patterns to help skip/identify date-like strings in name fields
502
  "date_like": GLOBAL_SETTINGS["date_like_pattern"]
503
- }
504
-
505
- # ---------------------------
506
- # Helpful hints for pipeline implementer (non-executable comments)
507
- # ---------------------------
508
- # 1) Matching order recommendation (pipeline):
509
- # - Try exact qualified keys in schema['preferred_json_keys']
510
- # - Try exact label match against schema['labels'] and schema['label_synonyms']
511
- # - Apply normalization rules from GLOBAL_SETTINGS then try clean match
512
- # - Fuzzy match using GLOBAL_SETTINGS['fuzzy_thresholds'] keyed by schema priority
513
- # - If schema has 'fallback_keys', try them last
514
- #
515
- # 2) Date-safety: for any schema field listed under 'skip_if_date_like', verify JSON value
516
- # or cell content is not date-like (use PARAGRAPH_PATTERNS['date_like']) before replacing.
517
- #
518
- # 3) Multi-value cells: If schema contains 'split_on', join JSON list items using '\n'
519
- # or split a single JSON string on these delimiters to create separate lines.
520
- #
521
- # 4) OCR repairs: run GLOBAL_SETTINGS['ocr_repair_rules'] on extracted header text
522
- # before attempting matching (fix common OCR confusion).
523
- #
524
- # 5) Processed flags: keep per-table attribute (e.g. table._processed_operator_declaration = True)
525
- # to avoid running two different handlers on the same Operator Declaration table.
526
- #
527
- # 6) Tuning: If a schema is consistently mis-matched, add the exact observed header string
528
- # into schema['label_synonyms'] for deterministic matching.
529
- #
530
- # End of master key
 
1
  # master_key.py
2
  """
3
  Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
 
4
  Usage:
5
  - Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
6
  - The pipeline should:
 
45
  ]
46
  }
47
 
48
+ # ---------------------------
49
+ # Extra header synonyms for common OCR variants and noisy long phrases
50
+ # ---------------------------
51
+ # NOTE: keys in this mapping should be applied to a *normalized* header key
52
+ # (lowercased, punctuation removed, whitespace collapsed) before lookup.
53
+ # Example normalized keys: "registrationnumber", "subcontractoryesno", "rfssuspensioncertificationna"
54
+ EXTRA_HEADER_SYNONYMS = {
55
+ # Registration / common short variants
56
+ "registrationnumber": "Registration Number",
57
+ "registration number": "Registration Number",
58
+ "registrationno": "Registration Number",
59
+ "reg no": "Registration Number",
60
+ "regno": "Registration Number",
61
+ "registration": "Registration Number",
62
+ "no": "No.",
63
+
64
+ # Roadworthiness / maintenance
65
+ "roadworthinesscertificates": "Roadworthiness Certificates",
66
+ "roadworthiness certificate": "Roadworthiness Certificates",
67
+ "roadworthiness certificates": "Roadworthiness Certificates",
68
+
69
+ "maintenancerecords": "Maintenance Records",
70
+ "maintenance records": "Maintenance Records",
71
+ "triprecords": "Maintenance Records",
72
+ "trip records": "Maintenance Records",
73
+
74
+ "dailychecks": "Daily Checks",
75
+ "daily check": "Daily Checks",
76
+ "daily checks": "Daily Checks",
77
+
78
+ # Faults
79
+ "faultrecordingreporting": "Fault Recording/ Reporting",
80
+ "fault recording reporting": "Fault Recording/ Reporting",
81
+ "fault recording/reporting": "Fault Recording/ Reporting",
82
+ "faultrecording/reportingonsuspensionsystem": "Fault Recording/ Reporting",
83
+
84
+ "faultrepair": "Fault Repair",
85
+ "fault repair": "Fault Repair",
86
+
87
+ # Sub-contractor / compliance long forms
88
+ "subcontractoryesno": "Sub-contractor (Yes/No)",
89
+ "sub-contractor(yesno)": "Sub-contractor (Yes/No)",
90
+ "sub-contractedvehiclesstatementofcompliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
91
+ "sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
92
+ "sub contracted": "Sub-contractor (Yes/No)",
93
+ "sub-contracted": "Sub-contractor (Yes/No)",
94
+ "sub contractor": "Sub-contractor (Yes/No)",
95
+
96
+ # RFS / suspension / weight verification
97
+ "rfs suspension certification": "RFS Suspension Certification # (N/A if not applicable)",
98
+ "rfs suspension certification # (n/a if not applicable)": "RFS Suspension Certification # (N/A if not applicable)",
99
+ "weightverificationrecords": "Weight Verification Records (Date Range)",
100
+ "weight verification records": "Weight Verification Records (Date Range)",
101
+ "suspensionsystemmaintenance": "Suspension System Maintenance (Date Range)",
102
+ "suspension system maintenance": "Suspension System Maintenance (Date Range)",
103
+
104
+ # NHVR / auditor registration
105
+ "nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
106
+ "nhvr auditor registration number": "NHVR or Exemplar Global Auditor Registration Number",
107
+ "nhvr auditor reg no": "NHVR or Exemplar Global Auditor Registration Number",
108
+
109
+ # Print/Accreditation
110
+ "printname": "Print Name",
111
+ "print accreditation name": "(print accreditation name)",
112
+
113
+ # Operator Declaration/Position short forms
114
+ "positiontitle": "Position Title",
115
+ "position": "Position Title",
116
+
117
+ # Misc helpful fallbacks
118
+ "details": "DETAILS",
119
+ "management": "Management",
120
+ "maintenance management": "MAINTENANCE MANAGEMENT",
121
+ "mass management": "MASS MANAGEMENT",
122
+ "fatigue management": "FATIGUE MANAGEMENT"
123
+ }
124
+
125
  # ---------------------------
126
  # Table schemas
127
  # ---------------------------
 
576
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
577
  # extra patterns to help skip/identify date-like strings in name fields
578
  "date_like": GLOBAL_SETTINGS["date_like_pattern"]
579
+ }