Shami96 committed on
Commit
b2d3d7e
·
verified ·
1 Parent(s): dff6f36

Update master_key.py

Browse files
Files changed (1) hide show
  1. master_key.py +29 -78
master_key.py CHANGED
@@ -1,6 +1,7 @@
1
  # master_key.py
2
  """
3
  Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
 
4
  Usage:
5
  - Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
6
  - The pipeline should:
@@ -45,83 +46,6 @@ GLOBAL_SETTINGS = {
45
  ]
46
  }
47
 
48
- # ---------------------------
49
- # Extra header synonyms for common OCR variants and noisy long phrases
50
- # ---------------------------
51
- # NOTE: keys in this mapping should be applied to a *normalized* header key
52
- # (lowercased, punctuation removed, whitespace collapsed) before lookup.
53
- # Example normalized keys: "registrationnumber", "subcontractoryesno", "rfssuspensioncertificationn/a"
54
- EXTRA_HEADER_SYNONYMS = {
55
- # Registration / common short variants
56
- "registrationnumber": "Registration Number",
57
- "registration number": "Registration Number",
58
- "registrationno": "Registration Number",
59
- "reg no": "Registration Number",
60
- "regno": "Registration Number",
61
- "registration": "Registration Number",
62
- "no": "No.",
63
-
64
- # Roadworthiness / maintenance
65
- "roadworthinesscertificates": "Roadworthiness Certificates",
66
- "roadworthiness certificate": "Roadworthiness Certificates",
67
- "roadworthiness certificates": "Roadworthiness Certificates",
68
-
69
- "maintenancerecords": "Maintenance Records",
70
- "maintenance records": "Maintenance Records",
71
- "triprecords": "Maintenance Records",
72
- "trip records": "Maintenance Records",
73
-
74
- "dailychecks": "Daily Checks",
75
- "daily check": "Daily Checks",
76
- "daily checks": "Daily Checks",
77
-
78
- # Faults
79
- "faultrecordingreporting": "Fault Recording/ Reporting",
80
- "fault recording reporting": "Fault Recording/ Reporting",
81
- "fault recording/reporting": "Fault Recording/ Reporting",
82
- "faultrecording/reportingonsuspensionsystem": "Fault Recording/ Reporting",
83
-
84
- "faultrepair": "Fault Repair",
85
- "fault repair": "Fault Repair",
86
-
87
- # Sub-contractor / compliance long forms
88
- "subcontractoryesno": "Sub-contractor (Yes/No)",
89
- "sub-contractor(yesno)": "Sub-contractor (Yes/No)",
90
- "sub-contractedvehiclesstatementofcompliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
91
- "sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
92
- "sub contracted": "Sub-contractor (Yes/No)",
93
- "sub-contracted": "Sub-contractor (Yes/No)",
94
- "sub contractor": "Sub-contractor (Yes/No)",
95
-
96
- # RFS / suspension / weight verification
97
- "rfs suspension certification": "RFS Suspension Certification # (N/A if not applicable)",
98
- "rfs suspension certification # (n/a if not applicable)": "RFS Suspension Certification # (N/A if not applicable)",
99
- "weightverificationrecords": "Weight Verification Records (Date Range)",
100
- "weight verification records": "Weight Verification Records (Date Range)",
101
- "suspensionsystemmaintenance": "Suspension System Maintenance (Date Range)",
102
- "suspension system maintenance": "Suspension System Maintenance (Date Range)",
103
-
104
- # NHVR / auditor registration
105
- "nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
106
- "nhvr auditor registration number": "NHVR or Exemplar Global Auditor Registration Number",
107
- "nhvr auditor reg no": "NHVR or Exemplar Global Auditor Registration Number",
108
-
109
- # Print/Accreditation
110
- "printname": "Print Name",
111
- "print accreditation name": "(print accreditation name)",
112
-
113
- # Operator Declaration/Position short forms
114
- "positiontitle": "Position Title",
115
- "position": "Position Title",
116
-
117
- # Misc helpful fallbacks
118
- "details": "DETAILS",
119
- "management": "Management",
120
- "maintenance management": "MAINTENANCE MANAGEMENT",
121
- "mass management": "MASS MANAGEMENT",
122
- "fatigue management": "FATIGUE MANAGEMENT"
123
- }
124
-
125
  # ---------------------------
126
  # Table schemas
127
  # ---------------------------
@@ -576,4 +500,31 @@ PARAGRAPH_PATTERNS = {
576
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
577
  # extra patterns to help skip/identify date-like strings in name fields
578
  "date_like": GLOBAL_SETTINGS["date_like_pattern"]
579
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # master_key.py
2
  """
3
  Comprehensive Master Key for NHVAS Audit extraction (updated & hardened)
4
+
5
  Usage:
6
  - Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
7
  - The pipeline should:
 
46
  ]
47
  }
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # ---------------------------
50
  # Table schemas
51
  # ---------------------------
 
500
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
501
  # extra patterns to help skip/identify date-like strings in name fields
502
  "date_like": GLOBAL_SETTINGS["date_like_pattern"]
503
+ }
504
+
505
+ # ---------------------------
506
+ # Helpful hints for pipeline implementer (non-executable comments)
507
+ # ---------------------------
508
+ # 1) Matching order recommendation (pipeline):
509
+ # - Try exact qualified keys in schema['preferred_json_keys']
510
+ # - Try exact label match against schema['labels'] and schema['label_synonyms']
511
+ # - Apply normalization rules from GLOBAL_SETTINGS then try clean match
512
+ # - Fuzzy match using GLOBAL_SETTINGS['fuzzy_thresholds'] keyed by schema priority
513
+ # - If schema has 'fallback_keys', try them last
514
+ #
515
+ # 2) Date-safety: for any schema field listed under 'skip_if_date_like', verify JSON value
516
+ # or cell content is not date-like (use PARAGRAPH_PATTERNS['date_like']) before replacing.
517
+ #
518
+ # 3) Multi-value cells: If schema contains 'split_on', join JSON list items using '\n'
519
+ # or split a single JSON string on these delimiters to create separate lines.
520
+ #
521
+ # 4) OCR repairs: run GLOBAL_SETTINGS['ocr_repair_rules'] on extracted header text
522
+ # before attempting matching (fix common OCR confusion).
523
+ #
524
+ # 5) Processed flags: keep a per-table attribute (e.g. table._processed_operator_declaration = True)
525
+ # to avoid running two different handlers on the same Operator Declaration table.
526
+ #
527
+ # 6) Tuning: If a schema is consistently mis-matched, add the exact observed header string
528
+ # into schema['label_synonyms'] for deterministic matching.
529
+ #
530
+ # End of master key