Spaces:
Running
Running
| # master_key.py | |
| """ | |
| Comprehensive Master Key for NHVAS Audit extraction (updated & hardened) | |
| Usage: | |
| - Import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS | |
| - The pipeline should: | |
| 1) normalize candidate header/key using GLOBAL_SETTINGS['normalize'] | |
| 2) attempt exact matches using schema['preferred_json_keys'] (if provided) | |
| 3) try direct label matches (schema['labels'] + schema['label_synonyms']) | |
| 4) fallback to fuzzy matching using GLOBAL_SETTINGS thresholds | |
| 5) respect schema['skip_if_date_like'] for sensitive fields (e.g. Print Name) | |
| 6) for multi-value cells, use schema['split_on'] to split JSON values into lines | |
| """ | |
| import re | |
| # --------------------------- | |
| # Global matching / normalization settings | |
| # --------------------------- | |
# Shared matching / normalization configuration for the extraction pipeline.
# This is pure data; the consuming pipeline decides how each entry is applied.
GLOBAL_SETTINGS = {
    "normalize": {
        # Normalization pipeline (applied in this order)
        # NOTE(review): if the flags really are applied in the listed order,
        # smart dashes are stripped as punctuation before
        # "replace_smart_dashes" can see them -- confirm the pipeline's order.
        "lower": True,
        "strip_punctuation": True,  # remove punctuation like ()'.,–
        "collapse_whitespace": True,
        "replace_smart_dashes": True,  # normalize – — to -
    },
    "fuzzy_thresholds": {
        # Minimum score thresholds (0-100) for different priority tiers
        "high_priority": 70,
        "medium_priority": 60,
        "low_priority": 45,
    },
    "fuzzy_algorithm": "token_set_ratio",  # recommendation
    # common attendance / multi-name delimiters
    "split_on": [" – ", " - ", ";", "\n", " / ", " & "],
    # Used for skip_if_date_like checks.  Anchored at the start of the value
    # only, so trailing text after the date is tolerated.  Group 1 is the
    # whole date-like prefix, group 2 the optional ordinal suffix.
    "date_like_pattern": (
        r"^\s*("
        r"\d{1,2}(st|nd|rd|th)?\s+[A-Za-z]+"  # 1st January / 12 Mar
        r"|\d{1,2}\/\d{1,2}\/\d{2,4}"         # 12/03/2023
        r"|\d{1,2}\.\d{1,2}\.\d{2,4}"         # 12.03.2023
        r"|\d{1,2}-\d{1,2}-\d{2,4}"           # 12-03-2023
        r"|\d{4}-\d{1,2}-\d{1,2}"             # 2023-03-12 (ISO)
        r")"
    ),
    "ocr_repair_rules": [
        # small fixes for common OCR noise in headers, as (pattern, replacement)
        (r"\s*\(\s*Yes\s*/\s*No\s*\)", " (Yes/No)"),
        (r"R[e3]gistrat[i1]on", "Registration"),
        (r"Prin?t", "Print"),
        (r"Accredi[ta]tion", "Accreditation"),
        # remove stray symbols but keep parentheses/slash
        (r"[^\w\s\-\&\(\)\/:]", " "),
    ],
}
| # --------------------------- | |
| # Extra header synonyms for common OCR variants and noisy long phrases | |
| # --------------------------- | |
# NOTE: keys in this mapping are looked up with a *normalized* header key
# (lowercased, punctuation removed, whitespace collapsed).
# Example normalized keys: "registrationnumber", "subcontractoryesno", "rfs suspension certification"
# Maps noisy / abbreviated header spellings to the canonical header text.
# Lookups are meant to use a normalized header (lowercased, punctuation
# removed, whitespace collapsed), so every key that contains punctuation also
# has a punctuation-free twin; the punctuated spellings are kept for callers
# that look up less aggressively normalized text.
EXTRA_HEADER_SYNONYMS = {
    # Registration / common short variants
    "registrationnumber": "Registration Number",
    "registration number": "Registration Number",
    "registrationno": "Registration Number",
    "reg no": "Registration Number",
    "regno": "Registration Number",
    "registration": "Registration Number",
    "no": "No.",
    # Roadworthiness / maintenance
    "roadworthinesscertificates": "Roadworthiness Certificates",
    "roadworthiness certificate": "Roadworthiness Certificates",
    "roadworthiness certificates": "Roadworthiness Certificates",
    "maintenancerecords": "Maintenance Records",
    "maintenance records": "Maintenance Records",
    # NOTE(review): trip records map to "Maintenance Records", not
    # "Trip Records" -- presumably deliberate; confirm against the pipeline.
    "triprecords": "Maintenance Records",
    "trip records": "Maintenance Records",
    "dailychecks": "Daily Checks",
    "daily check": "Daily Checks",
    "daily checks": "Daily Checks",
    # Faults
    "faultrecordingreporting": "Fault Recording/ Reporting",
    "fault recording reporting": "Fault Recording/ Reporting",
    "fault recording/reporting": "Fault Recording/ Reporting",
    "fault recordingreporting": "Fault Recording/ Reporting",  # punctuation-free twin
    "faultrecording/reportingonsuspensionsystem": "Fault Recording/ Reporting",
    "faultrecordingreportingonsuspensionsystem": "Fault Recording/ Reporting",  # punctuation-free twin
    "faultrepair": "Fault Repair",
    "fault repair": "Fault Repair",
    # Sub-contractor / compliance long forms
    "subcontractoryesno": "Sub-contractor (Yes/No)",
    "sub-contractor(yesno)": "Sub-contractor (Yes/No)",
    "sub-contractedvehiclesstatementofcompliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
    "subcontractedvehiclesstatementofcompliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",  # punctuation-free twin
    "sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
    "sub contracted": "Sub-contractor (Yes/No)",
    "sub-contracted": "Sub-contractor (Yes/No)",
    "subcontracted": "Sub-contractor (Yes/No)",  # punctuation-free twin
    "sub contractor": "Sub-contractor (Yes/No)",
    "subcontractor": "Sub-contractor (Yes/No)",  # "sub-contractor" with the hyphen removed
    # RFS / suspension / weight verification
    "rfs suspension certification": "RFS Suspension Certification # (N/A if not applicable)",
    "rfssuspensioncertification": "RFS Suspension Certification # (N/A if not applicable)",
    "rfs suspension certification # (n/a if not applicable)": "RFS Suspension Certification # (N/A if not applicable)",
    "rfs suspension certification na if not applicable": "RFS Suspension Certification # (N/A if not applicable)",  # punctuation-free twin
    "weightverificationrecords": "Weight Verification Records (Date Range)",
    "weight verification records": "Weight Verification Records (Date Range)",
    "suspensionsystemmaintenance": "Suspension System Maintenance (Date Range)",
    "suspension system maintenance": "Suspension System Maintenance (Date Range)",
    # NHVR / auditor registration
    "nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
    "nhvr auditor registration number": "NHVR or Exemplar Global Auditor Registration Number",
    "nhvr auditor reg no": "NHVR or Exemplar Global Auditor Registration Number",
    # Print/Accreditation
    "printname": "Print Name",
    "print accreditation name": "(print accreditation name)",
    # Operator Declaration/Position short forms
    "positiontitle": "Position Title",
    "position": "Position Title",
    # Misc helpful fallbacks
    "details": "DETAILS",
    "management": "Management",
    "maintenance management": "MAINTENANCE MANAGEMENT",
    "mass management": "MASS MANAGEMENT",
    "fatigue management": "FATIGUE MANAGEMENT",
}
| # --------------------------- | |
| # Table schemas | |
| # --------------------------- | |
# Per-table extraction schemas, keyed by table name.  Fields used below:
#   headings            - heading patterns associated with the table; "text" is
#                         a regular expression and every entry carries
#                         "re": True
#   orientation         - "left" or "row1"
#                         (presumably: labels in the left column vs. first row)
#   labels              - canonical labels expected in the table
#   label_synonyms      - alternative spellings accepted for the labels
#   priority            - numeric priority of the schema
#   preferred_json_keys - JSON keys to try for exact matches first
#   context_keywords / context_exclusions - words hinting for / against a match
#   skip_if_date_like   - labels whose value must not be replaced by a
#                         date-like string
#   split_on            - delimiters used to split multi-value cells
TABLE_SCHEMAS = {
    # Basic top-of-form tick boxes
    "Tick as appropriate": {
        "headings": [
            {"level": 1, "text": r"NHVAS\s+Audit\s+Summary\s+Report", "re": True},
        ],
        "orientation": "left",
        "labels": ["Tick as appropriate"],
        "label_synonyms": ["Tick as appropriate:", "Tick where appropriate", "Tick as required"],
        "priority": 90,
        "preferred_json_keys": ["Tick as appropriate", "Tick as appropriate.Tick as appropriate"],
        "context_keywords": ["tick", "appropriate"],
    },
    "Audit Information": {
        "orientation": "left",
        "labels": [
            "Date of Audit",
            "Location of audit",
            "Auditor name",
            "Audit Matrix Identifier (Name or Number)",
            "Auditor Exemplar Global Reg No.",
            "Expiry Date",
            "NHVR Auditor Registration Number",
        ],
        "label_synonyms": [
            "Audit Date", "Date of audit", "Location", "Auditor", "Exemplar Global Reg No.",
            "Auditor Reg No", "Auditor Registration Number",
        ],
        "priority": 80,
        "preferred_json_keys": [
            "Audit Information.Date of Audit",
            "Audit Information.Auditor name",
            "Audit Information.Location of audit",
        ],
        "context_keywords": ["audit", "auditor", "matrix", "registration"],
    },
    "Operator Information": {
        "headings": [
            {"level": 1, "text": r"Operator\s+Information", "re": True},
        ],
        "orientation": "left",
        "labels": [
            "Operator name (Legal entity)",
            "Operator’s Name (legal entity)",
            "NHVAS Accreditation No. (If applicable)",
            "Registered trading name/s",
            "Australian Company Number",
            "NHVAS Manual (Policies and Procedures) developed by",
        ],
        "label_synonyms": [
            "Operator name", "Operator name (Legal entity)", "Operator’s name", "Operator's Name (legal entity)",
        ],
        "priority": 85,
        "preferred_json_keys": [
            "Operator Information.Operator name (Legal entity)",
            "Operator name (Legal entity)",
        ],
        "context_keywords": ["operator", "legal entity", "accreditation", "company number"],
    },
    "Operator contact details": {
        "orientation": "left",
        "labels": [
            "Operator business address",
            "Operator Postal address",
            "Email address",
            "Operator Telephone Number",
        ],
        "label_synonyms": ["Business address", "Postal address", "Email", "Telephone", "Phone"],
        "priority": 75,
        "context_keywords": ["contact", "address", "email", "telephone", "phone"],
    },
    "Attendance List (Names and Position Titles)": {
        "headings": [
            {"level": 1, "text": r"Attendance\s+List\s*\(?Names\s+and\s+Position\s+Titles\)?", "re": True},
        ],
        "orientation": "row1",
        "labels": ["Attendance List (Names and Position Titles)"],
        "label_synonyms": [
            "Attendance List", "Attendees", "Attendance", "Names and Position Titles",
            "Names & Position Titles", "Attendance list (names & positions)",
        ],
        "priority": 90,
        "preferred_json_keys": [
            "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
            "Attendance List (Names and Position Titles)",
        ],
        "split_on": [" – ", " - ", ";", "\n", " / "],  # ensure one person per line
        "context_keywords": ["attendance", "attendees", "names", "position"],
        # A bool here, but a list of labels in other schemas.
        # NOTE(review): confirm consumers accept both forms.
        "skip_if_date_like": False,
    },
    "Nature of the Operators Business (Summary)": {
        "orientation": "row1",
        "labels": ["Nature of the Operators Business (Summary):", "Nature of the Operator's Business (Summary):"],
        "label_synonyms": ["Nature of the Operators Business", "Nature of Business", "Nature of the Operator Business"],
        "split_labels": ["Accreditation Number:", "Expiry Date:"],
        "priority": 85,
        "context_keywords": ["nature", "business", "operators business", "summary"],
    },
    "Accreditation Vehicle Summary": {
        "orientation": "left",
        "labels": ["Number of powered vehicles", "Number of trailing vehicles"],
        "label_synonyms": ["powered vehicles", "trailing vehicles", "Number of vehicles"],
        "priority": 80,
        "context_keywords": ["vehicle", "powered vehicles", "trailing"],
    },
    "Accreditation Driver Summary": {
        "orientation": "left",
        "labels": ["Number of drivers in BFM", "Number of drivers in AFM"],
        "priority": 80,
        "context_keywords": ["driver", "bfm", "afm"],
    },
    "Compliance Codes": {
        "orientation": "left",
        "labels": ["V", "NC", "TNC", "SFI", "NAP", "NA"],
        "priority": 70,
        "context_exclusions": ["MASS MANAGEMENT", "MAINTENANCE MANAGEMENT", "FATIGUE MANAGEMENT"],
    },
    "Corrective Action Request Identification": {
        "orientation": "row1",
        "labels": ["Title", "Abbreviation", "Description"],
        "label_synonyms": ["Corrective Action Request", "CAR Identification", "CAR"],
        "priority": 80,
        "context_keywords": ["corrective", "action", "CAR"],
    },
    # Basic Management schemas (lower priority than summary versions)
    "Maintenance Management": {
        "headings": [
            {"level": 1, "text": r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT", "re": True},
        ],
        "orientation": "left",
        "labels": [
            "Std 1. Daily Check",
            "Std 2. Fault Recording and Reporting",
            "Std 3. Fault Repair",
            "Std 4. Maintenance Schedules and Methods",
            "Std 5. Records and Documentation",
            "Std 6. Responsibilities",
            "Std 7. Internal Review",
            "Std 8. Training and Education",
        ],
        "label_synonyms": [
            "Daily Check", "Fault Recording/ Reporting", "Fault Repair",
            "Maintenance Schedules", "Records and Documentation", "Internal Review",
        ],
        "priority": 60,
        "context_keywords": ["maintenance"],
        "context_exclusions": ["summary", "details", "audit findings"],
    },
    "Mass Management": {
        "headings": [
            {"level": 1, "text": r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT", "re": True},
        ],
        "orientation": "left",
        "labels": [
            "Std 1. Responsibilities",
            "Std 2. Vehicle Control",
            "Std 3. Vehicle Use",
            "Std 4. Records and Documentation",
            "Std 5. Verification",
            "Std 6. Internal Review",
            "Std 7. Training and Education",
            "Std 8. Maintenance of Suspension",
        ],
        "priority": 60,
        "context_keywords": ["mass"],
        "context_exclusions": ["summary", "details", "audit findings"],
    },
    "Fatigue Management": {
        "headings": [
            {"level": 1, "text": r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT", "re": True},
        ],
        "orientation": "left",
        "labels": [
            "Std 1. Scheduling and Rostering",
            "Std 2. Health and wellbeing for performed duty",
            "Std 3. Training and Education",
            "Std 4. Responsibilities and management practices",
            "Std 5. Internal Review",
            "Std 6. Records and Documentation",
            "Std 7. Workplace conditions",
        ],
        "priority": 60,
        "context_keywords": ["fatigue"],
        "context_exclusions": ["summary", "details", "audit findings"],
    },
    # Summary management schemas (higher priority)
    "Maintenance Management Summary": {
        "headings": [
            {"level": 1, "text": r"Audit\s+Observations\s+and\s+Comments", "re": True},
            {"level": 2, "text": r"Maintenance\s+Management\s+Summary\s+of\s+Audit\s+findings", "re": True},
        ],
        "orientation": "left",
        "columns": ["MAINTENANCE MANAGEMENT", "DETAILS"],
        "labels": [
            "Std 1. Daily Check",
            "Std 2. Fault Recording and Reporting",
            "Std 3. Fault Repair",
            "Std 4. Maintenance Schedules and Methods",
            "Std 5. Records and Documentation",
            "Std 6. Responsibilities",
            "Std 7. Internal Review",
            "Std 8. Training and Education",
        ],
        "label_synonyms": ["Details", "Management", "Maintenance Management Summary"],
        "priority": 85,
        "context_keywords": ["maintenance", "summary", "details", "audit findings"],
    },
    "Mass Management Summary": {
        "headings": [
            {"level": 1, "text": r"Mass\s+Management\s+Summary\s+of\s+Audit\s+findings", "re": True},
        ],
        "orientation": "left",
        "columns": ["MASS MANAGEMENT", "DETAILS"],
        "labels": [
            "Std 1. Responsibilities",
            "Std 2. Vehicle Control",
            "Std 3. Vehicle Use",
            "Std 4. Records and Documentation",
            "Std 5. Verification",
            "Std 6. Internal Review",
            "Std 7. Training and Education",
            "Std 8. Maintenance of Suspension",
        ],
        "priority": 85,
        "context_keywords": ["mass", "summary", "details", "audit findings"],
    },
    "Fatigue Management Summary": {
        "headings": [
            {"level": 1, "text": r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings", "re": True},
        ],
        "orientation": "left",
        "columns": ["FATIGUE MANAGEMENT", "DETAILS"],
        "labels": [
            "Std 1. Scheduling and Rostering",
            "Std 2. Health and wellbeing for performed duty",
            "Std 3. Training and Education",
            "Std 4. Responsibilities and management practices",
            "Std 5. Internal Review",
            "Std 6. Records and Documentation",
            "Std 7. Workplace conditions",
        ],
        "priority": 85,
        "context_keywords": ["fatigue", "summary", "details", "audit findings"],
    },
    # Vehicle registration tables (mass / maintenance variants)
    "Vehicle Registration Numbers Mass": {
        "headings": [
            {"level": 1, "text": r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined", "re": True},
            {"level": 2, "text": r"MASS\s+MANAGEMENT", "re": True},
        ],
        "orientation": "row1",
        "labels": [
            "No.", "Registration Number", "Sub contractor", "Sub-contractor (Yes/No)",
            "Sub-contracted Vehicles Statement of Compliance",
            "Weight Verification Records", "RFS Suspension Certification #",
            "Suspension System Maintenance", "Trip Records",
            "Fault Recording/ Reporting on Suspension System",
        ],
        "label_synonyms": [
            "Reg No", "RegistrationNo", "RegistrationNumber",
            "Sub-contractor", "Sub contracted", "Sub contractor",
        ],
        "priority": 90,
        "context_keywords": ["mass", "vehicle registration", "rfs suspension", "weight verification"],
        "context_exclusions": ["maintenance", "daily checks"],
    },
    "Vehicle Registration Numbers Maintenance": {
        "headings": [
            {"level": 1, "text": r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined", "re": True},
            {"level": 2, "text": r"Maintenance\s+Management", "re": True},
        ],
        "orientation": "row1",
        "labels": [
            "No.", "Registration Number", "Roadworthiness Certificates",
            "Maintenance Records", "Daily Checks", "Fault Recording/ Reporting", "Fault Repair",
        ],
        "label_synonyms": ["Roadworthiness", "Daily Checks", "Maintenance Records"],
        "priority": 85,
        "context_keywords": ["maintenance", "vehicle registration", "roadworthiness", "daily checks"],
    },
    "Driver / Scheduler Records Examined": {
        "headings": [
            {"level": 1, "text": r"Driver\s*/\s*Scheduler\s+Records\s+Examined", "re": True},
            {"level": 2, "text": r"FATIGUE\s+MANAGEMENT", "re": True},
        ],
        "orientation": "row1",
        "labels": [
            "No.",
            "Driver / Scheduler Name",
            "Driver TLIF Course # Completed",
            "Scheduler TLIF Course # Completed",
            "Medical Certificates (Current Yes/No) Date of expiry",
            "Roster / Schedule / Safe Driving Plan (Date Range)",
            "Fit for Duty Statement Completed (Yes/No)",
            "Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)",
        ],
        "label_synonyms": ["Driver Name", "Scheduler Name", "Work Diary", "Roster / Schedule"],
        "priority": 80,
        "context_keywords": ["driver", "scheduler", "fatigue"],
    },
    # Other tables
    "Operator's Name (legal entity)": {
        "headings": [
            {"level": 1, "text": r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)", "re": True},
        ],
        "orientation": "left",
        "labels": ["Operator's Name (legal entity)", "Operator’s Name (legal entity)"],
        "label_synonyms": ["Operator's Name", "Operator Name (legal)"],
        "priority": 85,
    },
    "Non-conformance and CAR details": {
        "orientation": "left",
        "labels": [
            "Non-conformance agreed close out date",
            "Module and Standard",
            "Corrective Action Request (CAR) Number",
            "Observed Non-conformance:",
            "Corrective Action taken or to be taken by operator:",
            "Operator or Representative Signature",
            "Position",
            "Date",
            "Comments:",
            "Auditor signature",
        ],
        "label_synonyms": ["Non-conformance", "CAR Number", "Observed Non-conformance"],
        "priority": 75,
        "context_keywords": ["non-conformance", "corrective action", "CAR"],
    },
    "NHVAS Approved Auditor Declaration": {
        "headings": [
            {"level": 1, "text": r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION", "re": True},
        ],
        "orientation": "row1",
        "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number", "Auditor name"],
        "label_synonyms": ["Print Name (Auditor)", "Auditor Print Name", "NHVR Auditor Reg No"],
        "priority": 90,
        "context_exclusions": ["manager", "operator declaration"],
        # Defend against date-like values accidentally being used as a name
        "skip_if_date_like": ["Print Name", "Auditor name"],
    },
    "Audit Declaration dates": {
        "headings": [
            # \s+ (rather than a literal space) so irregular spacing still matches
            {"level": 1, "text": r"Audit\s+Declaration\s+dates", "re": True},
        ],
        "orientation": "left",
        "labels": [
            "Audit was conducted on",
            "Unconditional CARs closed out on:",
            "Conditional CARs to be closed out by:",
        ],
        "label_synonyms": ["Audit was conducted", "Audit date", "CARs closed out"],
        "priority": 80,
    },
    "Print accreditation name": {
        "headings": [
            # \s+ (rather than a literal space) so irregular spacing still matches
            {"level": 1, "text": r"\(?print\s+accreditation\s+name\)?", "re": True},
        ],
        "orientation": "left",
        "labels": ["(print accreditation name)", "Print accreditation name", "Print Accreditation Name"],
        "label_synonyms": ["Print name accreditation", "Print accreditation"],
        "priority": 85,
        "preferred_json_keys": [
            "Print accreditation name.(print accreditation name)",
            "(print accreditation name)",
        ],
    },
    "Operator Declaration": {
        "headings": [
            {"level": 1, "text": r"Operator\s+Declaration", "re": True},
        ],
        "orientation": "row1",
        "labels": ["Print Name", "Position Title"],
        "label_synonyms": [
            "Print Name", "Printed Name", "Name (Print)", "Position", "Position Title", "Title",
        ],
        "priority": 90,
        "preferred_json_keys": [
            "Operator Declaration.Print Name",
            "Operator Declaration.Position Title",
            "Operator Declaration.Print Name.Print Name",
        ],
        "fallback_keys": ["Print Name", "Position Title", "Print accreditation name.(print accreditation name)"],
        "context_keywords": ["operator declaration", "manager", "operator"],
        "context_exclusions": ["auditor", "nhvas approved"],
        "skip_if_date_like": ["Print Name"],  # do not replace name field with date-like values
        "split_on": [" – ", " - ", "\n"],  # in case JSON contains multiple names
        # NB: pipeline should attempt exact-qualified keys first, then fallbacks
    },
}
| # --------------------------- | |
| # Heading / paragraph patterns (expanded) | |
| # --------------------------- | |
# Regular-expression fragments for recognising document headings, grouped
# into report-title patterns ("main") and section-heading patterns ("sub").
_MAIN_HEADING_PATTERNS = (
    r"NHVAS\s+Audit\s+Summary\s+Report",
    r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
    r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT",
)

_SUB_HEADING_PATTERNS = (
    r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
    # module names, followed by their "summary of audit findings" headings
    r"MAINTENANCE\s+MANAGEMENT",
    r"MASS\s+MANAGEMENT",
    r"FATIGUE\s+MANAGEMENT",
    r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
    r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
    r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
    # records tables, CAR form and declarations
    r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
    r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
    r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
    r"Operator\s+Declaration",
    r"Operator\s+Information",
    # catch small variants / OCR noise
    r"OPERATOR'?S\s+NAME",
)

# Fresh lists, so mutating HEADING_PATTERNS never touches the tuples above.
HEADING_PATTERNS = {
    "main": list(_MAIN_HEADING_PATTERNS),
    "sub": list(_SUB_HEADING_PATTERNS),
}
| # --------------------------- | |
| # Paragraph patterns (improved) | |
| # --------------------------- | |
# Regular expressions for recognising boilerplate paragraphs and date lines.
PARAGRAPH_PATTERNS = dict(
    findings_summary=r"Provide a summary of findings based on the evidence gathered during the audit\.?",
    declaration_text=r"I hereby acknowledge and agree with the findings.*",
    introductory_note=r"This audit assesses the.*",
    # a full "12th March 2024"-style line, or a line that is just "Date"
    date_line=r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$",
    # reuse the shared pattern so date-like strings in name fields are
    # identified the same way everywhere
    date_like=GLOBAL_SETTINGS["date_like_pattern"],
)