Spaces:
Paused
Paused
File size: 4,518 Bytes
e300623 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
"""Field extraction utilities for OCR text processing.
This module provides field extraction and mapping from OCR results
to structured KYB field formats.
"""
import re
from typing import Optional
from models import ExtractedField, IdCardFields, MRZData
class FieldExtractor:
    """Field extraction and mapping from OCR results.

    The class is stateless: every method is a class or static method.
    ``FIELD_PATTERNS`` drives :meth:`extract_fields` and ``_MRZ_LAYOUTS``
    drives :meth:`extract_mrz`.
    """

    # Field mapping patterns for Dutch ID cards.
    # Each field maps to label patterns in priority order. They are applied
    # case-insensitively and capture group 1 holds the field value.
    FIELD_PATTERNS = {
        "document_number": [
            r"documentnummer[:\s]*([A-Z0-9]+)",
            r"document\s*number[:\s]*([A-Z0-9]+)",
            r"nr[:\s]*([A-Z0-9]+)",
        ],
        "surname": [
            r"achternaam[:\s]*([A-Z]+)",
            r"surname[:\s]*([A-Z]+)",
            r"family\s*name[:\s]*([A-Z]+)",
        ],
        "given_names": [
            r"voornamen[:\s]*([A-Z]+)",
            r"given\s*names[:\s]*([A-Z]+)",
            r"first\s*name[:\s]*([A-Z]+)",
        ],
        "nationality": [
            r"nationaliteit[:\s]*([A-Za-z]+)",
            r"nationality[:\s]*([A-Za-z]+)",
        ],
        "date_of_birth": [
            r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
        ],
        "gender": [
            r"geslacht[:\s]*([MF])",
            r"gender[:\s]*([MF])",
            r"sex[:\s]*([MF])",
        ],
        # The captured value may contain spaces/tabs ("Den Haag") but must
        # not run across a line break into the next label, so the capture
        # class deliberately excludes newlines.
        "place_of_birth": [
            r"geboorteplaats[:\s]*([A-Za-z][A-Za-z \t]*)",
            r"place\s*of\s*birth[:\s]*([A-Za-z][A-Za-z \t]*)",
            r"born\s*in[:\s]*([A-Za-z][A-Za-z \t]*)",
        ],
        "date_of_issue": [
            r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
        ],
        "date_of_expiry": [
            r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
        ],
        "personal_number": [
            r"persoonsnummer[:\s]*(\d{9})",
            r"personal\s*number[:\s]*(\d{9})",
            r"bsn[:\s]*(\d{9})",
        ],
    }

    # MRZ layouts as (format_type, compiled pattern), tried in order.
    # Line counts and lengths follow ICAO Doc 9303: TD1 = 3 x 30,
    # TD2 = 2 x 36, TD3 = 2 x 44. The look-behind / look-ahead force each
    # match to cover complete runs of MRZ characters, so a 30-character
    # pattern can never match a slice of a longer line.
    _MRZ_LAYOUTS = (
        (
            "TD1",
            re.compile(
                r"(?<![A-Z0-9<])"
                r"([A-Z0-9<]{30}\n[A-Z0-9<]{30}\n[A-Z0-9<]{30})"
                r"(?![A-Z0-9<])"
            ),
        ),
        (
            "TD3",
            re.compile(
                r"(?<![A-Z0-9<])([A-Z0-9<]{44}\n[A-Z0-9<]{44})(?![A-Z0-9<])"
            ),
        ),
        (
            "TD2",
            re.compile(
                r"(?<![A-Z0-9<])([A-Z0-9<]{36}\n[A-Z0-9<]{36})(?![A-Z0-9<])"
            ),
        ),
        # Lenient passport fallback: OCR frequently drops or adds filler
        # characters, so accept any two MRZ-character lines whose first
        # line starts with "P<". Passports use the TD3 layout.
        (
            "TD3",
            re.compile(r"(?<![A-Z0-9<])(P<[A-Z0-9<]+\n[A-Z0-9<]+)"),
        ),
    )

    @staticmethod
    def _first_match(patterns, ocr_text: str) -> Optional[str]:
        """Return the stripped group-1 value of the first matching pattern.

        Args:
            patterns: Regex pattern strings, tried in order.
            ocr_text: Text to search (case-insensitively).

        Returns:
            The captured value with surrounding whitespace removed, or
            None when no pattern matches.
        """
        for pattern in patterns:
            match = re.search(pattern, ocr_text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return None

    @classmethod
    def _find_mrz(cls, ocr_text: str) -> Optional[tuple]:
        """Locate the first MRZ block in the text.

        Args:
            ocr_text: Raw OCR text, with MRZ lines separated by ``\\n``.

        Returns:
            A ``(format_type, raw_mrz)`` tuple of two strings for the first
            layout in ``_MRZ_LAYOUTS`` that matches, or None.
        """
        for format_type, layout in cls._MRZ_LAYOUTS:
            match = layout.search(ocr_text)
            if match:
                return format_type, match.group(1)
        return None

    @classmethod
    def extract_fields(cls, ocr_text: str) -> "IdCardFields":
        """Extract structured fields from OCR text.

        For every entry in ``FIELD_PATTERNS`` the patterns are tried in
        order and the first match wins. Fields without a match are omitted
        from the result.

        Args:
            ocr_text: Raw OCR text from document processing

        Returns:
            IdCardFields object with extracted field data
        """
        fields = {}
        for field_name, patterns in cls.FIELD_PATTERNS.items():
            value = cls._first_match(patterns, ocr_text)
            if not value:
                continue
            fields[field_name] = ExtractedField(
                field_name=field_name,
                value=value,
                confidence=0.8,  # Base confidence for pattern match
                source="ocr",
            )
        return IdCardFields(**fields)

    @classmethod
    def extract_mrz(cls, ocr_text: str) -> "Optional[MRZData]":
        """Extract MRZ data from OCR text.

        Recognises the TD1, TD2 and TD3 layouts plus a lenient two-line
        passport fallback. Only the layout is detected: check digits are
        not verified, so ``is_valid`` is always True and
        ``checksum_errors`` is always empty.

        Args:
            ocr_text: Raw OCR text from document processing

        Returns:
            MRZData object if MRZ detected, None otherwise
        """
        found = cls._find_mrz(ocr_text)
        if found is None:
            return None
        format_type, raw_mrz = found
        # Basic MRZ parsing (simplified)
        return MRZData(
            raw_text=raw_mrz,
            format_type=format_type,
            is_valid=True,  # Assume valid if present
            checksum_errors=[],  # Not implemented in basic version
            confidence=0.9,
        )
|