File size: 4,518 Bytes
e300623
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""Field extraction utilities for OCR text processing.

This module provides field extraction and mapping from OCR results
to structured KYB field formats.
"""

import re
from typing import Optional
from models import ExtractedField, IdCardFields, MRZData


class FieldExtractor:
    """Field extraction and mapping from OCR results.

    Every method is a classmethod; the class only groups the label patterns
    and the MRZ detection logic and keeps no per-instance state.
    """

    # Field mapping patterns for Dutch ID cards.
    # Each field maps to an ordered list of regexes (Dutch label first, then
    # English variants). Patterns are searched case-insensitively and the
    # first one that matches wins; group 1 is the field value.
    #
    # NOTE: the name patterns capture a single run of ASCII letters, so a
    # multi-word, hyphenated or accented name stops at the first character
    # outside A-Z.
    FIELD_PATTERNS = {
        "document_number": [
            r"documentnummer[:\s]*([A-Z0-9]+)",
            r"document\s*number[:\s]*([A-Z0-9]+)",
            r"nr[:\s]*([A-Z0-9]+)"
        ],
        "surname": [
            r"achternaam[:\s]*([A-Z]+)",
            r"surname[:\s]*([A-Z]+)",
            r"family\s*name[:\s]*([A-Z]+)"
        ],
        "given_names": [
            r"voornamen[:\s]*([A-Z]+)",
            r"given\s*names[:\s]*([A-Z]+)",
            r"first\s*name[:\s]*([A-Z]+)"
        ],
        "nationality": [
            r"nationaliteit[:\s]*([A-Za-z]+)",
            r"nationality[:\s]*([A-Za-z]+)"
        ],
        "date_of_birth": [
            r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
        ],
        "gender": [
            r"geslacht[:\s]*([MF])",
            r"gender[:\s]*([MF])",
            r"sex[:\s]*([MF])"
        ],
        # The value may contain spaces ("Den Haag") but must stay on one
        # line: the capture class uses only horizontal whitespace so it
        # cannot run into the label on the following line, and it must start
        # with a letter so a whitespace-only capture is impossible.
        "place_of_birth": [
            r"geboorteplaats[:\s]*([A-Za-z][A-Za-z \t]*)",
            r"place\s*of\s*birth[:\s]*([A-Za-z][A-Za-z \t]*)",
            r"born\s*in[:\s]*([A-Za-z][A-Za-z \t]*)"
        ],
        "date_of_issue": [
            r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
        ],
        "date_of_expiry": [
            r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
        ],
        "personal_number": [
            r"persoonsnummer[:\s]*(\d{9})",
            r"personal\s*number[:\s]*(\d{9})",
            r"bsn[:\s]*(\d{9})"
        ]
    }

    # Two or three consecutive lines made only of MRZ characters, each at
    # least 30 long. The quantifiers are greedy so complete lines are
    # captured (never a 30-character slice of a longer line) and a third
    # line is included when present.
    _MRZ_BLOCK_PATTERN = r"[A-Z0-9<]{30,}(?:\n[A-Z0-9<]{30,}){1,2}"

    # Fallback for a passport zone whose lines are shorter than 30
    # characters: any two lines where the first contains "P<".
    _MRZ_PASSPORT_PATTERN = r"P<[A-Z0-9<]+\n[A-Z0-9<]+"

    # Two-line zones at least this wide are labelled TD3 (44 characters per
    # line in ICAO Doc 9303); narrower ones are labelled TD2 (36 characters).
    # 40 is the midpoint between the two standard widths.
    _TD3_MIN_WIDTH = 40

    @classmethod
    def _first_match(cls, field_name, ocr_text):
        """Return the stripped value of the first matching pattern, or None.

        Args:
            field_name: Key into FIELD_PATTERNS; unknown keys yield None.
            ocr_text: Text to search.

        Returns:
            Group 1 of the first pattern that matches, with surrounding
            whitespace stripped, or None when no pattern matches.
        """
        for pattern in cls.FIELD_PATTERNS.get(field_name, ()):
            match = re.search(pattern, ocr_text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return None

    @classmethod
    def _find_mrz(cls, ocr_text) -> Optional[tuple]:
        """Locate an MRZ zone and classify its layout.

        Args:
            ocr_text: Text to search.

        Returns:
            A ``(raw_text, format_type)`` tuple, or None when no zone is
            found. ``format_type`` is "TD1" for three lines, and for two
            lines "TD3" when the first line is at least ``_TD3_MIN_WIDTH``
            characters wide, otherwise "TD2". A zone found only through the
            "P<" fallback is labelled "TD3".
        """
        block = re.search(cls._MRZ_BLOCK_PATTERN, ocr_text)
        if block:
            raw_mrz = block.group(0)
            lines = raw_mrz.split("\n")
            if len(lines) == 3:
                return raw_mrz, "TD1"
            if len(lines[0]) >= cls._TD3_MIN_WIDTH:
                return raw_mrz, "TD3"
            # Also covers a TD1 zone whose third line was not recognised;
            # such a zone cannot be told apart from a short TD2 here.
            return raw_mrz, "TD2"

        passport = re.search(cls._MRZ_PASSPORT_PATTERN, ocr_text)
        if passport:
            return passport.group(0), "TD3"
        return None

    @classmethod
    def extract_fields(cls, ocr_text: str) -> IdCardFields:
        """Extract structured fields from OCR text.

        Args:
            ocr_text: Raw OCR text from document processing. None or an
                empty string is treated as "no fields found".

        Returns:
            IdCardFields built from one ExtractedField per field whose
            pattern matched; fields without a match are not passed to the
            constructor.
        """
        fields = {}
        if not ocr_text:
            return IdCardFields(**fields)

        for field_name in cls.FIELD_PATTERNS:
            value = cls._first_match(field_name, ocr_text)
            if value:
                fields[field_name] = ExtractedField(
                    field_name=field_name,
                    value=value,
                    confidence=0.8,  # Base confidence for pattern match
                    source="ocr"
                )

        return IdCardFields(**fields)

    @classmethod
    def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
        """Extract MRZ data from OCR text.

        Only detection and layout classification are performed; check
        digits are not verified.

        Args:
            ocr_text: Raw OCR text from document processing. None or an
                empty string is treated as "no MRZ".

        Returns:
            MRZData object if MRZ detected, None otherwise
        """
        if not ocr_text:
            return None

        found = cls._find_mrz(ocr_text)
        if found is None:
            return None

        raw_mrz, format_type = found
        # Basic MRZ parsing (simplified)
        return MRZData(
            raw_text=raw_mrz,
            format_type=format_type,
            is_valid=True,  # Assume valid if present
            checksum_errors=[],  # Not implemented in basic version
            confidence=0.9
        )