"""Tests for field extraction functionality.""" import pytest from src.kybtech_dots_ocr.enhanced_field_extraction import EnhancedFieldExtractor class TestEnhancedFieldExtractor: """Test cases for EnhancedFieldExtractor.""" def test_extract_fields_dutch_id(self): """Test field extraction with Dutch ID card text.""" extractor = EnhancedFieldExtractor() text = """ IDENTITEITSKAART Documentnummer: NLD123456789 Achternaam: MULDER Voornamen: THOMAS JAN Nationaliteit: NLD Geboortedatum: 15-03-1990 Geslacht: M """ fields = extractor.extract_fields(text) assert fields.document_number is not None assert fields.document_number.value == "NLD123456789" assert fields.surname is not None assert fields.surname.value == "MULDER" assert fields.given_names is not None assert fields.given_names.value == "THOMAS JAN" def test_extract_fields_english_id(self): """Test field extraction with English ID card text.""" extractor = EnhancedFieldExtractor() text = """ IDENTITY CARD Document Number: NLD123456789 Surname: MULDER Given Names: THOMAS JAN Nationality: NLD Date of Birth: 15-03-1990 Gender: M """ fields = extractor.extract_fields(text) assert fields.document_number is not None assert fields.document_number.value == "NLD123456789" assert fields.surname is not None assert fields.surname.value == "MULDER" def test_extract_mrz_data(self): """Test MRZ data extraction.""" extractor = EnhancedFieldExtractor() text = """ P 0.8 def test_extract_fields_empty_text(self): """Test field extraction with empty text.""" extractor = EnhancedFieldExtractor() fields = extractor.extract_fields("") # Should return empty fields assert fields.document_number is None assert fields.surname is None def test_confidence_scoring(self): """Test confidence scoring functionality.""" extractor = EnhancedFieldExtractor() # High quality text high_quality = "Documentnummer: NLD123456789 Achternaam: MULDER" fields_high = extractor.extract_fields(high_quality) # Lower quality text low_quality = "doc nr: NLD123" fields_low = extractor.extract_fields(low_quality) if fields_high.document_number and fields_low.document_number: assert fields_high.document_number.confidence >= fields_low.document_number.confidence