dots-ocr-idcard / tests /test_field_extraction.py
tommulder's picture
feat(api): fast FastAPI app + model loader refactor; add mock mode for tests

- Add pyproject + setuptools config and console entrypoint
- Implement enhanced field extraction + MRZ heuristics
- Add response builder with compatibility for legacy MRZ fields
- New preprocessing pipeline for PDFs/images
- HF Spaces GPU: cache ENV, optional flash-attn, configurable base image
- Add Make targets for Spaces GPU and local CPU
- Add httpx for TestClient; tests pass in mock mode
- Remove embedded model files and legacy app/modules
211e423
raw
history blame
3.02 kB
"""Tests for field extraction functionality."""
import pytest
from src.kybtech_dots_ocr.enhanced_field_extraction import EnhancedFieldExtractor
class TestEnhancedFieldExtractor:
    """Test cases for EnhancedFieldExtractor.

    Every test builds its own extractor instance and feeds it a small,
    hand-written text sample, so the tests are independent of each other.
    """

    def test_extract_fields_dutch_id(self):
        """Dutch-labelled ID text yields document number, surname and given names."""
        extractor = EnhancedFieldExtractor()
        # Written line by line so no source indentation leaks into the sample.
        text = (
            "\n"
            "IDENTITEITSKAART\n"
            "Documentnummer: NLD123456789\n"
            "Achternaam: MULDER\n"
            "Voornamen: THOMAS JAN\n"
            "Nationaliteit: NLD\n"
            "Geboortedatum: 15-03-1990\n"
            "Geslacht: M\n"
        )

        fields = extractor.extract_fields(text)

        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"
        assert fields.given_names is not None
        assert fields.given_names.value == "THOMAS JAN"

    def test_extract_fields_english_id(self):
        """English-labelled ID text yields the document number and surname."""
        extractor = EnhancedFieldExtractor()
        text = (
            "\n"
            "IDENTITY CARD\n"
            "Document Number: NLD123456789\n"
            "Surname: MULDER\n"
            "Given Names: THOMAS JAN\n"
            "Nationality: NLD\n"
            "Date of Birth: 15-03-1990\n"
            "Gender: M\n"
        )

        fields = extractor.extract_fields(text)

        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"

    def test_extract_mrz_data(self):
        """A two-line MRZ sample is recognised as TD3 with confidence above 0.8."""
        extractor = EnhancedFieldExtractor()
        # NOTE(review): presumably a synthetic sample; its line lengths and
        # check digits have not been validated against ICAO 9303 - confirm.
        text = (
            "\n"
            "P<NLDMULDER<<THOMAS<<<<<<<<<<<<<<<<<<<<<<<<<\n"
            "NLD123456789NLD9003151M300101123456789<<<<<<<<\n"
        )

        mrz_data = extractor.extract_mrz(text)

        assert mrz_data is not None
        assert mrz_data.format_type == "TD3"
        assert mrz_data.confidence > 0.8

    def test_extract_fields_empty_text(self):
        """Empty input produces no document number and no surname."""
        extractor = EnhancedFieldExtractor()

        fields = extractor.extract_fields("")

        # Should return empty fields
        assert fields.document_number is None
        assert fields.surname is None

    def test_confidence_scoring(self):
        """A fully labelled sample scores at least as high as a terse one.

        The comparison is only possible when a document number is extracted
        from both samples. If either is missing, the test is reported as
        skipped instead of passing without having asserted anything.
        """
        extractor = EnhancedFieldExtractor()

        # High quality text
        high_quality = "Documentnummer: NLD123456789 Achternaam: MULDER"
        fields_high = extractor.extract_fields(high_quality)

        # Lower quality text
        low_quality = "doc nr: NLD123"
        fields_low = extractor.extract_fields(low_quality)

        high_field = fields_high.document_number
        low_field = fields_low.document_number
        if high_field is None or low_field is None:
            # Make the "nothing to compare" case visible in the test report.
            pytest.skip(
                "document number was not extracted from both samples; "
                "confidence cannot be compared"
            )

        assert high_field.confidence >= low_field.confidence