Spaces:

algoryn
/

dots-ocr-idcard

Paused

dots-ocr-idcard / tests /test_field_extraction.py

feat(api): fast FastAPI app + model loader refactor; add mock mode for tests

- Add pyproject + setuptools config and console entrypoint
- Implement enhanced field extraction + MRZ heuristics
- Add response builder with compatibility for legacy MRZ fields
- New preprocessing pipeline for PDFs/images
- HF Spaces GPU: cache ENV, optional flash-attn, configurable base image
- Add Make targets for Spaces GPU and local CPU
- Add httpx for TestClient; tests pass in mock mode
- Remove embedded model files and legacy app/modules

211e423 about 2 months ago

raw

history blame contribute delete

3.02 kB

	"""Tests for field extraction functionality."""

	import pytest
	from src.kybtech_dots_ocr.enhanced_field_extraction import EnhancedFieldExtractor


	class TestEnhancedFieldExtractor:
	    """Tests for EnhancedFieldExtractor using small hand-written samples.

	    Each test builds its own extractor so no state is shared between
	    tests. The multi-line sample texts are kept byte-for-byte, including
	    their leading whitespace, because they are the input under test.
	    """

	    def test_extract_fields_dutch_id(self):
	        """Dutch labels yield document number, surname and given names."""
	        extractor = EnhancedFieldExtractor()
	        text = """
	IDENTITEITSKAART
	Documentnummer: NLD123456789
	Achternaam: MULDER
	Voornamen: THOMAS JAN
	Nationaliteit: NLD
	Geboortedatum: 15-03-1990
	Geslacht: M
	"""

	        fields = extractor.extract_fields(text)

	        # Check presence first so a missing field fails with a clear
	        # assertion instead of an AttributeError on `.value`.
	        assert fields.document_number is not None
	        assert fields.document_number.value == "NLD123456789"
	        assert fields.surname is not None
	        assert fields.surname.value == "MULDER"
	        assert fields.given_names is not None
	        assert fields.given_names.value == "THOMAS JAN"

	    def test_extract_fields_english_id(self):
	        """English labels yield document number and surname."""
	        extractor = EnhancedFieldExtractor()
	        text = """
	IDENTITY CARD
	Document Number: NLD123456789
	Surname: MULDER
	Given Names: THOMAS JAN
	Nationality: NLD
	Date of Birth: 15-03-1990
	Gender: M
	"""

	        fields = extractor.extract_fields(text)

	        assert fields.document_number is not None
	        assert fields.document_number.value == "NLD123456789"
	        assert fields.surname is not None
	        assert fields.surname.value == "MULDER"

	    def test_extract_mrz_data(self):
	        """A two-line MRZ sample is reported as TD3 with confidence above 0.8."""
	        extractor = EnhancedFieldExtractor()
	        text = """
	P<NLDMULDER<<THOMAS<<<<<<<<<<<<<<<<<<<<<<<<<
	NLD123456789NLD9003151M300101123456789<<<<<<<<
	"""

	        mrz_data = extractor.extract_mrz(text)

	        assert mrz_data is not None
	        assert mrz_data.format_type == "TD3"
	        assert mrz_data.confidence > 0.8

	    def test_extract_fields_empty_text(self):
	        """Empty input yields no document number and no surname."""
	        extractor = EnhancedFieldExtractor()
	        fields = extractor.extract_fields("")

	        # Should return empty fields
	        assert fields.document_number is None
	        assert fields.surname is None

	    def test_confidence_scoring(self):
	        """A fully labelled sample scores at least as high as an abbreviated one.

	        The comparison needs a document number from both samples. If either
	        extraction comes back empty the test is skipped with a reason, so
	        the missing comparison shows up in the report instead of passing
	        without checking anything.
	        """
	        extractor = EnhancedFieldExtractor()

	        # High quality text
	        high_quality = "Documentnummer: NLD123456789 Achternaam: MULDER"
	        fields_high = extractor.extract_fields(high_quality)

	        # Lower quality text
	        low_quality = "doc nr: NLD123"
	        fields_low = extractor.extract_fields(low_quality)

	        # Same truthiness precondition as before, but reported instead of
	        # being silently treated as success.
	        if not (fields_high.document_number and fields_low.document_number):
	            pytest.skip(
	                "document number not extracted from both samples; "
	                "confidence comparison not possible"
	            )

	        assert (
	            fields_high.document_number.confidence
	            >= fields_low.document_number.confidence
	        )