Spaces:
Paused
Paused
feat(api): fast FastAPI app + model loader refactor; add mock mode for tests

- Add pyproject + setuptools config and console entrypoint
- Implement enhanced field extraction + MRZ heuristics
- Add response builder with compatibility for legacy MRZ fields
- New preprocessing pipeline for PDFs/images
- HF Spaces GPU: cache ENV, optional flash-attn, configurable base image
- Add Make targets for Spaces GPU and local CPU
- Add httpx for TestClient; tests pass in mock mode
- Remove embedded model files and legacy app/modules
211e423
| """Tests for field extraction functionality.""" | |
| import pytest | |
| from src.kybtech_dots_ocr.enhanced_field_extraction import EnhancedFieldExtractor | |
class TestEnhancedFieldExtractor:
    """Test cases for EnhancedFieldExtractor.

    Every test creates its own extractor, feeds it a literal text sample and
    checks attributes of the object returned by ``extract_fields`` or
    ``extract_mrz``.
    """

    def test_extract_fields_dutch_id(self):
        """Test field extraction with Dutch ID card text."""
        extractor = EnhancedFieldExtractor()
        text = """
        IDENTITEITSKAART
        Documentnummer: NLD123456789
        Achternaam: MULDER
        Voornamen: THOMAS JAN
        Nationaliteit: NLD
        Geboortedatum: 15-03-1990
        Geslacht: M
        """
        fields = extractor.extract_fields(text)

        # Presence is asserted before ``.value`` is read so that a missing
        # field fails as an assertion instead of an AttributeError.
        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"
        assert fields.given_names is not None
        assert fields.given_names.value == "THOMAS JAN"

    def test_extract_fields_english_id(self):
        """Test field extraction with English ID card text."""
        extractor = EnhancedFieldExtractor()
        text = """
        IDENTITY CARD
        Document Number: NLD123456789
        Surname: MULDER
        Given Names: THOMAS JAN
        Nationality: NLD
        Date of Birth: 15-03-1990
        Gender: M
        """
        fields = extractor.extract_fields(text)

        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"

    def test_extract_mrz_data(self):
        """Test MRZ data extraction."""
        extractor = EnhancedFieldExtractor()
        # NOTE(review): the second line does not look like a strict ICAO 9303
        # TD3 line (the document-number field there is 9 characters plus a
        # check digit); presumably the extractor's heuristics tolerate this
        # sample -- confirm the fixture is intentional.
        text = """
        P<NLDMULDER<<THOMAS<<<<<<<<<<<<<<<<<<<<<<<<<
        NLD123456789NLD9003151M300101123456789<<<<<<<<
        """
        mrz_data = extractor.extract_mrz(text)

        assert mrz_data is not None
        assert mrz_data.format_type == "TD3"
        assert mrz_data.confidence > 0.8

    def test_extract_fields_empty_text(self):
        """Test field extraction with empty text."""
        extractor = EnhancedFieldExtractor()
        fields = extractor.extract_fields("")

        # Should return empty fields
        assert fields.document_number is None
        assert fields.surname is None

    def test_confidence_scoring(self):
        """Test confidence scoring functionality.

        Compares the document-number confidence obtained from a fully
        labelled sample with the one obtained from an abbreviated sample.
        The comparison is only possible when both samples produce a document
        number; otherwise the test is reported as skipped.
        """
        extractor = EnhancedFieldExtractor()

        # High quality text
        high_quality = "Documentnummer: NLD123456789 Achternaam: MULDER"
        fields_high = extractor.extract_fields(high_quality)

        # Lower quality text
        low_quality = "doc nr: NLD123"
        fields_low = extractor.extract_fields(low_quality)

        high_doc = fields_high.document_number
        low_doc = fields_low.document_number
        if not (high_doc and low_doc):
            # Previously this case fell through and the test passed without
            # asserting anything; skipping makes the missing check visible
            # in the test report without introducing a new failure.
            pytest.skip(
                "document number not extracted from both samples; "
                "confidence comparison not possible"
            )

        assert high_doc.confidence >= low_doc.confidence