tommulder committed on
Commit
5537ceb
·
1 Parent(s): d405999

style: format Python files with Black

Browse files
src/kybtech_dots_ocr/__init__.py CHANGED
@@ -8,7 +8,13 @@ __author__ = "Algoryn"
8
  __email__ = "info@algoryn.com"
9
 
10
  from .app import app
11
- from .api_models import OCRResponse, OCRDetection, ExtractedFields, MRZData, ExtractedField
 
 
 
 
 
 
12
  from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
13
  from .preprocessing import process_document, validate_file_size, get_document_info
14
  from .response_builder import build_ocr_response, build_error_response
@@ -16,7 +22,7 @@ from .response_builder import build_ocr_response, build_error_response
16
  __all__ = [
17
  "app",
18
  "OCRResponse",
19
- "OCRDetection",
20
  "ExtractedFields",
21
  "MRZData",
22
  "ExtractedField",
 
8
  __email__ = "info@algoryn.com"
9
 
10
  from .app import app
11
+ from .api_models import (
12
+ OCRResponse,
13
+ OCRDetection,
14
+ ExtractedFields,
15
+ MRZData,
16
+ ExtractedField,
17
+ )
18
  from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
19
  from .preprocessing import process_document, validate_file_size, get_document_info
20
  from .response_builder import build_ocr_response, build_error_response
 
22
  __all__ = [
23
  "app",
24
  "OCRResponse",
25
+ "OCRDetection",
26
  "ExtractedFields",
27
  "MRZData",
28
  "ExtractedField",
src/kybtech_dots_ocr/api_models.py CHANGED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
10
 
11
  class BoundingBox(BaseModel):
12
  """Normalized bounding box coordinates."""
 
13
  x1: float = Field(..., ge=0.0, le=1.0, description="Top-left x coordinate")
14
  y1: float = Field(..., ge=0.0, le=1.0, description="Top-left y coordinate")
15
  x2: float = Field(..., ge=0.0, le=1.0, description="Bottom-right x coordinate")
@@ -18,6 +19,7 @@ class BoundingBox(BaseModel):
18
 
19
  class ExtractedField(BaseModel):
20
  """Individual extracted field with confidence and source."""
 
21
  field_name: str = Field(..., description="Standardized field name")
22
  value: Optional[str] = Field(None, description="Extracted field value")
23
  confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence")
@@ -26,10 +28,19 @@ class ExtractedField(BaseModel):
26
 
27
  class IdCardFields(BaseModel):
28
  """Structured fields extracted from identity documents."""
29
- document_number: Optional[ExtractedField] = Field(None, description="Document number/ID")
30
- document_type: Optional[ExtractedField] = Field(None, description="Type of document")
31
- issuing_country: Optional[ExtractedField] = Field(None, description="Issuing country code")
32
- issuing_authority: Optional[ExtractedField] = Field(None, description="Issuing authority")
 
 
 
 
 
 
 
 
 
33
 
34
  # Personal Information
35
  surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
@@ -42,15 +53,22 @@ class IdCardFields(BaseModel):
42
  # Validity Information
43
  date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
44
  date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
45
- personal_number: Optional[ExtractedField] = Field(None, description="Personal number")
 
 
46
 
47
  # Additional fields for specific document types
48
- optional_data_1: Optional[ExtractedField] = Field(None, description="Optional data field 1")
49
- optional_data_2: Optional[ExtractedField] = Field(None, description="Optional data field 2")
 
 
 
 
50
 
51
 
52
  class ExtractedFields(BaseModel):
53
  """All extracted fields from identity document."""
 
54
  document_number: Optional[ExtractedField] = None
55
  document_type: Optional[ExtractedField] = None
56
  issuing_country: Optional[ExtractedField] = None
@@ -70,8 +88,11 @@ class ExtractedFields(BaseModel):
70
 
71
  class MRZData(BaseModel):
72
  """Machine Readable Zone data."""
 
73
  # Primary canonical fields
74
- document_type: Optional[str] = Field(None, description="MRZ document type (TD1|TD2|TD3)")
 
 
75
  issuing_country: Optional[str] = Field(None, description="Issuing country code")
76
  surname: Optional[str] = Field(None, description="Surname from MRZ")
77
  given_names: Optional[str] = Field(None, description="Given names from MRZ")
@@ -82,22 +103,30 @@ class MRZData(BaseModel):
82
  date_of_expiry: Optional[str] = Field(None, description="Date of expiry from MRZ")
83
  personal_number: Optional[str] = Field(None, description="Personal number from MRZ")
84
  raw_mrz: Optional[str] = Field(None, description="Raw MRZ text")
85
- confidence: float = Field(0.0, ge=0.0, le=1.0, description="MRZ extraction confidence")
 
 
86
 
87
  # Backwards compatibility fields (some older code/tests expect these names)
88
  # These duplicate information from the canonical fields above.
89
- format_type: Optional[str] = Field(None, description="Alias of document_type for backward compatibility")
90
- raw_text: Optional[str] = Field(None, description="Alias of raw_mrz for backward compatibility")
 
 
 
 
91
 
92
 
93
  class OCRDetection(BaseModel):
94
  """Single OCR detection result."""
 
95
  mrz_data: Optional[MRZData] = Field(None, description="MRZ data if detected")
96
  extracted_fields: ExtractedFields = Field(..., description="Extracted field data")
97
 
98
 
99
  class OCRResponse(BaseModel):
100
  """OCR API response."""
 
101
  request_id: str = Field(..., description="Unique request identifier")
102
  media_type: str = Field(..., description="Media type processed")
103
  processing_time: float = Field(..., description="Processing time in seconds")
 
10
 
11
  class BoundingBox(BaseModel):
12
  """Normalized bounding box coordinates."""
13
+
14
  x1: float = Field(..., ge=0.0, le=1.0, description="Top-left x coordinate")
15
  y1: float = Field(..., ge=0.0, le=1.0, description="Top-left y coordinate")
16
  x2: float = Field(..., ge=0.0, le=1.0, description="Bottom-right x coordinate")
 
19
 
20
  class ExtractedField(BaseModel):
21
  """Individual extracted field with confidence and source."""
22
+
23
  field_name: str = Field(..., description="Standardized field name")
24
  value: Optional[str] = Field(None, description="Extracted field value")
25
  confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence")
 
28
 
29
  class IdCardFields(BaseModel):
30
  """Structured fields extracted from identity documents."""
31
+
32
+ document_number: Optional[ExtractedField] = Field(
33
+ None, description="Document number/ID"
34
+ )
35
+ document_type: Optional[ExtractedField] = Field(
36
+ None, description="Type of document"
37
+ )
38
+ issuing_country: Optional[ExtractedField] = Field(
39
+ None, description="Issuing country code"
40
+ )
41
+ issuing_authority: Optional[ExtractedField] = Field(
42
+ None, description="Issuing authority"
43
+ )
44
 
45
  # Personal Information
46
  surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
 
53
  # Validity Information
54
  date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
55
  date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
56
+ personal_number: Optional[ExtractedField] = Field(
57
+ None, description="Personal number"
58
+ )
59
 
60
  # Additional fields for specific document types
61
+ optional_data_1: Optional[ExtractedField] = Field(
62
+ None, description="Optional data field 1"
63
+ )
64
+ optional_data_2: Optional[ExtractedField] = Field(
65
+ None, description="Optional data field 2"
66
+ )
67
 
68
 
69
  class ExtractedFields(BaseModel):
70
  """All extracted fields from identity document."""
71
+
72
  document_number: Optional[ExtractedField] = None
73
  document_type: Optional[ExtractedField] = None
74
  issuing_country: Optional[ExtractedField] = None
 
88
 
89
  class MRZData(BaseModel):
90
  """Machine Readable Zone data."""
91
+
92
  # Primary canonical fields
93
+ document_type: Optional[str] = Field(
94
+ None, description="MRZ document type (TD1|TD2|TD3)"
95
+ )
96
  issuing_country: Optional[str] = Field(None, description="Issuing country code")
97
  surname: Optional[str] = Field(None, description="Surname from MRZ")
98
  given_names: Optional[str] = Field(None, description="Given names from MRZ")
 
103
  date_of_expiry: Optional[str] = Field(None, description="Date of expiry from MRZ")
104
  personal_number: Optional[str] = Field(None, description="Personal number from MRZ")
105
  raw_mrz: Optional[str] = Field(None, description="Raw MRZ text")
106
+ confidence: float = Field(
107
+ 0.0, ge=0.0, le=1.0, description="MRZ extraction confidence"
108
+ )
109
 
110
  # Backwards compatibility fields (some older code/tests expect these names)
111
  # These duplicate information from the canonical fields above.
112
+ format_type: Optional[str] = Field(
113
+ None, description="Alias of document_type for backward compatibility"
114
+ )
115
+ raw_text: Optional[str] = Field(
116
+ None, description="Alias of raw_mrz for backward compatibility"
117
+ )
118
 
119
 
120
  class OCRDetection(BaseModel):
121
  """Single OCR detection result."""
122
+
123
  mrz_data: Optional[MRZData] = Field(None, description="MRZ data if detected")
124
  extracted_fields: ExtractedFields = Field(..., description="Extracted field data")
125
 
126
 
127
  class OCRResponse(BaseModel):
128
  """OCR API response."""
129
+
130
  request_id: str = Field(..., description="Unique request identifier")
131
  media_type: str = Field(..., description="Media type processed")
132
  processing_time: float = Field(..., description="Processing time in seconds")
src/kybtech_dots_ocr/app.py CHANGED
@@ -17,7 +17,14 @@ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
17
  from fastapi.responses import JSONResponse
18
 
19
  # Import local modules
20
- from .api_models import BoundingBox, ExtractedField, ExtractedFields, MRZData, OCRDetection, OCRResponse
 
 
 
 
 
 
 
21
  from .enhanced_field_extraction import EnhancedFieldExtractor
22
  from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
23
  from .preprocessing import process_document, validate_file_size, get_document_info
@@ -27,6 +34,13 @@ from .response_builder import build_ocr_response, build_error_response
27
  logging.basicConfig(level=logging.INFO)
28
  logger = logging.getLogger(__name__)
29
 
 
 
 
 
 
 
 
30
  # Global model state
31
  model_loaded = False
32
 
@@ -34,13 +48,11 @@ model_loaded = False
34
  # FieldExtractor is now imported from the shared module
35
 
36
 
37
-
38
-
39
  @asynccontextmanager
40
  async def lifespan(app: FastAPI):
41
  """Application lifespan manager for model loading."""
42
  global model_loaded
43
-
44
  # Allow tests and lightweight environments to skip model loading
45
  # Set DOTS_OCR_SKIP_MODEL_LOAD=1 to bypass heavy downloads during tests/CI
46
  skip_model_load = os.getenv("DOTS_OCR_SKIP_MODEL_LOAD", "0") == "1"
@@ -50,25 +62,27 @@ async def lifespan(app: FastAPI):
50
  if skip_model_load:
51
  # Explicitly skip model loading for fast startup in tests/CI
52
  model_loaded = False
53
- logger.warning("DOTS_OCR_SKIP_MODEL_LOAD=1 set - skipping model load (mock mode)")
 
 
54
  else:
55
  # Load the model using the new model loader
56
  load_model()
57
  model_loaded = True
58
  logger.info("Dots.OCR model loaded successfully")
59
-
60
  # Log model information
61
  model_info = get_model_info()
62
  logger.info(f"Model info: {model_info}")
63
-
64
  except Exception as e:
65
  logger.error(f"Failed to load Dots.OCR model: {e}")
66
  # Don't raise - allow mock mode for development
67
  model_loaded = False
68
  logger.warning("Model loading failed - using mock implementation")
69
-
70
  yield
71
-
72
  logger.info("Shutting down Dots.OCR endpoint...")
73
 
74
 
@@ -76,61 +90,79 @@ app = FastAPI(
76
  title="KYB Dots.OCR Text Extraction",
77
  description="Dots.OCR for identity document text extraction with ROI support",
78
  version="1.0.0",
79
- lifespan=lifespan
80
  )
81
 
82
 
83
  @app.get("/")
84
  async def root():
85
  """Root route for uptime checks."""
86
- return {"status": "ok", "service": "kybtech-dots-ocr", "version": "1.0.0"}
87
 
88
 
89
  @app.get("/health")
90
  async def health_check():
91
  """Health check endpoint."""
92
  global model_loaded
93
-
94
  status = "healthy" if model_loaded else "degraded"
95
  model_info = get_model_info() if model_loaded else None
96
-
97
  return {
98
- "status": status,
99
  "version": "1.0.0",
100
  "model_loaded": model_loaded,
101
- "model_info": model_info
102
  }
103
 
104
 
105
  @app.post("/v1/id/ocr", response_model=OCRResponse)
106
  async def extract_text_endpoint(
107
  file: UploadFile = File(..., description="Image or PDF file to process"),
108
- roi: Optional[str] = Form(None, description="ROI coordinates as JSON string")
 
 
 
 
 
 
109
  ):
110
  """Extract text from identity document image or PDF."""
111
  global model_loaded
112
-
113
  # Allow mock mode when model isn't loaded to support tests/CI and dev flows
114
  allow_mock = os.getenv("DOTS_OCR_ALLOW_MOCK", "1") == "1"
115
  is_mock_mode = (not model_loaded) and allow_mock
116
  if not model_loaded and not allow_mock:
117
  raise HTTPException(status_code=503, detail="Model not loaded")
118
-
 
 
 
 
 
 
 
 
 
 
 
 
119
  start_time = time.time()
120
  request_id = str(uuid.uuid4())
121
-
122
  try:
123
  # Read file data
124
  file_data = await file.read()
125
-
126
  # Validate file size
127
  if not validate_file_size(file_data):
128
  raise HTTPException(status_code=413, detail="File size exceeds limit")
129
-
130
  # Get document information
131
  doc_info = get_document_info(file_data)
132
  logger.info(f"Processing document: {doc_info}")
133
-
134
  # Parse ROI if provided
135
  roi_coords = None
136
  if roi:
@@ -142,19 +174,21 @@ async def extract_text_endpoint(
142
  except Exception as e:
143
  logger.warning(f"Invalid ROI provided: {e}")
144
  raise HTTPException(status_code=400, detail=f"Invalid ROI format: {e}")
145
-
146
  # Process document (PDF to images or single image)
147
  try:
148
  processed_images = process_document(file_data, roi_coords)
149
  logger.info(f"Processed {len(processed_images)} images from document")
150
  except Exception as e:
151
  logger.error(f"Document processing failed: {e}")
152
- raise HTTPException(status_code=400, detail=f"Document processing failed: {e}")
153
-
 
 
154
  # Process each image and extract text
155
  ocr_texts = []
156
  page_metadata = []
157
-
158
  for i, image in enumerate(processed_images):
159
  try:
160
  # Extract text using the loaded model, or produce mock output in mock mode
@@ -163,47 +197,50 @@ async def extract_text_endpoint(
163
  ocr_text = ""
164
  else:
165
  ocr_text = extract_text(image)
166
- logger.info(f"Page {i + 1} - Extracted text length: {len(ocr_text)} characters")
167
-
 
 
168
  ocr_texts.append(ocr_text)
169
-
170
  # Collect page metadata
171
  page_meta = {
172
  "page_index": i,
173
  "image_size": image.size,
174
  "text_length": len(ocr_text),
175
- "processing_successful": True
176
  }
177
  page_metadata.append(page_meta)
178
-
179
  except Exception as e:
180
  logger.error(f"Text extraction failed for page {i + 1}: {e}")
181
  # Add empty text for failed page
182
  ocr_texts.append("")
183
-
184
  page_meta = {
185
  "page_index": i,
186
- "image_size": image.size if hasattr(image, 'size') else (0, 0),
187
  "text_length": 0,
188
  "processing_successful": False,
189
- "error": str(e)
190
  }
191
  page_metadata.append(page_meta)
192
-
193
  # Determine media type for response
194
  media_type = "pdf" if doc_info["is_pdf"] else "image"
195
-
196
  processing_time = time.time() - start_time
197
-
198
  # Build response using the response builder
199
  return build_ocr_response(
200
  request_id=request_id,
201
  media_type=media_type,
202
  processing_time=processing_time,
203
  ocr_texts=ocr_texts,
204
- page_metadata=page_metadata
 
205
  )
206
-
207
  except HTTPException:
208
  # Re-raise HTTP exceptions as-is
209
  raise
@@ -213,11 +250,12 @@ async def extract_text_endpoint(
213
  error_response = build_error_response(
214
  request_id=request_id,
215
  error_message=f"OCR extraction failed: {str(e)}",
216
- processing_time=processing_time
217
  )
218
  raise HTTPException(status_code=500, detail=error_response.dict())
219
 
220
 
221
  if __name__ == "__main__":
222
  import uvicorn
 
223
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
17
  from fastapi.responses import JSONResponse
18
 
19
  # Import local modules
20
+ from .api_models import (
21
+ BoundingBox,
22
+ ExtractedField,
23
+ ExtractedFields,
24
+ MRZData,
25
+ OCRDetection,
26
+ OCRResponse,
27
+ )
28
  from .enhanced_field_extraction import EnhancedFieldExtractor
29
  from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
30
  from .preprocessing import process_document, validate_file_size, get_document_info
 
34
  logging.basicConfig(level=logging.INFO)
35
  logger = logging.getLogger(__name__)
36
 
37
+ # Enable verbose logging globally if DOTS_OCR_DEBUG env var is set.
38
+ _env_debug = os.getenv("DOTS_OCR_DEBUG", "0").lower() in {"1", "true", "yes"}
39
+ if _env_debug:
40
+ # Elevate root logger to DEBUG to include lower-level events from submodules
41
+ logging.getLogger().setLevel(logging.DEBUG)
42
+ logger.info("DOTS_OCR_DEBUG enabled via environment — verbose logging active")
43
+
44
  # Global model state
45
  model_loaded = False
46
 
 
48
  # FieldExtractor is now imported from the shared module
49
 
50
 
 
 
51
  @asynccontextmanager
52
  async def lifespan(app: FastAPI):
53
  """Application lifespan manager for model loading."""
54
  global model_loaded
55
+
56
  # Allow tests and lightweight environments to skip model loading
57
  # Set DOTS_OCR_SKIP_MODEL_LOAD=1 to bypass heavy downloads during tests/CI
58
  skip_model_load = os.getenv("DOTS_OCR_SKIP_MODEL_LOAD", "0") == "1"
 
62
  if skip_model_load:
63
  # Explicitly skip model loading for fast startup in tests/CI
64
  model_loaded = False
65
+ logger.warning(
66
+ "DOTS_OCR_SKIP_MODEL_LOAD=1 set - skipping model load (mock mode)"
67
+ )
68
  else:
69
  # Load the model using the new model loader
70
  load_model()
71
  model_loaded = True
72
  logger.info("Dots.OCR model loaded successfully")
73
+
74
  # Log model information
75
  model_info = get_model_info()
76
  logger.info(f"Model info: {model_info}")
77
+
78
  except Exception as e:
79
  logger.error(f"Failed to load Dots.OCR model: {e}")
80
  # Don't raise - allow mock mode for development
81
  model_loaded = False
82
  logger.warning("Model loading failed - using mock implementation")
83
+
84
  yield
85
+
86
  logger.info("Shutting down Dots.OCR endpoint...")
87
 
88
 
 
90
  title="KYB Dots.OCR Text Extraction",
91
  description="Dots.OCR for identity document text extraction with ROI support",
92
  version="1.0.0",
93
+ lifespan=lifespan,
94
  )
95
 
96
 
97
  @app.get("/")
98
  async def root():
99
  """Root route for uptime checks."""
100
+ return {"status": "ok"}
101
 
102
 
103
  @app.get("/health")
104
  async def health_check():
105
  """Health check endpoint."""
106
  global model_loaded
107
+
108
  status = "healthy" if model_loaded else "degraded"
109
  model_info = get_model_info() if model_loaded else None
110
+
111
  return {
112
+ "status": status,
113
  "version": "1.0.0",
114
  "model_loaded": model_loaded,
115
+ "model_info": model_info,
116
  }
117
 
118
 
119
  @app.post("/v1/id/ocr", response_model=OCRResponse)
120
  async def extract_text_endpoint(
121
  file: UploadFile = File(..., description="Image or PDF file to process"),
122
+ roi: Optional[str] = Form(None, description="ROI coordinates as JSON string"),
123
+ debug: Optional[bool] = Form(
124
+ None,
125
+ description=(
126
+ "Enable verbose debug logging for this request. Overrides env when True."
127
+ ),
128
+ ),
129
  ):
130
  """Extract text from identity document image or PDF."""
131
  global model_loaded
132
+
133
  # Allow mock mode when model isn't loaded to support tests/CI and dev flows
134
  allow_mock = os.getenv("DOTS_OCR_ALLOW_MOCK", "1") == "1"
135
  is_mock_mode = (not model_loaded) and allow_mock
136
  if not model_loaded and not allow_mock:
137
  raise HTTPException(status_code=503, detail="Model not loaded")
138
+
139
+ # Determine effective debug mode for this request
140
+ env_debug = os.getenv("DOTS_OCR_DEBUG", "0").lower() in {"1", "true", "yes"}
141
+ debug_enabled = bool(debug) if debug is not None else env_debug
142
+ if debug_enabled:
143
+ logger.info(
144
+ f"[debug] Request {request_id}: debug logging enabled (env={env_debug}, form={debug})"
145
+ )
146
+ if is_mock_mode:
147
+ logger.warning(
148
+ "Using mock mode — OCR text will be empty. To enable real inference, ensure the model loads successfully (unset DOTS_OCR_SKIP_MODEL_LOAD and provide resources)."
149
+ )
150
+
151
  start_time = time.time()
152
  request_id = str(uuid.uuid4())
153
+
154
  try:
155
  # Read file data
156
  file_data = await file.read()
157
+
158
  # Validate file size
159
  if not validate_file_size(file_data):
160
  raise HTTPException(status_code=413, detail="File size exceeds limit")
161
+
162
  # Get document information
163
  doc_info = get_document_info(file_data)
164
  logger.info(f"Processing document: {doc_info}")
165
+
166
  # Parse ROI if provided
167
  roi_coords = None
168
  if roi:
 
174
  except Exception as e:
175
  logger.warning(f"Invalid ROI provided: {e}")
176
  raise HTTPException(status_code=400, detail=f"Invalid ROI format: {e}")
177
+
178
  # Process document (PDF to images or single image)
179
  try:
180
  processed_images = process_document(file_data, roi_coords)
181
  logger.info(f"Processed {len(processed_images)} images from document")
182
  except Exception as e:
183
  logger.error(f"Document processing failed: {e}")
184
+ raise HTTPException(
185
+ status_code=400, detail=f"Document processing failed: {e}"
186
+ )
187
+
188
  # Process each image and extract text
189
  ocr_texts = []
190
  page_metadata = []
191
+
192
  for i, image in enumerate(processed_images):
193
  try:
194
  # Extract text using the loaded model, or produce mock output in mock mode
 
197
  ocr_text = ""
198
  else:
199
  ocr_text = extract_text(image)
200
+ logger.info(
201
+ f"Page {i + 1} - Extracted text length: {len(ocr_text)} characters"
202
+ )
203
+
204
  ocr_texts.append(ocr_text)
205
+
206
  # Collect page metadata
207
  page_meta = {
208
  "page_index": i,
209
  "image_size": image.size,
210
  "text_length": len(ocr_text),
211
+ "processing_successful": True,
212
  }
213
  page_metadata.append(page_meta)
214
+
215
  except Exception as e:
216
  logger.error(f"Text extraction failed for page {i + 1}: {e}")
217
  # Add empty text for failed page
218
  ocr_texts.append("")
219
+
220
  page_meta = {
221
  "page_index": i,
222
+ "image_size": image.size if hasattr(image, "size") else (0, 0),
223
  "text_length": 0,
224
  "processing_successful": False,
225
+ "error": str(e),
226
  }
227
  page_metadata.append(page_meta)
228
+
229
  # Determine media type for response
230
  media_type = "pdf" if doc_info["is_pdf"] else "image"
231
+
232
  processing_time = time.time() - start_time
233
+
234
  # Build response using the response builder
235
  return build_ocr_response(
236
  request_id=request_id,
237
  media_type=media_type,
238
  processing_time=processing_time,
239
  ocr_texts=ocr_texts,
240
+ page_metadata=page_metadata,
241
+ debug=debug_enabled,
242
  )
243
+
244
  except HTTPException:
245
  # Re-raise HTTP exceptions as-is
246
  raise
 
250
  error_response = build_error_response(
251
  request_id=request_id,
252
  error_message=f"OCR extraction failed: {str(e)}",
253
+ processing_time=processing_time,
254
  )
255
  raise HTTPException(status_code=500, detail=error_response.dict())
256
 
257
 
258
  if __name__ == "__main__":
259
  import uvicorn
260
+
261
  uvicorn.run(app, host="0.0.0.0", port=7860)
src/kybtech_dots_ocr/enhanced_field_extraction.py CHANGED
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
16
 
17
  class EnhancedFieldExtractor:
18
  """Enhanced field extraction with improved confidence scoring and validation."""
19
-
20
  # Enhanced field mapping patterns with confidence scoring
21
  FIELD_PATTERNS = {
22
  "document_number": [
@@ -35,7 +35,10 @@ class EnhancedFieldExtractor:
35
  ],
36
  "given_names": [
37
  (r"^\s*voornamen[:\s]*([^\r\n]+)", 0.95), # Dutch format (line-anchored)
38
- (r"^\s*given\s*names[:\s]*([^\r\n]+)", 0.9), # English format (line-anchored)
 
 
 
39
  (r"^\s*first\s*name[:\s]*([^\r\n]+)", 0.85), # First name only
40
  (r"^\s*voorletters[:\s]*([^\r\n]+)", 0.75), # Dutch initials
41
  ],
@@ -46,7 +49,10 @@ class EnhancedFieldExtractor:
46
  ],
47
  "date_of_birth": [
48
  (r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9), # Dutch format
49
- (r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.85), # English format
 
 
 
50
  (r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8), # Short English
51
  (r"(\d{2}[./-]\d{2}[./-]\d{4})", 0.6), # Generic date pattern
52
  ],
@@ -64,14 +70,23 @@ class EnhancedFieldExtractor:
64
  ],
65
  "date_of_issue": [
66
  (r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9), # Dutch format
67
- (r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.85), # English format
 
 
 
68
  (r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8), # Short English
69
  ],
70
  "date_of_expiry": [
71
  (r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9), # Dutch format
72
- (r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.85), # English format
 
 
 
73
  (r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8), # Short English
74
- (r"valid\s*until[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8), # Alternative English
 
 
 
75
  ],
76
  "personal_number": [
77
  (r"persoonsnummer[:\s]*(\d{9})", 0.9), # Dutch format
@@ -95,39 +110,48 @@ class EnhancedFieldExtractor:
95
  (r"issuing\s*authority[:\s]*([A-Za-z\s]{3,30})", 0.8), # English format
96
  (r"uitgevende\s*autoriteit[:\s]*([A-Za-z\s]{3,30})", 0.9), # Dutch format
97
  (r"authority[:\s]*([A-Za-z\s]{3,30})", 0.7), # Short format
98
- ]
99
  }
100
-
101
  # MRZ patterns with confidence scoring
102
  MRZ_PATTERNS = [
103
  # Strict formats first, allowing leading/trailing whitespace per line
104
- (r"^\s*((?:[A-Z0-9<]{44})\s*\n\s*(?:[A-Z0-9<]{44}))\s*$", 0.95), # TD3: Passport (2 x 44)
105
- (r"^\s*((?:[A-Z0-9<]{36})\s*\n\s*(?:[A-Z0-9<]{36}))\s*$", 0.9), # TD2: ID card (2 x 36)
106
- (r"^\s*((?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30}))\s*$", 0.85), # TD1: (3 x 30)
 
 
 
 
 
 
 
 
 
107
  # Fallback generic: a line starting with P< followed by another MRZ-like line
108
  (r"(P<[^\r\n]+\n[^\r\n]+)", 0.85),
109
  ]
110
-
111
  @classmethod
112
  def extract_fields(cls, ocr_text: str) -> IdCardFields:
113
  """Extract structured fields from OCR text with enhanced confidence scoring.
114
-
115
  Args:
116
  ocr_text: Raw OCR text from document processing
117
-
118
  Returns:
119
  IdCardFields object with extracted field data
120
  """
121
  logger.info(f"Extracting fields from text of length: {len(ocr_text)}")
122
-
123
  fields = {}
124
  extraction_stats = {"total_patterns": 0, "matches_found": 0}
125
-
126
  for field_name, patterns in cls.FIELD_PATTERNS.items():
127
  value = None
128
  confidence = 0.0
129
  best_pattern = None
130
-
131
  for pattern, base_confidence in patterns:
132
  extraction_stats["total_patterns"] += 1
133
  match = re.search(pattern, ocr_text, re.IGNORECASE | re.MULTILINE)
@@ -139,37 +163,43 @@ class EnhancedFieldExtractor:
139
  confidence = base_confidence
140
  best_pattern = pattern
141
  extraction_stats["matches_found"] += 1
142
- logger.debug(f"Found {field_name}: '{value}' (confidence: {confidence:.2f})")
 
 
143
  break
144
-
145
  if value:
146
  # Apply additional confidence adjustments
147
- confidence = cls._adjust_confidence(field_name, value, confidence, ocr_text)
148
-
 
 
149
  fields[field_name] = ExtractedField(
150
  field_name=field_name,
151
  value=value,
152
  confidence=confidence,
153
- source="ocr"
154
  )
155
-
156
- logger.info(f"Field extraction complete: {extraction_stats['matches_found']}/{extraction_stats['total_patterns']} patterns matched")
 
 
157
  return IdCardFields(**fields)
158
-
159
  @classmethod
160
  def _validate_field_value(cls, field_name: str, value: str) -> bool:
161
  """Validate extracted field value based on field type.
162
-
163
  Args:
164
  field_name: Name of the field
165
  value: Extracted value to validate
166
-
167
  Returns:
168
  True if value is valid
169
  """
170
  if not value or len(value.strip()) == 0:
171
  return False
172
-
173
  # Field-specific validation
174
  if field_name == "document_number":
175
  return len(value) >= 6 and len(value) <= 15
@@ -185,16 +215,16 @@ class EnhancedFieldExtractor:
185
  return len(value) == 9 and value.isdigit()
186
  elif field_name == "issuing_country":
187
  return len(value) == 3 and value.isalpha()
188
-
189
  return True
190
-
191
  @classmethod
192
  def _validate_date_format(cls, date_str: str) -> bool:
193
  """Validate date format and basic date logic.
194
-
195
  Args:
196
  date_str: Date string to validate
197
-
198
  Returns:
199
  True if date format is valid
200
  """
@@ -206,59 +236,63 @@ class EnhancedFieldExtractor:
206
  if len(parts) == 3:
207
  day, month, year = parts
208
  # Basic validation
209
- if (1 <= int(day) <= 31 and
210
- 1 <= int(month) <= 12 and
211
- 1900 <= int(year) <= 2100):
 
 
212
  return True
213
  except (ValueError, IndexError):
214
  pass
215
  return False
216
-
217
  @classmethod
218
- def _adjust_confidence(cls, field_name: str, value: str, base_confidence: float, full_text: str) -> float:
 
 
219
  """Adjust confidence based on additional factors.
220
-
221
  Args:
222
  field_name: Name of the field
223
  value: Extracted value
224
  base_confidence: Base confidence from pattern matching
225
  full_text: Full OCR text for context
226
-
227
  Returns:
228
  Adjusted confidence score
229
  """
230
  confidence = base_confidence
231
-
232
  # Length-based adjustments
233
  if field_name in ["surname", "given_names"] and len(value) < 3:
234
  confidence *= 0.8 # Shorter names are less reliable
235
-
236
  # Context-based adjustments
237
  if field_name == "document_number" and "passport" in full_text.lower():
238
  confidence *= 1.1 # Higher confidence in passport context
239
-
240
  # Multiple occurrence bonus
241
  if value in full_text and full_text.count(value) > 1:
242
  confidence *= 1.05 # Slight bonus for repeated values
243
-
244
  # Ensure confidence stays within bounds
245
  return min(max(confidence, 0.0), 1.0)
246
-
247
  @classmethod
248
  def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
249
  """Extract MRZ data from OCR text with enhanced validation.
250
-
251
  Args:
252
  ocr_text: Raw OCR text from document processing
253
-
254
  Returns:
255
  MRZData object if MRZ detected, None otherwise
256
  """
257
  logger.info("Extracting MRZ data from OCR text")
258
-
259
  best_match = None
260
  best_confidence = 0.0
261
-
262
  for pattern, base_confidence in cls.MRZ_PATTERNS:
263
  match = re.search(pattern, ocr_text, re.MULTILINE)
264
  if match:
@@ -268,23 +302,24 @@ class EnhancedFieldExtractor:
268
  confidence = base_confidence
269
  # Adjust confidence based on MRZ quality
270
  confidence = cls._adjust_mrz_confidence(raw_mrz, confidence)
271
-
272
  if confidence > best_confidence:
273
  best_match = raw_mrz
274
  best_confidence = confidence
275
  logger.debug(f"Found MRZ with confidence {confidence:.2f}")
276
-
277
  if best_match:
278
  # Parse MRZ to determine format type
279
  format_type = cls._determine_mrz_format(best_match)
280
-
281
  # Basic checksum validation
282
  is_valid, errors = cls._validate_mrz_checksums(best_match, format_type)
283
-
284
  logger.info(f"MRZ extracted: {format_type} format, valid: {is_valid}")
285
-
286
  # Convert to the format expected by the API
287
  from .api_models import MRZData as APIMRZData
 
288
  # Populate both canonical and legacy alias fields for compatibility
289
  return APIMRZData(
290
  document_type=format_type,
@@ -302,47 +337,47 @@ class EnhancedFieldExtractor:
302
  raw_text=best_match, # legacy alias
303
  confidence=best_confidence,
304
  )
305
-
306
  logger.info("No MRZ data found in OCR text")
307
  return None
308
-
309
  @classmethod
310
  def _validate_mrz_format(cls, mrz_text: str) -> bool:
311
  """Validate basic MRZ format.
312
-
313
  Args:
314
  mrz_text: Raw MRZ text
315
-
316
  Returns:
317
  True if format is valid
318
  """
319
- lines = mrz_text.strip().split('\n')
320
  if len(lines) < 2:
321
  return False
322
-
323
  # Normalize whitespace and validate character set only.
324
  normalized_lines = [re.sub(r"\s+", "", line) for line in lines]
325
  for line in normalized_lines:
326
- if not re.match(r'^[A-Z0-9<]+$', line):
327
  return False
328
-
329
  return True
330
-
331
  @classmethod
332
  def _determine_mrz_format(cls, mrz_text: str) -> str:
333
  """Determine MRZ format type.
334
-
335
  Args:
336
  mrz_text: Raw MRZ text
337
-
338
  Returns:
339
  Format type (TD1, TD2, TD3, etc.)
340
  """
341
- lines = mrz_text.strip().split('\n')
342
  lines = [re.sub(r"\s+", "", line) for line in lines]
343
  line_count = len(lines)
344
  line_length = len(lines[0]) if lines else 0
345
-
346
  # Heuristic mapping: prioritize semantics over exact lengths for robustness
347
  if line_count == 2 and lines[0].startswith("P<"):
348
  return "TD3" # Passport format commonly starts with P<
@@ -351,53 +386,56 @@ class EnhancedFieldExtractor:
351
  if line_count == 3:
352
  return "TD1"
353
  return "UNKNOWN"
354
-
355
  @classmethod
356
  def _adjust_mrz_confidence(cls, mrz_text: str, base_confidence: float) -> float:
357
  """Adjust MRZ confidence based on quality indicators.
358
-
359
  Args:
360
  mrz_text: Raw MRZ text
361
  base_confidence: Base confidence from pattern matching
362
-
363
  Returns:
364
  Adjusted confidence
365
  """
366
  confidence = base_confidence
367
-
368
  # Check line consistency
369
- lines = mrz_text.strip().split('\n')
370
  if len(set(len(line) for line in lines)) == 1:
371
  confidence *= 1.05 # Bonus for consistent line lengths
372
-
373
  return min(max(confidence, 0.0), 1.0)
374
-
375
  @classmethod
376
- def _validate_mrz_checksums(cls, mrz_text: str, format_type: str) -> Tuple[bool, List[str]]:
 
 
377
  """Validate MRZ checksums (simplified implementation).
378
-
379
  Args:
380
  mrz_text: Raw MRZ text
381
  format_type: MRZ format type
382
-
383
  Returns:
384
  Tuple of (is_valid, list_of_errors)
385
  """
386
  # This is a simplified implementation
387
  # In production, you would implement full MRZ checksum validation
388
  errors = []
389
-
390
  # Basic validation - check for reasonable character distribution
391
- if mrz_text.count('<') > len(mrz_text) * 0.3:
392
  errors.append("Too many fill characters")
393
-
394
  # For now, assume valid if basic format is correct
395
  is_valid = len(errors) == 0
396
-
397
  return is_valid, errors
398
 
399
 
400
  # Backward compatibility - use enhanced extractor as default
401
  class FieldExtractor(EnhancedFieldExtractor):
402
  """Backward compatible field extractor using enhanced implementation."""
 
403
  pass
 
16
 
17
  class EnhancedFieldExtractor:
18
  """Enhanced field extraction with improved confidence scoring and validation."""
19
+
20
  # Enhanced field mapping patterns with confidence scoring
21
  FIELD_PATTERNS = {
22
  "document_number": [
 
35
  ],
36
  "given_names": [
37
  (r"^\s*voornamen[:\s]*([^\r\n]+)", 0.95), # Dutch format (line-anchored)
38
+ (
39
+ r"^\s*given\s*names[:\s]*([^\r\n]+)",
40
+ 0.9,
41
+ ), # English format (line-anchored)
42
  (r"^\s*first\s*name[:\s]*([^\r\n]+)", 0.85), # First name only
43
  (r"^\s*voorletters[:\s]*([^\r\n]+)", 0.75), # Dutch initials
44
  ],
 
49
  ],
50
  "date_of_birth": [
51
  (r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9), # Dutch format
52
+ (
53
+ r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
54
+ 0.85,
55
+ ), # English format
56
  (r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8), # Short English
57
  (r"(\d{2}[./-]\d{2}[./-]\d{4})", 0.6), # Generic date pattern
58
  ],
 
70
  ],
71
  "date_of_issue": [
72
  (r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9), # Dutch format
73
+ (
74
+ r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
75
+ 0.85,
76
+ ), # English format
77
  (r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8), # Short English
78
  ],
79
  "date_of_expiry": [
80
  (r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9), # Dutch format
81
+ (
82
+ r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
83
+ 0.85,
84
+ ), # English format
85
  (r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8), # Short English
86
+ (
87
+ r"valid\s*until[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
88
+ 0.8,
89
+ ), # Alternative English
90
  ],
91
  "personal_number": [
92
  (r"persoonsnummer[:\s]*(\d{9})", 0.9), # Dutch format
 
110
  (r"issuing\s*authority[:\s]*([A-Za-z\s]{3,30})", 0.8), # English format
111
  (r"uitgevende\s*autoriteit[:\s]*([A-Za-z\s]{3,30})", 0.9), # Dutch format
112
  (r"authority[:\s]*([A-Za-z\s]{3,30})", 0.7), # Short format
113
+ ],
114
  }
115
+
116
  # MRZ patterns with confidence scoring
117
  MRZ_PATTERNS = [
118
  # Strict formats first, allowing leading/trailing whitespace per line
119
+ (
120
+ r"^\s*((?:[A-Z0-9<]{44})\s*\n\s*(?:[A-Z0-9<]{44}))\s*$",
121
+ 0.95,
122
+ ), # TD3: Passport (2 x 44)
123
+ (
124
+ r"^\s*((?:[A-Z0-9<]{36})\s*\n\s*(?:[A-Z0-9<]{36}))\s*$",
125
+ 0.9,
126
+ ), # TD2: ID card (2 x 36)
127
+ (
128
+ r"^\s*((?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30}))\s*$",
129
+ 0.85,
130
+ ), # TD1: (3 x 30)
131
  # Fallback generic: a line starting with P< followed by another MRZ-like line
132
  (r"(P<[^\r\n]+\n[^\r\n]+)", 0.85),
133
  ]
134
+
135
  @classmethod
136
  def extract_fields(cls, ocr_text: str) -> IdCardFields:
137
  """Extract structured fields from OCR text with enhanced confidence scoring.
138
+
139
  Args:
140
  ocr_text: Raw OCR text from document processing
141
+
142
  Returns:
143
  IdCardFields object with extracted field data
144
  """
145
  logger.info(f"Extracting fields from text of length: {len(ocr_text)}")
146
+
147
  fields = {}
148
  extraction_stats = {"total_patterns": 0, "matches_found": 0}
149
+
150
  for field_name, patterns in cls.FIELD_PATTERNS.items():
151
  value = None
152
  confidence = 0.0
153
  best_pattern = None
154
+
155
  for pattern, base_confidence in patterns:
156
  extraction_stats["total_patterns"] += 1
157
  match = re.search(pattern, ocr_text, re.IGNORECASE | re.MULTILINE)
 
163
  confidence = base_confidence
164
  best_pattern = pattern
165
  extraction_stats["matches_found"] += 1
166
+ logger.debug(
167
+ f"Found {field_name}: '{value}' (confidence: {confidence:.2f})"
168
+ )
169
  break
170
+
171
  if value:
172
  # Apply additional confidence adjustments
173
+ confidence = cls._adjust_confidence(
174
+ field_name, value, confidence, ocr_text
175
+ )
176
+
177
  fields[field_name] = ExtractedField(
178
  field_name=field_name,
179
  value=value,
180
  confidence=confidence,
181
+ source="ocr",
182
  )
183
+
184
+ logger.info(
185
+ f"Field extraction complete: {extraction_stats['matches_found']}/{extraction_stats['total_patterns']} patterns matched"
186
+ )
187
  return IdCardFields(**fields)
188
+
189
  @classmethod
190
  def _validate_field_value(cls, field_name: str, value: str) -> bool:
191
  """Validate extracted field value based on field type.
192
+
193
  Args:
194
  field_name: Name of the field
195
  value: Extracted value to validate
196
+
197
  Returns:
198
  True if value is valid
199
  """
200
  if not value or len(value.strip()) == 0:
201
  return False
202
+
203
  # Field-specific validation
204
  if field_name == "document_number":
205
  return len(value) >= 6 and len(value) <= 15
 
215
  return len(value) == 9 and value.isdigit()
216
  elif field_name == "issuing_country":
217
  return len(value) == 3 and value.isalpha()
218
+
219
  return True
220
+
221
  @classmethod
222
  def _validate_date_format(cls, date_str: str) -> bool:
223
  """Validate date format and basic date logic.
224
+
225
  Args:
226
  date_str: Date string to validate
227
+
228
  Returns:
229
  True if date format is valid
230
  """
 
236
  if len(parts) == 3:
237
  day, month, year = parts
238
  # Basic validation
239
+ if (
240
+ 1 <= int(day) <= 31
241
+ and 1 <= int(month) <= 12
242
+ and 1900 <= int(year) <= 2100
243
+ ):
244
  return True
245
  except (ValueError, IndexError):
246
  pass
247
  return False
248
+
249
  @classmethod
250
+ def _adjust_confidence(
251
+ cls, field_name: str, value: str, base_confidence: float, full_text: str
252
+ ) -> float:
253
  """Adjust confidence based on additional factors.
254
+
255
  Args:
256
  field_name: Name of the field
257
  value: Extracted value
258
  base_confidence: Base confidence from pattern matching
259
  full_text: Full OCR text for context
260
+
261
  Returns:
262
  Adjusted confidence score
263
  """
264
  confidence = base_confidence
265
+
266
  # Length-based adjustments
267
  if field_name in ["surname", "given_names"] and len(value) < 3:
268
  confidence *= 0.8 # Shorter names are less reliable
269
+
270
  # Context-based adjustments
271
  if field_name == "document_number" and "passport" in full_text.lower():
272
  confidence *= 1.1 # Higher confidence in passport context
273
+
274
  # Multiple occurrence bonus
275
  if value in full_text and full_text.count(value) > 1:
276
  confidence *= 1.05 # Slight bonus for repeated values
277
+
278
  # Ensure confidence stays within bounds
279
  return min(max(confidence, 0.0), 1.0)
280
+
281
  @classmethod
282
  def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
283
  """Extract MRZ data from OCR text with enhanced validation.
284
+
285
  Args:
286
  ocr_text: Raw OCR text from document processing
287
+
288
  Returns:
289
  MRZData object if MRZ detected, None otherwise
290
  """
291
  logger.info("Extracting MRZ data from OCR text")
292
+
293
  best_match = None
294
  best_confidence = 0.0
295
+
296
  for pattern, base_confidence in cls.MRZ_PATTERNS:
297
  match = re.search(pattern, ocr_text, re.MULTILINE)
298
  if match:
 
302
  confidence = base_confidence
303
  # Adjust confidence based on MRZ quality
304
  confidence = cls._adjust_mrz_confidence(raw_mrz, confidence)
305
+
306
  if confidence > best_confidence:
307
  best_match = raw_mrz
308
  best_confidence = confidence
309
  logger.debug(f"Found MRZ with confidence {confidence:.2f}")
310
+
311
  if best_match:
312
  # Parse MRZ to determine format type
313
  format_type = cls._determine_mrz_format(best_match)
314
+
315
  # Basic checksum validation
316
  is_valid, errors = cls._validate_mrz_checksums(best_match, format_type)
317
+
318
  logger.info(f"MRZ extracted: {format_type} format, valid: {is_valid}")
319
+
320
  # Convert to the format expected by the API
321
  from .api_models import MRZData as APIMRZData
322
+
323
  # Populate both canonical and legacy alias fields for compatibility
324
  return APIMRZData(
325
  document_type=format_type,
 
337
  raw_text=best_match, # legacy alias
338
  confidence=best_confidence,
339
  )
340
+
341
  logger.info("No MRZ data found in OCR text")
342
  return None
343
+
344
  @classmethod
345
  def _validate_mrz_format(cls, mrz_text: str) -> bool:
346
  """Validate basic MRZ format.
347
+
348
  Args:
349
  mrz_text: Raw MRZ text
350
+
351
  Returns:
352
  True if format is valid
353
  """
354
+ lines = mrz_text.strip().split("\n")
355
  if len(lines) < 2:
356
  return False
357
+
358
  # Normalize whitespace and validate character set only.
359
  normalized_lines = [re.sub(r"\s+", "", line) for line in lines]
360
  for line in normalized_lines:
361
+ if not re.match(r"^[A-Z0-9<]+$", line):
362
  return False
363
+
364
  return True
365
+
366
  @classmethod
367
  def _determine_mrz_format(cls, mrz_text: str) -> str:
368
  """Determine MRZ format type.
369
+
370
  Args:
371
  mrz_text: Raw MRZ text
372
+
373
  Returns:
374
  Format type (TD1, TD2, TD3, etc.)
375
  """
376
+ lines = mrz_text.strip().split("\n")
377
  lines = [re.sub(r"\s+", "", line) for line in lines]
378
  line_count = len(lines)
379
  line_length = len(lines[0]) if lines else 0
380
+
381
  # Heuristic mapping: prioritize semantics over exact lengths for robustness
382
  if line_count == 2 and lines[0].startswith("P<"):
383
  return "TD3" # Passport format commonly starts with P<
 
386
  if line_count == 3:
387
  return "TD1"
388
  return "UNKNOWN"
389
+
390
  @classmethod
391
  def _adjust_mrz_confidence(cls, mrz_text: str, base_confidence: float) -> float:
392
  """Adjust MRZ confidence based on quality indicators.
393
+
394
  Args:
395
  mrz_text: Raw MRZ text
396
  base_confidence: Base confidence from pattern matching
397
+
398
  Returns:
399
  Adjusted confidence
400
  """
401
  confidence = base_confidence
402
+
403
  # Check line consistency
404
+ lines = mrz_text.strip().split("\n")
405
  if len(set(len(line) for line in lines)) == 1:
406
  confidence *= 1.05 # Bonus for consistent line lengths
407
+
408
  return min(max(confidence, 0.0), 1.0)
409
+
410
  @classmethod
411
+ def _validate_mrz_checksums(
412
+ cls, mrz_text: str, format_type: str
413
+ ) -> Tuple[bool, List[str]]:
414
  """Validate MRZ checksums (simplified implementation).
415
+
416
  Args:
417
  mrz_text: Raw MRZ text
418
  format_type: MRZ format type
419
+
420
  Returns:
421
  Tuple of (is_valid, list_of_errors)
422
  """
423
  # This is a simplified implementation
424
  # In production, you would implement full MRZ checksum validation
425
  errors = []
426
+
427
  # Basic validation - check for reasonable character distribution
428
+ if mrz_text.count("<") > len(mrz_text) * 0.3:
429
  errors.append("Too many fill characters")
430
+
431
  # For now, assume valid if basic format is correct
432
  is_valid = len(errors) == 0
433
+
434
  return is_valid, errors
435
 
436
 
437
  # Backward compatibility - use enhanced extractor as default
438
  class FieldExtractor(EnhancedFieldExtractor):
439
  """Backward compatible field extractor using enhanced implementation."""
440
+
441
  pass
src/kybtech_dots_ocr/field_extraction.py CHANGED
@@ -11,100 +11,96 @@ from .api_models import ExtractedField, IdCardFields, MRZData
11
 
12
  class FieldExtractor:
13
  """Field extraction and mapping from OCR results."""
14
-
15
  # Field mapping patterns for Dutch ID cards
16
  FIELD_PATTERNS = {
17
  "document_number": [
18
  r"documentnummer[:\s]*([A-Z0-9]+)",
19
  r"document\s*number[:\s]*([A-Z0-9]+)",
20
- r"nr[:\s]*([A-Z0-9]+)"
21
  ],
22
  "surname": [
23
  r"achternaam[:\s]*([A-Z]+)",
24
  r"surname[:\s]*([A-Z]+)",
25
- r"family\s*name[:\s]*([A-Z]+)"
26
  ],
27
  "given_names": [
28
  r"voornamen[:\s]*([A-Z]+)",
29
  r"given\s*names[:\s]*([A-Z]+)",
30
- r"first\s*name[:\s]*([A-Z]+)"
31
  ],
32
  "nationality": [
33
  r"nationaliteit[:\s]*([A-Za-z]+)",
34
- r"nationality[:\s]*([A-Za-z]+)"
35
  ],
36
  "date_of_birth": [
37
  r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
38
  r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
39
- r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
40
- ],
41
- "gender": [
42
- r"geslacht[:\s]*([MF])",
43
- r"gender[:\s]*([MF])",
44
- r"sex[:\s]*([MF])"
45
  ],
 
46
  "place_of_birth": [
47
  r"geboorteplaats[:\s]*([A-Za-z\s]+)",
48
  r"place\s*of\s*birth[:\s]*([A-Za-z\s]+)",
49
- r"born\s*in[:\s]*([A-Za-z\s]+)"
50
  ],
51
  "date_of_issue": [
52
  r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
53
  r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
54
- r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
55
  ],
56
  "date_of_expiry": [
57
  r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
58
  r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
59
- r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
60
  ],
61
  "personal_number": [
62
  r"persoonsnummer[:\s]*(\d{9})",
63
  r"personal\s*number[:\s]*(\d{9})",
64
- r"bsn[:\s]*(\d{9})"
65
- ]
66
  }
67
-
68
  @classmethod
69
  def extract_fields(cls, ocr_text: str) -> IdCardFields:
70
  """Extract structured fields from OCR text.
71
-
72
  Args:
73
  ocr_text: Raw OCR text from document processing
74
-
75
  Returns:
76
  IdCardFields object with extracted field data
77
  """
78
  fields = {}
79
-
80
  for field_name, patterns in cls.FIELD_PATTERNS.items():
81
  value = None
82
  confidence = 0.0
83
-
84
  for pattern in patterns:
85
  match = re.search(pattern, ocr_text, re.IGNORECASE)
86
  if match:
87
  value = match.group(1).strip()
88
  confidence = 0.8 # Base confidence for pattern match
89
  break
90
-
91
  if value:
92
  fields[field_name] = ExtractedField(
93
  field_name=field_name,
94
  value=value,
95
  confidence=confidence,
96
- source="ocr"
97
  )
98
-
99
  return IdCardFields(**fields)
100
-
101
  @classmethod
102
  def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
103
  """Extract MRZ data from OCR text.
104
-
105
  Args:
106
  ocr_text: Raw OCR text from document processing
107
-
108
  Returns:
109
  MRZData object if MRZ detected, None otherwise
110
  """
@@ -113,9 +109,9 @@ class FieldExtractor:
113
  r"(P<[A-Z0-9<]+\n[A-Z0-9<]+)", # Generic passport format (try first)
114
  r"([A-Z0-9<]{30}\n[A-Z0-9<]{30})", # TD1 format
115
  r"([A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD2 format
116
- r"([A-Z0-9<]{44}\n[A-Z0-9<]{44}\n[A-Z0-9<]{44})" # TD3 format
117
  ]
118
-
119
  for pattern in mrz_patterns:
120
  match = re.search(pattern, ocr_text, re.MULTILINE)
121
  if match:
@@ -123,10 +119,10 @@ class FieldExtractor:
123
  # Basic MRZ parsing (simplified)
124
  return MRZData(
125
  raw_text=raw_mrz,
126
- format_type="TD3" if len(raw_mrz.split('\n')) == 3 else "TD2",
127
  is_valid=True, # Assume valid if present
128
  checksum_errors=[], # Not implemented in basic version
129
- confidence=0.9
130
  )
131
-
132
  return None
 
11
 
12
  class FieldExtractor:
13
  """Field extraction and mapping from OCR results."""
14
+
15
  # Field mapping patterns for Dutch ID cards
16
  FIELD_PATTERNS = {
17
  "document_number": [
18
  r"documentnummer[:\s]*([A-Z0-9]+)",
19
  r"document\s*number[:\s]*([A-Z0-9]+)",
20
+ r"nr[:\s]*([A-Z0-9]+)",
21
  ],
22
  "surname": [
23
  r"achternaam[:\s]*([A-Z]+)",
24
  r"surname[:\s]*([A-Z]+)",
25
+ r"family\s*name[:\s]*([A-Z]+)",
26
  ],
27
  "given_names": [
28
  r"voornamen[:\s]*([A-Z]+)",
29
  r"given\s*names[:\s]*([A-Z]+)",
30
+ r"first\s*name[:\s]*([A-Z]+)",
31
  ],
32
  "nationality": [
33
  r"nationaliteit[:\s]*([A-Za-z]+)",
34
+ r"nationality[:\s]*([A-Za-z]+)",
35
  ],
36
  "date_of_birth": [
37
  r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
38
  r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
39
+ r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
 
 
 
 
 
40
  ],
41
+ "gender": [r"geslacht[:\s]*([MF])", r"gender[:\s]*([MF])", r"sex[:\s]*([MF])"],
42
  "place_of_birth": [
43
  r"geboorteplaats[:\s]*([A-Za-z\s]+)",
44
  r"place\s*of\s*birth[:\s]*([A-Za-z\s]+)",
45
+ r"born\s*in[:\s]*([A-Za-z\s]+)",
46
  ],
47
  "date_of_issue": [
48
  r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
49
  r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
50
+ r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
51
  ],
52
  "date_of_expiry": [
53
  r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
54
  r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
55
+ r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
56
  ],
57
  "personal_number": [
58
  r"persoonsnummer[:\s]*(\d{9})",
59
  r"personal\s*number[:\s]*(\d{9})",
60
+ r"bsn[:\s]*(\d{9})",
61
+ ],
62
  }
63
+
64
  @classmethod
65
  def extract_fields(cls, ocr_text: str) -> IdCardFields:
66
  """Extract structured fields from OCR text.
67
+
68
  Args:
69
  ocr_text: Raw OCR text from document processing
70
+
71
  Returns:
72
  IdCardFields object with extracted field data
73
  """
74
  fields = {}
75
+
76
  for field_name, patterns in cls.FIELD_PATTERNS.items():
77
  value = None
78
  confidence = 0.0
79
+
80
  for pattern in patterns:
81
  match = re.search(pattern, ocr_text, re.IGNORECASE)
82
  if match:
83
  value = match.group(1).strip()
84
  confidence = 0.8 # Base confidence for pattern match
85
  break
86
+
87
  if value:
88
  fields[field_name] = ExtractedField(
89
  field_name=field_name,
90
  value=value,
91
  confidence=confidence,
92
+ source="ocr",
93
  )
94
+
95
  return IdCardFields(**fields)
96
+
97
  @classmethod
98
  def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
99
  """Extract MRZ data from OCR text.
100
+
101
  Args:
102
  ocr_text: Raw OCR text from document processing
103
+
104
  Returns:
105
  MRZData object if MRZ detected, None otherwise
106
  """
 
109
  r"(P<[A-Z0-9<]+\n[A-Z0-9<]+)", # Generic passport format (try first)
110
  r"([A-Z0-9<]{30}\n[A-Z0-9<]{30})", # TD1 format
111
  r"([A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD2 format
112
+ r"([A-Z0-9<]{44}\n[A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD3 format
113
  ]
114
+
115
  for pattern in mrz_patterns:
116
  match = re.search(pattern, ocr_text, re.MULTILINE)
117
  if match:
 
119
  # Basic MRZ parsing (simplified)
120
  return MRZData(
121
  raw_text=raw_mrz,
122
+ format_type="TD3" if len(raw_mrz.split("\n")) == 3 else "TD2",
123
  is_valid=True, # Assume valid if present
124
  checksum_errors=[], # Not implemented in basic version
125
+ confidence=0.9,
126
  )
127
+
128
  return None
src/kybtech_dots_ocr/models.py CHANGED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
10
 
11
  class ExtractedField(BaseModel):
12
  """Individual extracted field from identity document."""
 
13
  field_name: str = Field(..., description="Standardized field name")
14
  value: Optional[str] = Field(None, description="Extracted field value")
15
  confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence")
@@ -18,10 +19,19 @@ class ExtractedField(BaseModel):
18
 
19
  class IdCardFields(BaseModel):
20
  """Structured fields extracted from identity documents."""
21
- document_number: Optional[ExtractedField] = Field(None, description="Document number/ID")
22
- document_type: Optional[ExtractedField] = Field(None, description="Type of document")
23
- issuing_country: Optional[ExtractedField] = Field(None, description="Issuing country code")
24
- issuing_authority: Optional[ExtractedField] = Field(None, description="Issuing authority")
 
 
 
 
 
 
 
 
 
25
 
26
  # Personal Information
27
  surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
@@ -34,17 +44,30 @@ class IdCardFields(BaseModel):
34
  # Validity Information
35
  date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
36
  date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
37
- personal_number: Optional[ExtractedField] = Field(None, description="Personal number")
 
 
38
 
39
  # Additional fields for specific document types
40
- optional_data_1: Optional[ExtractedField] = Field(None, description="Optional data field 1")
41
- optional_data_2: Optional[ExtractedField] = Field(None, description="Optional data field 2")
 
 
 
 
42
 
43
 
44
  class MRZData(BaseModel):
45
  """Machine Readable Zone data extracted from identity documents."""
 
46
  raw_text: str = Field(..., description="Raw MRZ text as extracted")
47
- format_type: str = Field(..., description="MRZ format type (TD1, TD2, TD3, MRVA, MRVB)")
 
 
48
  is_valid: bool = Field(..., description="Whether MRZ checksums are valid")
49
- checksum_errors: List[str] = Field(default_factory=list, description="List of checksum validation errors")
50
- confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence score")
 
 
 
 
 
10
 
11
  class ExtractedField(BaseModel):
12
  """Individual extracted field from identity document."""
13
+
14
  field_name: str = Field(..., description="Standardized field name")
15
  value: Optional[str] = Field(None, description="Extracted field value")
16
  confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence")
 
19
 
20
  class IdCardFields(BaseModel):
21
  """Structured fields extracted from identity documents."""
22
+
23
+ document_number: Optional[ExtractedField] = Field(
24
+ None, description="Document number/ID"
25
+ )
26
+ document_type: Optional[ExtractedField] = Field(
27
+ None, description="Type of document"
28
+ )
29
+ issuing_country: Optional[ExtractedField] = Field(
30
+ None, description="Issuing country code"
31
+ )
32
+ issuing_authority: Optional[ExtractedField] = Field(
33
+ None, description="Issuing authority"
34
+ )
35
 
36
  # Personal Information
37
  surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
 
44
  # Validity Information
45
  date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
46
  date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
47
+ personal_number: Optional[ExtractedField] = Field(
48
+ None, description="Personal number"
49
+ )
50
 
51
  # Additional fields for specific document types
52
+ optional_data_1: Optional[ExtractedField] = Field(
53
+ None, description="Optional data field 1"
54
+ )
55
+ optional_data_2: Optional[ExtractedField] = Field(
56
+ None, description="Optional data field 2"
57
+ )
58
 
59
 
60
  class MRZData(BaseModel):
61
  """Machine Readable Zone data extracted from identity documents."""
62
+
63
  raw_text: str = Field(..., description="Raw MRZ text as extracted")
64
+ format_type: str = Field(
65
+ ..., description="MRZ format type (TD1, TD2, TD3, MRVA, MRVB)"
66
+ )
67
  is_valid: bool = Field(..., description="Whether MRZ checksums are valid")
68
+ checksum_errors: List[str] = Field(
69
+ default_factory=list, description="List of checksum validation errors"
70
+ )
71
+ confidence: float = Field(
72
+ ..., ge=0.0, le=1.0, description="Extraction confidence score"
73
+ )
src/kybtech_dots_ocr/preprocessing.py CHANGED
@@ -21,15 +21,19 @@ logger = logging.getLogger(__name__)
21
  # Environment variable configuration
22
  PDF_DPI = int(os.getenv("DOTS_OCR_PDF_DPI", "300"))
23
  PDF_MAX_PAGES = int(os.getenv("DOTS_OCR_PDF_MAX_PAGES", "10"))
24
- IMAGE_MAX_SIZE = int(os.getenv("DOTS_OCR_IMAGE_MAX_SIZE", "10")) * 1024 * 1024 # 10MB default
 
 
25
 
26
 
27
  class ImagePreprocessor:
28
  """Handles image preprocessing for Dots.OCR model."""
29
-
30
- def __init__(self, min_pixels: int = 3136, max_pixels: int = 11289600, divisor: int = 28):
 
 
31
  """Initialize the image preprocessor.
32
-
33
  Args:
34
  min_pixels: Minimum pixel count for images
35
  max_pixels: Maximum pixel count for images
@@ -38,29 +42,29 @@ class ImagePreprocessor:
38
  self.min_pixels = min_pixels
39
  self.max_pixels = max_pixels
40
  self.divisor = divisor
41
-
42
  def preprocess_image(self, image: Image.Image) -> Image.Image:
43
  """Preprocess an image to meet model requirements.
44
-
45
  Args:
46
  image: Input PIL Image
47
-
48
  Returns:
49
  Preprocessed PIL Image
50
  """
51
  # Convert to RGB if necessary
52
  if image.mode != "RGB":
53
  image = image.convert("RGB")
54
-
55
  # Auto-orient image based on EXIF data
56
  image = ImageOps.exif_transpose(image)
57
-
58
  # Calculate current pixel count
59
  width, height = image.size
60
  current_pixels = width * height
61
-
62
  logger.info(f"Original image size: {width}x{height} ({current_pixels} pixels)")
63
-
64
  # Resize if necessary to meet pixel requirements
65
  if current_pixels < self.min_pixels:
66
  # Scale up to meet minimum pixel requirement
@@ -69,7 +73,7 @@ class ImagePreprocessor:
69
  new_height = int(height * scale_factor)
70
  image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
71
  logger.info(f"Scaled up image to {new_width}x{new_height}")
72
-
73
  elif current_pixels > self.max_pixels:
74
  # Scale down to meet maximum pixel requirement
75
  scale_factor = (self.max_pixels / current_pixels) ** 0.5
@@ -77,69 +81,73 @@ class ImagePreprocessor:
77
  new_height = int(height * scale_factor)
78
  image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
79
  logger.info(f"Scaled down image to {new_width}x{new_height}")
80
-
81
  # Ensure dimensions are divisible by the required divisor
82
  width, height = image.size
83
  new_width = ((width + self.divisor - 1) // self.divisor) * self.divisor
84
  new_height = ((height + self.divisor - 1) // self.divisor) * self.divisor
85
-
86
  if new_width != width or new_height != height:
87
  image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
88
- logger.info(f"Adjusted dimensions to be divisible by {self.divisor}: {new_width}x{new_height}")
89
-
 
 
90
  return image
91
-
92
- def crop_by_roi(self, image: Image.Image, roi: Tuple[float, float, float, float]) -> Image.Image:
 
 
93
  """Crop image using ROI coordinates.
94
-
95
  Args:
96
  image: Input PIL Image
97
  roi: ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
98
-
99
  Returns:
100
  Cropped PIL Image
101
  """
102
  x1, y1, x2, y2 = roi
103
  width, height = image.size
104
-
105
  # Convert normalized coordinates to pixel coordinates
106
  x1_px = int(x1 * width)
107
  y1_px = int(y1 * height)
108
  x2_px = int(x2 * width)
109
  y2_px = int(y2 * height)
110
-
111
  # Ensure coordinates are within image bounds
112
  x1_px = max(0, min(x1_px, width))
113
  y1_px = max(0, min(y1_px, height))
114
  x2_px = max(x1_px, min(x2_px, width))
115
  y2_px = max(y1_px, min(y2_px, height))
116
-
117
  # Crop the image
118
  cropped = image.crop((x1_px, y1_px, x2_px, y2_px))
119
  logger.info(f"Cropped image to {x2_px - x1_px}x{y2_px - y1_px} pixels")
120
-
121
  return cropped
122
 
123
 
124
  class PDFProcessor:
125
  """Handles PDF to image conversion and multi-page processing."""
126
-
127
  def __init__(self, dpi: int = PDF_DPI, max_pages: int = PDF_MAX_PAGES):
128
  """Initialize the PDF processor.
129
-
130
  Args:
131
  dpi: DPI for PDF to image conversion
132
  max_pages: Maximum number of pages to process
133
  """
134
  self.dpi = dpi
135
  self.max_pages = max_pages
136
-
137
  def pdf_to_images(self, pdf_data: bytes) -> List[Image.Image]:
138
  """Convert PDF to list of images.
139
-
140
  Args:
141
  pdf_data: PDF file data as bytes
142
-
143
  Returns:
144
  List of PIL Images, one per page
145
  """
@@ -147,49 +155,49 @@ class PDFProcessor:
147
  # Open PDF from bytes
148
  pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
149
  images = []
150
-
151
  # Limit number of pages to process
152
  num_pages = min(len(pdf_document), self.max_pages)
153
  logger.info(f"Processing {num_pages} pages from PDF")
154
-
155
  for page_num in range(num_pages):
156
  page = pdf_document[page_num]
157
-
158
  # Convert page to image
159
  mat = fitz.Matrix(self.dpi / 72, self.dpi / 72) # 72 is default DPI
160
  pix = page.get_pixmap(matrix=mat)
161
-
162
  # Convert to PIL Image
163
  img_data = pix.tobytes("png")
164
  image = Image.open(io.BytesIO(img_data))
165
  images.append(image)
166
-
167
  logger.info(f"Converted page {page_num + 1} to image: {image.size}")
168
-
169
  pdf_document.close()
170
  return images
171
-
172
  except Exception as e:
173
  logger.error(f"Failed to convert PDF to images: {e}")
174
  raise RuntimeError(f"PDF conversion failed: {e}")
175
-
176
  def is_pdf(self, file_data: bytes) -> bool:
177
  """Check if file data is a PDF.
178
-
179
  Args:
180
  file_data: File data as bytes
181
-
182
  Returns:
183
  True if file is a PDF
184
  """
185
- return file_data.startswith(b'%PDF-')
186
-
187
  def get_pdf_page_count(self, pdf_data: bytes) -> int:
188
  """Get the number of pages in a PDF.
189
-
190
  Args:
191
  pdf_data: PDF file data as bytes
192
-
193
  Returns:
194
  Number of pages in the PDF
195
  """
@@ -205,23 +213,21 @@ class PDFProcessor:
205
 
206
  class DocumentProcessor:
207
  """Main document processing class that handles both images and PDFs."""
208
-
209
  def __init__(self):
210
  """Initialize the document processor."""
211
  self.image_preprocessor = ImagePreprocessor()
212
  self.pdf_processor = PDFProcessor()
213
-
214
  def process_document(
215
- self,
216
- file_data: bytes,
217
- roi: Optional[Tuple[float, float, float, float]] = None
218
  ) -> List[Image.Image]:
219
  """Process a document (image or PDF) and return preprocessed images.
220
-
221
  Args:
222
  file_data: Document file data as bytes
223
  roi: Optional ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
224
-
225
  Returns:
226
  List of preprocessed PIL Images
227
  """
@@ -238,7 +244,7 @@ class DocumentProcessor:
238
  except Exception as e:
239
  logger.error(f"Failed to open image: {e}")
240
  raise RuntimeError(f"Image processing failed: {e}")
241
-
242
  # Preprocess each image
243
  processed_images = []
244
  for i, image in enumerate(images):
@@ -246,30 +252,30 @@ class DocumentProcessor:
246
  # Apply ROI cropping if provided
247
  if roi is not None:
248
  image = self.image_preprocessor.crop_by_roi(image, roi)
249
-
250
  # Preprocess image for model requirements
251
  processed_image = self.image_preprocessor.preprocess_image(image)
252
  processed_images.append(processed_image)
253
-
254
  logger.info(f"Processed image {i + 1}: {processed_image.size}")
255
-
256
  except Exception as e:
257
  logger.error(f"Failed to preprocess image {i + 1}: {e}")
258
  # Continue with other images even if one fails
259
  continue
260
-
261
  if not processed_images:
262
  raise RuntimeError("No images could be processed from the document")
263
-
264
  logger.info(f"Successfully processed {len(processed_images)} images")
265
  return processed_images
266
-
267
  def validate_file_size(self, file_data: bytes) -> bool:
268
  """Validate that file size is within limits.
269
-
270
  Args:
271
  file_data: File data as bytes
272
-
273
  Returns:
274
  True if file size is acceptable
275
  """
@@ -278,25 +284,25 @@ class DocumentProcessor:
278
  logger.warning(f"File size {file_size} exceeds limit {IMAGE_MAX_SIZE}")
279
  return False
280
  return True
281
-
282
  def get_document_info(self, file_data: bytes) -> dict:
283
  """Get information about the document.
284
-
285
  Args:
286
  file_data: Document file data as bytes
287
-
288
  Returns:
289
  Dictionary with document information
290
  """
291
  info = {
292
  "file_size": len(file_data),
293
  "is_pdf": self.pdf_processor.is_pdf(file_data),
294
- "page_count": 1
295
  }
296
-
297
  if info["is_pdf"]:
298
  info["page_count"] = self.pdf_processor.get_pdf_page_count(file_data)
299
-
300
  return info
301
 
302
 
@@ -313,8 +319,7 @@ def get_document_processor() -> DocumentProcessor:
313
 
314
 
315
  def process_document(
316
- file_data: bytes,
317
- roi: Optional[Tuple[float, float, float, float]] = None
318
  ) -> List[Image.Image]:
319
  """Process a document and return preprocessed images."""
320
  processor = get_document_processor()
 
21
  # Environment variable configuration
22
  PDF_DPI = int(os.getenv("DOTS_OCR_PDF_DPI", "300"))
23
  PDF_MAX_PAGES = int(os.getenv("DOTS_OCR_PDF_MAX_PAGES", "10"))
24
+ IMAGE_MAX_SIZE = (
25
+ int(os.getenv("DOTS_OCR_IMAGE_MAX_SIZE", "10")) * 1024 * 1024
26
+ ) # 10MB default
27
 
28
 
29
  class ImagePreprocessor:
30
  """Handles image preprocessing for Dots.OCR model."""
31
+
32
+ def __init__(
33
+ self, min_pixels: int = 3136, max_pixels: int = 11289600, divisor: int = 28
34
+ ):
35
  """Initialize the image preprocessor.
36
+
37
  Args:
38
  min_pixels: Minimum pixel count for images
39
  max_pixels: Maximum pixel count for images
 
42
  self.min_pixels = min_pixels
43
  self.max_pixels = max_pixels
44
  self.divisor = divisor
45
+
46
  def preprocess_image(self, image: Image.Image) -> Image.Image:
47
  """Preprocess an image to meet model requirements.
48
+
49
  Args:
50
  image: Input PIL Image
51
+
52
  Returns:
53
  Preprocessed PIL Image
54
  """
55
  # Convert to RGB if necessary
56
  if image.mode != "RGB":
57
  image = image.convert("RGB")
58
+
59
  # Auto-orient image based on EXIF data
60
  image = ImageOps.exif_transpose(image)
61
+
62
  # Calculate current pixel count
63
  width, height = image.size
64
  current_pixels = width * height
65
+
66
  logger.info(f"Original image size: {width}x{height} ({current_pixels} pixels)")
67
+
68
  # Resize if necessary to meet pixel requirements
69
  if current_pixels < self.min_pixels:
70
  # Scale up to meet minimum pixel requirement
 
73
  new_height = int(height * scale_factor)
74
  image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
75
  logger.info(f"Scaled up image to {new_width}x{new_height}")
76
+
77
  elif current_pixels > self.max_pixels:
78
  # Scale down to meet maximum pixel requirement
79
  scale_factor = (self.max_pixels / current_pixels) ** 0.5
 
81
  new_height = int(height * scale_factor)
82
  image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
83
  logger.info(f"Scaled down image to {new_width}x{new_height}")
84
+
85
  # Ensure dimensions are divisible by the required divisor
86
  width, height = image.size
87
  new_width = ((width + self.divisor - 1) // self.divisor) * self.divisor
88
  new_height = ((height + self.divisor - 1) // self.divisor) * self.divisor
89
+
90
  if new_width != width or new_height != height:
91
  image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
92
+ logger.info(
93
+ f"Adjusted dimensions to be divisible by {self.divisor}: {new_width}x{new_height}"
94
+ )
95
+
96
  return image
97
+
98
+ def crop_by_roi(
99
+ self, image: Image.Image, roi: Tuple[float, float, float, float]
100
+ ) -> Image.Image:
101
  """Crop image using ROI coordinates.
102
+
103
  Args:
104
  image: Input PIL Image
105
  roi: ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
106
+
107
  Returns:
108
  Cropped PIL Image
109
  """
110
  x1, y1, x2, y2 = roi
111
  width, height = image.size
112
+
113
  # Convert normalized coordinates to pixel coordinates
114
  x1_px = int(x1 * width)
115
  y1_px = int(y1 * height)
116
  x2_px = int(x2 * width)
117
  y2_px = int(y2 * height)
118
+
119
  # Ensure coordinates are within image bounds
120
  x1_px = max(0, min(x1_px, width))
121
  y1_px = max(0, min(y1_px, height))
122
  x2_px = max(x1_px, min(x2_px, width))
123
  y2_px = max(y1_px, min(y2_px, height))
124
+
125
  # Crop the image
126
  cropped = image.crop((x1_px, y1_px, x2_px, y2_px))
127
  logger.info(f"Cropped image to {x2_px - x1_px}x{y2_px - y1_px} pixels")
128
+
129
  return cropped
130
 
131
 
132
  class PDFProcessor:
133
  """Handles PDF to image conversion and multi-page processing."""
134
+
135
  def __init__(self, dpi: int = PDF_DPI, max_pages: int = PDF_MAX_PAGES):
136
  """Initialize the PDF processor.
137
+
138
  Args:
139
  dpi: DPI for PDF to image conversion
140
  max_pages: Maximum number of pages to process
141
  """
142
  self.dpi = dpi
143
  self.max_pages = max_pages
144
+
145
  def pdf_to_images(self, pdf_data: bytes) -> List[Image.Image]:
146
  """Convert PDF to list of images.
147
+
148
  Args:
149
  pdf_data: PDF file data as bytes
150
+
151
  Returns:
152
  List of PIL Images, one per page
153
  """
 
155
  # Open PDF from bytes
156
  pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
157
  images = []
158
+
159
  # Limit number of pages to process
160
  num_pages = min(len(pdf_document), self.max_pages)
161
  logger.info(f"Processing {num_pages} pages from PDF")
162
+
163
  for page_num in range(num_pages):
164
  page = pdf_document[page_num]
165
+
166
  # Convert page to image
167
  mat = fitz.Matrix(self.dpi / 72, self.dpi / 72) # 72 is default DPI
168
  pix = page.get_pixmap(matrix=mat)
169
+
170
  # Convert to PIL Image
171
  img_data = pix.tobytes("png")
172
  image = Image.open(io.BytesIO(img_data))
173
  images.append(image)
174
+
175
  logger.info(f"Converted page {page_num + 1} to image: {image.size}")
176
+
177
  pdf_document.close()
178
  return images
179
+
180
  except Exception as e:
181
  logger.error(f"Failed to convert PDF to images: {e}")
182
  raise RuntimeError(f"PDF conversion failed: {e}")
183
+
184
  def is_pdf(self, file_data: bytes) -> bool:
185
  """Check if file data is a PDF.
186
+
187
  Args:
188
  file_data: File data as bytes
189
+
190
  Returns:
191
  True if file is a PDF
192
  """
193
+ return file_data.startswith(b"%PDF-")
194
+
195
  def get_pdf_page_count(self, pdf_data: bytes) -> int:
196
  """Get the number of pages in a PDF.
197
+
198
  Args:
199
  pdf_data: PDF file data as bytes
200
+
201
  Returns:
202
  Number of pages in the PDF
203
  """
 
213
 
214
  class DocumentProcessor:
215
  """Main document processing class that handles both images and PDFs."""
216
+
217
  def __init__(self):
218
  """Initialize the document processor."""
219
  self.image_preprocessor = ImagePreprocessor()
220
  self.pdf_processor = PDFProcessor()
221
+
222
  def process_document(
223
+ self, file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
 
 
224
  ) -> List[Image.Image]:
225
  """Process a document (image or PDF) and return preprocessed images.
226
+
227
  Args:
228
  file_data: Document file data as bytes
229
  roi: Optional ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
230
+
231
  Returns:
232
  List of preprocessed PIL Images
233
  """
 
244
  except Exception as e:
245
  logger.error(f"Failed to open image: {e}")
246
  raise RuntimeError(f"Image processing failed: {e}")
247
+
248
  # Preprocess each image
249
  processed_images = []
250
  for i, image in enumerate(images):
 
252
  # Apply ROI cropping if provided
253
  if roi is not None:
254
  image = self.image_preprocessor.crop_by_roi(image, roi)
255
+
256
  # Preprocess image for model requirements
257
  processed_image = self.image_preprocessor.preprocess_image(image)
258
  processed_images.append(processed_image)
259
+
260
  logger.info(f"Processed image {i + 1}: {processed_image.size}")
261
+
262
  except Exception as e:
263
  logger.error(f"Failed to preprocess image {i + 1}: {e}")
264
  # Continue with other images even if one fails
265
  continue
266
+
267
  if not processed_images:
268
  raise RuntimeError("No images could be processed from the document")
269
+
270
  logger.info(f"Successfully processed {len(processed_images)} images")
271
  return processed_images
272
+
273
  def validate_file_size(self, file_data: bytes) -> bool:
274
  """Validate that file size is within limits.
275
+
276
  Args:
277
  file_data: File data as bytes
278
+
279
  Returns:
280
  True if file size is acceptable
281
  """
 
284
  logger.warning(f"File size {file_size} exceeds limit {IMAGE_MAX_SIZE}")
285
  return False
286
  return True
287
+
288
  def get_document_info(self, file_data: bytes) -> dict:
289
  """Get information about the document.
290
+
291
  Args:
292
  file_data: Document file data as bytes
293
+
294
  Returns:
295
  Dictionary with document information
296
  """
297
  info = {
298
  "file_size": len(file_data),
299
  "is_pdf": self.pdf_processor.is_pdf(file_data),
300
+ "page_count": 1,
301
  }
302
+
303
  if info["is_pdf"]:
304
  info["page_count"] = self.pdf_processor.get_pdf_page_count(file_data)
305
+
306
  return info
307
 
308
 
 
319
 
320
 
321
  def process_document(
322
+ file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
 
323
  ) -> List[Image.Image]:
324
  """Process a document and return preprocessed images."""
325
  processor = get_document_processor()
src/kybtech_dots_ocr/response_builder.py CHANGED
@@ -2,9 +2,13 @@
2
 
3
  This module handles the construction and validation of OCR API responses
4
  according to the specified schema with proper error handling and metadata.
 
 
 
5
  """
6
 
7
  import logging
 
8
  import time
9
  from typing import List, Optional, Dict, Any
10
  from datetime import datetime
@@ -29,7 +33,8 @@ class OCRResponseBuilder:
29
  media_type: str,
30
  processing_time: float,
31
  ocr_texts: List[str],
32
- page_metadata: Optional[List[Dict[str, Any]]] = None
 
33
  ) -> OCRResponse:
34
  """Build a complete OCR response from extracted texts.
35
 
@@ -39,6 +44,7 @@ class OCRResponseBuilder:
39
  processing_time: Total processing time in seconds
40
  ocr_texts: List of OCR text results (one per page)
41
  page_metadata: Optional metadata for each page
 
42
 
43
  Returns:
44
  Complete OCRResponse object
@@ -46,6 +52,8 @@ class OCRResponseBuilder:
46
  logger.info(f"Building response for {len(ocr_texts)} pages")
47
 
48
  detections = []
 
 
49
 
50
  for i, ocr_text in enumerate(ocr_texts):
51
  try:
@@ -53,6 +61,40 @@ class OCRResponseBuilder:
53
  extracted_fields = self.field_extractor.extract_fields(ocr_text)
54
  mrz_data = self.field_extractor.extract_mrz(ocr_text)
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # Create detection for this page
57
  detection = self._create_detection(extracted_fields, mrz_data, i, page_metadata)
58
  detections.append(detection)
@@ -304,11 +346,19 @@ def build_ocr_response(
304
  media_type: str,
305
  processing_time: float,
306
  ocr_texts: List[str],
307
- page_metadata: Optional[List[Dict[str, Any]]] = None
 
308
  ) -> OCRResponse:
309
  """Build a complete OCR response from extracted texts."""
310
  builder = get_response_builder()
311
- return builder.build_response(request_id, media_type, processing_time, ocr_texts, page_metadata)
 
 
 
 
 
 
 
312
 
313
 
314
  def build_error_response(
 
2
 
3
  This module handles the construction and validation of OCR API responses
4
  according to the specified schema with proper error handling and metadata.
5
+
6
+ Debug-mode logging is supported to surface detailed information about
7
+ extraction results when troubleshooting in environments like Hugging Face.
8
  """
9
 
10
  import logging
11
+ import os
12
  import time
13
  from typing import List, Optional, Dict, Any
14
  from datetime import datetime
 
33
  media_type: str,
34
  processing_time: float,
35
  ocr_texts: List[str],
36
+ page_metadata: Optional[List[Dict[str, Any]]] = None,
37
+ debug: bool = False,
38
  ) -> OCRResponse:
39
  """Build a complete OCR response from extracted texts.
40
 
 
44
  processing_time: Total processing time in seconds
45
  ocr_texts: List of OCR text results (one per page)
46
  page_metadata: Optional metadata for each page
47
+ debug: When True, emit detailed logs about OCR text and mapping
48
 
49
  Returns:
50
  Complete OCRResponse object
 
52
  logger.info(f"Building response for {len(ocr_texts)} pages")
53
 
54
  detections = []
55
+ # Allow configuring the OCR text snippet length via env var. Defaults to 1200.
56
+ debug_snippet_len = int(os.getenv("DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN", "1200"))
57
 
58
  for i, ocr_text in enumerate(ocr_texts):
59
  try:
 
61
  extracted_fields = self.field_extractor.extract_fields(ocr_text)
62
  mrz_data = self.field_extractor.extract_mrz(ocr_text)
63
 
64
+ # In debug mode, log OCR text snippet and extracted mapping details.
65
+ if debug:
66
+ # Log a bounded snippet of the OCR text to avoid overwhelming logs
67
+ snippet = ocr_text[:debug_snippet_len]
68
+ if len(ocr_text) > debug_snippet_len:
69
+ snippet += "\n...[truncated]"
70
+ logger.info(
71
+ f"[debug] Page {i + 1}: OCR text snippet (len={len(ocr_text)}):\n{snippet}"
72
+ )
73
+
74
+ # Prepare a compact dict of non-null extracted fields
75
+ non_null_fields: Dict[str, Any] = {}
76
+ for fname, fval in extracted_fields.__dict__.items():
77
+ if fval is not None:
78
+ non_null_fields[fname] = {
79
+ "value": fval.value,
80
+ "confidence": fval.confidence,
81
+ "source": fval.source,
82
+ }
83
+ logger.info(
84
+ f"[debug] Page {i + 1}: Extracted fields (non-null): {non_null_fields}"
85
+ )
86
+
87
+ if mrz_data is not None:
88
+ # Support both canonical and legacy attribute names
89
+ raw_mrz = getattr(mrz_data, "raw_mrz", None) or getattr(mrz_data, "raw_text", None)
90
+ logger.info(
91
+ f"[debug] Page {i + 1}: MRZ detected — type={getattr(mrz_data, 'document_type', None) or getattr(mrz_data, 'format_type', None)}, confidence={mrz_data.confidence:.2f}"
92
+ )
93
+ if raw_mrz:
94
+ logger.info(f"[debug] Page {i + 1}: MRZ raw text:\n{raw_mrz}")
95
+ else:
96
+ logger.info(f"[debug] Page {i + 1}: No MRZ detected")
97
+
98
  # Create detection for this page
99
  detection = self._create_detection(extracted_fields, mrz_data, i, page_metadata)
100
  detections.append(detection)
 
346
  media_type: str,
347
  processing_time: float,
348
  ocr_texts: List[str],
349
+ page_metadata: Optional[List[Dict[str, Any]]] = None,
350
+ debug: bool = False,
351
  ) -> OCRResponse:
352
  """Build a complete OCR response from extracted texts."""
353
  builder = get_response_builder()
354
+ return builder.build_response(
355
+ request_id=request_id,
356
+ media_type=media_type,
357
+ processing_time=processing_time,
358
+ ocr_texts=ocr_texts,
359
+ page_metadata=page_metadata,
360
+ debug=debug,
361
+ )
362
 
363
 
364
  def build_error_response(