from typing import Any, Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2 import Qwen2Config
from transformers import Qwen2_5_VLProcessor, AutoProcessor
from transformers.models.auto.configuration_auto import CONFIG_MAPPING


class DotsVisionConfig(PretrainedConfig):
    """Configuration for the DotsOCR vision tower."""

    model_type: str = "dots_vit"

    def __init__(
        self,
        embed_dim: int = 1536,
        hidden_size: int = 1536,
        intermediate_size: int = 4224,
        num_hidden_layers: int = 42,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 1,
        rms_norm_eps: float = 1e-5,
        use_bias: bool = False,
        attn_implementation="flash_attention_2",
        initializer_range=0.02,
        init_merger_std=0.02,
        is_causal=False,
        post_norm=True,
        gradient_checkpointing=False,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.rms_norm_eps = rms_norm_eps
        self.use_bias = use_bias
        self.attn_implementation = attn_implementation
        self.initializer_range = initializer_range
        self.init_merger_std = init_merger_std
        self.is_causal = is_causal
        self.post_norm = post_norm
        self.gradient_checkpointing = gradient_checkpointing


class DotsOCRConfig(Qwen2Config):
    """Qwen2 text config extended with image/video token ids and a nested vision config."""

    model_type = "dots_ocr"

    def __init__(
        self,
        image_token_id=151665,
        video_token_id=151656,
        vision_config: Optional[dict] = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_config = DotsVisionConfig(**(vision_config or {}))

    def save_pretrained(self, save_directory, **kwargs):
        # Clear the custom auto class so no auto_map entry is written into the saved config.
        self._auto_class = None
        super().save_pretrained(save_directory, **kwargs)


class DotsVLProcessor(Qwen2_5_VLProcessor):
    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
        # Fall back to the DotsOCR image pad token and id when the tokenizer does not define them.
        self.image_token = "<|imgpad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.image_token_id = 151665 if not hasattr(tokenizer, "image_token_id") else tokenizer.image_token_id


# Expose the processor and config through the Auto* machinery under the "dots_ocr" model type.
AutoProcessor.register("dots_ocr", DotsVLProcessor)
CONFIG_MAPPING.register("dots_ocr", DotsOCRConfig)
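

# Usage sketch (illustrative only, not part of the module's API): construct a
# DotsOCRConfig with a nested vision_config override and read back the derived
# DotsVisionConfig. The override values are arbitrary examples of the
# parameters defined above.
if __name__ == "__main__":
    cfg = DotsOCRConfig(vision_config={"num_hidden_layers": 42, "patch_size": 14})
    print(cfg.model_type)                # "dots_ocr"
    print(cfg.vision_config.model_type)  # "dots_vit"
    print(cfg.vision_config.patch_size)  # 14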