remove-single-vector-projection #18
by jupyterjazz · opened
adapters/adapter_config.json CHANGED

```diff
@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "jinaai/
+  "base_model_name_or_path": "jinaai/jina-embeddings-v4",
   "bias": "none",
   "corda_config": null,
   "eva_config": null,
```
    	
adapters/adapter_model.safetensors CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b6b7ab4a79daa3b4f3b5274500cc99d3dc89aa8c3419e9d79f89e366685e12e5
+size 359863776
```
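This commit touches three Git LFS pointer files (the adapter weights above and the two model shards below); the repository stores only each pointer's version line, sha256 oid, and byte size, while the weights themselves live in LFS. As a minimal sketch, a fetched blob can be checked against those two fields like this (the helper name and path are illustrative, not part of the commit):

```python
import hashlib

def verify_lfs_blob(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against the oid/size recorded in its LFS pointer."""
    digest = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

# For the updated adapter weights (assumes the blob was fetched, e.g. via `git lfs pull`):
verify_lfs_blob(
    "adapters/adapter_model.safetensors",
    "b6b7ab4a79daa3b4f3b5274500cc99d3dc89aa8c3419e9d79f89e366685e12e5",
    359863776,
)
```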
    	
config.json CHANGED

```diff
@@ -33,7 +33,6 @@
   },
   "rope_theta": 1000000.0,
   "single_vector_pool_strategy": "mean",
-  "single_vector_projector_dim": 1024,
   "sliding_window": 32768,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
```
    	
configuration_jina_embeddings_v4.py CHANGED

```diff
@@ -9,14 +9,12 @@ class JinaEmbeddingsV4Config(Qwen2_5_VLConfig):
 
     def __init__(
         self,
-        single_vector_projector_dim: int = 1024,
         single_vector_pool_strategy: str = "mean",
         multi_vector_projector_dim: int = 128,
         pretrained_peft_model_name_or_path: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.single_vector_projector_dim = single_vector_projector_dim
         self.single_vector_pool_strategy = single_vector_pool_strategy
         self.multi_vector_projector_dim = multi_vector_projector_dim
         self.pretrained_peft_model_name_or_path = pretrained_peft_model_name_or_path
```
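After this change, `single_vector_projector_dim` is no longer a named constructor argument or a stored attribute; a stale config that still carries the key only passes through `**kwargs`. A self-contained sketch of the resulting constructor surface (the real base class is `Qwen2_5_VLConfig` from transformers; a stub stands in here so the snippet runs on its own):

```python
from typing import Optional

class _ConfigStub:  # stand-in for Qwen2_5_VLConfig, for illustration only
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

class JinaEmbeddingsV4Config(_ConfigStub):
    def __init__(
        self,
        single_vector_pool_strategy: str = "mean",
        multi_vector_projector_dim: int = 128,
        pretrained_peft_model_name_or_path: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.single_vector_pool_strategy = single_vector_pool_strategy
        self.multi_vector_projector_dim = multi_vector_projector_dim
        self.pretrained_peft_model_name_or_path = pretrained_peft_model_name_or_path

# The removed key is now absorbed by **kwargs rather than handled explicitly:
cfg = JinaEmbeddingsV4Config(single_vector_projector_dim=1024)
assert cfg.multi_vector_projector_dim == 128
```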
    	
model-00001-of-00002.safetensors CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:abb244162956ec2f26d944b6c10cbb96afe211d2aff908b8b2f498ec27a9100b
+size 4997750728
```
    	
model-00002-of-00002.safetensors CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5d5252a7ede6469220b0e7386af53fea9a45fa299a1d2af6fe68cb29897de3e3
+size 2512111904
```
    	
model.safetensors.index.json CHANGED

```diff
@@ -439,8 +439,6 @@
     "model.norm.weight": "model-00002-of-00002.safetensors",
     "multi_vector_projector.bias": "model-00002-of-00002.safetensors",
     "multi_vector_projector.weight": "model-00002-of-00002.safetensors",
-    "single_vector_projector.bias": "model-00002-of-00002.safetensors",
-    "single_vector_projector.weight": "model-00002-of-00002.safetensors",
     "visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
     "visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
     "visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
```
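Because `weight_map` must list exactly the tensors stored in the shards, a quick sanity check after a change like this (the script is illustrative, not part of the repo) is to confirm that no `single_vector_projector` entries survive:

```python
import json

# Illustrative check: the index should no longer reference the removed tensors.
with open("model.safetensors.index.json") as f:
    weight_map = json.load(f)["weight_map"]

stale = [name for name in weight_map if name.startswith("single_vector_projector")]
assert not stale, f"stale entries left in weight_map: {stale}"
```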
    	
modeling_jina_embeddings_v4.py CHANGED

```diff
@@ -141,12 +141,11 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
 
     def __init__(self, config: JinaEmbeddingsV4Config):
         Qwen2_5_VLForConditionalGeneration.__init__(self, config)
-        self.
+        self._init_projection_layer(config)
         self.post_init()
         self.processor = JinaEmbeddingsV4Processor.from_pretrained(
             self.name_or_path, trust_remote_code=True, use_fast=True
         )
-        self.single_vector_projector_dim = config.single_vector_projector_dim
         self.multi_vector_projector_dim = config.multi_vector_projector_dim
         self._task = None
 
@@ -204,32 +203,25 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
 
         return hidden_states[-1]
 
-    def
+    def _init_projection_layer(self, config) -> None:
         """
         Initializes projection layers.
         """
-        self.config.single_vector_projector_dim = config.single_vector_projector_dim
         self.config.multi_vector_projector_dim = config.multi_vector_projector_dim
 
-        self.single_vector_projector = nn.Linear(
-            in_features=self.config.text_config.hidden_size,
-            out_features=self.config.single_vector_projector_dim,
-        )
-
         self.multi_vector_projector = nn.Linear(
             in_features=self.config.text_config.hidden_size,
             out_features=self.config.multi_vector_projector_dim,
         )
 
-    def
+    def get_single_vector_embeddings(
         self,
-        task_label: Union[str, List[str]],
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
         input_ids: Optional[torch.LongTensor] = None,
     ) -> torch.Tensor:
         """
-
+        Get the single-vector embeddings from the hidden states.
         """
         if self._input_has_image(input_ids[0]):  # got document image
             img_start_positions = torch.where(
@@ -257,12 +249,9 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
             hidden_states * attention_mask.unsqueeze(-1), dim=1
         ) / torch.sum(attention_mask, dim=1, keepdim=True)
 
-
-            pooled_output, task_label=task_label
-        )
-        return torch.nn.functional.normalize(single_vec_emb, dim=-1)
+        return torch.nn.functional.normalize(pooled_output, dim=-1)
 
-    def
+    def get_multi_vector_embeddings(
         self,
         task_label: Union[str, List[str]],
         hidden_states: torch.Tensor,
@@ -306,13 +295,12 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
             **kwargs,
         )  # (batch_size, seq_length, hidden_size)
         # Compute the embeddings
-        single_vec_emb = self.
+        single_vec_emb = self.get_single_vector_embeddings(
            hidden_states=hidden_states,
             attention_mask=attention_mask,
             input_ids=input_ids,
-            task_label=task_label,
         )
-        multi_vec_emb = self.
+        multi_vec_emb = self.get_multi_vector_embeddings(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             task_label=task_label,
```
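With the projection layer removed, the single-vector path in `get_single_vector_embeddings` reduces to mask-aware mean pooling followed by L2 normalization, as the hunks above show. A standalone sketch of just that computation, with illustrative shapes:

```python
import torch

# Mask-aware mean pooling + L2 normalization, mirroring the new
# single-vector path. Batch/sequence/hidden sizes are illustrative.
hidden_states = torch.randn(2, 7, 2048)   # (batch_size, seq_length, hidden_size)
attention_mask = torch.ones(2, 7)         # (batch_size, seq_length)

pooled_output = torch.sum(
    hidden_states * attention_mask.unsqueeze(-1), dim=1
) / torch.sum(attention_mask, dim=1, keepdim=True)
single_vec_emb = torch.nn.functional.normalize(pooled_output, dim=-1)

print(single_vec_emb.shape)  # torch.Size([2, 2048])
```

Note that the single-vector output dimensionality now equals the model's hidden size rather than the fixed 1024-dimensional projector output.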