Upload folder using huggingface_hub
- config.json +1 -1
- configuration_powercoder.py +0 -4
- modeling_powercoder.py +23 -10
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "model_type": "powercoder",
   "architectures": [
     "PowerCoderForCausalLM"
   ],
@@ -17,7 +18,6 @@
   "initializer_range": 0.018042,
   "intermediate_size": 12288,
   "max_position_embeddings": 4096,
-  "model_type": "powercoder",
   "norm_epsilon": 1e-05,
   "num_attention_heads": 24,
   "num_hidden_layers": 30,
configuration_powercoder.py CHANGED
@@ -156,8 +156,6 @@ class PowerCoderConfig(PretrainedConfig):
         residual_dropout=0.0,
         embedding_dropout=0.0,
         use_bias=True,
-        chunk_size=None,
-        switch_over_seq_len=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -178,8 +176,6 @@ class PowerCoderConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.residual_dropout = residual_dropout
         self.embedding_dropout = embedding_dropout
-        self.chunk_size = chunk_size
-        self.switch_over_seq_len = switch_over_seq_len
         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, move it to 'rope_type'.
         if self.rope_scaling is not None and "type" in self.rope_scaling:
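With this change, `chunk_size` and `switch_over_seq_len` are no longer `PowerCoderConfig` fields; they are supplied at call time instead (see the modeling diff below). A minimal sketch of what that means on the config side, assuming a placeholder repo id and that the saved config.json carries neither key:

# Minimal sketch; "org/powercoder" is a placeholder repo id, not from this commit.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("org/powercoder", trust_remote_code=True)

# After this commit the config no longer defines these attributes;
# the values are passed to forward() instead (see modeling_powercoder.py below).
print(getattr(config, "chunk_size", None))           # -> None
print(getattr(config, "switch_over_seq_len", None))  # -> None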
modeling_powercoder.py CHANGED
@@ -119,8 +119,6 @@ class PowerCoderAttention(nn.Module):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
-        self.chunk_size = config.chunk_size
-        self.switch_over_seq_len = config.switch_over_seq_len
         self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
@@ -141,6 +139,8 @@
         padding_starts: Optional[torch.Tensor],
         past_key_value: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
@@ -189,9 +189,9 @@
             sum_of_keys=sum_of_keys,
             deg=2,
             scale=self.scaling,
-            switch_over_seq_len=self.switch_over_seq_len,
+            switch_over_seq_len=switch_over_seq_len,
         )
-        if self.switch_over_seq_len is not None and key_len >= self.switch_over_seq_len:
+        if switch_over_seq_len is not None and key_len >= switch_over_seq_len:
             past_key_value.clean_kv(self.layer_idx)
             past_key_value.update_state(state, sum_of_keys, self.layer_idx, cache_kwargs)
 
@@ -204,7 +204,8 @@
             gate_states.transpose(1, 2),
             deg=2,
             scale=self.scaling,
-            chunk_size=self.chunk_size,
+            chunk_size=chunk_size,  # enable chunked prefilling by default
+            switch_over_seq_len=switch_over_seq_len,
         )
 
         if interpolate_exp_amount == 1:
@@ -243,6 +244,8 @@ class PowerCoderDecoderLayer(GradientCheckpointingLayer):
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor]:
         residual = hidden_states
@@ -256,6 +259,8 @@ class PowerCoderDecoderLayer(GradientCheckpointingLayer):
             use_cache=use_cache,
             cache_position=cache_position,
             position_embeddings=position_embeddings,
+            chunk_size=chunk_size,
+            switch_over_seq_len=switch_over_seq_len,
             **kwargs,
         )
         hidden_states = residual + hidden_states
@@ -348,6 +353,8 @@ class PowerCoderModel(PowerCoderPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -397,6 +404,8 @@ class PowerCoderModel(PowerCoderPreTrainedModel):
                 use_cache=use_cache,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
+                chunk_size=chunk_size,
+                switch_over_seq_len=switch_over_seq_len,
                 **kwargs,
             )
 
@@ -414,11 +423,7 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
-    def __init__(self, config, chunk_size=None, switch_over_seq_len=None):
-        if chunk_size is not None:
-            config.chunk_size = chunk_size
-        if switch_over_seq_len is not None:
-            config.switch_over_seq_len = switch_over_seq_len
+    def __init__(self, config):
         super().__init__(config)
         self.model = PowerCoderModel(config)
         self.vocab_size = config.vocab_size
@@ -446,6 +451,8 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
         use_cache: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
@@ -486,6 +493,10 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
             Position indices for cached key/value states when using incremental decoding.
         logits_to_keep (`Union[int, torch.Tensor]`, *optional*, defaults to 0):
             Number of logits to compute from the end of the sequence, or specific indices to compute.
+        chunk_size (`Optional[int]`, *optional*):
+            Chunk size for training and prefilling.
+        switch_over_seq_len (`Optional[int]`, *optional*):
+            Sequence length threshold for state update.
         **kwargs:
             Additional arguments passed to the underlying model's forward method.
 
@@ -505,6 +516,8 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
             cache_position=cache_position,
+            chunk_size=chunk_size,
+            switch_over_seq_len=switch_over_seq_len,
             **kwargs,
         )
 
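Taken together, these diffs move `chunk_size` and `switch_over_seq_len` out of `PowerCoderConfig` and `PowerCoderForCausalLM.__init__` and expose them as explicit `forward()` arguments, threaded from the causal-LM head through `PowerCoderModel` and each decoder layer down to `PowerCoderAttention`. A minimal usage sketch under those assumptions; the repo id and the numeric values are illustrative placeholders, not from the diff:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("org/powercoder", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("org/powercoder", trust_remote_code=True)

inputs = tokenizer("def quicksort(arr):", return_tensors="pt")

with torch.no_grad():
    outputs = model(
        **inputs,
        chunk_size=128,            # chunk size for training and prefilling (per the new docstring)
        switch_over_seq_len=1024,  # sequence length threshold for the state update
    )
print(outputs.logits.shape)

Because the two values are now ordinary per-call arguments rather than config fields, they can differ between calls without mutating or re-saving the model configuration.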