seanmanifest committed · Commit 7edbb4d · verified · 1 parent: ef13bf1

Upload folder using huggingface_hub

config.json CHANGED
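In the hunks below, the only change is positional: "model_type": "powercoder" moves from its alphabetical slot in the body of config.json to the top of the file. JSON objects are order-insensitive, so parsing is unaffected; the key itself is what identifies the custom architecture when the checkpoint is loaded through the transformers Auto classes. A minimal loading sketch under that reading; the repo id below is a placeholder, not the actual repository name:

# Minimal sketch (hypothetical repo id). With trust_remote_code=True the Auto
# classes load the custom PowerCoder code shipped in this repository, and
# config.model_type reads "powercoder" regardless of where the key sits in the file.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "org/powercoder"  # placeholder
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type)  # "powercoder"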
@@ -1,4 +1,5 @@
 {
+  "model_type": "powercoder",
   "architectures": [
     "PowerCoderForCausalLM"
   ],
@@ -17,7 +18,6 @@
   "initializer_range": 0.018042,
   "intermediate_size": 12288,
   "max_position_embeddings": 4096,
-  "model_type": "powercoder",
   "norm_epsilon": 1e-05,
   "num_attention_heads": 24,
   "num_hidden_layers": 30,
configuration_powercoder.py CHANGED
@@ -156,8 +156,6 @@ class PowerCoderConfig(PretrainedConfig):
         residual_dropout=0.0,
         embedding_dropout=0.0,
         use_bias=True,
-        chunk_size=None,
-        switch_over_seq_len=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -178,8 +176,6 @@ class PowerCoderConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.residual_dropout = residual_dropout
         self.embedding_dropout = embedding_dropout
-        self.chunk_size = chunk_size
-        self.switch_over_seq_len = switch_over_seq_len
         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, move it to 'rope_type'.
         if self.rope_scaling is not None and "type" in self.rope_scaling:
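The configuration change removes chunk_size and switch_over_seq_len from PowerCoderConfig altogether: they are no longer named constructor parameters and are no longer stored as attributes, so a freshly constructed config does not carry them and they are not serialized into config.json. A short sketch of that effect, assuming the module above is importable locally and that the remaining constructor arguments all have defaults (neither assumption is shown in these hunks):

# Illustrative sketch only; the import path and default construction are assumptions.
# After this commit the two knobs live on forward() (see modeling_powercoder.py
# below), not on the config object.
from configuration_powercoder import PowerCoderConfig

config = PowerCoderConfig()
print(hasattr(config, "chunk_size"))           # False
print(hasattr(config, "switch_over_seq_len"))  # False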
modeling_powercoder.py CHANGED
@@ -119,8 +119,6 @@ class PowerCoderAttention(nn.Module):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
-        self.chunk_size = config.chunk_size
-        self.switch_over_seq_len = config.switch_over_seq_len
         self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
@@ -141,6 +139,8 @@
         padding_starts: Optional[torch.Tensor],
         past_key_value: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
@@ -189,9 +189,9 @@
             sum_of_keys=sum_of_keys,
             deg=2,
             scale=self.scaling,
-            switch_over_seq_len=self.switch_over_seq_len,
+            switch_over_seq_len=switch_over_seq_len,
         )
-        if self.switch_over_seq_len is not None and key_len >= self.switch_over_seq_len:
+        if switch_over_seq_len is not None and key_len >= switch_over_seq_len:
             past_key_value.clean_kv(self.layer_idx)
             past_key_value.update_state(state, sum_of_keys, self.layer_idx, cache_kwargs)
 
@@ -204,7 +204,8 @@
             gate_states.transpose(1, 2),
             deg=2,
             scale=self.scaling,
-            chunk_size=self.chunk_size,  # enable chunked prefilling by default
+            chunk_size=chunk_size,  # enable chunked prefilling by default
+            switch_over_seq_len=switch_over_seq_len,
         )
 
         if interpolate_exp_amount == 1:
@@ -243,6 +244,8 @@ class PowerCoderDecoderLayer(GradientCheckpointingLayer):
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor]:
         residual = hidden_states
@@ -256,6 +259,8 @@ class PowerCoderDecoderLayer(GradientCheckpointingLayer):
             use_cache=use_cache,
             cache_position=cache_position,
             position_embeddings=position_embeddings,
+            chunk_size=chunk_size,
+            switch_over_seq_len=switch_over_seq_len,
             **kwargs,
         )
         hidden_states = residual + hidden_states
@@ -348,6 +353,8 @@ class PowerCoderModel(PowerCoderPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -397,6 +404,8 @@ class PowerCoderModel(PowerCoderPreTrainedModel):
                 use_cache=use_cache,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
+                chunk_size=chunk_size,
+                switch_over_seq_len=switch_over_seq_len,
                 **kwargs,
             )
 
@@ -414,11 +423,7 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
-    def __init__(self, config, chunk_size=None, switch_over_seq_len=None):
-        if chunk_size is not None:
-            config.chunk_size = chunk_size
-        if switch_over_seq_len is not None:
-            config.switch_over_seq_len = switch_over_seq_len
+    def __init__(self, config):
         super().__init__(config)
         self.model = PowerCoderModel(config)
         self.vocab_size = config.vocab_size
@@ -446,6 +451,8 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
         use_cache: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
@@ -486,6 +493,10 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
             Position indices for cached key/value states when using incremental decoding.
         logits_to_keep (`Union[int, torch.Tensor]`, *optional*, defaults to 0):
             Number of logits to compute from the end of the sequence, or specific indices to compute.
+        chunk_size (`Optional[int]`, *optional*):
+            Chunk size for training and prefilling.
+        switch_over_seq_len (`Optional[int]`, *optional*):
+            Sequence length threshold for state update.
         **kwargs:
             Additional arguments passed to the underlying model's forward method.
 
@@ -505,6 +516,8 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
             cache_position=cache_position,
+            chunk_size=chunk_size,
+            switch_over_seq_len=switch_over_seq_len,
             **kwargs,
         )
 
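Taken together, the modeling changes turn chunk_size and switch_over_seq_len into call-time arguments that are threaded from PowerCoderForCausalLM.forward through PowerCoderModel and PowerCoderDecoderLayer down to PowerCoderAttention, instead of being read off the config at construction time. A hedged usage sketch of the new signature; the repo id, prompt, and numeric values are placeholders (not recommendations), and it assumes the repository also ships a tokenizer:

# Sketch of the forward-time API introduced by this commit; repo id and values
# are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "org/powercoder"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
with torch.no_grad():
    outputs = model(
        **inputs,
        chunk_size=128,            # "Chunk size for training and prefilling" (new docstring)
        switch_over_seq_len=2048,  # "Sequence length threshold for state update" (new docstring)
    )
print(outputs.logits.shape)

One practical consequence of the new signature is that the same checkpoint can be run with different chunking or switch-over settings per call, without editing or re-saving config.json.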