Upload folder using huggingface_hub
- config.json +1 -1
- configuration_powercoder.py +0 -4
- modeling_powercoder.py +23 -10
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "model_type": "powercoder",
   "architectures": [
     "PowerCoderForCausalLM"
   ],
@@ -17,7 +18,6 @@
   "initializer_range": 0.018042,
   "intermediate_size": 12288,
   "max_position_embeddings": 4096,
-  "model_type": "powercoder",
   "norm_epsilon": 1e-05,
   "num_attention_heads": 24,
   "num_hidden_layers": 30,
configuration_powercoder.py CHANGED
@@ -156,8 +156,6 @@ class PowerCoderConfig(PretrainedConfig):
         residual_dropout=0.0,
         embedding_dropout=0.0,
         use_bias=True,
-        chunk_size=None,
-        switch_over_seq_len=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -178,8 +176,6 @@ class PowerCoderConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.residual_dropout = residual_dropout
         self.embedding_dropout = embedding_dropout
-        self.chunk_size = chunk_size
-        self.switch_over_seq_len = switch_over_seq_len
         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, move it to 'rope_type'.
         if self.rope_scaling is not None and "type" in self.rope_scaling:
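With this change, `chunk_size` and `switch_over_seq_len` are no longer `PowerCoderConfig` fields; they are supplied at call time instead (see the modeling diff below). A minimal sketch of what that means on the config side, assuming a placeholder repo id and that the saved config.json carries neither key:

# Minimal sketch; "org/powercoder" is a placeholder repo id, not from this commit.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("org/powercoder", trust_remote_code=True)

# After this commit the config no longer defines these attributes;
# the values are passed to forward() instead (see modeling_powercoder.py below).
print(getattr(config, "chunk_size", None))           # -> None
print(getattr(config, "switch_over_seq_len", None))  # -> None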
modeling_powercoder.py CHANGED
@@ -119,8 +119,6 @@ class PowerCoderAttention(nn.Module):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
-        self.chunk_size = config.chunk_size
-        self.switch_over_seq_len = config.switch_over_seq_len
         self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
@@ -141,6 +139,8 @@
         padding_starts: Optional[torch.Tensor],
         past_key_value: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
@@ -189,9 +189,9 @@
             sum_of_keys=sum_of_keys,
             deg=2,
             scale=self.scaling,
-            switch_over_seq_len=self.switch_over_seq_len,
+            switch_over_seq_len=switch_over_seq_len,
         )
-        if self.switch_over_seq_len is not None and key_len >= self.switch_over_seq_len:
+        if switch_over_seq_len is not None and key_len >= switch_over_seq_len:
             past_key_value.clean_kv(self.layer_idx)
             past_key_value.update_state(state, sum_of_keys, self.layer_idx, cache_kwargs)
 
@@ -204,7 +204,8 @@
             gate_states.transpose(1, 2),
             deg=2,
             scale=self.scaling,
-            chunk_size=self.chunk_size,
+            chunk_size=chunk_size,  # enable chunked prefilling by default
+            switch_over_seq_len=switch_over_seq_len,
         )
 
         if interpolate_exp_amount == 1:
@@ -243,6 +244,8 @@ class PowerCoderDecoderLayer(GradientCheckpointingLayer):
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor]:
         residual = hidden_states
@@ -256,6 +259,8 @@ class PowerCoderDecoderLayer(GradientCheckpointingLayer):
             use_cache=use_cache,
             cache_position=cache_position,
             position_embeddings=position_embeddings,
+            chunk_size=chunk_size,
+            switch_over_seq_len=switch_over_seq_len,
             **kwargs,
         )
         hidden_states = residual + hidden_states
@@ -348,6 +353,8 @@ class PowerCoderModel(PowerCoderPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -397,6 +404,8 @@ class PowerCoderModel(PowerCoderPreTrainedModel):
                 use_cache=use_cache,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
+                chunk_size=chunk_size,
+                switch_over_seq_len=switch_over_seq_len,
                 **kwargs,
             )
 
@@ -414,11 +423,7 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
-    def __init__(self, config, chunk_size=None, switch_over_seq_len=None):
-        if chunk_size is not None:
-            config.chunk_size = chunk_size
-        if switch_over_seq_len is not None:
-            config.switch_over_seq_len = switch_over_seq_len
+    def __init__(self, config):
         super().__init__(config)
         self.model = PowerCoderModel(config)
         self.vocab_size = config.vocab_size
@@ -446,6 +451,8 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
         use_cache: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        chunk_size: Optional[int] = None,
+        switch_over_seq_len: Optional[int] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
@@ -486,6 +493,10 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
             Position indices for cached key/value states when using incremental decoding.
         logits_to_keep (`Union[int, torch.Tensor]`, *optional*, defaults to 0):
             Number of logits to compute from the end of the sequence, or specific indices to compute.
+        chunk_size (`Optional[int]`, *optional*):
+            Chunk size for training and prefilling.
+        switch_over_seq_len (`Optional[int]`, *optional*):
+            Sequence length threshold for state update.
         **kwargs:
             Additional arguments passed to the underlying model's forward method.
 
@@ -505,6 +516,8 @@ class PowerCoderForCausalLM(PowerCoderPreTrainedModel, GenerationMixin):
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
             cache_position=cache_position,
+            chunk_size=chunk_size,
+            switch_over_seq_len=switch_over_seq_len,
             **kwargs,
         )
 
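Taken together, these diffs move `chunk_size` and `switch_over_seq_len` out of `PowerCoderConfig` and `PowerCoderForCausalLM.__init__` and expose them as explicit `forward()` arguments, threaded from the causal-LM head through `PowerCoderModel` and each decoder layer down to `PowerCoderAttention`. A minimal usage sketch under those assumptions; the repo id and the numeric values are illustrative placeholders, not from the diff:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("org/powercoder", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("org/powercoder", trust_remote_code=True)

inputs = tokenizer("def quicksort(arr):", return_tensors="pt")

with torch.no_grad():
    outputs = model(
        **inputs,
        chunk_size=128,            # chunk size for training and prefilling (per the new docstring)
        switch_over_seq_len=1024,  # sequence length threshold for the state update
    )
print(outputs.logits.shape)

Because the two values are now ordinary per-call arguments rather than config fields, they can differ between calls without mutating or re-saving the model configuration.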