SequentialLearning
/

SuperLinear

mixture-of-experts

Model card · Files and versions

lirannoc committed on Jun 3

Commit

868b7c0

·

verified ·

1 Parent(s): 628d649

Update modeling_super_linear.py

Files changed (1) hide show

modeling_super_linear.py +2 -3

modeling_super_linear.py CHANGED Viewed

@@ -561,7 +561,6 @@ class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
     def forward(self,
                 inputs_embeds: torch.Tensor = None,
-                prediction_len: int = None,
                 attention_mask: Optional[torch.Tensor] = None,
                 past_key_values: Optional[Tuple] = None,
                 use_cache: bool = True,
@@ -577,11 +576,11 @@ class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
         return CausalLMOutputWithCrossAttentions(loss=None,logits=preds,past_key_values=None,hidden_states=None,attentions=None,)
-    def prepare_inputs_for_generation(self, inputs_embeds, past_key_values=None, prediction_len=None, **kwargs):
         if past_key_values is not None:
             # only feed the last new step
             inputs_embeds = inputs_embeds[:, -1:, :]
-        return {"inputs_embeds": inputs_embeds, "past_key_values": past_key_values, "prediction_len": prediction_len}
     def _reorder_cache(self, past, beam_idx, **kwargs):
         return past  # backbone keeps no KV cache

     def forward(self,
                 inputs_embeds: torch.Tensor = None,
                 attention_mask: Optional[torch.Tensor] = None,
                 past_key_values: Optional[Tuple] = None,
                 use_cache: bool = True,
         return CausalLMOutputWithCrossAttentions(loss=None,logits=preds,past_key_values=None,hidden_states=None,attentions=None,)
+    def prepare_inputs_for_generation(self, inputs_embeds, past_key_values=None, **kwargs):
         if past_key_values is not None:
             # only feed the last new step
             inputs_embeds = inputs_embeds[:, -1:, :]
+        return {"inputs_embeds": inputs_embeds, "past_key_values": past_key_values}
     def _reorder_cache(self, past, beam_idx, **kwargs):
         return past  # backbone keeps no KV cache