Spaces:

knowledgator
/

ChemicalConverters

Runtime error

App Files Files Community

ChemicalConverters / modeling /model.py

BioMike

Upload 23 files

7476d14 verified almost 2 years ago

raw

history blame

27.5 kB

	import copy
	import math
	import warnings
	from typing import List, Optional, Tuple, Union

	from transformers import MT5PreTrainedModel
	from transformers.models.mt5 import MT5Stack
	from transformers.modeling_outputs import Seq2SeqModelOutput,Seq2SeqLMOutput, BaseModelOutput
	from transformers.utils import (
	add_start_docstrings,
	add_start_docstrings_to_model_forward,
	logging,
	replace_return_docstrings,
	)

	from transformers.utils.model_parallel_utils import assert_device_map, get_device_map

	import torch
	from torch import nn
	from torch.nn import CrossEntropyLoss

	from .config import MT5Config
	from .docstrings import (
	PARALLELIZE_DOCSTRING,
	DEPARALLELIZE_DOCSTRING,
	__HEAD_MASK_WARNING_MSG,
	MT5_START_DOCSTRING,
	MT5_INPUTS_DOCSTRING,
	)


	logger = logging.get_logger(__name__)

	_CONFIG_FOR_DOC = "MT5Config"
	_CHECKPOINT_FOR_DOC = "mt5-small"


	class MT5Model(MT5PreTrainedModel):
	r"""
	Examples:

	```python
	>>> from transformers import MT5Model, AutoTokenizer

	>>> model = MT5Model.from_pretrained("google/mt5-small")
	>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
	>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
	>>> summary = "Weiter Verhandlung in Syrien."
	>>> inputs = tokenizer(article, return_tensors="pt")
	>>> labels = tokenizer(text_target=summary, return_tensors="pt")

	>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
	>>> hidden_states = outputs.last_hidden_state
	```"""

	model_type = "mt5"
	config_class = MT5Config
	_keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
	_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
	_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

	# Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5
	def __init__(self, config: MT5Config):
	super().__init__(config)
	self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model)
	if config.shared_embedding:
	self.decoder_embedding = self.encoder_embedding
	else:
	self.decoder_emebedding = nn.Embedding(config.decoder_vocab_size, config.d_model)

	encoder_config = copy.deepcopy(config)
	encoder_config.is_decoder = False
	encoder_config.use_cache = False
	encoder_config.is_encoder_decoder = False
	self.encoder = MT5Stack(encoder_config, self.encoder_embedding)

	decoder_config = copy.deepcopy(config)
	decoder_config.is_decoder = True
	decoder_config.is_encoder_decoder = False
	decoder_config.num_layers = config.num_decoder_layers
	self.decoder = MT5Stack(decoder_config, self.decoder_emebedding)

	# Initialize weights and apply final processing
	self.post_init()

	# Model parallel
	self.model_parallel = False
	self.device_map = None

	# Copied from transformers.models.t5.modeling_t5.T5Model.parallelize
	def parallelize(self, device_map=None):
	warnings.warn(
	"`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
	" with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
	" `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':"
	" 0, 'encoder.block.1': 1, ...}",
	FutureWarning,
	)
	self.device_map = (
	get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
	if device_map is None
	else device_map
	)
	assert_device_map(self.device_map, len(self.encoder.block))
	self.encoder.parallelize(self.device_map)
	self.decoder.parallelize(self.device_map)
	self.model_parallel = True

	@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
	# Copied from transformers.models.t5.modeling_t5.T5Model.deparallelize
	def deparallelize(self):
	warnings.warn(
	"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	FutureWarning,
	)
	self.encoder.deparallelize()
	self.decoder.deparallelize()
	self.encoder = self.encoder.to("cpu")
	self.decoder = self.decoder.to("cpu")
	self.model_parallel = False
	self.device_map = None
	torch.cuda.empty_cache()

	# Copied from transformers.models.t5.modeling_t5.T5Model.get_input_embeddings
	def get_input_embeddings(self):
	return self.encoder_embedding

	# Copied from transformers.models.t5.modeling_t5.T5Model.set_input_embeddings
	def set_input_embeddings(self, new_embeddings):
	self.encoder_embedding = new_embeddings
	self.encoder.set_input_embeddings(new_embeddings)
	self.decoder.set_input_embeddings(new_embeddings)

	# Copied from transformers.models.t5.modeling_t5.T5Model.get_encoder
	def get_encoder(self):
	return self.encoder

	# Copied from transformers.models.t5.modeling_t5.T5Model.get_decoder
	def get_decoder(self):
	return self.decoder

	# Copied from transformers.models.t5.modeling_t5.T5Model._prune_heads
	def _prune_heads(self, heads_to_prune):
	"""
	Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
	class PreTrainedModel
	"""
	for layer, heads in heads_to_prune.items():
	self.encoder.layer[layer].attention.prune_heads(heads)

	@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
	# Copied from transformers.models.t5.modeling_t5.T5Model.forward with T5->MT5, t5->mt5
	def forward(
	self,
	input_ids: Optional[torch.LongTensor] = None,
	attention_mask: Optional[torch.FloatTensor] = None,
	decoder_input_ids: Optional[torch.LongTensor] = None,
	decoder_attention_mask: Optional[torch.BoolTensor] = None,
	head_mask: Optional[torch.FloatTensor] = None,
	decoder_head_mask: Optional[torch.FloatTensor] = None,
	cross_attn_head_mask: Optional[torch.Tensor] = None,
	encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	inputs_embeds: Optional[torch.Tensor] = None,
	decoder_inputs_embeds: Optional[torch.Tensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
	r"""
	Returns:

	Example:

	```python
	>>> from transformers import AutoTokenizer, MT5Model

	>>> tokenizer = AutoTokenizer.from_pretrained("mt5-small")
	>>> model = MT5Model.from_pretrained("mt5-small")

	>>> input_ids = tokenizer(
	... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
	... ).input_ids # Batch size 1
	>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1

	>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for MT5Model.
	>>> # This is not needed for torch's MT5ForConditionalGeneration as it does this internally using labels arg.
	>>> decoder_input_ids = model._shift_right(decoder_input_ids)

	>>> # forward pass
	>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
	>>> last_hidden_states = outputs.last_hidden_state
	```"""
	use_cache = use_cache if use_cache is not None else self.config.use_cache
	return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	# FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
	if head_mask is not None and decoder_head_mask is None:
	if self.config.num_layers == self.config.num_decoder_layers:
	warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
	decoder_head_mask = head_mask

	# Encode if needed (training, first prediction pass)
	if encoder_outputs is None:
	encoder_outputs = self.encoder(
	input_ids=input_ids,
	attention_mask=attention_mask,
	inputs_embeds=inputs_embeds,
	head_mask=head_mask,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)
	elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
	encoder_outputs = BaseModelOutput(
	last_hidden_state=encoder_outputs[0],
	hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
	attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
	)

	hidden_states = encoder_outputs[0]

	# Set device for model parallelism
	if self.model_parallel:
	torch.cuda.set_device(self.decoder.first_device)
	hidden_states = hidden_states.to(self.decoder.first_device)
	if decoder_input_ids is not None:
	decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
	if attention_mask is not None:
	attention_mask = attention_mask.to(self.decoder.first_device)
	if decoder_attention_mask is not None:
	decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

	# Decode
	decoder_outputs = self.decoder(
	input_ids=decoder_input_ids,
	attention_mask=decoder_attention_mask,
	inputs_embeds=decoder_inputs_embeds,
	past_key_values=past_key_values,
	encoder_hidden_states=hidden_states,
	encoder_attention_mask=attention_mask,
	head_mask=decoder_head_mask,
	cross_attn_head_mask=cross_attn_head_mask,
	use_cache=use_cache,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)

	if not return_dict:
	return decoder_outputs + encoder_outputs

	return Seq2SeqModelOutput(
	last_hidden_state=decoder_outputs.last_hidden_state,
	past_key_values=decoder_outputs.past_key_values,
	decoder_hidden_states=decoder_outputs.hidden_states,
	decoder_attentions=decoder_outputs.attentions,
	cross_attentions=decoder_outputs.cross_attentions,
	encoder_last_hidden_state=encoder_outputs.last_hidden_state,
	encoder_hidden_states=encoder_outputs.hidden_states,
	encoder_attentions=encoder_outputs.attentions,
	)


	@add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING)
	class MT5ForConditionalGeneration(MT5PreTrainedModel):
	r"""
	Examples:

	```python
	>>> from transformers import MT5ForConditionalGeneration, AutoTokenizer

	>>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
	>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
	>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
	>>> summary = "Weiter Verhandlung in Syrien."
	>>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

	>>> outputs = model(**inputs)
	>>> loss = outputs.loss
	```"""

	model_type = "mt5"
	config_class = MT5Config
	_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
	_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5
	def __init__(self, config: MT5Config):
	super().__init__(config)
	self.model_dim = config.d_model

	self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model)
	if config.shared_embedding:
	self.decoder_embedding = self.encoder_embedding
	else:
	self.decoder_emebedding = nn.Embedding(config.decoder_vocab_size, config.d_model)

	encoder_config = copy.deepcopy(config)
	encoder_config.is_decoder = False
	encoder_config.use_cache = False
	encoder_config.is_encoder_decoder = False
	self.encoder = MT5Stack(encoder_config, self.encoder_embedding)

	decoder_config = copy.deepcopy(config)
	decoder_config.is_decoder = True
	decoder_config.is_encoder_decoder = False
	decoder_config.num_layers = config.num_decoder_layers
	self.decoder = MT5Stack(decoder_config, self.decoder_emebedding)

	self.lm_head = nn.Linear(config.d_model, config.decoder_vocab_size, bias=False)

	# Initialize weights and apply final processing
	self.post_init()

	# Model parallel
	self.model_parallel = False
	self.device_map = None

	@add_start_docstrings(PARALLELIZE_DOCSTRING)
	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize
	def parallelize(self, device_map=None):
	warnings.warn(
	"`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
	" should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
	" provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
	" {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
	FutureWarning,
	)
	self.device_map = (
	get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
	if device_map is None
	else device_map
	)
	assert_device_map(self.device_map, len(self.encoder.block))
	self.encoder.parallelize(self.device_map)
	self.decoder.parallelize(self.device_map)
	self.lm_head = self.lm_head.to(self.decoder.first_device)
	self.model_parallel = True

	@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.deparallelize
	def deparallelize(self):
	warnings.warn(
	"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	FutureWarning,
	)
	self.encoder.deparallelize()
	self.decoder.deparallelize()
	self.encoder = self.encoder.to("cpu")
	self.decoder = self.decoder.to("cpu")
	self.lm_head = self.lm_head.to("cpu")
	self.model_parallel = False
	self.device_map = None
	torch.cuda.empty_cache()

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_input_embeddings
	def get_input_embeddings(self):
	return self.encoder_embedding

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_input_embeddings
	def set_input_embeddings(self, new_embeddings):
	self.encoder_embedding = new_embeddings
	self.encoder.set_input_embeddings(new_embeddings)
	self.decoder.set_input_embeddings(new_embeddings)

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings
	def set_output_embeddings(self, new_embeddings):
	self.lm_head = new_embeddings

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings
	def get_output_embeddings(self):
	return self.lm_head

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder
	def get_encoder(self):
	return self.encoder

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_decoder
	def get_decoder(self):
	return self.decoder

	@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5
	def forward(
	self,
	input_ids: Optional[torch.LongTensor] = None,
	attention_mask: Optional[torch.FloatTensor] = None,
	decoder_input_ids: Optional[torch.LongTensor] = None,
	decoder_attention_mask: Optional[torch.BoolTensor] = None,
	head_mask: Optional[torch.FloatTensor] = None,
	decoder_head_mask: Optional[torch.FloatTensor] = None,
	cross_attn_head_mask: Optional[torch.Tensor] = None,
	encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
	past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
	inputs_embeds: Optional[torch.FloatTensor] = None,
	decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
	labels: Optional[torch.LongTensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
	r"""
	labels (`torch.LongTensor` of shape `(batch_size,)`, optional):
	Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
	config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
	labels in `[0, ..., config.vocab_size]`

	Returns:

	Examples:

	```python
	>>> from transformers import AutoTokenizer, MT5ForConditionalGeneration

	>>> tokenizer = AutoTokenizer.from_pretrained("mt5-small")
	>>> model = MT5ForConditionalGeneration.from_pretrained("mt5-small")

	>>> # training
	>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
	>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
	>>> outputs = model(input_ids=input_ids, labels=labels)
	>>> loss = outputs.loss
	>>> logits = outputs.logits

	>>> # inference
	>>> input_ids = tokenizer(
	... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
	... ).input_ids # Batch size 1
	>>> outputs = model.generate(input_ids)
	>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
	>>> # studies have shown that owning a dog is good for you.
	```"""
	use_cache = use_cache if use_cache is not None else self.config.use_cache
	return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	# FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
	if head_mask is not None and decoder_head_mask is None:
	if self.config.num_layers == self.config.num_decoder_layers:
	warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
	decoder_head_mask = head_mask

	# Encode if needed (training, first prediction pass)
	if encoder_outputs is None:
	# Convert encoder inputs in embeddings if needed
	encoder_outputs = self.encoder(
	input_ids=input_ids,
	attention_mask=attention_mask,
	inputs_embeds=inputs_embeds,
	head_mask=head_mask,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)
	elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
	encoder_outputs = BaseModelOutput(
	last_hidden_state=encoder_outputs[0],
	hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
	attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
	)

	hidden_states = encoder_outputs[0]

	if self.model_parallel:
	torch.cuda.set_device(self.decoder.first_device)

	if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
	# get decoder inputs from shifting lm labels to the right
	decoder_input_ids = self._shift_right(labels)

	# Set device for model parallelism
	if self.model_parallel:
	torch.cuda.set_device(self.decoder.first_device)
	hidden_states = hidden_states.to(self.decoder.first_device)
	if decoder_input_ids is not None:
	decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
	if attention_mask is not None:
	attention_mask = attention_mask.to(self.decoder.first_device)
	if decoder_attention_mask is not None:
	decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

	# Decode
	decoder_outputs = self.decoder(
	input_ids=decoder_input_ids,
	attention_mask=decoder_attention_mask,
	inputs_embeds=decoder_inputs_embeds,
	past_key_values=past_key_values,
	encoder_hidden_states=hidden_states,
	encoder_attention_mask=attention_mask,
	head_mask=decoder_head_mask,
	cross_attn_head_mask=cross_attn_head_mask,
	use_cache=use_cache,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)

	sequence_output = decoder_outputs[0]

	# Set device for model parallelism
	if self.model_parallel:
	torch.cuda.set_device(self.encoder.first_device)
	self.lm_head = self.lm_head.to(self.encoder.first_device)
	sequence_output = sequence_output.to(self.lm_head.weight.device)

	if self.config.tie_word_embeddings:
	# Rescale output before projecting on vocab
	# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
	sequence_output = sequence_output * (self.model_dim**-0.5)

	lm_logits = self.lm_head(sequence_output)

	loss = None
	if labels is not None:
	loss_fct = CrossEntropyLoss(ignore_index=-100)
	# move labels to correct device to enable PP
	labels = labels.to(lm_logits.device)
	loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
	# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

	if not return_dict:
	output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
	return ((loss,) + output) if loss is not None else output

	return Seq2SeqLMOutput(
	loss=loss,
	logits=lm_logits,
	past_key_values=decoder_outputs.past_key_values,
	decoder_hidden_states=decoder_outputs.hidden_states,
	decoder_attentions=decoder_outputs.attentions,
	cross_attentions=decoder_outputs.cross_attentions,
	encoder_last_hidden_state=encoder_outputs.last_hidden_state,
	encoder_hidden_states=encoder_outputs.hidden_states,
	encoder_attentions=encoder_outputs.attentions,
	)

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation
	def prepare_inputs_for_generation(
	self,
	input_ids,
	past_key_values=None,
	attention_mask=None,
	head_mask=None,
	decoder_head_mask=None,
	decoder_attention_mask=None,
	cross_attn_head_mask=None,
	use_cache=None,
	encoder_outputs=None,
	**kwargs,
	):
	# cut decoder_input_ids if past_key_values is used
	if past_key_values is not None:
	past_length = past_key_values[0][0].shape[2]

	# Some generation methods already pass only the last input ID
	if input_ids.shape[1] > past_length:
	remove_prefix_length = past_length
	else:
	# Default to old behavior: keep only final ID
	remove_prefix_length = input_ids.shape[1] - 1

	input_ids = input_ids[:, remove_prefix_length:]

	return {
	"decoder_input_ids": input_ids,
	"past_key_values": past_key_values,
	"encoder_outputs": encoder_outputs,
	"attention_mask": attention_mask,
	"head_mask": head_mask,
	"decoder_head_mask": decoder_head_mask,
	"decoder_attention_mask": decoder_attention_mask,
	"cross_attn_head_mask": cross_attn_head_mask,
	"use_cache": use_cache,
	}

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels
	def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
	return self._shift_right(labels)

	# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache
	def _reorder_cache(self, past_key_values, beam_idx):
	# if decoder past is not included in output
	# speedy decoding is disabled and no need to reorder
	if past_key_values is None:
	logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
	return past_key_values

	reordered_decoder_past = ()
	for layer_past_states in past_key_values:
	# get the correct batch idx from layer past batch dim
	# batch dim of `past` is at 2nd position
	reordered_layer_past_states = ()
	for layer_past_state in layer_past_states:
	# need to set correct `past` for each of the four key / value states
	reordered_layer_past_states = reordered_layer_past_states + (
	layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
	)

	if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
	raise ValueError(
	f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
	)
	if len(reordered_layer_past_states) != len(layer_past_states):
	raise ValueError(
	f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
	)

	reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
	return reordered_decoder_past