# -*- coding: utf-8 -*-
# @Time   : 2021/12/30 8:35 PM
# @Author : JianingWang
# @File   : mlm.py
import logging
from typing import Union, Tuple, Optional

import torch
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import MaskedLMOutput
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel, BertOnlyMLMHead
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaLMHead
from transformers.models.albert.modeling_albert import AlbertPreTrainedModel, AlbertModel, AlbertMLMHead
from transformers.models.roformer.modeling_roformer import RoFormerPreTrainedModel, RoFormerModel, RoFormerOnlyMLMHead

logger = logging.getLogger(__name__)
| """ | |
| Function: Use MLM to pre-train BERT | |
| Notes: | |
| - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
| """ | |
class BertForMaskedLM(BertPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # the -100 index (non-masked tokens) is ignored
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,                  # scalar (None if no labels were given)
            logits=prediction_scores,             # (batch_size, seq_len, vocab_size)
            hidden_states=outputs.hidden_states,  # tuple of (batch_size, seq_len, hidden_size)
            attentions=outputs.attentions,
        )
| """ | |
| Function: Use MLM to pre-train RoBERTa | |
| Notes: | |
| - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
| """ | |
class RobertaForMaskedLM(RobertaPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.lm_head = RobertaLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # the -100 index (non-masked tokens) is ignored
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,                  # scalar (None if no labels were given)
            logits=prediction_scores,             # (batch_size, seq_len, vocab_size)
            hidden_states=outputs.hidden_states,  # tuple of (batch_size, seq_len, hidden_size)
            attentions=outputs.attentions,
        )
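
# Usage sketch (illustrative only, not part of the original file): RoBERTa tokenizers use
# "<mask>" rather than "[MASK]" as the mask token, e.g.:
#
#   from transformers import RobertaTokenizer
#   tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   model = RobertaForMaskedLM.from_pretrained("roberta-base")
#   inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
#   logits = model(**inputs).logits  # (1, seq_len, vocab_size)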
| """ | |
| Function: Use MLM to pre-train ALBERT | |
| Notes: | |
| - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
| """ | |
class AlbertForMaskedLM(AlbertPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AlbertTokenizer, AlbertForMaskedLM

        >>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_outputs = outputs[0]
        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
| """ | |
| Function: Use MLM to pre-train RoFormer | |
| Notes: | |
| - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
| """ | |
class RoFormerForMaskedLM(RoFormerPreTrainedModel):

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `RoFormerForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roformer = RoFormerModel(config)
        self.cls = RoFormerOnlyMLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.Tensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.roformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = ignored token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
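
# Note (illustrative, not part of the original file): RoFormer uses rotary position embeddings,
# and its commonly used pretrained checkpoints are Chinese, e.g. (assumed checkpoint name):
#
#   from transformers import RoFormerTokenizer
#   tokenizer = RoFormerTokenizer.from_pretrained("junnyu/roformer_chinese_base")
#   model = RoFormerForMaskedLM.from_pretrained("junnyu/roformer_chinese_base")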


if __name__ == "__main__":
    from transformers.models.bert.tokenization_bert import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")

    input_text = "Today is a nice day, I will [MASK] to play [MASK] with my friends."
    inputs = tokenizer(input_text, return_tensors="pt")
    masked_positions = inputs["input_ids"] == tokenizer.mask_token_id
    print("inputs=", inputs)
    """
    inputs= {'input_ids': tensor([[ 101, 2651, 2003, 1037, 3835, 2154, 1010, 1045, 2097,  103, 2000, 2377,
              103, 2007, 2026, 2814, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
    """

    outputs = model(**inputs)
    masked_results = outputs.logits.argmax(-1)[masked_positions]
    masked_results = tokenizer.convert_ids_to_tokens(masked_results)
    print("masked_results=", masked_results)
    """
    masked_results= ['have', 'football']
    """