from typing import Optional, Tuple, Union

import torch
import torch.nn as nn

from transformers import Wav2Vec2BertModel, Wav2Vec2BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

# Position of the hidden-states tuple in the base model's output when return_dict=False
# (same constant as in transformers' wav2vec2_bert modeling file).
_HIDDEN_STATES_START_POSITION = 2

class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__
    # with Wav2Vec2->Wav2Vec2Bert, wav2vec2->wav2vec2_bert
    def __init__(self, config):
        super().__init__(config)

        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Sequence classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
            )
        self.wav2vec2_bert = Wav2Vec2BertModel(config)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        # Retained from the upstream classification head; not used by forward() below.
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        # Convolutional pooling head: downsamples along the time axis and projects the
        # hidden states to classifier_proj_size channels.
        self.pooled_conv = nn.Sequential(
            nn.Conv1d(config.hidden_size, config.hidden_size // 2, kernel_size=15, stride=3, padding=30),
            nn.AvgPool1d(2, 2),
            nn.BatchNorm1d(config.hidden_size // 2),
            nn.Conv1d(config.hidden_size // 2, config.classifier_proj_size, kernel_size=7, stride=2, padding=0),
            nn.ReLU(),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.091),
            nn.Linear(config.classifier_proj_size, config.classifier_proj_size // 2),
            nn.ReLU(),
            nn.Linear(config.classifier_proj_size // 2, config.num_labels),
            nn.ReLU(),
        )

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wav2vec2_bert.parameters():
            param.requires_grad = False

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward
    # with Wav2Vec2->Wav2Vec2Bert, wav2vec2->wav2vec2_bert, WAV_2_VEC_2->WAV2VEC2_BERT, input_values->input_features
    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
| r""" | |
| labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): | |
| Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., | |
| config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If | |
| `config.num_labels > 1` a classification loss is computed (Cross-Entropy). | |
| """ | |
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.wav2vec2_bert(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        # (batch, time, hidden) -> (batch, hidden, time) for the Conv1d stack, then mean-pool over time.
        hidden_states = hidden_states.permute(0, 2, 1)
        hidden_states = self.pooled_conv(hidden_states)
        hidden_states = torch.mean(hidden_states, dim=2)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = nn.L1Loss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1, self.config.num_labels))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
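

# The block below is a minimal, hypothetical usage sketch (not part of the original model
# code): it builds a tiny, randomly initialized Wav2Vec2BertConfig purely to check that the
# convolutional pooling head and classifier produce logits of the expected shape. The config
# values are illustrative assumptions; real use would load a pretrained checkpoint via
# from_pretrained and a matching feature extractor instead.
if __name__ == "__main__":
    from transformers import Wav2Vec2BertConfig

    config = Wav2Vec2BertConfig(
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=128,
        classifier_proj_size=32,
        num_labels=1,  # single regression target, matching the L1 loss in forward()
    )
    model = Wav2Vec2BertForSequenceClassification(config)
    model.freeze_base_model()  # freezes the wav2vec2_bert backbone; the head modules stay trainable
    model.eval()

    # input_features: (batch, frames, feature_size) log-mel-style features, where
    # feature_size equals config.feature_projection_input_dim.
    dummy_features = torch.randn(2, 200, config.feature_projection_input_dim)
    with torch.no_grad():
        out = model(dummy_features)
    print(out.logits.shape)  # expected: torch.Size([2, 1])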