```python
from transformers import SequenceFeatureExtractor
import numpy as np
import torch


class AntispoofingFeatureExtractor(SequenceFeatureExtractor):
    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=True,
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs,
        )
        self.return_attention_mask = return_attention_mask

    def __call__(self, audio, sampling_rate=None, return_tensors=True, **kwargs):
        # Repeat-pad (or crop) the raw waveform to a fixed 64,600 samples
        # (about 4 seconds at 16 kHz) and return it as "input_values".
        # `sampling_rate` and `return_tensors` are accepted for API
        # compatibility but are not used here.
        audio = self.pad(audio, 64600)
        audio = torch.tensor(audio, dtype=torch.float32)
        return {"input_values": audio}

    def pad(self, x, max_len):
        # Note: this replaces SequenceFeatureExtractor.pad with a simpler
        # fixed-length scheme: crop long inputs, repeat-tile short ones.
        x_len = x.shape[0]
        if x_len >= max_len:
            return x[:max_len]
        # Tile the signal enough times to exceed max_len, then crop.
        num_repeats = max_len // x_len + 1
        padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
        return padded_x
```
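
For reference, a minimal usage sketch is shown below. The random NumPy waveform and the `extractor` variable name are illustrative assumptions; any 16 kHz mono signal loaded as a 1-D array would work the same way.

```python
import numpy as np

# Illustrative only: a random 3-second "waveform" at 16 kHz stands in for real audio.
extractor = AntispoofingFeatureExtractor(sampling_rate=16000)
waveform = np.random.randn(3 * 16000).astype(np.float32)

features = extractor(waveform)
print(features["input_values"].shape)  # torch.Size([64600]) after repeat-padding
```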