Spaces: Running on Zero
| from __future__ import annotations | |
| from typing import Any, Dict, Tuple, Union, Optional | |
| import torch | |
| import yaml | |
| from torch import nn | |
| from .heads import ISTFTHead | |
| from .models import VocosBackbone | |
class Vocos(nn.Module):
    """Fourier-based neural vocoder for audio synthesis.

    Composed of two sub-modules built from a config object: a
    ``VocosBackbone`` that transforms input features and an ``ISTFTHead``
    that converts the transformed features into an audio waveform.
    Primarily intended for inference.
    """

    def __init__(self, args):
        super().__init__()
        # Hoist the two config sub-trees once instead of re-traversing
        # `args.vocos.*` for every constructor argument.
        backbone_cfg = args.vocos.backbone
        head_cfg = args.vocos.head
        self.backbone = VocosBackbone(
            input_channels=backbone_cfg.input_channels,
            dim=backbone_cfg.dim,
            intermediate_dim=backbone_cfg.intermediate_dim,
            num_layers=backbone_cfg.num_layers,
        )
        self.head = ISTFTHead(
            dim=head_cfg.dim,
            n_fft=head_cfg.n_fft,
            hop_length=head_cfg.hop_length,
            padding=head_cfg.padding,
        )

    def forward(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """Decode an audio waveform from precomputed features.

        The features are passed through the backbone and then the head to
        reconstruct the audio output.

        Args:
            features_input (Tensor): Feature tensor of shape (B, C, L),
                where B is the batch size, C the feature dimension, and
                L the sequence length.
            **kwargs: Extra keyword arguments forwarded to the backbone.

        Returns:
            Tensor: Reconstructed audio waveform of shape (B, T).
        """
        hidden = self.backbone(features_input, **kwargs)
        return self.head(hidden)