Spaces: Running on Zero
| from __future__ import annotations | |
| from typing import Any, Dict, Tuple, Union, Optional | |
| import torch | |
| import yaml | |
| from torch import nn | |
| from .heads import ISTFTHead | |
| from .models import VocosBackbone | |
class Vocos(nn.Module):
    """Fourier-based neural vocoder for audio synthesis.

    Composed of two sub-modules built from a config object: a
    ``VocosBackbone`` that transforms input features and an ``ISTFTHead``
    that converts the transformed features into an audio waveform.
    Primarily intended for inference.
    """

    def __init__(self, args):
        super().__init__()
        # Hoist the two config sub-trees once instead of re-traversing
        # `args.vocos.*` for every constructor argument.
        backbone_cfg = args.vocos.backbone
        head_cfg = args.vocos.head
        self.backbone = VocosBackbone(
            input_channels=backbone_cfg.input_channels,
            dim=backbone_cfg.dim,
            intermediate_dim=backbone_cfg.intermediate_dim,
            num_layers=backbone_cfg.num_layers,
        )
        self.head = ISTFTHead(
            dim=head_cfg.dim,
            n_fft=head_cfg.n_fft,
            hop_length=head_cfg.hop_length,
            padding=head_cfg.padding,
        )

    def forward(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """Decode an audio waveform from precomputed features.

        The features are passed through the backbone and then the head to
        reconstruct the audio output.

        Args:
            features_input (Tensor): Feature tensor of shape (B, C, L),
                where B is the batch size, C the feature dimension, and
                L the sequence length.
            **kwargs: Extra keyword arguments forwarded to the backbone.

        Returns:
            Tensor: Reconstructed audio waveform of shape (B, T).
        """
        hidden = self.backbone(features_input, **kwargs)
        return self.head(hidden)