from typing import List

import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.utils import parametrize

from TTS.vocoder.layers.lvc_block import LVCBlock

LRELU_SLOPE = 0.1

class UnivnetGenerator(torch.nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        cond_channels: int,
        upsample_factors: List[int],
        lvc_layers_each_block: int,
        lvc_kernel_size: int,
        kpnet_hidden_channels: int,
        kpnet_conv_size: int,
        dropout: float,
        use_weight_norm=True,
    ):
| """Univnet Generator network. | |
| Paper: https://arxiv.org/pdf/2106.07889.pdf | |
| Args: | |
| in_channels (int): Number of input tensor channels. | |
| out_channels (int): Number of channels of the output tensor. | |
| hidden_channels (int): Number of hidden network channels. | |
| cond_channels (int): Number of channels of the conditioning tensors. | |
| upsample_factors (List[int]): List of uplsample factors for the upsampling layers. | |
| lvc_layers_each_block (int): Number of LVC layers in each block. | |
| lvc_kernel_size (int): Kernel size of the LVC layers. | |
| kpnet_hidden_channels (int): Number of hidden channels in the key-point network. | |
| kpnet_conv_size (int): Number of convolution channels in the key-point network. | |
| dropout (float): Dropout rate. | |
| use_weight_norm (bool, optional): Enable/disable weight norm. Defaults to True. | |
| """ | |
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.cond_channels = cond_channels
        self.upsample_scale = np.prod(upsample_factors)
        self.lvc_block_nums = len(upsample_factors)

        # define first convolution
        self.first_conv = torch.nn.Conv1d(
            in_channels, hidden_channels, kernel_size=7, padding=(7 - 1) // 2, dilation=1, bias=True
        )

        # define residual blocks
        self.lvc_blocks = torch.nn.ModuleList()
        cond_hop_length = 1
        for n in range(self.lvc_block_nums):
            cond_hop_length = cond_hop_length * upsample_factors[n]
            lvcb = LVCBlock(
                in_channels=hidden_channels,
                cond_channels=cond_channels,
                upsample_ratio=upsample_factors[n],
                conv_layers=lvc_layers_each_block,
                conv_kernel_size=lvc_kernel_size,
                cond_hop_length=cond_hop_length,
                kpnet_hidden_channels=kpnet_hidden_channels,
                kpnet_conv_size=kpnet_conv_size,
                kpnet_dropout=dropout,
            )
            self.lvc_blocks.append(lvcb)

        # define output layers
        self.last_conv_layers = torch.nn.ModuleList(
            [
                torch.nn.Conv1d(
                    hidden_channels, out_channels, kernel_size=7, padding=(7 - 1) // 2, dilation=1, bias=True
                ),
            ]
        )

        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

    def forward(self, c):
        """Calculate forward propagation.

        Args:
            c (Tensor): Local conditioning auxiliary features (B, C, T').

        Returns:
            Tensor: Output tensor (B, out_channels, T' * prod(upsample_factors)).
        """
        # random noise at the frame rate; the LVC blocks upsample it to audio rate
        x = torch.randn([c.shape[0], self.in_channels, c.shape[2]])
        x = x.to(self.first_conv.bias.device)

        x = self.first_conv(x)
        for n in range(self.lvc_block_nums):
            x = self.lvc_blocks[n](x, c)

        # apply final layers
        for f in self.last_conv_layers:
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = f(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""

        def _remove_weight_norm(m):
            try:
                # print(f"Weight norm is removed from {m}.")
                parametrize.remove_parametrizations(m, "weight")
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)

    def apply_weight_norm(self):
        """Apply weight normalization module to all of the layers."""

        def _apply_weight_norm(m):
            if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
                torch.nn.utils.parametrizations.weight_norm(m)
                # print(f"Weight norm is applied to {m}.")

        self.apply(_apply_weight_norm)

    @staticmethod
    def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x):
        assert layers % stacks == 0
        layers_per_cycle = layers // stacks
        dilations = [dilation(i % layers_per_cycle) for i in range(layers)]
        return (kernel_size - 1) * sum(dilations) + 1
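    # Worked example (illustrative numbers, not taken from a TTS config): with
    # layers=30, stacks=3, kernel_size=3, each of the 3 cycles uses dilations
    # 1, 2, ..., 512, so sum(dilations) = 3 * (2**10 - 1) = 3069 and the
    # receptive field is (3 - 1) * 3069 + 1 = 6139 samples.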
    def receptive_field_size(self):
        """Return receptive field size."""
        # NOTE: `self.layers`, `self.stacks`, and `self.kernel_size` are not set
        # in __init__; they must be assigned before calling this method.
        return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)

    @torch.no_grad()
    def inference(self, c):
        """Perform inference.

        Args:
            c (Tensor): Local conditioning auxiliary features :math:`(B, C, T)`.

        Returns:
            Tensor: Output tensor (B, out_channels, T * prod(upsample_factors)).
        """
        # Move the conditioning features to the model's device and dtype;
        # `forward` draws its own noise input, so no noise is prepared here.
        c = c.to(next(self.parameters()))
        return self.forward(c)
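

if __name__ == "__main__":
    # Minimal smoke test: a sketch only. The hyperparameters below are
    # illustrative assumptions (mel-style 80-channel conditioning, 256x
    # total upsampling), not values taken from a released TTS config.
    model = UnivnetGenerator(
        in_channels=64,
        out_channels=1,
        hidden_channels=32,
        cond_channels=80,
        upsample_factors=[8, 8, 4],
        lvc_layers_each_block=4,
        lvc_kernel_size=3,
        kpnet_hidden_channels=64,
        kpnet_conv_size=3,
        dropout=0.0,
    )
    mel = torch.randn(2, 80, 50)  # (B, cond_channels, T') dummy conditioning
    wav = model.inference(mel)
    # T' frames are upsampled by prod(upsample_factors) = 8 * 8 * 4 = 256:
    print(wav.shape)  # expected: torch.Size([2, 1, 12800])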