import torch
from torch import nn
class TimeDepthSeparableConv(nn.Module):
    """Time-depth separable convolution as in https://arxiv.org/pdf/1904.02619.pdf

    It shows competitive results with a smaller computation and memory footprint.
    Note that the residual connection assumes ``in_channels == out_channels``.
    """

    def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hid_channels = hid_channels
        self.kernel_size = kernel_size

        # 1x1 convolution over time; outputs 2 * hid_channels so the
        # following GLU can gate one half with the other.
        self.time_conv = nn.Conv1d(
            in_channels,
            2 * hid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.norm1 = nn.BatchNorm1d(2 * hid_channels)

        # Depth-wise convolution: one filter per channel (groups == channels),
        # padded so the time resolution is unchanged.
        self.depth_conv = nn.Conv1d(
            hid_channels,
            hid_channels,
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,
            groups=hid_channels,
            bias=bias,
        )
        self.norm2 = nn.BatchNorm1d(hid_channels)

        # Final 1x1 convolution mixing channels back to out_channels.
        self.time_conv2 = nn.Conv1d(
            hid_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.norm3 = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        x_res = x
        x = self.time_conv(x)
        x = self.norm1(x)
        x = nn.functional.glu(x, dim=1)  # gated linear unit halves the channels
        x = self.depth_conv(x)
        x = self.norm2(x)
        x = x * torch.sigmoid(x)  # swish / SiLU activation
        x = self.time_conv2(x)
        x = self.norm3(x)
        x = x_res + x  # residual; requires in_channels == out_channels
        return x
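
# Shape walk-through for the layer above (hypothetical sizes, for illustration
# only): with x of shape (batch, in_channels, time), time_conv yields
# (batch, 2 * hid_channels, time), the GLU gates it down to
# (batch, hid_channels, time), depth_conv and the swish keep that shape, and
# time_conv2 maps it to (batch, out_channels, time). For example:
#
#     layer = TimeDepthSeparableConv(64, 64, 64, kernel_size=5)
#     y = layer(torch.randn(2, 64, 100))  # -> torch.Size([2, 64, 100])
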
class TimeDepthSeparableConvBlock(nn.Module):
    """Stack of ``num_layers`` TimeDepthSeparableConv layers over masked inputs."""

    def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True):
        super().__init__()
        assert (kernel_size - 1) % 2 == 0  # odd kernel so padding keeps the time length
        assert num_layers > 1

        self.layers = nn.ModuleList()
        # First layer maps in_channels to the hidden width.
        layer = TimeDepthSeparableConv(
            in_channels, hid_channels, out_channels if num_layers == 1 else hid_channels, kernel_size, bias
        )
        self.layers.append(layer)
        # Remaining layers stay at the hidden width, except the last one,
        # which projects to out_channels.
        for idx in range(num_layers - 1):
            layer = TimeDepthSeparableConv(
                hid_channels,
                hid_channels,
                out_channels if (idx + 1) == (num_layers - 1) else hid_channels,
                kernel_size,
                bias,
            )
            self.layers.append(layer)

    def forward(self, x, mask):
        # Re-apply the mask before every layer so padded time steps stay zero.
        for layer in self.layers:
            x = layer(x * mask)
        return x
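
# Minimal usage sketch (hypothetical sizes, not part of the original module).
# The residual connection inside each layer requires matching channel counts,
# so in_channels == hid_channels == out_channels is the safe configuration.
if __name__ == "__main__":
    block = TimeDepthSeparableConvBlock(
        in_channels=64,
        hid_channels=64,
        out_channels=64,
        num_layers=3,
        kernel_size=5,
    )
    x = torch.randn(2, 64, 50)   # (batch, channels, time)
    mask = torch.ones(2, 1, 50)  # binary time mask, broadcast over channels
    y = block(x, mask)
    print(y.shape)  # torch.Size([2, 64, 50])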