Spaces:

stepfun-ai
/

Step-Audio-EditX

Running on Zero

App Files Files Community

xieli commited on 27 days ago

Commit

822d1fc

1 Parent(s): 91b9368

feat: update hifigan

Browse files

Files changed (2) hide show

stepvocoder/cosyvoice2/hifigan/f0_predictor.py +4 -2
stepvocoder/cosyvoice2/hifigan/generator.py +202 -28

stepvocoder/cosyvoice2/hifigan/f0_predictor.py CHANGED Viewed

@@ -13,8 +13,10 @@
 # limitations under the License.
 import torch
 import torch.nn as nn
-from torch.nn.utils import weight_norm
 class ConvRNNF0Predictor(nn.Module):
     def __init__(self,

 # limitations under the License.
 import torch
 import torch.nn as nn
+try:
+    from torch.nn.utils.parametrizations import weight_norm
+except ImportError:
+    from torch.nn.utils import weight_norm
 class ConvRNNF0Predictor(nn.Module):
     def __init__(self,

stepvocoder/cosyvoice2/hifigan/generator.py CHANGED Viewed

@@ -25,7 +25,10 @@ from torch.nn import ConvTranspose1d
 from torch.nn.utils import remove_weight_norm
 from torch.nn.utils import weight_norm
 from torch.distributions.uniform import Uniform
 from stepvocoder.cosyvoice2.hifigan.activation import Snake
 from stepvocoder.cosyvoice2.utils.common import get_padding, init_weights
@@ -133,7 +136,7 @@ class SineGen(torch.nn.Module):
     def _f02uv(self, f0):
         # generate uv signal
-        uv = (f0 > self.voiced_threshold).type(f0.dtype)
         return uv
     @torch.no_grad()
@@ -142,13 +145,14 @@ class SineGen(torch.nn.Module):
         :param f0: [B, 1, sample_len], Hz
         :return: [B, 1, sample_len]
         """
-        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1)),dtype=f0.dtype).to(f0.device)
         for i in range(self.harmonic_num + 1):
             F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
         theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
         u_dist = Uniform(low=-np.pi, high=np.pi)
-        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device).to(f0.dtype)
         phase_vec[:, 0, :] = 0
         # generate sine waveforms
         sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
@@ -211,6 +215,172 @@ class SourceModuleHnNSF(torch.nn.Module):
             sine_wavs = sine_wavs.transpose(1, 2)
             uv = uv.transpose(1, 2)
         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
         # source for noise branch, in the same shape as uv
         noise = torch.randn_like(uv) * self.sine_amp / 3
         return sine_merge, noise, uv
@@ -252,7 +422,10 @@ class HiFTGenerator(nn.Module):
         self.num_kernels = len(resblock_kernel_sizes)
         self.num_upsamples = len(upsample_rates)
-        self.m_source = SourceModuleHnNSF(
             sampling_rate=sampling_rate,
             upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
             harmonic_num=nb_harmonics,
@@ -312,7 +485,7 @@ class HiFTGenerator(nn.Module):
         self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
         self.f0_predictor = f0_predictor
-        # for cuda graph
         self.use_cuda_graph = False
         self.graph = {}
         self.inference_buffers = {}
@@ -347,6 +520,25 @@ class HiFTGenerator(nn.Module):
                                         self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
         return inverse_transform
     def decode_without_stft(self, x: torch.Tensor, s_stft: torch.Tensor) -> torch.Tensor:
         x = self.conv_pre(x)
         for i in range(self.num_upsamples):
@@ -373,24 +565,6 @@ class HiFTGenerator(nn.Module):
         x = self.conv_post(x)
         return x
-    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
-        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
-        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1).to(s.dtype)
-        if self.use_cuda_graph and x.shape[-1] in self.graph:
-            self.inference_buffers[x.shape[-1]]['static_inputs']['static_x'].copy_(x)
-            self.inference_buffers[x.shape[-1]]['static_inputs']['static_s_stft'].copy_(s_stft)
-            self.graph[x.shape[-1]].replay()
-            x = self.inference_buffers[x.shape[-1]]['static_outputs']['static_output_x']
-        else:
-            x = self.decode_without_stft(x, s_stft)
-        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
-        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundancy
-        magnitude = magnitude.to(torch.float32)
-        phase = phase.to(torch.float32)
-        x = self._istft(magnitude, phase).to(x.dtype)
-        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
-        return x
     def forward(
             self,
             batch: dict,
@@ -406,7 +580,7 @@ class HiFTGenerator(nn.Module):
         # mel+source->speech
         generated_speech = self.decode(x=speech_feat, s=s)
         return generated_speech, f0
     def _init_cuda_graph(self):
         self.use_cuda_graph = True
         dummy_param = next(self.parameters())
@@ -435,7 +609,7 @@ class HiFTGenerator(nn.Module):
         print(f"CUDA Graph initialized successfully for chunk generator")
     @torch.inference_mode()
-    def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = None) -> torch.Tensor:
         # mel->f0
         f0 = self.f0_predictor(speech_feat)
         # f0->source
@@ -443,7 +617,7 @@ class HiFTGenerator(nn.Module):
         s, _, _ = self.m_source(s)
         s = s.transpose(1, 2)
         # use cache_source to avoid glitch
-        if cache_source is not None:
             s[:, :, :cache_source.shape[2]] = cache_source
         generated_speech = self.decode(x=speech_feat, s=s)
-        return generated_speech, s

 from torch.nn.utils import remove_weight_norm
 from torch.nn.utils import weight_norm
 from torch.distributions.uniform import Uniform
+try:
+    from torch.nn.utils.parametrizations import weight_norm
+except ImportError:
+    from torch.nn.utils import weight_norm
 from stepvocoder.cosyvoice2.hifigan.activation import Snake
 from stepvocoder.cosyvoice2.utils.common import get_padding, init_weights
     def _f02uv(self, f0):
         # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
         return uv
     @torch.no_grad()
         :param f0: [B, 1, sample_len], Hz
         :return: [B, 1, sample_len]
         """
+        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
         for i in range(self.harmonic_num + 1):
             F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
         theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
         u_dist = Uniform(low=-np.pi, high=np.pi)
+        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
         phase_vec[:, 0, :] = 0
         # generate sine waveforms
         sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
             sine_wavs = sine_wavs.transpose(1, 2)
             uv = uv.transpose(1, 2)
         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+class SineGen2(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine-wavefrom (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_thoreshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SinGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+        segment is always sin(np.pi) or cos(0)
+    """
+    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0,
+                 flag_for_pulse=False):
+        super(SineGen2, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+        self.upsample_scale = upsample_scale
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+    def _f02sine(self, f0_values):
+        """ f0_values: (batchsize, length, dim)
+            where dim indicates fundamental tone and overtones
+        """
+        # convert to F0 in rad. The interger part n can be ignored
+        # because 2 * np.pi * n doesn't affect phase
+        rad_values = (f0_values / self.sampling_rate) % 1
+        # initial phase noise (no noise for fundamental component)
+        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+        # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
+        if not self.flag_for_pulse:
+            rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
+                                                         scale_factor=1 / self.upsample_scale,
+                                                         mode="linear").transpose(1, 2)
+            phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+            phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
+                                                    scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
+            sines = torch.sin(phase)
+        else:
+            # If necessary, make sure that the first time step of every
+            # voiced segments is sin(pi) or cos(0)
+            # This is used for pulse-train generation
+            # identify the last time step in unvoiced segments
+            uv = self._f02uv(f0_values)
+            uv_1 = torch.roll(uv, shifts=-1, dims=1)
+            uv_1[:, -1, :] = 1
+            u_loc = (uv < 1) * (uv_1 > 0)
+            # get the instantanouse phase
+            tmp_cumsum = torch.cumsum(rad_values, dim=1)
+            # different batch needs to be processed differently
+            for idx in range(f0_values.shape[0]):
+                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                # stores the accumulation of i.phase within
+                # each voiced segments
+                tmp_cumsum[idx, :, :] = 0
+                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+            # rad_values - tmp_cumsum: remove the accumulation of i.phase
+            # within the previous voiced segment.
+            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+            # get the sines
+            sines = torch.cos(i_phase * 2 * np.pi)
+        return sines
+    def forward(self, f0):
+        """ sine_tensor, uv = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+                  f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+        """
+        # fundamental component
+        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+        # generate sine waveforms
+        sine_waves = self._f02sine(fn) * self.sine_amp
+        # generate uv signal
+        uv = self._f02uv(f0)
+        # noise: for unvoiced should be similar to sine_amp
+        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+        # .       for voiced regions is self.noise_std
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+        # first: set the unvoiced part to 0 by uv
+        # then: additive noise
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+class SourceModuleHnNSF2(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    harmonic_num: number of harmonic above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threhold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length 1)
+    uv (batchsize, length, 1)
+    """
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF2, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
+                                  sine_amp, add_noise_std, voiced_threshod)
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+    def forward(self, x):
+        """
+        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length 1)
+        """
+        # source for harmonic branch
+        with torch.no_grad():
+            sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
         # source for noise branch, in the same shape as uv
         noise = torch.randn_like(uv) * self.sine_amp / 3
         return sine_merge, noise, uv
         self.num_kernels = len(resblock_kernel_sizes)
         self.num_upsamples = len(upsample_rates)
+        # NOTE in CosyVoice2, we use the original SourceModuleHnNSF implementation
+        # this_SourceModuleHnNSF = SourceModuleHnNSF if self.sampling_rate == 22050 else SourceModuleHnNSF2
+        this_SourceModuleHnNSF = SourceModuleHnNSF2 # WBY
+        self.m_source = this_SourceModuleHnNSF(
             sampling_rate=sampling_rate,
             upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
             harmonic_num=nb_harmonics,
         self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
         self.f0_predictor = f0_predictor
+        # for cuda graph
         self.use_cuda_graph = False
         self.graph = {}
         self.inference_buffers = {}
                                         self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
         return inverse_transform
+    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
+        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1).to(s.dtype)
+        if self.use_cuda_graph and x.shape[-1] in self.graph:
+            self.inference_buffers[x.shape[-1]]['static_inputs']['static_x'].copy_(x)
+            self.inference_buffers[x.shape[-1]]['static_inputs']['static_s_stft'].copy_(s_stft)
+            self.graph[x.shape[-1]].replay()
+            x = self.inference_buffers[x.shape[-1]]['static_outputs']['static_output_x']
+        else:
+            x = self.decode_without_stft(x, s_stft)
+        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
+        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundancy
+        magnitude = magnitude.to(torch.float32)
+        phase = phase.to(torch.float32)
+        x = self._istft(magnitude, phase).to(x.dtype)
+        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
+        return x
     def decode_without_stft(self, x: torch.Tensor, s_stft: torch.Tensor) -> torch.Tensor:
         x = self.conv_pre(x)
         for i in range(self.num_upsamples):
         x = self.conv_post(x)
         return x
     def forward(
             self,
             batch: dict,
         # mel+source->speech
         generated_speech = self.decode(x=speech_feat, s=s)
         return generated_speech, f0
     def _init_cuda_graph(self):
         self.use_cuda_graph = True
         dummy_param = next(self.parameters())
         print(f"CUDA Graph initialized successfully for chunk generator")
     @torch.inference_mode()
+    def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
         # mel->f0
         f0 = self.f0_predictor(speech_feat)
         # f0->source
         s, _, _ = self.m_source(s)
         s = s.transpose(1, 2)
         # use cache_source to avoid glitch
+        if cache_source.shape[2] != 0:
             s[:, :, :cache_source.shape[2]] = cache_source
         generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, s