Spaces:

nvidia
/

BigVGAN

Running

App Files Files Community

L0SG committed on Jul 15, 2024

Commit

1de35a2

1 Parent(s): cfe9514

update space demo

Browse files

Files changed (5) hide show

app.py +14 -44
bigvgan.py +351 -0
inference.py +0 -105
meldataset.py +2 -149
models.py +0 -955

app.py CHANGED Viewed

@@ -6,8 +6,8 @@ import json
 import torch
 import os
 from env import AttrDict
-from meldataset import mel_spectrogram, MAX_WAV_VALUE
-from models import BigVGAN as Generator
 import librosa
 import numpy as np
 from utils import plot_spectrogram
@@ -35,22 +35,21 @@ def inference_gradio(input, model_choice):  # input is audio waveform in [T, cha
     audio = np.transpose(audio)  # transpose to [channel, T] for librosa
     audio = audio / MAX_WAV_VALUE  # convert int16 to float range used by BigVGAN
-    h = dict_config[model_choice]
     model = dict_model[model_choice]
-    if sr != h.sampling_rate:  # convert audio to model's sampling rate
-        audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
     if len(audio.shape) == 2:  # stereo
         audio = librosa.to_mono(audio)  # convert to mono if stereo
     audio = librosa.util.normalize(audio) * 0.95
     output, spec_gen = inference_model(
-        audio, h, model
     )  # output is generated audio in ndarray, int16
     spec_plot_gen = plot_spectrogram(spec_gen)
-    output_audio = (h.sampling_rate, output) # tuple for gr.Audio output
     buffer = spec_plot_gen.canvas.buffer_rgba()
     output_image = PIL.Image.frombuffer(
@@ -67,22 +66,19 @@ def inference_gradio(input, model_choice):  # input is audio waveform in [T, cha
 @spaces.GPU(duration=120)
-def inference_model(audio_input, h, model):
     # load model to device
     model.to(device)
-    def get_mel(x):
-        return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
     with torch.inference_mode():
         wav = torch.FloatTensor(audio_input)
         # compute mel spectrogram from the ground truth audio
-        spec_gt = get_mel(wav.unsqueeze(0)).to(device)
         y_g_hat = model(spec_gt)
         audio_gen = y_g_hat.squeeze().cpu()
-        spec_gen = get_mel(audio_gen.unsqueeze(0))
         audio_gen = audio_gen.numpy()  # [T], float [-1, 1]
         audio_gen = (audio_gen * MAX_WAV_VALUE).astype("int16")  # [T], int16
         spec_gen = spec_gen.squeeze().numpy()  # [C, T_frame]
@@ -234,9 +230,7 @@ css = """
 ######################## script for loading the models ########################
-MODEL_PATH = "nvidia/BigVGAN"
-LIST_MODEL_NAME = [
     "bigvgan_24khz_100band",
     "bigvgan_base_24khz_100band",
     "bigvgan_22khz_80band",
@@ -248,41 +242,17 @@ LIST_MODEL_NAME = [
     "bigvgan_v2_44khz_128band_512x"
 ]
-DICT_MODEL_NAME_FILE_PAIRS = {
-    "bigvgan_24khz_100band": "g_05000000",
-    "bigvgan_base_24khz_100band": "g_05000000",
-    "bigvgan_22khz_80band": "g_05000000",
-    "bigvgan_base_22khz_80band": "g_05000000",
-    "bigvgan_v2_22khz_80band_256x": "g_03000000",
-    "bigvgan_v2_22khz_80band_fmax8k_256x": "g_03000000",
-    "bigvgan_v2_24khz_100band_256x": "g_03000000",
-    "bigvgan_v2_44khz_128band_256x": "g_03000000",
-    "bigvgan_v2_44khz_128band_512x": "g_03000000"
-}
 dict_model = {}
 dict_config = {}
-for model_name in LIST_MODEL_NAME:
-    model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
-    config_file = hf_hub_download(MODEL_PATH, f"{model_name}/config.json", use_auth_token=os.environ['TOKEN'])
-    with open(config_file) as f:
-        data = f.read()
-    json_config = json.loads(data)
-    h = AttrDict(json_config)
-    torch.manual_seed(h.seed)
-    generator = Generator(h)
-    state_dict_g = load_checkpoint(model_file)
-    generator.load_state_dict(state_dict_g['generator'])
     generator.eval()
     generator.remove_weight_norm()
     dict_model[model_name] = generator
-    dict_config[model_name] = h
 ######################## script for gradio UI ########################
@@ -338,7 +308,7 @@ with iface:
         model_choice = gr.Dropdown(
             label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
             value="bigvgan_v2_24khz_100band_256x",
-            choices=[m for m in LIST_MODEL_NAME],
             interactive=True,
         )

 import torch
 import os
 from env import AttrDict
+from meldataset import get_mel_spectrogram, MAX_WAV_VALUE
+from bigvgan import BigVGAN
 import librosa
 import numpy as np
 from utils import plot_spectrogram
     audio = np.transpose(audio)  # transpose to [channel, T] for librosa
     audio = audio / MAX_WAV_VALUE  # convert int16 to float range used by BigVGAN
     model = dict_model[model_choice]
+    if sr != model.h.sampling_rate:  # convert audio to model's sampling rate
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=model.h.sampling_rate)
     if len(audio.shape) == 2:  # stereo
         audio = librosa.to_mono(audio)  # convert to mono if stereo
     audio = librosa.util.normalize(audio) * 0.95
     output, spec_gen = inference_model(
+        audio, model
     )  # output is generated audio in ndarray, int16
     spec_plot_gen = plot_spectrogram(spec_gen)
+    output_audio = (model.h.sampling_rate, output) # tuple for gr.Audio output
     buffer = spec_plot_gen.canvas.buffer_rgba()
     output_image = PIL.Image.frombuffer(
 @spaces.GPU(duration=120)
+def inference_model(audio_input, model):
     # load model to device
     model.to(device)
     with torch.inference_mode():
         wav = torch.FloatTensor(audio_input)
         # compute mel spectrogram from the ground truth audio
+        spec_gt = get_mel_spectrogram(wav.unsqueeze(0), model.h).to(device)
         y_g_hat = model(spec_gt)
         audio_gen = y_g_hat.squeeze().cpu()
+        spec_gen = get_mel_spectrogram(audio_gen.unsqueeze(0))
         audio_gen = audio_gen.numpy()  # [T], float [-1, 1]
         audio_gen = (audio_gen * MAX_WAV_VALUE).astype("int16")  # [T], int16
         spec_gen = spec_gen.squeeze().numpy()  # [C, T_frame]
 ######################## script for loading the models ########################
+LIST_MODEL_ID = [
     "bigvgan_24khz_100band",
     "bigvgan_base_24khz_100band",
     "bigvgan_22khz_80band",
     "bigvgan_v2_44khz_128band_512x"
 ]
 dict_model = {}
 dict_config = {}
+# Load every model listed in LIST_MODEL_ID from the hub repo 'nvidia/<model_name>',
+# authenticating with the token read from the TOKEN environment variable.
+for model_name in LIST_MODEL_ID:
+    generator = BigVGAN.from_pretrained('nvidia/'+model_name, token=os.environ['TOKEN'])
+    # switch to eval mode and strip weight norm before the model is stored for inference
     generator.eval()
     generator.remove_weight_norm()
     dict_model[model_name] = generator
+    # the hyperparameters are carried on the model itself as `generator.h`
+    dict_config[model_name] = generator.h
 ######################## script for gradio UI ########################
         model_choice = gr.Dropdown(
             label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
             value="bigvgan_v2_24khz_100band_256x",
+            choices=[m for m in LIST_MODEL_ID],
             interactive=True,
         )

bigvgan.py ADDED Viewed

	@@ -0,0 +1,351 @@

+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+#   LICENSE is in incl_licenses directory.
+import os
+import json
+from pathlib import Path
+from collections import namedtuple
+from typing import Optional, List, Union, Dict
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+import activations
+from utils import init_weights, get_padding
+from alias_free_torch.act import Activation1d as TorchActivation1d
+from env import AttrDict
+from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
+def load_hparams_from_json(path) -> AttrDict:
+    with open(path) as f:
+        data = f.read()
+    h = json.loads(data)
+    return AttrDict(h)
+class AMPBlock1(torch.nn.Module):
+    """Residual block made of two stacks of three weight-normalized ``Conv1d`` layers.
+
+    ``convs1`` applies the per-layer dilations given in ``dilation``; ``convs2``
+    always uses dilation 1. Every conv is preceded by its own ``Activation1d``
+    wrapping a Snake or SnakeBeta activation, and the output of each
+    (``convs1[i]``, ``convs2[i]``) pair is added back onto its input.
+    """
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None):
+        """Build the two conv stacks and one activation per conv.
+
+        Args:
+            h: hyperparameter object; this block reads ``h.get("use_cuda_kernel", False)``
+                and ``h.snake_logscale``.
+            channels: input and output channel count of every conv.
+            kernel_size: kernel size shared by all convs.
+            dilation: three dilation values, one per ``convs1`` layer.
+            activation: ``'snake'`` or ``'snakebeta'``.
+
+        Raises:
+            NotImplementedError: if ``activation`` is neither ``'snake'`` nor ``'snakebeta'``.
+        """
+        super(AMPBlock1, self).__init__()
+        self.h = h
+        # first stack: dilated convs, padding derived from kernel size and dilation
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+        # second stack: same shape, but always dilation 1
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+        self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers
+        # select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            # faster CUDA kernel implementation of Activation1d
+            from alias_free_cuda.activation1d import Activation1d as CudaActivation1d
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+        # one activation module per conv layer (num_layers in total)
+        if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing
+            self.activations = nn.ModuleList([
+                Activation1d(
+                    activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
+                for _ in range(self.num_layers)
+            ])
+        elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing
+            self.activations = nn.ModuleList([
+                Activation1d(
+                    activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
+                 for _ in range(self.num_layers)
+            ])
+        else:
+            raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
+    def forward(self, x):
+        """Run the three activation -> conv -> activation -> conv stages, each with a residual add."""
+        # even-indexed activations precede convs1, odd-indexed ones precede convs2
+        acts1, acts2 = self.activations[::2], self.activations[1::2]
+        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+            xt = a1(x)
+            xt = c1(xt)
+            xt = a2(xt)
+            xt = c2(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        """Call ``remove_weight_norm`` on every conv in both stacks."""
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+class AMPBlock2(torch.nn.Module):
+    """Residual block made of a single stack of two weight-normalized ``Conv1d`` layers.
+
+    Each conv uses its own dilation from ``dilation`` and is preceded by its own
+    ``Activation1d`` wrapping a Snake or SnakeBeta activation; the conv output is
+    added back onto the conv's input.
+    """
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None):
+        """Build the conv stack and one activation per conv.
+
+        Args:
+            h: hyperparameter object; this block reads ``h.get("use_cuda_kernel", False)``
+                and ``h.snake_logscale``.
+            channels: input and output channel count of every conv.
+            kernel_size: kernel size shared by all convs.
+            dilation: two dilation values, one per conv.
+            activation: ``'snake'`` or ``'snakebeta'``.
+
+        Raises:
+            NotImplementedError: if ``activation`` is neither ``'snake'`` nor ``'snakebeta'``.
+        """
+        super(AMPBlock2, self).__init__()
+        self.h = h
+        self.convs = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1])))
+        ])
+        self.convs.apply(init_weights)
+        self.num_layers = len(self.convs) # total number of conv layers
+        # select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            # faster CUDA kernel implementation of Activation1d
+            from alias_free_cuda.activation1d import Activation1d as CudaActivation1d
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+        # one activation module per conv layer (num_layers in total)
+        if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing
+            self.activations = nn.ModuleList([
+                Activation1d(
+                    activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
+                for _ in range(self.num_layers)
+            ])
+        elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing
+            self.activations = nn.ModuleList([
+                Activation1d(
+                    activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
+                 for _ in range(self.num_layers)
+            ])
+        else:
+            raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
+    def forward(self, x):
+        """Run each activation -> conv stage in turn, adding the result back onto its input."""
+        for c, a in zip (self.convs, self.activations):
+            xt = a(x)
+            xt = c(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        """Call ``remove_weight_norm`` on every conv in the stack."""
+        for l in self.convs:
+            remove_weight_norm(l)
+class BigVGAN(
+    torch.nn.Module,
+    PyTorchModelHubMixin,
+    library_name="bigvgan",
+    repo_url="https://github.com/NVIDIA/BigVGAN",
+    docs_url="https://github.com/NVIDIA/BigVGAN/blob/main/README.md",
+    pipeline_tag="audio-to-audio",
+    license="mit",
+    tags=["neural-vocoder", "audio-generation", "arxiv:2206.04658"]
+):
+    # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
+    # New in v2: if use_cuda_kernel is set to True, it loads optimized CUDA kernels for AMP.
+    # NOTE: use_cuda_kernel=True should be used for inference only (training is not supported).
+    def __init__(
+        self,
+        h,
+        use_cuda_kernel: bool=False
+    ):
+        super(BigVGAN, self).__init__()
+        self.h = h
+        self.h["use_cuda_kernel"] = use_cuda_kernel # add it to global hyperparameters (h)
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        # pre conv
+        self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
+        # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
+        resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2
+        # transposed conv-based upsamplers. does not apply anti-aliasing
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            self.ups.append(nn.ModuleList([
+                weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i),
+                                            h.upsample_initial_channel // (2 ** (i + 1)),
+                                            k, u, padding=(k - u) // 2))
+            ]))
+        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+                self.resblocks.append(resblock(h, ch, k, d, activation=h.activation))
+        # select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            # faster CUDA kernel implementation of Activation1d
+            from alias_free_cuda.activation1d import Activation1d as CudaActivation1d
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+        # post conv
+        if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing
+            activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale)
+            self.activation_post = Activation1d(activation=activation_post)
+        elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing
+            activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale)
+            self.activation_post = Activation1d(activation=activation_post)
+        else:
+            raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
+        # whether to use bias for the final conv_post. Defaults to True for backward compatibility
+        self.use_bias_at_final = h.get("use_bias_at_final", True)
+        self.conv_post = weight_norm(Conv1d(
+            ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final
+        ))
+        # weight initialization
+        for i in range(len(self.ups)):
+            self.ups[i].apply(init_weights)
+        self.conv_post.apply(init_weights)
+        # final tanh activation. Defaults to True for backward compatibility
+        self.use_tanh_at_final = h.get("use_tanh_at_final", True)
+    def forward(self, x):
+        # pre conv
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            # upsampling
+            for i_up in range(len(self.ups[i])):
+                x = self.ups[i][i_up](x)
+            # AMP blocks
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        # post conv
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        # final tanh activation
+        if self.use_tanh_at_final:
+            x = torch.tanh(x)
+        else:
+            x = torch.clamp(x, min=-1., max=1.) # bound the output to [-1, 1]
+        return x
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            for l_i in l:
+                remove_weight_norm(l_i)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+    ##################################################################
+    # additional methods for huggingface_hub support
+    ##################################################################
+    def _save_pretrained(self, save_directory: Path) -> None:
+        """Save weights and config.json from a Pytorch model to a local directory."""
+        model_path = save_directory / 'bigvgan_generator.pt'
+        torch.save(
+            {'generator': self.state_dict()},
+            model_path
+        )
+        config_path = save_directory / 'config.json'
+        with open(config_path, 'w') as config_file:
+            json.dump(self.h, config_file, indent=4)
+    @classmethod
+    def _from_pretrained(
+        cls,
+        *,
+        model_id: str,
+        revision: str,
+        cache_dir: str,
+        force_download: bool,
+        proxies: Optional[Dict],
+        resume_download: bool,
+        local_files_only: bool,
+        token: Union[str, bool, None],
+        map_location: str = "cpu", # additional argument
+        strict: bool = False, # additional argument
+        use_cuda_kernel: bool = False,
+        **model_kwargs,
+    ):
+        """Load Pytorch pretrained weights and return the loaded model."""
+        ##################################################################
+        # download and load hyperparameters (h) used by BigVGAN
+        ##################################################################
+        config_file = hf_hub_download(
+            repo_id=model_id,
+            filename='config.json',
+            revision=revision,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            resume_download=resume_download,
+            token=token,
+            local_files_only=local_files_only,
+        )
+        h = load_hparams_from_json(config_file)
+        ##################################################################
+        # instantiate BigVGAN using h
+        ##################################################################
+        if use_cuda_kernel:
+            print(f"[INFO] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!")
+            print(f"[INFO] You need nvcc and ninja installed in your system to build the kernel. For detail, see: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis")
+        model = cls(h, use_cuda_kernel=use_cuda_kernel)
+        ##################################################################
+        # download and load pretrained generator weight
+        ##################################################################
+        if os.path.isdir(model_id):
+            print("Loading weights from local directory")
+            model_file = os.path.join(model_id, 'bigvgan_generator.pt')
+        else:
+            print(f"Downloading weights from {model_id}")
+            model_file = hf_hub_download(
+                repo_id=model_id,
+                filename='bigvgan_generator.pt',
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+                )
+        checkpoint_dict = torch.load(model_file, map_location=map_location)
+        model.load_state_dict(checkpoint_dict['generator'])
+        return model

inference.py DELETED Viewed

@@ -1,105 +0,0 @@
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-#   LICENSE is in incl_licenses directory.
-from __future__ import absolute_import, division, print_function, unicode_literals
-import glob
-import os
-import argparse
-import json
-import torch
-from scipy.io.wavfile import write
-from env import AttrDict
-from meldataset import mel_spectrogram, MAX_WAV_VALUE
-from models import BigVGAN as Generator
-import librosa
-h = None
-device = None
-torch.backends.cudnn.benchmark = False
-def load_checkpoint(filepath, device):
-    assert os.path.isfile(filepath)
-    print("Loading '{}'".format(filepath))
-    checkpoint_dict = torch.load(filepath, map_location=device)
-    print("Complete.")
-    return checkpoint_dict
-def get_mel(x):
-    return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
-def scan_checkpoint(cp_dir, prefix):
-    pattern = os.path.join(cp_dir, prefix + '*')
-    cp_list = glob.glob(pattern)
-    if len(cp_list) == 0:
-        return ''
-    return sorted(cp_list)[-1]
-def inference(a, h):
-    generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device)
-    state_dict_g = load_checkpoint(a.checkpoint_file, device)
-    generator.load_state_dict(state_dict_g['generator'])
-    filelist = os.listdir(a.input_wavs_dir)
-    os.makedirs(a.output_dir, exist_ok=True)
-    generator.eval()
-    generator.remove_weight_norm()
-    with torch.no_grad():
-        for i, filname in enumerate(filelist):
-            # load the ground truth audio and resample if necessary
-            wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True)
-            wav = torch.FloatTensor(wav).to(device)
-            # compute mel spectrogram from the ground truth audio
-            x = get_mel(wav.unsqueeze(0))
-            y_g_hat = generator(x)
-            audio = y_g_hat.squeeze()
-            audio = audio * MAX_WAV_VALUE
-            audio = audio.cpu().numpy().astype('int16')
-            output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated.wav')
-            write(output_file, h.sampling_rate, audio)
-            print(output_file)
-def main():
-    print('Initializing Inference Process..')
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input_wavs_dir', default='test_files')
-    parser.add_argument('--output_dir', default='generated_files')
-    parser.add_argument('--checkpoint_file', required=True)
-    parser.add_argument('--use_cuda_kernel', action='store_true', default=False)
-    a = parser.parse_args()
-    config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
-    with open(config_file) as f:
-        data = f.read()
-    global h
-    json_config = json.loads(data)
-    h = AttrDict(json_config)
-    torch.manual_seed(h.seed)
-    global device
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(h.seed)
-        device = torch.device('cuda')
-    else:
-        device = torch.device('cpu')
-    inference(a, h)
-if __name__ == '__main__':
-    main()

meldataset.py CHANGED Viewed

@@ -4,59 +4,37 @@
 # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
 #   LICENSE is in incl_licenses directory.
-import math
-import os
-import random
 import torch
 import torch.utils.data
 import numpy as np
-from librosa.util import normalize
 from scipy.io.wavfile import read
 from librosa.filters import mel as librosa_mel_fn
-import pathlib
-from tqdm import tqdm
 MAX_WAV_VALUE = 32767.0 # NOTE: 32768.0 -1 to prevent int16 overflow (results in popping sound in corner cases)
-def load_wav(full_path, sr_target):
-    sampling_rate, data = read(full_path)
-    if sampling_rate != sr_target:
-        raise RuntimeError("Sampling rate of the file {} is {} Hz, but the model requires {} Hz".
-              format(full_path, sampling_rate, sr_target))
-    return data, sampling_rate
 def dynamic_range_compression(x, C=1, clip_val=1e-5):
     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
 def dynamic_range_decompression(x, C=1):
     return np.exp(x) / C
 def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
     return torch.log(torch.clamp(x, min=clip_val) * C)
 def dynamic_range_decompression_torch(x, C=1):
     return torch.exp(x) / C
 def spectral_normalize_torch(magnitudes):
     output = dynamic_range_compression_torch(magnitudes)
     return output
 def spectral_de_normalize_torch(magnitudes):
     output = dynamic_range_decompression_torch(magnitudes)
     return output
 mel_basis = {}
 hann_window = {}
 def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
     if torch.min(y) < -1.:
         print('min value is ', torch.min(y))
@@ -84,130 +62,5 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin,
     return spec
-def get_dataset_filelist(a):
-    with open(a.input_training_file, 'r', encoding='utf-8') as fi:
-        training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
-                          for x in fi.read().split('\n') if len(x) > 0]
-        print("first training file: {}".format(training_files[0]))
-    with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
-        validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
-                            for x in fi.read().split('\n') if len(x) > 0]
-        print("first validation file: {}".format(validation_files[0]))
-    list_unseen_validation_files = []
-    for i in range(len(a.list_input_unseen_validation_file)):
-        with open(a.list_input_unseen_validation_file[i], 'r', encoding='utf-8') as fi:
-            unseen_validation_files = [os.path.join(a.list_input_unseen_wavs_dir[i], x.split('|')[0] + '.wav')
-                                for x in fi.read().split('\n') if len(x) > 0]
-            print("first unseen {}th validation fileset: {}".format(i, unseen_validation_files[0]))
-            list_unseen_validation_files.append(unseen_validation_files)
-    return training_files, validation_files, list_unseen_validation_files
-class MelDataset(torch.utils.data.Dataset):
-    def __init__(self, training_files, hparams, segment_size, n_fft, num_mels,
-                 hop_size, win_size, sampling_rate,  fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
-                 device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None, is_seen=True):
-        self.audio_files = training_files
-        random.seed(1234)
-        if shuffle:
-            random.shuffle(self.audio_files)
-        self.hparams = hparams
-        self.is_seen = is_seen
-        if self.is_seen:
-            self.name = pathlib.Path(self.audio_files[0]).parts[0]
-        else:
-            self.name = '-'.join(pathlib.Path(self.audio_files[0]).parts[:2]).strip("/")
-        self.segment_size = segment_size
-        self.sampling_rate = sampling_rate
-        self.split = split
-        self.n_fft = n_fft
-        self.num_mels = num_mels
-        self.hop_size = hop_size
-        self.win_size = win_size
-        self.fmin = fmin
-        self.fmax = fmax
-        self.fmax_loss = fmax_loss
-        self.cached_wav = None
-        self.n_cache_reuse = n_cache_reuse
-        self._cache_ref_count = 0
-        self.device = device
-        self.fine_tuning = fine_tuning
-        self.base_mels_path = base_mels_path
-        print("INFO: checking dataset integrity...")
-        for i in tqdm(range(len(self.audio_files))):
-            assert os.path.exists(self.audio_files[i]), "{} not found".format(self.audio_files[i])
-    def __getitem__(self, index):
-        filename = self.audio_files[index]
-        if self._cache_ref_count == 0:
-            audio, sampling_rate = load_wav(filename, self.sampling_rate)
-            audio = audio / MAX_WAV_VALUE
-            if not self.fine_tuning:
-                audio = normalize(audio) * 0.95
-            self.cached_wav = audio
-            if sampling_rate != self.sampling_rate:
-                raise ValueError("{} SR doesn't match target {} SR".format(
-                    sampling_rate, self.sampling_rate))
-            self._cache_ref_count = self.n_cache_reuse
-        else:
-            audio = self.cached_wav
-            self._cache_ref_count -= 1
-        audio = torch.FloatTensor(audio)
-        audio = audio.unsqueeze(0)
-        if not self.fine_tuning:
-            if self.split:
-                if audio.size(1) >= self.segment_size:
-                    max_audio_start = audio.size(1) - self.segment_size
-                    audio_start = random.randint(0, max_audio_start)
-                    audio = audio[:, audio_start:audio_start+self.segment_size]
-                else:
-                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
-                mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
-                                      self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
-                                      center=False)
-            else: # validation step
-                # match audio length to self.hop_size * n for evaluation
-                if (audio.size(1) % self.hop_size) != 0:
-                    audio = audio[:, :-(audio.size(1) % self.hop_size)]
-                mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
-                                      self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
-                                      center=False)
-                assert audio.shape[1] == mel.shape[2] * self.hop_size, "audio shape {} mel shape {}".format(audio.shape, mel.shape)
-        else:
-            mel = np.load(
-                os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
-            mel = torch.from_numpy(mel)
-            if len(mel.shape) < 3:
-                mel = mel.unsqueeze(0)
-            if self.split:
-                frames_per_seg = math.ceil(self.segment_size / self.hop_size)
-                if audio.size(1) >= self.segment_size:
-                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
-                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
-                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
-                else:
-                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
-                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
-        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
-                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
-                                   center=False)
-        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
-    def __len__(self):
-        return len(self.audio_files)

 # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
 #   LICENSE is in incl_licenses directory.
 import torch
 import torch.utils.data
 import numpy as np
 from scipy.io.wavfile import read
 from librosa.filters import mel as librosa_mel_fn
 MAX_WAV_VALUE = 32767.0 # NOTE: 32768.0 -1 to prevent int16 overflow (results in popping sound in corner cases)
 def dynamic_range_compression(x, C=1, clip_val=1e-5):
     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
 def dynamic_range_decompression(x, C=1):
     return np.exp(x) / C
 def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
     return torch.log(torch.clamp(x, min=clip_val) * C)
 def dynamic_range_decompression_torch(x, C=1):
     return torch.exp(x) / C
 def spectral_normalize_torch(magnitudes):
     output = dynamic_range_compression_torch(magnitudes)
     return output
 def spectral_de_normalize_torch(magnitudes):
     output = dynamic_range_decompression_torch(magnitudes)
     return output
 mel_basis = {}
 hann_window = {}
 def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
     if torch.min(y) < -1.:
         print('min value is ', torch.min(y))
     return spec
+def get_mel_spectrogram(wav, h):
+    return mel_spectrogram(wav, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)

models.py DELETED Viewed

@@ -1,955 +0,0 @@
-# Copyright (c) 2024 NVIDIA CORPORATION.
-#   Licensed under the MIT license.
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-#   LICENSE is in incl_licenses directory.
-import torch
-import torch.nn.functional as F
-import torch.nn as nn
-from torch.nn import Conv1d, ConvTranspose1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from torchaudio.transforms import Spectrogram, Resample
-from librosa.filters import mel as librosa_mel_fn
-from scipy import signal
-import activations
-from utils import init_weights, get_padding
-from alias_free_torch.act import Activation1d as TorchActivation1d
-import typing
-from typing import List, Optional, Tuple
-from collections import namedtuple
-import math
-import functools
-class AMPBlock1(torch.nn.Module):
-    """Residual block made of three (dilated conv, unit-dilation conv) pairs.
-    Every convolution is preceded by its own ``Activation1d``-wrapped Snake or
-    SnakeBeta activation, and each pair's output is added back onto its input.
-    """
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None):
-        super().__init__()
-        self.h = h
-        # first conv of every pair uses dilation[0], dilation[1], dilation[2] in turn
-        dilated = [
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
-                               padding=get_padding(kernel_size, rate)))
-            for rate in (dilation[0], dilation[1], dilation[2])
-        ]
-        self.convs1 = nn.ModuleList(dilated)
-        self.convs1.apply(init_weights)
-        # second conv of every pair always uses dilation 1
-        undilated = [
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
-                               padding=get_padding(kernel_size, 1)))
-            for _ in range(3)
-        ]
-        self.convs2 = nn.ModuleList(undilated)
-        self.convs2.apply(init_weights)
-        self.num_layers = len(self.convs1) + len(self.convs2)  # one activation per conv layer
-        # pick the Activation1d wrapper; the CUDA variant is imported only when requested
-        if self.h.get("use_cuda_kernel", False):
-            from alias_free_cuda.activation1d import Activation1d
-        else:
-            Activation1d = TorchActivation1d
-        # pick the periodic nonlinearity that each Activation1d wraps
-        if activation == 'snake':
-            periodic = activations.Snake
-        elif activation == 'snakebeta':
-            periodic = activations.SnakeBeta
-        else:
-            raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
-        self.activations = nn.ModuleList([
-            Activation1d(activation=periodic(channels, alpha_logscale=h.snake_logscale))
-            for _ in range(self.num_layers)
-        ])
-    def forward(self, x):
-        """Apply the three residual pairs in order and return the result."""
-        for idx, (dilated_conv, plain_conv) in enumerate(zip(self.convs1, self.convs2)):
-            # activations alternate: even index before the dilated conv, odd before the plain one
-            first_act = self.activations[2 * idx]
-            second_act = self.activations[2 * idx + 1]
-            branch = plain_conv(second_act(dilated_conv(first_act(x))))
-            x = branch + x
-        return x
-    def remove_weight_norm(self):
-        """Strip weight normalization from every convolution of the block."""
-        for conv in (*self.convs1, *self.convs2):
-            remove_weight_norm(conv)
-class AMPBlock2(torch.nn.Module):
-    """Residual block made of two dilated convolutions.
-    Each convolution is preceded by its own ``Activation1d``-wrapped Snake or
-    SnakeBeta activation, and its output is added back onto its input.
-    """
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None):
-        super().__init__()
-        self.h = h
-        # one weight-normalized conv per dilation rate (dilation[0], then dilation[1])
-        layers = [
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
-                               padding=get_padding(kernel_size, rate)))
-            for rate in (dilation[0], dilation[1])
-        ]
-        self.convs = nn.ModuleList(layers)
-        self.convs.apply(init_weights)
-        self.num_layers = len(self.convs)  # one activation per conv layer
-        # pick the Activation1d wrapper; the CUDA variant is imported only when requested
-        if self.h.get("use_cuda_kernel", False):
-            from alias_free_cuda.activation1d import Activation1d
-        else:
-            Activation1d = TorchActivation1d
-        # pick the periodic nonlinearity that each Activation1d wraps
-        if activation == 'snake':
-            periodic = activations.Snake
-        elif activation == 'snakebeta':
-            periodic = activations.SnakeBeta
-        else:
-            raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
-        self.activations = nn.ModuleList([
-            Activation1d(activation=periodic(channels, alpha_logscale=h.snake_logscale))
-            for _ in range(self.num_layers)
-        ])
-    def forward(self, x):
-        """Apply each (activation, conv) stage with a residual connection."""
-        for conv, act in zip(self.convs, self.activations):
-            branch = conv(act(x))
-            x = branch + x
-        return x
-    def remove_weight_norm(self):
-        """Strip weight normalization from every convolution of the block."""
-        for conv in self.convs:
-            remove_weight_norm(conv)
-class BigVGAN(torch.nn.Module):
-    """Main BigVGAN generator; applies anti-aliased periodic activations in its resblocks.
-    New in v2: with ``use_cuda_kernel=True`` the optimized CUDA kernels for AMP are loaded.
-    NOTE: ``use_cuda_kernel=True`` should be used for inference only (training is not supported).
-    """
-    def __init__(self, h, use_cuda_kernel: bool = False):
-        super().__init__()
-        self.h = h
-        self.h["use_cuda_kernel"] = use_cuda_kernel  # stored on h so the resblocks can read it
-        self.num_kernels = len(h.resblock_kernel_sizes)
-        self.num_upsamples = len(h.upsample_rates)
-        # pre conv
-        self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
-        # AMPBlock1 is used when h.resblock == '1', AMPBlock2 otherwise
-        if h.resblock == '1':
-            resblock = AMPBlock1
-        else:
-            resblock = AMPBlock2
-        # transposed-conv upsamplers (no anti-aliasing); channel count halves at every stage
-        self.ups = nn.ModuleList()
-        for stage, (rate, kernel) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
-            in_ch = h.upsample_initial_channel // (2 ** stage)
-            out_ch = h.upsample_initial_channel // (2 ** (stage + 1))
-            upsampler = weight_norm(ConvTranspose1d(in_ch, out_ch, kernel, rate,
-                                                    padding=(kernel - rate) // 2))
-            self.ups.append(nn.ModuleList([upsampler]))
-        # AMP residual blocks: one per (kernel size, dilation set) for every upsampling stage
-        self.resblocks = nn.ModuleList()
-        for stage in range(len(self.ups)):
-            ch = h.upsample_initial_channel // (2 ** (stage + 1))
-            for kernel, dilations in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
-                self.resblocks.append(resblock(h, ch, kernel, dilations, activation=h.activation))
-        # pick the Activation1d wrapper; the CUDA variant is imported only when requested
-        if self.h.get("use_cuda_kernel", False):
-            from alias_free_cuda.activation1d import Activation1d
-        else:
-            Activation1d = TorchActivation1d
-        # post activation works on ``ch``, the channel count left by the last stage above
-        if h.activation == "snake":
-            post_nonlinearity = activations.Snake(ch, alpha_logscale=h.snake_logscale)
-        elif h.activation == "snakebeta":
-            post_nonlinearity = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale)
-        else:
-            raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
-        self.activation_post = Activation1d(activation=post_nonlinearity)
-        # whether to use bias for the final conv_post. Defaults to True for backward compatibility
-        self.use_bias_at_final = h.get("use_bias_at_final", True)
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final))
-        # weight initialization
-        for up_stage in self.ups:
-            up_stage.apply(init_weights)
-        self.conv_post.apply(init_weights)
-        # final tanh activation. Defaults to True for backward compatibility
-        self.use_tanh_at_final = h.get("use_tanh_at_final", True)
-    def forward(self, x):
-        """Run ``x`` through pre conv, upsampling stages with AMP blocks, and post conv."""
-        x = self.conv_pre(x)
-        for stage in range(self.num_upsamples):
-            # upsampling
-            for upsampler in self.ups[stage]:
-                x = upsampler(x)
-            # AMP blocks of this stage are summed, then averaged
-            offset = stage * self.num_kernels
-            total = None
-            for k in range(self.num_kernels):
-                block_out = self.resblocks[offset + k](x)
-                if total is None:
-                    total = block_out
-                else:
-                    total += block_out
-            x = total / self.num_kernels
-        # post conv
-        x = self.conv_post(self.activation_post(x))
-        if self.use_tanh_at_final:
-            return torch.tanh(x)
-        return torch.clamp(x, min=-1., max=1.)  # bound the output to [-1, 1]
-    def remove_weight_norm(self):
-        """Strip weight normalization from the upsamplers, resblocks, pre conv and post conv."""
-        print('Removing weight norm...')
-        for up_stage in self.ups:
-            for upsampler in up_stage:
-                remove_weight_norm(upsampler)
-        for block in self.resblocks:
-            block.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
-class DiscriminatorP(torch.nn.Module):
-    """Period discriminator: folds the waveform into a 2D map of width ``period`` and applies 2D convs.
-    Args:
-        h: hyperparameters; ``h.discriminator_channel_mult`` scales the channel widths.
-        period: width of the folded 2D representation.
-        kernel_size: kernel height of every convolution in ``self.convs``.
-        stride: stride along the folded time axis for the first four convolutions.
-        use_spectral_norm: selects ``spectral_norm`` instead of ``weight_norm``.
-    """
-    def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
-        self.period = period
-        self.d_mult = h.discriminator_channel_mult
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        # Padding now follows kernel_size; it used to be hard-coded for kernel_size == 5
-        # (get_padding(5, 1) and (2, 0)), so the default configuration is unchanged.
-        strided_padding = (get_padding(kernel_size, 1), 0)
-        final_padding = ((kernel_size - 1) // 2, 0)
-        widths = [1] + [int(base * self.d_mult) for base in (32, 128, 512, 1024)]
-        layers = [
-            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=strided_padding))
-            for c_in, c_out in zip(widths[:-1], widths[1:])
-        ]
-        # last layer keeps the channel count and uses stride 1
-        layers.append(norm_f(Conv2d(widths[-1], widths[-1], (kernel_size, 1), 1, padding=final_padding)))
-        self.convs = nn.ModuleList(layers)
-        self.conv_post = norm_f(Conv2d(widths[-1], 1, (3, 1), 1, padding=(1, 0)))
-    def forward(self, x):
-        """Return ``(flattened score, feature maps)`` for a ``[b, c, t]`` input."""
-        fmap = []
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # reflect-pad on the right so t becomes a multiple of period
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-        for layer in self.convs:
-            x = F.leaky_relu(layer(x), 0.1)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-        return x, fmap
-class MultiPeriodDiscriminator(torch.nn.Module):
-    """Bundle of ``DiscriminatorP`` modules, one per period listed in ``h.mpd_reshapes``."""
-    def __init__(self, h):
-        super().__init__()
-        self.mpd_reshapes = h.mpd_reshapes
-        print("mpd_reshapes: {}".format(self.mpd_reshapes))
-        self.discriminators = nn.ModuleList([
-            DiscriminatorP(h, period, use_spectral_norm=h.use_spectral_norm)
-            for period in self.mpd_reshapes
-        ])
-    def forward(self, y, y_hat):
-        """Score ``y`` and ``y_hat`` with every sub-discriminator.
-        Returns four lists: scores for ``y``, scores for ``y_hat``, feature maps
-        for ``y`` and feature maps for ``y_hat``.
-        """
-        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
-        for disc in self.discriminators:
-            score_r, feats_r = disc(y)
-            score_g, feats_g = disc(y_hat)
-            y_d_rs.append(score_r)
-            fmap_rs.append(feats_r)
-            y_d_gs.append(score_g)
-            fmap_gs.append(feats_g)
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-class DiscriminatorR(nn.Module):
-    """Discriminator over the STFT magnitude computed at one ``(n_fft, hop_length, win_length)`` resolution."""
-    def __init__(self, cfg, resolution):
-        super().__init__()
-        self.resolution = resolution
-        assert len(self.resolution) == 3, \
-            "MRD layer requires list with len=3, got {}".format(self.resolution)
-        self.lrelu_slope = 0.1
-        # normalization wrapper: cfg.use_spectral_norm, optionally overridden by cfg.mrd_use_spectral_norm
-        if cfg.use_spectral_norm == False:
-            norm_f = weight_norm
-        else:
-            norm_f = spectral_norm
-        if hasattr(cfg, "mrd_use_spectral_norm"):
-            print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm))
-            if cfg.mrd_use_spectral_norm == False:
-                norm_f = weight_norm
-            else:
-                norm_f = spectral_norm
-        # channel multiplier: cfg.discriminator_channel_mult, optionally overridden by cfg.mrd_channel_mult
-        self.d_mult = cfg.discriminator_channel_mult
-        if hasattr(cfg, "mrd_channel_mult"):
-            print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult))
-            self.d_mult = cfg.mrd_channel_mult
-        width = int(32 * self.d_mult)
-        layers = [norm_f(nn.Conv2d(1, width, (3, 9), padding=(1, 4)))]
-        for _ in range(3):
-            # three stride-(1, 2) convolutions
-            layers.append(norm_f(nn.Conv2d(width, width, (3, 9), stride=(1, 2), padding=(1, 4))))
-        layers.append(norm_f(nn.Conv2d(width, width, (3, 3), padding=(1, 1))))
-        self.convs = nn.ModuleList(layers)
-        self.conv_post = norm_f(nn.Conv2d(width, 1, (3, 3), padding=(1, 1)))
-    def forward(self, x):
-        """Return ``(flattened score, feature maps)`` for waveform ``x``."""
-        fmap = []
-        x = self.spectrogram(x).unsqueeze(1)
-        for layer in self.convs:
-            x = F.leaky_relu(layer(x), self.lrelu_slope)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        return torch.flatten(x, 1, -1), fmap
-    def spectrogram(self, x):
-        """Reflect-pad ``x``, run a non-centered STFT and return its magnitude."""
-        n_fft, hop_length, win_length = self.resolution
-        side_pad = int((n_fft - hop_length) / 2)
-        x = F.pad(x, (side_pad, side_pad), mode='reflect')
-        x = x.squeeze(1)
-        spec = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True)
-        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
-        return torch.norm(spec, p=2, dim=-1)  # [B, F, TT]
-class MultiResolutionDiscriminator(nn.Module):
-    """Bundle of three ``DiscriminatorR`` modules, one per entry of ``cfg.resolutions``."""
-    def __init__(self, cfg, debug=False):
-        super().__init__()
-        self.resolutions = cfg.resolutions
-        assert len(self.resolutions) == 3,\
-            "MRD requires list of list with len=3, each element having a list with len=3. got {}".\
-                format(self.resolutions)
-        sub_discriminators = []
-        for resolution in self.resolutions:
-            sub_discriminators.append(DiscriminatorR(cfg, resolution))
-        self.discriminators = nn.ModuleList(sub_discriminators)
-    def forward(self, y, y_hat):
-        """Score ``y`` and ``y_hat`` with every sub-discriminator.
-        Returns four lists: scores for ``y``, scores for ``y_hat``, feature maps
-        for ``y`` and feature maps for ``y_hat``.
-        """
-        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
-        for disc in self.discriminators:
-            score_r, feats_r = disc(x=y)
-            score_g, feats_g = disc(x=y_hat)
-            y_d_rs.append(score_r)
-            fmap_rs.append(feats_r)
-            y_d_gs.append(score_g)
-            fmap_gs.append(feats_g)
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-# Method based on descript-audio-codec: https://github.com/descriptinc/descript-audio-codec
-# Modified code adapted from https://github.com/gemelo-ai/vocos under the MIT license.
-#   LICENSE is in incl_licenses directory.
-class DiscriminatorB(nn.Module):
-    """Band-split discriminator over a complex spectrogram.
-    The frequency axis is cut into ``bands`` (given as fractions of the bin count);
-    each band has its own conv stack, and the band outputs are concatenated before
-    a shared ``conv_post``.
-    """
-    def __init__(
-        self,
-        window_length: int,
-        channels: int = 32,
-        hop_factor: float = 0.25,
-        bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
-    ):
-        super().__init__()
-        self.window_length = window_length
-        self.hop_factor = hop_factor
-        self.spec_fn = Spectrogram(
-            n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
-        )
-        # convert the fractional band edges into bin indices
-        num_bins = window_length // 2 + 1
-        self.bands = [(int(edge[0] * num_bins), int(edge[1] * num_bins)) for edge in bands]
-        def build_stack():
-            # one five-layer conv stack; the first layer takes the 2-channel spectrogram
-            stack = [weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4)))]
-            for _ in range(3):
-                stack.append(weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))))
-            stack.append(weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))))
-            return nn.ModuleList(stack)
-        self.band_convs = nn.ModuleList([build_stack() for _ in range(len(self.bands))])
-        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
-    def spectrogram(self, x):
-        """Normalize ``x``, compute its complex spectrogram and return one slice per band."""
-        # Remove DC offset
-        x = x - x.mean(dim=-1, keepdims=True)
-        # Peak normalize the volume of input audio
-        peak = x.abs().max(dim=-1, keepdim=True)[0]
-        x = 0.8 * x / (peak + 1e-9)
-        spec = torch.view_as_real(self.spec_fn(x))
-        spec = spec.permute(0, 3, 2, 1)  # [B, F, T, C] -> [B, C, T, F]
-        # Split into bands
-        return [spec[..., lo:hi] for lo, hi in self.bands]
-    def forward(self, x: torch.Tensor):
-        """Return ``(score map, feature maps)`` for waveform ``x``."""
-        fmap = []
-        band_outputs = []
-        for band, stack in zip(self.spectrogram(x.squeeze(1)), self.band_convs):
-            for depth, layer in enumerate(stack):
-                band = F.leaky_relu(layer(band), 0.1)
-                if depth > 0:  # the first layer's output is not kept as a feature map
-                    fmap.append(band)
-            band_outputs.append(band)
-        x = torch.cat(band_outputs, dim=-1)
-        x = self.conv_post(x)
-        fmap.append(x)
-        return x, fmap
-# Method based on descript-audio-codec: https://github.com/descriptinc/descript-audio-codec
-# Modified code adapted from https://github.com/gemelo-ai/vocos under the MIT license.
-#   LICENSE is in incl_licenses directory.
-class MultiBandDiscriminator(nn.Module):
-    def __init__(
-        self,
-        h,
-    ):
-        """
-        Multi-band multi-scale STFT discriminator, with the architecture based on https://github.com/descriptinc/descript-audio-codec.
-        and the modified code adapted from https://github.com/gemelo-ai/vocos.
-        """
-        super().__init__()
-        # window lengths for FFT, one DiscriminatorB each; [2048, 1024, 512] when h has no "mbd_fft_sizes"
-        self.fft_sizes = h.get("mbd_fft_sizes", [2048, 1024, 512])
-        sub_discriminators = []
-        for window in self.fft_sizes:
-            sub_discriminators.append(DiscriminatorB(window_length=window))
-        self.discriminators = nn.ModuleList(sub_discriminators)
-    def forward(
-        self,
-        y: torch.Tensor,
-        y_hat: torch.Tensor
-    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
-        """Score ``y`` and ``y_hat`` with every sub-discriminator; returns scores and feature maps for each."""
-        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
-        for disc in self.discriminators:
-            score_r, feats_r = disc(x=y)
-            score_g, feats_g = disc(x=y_hat)
-            y_d_rs.append(score_r)
-            fmap_rs.append(feats_r)
-            y_d_gs.append(score_g)
-            fmap_gs.append(feats_g)
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-# Adapted from https://github.com/open-mmlab/Amphion/blob/main/models/vocoders/gan/discriminator/mssbcqtd.py under the MIT license.
-#   LICENSE is in incl_licenses directory.
-class DiscriminatorCQT(nn.Module):
-    """Discriminator operating on a constant-Q transform of the (2x resampled) input.
-    The CQT is split along its bin axis into ``n_octaves`` slices of ``bins_per_octave``
-    bins; each slice goes through its own ``conv_pres`` layer, the results are
-    concatenated again and passed through ``convs`` and ``conv_post``.
-    """
-    def __init__(self, cfg, hop_length, n_octaves, bins_per_octave):
-        super().__init__()
-        self.cfg = cfg
-        # all "cqtd_*" keys are read with plain indexing, so they must already be present in cfg
-        self.filters = cfg["cqtd_filters"]
-        self.max_filters = cfg["cqtd_max_filters"]
-        self.filters_scale = cfg["cqtd_filters_scale"]
-        self.kernel_size = (3, 9)
-        self.dilations = cfg["cqtd_dilations"]
-        self.stride = (1, 2)
-        self.in_channels = cfg["cqtd_in_channels"]
-        self.out_channels = cfg["cqtd_out_channels"]
-        self.fs = cfg["sampling_rate"]
-        self.hop_length = hop_length
-        self.n_octaves = n_octaves
-        self.bins_per_octave = bins_per_octave
-        # lazy-load
-        from nnAudio import features
-        # sr is fs * 2 because forward() resamples the input to twice the sampling rate first
-        self.cqt_transform = features.cqt.CQT2010v2(
-            sr=self.fs * 2,
-            hop_length=self.hop_length,
-            n_bins=self.bins_per_octave * self.n_octaves,
-            bins_per_octave=self.bins_per_octave,
-            output_format="Complex",
-            pad_mode="constant",
-        )
-        # one pre-convolution per octave slice (no weight norm on these layers)
-        self.conv_pres = nn.ModuleList()
-        for i in range(self.n_octaves):
-            self.conv_pres.append(
-                nn.Conv2d(
-                    self.in_channels * 2,
-                    self.in_channels * 2,
-                    kernel_size=self.kernel_size,
-                    padding=self.get_2d_padding(self.kernel_size),
-                )
-            )
-        self.convs = nn.ModuleList()
-        # first conv of the main stack (also without weight norm) outputs self.filters channels
-        self.convs.append(
-            nn.Conv2d(
-                self.in_channels * 2,
-                self.filters,
-                kernel_size=self.kernel_size,
-                padding=self.get_2d_padding(self.kernel_size),
-            )
-        )
-        # NOTE(review): the next conv expects min(filters_scale * filters, max_filters) input
-        # channels while the conv above outputs self.filters. The two agree for
-        # filters_scale == 1 (the default filled in by MultiScaleSubbandCQTDiscriminator);
-        # other scales look like they would mismatch -- confirm before changing the config.
-        in_chs = min(self.filters_scale * self.filters, self.max_filters)
-        # one strided, dilated conv per entry of cqtd_dilations
-        for i, dilation in enumerate(self.dilations):
-            out_chs = min(
-                (self.filters_scale ** (i + 1)) * self.filters, self.max_filters
-            )
-            self.convs.append(
-                weight_norm(nn.Conv2d(
-                    in_chs,
-                    out_chs,
-                    kernel_size=self.kernel_size,
-                    stride=self.stride,
-                    dilation=(dilation, 1),
-                    padding=self.get_2d_padding(self.kernel_size, (dilation, 1)),
-                ))
-            )
-            in_chs = out_chs
-        out_chs = min(
-            (self.filters_scale ** (len(self.dilations) + 1)) * self.filters,
-            self.max_filters,
-        )
-        # last conv of the stack and conv_post both use a square kernel of side kernel_size[0]
-        self.convs.append(
-            weight_norm(nn.Conv2d(
-                in_chs,
-                out_chs,
-                kernel_size=(self.kernel_size[0], self.kernel_size[0]),
-                padding=self.get_2d_padding((self.kernel_size[0], self.kernel_size[0])),
-            ))
-        )
-        self.conv_post = weight_norm(nn.Conv2d(
-            out_chs,
-            self.out_channels,
-            kernel_size=(self.kernel_size[0], self.kernel_size[0]),
-            padding=self.get_2d_padding((self.kernel_size[0], self.kernel_size[0])),
-        ))
-        self.activation = torch.nn.LeakyReLU(negative_slope=0.1)
-        # resampler from fs to 2 * fs, applied at the start of forward()
-        self.resample = Resample(orig_freq=self.fs, new_freq=self.fs * 2)
-        # optional input normalization; off unless cfg sets "cqtd_normalize_volume"
-        self.cqtd_normalize_volume = self.cfg.get("cqtd_normalize_volume", False)
-        if self.cqtd_normalize_volume:
-            print(f"INFO: cqtd_normalize_volume set to True. Will apply DC offset removal & peak volume normalization in CQTD!")
-    def get_2d_padding(
-            self, kernel_size: typing.Tuple[int, int], dilation: typing.Tuple[int, int] = (1, 1)
-        ):
-        """Return ``((k - 1) * d) // 2`` for each of the two kernel/dilation axes."""
-        return (
-            ((kernel_size[0] - 1) * dilation[0]) // 2,
-            ((kernel_size[1] - 1) * dilation[1]) // 2,
-        )
-    def forward(self, x):
-        """Return ``(conv_post output, feature maps)``; ``conv_post`` output is not added to the feature maps."""
-        fmap = []
-        if self.cqtd_normalize_volume:
-            # Remove DC offset
-            x = x - x.mean(dim=-1, keepdims=True)
-            # Peak normalize the volume of input audio
-            x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
-        x = self.resample(x)
-        z = self.cqt_transform(x)
-        # NOTE(review): with output_format="Complex" the last axis presumably holds real and
-        # imaginary parts rather than amplitude and phase, despite the names -- confirm against nnAudio.
-        z_amplitude = z[:, :, :, 0].unsqueeze(1)
-        z_phase = z[:, :, :, 1].unsqueeze(1)
-        z = torch.cat([z_amplitude, z_phase], dim=1)
-        z = torch.permute(z, (0, 1, 3, 2)) # [B, C, W, T] -> [B, C, T, W]
-        # run each octave's slice of the last axis through its own pre-convolution
-        latent_z = []
-        for i in range(self.n_octaves):
-            latent_z.append(
-                self.conv_pres[i](
-                    z[
-                        :,
-                        :,
-                        :,
-                        i * self.bins_per_octave : (i + 1) * self.bins_per_octave,
-                    ]
-                )
-            )
-        latent_z = torch.cat(latent_z, dim=-1)
-        for i, l in enumerate(self.convs):
-            latent_z = l(latent_z)
-            latent_z = self.activation(latent_z)
-            fmap.append(latent_z)
-        latent_z = self.conv_post(latent_z)
-        return latent_z, fmap
-class MultiScaleSubbandCQTDiscriminator(nn.Module):
-    """Bundle of ``DiscriminatorCQT`` modules, one per entry of ``cfg["cqtd_hop_lengths"]``.
-    Missing ``cqtd_*`` keys are written back into ``cfg`` with their defaults.
-    """
-    def __init__(self, cfg):
-        super().__init__()
-        self.cfg = cfg
-        # defaults for keys absent from cfg; the last three are the per-scale parameters
-        fallbacks = {
-            "cqtd_filters": 32,
-            "cqtd_max_filters": 1024,
-            "cqtd_filters_scale": 1,
-            "cqtd_dilations": [1, 2, 4],
-            "cqtd_in_channels": 1,
-            "cqtd_out_channels": 1,
-            "cqtd_hop_lengths": [512, 256, 256],
-            "cqtd_n_octaves": [9, 9, 9],
-            "cqtd_bins_per_octaves": [24, 36, 48],
-        }
-        for key, fallback in fallbacks.items():
-            self.cfg[key] = self.cfg.get(key, fallback)
-        sub_discriminators = []
-        for scale in range(len(self.cfg["cqtd_hop_lengths"])):
-            sub_discriminators.append(
-                DiscriminatorCQT(
-                    self.cfg,
-                    hop_length=self.cfg["cqtd_hop_lengths"][scale],
-                    n_octaves=self.cfg["cqtd_n_octaves"][scale],
-                    bins_per_octave=self.cfg["cqtd_bins_per_octaves"][scale],
-                )
-            )
-        self.discriminators = nn.ModuleList(sub_discriminators)
-    def forward(
-        self,
-        y: torch.Tensor,
-        y_hat: torch.Tensor
-    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
-        """Score ``y`` and ``y_hat`` with every sub-discriminator; returns scores and feature maps for each."""
-        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
-        for sub in self.discriminators:
-            score_r, feats_r = sub(y)
-            score_g, feats_g = sub(y_hat)
-            y_d_rs.append(score_r)
-            fmap_rs.append(feats_r)
-            y_d_gs.append(score_g)
-            fmap_gs.append(feats_g)
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-class CombinedDiscriminator(nn.Module):
-    """Wrapper that chains several discriminator architectures into one module.
-    Example: combine mbd and cqtd as a single class.
-    """
-    def __init__(
-        self,
-        list_discriminator: List[nn.Module]
-    ):
-        super().__init__()
-        # the attribute keeps its original (misspelled) name: it is the registered submodule name
-        self.discrimiantor = nn.ModuleList(list_discriminator)
-    def forward(
-        self,
-        y: torch.Tensor,
-        y_hat: torch.Tensor
-    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
-        """Call every wrapped discriminator on ``(y, y_hat)`` and concatenate their four result lists."""
-        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
-        for sub in self.discrimiantor:
-            scores_r, scores_g, feats_r, feats_g = sub(y, y_hat)
-            y_d_rs.extend(scores_r)
-            fmap_rs.extend(feats_r)
-            y_d_gs.extend(scores_g)
-            fmap_gs.extend(feats_g)
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-# Adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py under the MIT license.
-#   LICENSE is in incl_licenses directory.
-class MultiScaleMelSpectrogramLoss(nn.Module):
-    """Compute distance between mel spectrograms. Can be used
-    in a multi-scale way.
-    Parameters
-    ----------
-    n_mels : List[int]
-        Number of mels per STFT, by default [5, 10, 20, 40, 80, 160, 320],
-    window_lengths : List[int], optional
-        Length of each window of each STFT, by default [32, 64, 128, 256, 512, 1024, 2048]
-    loss_fn : typing.Callable, optional
-        How to compare each loss, by default nn.L1Loss()
-    clamp_eps : float, optional
-        Clamp on the log magnitude, below, by default 1e-5
-    mag_weight : float, optional
-        Weight of raw magnitude portion of loss, by default 0.0 (no ampliciation on mag part)
-    log_weight : float, optional
-        Weight of log magnitude portion of loss, by default 1.0
-    pow : float, optional
-        Power to raise magnitude to before taking log, by default 1.0
-    weight : float, optional
-        Weight of this loss, by default 1.0
-    match_stride : bool, optional
-        Whether to match the stride of convolutional layers, by default False
-    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
-    Additional code copied and modified from https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py
-    """
-    def __init__(
-        self,
-        sampling_rate: int,
-        n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320],
-        window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048],
-        loss_fn: typing.Callable = nn.L1Loss(),
-        clamp_eps: float = 1e-5,
-        mag_weight: float = 0.0,
-        log_weight: float = 1.0,
-        pow: float = 1.0,
-        weight: float = 1.0,
-        match_stride: bool = False,
-        mel_fmin: List[float] = [0, 0, 0, 0, 0, 0, 0],
-        mel_fmax: List[float] = [None, None, None, None, None, None, None],
-        window_type: str = 'hann',
-    ):
-        super().__init__()
-        self.sampling_rate = sampling_rate
-        STFTParams = namedtuple(
-            "STFTParams",
-            ["window_length", "hop_length", "window_type", "match_stride"],
-        )
-        self.stft_params = [
-            STFTParams(
-                window_length=w,
-                hop_length=w // 4,
-                match_stride=match_stride,
-                window_type=window_type,
-            )
-            for w in window_lengths
-        ]
-        self.n_mels = n_mels
-        self.loss_fn = loss_fn
-        self.clamp_eps = clamp_eps
-        self.log_weight = log_weight
-        self.mag_weight = mag_weight
-        self.weight = weight
-        self.mel_fmin = mel_fmin
-        self.mel_fmax = mel_fmax
-        self.pow = pow
-    @staticmethod
-    @functools.lru_cache(maxsize=None)
-    def get_window(window_type, window_length):
-        """Return the analysis window named ``window_type`` with ``window_length`` samples.
-        The result comes straight from ``signal.get_window`` and is memoised
-        without bound on the ``(window_type, window_length)`` pair, so repeated
-        calls with equal arguments hand back the very same object - callers
-        should treat it as read-only.
-        """
-        taps = signal.get_window(window_type, window_length)
-        return taps
-    @staticmethod
-    @functools.lru_cache(maxsize=None)
-    def get_mel_filters(sr, n_fft, n_mels, fmin, fmax):
-        """Return the mel filterbank produced by ``librosa_mel_fn`` for these settings.
-        Results are memoised without bound on the full argument tuple, so all
-        arguments must be hashable and equal arguments yield the same cached
-        object.
-        """
-        filter_args = dict(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
-        return librosa_mel_fn(**filter_args)
-    def mel_spectrogram(
-        self, wav, n_mels, fmin, fmax, window_length, hop_length, match_stride, window_type
-    ):
-        """Compute a magnitude mel spectrogram of ``wav`` for one STFT scale.
-        Mirrors AudioSignal.mel_spectrogram used by BigVGAN-v2 training from:
-        https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py
-        Parameters
-        ----------
-        wav : torch.Tensor
-            Three-dimensional waveform tensor, unpacked here as (B, C, T).
-        n_mels : int
-            Number of mel bands requested from the filterbank.
-        fmin, fmax
-            Frequency bounds forwarded unchanged to the mel filterbank.
-        window_length : int
-            STFT window size; also used as ``n_fft``.
-        hop_length : int
-            STFT hop size. Must equal ``window_length // 4`` when
-            ``match_stride`` is set.
-        match_stride : bool
-            If True, reflect-pad the input and drop the two outermost frames
-            on each side so that ``num_frames * hop_length`` covers the
-            (right-padded) input length.
-        window_type : str
-            Window name forwarded to ``get_window``.
-        Returns
-        -------
-        torch.Tensor
-            Mel magnitudes of shape (B, C, n_mels, num_frames).
-        """
-        B, C, T = wav.shape
-        if match_stride:
-            assert (
-                hop_length == window_length // 4
-            ), "For match_stride, hop must equal n_fft // 4"
-            right_pad = math.ceil(T / hop_length) * hop_length - T
-            pad = (window_length - hop_length) // 2
-        else:
-            right_pad = 0
-            pad = 0
-        wav = torch.nn.functional.pad(
-            wav, (pad, pad + right_pad), mode='reflect'
-        )
-        window = self.get_window(window_type, window_length)
-        window = torch.from_numpy(window).to(wav.device).float()
-        stft = torch.stft(
-            # Fold batch and channel together using the *padded* length.
-            # Reshaping with the pre-padding T would either fail or scramble
-            # rows across batch/channel whenever match_stride added padding.
-            wav.reshape(-1, wav.shape[-1]),
-            n_fft=window_length,
-            hop_length=hop_length,
-            window=window,
-            return_complex=True,
-            center=True,
-        )
-        _, nf, nt = stft.shape
-        stft = stft.reshape(B, C, nf, nt)
-        if match_stride:
-            # Drop first two and last two frames, which are added
-            # because of padding. Now num_frames * hop_length = num_samples.
-            stft = stft[..., 2:-2]
-        magnitude = torch.abs(stft)
-        nf = magnitude.shape[2]
-        # Recover n_fft from the number of frequency bins (nf = n_fft // 2 + 1).
-        mel_basis = self.get_mel_filters(self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax)
-        mel_basis = torch.from_numpy(mel_basis).to(wav.device)
-        # (B, C, nt, nf) @ (nf, n_mels) -> (B, C, nt, n_mels), then back to
-        # frequency-major layout (B, C, n_mels, nt).
-        mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T
-        mel_spectrogram = mel_spectrogram.transpose(-1, 2)
-        return mel_spectrogram
-    def forward(
-        self,
-        x: torch.Tensor,
-        y: torch.Tensor
-    ) -> torch.Tensor:
-        """Computes mel loss between an estimate and a reference
-        signal, summed over every configured STFT scale.
-        Each scale contributes ``log_weight`` times ``loss_fn`` on the log10
-        mel magnitudes (clamped to ``clamp_eps`` and raised to ``pow`` first)
-        plus ``mag_weight`` times ``loss_fn`` on the raw mel magnitudes.
-        Parameters
-        ----------
-        x : torch.Tensor
-            Estimate signal
-        y : torch.Tensor
-            Reference signal
-        Returns
-        -------
-        torch.Tensor
-            Mel loss. (The float ``0.0`` comes back unchanged if no scale
-            is configured.)
-        """
-        # log10(v) is evaluated as ln(v) / ln(10); the divisor is the same for
-        # every scale, so build it once instead of twice per iteration.
-        ln10 = torch.log(torch.tensor(10.0))
-        loss = 0.0
-        for n_mels, fmin, fmax, s in zip(
-            self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params
-        ):
-            kwargs = {
-                "n_mels": n_mels,
-                "fmin": fmin,
-                "fmax": fmax,
-                "window_length": s.window_length,
-                "hop_length": s.hop_length,
-                "match_stride": s.match_stride,
-                "window_type": s.window_type,
-            }
-            x_mels = self.mel_spectrogram(x, **kwargs)
-            y_mels = self.mel_spectrogram(y, **kwargs)
-            x_logmels = torch.log(x_mels.clamp(min=self.clamp_eps).pow(self.pow)) / ln10
-            y_logmels = torch.log(y_mels.clamp(min=self.clamp_eps).pow(self.pow)) / ln10
-            loss += self.log_weight * self.loss_fn(x_logmels, y_logmels)
-            # The magnitude term compares the raw mels. Feeding it the
-            # log-mels again would make mag_weight a second copy of
-            # log_weight rather than an independent term.
-            loss += self.mag_weight * self.loss_fn(x_mels, y_mels)
-        return loss
-# loss functions
-def feature_loss(
-    fmap_r: List[List[torch.Tensor]],
-    fmap_g: List[List[torch.Tensor]]
-) -> torch.Tensor:
-    """Feature-matching loss between two nested lists of feature maps.
-    Walks both nested lists in lockstep, takes the mean absolute difference
-    of every paired feature map, adds those means up and doubles the sum.
-    With no pairs at all the plain integer 0 is returned.
-    """
-    per_layer_gaps = (
-        torch.mean(torch.abs(real_map - fake_map))
-        for real_maps, fake_maps in zip(fmap_r, fmap_g)
-        for real_map, fake_map in zip(real_maps, fake_maps)
-    )
-    total = sum(per_layer_gaps)
-    # the factor of two equates to lambda=2.0 for the feature matching loss
-    return total * 2
-def discriminator_loss(
-    disc_real_outputs: List[torch.Tensor],
-    disc_generated_outputs: List[torch.Tensor]
-) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
-    """Least-squares discriminator loss over paired real/generated outputs.
-    For each pair, real outputs are scored by ``mean((1 - real) ** 2)`` and
-    generated outputs by ``mean(generated ** 2)``. Returns the sum of all
-    those terms followed by the per-pair real terms and the per-pair
-    generated terms; the two lists hold plain Python numbers obtained via
-    ``.item()``, not tensors.
-    """
-    total = 0
-    real_terms = []
-    fake_terms = []
-    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
-        real_term = ((1 - real_out) ** 2).mean()
-        fake_term = (fake_out ** 2).mean()
-        total = total + (real_term + fake_term)
-        real_terms.append(real_term.item())
-        fake_terms.append(fake_term.item())
-    return total, real_terms, fake_terms
-def generator_loss(
-    disc_outputs: List[torch.Tensor]
-) -> Tuple[torch.Tensor, List[torch.Tensor]]:
-    """Least-squares generator loss over a list of discriminator outputs.
-    Each output contributes ``mean((1 - output) ** 2)``. Returns the sum of
-    those terms together with the list of individual terms (kept as
-    tensors). With an empty input the sum is the plain integer 0.
-    """
-    gen_losses = [torch.mean((1 - fake_out) ** 2) for fake_out in disc_outputs]
-    total = sum(gen_losses)
-    return total, gen_losses