Spaces:
Runtime error
Runtime error
| import argparse | |
| import logging | |
| import os | |
| import random | |
| from concurrent.futures import ProcessPoolExecutor | |
| from glob import glob | |
| from random import shuffle | |
| import librosa | |
| import numpy as np | |
| import torch | |
| import torch.multiprocessing as mp | |
| from loguru import logger | |
| from tqdm import tqdm | |
| import diffusion.logger.utils as du | |
| import utils | |
| from diffusion.vocoder import Vocoder | |
| from modules.mel_processing import spectrogram_torch | |
# Raise the log threshold of chatty third-party loggers to WARNING.
for _noisy_logger in ("numba", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Project configuration, read when the module is imported.
hps = utils.get_hparams_from_file("configs/config.json")
dconfig = du.load_config("configs/diffusion.yaml")

# Settings used throughout the functions below.
speech_encoder = hps["model"]["speech_encoder"]
hop_length = hps.data.hop_length
sampling_rate = hps.data.sampling_rate
def process_one(filename, hmodel, f0p, device, diff=False, mel_extractor=None):
    """Extract and cache the training features for a single audio file.

    Each feature is written next to ``filename`` and is skipped when its
    output file already exists:

    * ``<filename>.soft.pt``: output of ``hmodel.encoder`` on 16 kHz audio.
    * ``<filename>.f0.npy``: ``(f0, uv)`` pair from the selected F0 predictor.
    * spectrogram: path is ``filename`` with ``.wav`` replaced by ``.spec.pt``.
    * ``<filename>.vol.npy``: volume, when ``diff`` or ``hps.model.vol_embedding``.
    * ``<filename>.mel.npy``: mel from ``mel_extractor`` (``diff`` only).
    * ``<filename>.aug_mel.npy`` / ``<filename>.aug_vol.npy``: variants with
      a random volume shift and key shift (``diff`` only).

    Args:
        filename: Path of the audio file to process.
        hmodel: Speech encoder; only its ``encoder`` method is used.
        f0p: Name of the F0 predictor, forwarded to ``utils.get_f0_predictor``.
        device: Device the 16 kHz audio (and the mel input) is moved to.
        diff: Also produce the mel / augmented features when true.
        mel_extractor: Object with an ``extract`` method, or ``None``. When
            ``None`` the mel and augmented-mel files are not produced.

    Raises:
        ValueError: If the loaded sample rate differs from the configured one.
    """
    wav, sr = librosa.load(filename, sr=sampling_rate)
    audio_norm = torch.FloatTensor(wav).unsqueeze(0)

    # --- Speech-encoder content features --------------------------------
    soft_path = filename + ".soft.pt"
    if not os.path.exists(soft_path):
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(device)
        c = hmodel.encoder(wav16k)
        torch.save(c.cpu(), soft_path)

    # --- F0 and voiced/unvoiced flags -------------------------------------
    f0_path = filename + ".f0.npy"
    if not os.path.exists(f0_path):
        f0_predictor = utils.get_f0_predictor(
            f0p,
            sampling_rate=sampling_rate,
            hop_length=hop_length,
            device=None,
            threshold=0.05,
        )
        f0, uv = f0_predictor.compute_f0_uv(wav)
        np.save(f0_path, np.asanyarray((f0, uv), dtype=object))

    # --- Spectrogram ------------------------------------------------------
    spec_path = filename.replace(".wav", ".spec.pt")
    if not os.path.exists(spec_path):
        if sr != hps.data.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(
                    sr, hps.data.sampling_rate
                )
            )
        # audio_norm is used as loaded; no division by max_wav_value is applied.
        spec = spectrogram_torch(
            audio_norm,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_path)

    # --- Volume -----------------------------------------------------------
    if diff or hps.model.vol_embedding:
        volume_path = filename + ".vol.npy"
        volume_extractor = utils.Volume_Extractor(hop_length)
        if not os.path.exists(volume_path):
            volume = volume_extractor.extract(audio_norm)
            np.save(volume_path, volume.to('cpu').numpy())

    # --- Diffusion features (mel + augmented mel/volume) ------------------
    if diff:
        mel_path = filename + ".mel.npy"
        if not os.path.exists(mel_path) and mel_extractor is not None:
            mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
            mel = mel_t.squeeze().to('cpu').numpy()
            np.save(mel_path, mel)

        aug_mel_path = filename + ".aug_mel.npy"
        aug_vol_path = filename + ".aug_vol.npy"
        # Without an extractor there is nothing to compute for the augmented
        # mel; skipping it avoids using an unbound ``aug_mel`` variable.
        need_aug_mel = mel_extractor is not None and not os.path.exists(aug_mel_path)
        need_aug_vol = not os.path.exists(aug_vol_path)

        # Only do the augmentation work when at least one output is missing.
        if need_aug_mel or need_aug_vol:
            # Cap the upward volume shift at 1/max_amp so the peak stays <= 1.
            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
            max_shift = min(1, np.log10(1 / max_amp))
            log10_vol_shift = random.uniform(-1, max_shift)
            keyshift = random.uniform(-5, 5)
            shifted_audio = audio_norm * (10 ** log10_vol_shift)

            if need_aug_mel:
                aug_mel_t = mel_extractor.extract(
                    shifted_audio, sampling_rate, keyshift=keyshift
                )
                aug_mel = aug_mel_t.squeeze().to('cpu').numpy()
                np.save(aug_mel_path, np.asanyarray((aug_mel, keyshift), dtype=object))
            if need_aug_vol:
                aug_vol = volume_extractor.extract(shifted_audio)
                np.save(aug_vol_path, aug_vol.to('cpu').numpy())
def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
    """Worker entry point: run ``process_one`` on every file of a chunk.

    The speech encoder is loaded once per call and reused for all files.
    When CUDA is available the ``device`` argument is replaced by a GPU
    chosen from the current process's rank.

    Args:
        file_chunk: Iterable of audio file paths handled by this worker.
        f0p: Name of the F0 predictor, forwarded to ``process_one``.
        diff: Forwarded to ``process_one``.
        mel_extractor: Forwarded to ``process_one``.
        device: Device to use when CUDA is not available.
    """
    logger.info("Loading speech encoder for content...")

    # An empty ``_identity`` tuple maps to rank 0.
    identity = mp.current_process()._identity
    rank = identity[0] if identity else 0

    if torch.cuda.is_available():
        # Assign GPUs round-robin by rank.
        device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")

    logger.info(f"Rank {rank} uses device {device}")
    hmodel = utils.get_speech_encoder(speech_encoder, device=device)
    logger.info(f"Loaded speech encoder for rank {rank}")

    for audio_path in tqdm(file_chunk):
        process_one(audio_path, hmodel, f0p, device, diff, mel_extractor)
def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
    """Split ``filenames`` into contiguous chunks and process them in a pool.

    The list is cut into ``num_processes`` nearly equal slices; each
    non-empty slice is submitted to ``process_batch`` in a
    ``ProcessPoolExecutor``. Empty slices (fewer files than workers, or no
    files at all) are skipped, because ``process_batch`` loads the speech
    encoder before it looks at its chunk.

    Args:
        filenames: Sequence of audio file paths.
        num_processes: Pool size and number of slices.
        f0p: Name of the F0 predictor, forwarded to ``process_batch``.
        diff: Forwarded to ``process_batch``.
        mel_extractor: Forwarded to ``process_batch``.
        device: Forwarded to ``process_batch``.

    Any exception raised in a worker is re-raised here by ``task.result()``.
    """
    total = len(filenames)
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        tasks = []
        for i in range(num_processes):
            # Integer arithmetic gives the same slice boundaries without floats.
            start = i * total // num_processes
            end = (i + 1) * total // num_processes
            file_chunk = filenames[start:end]
            if not file_chunk:
                continue
            tasks.append(
                executor.submit(
                    process_batch, file_chunk, f0p, diff, mel_extractor, device=device
                )
            )
        for task in tqdm(tasks):
            task.result()
if __name__ == "__main__":
    # Command-line entry point: parse options, optionally load the mel
    # extractor, collect the .wav files and process them in parallel.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--device', type=str, default=None)
    parser.add_argument(
        "--in_dir", type=str, default="dataset/44k", help="path to input dir"
    )
    parser.add_argument(
        '--use_diff', action='store_true', help='Whether to use the diffusion model'
    )
    # The help text states the default that is actually configured ("dio").
    parser.add_argument(
        '--f0_predictor', type=str, default="dio", help='Select F0 predictor, can select crepe,pm,dio,harvest,rmvpe,fcpe|default: dio(note: crepe is original F0 using mean filter)'
    )
    parser.add_argument(
        '--num_processes', type=int, default=1, help='You are advised to set the number of processes to the same as the number of CPU cores'
    )
    args = parser.parse_args()

    f0p = args.f0_predictor
    device = args.device
    if device is None:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(speech_encoder)
    # Build the message explicitly: an extra positional argument is not
    # rendered unless the message contains a "{}" placeholder.
    logger.info("Using device: " + str(device))
    logger.info("Using SpeechEncoder: " + speech_encoder)
    logger.info("Using extractor: " + f0p)
    logger.info("Using diff Mode: " + str(args.use_diff))

    if args.use_diff:
        print("use_diff")
        print("Loading Mel Extractor...")
        mel_extractor = Vocoder(dconfig.vocoder.type, dconfig.vocoder.ckpt, device=device)
        print("Loaded Mel Extractor.")
    else:
        mel_extractor = None

    # Files are expected one directory level below in_dir.
    filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True)  # [:10]
    shuffle(filenames)

    mp.set_start_method("spawn", force=True)

    # 0 means "use every CPU core".
    num_processes = args.num_processes
    if num_processes == 0:
        num_processes = os.cpu_count()

    parallel_process(filenames, num_processes, f0p, args.use_diff, mel_extractor, device)