import os
from glob import glob
from loguru import logger
import soundfile as sf
import librosa
import gradio as gr
from huggingface_hub import hf_hub_download
import time
import torch
import yaml

from s3prl_vc.upstream.interface import get_upstream
from s3prl.nn import Featurizer
import s3prl_vc.models
from s3prl_vc.utils import read_hdf5
from s3prl_vc.vocoder import Vocoder

# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'

SERVER_PORT = 42208
SERVER_NAME = "0.0.0.0"
SSL_DIR = './keyble_ssl'

EXAMPLE_DIR = './examples'
en_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "en", '*.wav')))
jp_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "jp", '*.wav')))
zh_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "zh", '*.wav')))

TRGSPKS = ["TEF1", "TEF2", "TEM1", "TEM2"]
ref_samples = {
    trgspk: sorted(glob(os.path.join("./ref_samples", trgspk, '*.wav')))
    for trgspk in TRGSPKS
}
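# NOTE: the example and reference wavs are read from the local ./examples/{en,jp,zh}
# and ./ref_samples/<speaker> directories; the UI below expects two reference wavs
# per target speaker (shown as "Sample 1" / "Sample 2").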

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Download models ----------
logger.info('============================= Download models ===========================')
vocoder_paths = {
    "ckpt": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="checkpoint-2500000steps.pkl"),
    "config": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="config.yml"),
    "stats": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="stats.h5"),
}
vc_model_paths = {
    trgspk: {
        "ckpt": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/checkpoint-10000steps.pkl"),
        "config": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/config.yml"),
        "stats": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/stats.h5"),
    }
    for trgspk in TRGSPKS
}
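# hf_hub_download returns the local path of each file in the Hugging Face cache,
# so restarting the app reuses previously downloaded checkpoints instead of
# fetching them again.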

# ---------- Model ----------
vc_models = {}
for trgspk in TRGSPKS:
    logger.info(f'============================= Setting up model for {trgspk} =============')
    checkpoint_path = vc_model_paths[trgspk]["ckpt"]
    config_path = vc_model_paths[trgspk]["config"]
    stats_path = vc_model_paths[trgspk]["stats"]

    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config["trg_stats"] = {
        "mean": torch.from_numpy(read_hdf5(stats_path, "mean")).float().to(DEVICE),
        "scale": torch.from_numpy(read_hdf5(stats_path, "scale")).float().to(DEVICE),
    }
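    # The statistics above are the target speaker's mel mean/scale; they are passed
    # to both the decoder and the vocoder below for (de)normalizing the predicted
    # mel features.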

    # define upstream model
    upstream_model = get_upstream(config["upstream"]).to(DEVICE)
    upstream_model.eval()
    upstream_featurizer = Featurizer(upstream_model).to(DEVICE)
    upstream_featurizer.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")["featurizer"]
    )
    upstream_featurizer.eval()

    # get model and load parameters
    model_class = getattr(s3prl_vc.models, config["model_type"])
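    # The third constructor argument below is the frame-rate ratio between the
    # target mel features (sampling_rate / hop_size frames per second) and the
    # 16 kHz upstream features (16000 / downsample_rate frames per second).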
    model = model_class(
        upstream_featurizer.output_size,
        config["num_mels"],
        config["sampling_rate"] / config["hop_size"] * upstream_featurizer.downsample_rate / 16000,
        config["trg_stats"],
        use_spemb=config.get("use_spk_emb", False),
        **config["model_params"],
    ).to(DEVICE)
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"])
    model = model.eval().to(DEVICE)
    logger.info(f"Loaded model parameters from {checkpoint_path}.")

    # load vocoder
    vocoder = Vocoder(
        vocoder_paths["ckpt"],
        vocoder_paths["config"],
        vocoder_paths["stats"],
        config["trg_stats"],
        DEVICE,
    )
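    # The vocoder above is a HiFi-GAN model trained on VCTK + VCC2020; it converts
    # predicted mel spectrograms back into waveforms.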

    vc_models[trgspk] = {
        "upstream": upstream_model,
        "featurizer": upstream_featurizer,
        "decoder": model,
        "vocoder": vocoder,
    }
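
# Each target speaker thus gets its own pipeline: S3R upstream -> learned
# featurizer (a weighting over upstream layers) -> any-to-one decoder -> vocoder.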

def predict(trgspk, wav_file):
    x, fs = librosa.load(wav_file, sr=16000)
    logger.info('wav file loaded')
    with torch.no_grad():
        start_time = time.time()
        xs = torch.from_numpy(x).unsqueeze(0).float().to(DEVICE)
        ilens = torch.LongTensor([x.shape[0]]).to(DEVICE)
        all_hs, all_hlens = vc_models[trgspk]["upstream"](xs, ilens)
        logger.info('upstream done')
        hs, hlens = vc_models[trgspk]["featurizer"](all_hs, all_hlens)
        logger.info('featurizer done')
        outs, _ = vc_models[trgspk]["decoder"](hs, hlens, spk_embs=None)
        logger.info('downstream done')
        out = outs[0]
        y, sr = vc_models[trgspk]["vocoder"].decode(out)
        logger.info('vocoder done')
        sf.write(
            "out.wav",
            y.cpu().numpy(),
            24000,
            "PCM_16",
        )
        logger.info('write done')
        logger.info('RTF={}'.format(
            (time.time() - start_time) / (len(x) / 16000)
        ))
    return "out.wav"
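
# A minimal sketch of running a conversion outside the Gradio UI (the input path
# below is hypothetical; librosa resamples any readable audio file to 16 kHz):
#
#   out_path = predict("TEF1", "my_recording.wav")  # writes and returns "out.wav"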

with gr.Blocks(title="S3PRL-VC: Any-to-one voice conversion demo on VCC2020") as demo:
    gr.Markdown(
        """
# S3PRL-VC: Any-to-one voice conversion demo on VCC2020
### [[Paper (ICASSP2023)]](https://arxiv.org/abs/2110.06280) [[Paper (JSTSP)]](https://arxiv.org/abs/2207.04356) [[Code]](https://github.com/unilight/s3prl-vc)

**S3PRL-VC** is a voice conversion (VC) toolkit for benchmarking self-supervised speech representations (S3Rs). The term **any-to-one** means that the system can convert speech from any unseen speaker into the voice of a pre-defined target speaker seen during training.

In this demo, you can record your voice, and the model will convert it into one of four pre-defined speakers. These four speakers come from the **Voice Conversion Challenge (VCC) 2020**. You can listen to the samples to get a sense of what they sound like.

The **RTF** of the system is around **1.5~2.5**, i.e. if you record a 5-second clip, generating the output takes about 5 * (1.5~2.5) = 7.5~12.5 seconds.
"""
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Record your speech here!")
            input_wav = gr.Audio(label="Source speech", source='microphone', type='filepath')
            gr.Markdown("## Select a target speaker!")
            trgspk = gr.Radio(label="Target speaker", choices=["TEF1", "TEF2", "TEM1", "TEM2"])
            gr.Markdown("### Here is what the target speaker sounds like!")
            ref_sample_wav1 = gr.Audio(label="Sample 1", type="filepath")
            ref_sample_wav2 = gr.Audio(label="Sample 2", type="filepath")
            trgspk.change(
                lambda trgspk: ref_samples[trgspk],
                inputs=trgspk,
                outputs=[ref_sample_wav1, ref_sample_wav2],
            )
            convert_btn = gr.Button(value="Convert!")
| gr.Markdown("### You can use these examples if using a microphone is too troublesome!") | |
| gr.Markdown("I recorded the samples using my Macbook Pro, so there might be some noises.") | |
            gr.Examples(
                examples=en_examples,
                inputs=input_wav,
                label="English examples"
            )
            gr.Examples(
                examples=jp_examples,
                inputs=input_wav,
                label="Japanese examples"
            )
            gr.Examples(
                examples=zh_examples,
                inputs=input_wav,
                label="Mandarin examples"
            )
        with gr.Column():
            gr.Markdown("## Listen to the converted speech here!")
            output_wav = gr.Audio(type="filepath", label="Converted speech")
            convert_btn.click(predict, [trgspk, input_wav], output_wav)

if __name__ == '__main__':
    try:
        demo.launch(
            debug=True,
            enable_queue=True,
        )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        demo.close()