import os
from glob import glob
from loguru import logger
import soundfile as sf
import librosa
import gradio as gr
from huggingface_hub import hf_hub_download
import time
import torch
import yaml

from s3prl_vc.upstream.interface import get_upstream
from s3prl.nn import Featurizer
import s3prl_vc.models
from s3prl_vc.utils import read_hdf5
from s3prl_vc.vocoder import Vocoder

# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'

SERVER_PORT = 42208
SERVER_NAME = "0.0.0.0"
SSL_DIR = './keyble_ssl'

EXAMPLE_DIR = './examples'
en_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "en", '*.wav')))
jp_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "jp", '*.wav')))
zh_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "zh", '*.wav')))

TRGSPKS = ["TEF1", "TEF2", "TEM1", "TEM2"]
ref_samples = {
    trgspk: sorted(glob(os.path.join("./ref_samples", trgspk, '*.wav')))
    for trgspk in TRGSPKS
}
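# NOTE: the example and reference wavs are read from the local ./examples/{en,jp,zh}
# and ./ref_samples/<speaker> directories; the UI below expects two reference wavs
# per target speaker (shown as "Sample 1" / "Sample 2").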

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Download models ----------
logger.info('============================= Download models ===========================')
vocoder_paths = {
    "ckpt": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="checkpoint-2500000steps.pkl"),
    "config": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="config.yml"),
    "stats": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="stats.h5"),
}
vc_model_paths = {
    trgspk: {
        "ckpt": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/checkpoint-10000steps.pkl"),
        "config": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/config.yml"),
        "stats": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/stats.h5"),
    }
    for trgspk in TRGSPKS
}
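# hf_hub_download returns the local path of each file in the Hugging Face cache,
# so restarting the app reuses previously downloaded checkpoints instead of
# fetching them again.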

# ---------- Model ----------
vc_models = {}
for trgspk in TRGSPKS:
    logger.info(f'============================= Setting up model for {trgspk} =============')
    checkpoint_path = vc_model_paths[trgspk]["ckpt"]
    config_path = vc_model_paths[trgspk]["config"]
    stats_path = vc_model_paths[trgspk]["stats"]

    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config["trg_stats"] = {
        "mean": torch.from_numpy(read_hdf5(stats_path, "mean")).float().to(DEVICE),
        "scale": torch.from_numpy(read_hdf5(stats_path, "scale")).float().to(DEVICE),
    }
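    # The statistics above are the target speaker's mel mean/scale; they are passed
    # to both the decoder and the vocoder below for (de)normalizing the predicted
    # mel features.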

    # define upstream model
    upstream_model = get_upstream(config["upstream"]).to(DEVICE)
    upstream_model.eval()
    upstream_featurizer = Featurizer(upstream_model).to(DEVICE)
    upstream_featurizer.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")["featurizer"]
    )
    upstream_featurizer.eval()

    # get model and load parameters
    model_class = getattr(s3prl_vc.models, config["model_type"])
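    # The third constructor argument below is the frame-rate ratio between the
    # target mel features (sampling_rate / hop_size frames per second) and the
    # 16 kHz upstream features (16000 / downsample_rate frames per second).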
    model = model_class(
        upstream_featurizer.output_size,
        config["num_mels"],
        config["sampling_rate"] / config["hop_size"] * upstream_featurizer.downsample_rate / 16000,
        config["trg_stats"],
        use_spemb=config.get("use_spk_emb", False),
        **config["model_params"],
    ).to(DEVICE)
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"])
    model = model.eval().to(DEVICE)
    logger.info(f"Loaded model parameters from {checkpoint_path}.")

    # load vocoder
    vocoder = Vocoder(
        vocoder_paths["ckpt"],
        vocoder_paths["config"],
        vocoder_paths["stats"],
        config["trg_stats"],
        DEVICE,
    )
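    # The vocoder above is a HiFi-GAN model trained on VCTK + VCC2020; it converts
    # predicted mel spectrograms back into waveforms.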

    vc_models[trgspk] = {
        "upstream": upstream_model,
        "featurizer": upstream_featurizer,
        "decoder": model,
        "vocoder": vocoder,
    }
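
# Each target speaker thus gets its own pipeline: S3R upstream -> learned
# featurizer (a weighting over upstream layers) -> any-to-one decoder -> vocoder.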

def predict(trgspk, wav_file):
    x, fs = librosa.load(wav_file, sr=16000)
    logger.info('wav file loaded')
    with torch.no_grad():
        start_time = time.time()
        xs = torch.from_numpy(x).unsqueeze(0).float().to(DEVICE)
        ilens = torch.LongTensor([x.shape[0]]).to(DEVICE)
        all_hs, all_hlens = vc_models[trgspk]["upstream"](xs, ilens)
        logger.info('upstream done')
        hs, hlens = vc_models[trgspk]["featurizer"](all_hs, all_hlens)
        logger.info('featurizer done')
        outs, _ = vc_models[trgspk]["decoder"](hs, hlens, spk_embs=None)
        logger.info('downstream done')
        out = outs[0]
        y, sr = vc_models[trgspk]["vocoder"].decode(out)
        logger.info('vocoder done')
        sf.write(
            "out.wav",
            y.cpu().numpy(),
            24000,
            "PCM_16",
        )
        logger.info('write done')
        logger.info('RTF={}'.format(
            (time.time() - start_time) / (len(x) / 16000)
        ))
    return "out.wav"
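
# A minimal sketch of running a conversion outside the Gradio UI (the input path
# below is hypothetical; librosa resamples any readable audio file to 16 kHz):
#
#   out_path = predict("TEF1", "my_recording.wav")  # writes and returns "out.wav"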

with gr.Blocks(title="S3PRL-VC: Any-to-one voice conversion demo on VCC2020") as demo:
    gr.Markdown(
        """
# S3PRL-VC: Any-to-one voice conversion demo on VCC2020
### [[Paper (ICASSP2023)]](https://arxiv.org/abs/2110.06280) [[Paper (JSTSP)]](https://arxiv.org/abs/2207.04356) [[Code]](https://github.com/unilight/s3prl-vc)

**S3PRL-VC** is a voice conversion (VC) toolkit for benchmarking self-supervised speech representations (S3Rs). The term **any-to-one** means that the system can convert speech from any unseen speaker into the voice of a pre-defined target speaker seen during training.

In this demo, you can record your voice, and the model will convert it into one of four pre-defined speakers. These four speakers come from the **Voice Conversion Challenge (VCC) 2020**. You can listen to the samples to get a sense of what they sound like.

The **RTF** of the system is around **1.5~2.5**, i.e. if you record a 5-second clip, generating the output takes about 5 * (1.5~2.5) = 7.5~12.5 seconds.
"""
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Record your speech here!")
            input_wav = gr.Audio(label="Source speech", source='microphone', type='filepath')
            gr.Markdown("## Select a target speaker!")
            trgspk = gr.Radio(label="Target speaker", choices=["TEF1", "TEF2", "TEM1", "TEM2"])
            gr.Markdown("### Here is what the target speaker sounds like!")
            ref_sample_wav1 = gr.Audio(label="Sample 1", type="filepath")
            ref_sample_wav2 = gr.Audio(label="Sample 2", type="filepath")
            trgspk.change(
                lambda trgspk: ref_samples[trgspk],
                inputs=trgspk,
                outputs=[ref_sample_wav1, ref_sample_wav2],
            )
            convert_btn = gr.Button(value="Convert!")
| gr.Markdown("### You can use these examples if using a microphone is too troublesome!") | |
| gr.Markdown("I recorded the samples using my Macbook Pro, so there might be some noises.") | |
            gr.Examples(
                examples=en_examples,
                inputs=input_wav,
                label="English examples"
            )
            gr.Examples(
                examples=jp_examples,
                inputs=input_wav,
                label="Japanese examples"
            )
            gr.Examples(
                examples=zh_examples,
                inputs=input_wav,
                label="Mandarin examples"
            )
        with gr.Column():
            gr.Markdown("## Listen to the converted speech here!")
            output_wav = gr.Audio(type="filepath", label="Converted speech")
            convert_btn.click(predict, [trgspk, input_wav], output_wav)

if __name__ == '__main__':
    try:
        demo.launch(
            debug=True,
            enable_queue=True,
        )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        demo.close()