import argparse
import json
import re
from argparse import RawTextHelpFormatter

import fsspec
import numpy as np
import torch

from .models.lstm import LSTMSpeakerEncoder
from .config import SpeakerEncoderConfig
from .utils.audio import AudioProcessor
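
# NOTE: the relative imports above mean this file must be run as a module
# inside its package (python -m <package>.<module>); the exact package path
# depends on the repository layout.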


def read_json(json_path):
    """Read a JSON config file, falling back to the comment-tolerant
    reader for older configs that are not strict JSON."""
    config_dict = {}
    try:
        with fsspec.open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.decoder.JSONDecodeError:
        # backwards compat.
        data = read_json_with_comments(json_path)
    config_dict.update(data)
    return config_dict


def read_json_with_comments(json_path):
    """Read JSON that may contain line continuations and // comments,
    for backward compatibility with older config files."""
    with fsspec.open(json_path, "r", encoding="utf-8") as f:
        input_str = f.read()
    # strip escaped line continuations, then // comments
    input_str = re.sub(r"\\\n", "", input_str)
    input_str = re.sub(r"//.*\n", "\n", input_str)
    data = json.loads(input_str)
    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each wav file in a dataset.""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
    parser.add_argument(
        "config_path",
        type=str,
        help="Path to model config file.",
    )
    parser.add_argument("-s", "--source", help="input wave file", dest="source")
    parser.add_argument(
        "-t", "--target", help="output 256-d speaker embedding (.npy)", dest="target"
    )
    # type=bool would treat any non-empty string (even "False") as True, so
    # use real boolean flags instead (requires Python 3.9+).
    parser.add_argument(
        "--use_cuda", action=argparse.BooleanOptionalAction, default=True, help="flag to use cuda."
    )
    parser.add_argument(
        "--eval", action=argparse.BooleanOptionalAction, default=True, help="load checkpoint in eval mode."
    )
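
    # Example invocation (paths and module name are illustrative):
    #   python -m <package>.<module> model.pth config.json \
    #       -s speaker.wav -t speaker_embedding.npy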
    args = parser.parse_args()
    source_file = args.source
    target_file = args.target

    # config
    config_dict = read_json(args.config_path)

    # model
    config = SpeakerEncoderConfig()
    config.from_dict(config_dict)
    speaker_encoder = LSTMSpeakerEncoder(
        config.model_params["input_dim"],
        config.model_params["proj_dim"],
        config.model_params["lstm_dim"],
        config.model_params["num_lstm_layers"],
    )
    # wire the --eval flag through instead of hard-coding eval=True
    speaker_encoder.load_checkpoint(args.model_path, eval=args.eval, use_cuda=args.use_cuda)
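
    # After load_checkpoint the encoder should already be in inference mode
    # (and on the GPU when use_cuda is set), so no further .eval()/.cuda()
    # calls are needed here.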

    # preprocess
    speaker_encoder_ap = AudioProcessor(**config.audio)
    # normalize the input audio level and trim silences
    speaker_encoder_ap.do_sound_norm = True
    speaker_encoder_ap.do_trim_silence = True

    # compute the speaker embedding
    waveform = speaker_encoder_ap.load_wav(
        source_file, sr=speaker_encoder_ap.sample_rate
    )
    spec = speaker_encoder_ap.melspectrogram(waveform)
    spec = torch.from_numpy(spec.T)
    if args.use_cuda:
        spec = spec.cuda()
    spec = spec.unsqueeze(0)
    embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy()
    embed = embed.squeeze()
    np.save(target_file, embed, allow_pickle=False)
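
    # The result is a 1-D float vector of length proj_dim; np.save appends a
    # ".npy" extension if target_file does not already have one.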

    # also re-save a slim checkpoint containing only the model weights,
    # unwrapping a (Distributed)DataParallel container if present
    if hasattr(speaker_encoder, "module"):
        state_dict = speaker_encoder.module.state_dict()
    else:
        state_dict = speaker_encoder.state_dict()
    torch.save({"model": state_dict}, "model_small.pth")