Spaces:
Build error
Build error
update
Browse files
checkpoints/ps_normal_exp/config.yaml
CHANGED
|
@@ -82,7 +82,6 @@ fvae_kernel_size: 5
|
|
| 82 |
fvae_noise_scale: 1.0
|
| 83 |
fvae_strides: 4
|
| 84 |
gen_dir_name: ''
|
| 85 |
-
glow_kernel_size: 3
|
| 86 |
griffin_lim_iters: 30
|
| 87 |
hidden_size: 192
|
| 88 |
hop_size: 256
|
|
@@ -127,8 +126,6 @@ out_wav_norm: false
|
|
| 127 |
pitch_extractor: parselmouth
|
| 128 |
pitch_key: pitch
|
| 129 |
pitch_type: frame
|
| 130 |
-
post_decoder: false
|
| 131 |
-
post_decoder_detach_ling: false
|
| 132 |
post_flow_lr: 0.001
|
| 133 |
post_glow_hidden: 192
|
| 134 |
post_glow_kernel_size: 3
|
|
@@ -157,8 +154,9 @@ preprocess_args:
|
|
| 157 |
with_phsep: true
|
| 158 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
| 159 |
print_nan_grads: false
|
| 160 |
-
|
| 161 |
-
|
|
|
|
| 162 |
processed_data_dir: data/processed/ljspeech
|
| 163 |
profile_infer: false
|
| 164 |
raw_data_dir: data/raw/LJSpeech-1.1
|
|
|
|
| 82 |
fvae_noise_scale: 1.0
|
| 83 |
fvae_strides: 4
|
| 84 |
gen_dir_name: ''
|
|
|
|
| 85 |
griffin_lim_iters: 30
|
| 86 |
hidden_size: 192
|
| 87 |
hop_size: 256
|
|
|
|
| 126 |
pitch_extractor: parselmouth
|
| 127 |
pitch_key: pitch
|
| 128 |
pitch_type: frame
|
|
|
|
|
|
|
| 129 |
post_flow_lr: 0.001
|
| 130 |
post_glow_hidden: 192
|
| 131 |
post_glow_kernel_size: 3
|
|
|
|
| 154 |
with_phsep: true
|
| 155 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
| 156 |
print_nan_grads: false
|
| 157 |
+
prior_flow_hidden: 64
|
| 158 |
+
prior_flow_kernel_size: 3
|
| 159 |
+
prior_flow_n_blocks: 4
|
| 160 |
processed_data_dir: data/processed/ljspeech
|
| 161 |
profile_infer: false
|
| 162 |
raw_data_dir: data/raw/LJSpeech-1.1
|
checkpoints/ps_small_exp/config.yaml
CHANGED
|
@@ -82,7 +82,6 @@ fvae_kernel_size: 3
|
|
| 82 |
fvae_noise_scale: 1.0
|
| 83 |
fvae_strides: 4
|
| 84 |
gen_dir_name: ''
|
| 85 |
-
glow_kernel_size: 3
|
| 86 |
griffin_lim_iters: 30
|
| 87 |
hidden_size: 128
|
| 88 |
hop_size: 256
|
|
@@ -127,8 +126,6 @@ out_wav_norm: false
|
|
| 127 |
pitch_extractor: parselmouth
|
| 128 |
pitch_key: pitch
|
| 129 |
pitch_type: frame
|
| 130 |
-
post_decoder: false
|
| 131 |
-
post_decoder_detach_ling: false
|
| 132 |
post_flow_lr: 0.001
|
| 133 |
post_glow_hidden: 128
|
| 134 |
post_glow_kernel_size: 3
|
|
@@ -157,8 +154,9 @@ preprocess_args:
|
|
| 157 |
with_phsep: true
|
| 158 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
| 159 |
print_nan_grads: false
|
| 160 |
-
|
| 161 |
-
|
|
|
|
| 162 |
processed_data_dir: data/processed/ljspeech
|
| 163 |
profile_infer: false
|
| 164 |
raw_data_dir: data/raw/LJSpeech-1.1
|
|
|
|
| 82 |
fvae_noise_scale: 1.0
|
| 83 |
fvae_strides: 4
|
| 84 |
gen_dir_name: ''
|
|
|
|
| 85 |
griffin_lim_iters: 30
|
| 86 |
hidden_size: 128
|
| 87 |
hop_size: 256
|
|
|
|
| 126 |
pitch_extractor: parselmouth
|
| 127 |
pitch_key: pitch
|
| 128 |
pitch_type: frame
|
|
|
|
|
|
|
| 129 |
post_flow_lr: 0.001
|
| 130 |
post_glow_hidden: 128
|
| 131 |
post_glow_kernel_size: 3
|
|
|
|
| 154 |
with_phsep: true
|
| 155 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
| 156 |
print_nan_grads: false
|
| 157 |
+
prior_flow_hidden: 32
|
| 158 |
+
prior_flow_kernel_size: 3
|
| 159 |
+
prior_flow_n_blocks: 3
|
| 160 |
processed_data_dir: data/processed/ljspeech
|
| 161 |
profile_infer: false
|
| 162 |
raw_data_dir: data/raw/LJSpeech-1.1
|
egs/egs_bases/tts/ps.yaml
CHANGED
|
@@ -38,14 +38,12 @@ fvae_enc_n_layers: 8
|
|
| 38 |
fvae_dec_n_layers: 4
|
| 39 |
fvae_strides: 4
|
| 40 |
fvae_noise_scale: 1.0
|
| 41 |
-
post_decoder: false
|
| 42 |
-
post_decoder_detach_ling: false
|
| 43 |
|
| 44 |
# prior flow
|
| 45 |
use_prior_flow: true
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
|
| 50 |
###########################
|
| 51 |
# training and inference
|
|
|
|
| 38 |
fvae_dec_n_layers: 4
|
| 39 |
fvae_strides: 4
|
| 40 |
fvae_noise_scale: 1.0
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# prior flow
|
| 43 |
use_prior_flow: true
|
| 44 |
+
prior_flow_hidden: 64
|
| 45 |
+
prior_flow_kernel_size: 3
|
| 46 |
+
prior_flow_n_blocks: 4
|
| 47 |
|
| 48 |
###########################
|
| 49 |
# training and inference
|
egs/egs_bases/tts/ps_flow_small.yaml
CHANGED
|
@@ -30,9 +30,9 @@ fvae_noise_scale: 1.0
|
|
| 30 |
|
| 31 |
# prior flow
|
| 32 |
use_prior_flow: true
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
# post flow
|
| 37 |
post_glow_hidden: 128
|
| 38 |
post_glow_kernel_size: 3
|
|
|
|
| 30 |
|
| 31 |
# prior flow
|
| 32 |
use_prior_flow: true
|
| 33 |
+
prior_flow_hidden: 32
|
| 34 |
+
prior_flow_kernel_size: 3
|
| 35 |
+
prior_flow_n_blocks: 3
|
| 36 |
# post flow
|
| 37 |
post_glow_hidden: 128
|
| 38 |
post_glow_kernel_size: 3
|
modules/tts/portaspeech/portaspeech.py
CHANGED
|
@@ -74,9 +74,9 @@ class PortaSpeech(FastSpeech):
|
|
| 74 |
dec_n_layers=hparams['fvae_dec_n_layers'],
|
| 75 |
c_cond=self.hidden_size,
|
| 76 |
use_prior_flow=hparams['use_prior_flow'],
|
| 77 |
-
flow_hidden=hparams['prior_glow_hidden'],
|
| 78 |
-
flow_kernel_size=hparams['glow_kernel_size'],
|
| 79 |
-
flow_n_steps=hparams['prior_glow_n_blocks'],
|
| 80 |
strides=[hparams['fvae_strides']],
|
| 81 |
encoder_type=hparams['fvae_encoder_type'],
|
| 82 |
decoder_type=hparams['fvae_decoder_type'],
|
|
@@ -88,11 +88,6 @@ class PortaSpeech(FastSpeech):
|
|
| 88 |
self.pitch_embed = Embedding(300, self.hidden_size, 0)
|
| 89 |
if self.hparams['add_word_pos']:
|
| 90 |
self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
|
| 91 |
-
if self.hparams['post_decoder']:
|
| 92 |
-
self.post_decoder_proj_in = Linear(self.out_dims, self.hidden_size)
|
| 93 |
-
self.post_decoder = ConditionalConvBlocks(
|
| 94 |
-
self.hidden_size, self.hidden_size, self.out_dims, None,
|
| 95 |
-
hparams['dec_kernel_size'], num_layers=4)
|
| 96 |
|
| 97 |
def build_embedding(self, dictionary, embed_dim):
|
| 98 |
num_embeddings = len(dictionary)
|
|
@@ -188,11 +183,6 @@ class PortaSpeech(FastSpeech):
|
|
| 188 |
z = torch.randn_like(z)
|
| 189 |
x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
|
| 190 |
ret['pre_mel_out'] = x_recon
|
| 191 |
-
if self.hparams['post_decoder']:
|
| 192 |
-
x_recon = self.post_decoder_proj_in(x_recon.detach())
|
| 193 |
-
if self.hparams['post_decoder_detach_ling']:
|
| 194 |
-
decoder_inp = decoder_inp.detach()
|
| 195 |
-
x_recon = self.post_decoder(x_recon, decoder_inp) * tgt_nonpadding
|
| 196 |
return x_recon
|
| 197 |
|
| 198 |
def forward_dur(self, dur_input, mel2word, ret, **kwargs):
|
|
|
|
| 74 |
dec_n_layers=hparams['fvae_dec_n_layers'],
|
| 75 |
c_cond=self.hidden_size,
|
| 76 |
use_prior_flow=hparams['use_prior_flow'],
|
| 77 |
+
flow_hidden=hparams['prior_flow_hidden'],
|
| 78 |
+
flow_kernel_size=hparams['prior_flow_kernel_size'],
|
| 79 |
+
flow_n_steps=hparams['prior_flow_n_blocks'],
|
| 80 |
strides=[hparams['fvae_strides']],
|
| 81 |
encoder_type=hparams['fvae_encoder_type'],
|
| 82 |
decoder_type=hparams['fvae_decoder_type'],
|
|
|
|
| 88 |
self.pitch_embed = Embedding(300, self.hidden_size, 0)
|
| 89 |
if self.hparams['add_word_pos']:
|
| 90 |
self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
def build_embedding(self, dictionary, embed_dim):
|
| 93 |
num_embeddings = len(dictionary)
|
|
|
|
| 183 |
z = torch.randn_like(z)
|
| 184 |
x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
|
| 185 |
ret['pre_mel_out'] = x_recon
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
return x_recon
|
| 187 |
|
| 188 |
def forward_dur(self, dur_input, mel2word, ret, **kwargs):
|
tasks/tts/ps.py
CHANGED
|
@@ -58,8 +58,6 @@ class PortaSpeechTask(FastSpeechTask):
|
|
| 58 |
losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl
|
| 59 |
losses_kl = losses_kl * hparams['lambda_kl']
|
| 60 |
losses['kl'] = losses_kl
|
| 61 |
-
if hparams['post_decoder']:
|
| 62 |
-
self.add_mel_loss(output['pre_mel_out'], sample['mels'], losses, '_post')
|
| 63 |
self.add_mel_loss(output['mel_out'], sample['mels'], losses)
|
| 64 |
if hparams['dur_level'] == 'word':
|
| 65 |
self.add_dur_loss(
|
|
|
|
| 58 |
losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl
|
| 59 |
losses_kl = losses_kl * hparams['lambda_kl']
|
| 60 |
losses['kl'] = losses_kl
|
|
|
|
|
|
|
| 61 |
self.add_mel_loss(output['mel_out'], sample['mels'], losses)
|
| 62 |
if hparams['dur_level'] == 'word':
|
| 63 |
self.add_dur_loss(
|