Remove CLVP for lower GPU usage and increased speed.
- app.py +3 -15
- tortoise/api.py +42 -180
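For context: upstream Tortoise generates a batch of autoregressive candidates and re-ranks them with CLVP (a CLIP-style text/speech-token scoring model), optionally blended with CVVP. This commit deletes that ranking stage and generates exactly one candidate, so the CLVP/CVVP weights never need to be loaded onto the GPU. A minimal sketch of the selection step being removed, using dummy tensors (shapes are illustrative; this is not the upstream code):

import torch

# Dummy stand-ins: 16 candidate mel-token sequences plus one CLVP score per candidate.
num_candidates, seq_len, k = 16, 500, 1
samples = torch.randint(0, 8192, (num_candidates, seq_len))  # candidate codes
clip_results = torch.randn(num_candidates)                   # text/speech similarity scores

# Old behavior: keep the k best-scoring candidates (the torch.topk line the diff removes).
best_results = samples[torch.topk(clip_results, k=k).indices]

# New behavior: only one candidate is ever generated, so it is used as-is.
single_result = samples[:1]
print(best_results.shape, single_result.shape)  # torch.Size([1, 500]) torch.Size([1, 500])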
app.py CHANGED

@@ -40,7 +40,6 @@ VOICE_OPTIONS = [
     "william",
     "jane_eyre",
     "random",  # special option for random voice
-    "disabled",  # special option for disabled voice
 ]
 
 
@@ -49,7 +48,6 @@ def inference(
     script,
     voice,
     voice_b,
-    preset,
     seed,
     split_by_newline,
 ):
@@ -81,7 +79,7 @@ def inference(
         text,
         voice_samples=voice_samples,
         conditioning_latents=conditioning_latents,
-        preset=preset,
+        preset="ultra_fast",
         k=1,
         use_deterministic_seed=seed,
     )
@@ -91,12 +89,9 @@ def inference(
 
     full_audio = torch.cat(all_parts, dim=-1)
 
-    # os.makedirs("outputs", exist_ok=True)
-    # torchaudio.save(os.path.join("outputs", f"{name}.wav"), full_audio, 24000)
-
     with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
         f.write(
-            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} |
+            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
         )
 
     output_texts = [f"({j+1}) {texts[j]}" for j in range(len(texts))]
@@ -120,14 +115,8 @@ def main():
     )
     script = gr.File(label="Upload a text file")
 
-    preset = gr.Radio(
-        ["ultra_fast", "fast", "standard", "high_quality"],
-        value="fast",
-        label="Preset mode (determines quality with tradeoff over speed):",
-        type="value",
-    )
     voice = gr.Dropdown(
-        VOICE_OPTIONS, value="
+        VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
     )
     voice_b = gr.Dropdown(
         VOICE_OPTIONS,
@@ -154,7 +143,6 @@ def main():
             script,
             voice,
             voice_b,
-            preset,
             seed,
             split_by_newline,
         ],
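Net effect on the app: the preset radio button is gone and synthesis always runs with preset="ultra_fast". A hedged sketch of the call the app now makes (the wrapper is presumably tortoise's tts_with_preset, judging by the argument names in the diff; the voice name and text here are placeholders):

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()
voice_samples, conditioning_latents = load_voice("jane_eyre")
gen = tts.tts_with_preset(
    "Text to synthesize.",
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset="ultra_fast",          # hard-coded now that the UI option is removed
    k=1,
    use_deterministic_seed=None,  # the app forwards its seed input here
)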
tortoise/api.py CHANGED

@@ -252,13 +252,6 @@ class TextToSpeech:
                                       layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
 
-        self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
-                         text_seq_len=350, text_heads=12,
-                         num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
-                         use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
-        self.cvvp = None  # CVVP model is only loaded if used.
-
         self.vocoder = UnivNetGenerator().cpu()
         self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
@@ -272,13 +265,6 @@ class TextToSpeech:
             yield m
             m = model.cpu()
 
-
-    def load_cvvp(self):
-        """Load CVVP model."""
-        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
-                         speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
-        self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
-
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@@ -341,8 +327,9 @@ class TextToSpeech:
                     'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
         # Presets are defined here.
         presets = {
-            'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
-            'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
+            'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
+            # 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
+            'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
         }
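The retuned presets are what make the CLVP removal workable: with 'num_autoregressive_samples': 1 there is nothing left to rank. For reference, tts_with_preset in upstream Tortoise merges the chosen preset into a dict of defaults before calling tts(); a simplified sketch of that mechanism (defaults abridged):

presets = {
    'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
    'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
}
settings = {'temperature': 0.8, 'length_penalty': 1.0, 'top_p': 0.8}  # abridged defaults
settings.update(presets['ultra_fast'])  # preset values override/extend the defaults
# tts(text, **settings) then runs a single AR sample and 15 diffusion steps.
print(settings['num_autoregressive_samples'], settings['diffusion_iterations'])  # 1 15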
@@ -422,182 +409,57 @@ class TextToSpeech:
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
         with torch.no_grad():
-            samples = []
-            num_batches = num_autoregressive_samples // self.autoregressive_batch_size
+
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
                 print("Generating autoregressive samples..")
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.autoregressive
-                ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
-            else:
-                with self.temporary_cuda(self.autoregressive) as autoregressive:
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
-
-            clip_results = []
-
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
-                ):
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-            else:
-                with self.temporary_cuda(self.clvp) as clvp:
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-                if self.cvvp is not None:
-                    self.cvvp = self.cvvp.cpu()
-            del samples
-
+            with self.temporary_cuda(self.autoregressive
+            ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
+                codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                        do_sample=True,
+                                                        top_p=top_p,
+                                                        temperature=temperature,
+                                                        num_return_sequences=num_autoregressive_samples,
+                                                        length_penalty=length_penalty,
+                                                        repetition_penalty=repetition_penalty,
+                                                        max_generate_length=max_mel_tokens,
+                                                        **hf_generate_kwargs)
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
-                ):
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                    del auto_conditioning
-            else:
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive:
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                del auto_conditioning
+            with self.temporary_cuda(
+                self.autoregressive
+            ) as autoregressive, torch.autocast(
+                device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+            ):
+                best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                              torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
+                                              torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                              return_latent=True, clip_inputs=False)
+                del auto_conditioning
 
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
-                    self.vocoder
-                ) as vocoder:
-                    for b in range(best_results.shape[0]):
-                        codes = best_results[b].unsqueeze(0)
-                        latents = best_latents[b].unsqueeze(0)
-
-                        # Find the first occurrence of the "calm" token and trim the codes to that.
+            with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
+                self.vocoder
+            ) as vocoder:
+                latents = best_latents
+                # Find the first occurrence of the "calm" token and trim the codes to that.
+                ctokens = 0
+                for k in range(codes.shape[-1]):
+                    if codes[0, k] == calm_token:
+                        ctokens += 1
+                    else:
                         ctokens = 0
-                        for k in range(codes.shape[-1]):
-                            if codes[0, k] == calm_token:
-                                ctokens += 1
-                            else:
-                                ctokens = 0
-                            if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                                latents = latents[:, :k]
-                                break
-                        mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                       verbose=verbose)
-                        wav = vocoder.inference(mel)
-                        wav_candidates.append(wav.cpu())
-            else:
-                diffusion, vocoder = self.diffusion, self.vocoder
-                diffusion_conditioning = diffusion_conditioning.cpu()
-                for b in range(best_results.shape[0]):
-                    codes = best_results[b].unsqueeze(0).cpu()
-                    latents = best_latents[b].unsqueeze(0).cpu()
-
-                    # Find the first occurrence of the "calm" token and trim the codes to that.
-                    ctokens = 0
-                    for k in range(codes.shape[-1]):
-                        if codes[0, k] == calm_token:
-                            ctokens += 1
-                        else:
-                            ctokens = 0
-                        if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                            latents = latents[:, :k]
-                            break
-                    mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                   verbose=verbose)
-                    wav = vocoder.inference(mel)
-                    wav_candidates.append(wav.cpu())
+                    if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                        latents = latents[:, :k]
+                        break
+                mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
+                                               verbose=verbose)
+                wav = vocoder.inference(mel)
+                wav_candidates.append(wav.cpu())
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
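With a single candidate, the old per-candidate loop collapses, but the calm-token trimming survives: the latents are cut at the first run of more than 8 consecutive silence tokens so the diffusion model has room to end the utterance. A standalone, runnable illustration with dummy data (the latent width of 1024 is hypothetical):

import torch

calm_token = 83  # the token Tortoise uses to code silence
codes = torch.zeros(1, 500, dtype=torch.long)  # dummy mel-token codes
codes[0, 300:] = calm_token                    # pretend speech ends at position 300
latents = torch.randn(1, 500, 1024)            # matching AR latents

ctokens = 0
for k in range(codes.shape[-1]):
    if codes[0, k] == calm_token:
        ctokens += 1
    else:
        ctokens = 0
    if ctokens > 8:               # more than 8 consecutive calm tokens: trim here
        latents = latents[:, :k]
        break
print(latents.shape)  # torch.Size([1, 308, 1024])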
|