Update app.py

app.py CHANGED
@@ -28,6 +28,8 @@ from infer.utils_infer import (
     save_spectrogram,
 )
 from tokenizers import Tokenizer
+from phonemizer import phonemize
+
 from transformers import pipeline
 import click
 import soundfile as sf
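Note: the new dependency is the phonemizer package, which shells out to espeak / espeak-ng, so that backend must be installed in the Space image. A minimal sketch of what the imported function returns (the exact IPA transcription varies with the espeak version):

    from phonemizer import phonemize

    # with_stress=True keeps primary/secondary stress marks in the IPA output
    ipa = phonemize("hello world", language='en-gb', backend='espeak', with_stress=True)
    print(ipa)  # something like "həlˈəʊ wˈɜːld"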
@@ -131,7 +133,7 @@ F5TTS_ema_model = load_custom(
     "hf://Gregniuki/F5-tts_English_German_Polish/English/model_222600.pt", "", F5TTS_model_cfg
 )
 E2TTS_ema_model = load_custom(
-    "hf://Gregniuki/F5-tts_English_German_Polish/
+    "hf://Gregniuki/F5-tts_English_German_Polish/multi/model_300000.pt", "", F5TTS_model_cfg
 )
 E2TTS_ema_model2 = load_custom(
     "hf://Gregniuki/F5-tts_English_German_Polish/Polish/model_500000.pt", "", F5TTS_model_cfg
@@ -202,11 +204,24 @@ def chunk_text(text, max_chars):
     return chunks
 
 
-
+def text_to_ipa(text, language='en-gb'):
+    try:
+        ipa_text = phonemize(
+            text,
+            language=language,
+            backend='espeak',
+            strip=False,
+            preserve_punctuation=True,
+            with_stress=True
+        )
+        return ipa_text  # preserve_case(text, ipa_text)
+    except Exception as e:
+        print(f"Error processing text: {text}. Error: {e}")
+        return None
 
 
 @gpu_decorator
-def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
+def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, language='en-gb', progress=gr.Progress()):
     if exp_name == "English":
         ema_model = F5TTS_ema_model
     elif exp_name == "Polish":
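Note: `text_to_ipa` returns None when phonemization fails, and the batch loop below concatenates its results directly, which would raise a TypeError on None. A hypothetical smoke test (the Polish code 'pl' is a standard espeak language id, not something this commit defines):

    ipa = text_to_ipa("Dzień dobry", language='pl')
    if ipa is None:
        raise RuntimeError("phonemization failed; is espeak-ng installed?")
    print(ipa)  # IPA with punctuation and stress marks preserved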
@@ -247,7 +262,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
         # Prepare the text
-
+        ipa_text_ref = text_to_ipa(ref_text, language=language)
+        ipa_text_gen = text_to_ipa(gen_text, language=language)
+
+
+        text_list = [ipa_text_ref + ipa_text_gen]
+
         encoding = tokenizer.encode(text_list)
         tokens = encoding.tokens
         text_list = ' '.join(map(str, tokens))
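Note: `Tokenizer.encode` from the tokenizers library takes a single string (a list is only accepted with is_pretokenized=True, so the one-element list above deserves a second look), and `Encoding.tokens` holds the token strings that the loop rejoins with spaces. A minimal sketch, assuming a hypothetical tokenizer.json path:

    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizer.json")  # hypothetical path
    enc = tok.encode("həlˈəʊ wˈɜːld")
    print(enc.tokens)            # token strings, depending on the vocab
    print(' '.join(enc.tokens))  # space-joined form fed to the model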
@@ -384,7 +404,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     return (target_sample_rate, final_wave), spectrogram_path
 
 @gpu_decorator
-def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15 # Set the desired language code dynamically
+def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15, language='en-gb' # Set the desired language code dynamically
 ):
 
     print(gen_text)
@@ -442,7 +462,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
         print(f'gen_text {i}', batch_text)
 
     gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
-    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
+    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration, language)
 
 
 @gpu_decorator
@@ -468,7 +488,7 @@ def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name
             continue  # Skip if the speaker is neither speaker1 nor speaker2
 
         # Generate audio for this block
-        audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence)
+        audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence, language=language)
 
         # Convert the generated audio to a numpy array
         sr, audio_data = audio
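Note: because `infer` declares `cross_fade_duration` before `language`, a language code passed as the sixth positional argument would bind to the cross-fade slot, so the keyword form keeps the call unambiguous:

    # positional: 'en-gb' would land in cross_fade_duration
    # infer(ref_audio, ref_text, text, exp_name, remove_silence, 'en-gb')

    # keyword: unambiguous
    audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence, language='en-gb')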
@@ -580,6 +600,7 @@ with gr.Blocks() as app_tts:
             model_choice,
             remove_silence,
             cross_fade_duration_slider,
+            gr.State('en-gb'),  # language code passed as a constant input
         ],
         outputs=[audio_output, spectrogram_output],
     )
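Note: a Gradio inputs list must contain components, not keyword arguments, so a fixed language code has to be wrapped in a component. A minimal sketch using gr.State to carry a constant into the handler (assuming the Blocks API):

    import gradio as gr

    with gr.Blocks() as demo:
        btn = gr.Button("Generate")
        out = gr.Textbox()
        # gr.State carries a constant value into the handler alongside UI inputs
        btn.click(lambda lang: f"language={lang}", inputs=[gr.State('en-gb')], outputs=[out])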
@@ -786,7 +807,7 @@ with gr.Blocks() as app_emotional:
             ref_text = speech_types[current_emotion].get('ref_text', '')
 
             # Generate speech for this segment
-            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence)
+            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, language=language)
             sr, audio_data = audio
 
             # generated_audio_segments.append(audio_data)