# MohamedRashad's picture
# Update app.py
# ec9c685 verified
import spaces
import logging
import os
from datetime import datetime
import gradio as gr
import soundfile as sf
import torch
from huggingface_hub import snapshot_download
from SparkTTS import SparkTTS
from sparktts.utils.token_parser import LEVELS_MAP_UI
# Fetch the fine-tuned Arabic Spark-TTS weights from the Hugging Face Hub
# (cached locally after the first call) and load the model once at import time.
download_path = snapshot_download("MrEzzat/Spark_TTS_Arabic")
print(f"Model downloaded to: {download_path}")

# Prefer the first CUDA device when one is available; otherwise fall back to CPU.
_device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = SparkTTS(download_path, device=_device)
print(model)
@spaces.GPU()
def run_tts(
    text,
    prompt_text=None,
    prompt_speech=None,
    gender=None,
    pitch=None,
    speed=None,
    save_dir="example/results",
    sample_rate=16000,
):
    """Perform TTS inference and save the generated audio to a WAV file.

    Args:
        text: Text to synthesize.
        prompt_text: Optional transcript of the reference audio (voice cloning).
            Treated as absent when it has fewer than two characters.
        prompt_speech: Optional path to a reference audio file (voice cloning).
        gender: Optional "male"/"female" selector (voice creation).
        pitch: Optional pitch level (voice creation).
        speed: Optional speed level (voice creation).
        save_dir: Directory the WAV file is written into (created if missing).
        sample_rate: Sample rate passed to the WAV writer. Defaults to 16 kHz,
            which is presumably the model's native output rate — TODO confirm.

    Returns:
        The filesystem path of the saved WAV file.
    """
    logging.info("Saving audio to: %s", save_dir)

    # A prompt transcript shorter than two characters carries no useful
    # information, so drop it and let the model rely on the audio alone.
    if prompt_text is not None and len(prompt_text) <= 1:
        prompt_text = None

    # Ensure the save directory exists.
    os.makedirs(save_dir, exist_ok=True)

    # Generate a unique filename from the current timestamp.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    save_path = os.path.join(save_dir, f"{timestamp}.wav")

    logging.info("Starting inference...")
    # Run inference without tracking gradients, then persist the waveform.
    with torch.no_grad():
        wav = model.inference(
            text,
            prompt_speech,
            prompt_text,
            gender,
            pitch,
            speed,
        )
        sf.write(save_path, wav, samplerate=sample_rate)

    logging.info("Audio saved at: %s", save_path)
    return save_path
# Define callback function for voice cloning
def voice_clone(text, prompt_text, prompt_speech):
    """
    Gradio callback to clone a voice using text and an optional prompt speech.

    - text: The input text to be synthesised.
    - prompt_text: Optional transcript of the prompt audio; may be None or ""
      when the user leaves the field empty.
    - prompt_speech: Path of the reference audio file.

    Returns the path of the generated WAV file.
    """
    # Guard against None (Gradio can pass None for an untouched textbox) as
    # well as near-empty strings; either way the transcript is discarded.
    prompt_text_clean = (
        None if not prompt_text or len(prompt_text) < 2 else prompt_text
    )
    audio_output_path = run_tts(
        text, prompt_text=prompt_text_clean, prompt_speech=prompt_speech
    )
    return audio_output_path
# Define callback function for creating new voices
def voice_creation(text, gender, pitch, speed):
    """
    Gradio callback that synthesizes a brand-new voice from parameters.

    - text: The input text for synthesis.
    - gender: 'male' or 'female'.
    - pitch/speed: Slider positions translated through LEVELS_MAP_UI.

    Returns the path of the generated WAV file.
    """
    # Sliders deliver floats; convert to int before indexing the level map.
    return run_tts(
        text,
        gender=gender,
        pitch=LEVELS_MAP_UI[int(pitch)],
        speed=LEVELS_MAP_UI[int(speed)],
    )
# Build the two-tab Gradio UI: "Voice Creation" (parametric voices) and
# "Voice Clone" (reference-audio cloning).
with gr.Blocks() as app:
    # Use HTML for centered title
    gr.HTML(
        '<h1 style="text-align: center;">Arabic Spark-TTS by <a href="https://huggingface.co/MrEzzat/Spark_TTS_Arabic">MrEzzat</a></h1>'
    )
    with gr.Tabs():
        # Voice Creation Tab
        with gr.TabItem("Voice Creation"):
            gr.Markdown("### Create your own voice based on the following parameters")
            with gr.Row():
                with gr.Column():
                    gender = gr.Radio(
                        choices=["male", "female"], value="male", label="Gender"
                    )
                    pitch = gr.Slider(
                        minimum=1, maximum=5, step=1, value=3, label="Pitch"
                    )
                    speed = gr.Slider(
                        minimum=1, maximum=5, step=1, value=3, label="Speed"
                    )
                with gr.Column():
                    text_input_creation = gr.Textbox(
                        label="Input Text",
                        lines=3,
                        placeholder="ุฃูƒุชุจ ู‡ู†ุง ุงู„ู†ุต ุงู„ุฐูŠ ุชุฑูŠุฏ ุชุญูˆูŠู„ู‡ ุฅู„ู‰ ูƒู„ุงู….",
                        value="ูŠู…ูƒู†ูƒ ุฅู†ุดุงุก ุตูˆุช ู…ุฎุตุต ุนู† ุทุฑูŠู‚ ุถุจุท ู…ุนู„ู…ุงุช ู…ุซู„ ุงู„ู†ุบู…ุฉ ูˆุงู„ุณุฑุนุฉ.",
                        text_align="right",
                        rtl=True,
                    )
                    create_button = gr.Button("Create Voice", variant="primary")
                    # Distinct name per tab: previously both tabs shared one
                    # `audio_output` variable, which worked only because each
                    # .click() was bound before the rebinding — confusing and
                    # fragile under reordering.
                    audio_output_creation = gr.Audio(
                        label="Generated Audio", autoplay=True, streaming=True
                    )
            create_button.click(
                voice_creation,
                inputs=[text_input_creation, gender, pitch, speed],
                outputs=[audio_output_creation],
            )
        # Voice Clone Tab
        with gr.TabItem("Voice Clone"):
            gr.Markdown(
                "## Write the text you want to synthesize on the right and upload or record a prompt audio on the left (leave the text of the prompt speech empty if you are getting errors in generation)."
            )
            with gr.Row(equal_height=False):
                with gr.Column():
                    prompt_wav = gr.Audio(
                        type="filepath",
                        label="Upload a prompt audio file, or record one using the microphone.",
                    )
                    prompt_text_input = gr.Textbox(
                        label="Text of prompt speech (Optional; recommended for cloning in the same language.)",
                        lines=3,
                        placeholder="ุฃูƒุชุจ ู‡ู†ุง ู†ุต ุงู„ูƒู„ุงู… ููŠ ุงู„ู…ู„ู ุงู„ุตูˆุชูŠ ุงู„ู…ุฑูู‚.",
                        text_align="right",
                        rtl=True,
                    )
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to synthesize",
                        lines=10,
                        value="ุงู„ุณู„ุงู… ุนู„ูŠูƒู… ูˆุฑุญู…ุฉ ุงู„ู„ู‡",
                        placeholder="ุฃูƒุชุจ ู‡ู†ุง ุงู„ู†ุต ุงู„ุฐูŠ ุชุฑูŠุฏ ุชุญูˆูŠู„ู‡ ุฅู„ู‰ ูƒู„ุงู….",
                        text_align="right",
                        rtl=True,
                    )
                    generate_button_clone = gr.Button("Generate", variant="primary")
                    audio_output_clone = gr.Audio(
                        label="Generated Audio", autoplay=True, streaming=True
                    )
            generate_button_clone.click(
                voice_clone,
                inputs=[
                    text_input,
                    prompt_text_input,
                    prompt_wav,
                ],
                outputs=[audio_output_clone],
            )
# Enable the request queue (required for @spaces.GPU workloads) and serve.
app.queue().launch()