File size: 6,346 Bytes
edcc2a9
 
1b02f29
 
 
edcc2a9
1b02f29
 
 
edcc2a9
 
 
 
7b15789
 
ec9c685
e28c939
edcc2a9
1b02f29
b19f757
edcc2a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1ce2f3
edcc2a9
 
 
 
f1ce2f3
edcc2a9
 
 
 
1b02f29
edcc2a9
 
 
1b02f29
edcc2a9
 
 
 
 
 
 
 
 
 
1b02f29
edcc2a9
 
1b02f29
ca4c6b8
edcc2a9
1b02f29
 
 
26e1fd7
edcc2a9
5dad5f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1745437
 
 
 
 
 
1221be6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca4c6b8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import spaces
import logging
import os
from datetime import datetime

import gradio as gr
import soundfile as sf
import torch
from huggingface_hub import snapshot_download

from SparkTTS import SparkTTS
from sparktts.utils.token_parser import LEVELS_MAP_UI

# Fetch the Arabic Spark-TTS checkpoint from the Hugging Face Hub; the
# download is cached locally, so subsequent runs reuse the same snapshot.
download_path = snapshot_download("MrEzzat/Spark_TTS_Arabic")
print(f"Model downloaded to: {download_path}")
# Load the model once at import time, preferring the first CUDA device when
# available and falling back to CPU otherwise.
model = SparkTTS(download_path, device="cuda:0" if torch.cuda.is_available() else "cpu")
print(model)


@spaces.GPU()
def run_tts(
    text,
    prompt_text=None,
    prompt_speech=None,
    gender=None,
    pitch=None,
    speed=None,
    save_dir="example/results",
):
    """Perform TTS inference and save the generated audio as a 16 kHz WAV.

    Args:
        text: Text to synthesise.
        prompt_text: Transcript of the prompt audio (voice cloning); values
            that are empty or a single character are treated as absent.
        prompt_speech: Path of the reference audio used for voice cloning.
        gender: "male" or "female" for voice creation (used instead of the
            cloning prompts).
        pitch: Pitch level token understood by the model.
        speed: Speed level token understood by the model.
        save_dir: Directory where the generated WAV file is written.

    Returns:
        Filesystem path of the saved WAV file.
    """
    # Lazy %-style args avoid string formatting when the level is disabled.
    logging.info("Saving audio to: %s", save_dir)

    # Treat empty or one-character transcripts as "no prompt text".
    if prompt_text is not None and len(prompt_text) <= 1:
        prompt_text = None

    # Ensure the save directory exists.
    os.makedirs(save_dir, exist_ok=True)

    # Include microseconds so concurrent requests landing in the same
    # second do not overwrite each other's output file.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    save_path = os.path.join(save_dir, f"{timestamp}.wav")

    logging.info("Starting inference...")

    # Gradients are never needed for inference.
    with torch.no_grad():
        wav = model.inference(
            text,
            prompt_speech,
            prompt_text,
            gender,
            pitch,
            speed,
        )

    # File writing is unrelated to autograd, so keep it outside no_grad().
    sf.write(save_path, wav, samplerate=16000)

    logging.info("Audio saved at: %s", save_path)

    return save_path


# Define callback function for voice cloning
# Define callback function for voice cloning
def voice_clone(text, prompt_text, prompt_speech):
    """Gradio callback: synthesise *text* in the voice of *prompt_speech*.

    Args:
        text: The input text to be synthesised.
        prompt_text: Optional transcript of the prompt audio; ignored when
            missing, empty, or shorter than two characters.
        prompt_speech: Filepath of the reference audio.

    Returns:
        Path of the generated WAV file.
    """
    # Guard against None (a cleared/absent textbox value) before calling
    # len(); the original `len(prompt_text) < 2` raised TypeError on None.
    prompt_text_clean = (
        prompt_text if prompt_text and len(prompt_text) >= 2 else None
    )

    return run_tts(
        text, prompt_text=prompt_text_clean, prompt_speech=prompt_speech
    )


# Define callback function for creating new voices
# Define callback function for creating new voices
def voice_creation(text, gender, pitch, speed):
    """Gradio callback: create a synthetic voice with adjustable parameters.

    Args:
        text: The input text for synthesis.
        gender: 'male' or 'female'.
        pitch: Slider position mapped through LEVELS_MAP_UI.
        speed: Slider position mapped through LEVELS_MAP_UI.

    Returns:
        Path of the generated WAV file.
    """
    # Translate the UI slider positions into the model's level tokens and
    # hand everything straight to the shared inference helper.
    return run_tts(
        text,
        gender=gender,
        pitch=LEVELS_MAP_UI[int(pitch)],
        speed=LEVELS_MAP_UI[int(speed)],
    )


# Build the Gradio UI: two tabs (voice creation, voice cloning), each wired
# to its callback and rendering into its own output Audio widget.
with gr.Blocks() as app:
    # Use HTML for centered title
    gr.HTML(
        '<h1 style="text-align: center;">Arabic Spark-TTS by <a href="https://huggingface.co/MrEzzat/Spark_TTS_Arabic">MrEzzat</a></h1>'
    )

    with gr.Tabs():
        # Voice Creation Tab: synthesise with a generated voice controlled
        # by gender/pitch/speed sliders (no reference audio needed).
        with gr.TabItem("Voice Creation"):
            gr.Markdown("### Create your own voice based on the following parameters")

            with gr.Row():
                with gr.Column():
                    gender = gr.Radio(
                        choices=["male", "female"], value="male", label="Gender"
                    )
                    # 1-5 slider positions; voice_creation maps them through
                    # LEVELS_MAP_UI before inference.
                    pitch = gr.Slider(
                        minimum=1, maximum=5, step=1, value=3, label="Pitch"
                    )
                    speed = gr.Slider(
                        minimum=1, maximum=5, step=1, value=3, label="Speed"
                    )
                with gr.Column():
                    # Arabic text input; rtl/text_align set for right-to-left display.
                    text_input_creation = gr.Textbox(
                        label="Input Text",
                        lines=3,
                        placeholder="ุฃูƒุชุจ ู‡ู†ุง ุงู„ู†ุต ุงู„ุฐูŠ ุชุฑูŠุฏ ุชุญูˆูŠู„ู‡ ุฅู„ู‰ ูƒู„ุงู….",
                        value="ูŠู…ูƒู†ูƒ ุฅู†ุดุงุก ุตูˆุช ู…ุฎุตุต ุนู† ุทุฑูŠู‚ ุถุจุท ู…ุนู„ู…ุงุช ู…ุซู„ ุงู„ู†ุบู…ุฉ ูˆุงู„ุณุฑุนุฉ.",
                        text_align="right",
                        rtl=True,
                    )
                    create_button = gr.Button("Create Voice", variant="primary")

            # NOTE(review): streaming=True on an output Audio normally expects
            # the callback to yield audio chunks, but voice_creation returns a
            # single filepath — confirm this combination is intended.
            audio_output = gr.Audio(
                label="Generated Audio", autoplay=True, streaming=True
            )

            create_button.click(
                voice_creation,
                inputs=[text_input_creation, gender, pitch, speed],
                outputs=[audio_output],
            )
        # Voice Clone Tab: synthesise in the voice of an uploaded/recorded
        # reference clip, with an optional transcript of that clip.
        with gr.TabItem("Voice Clone"):
            gr.Markdown(
                "## Write the text you want to synthesize on the right and upload or record a prompt audio on the left (leave the text of the prompt speech empty if you are getting errors in generation)."
            )

            with gr.Row(equal_height=False):
                with gr.Column():
                    # type="filepath" so the callback receives a path on disk,
                    # which run_tts forwards to the model as prompt_speech.
                    prompt_wav = gr.Audio(
                        type="filepath",
                        label="Upload a prompt audio file, or record one using the microphone.",
                    )
                    prompt_text_input = gr.Textbox(
                        label="Text of prompt speech (Optional; recommended for cloning in the same language.)",
                        lines=3,
                        placeholder="ุฃูƒุชุจ ู‡ู†ุง ู†ุต ุงู„ูƒู„ุงู… ููŠ ุงู„ู…ู„ู ุงู„ุตูˆุชูŠ ุงู„ู…ุฑูู‚.",
                        text_align="right",
                        rtl=True,
                    )
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to synthesize",
                        lines=10,
                        value="ุงู„ุณู„ุงู… ุนู„ูŠูƒู… ูˆุฑุญู…ุฉ ุงู„ู„ู‡",
                        placeholder="ุฃูƒุชุจ ู‡ู†ุง ุงู„ู†ุต ุงู„ุฐูŠ ุชุฑูŠุฏ ุชุญูˆูŠู„ู‡ ุฅู„ู‰ ูƒู„ุงู….",
                        text_align="right",
                        rtl=True,
                    )
                    generate_button_clone = gr.Button("Generate", variant="primary")

            # Separate output widget per tab; the name shadows the creation
            # tab's audio_output, but that wiring is already done above.
            audio_output = gr.Audio(
                label="Generated Audio", autoplay=True, streaming=True
            )

            generate_button_clone.click(
                voice_clone,
                inputs=[
                    text_input,
                    prompt_text_input,
                    prompt_wav,
                ],
                outputs=[audio_output],
            )

# Enable the request queue (needed for GPU-decorated callbacks) and serve.
app.queue().launch()