Spaces:
Running
Running
| import spaces | |
| import logging | |
| import os | |
| from datetime import datetime | |
| import gradio as gr | |
| import soundfile as sf | |
| import torch | |
| from huggingface_hub import snapshot_download | |
| from SparkTTS import SparkTTS | |
| from sparktts.utils.token_parser import LEVELS_MAP_UI | |
# Pull the fine-tuned Arabic Spark-TTS checkpoint from the Hugging Face Hub
# (cached locally after the first run) and load it once at module import time.
download_path = snapshot_download("MrEzzat/Spark_TTS_Arabic")
print(f"Model downloaded to: {download_path}")

# Prefer the first CUDA device when one is visible; otherwise fall back to CPU.
_device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = SparkTTS(download_path, device=_device)
print(model)
def run_tts(
    text,
    prompt_text=None,
    prompt_speech=None,
    gender=None,
    pitch=None,
    speed=None,
    save_dir="example/results",
):
    """Run Spark-TTS inference and write the result to a timestamped WAV file.

    Args:
        text: Text to synthesize.
        prompt_text: Transcript of the reference audio (voice cloning only);
            treated as absent when shorter than two characters.
        prompt_speech: Path to the reference audio clip (voice cloning only).
        gender: "male" or "female" (voice creation only).
        pitch: Pitch token expected by the model (voice creation only).
        speed: Speed token expected by the model (voice creation only).
        save_dir: Output directory; created if it does not exist.

    Returns:
        Filesystem path of the saved 16 kHz WAV file.
    """
    # NOTE(review): `import spaces` at the top of the file is otherwise unused —
    # this function was presumably meant to carry a `@spaces.GPU` decorator for
    # ZeroGPU Spaces; confirm against the deployed Space.
    logging.info("Saving audio to: %s", save_dir)

    # Treat empty / single-character prompt transcripts as "no transcript".
    if prompt_text is not None and len(prompt_text) <= 1:
        prompt_text = None

    os.makedirs(save_dir, exist_ok=True)

    # Microsecond-resolution timestamp: the original second-resolution name
    # ("%Y%m%d%H%M%S") collides when two requests land in the same second.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    save_path = os.path.join(save_dir, f"{timestamp}.wav")

    logging.info("Starting inference...")
    # Inference only — gradients are never needed here.
    with torch.no_grad():
        wav = model.inference(
            text,
            prompt_speech,
            prompt_text,
            gender,
            pitch,
            speed,
        )
    sf.write(save_path, wav, samplerate=16000)
    logging.info("Audio saved at: %s", save_path)
    return save_path
def voice_clone(text, prompt_text, prompt_speech):
    """Gradio callback: synthesize `text` in the voice of `prompt_speech`.

    Args:
        text: Text to synthesize.
        prompt_text: Optional transcript of the reference audio; ignored when
            missing or shorter than two characters.
        prompt_speech: Filepath of the uploaded/recorded reference audio.

    Returns:
        Path to the generated WAV file.
    """
    # Guard against a cleared textbox: Gradio can hand back None, and the
    # original `len(prompt_text)` would raise TypeError in that case.
    if prompt_text is None or len(prompt_text) < 2:
        prompt_text_clean = None
    else:
        prompt_text_clean = prompt_text
    return run_tts(text, prompt_text=prompt_text_clean, prompt_speech=prompt_speech)
def voice_creation(text, gender, pitch, speed):
    """Gradio callback: synthesize `text` with a freshly parameterized voice.

    Args:
        text: Text to synthesize.
        gender: Either "male" or "female".
        pitch: Slider position (1-5), translated through LEVELS_MAP_UI.
        speed: Slider position (1-5), translated through LEVELS_MAP_UI.

    Returns:
        Path to the generated WAV file.
    """
    # Sliders deliver numbers; the model wants the mapped level tokens.
    mapped_pitch = LEVELS_MAP_UI[int(pitch)]
    mapped_speed = LEVELS_MAP_UI[int(speed)]
    return run_tts(text, gender=gender, pitch=mapped_pitch, speed=mapped_speed)
# ---------------------------------------------------------------------------
# Gradio UI: two tabs — parameterized voice creation and reference-based voice
# cloning — each wiring its button to the matching callback above.
# ---------------------------------------------------------------------------
with gr.Blocks() as app:
    # Raw HTML because Markdown cannot center a heading.
    gr.HTML(
        '<h1 style="text-align: center;">Arabic Spark-TTS by <a href="https://huggingface.co/MrEzzat/Spark_TTS_Arabic">MrEzzat</a></h1>'
    )
    with gr.Tabs():
        # ---- Voice Creation tab ------------------------------------------
        with gr.TabItem("Voice Creation"):
            gr.Markdown("### Create your own voice based on the following parameters")
            with gr.Row():
                with gr.Column():
                    gender = gr.Radio(
                        choices=["male", "female"], value="male", label="Gender"
                    )
                    pitch = gr.Slider(
                        minimum=1, maximum=5, step=1, value=3, label="Pitch"
                    )
                    speed = gr.Slider(
                        minimum=1, maximum=5, step=1, value=3, label="Speed"
                    )
                with gr.Column():
                    text_input_creation = gr.Textbox(
                        label="Input Text",
                        lines=3,
                        placeholder="ุฃูุชุจ ููุง ุงููุต ุงูุฐู ุชุฑูุฏ ุชุญูููู ุฅูู ููุงู .",
                        value="ูู ููู ุฅูุดุงุก ุตูุช ู ุฎุตุต ุนู ุทุฑูู ุถุจุท ู ุนูู ุงุช ู ุซู ุงููุบู ุฉ ูุงูุณุฑุนุฉ.",
                        text_align="right",
                        rtl=True,
                    )
            create_button = gr.Button("Create Voice", variant="primary")
            # Distinct per-tab name: the original reused `audio_output` for both
            # tabs, shadowing the first component (it only worked because each
            # .click() was wired before the rebinding).
            audio_output_creation = gr.Audio(
                label="Generated Audio",
                autoplay=True,
                # NOTE(review): the callback returns a single filepath, not a
                # generator — confirm streaming=True is actually wanted here.
                streaming=True,
            )
            create_button.click(
                voice_creation,
                inputs=[text_input_creation, gender, pitch, speed],
                outputs=[audio_output_creation],
            )
        # ---- Voice Clone tab ---------------------------------------------
        with gr.TabItem("Voice Clone"):
            gr.Markdown(
                "## Write the text you want to synthesize on the right and upload or record a prompt audio on the left (leave the text of the prompt speech empty if you are getting errors in generation)."
            )
            with gr.Row(equal_height=False):
                with gr.Column():
                    prompt_wav = gr.Audio(
                        type="filepath",
                        label="Upload a prompt audio file, or record one using the microphone.",
                    )
                    prompt_text_input = gr.Textbox(
                        label="Text of prompt speech (Optional; recommended for cloning in the same language.)",
                        lines=3,
                        placeholder="ุฃูุชุจ ููุง ูุต ุงูููุงู ูู ุงูู ูู ุงูุตูุชู ุงูู ุฑูู.",
                        text_align="right",
                        rtl=True,
                    )
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to synthesize",
                        lines=10,
                        value="ุงูุณูุงู ุนูููู ูุฑุญู ุฉ ุงููู",
                        placeholder="ุฃูุชุจ ููุง ุงููุต ุงูุฐู ุชุฑูุฏ ุชุญูููู ุฅูู ููุงู .",
                        text_align="right",
                        rtl=True,
                    )
            generate_button_clone = gr.Button("Generate", variant="primary")
            audio_output_clone = gr.Audio(
                label="Generated Audio",
                autoplay=True,
                # NOTE(review): same streaming=True concern as the creation tab.
                streaming=True,
            )
            generate_button_clone.click(
                voice_clone,
                inputs=[
                    text_input,
                    prompt_text_input,
                    prompt_wav,
                ],
                outputs=[audio_output_clone],
            )

# queue() serializes requests so the single model instance is not hit
# concurrently; launch() starts the web server.
app.queue().launch()