feat(tts): add dynamic model parameters and Fal.ai Chatterbox
- [feat] Define TTS_MODEL_CONFIGS and TTS_EXAMPLE_AUDIO_URLS, and add a "Chatterbox (Fal.ai)" preset (utils.py:75-90, 150-155)
- [feat] Update `generate_text_to_speech()` to accept model-specific parameters and conditionally populate `extra_body` (tts_handler.py:29-33, 59-76)
- [feat] Update `handle_text_to_speech_generation()` to pass new TTS arguments (tts_handler.py:148, 158-161)
- [feat] Implement dynamic UI for TTS model selection and parameter inputs with `gr.Dropdown` and `gr.Group` components (ui_components.py:446-474)
- [feat] Add `on_model_change()` to dynamically update group visibility on model selection (ui_components.py:512-527)
- [feat] Extend generate button inputs and add Chatterbox examples (ui_components.py:533-535, 559-566)
- tts_handler.py +32 -8
- ui_components.py +64 -11
- utils.py +25 -0
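
For context, a minimal usage sketch of the updated `generate_text_to_speech()` signature under the two presets. The argument values are illustrative, and the `(audio, status)` return shape is assumed from the handler's error paths:

```python
# Illustrative only; assumes generate_text_to_speech() returns (audio, status_message).
from tts_handler import generate_text_to_speech

# Kokoro: only "voice" and "speed" are forwarded in extra_body.
audio, status = generate_text_to_speech(
    text="Hello from Kokoro!",
    model_name="hexgrad/Kokoro-82M",
    provider="fal-ai",
    voice="af_bella",
    speed=1.2,
)

# Chatterbox: audio_url, exaggeration, temperature, and cfg are forwarded instead.
audio, status = generate_text_to_speech(
    text="Hello from Chatterbox!",
    model_name="ResembleAI/chatterbox",
    provider="fal-ai",
    audio_url="https://github.com/nazdridoy/kokoro-tts/raw/main/previews/demo.mp3",
    exaggeration=0.25,
    temperature=0.7,
    cfg=0.5,
)
```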
tts_handler.py
CHANGED
@@ -15,7 +15,8 @@ from utils import (
     IMAGE_CONFIG,
     validate_proxy_key,
     format_error_message,
-    format_success_message
+    format_success_message,
+    TTS_MODEL_CONFIGS
 )

 # Timeout configuration for TTS generation
@@ -26,8 +27,12 @@ def generate_text_to_speech(
     text: str,
     model_name: str,
     provider: str,
-    voice: str = "
+    voice: str = "af_bella",
     speed: float = 1.0,
+    audio_url: str = "",
+    exaggeration: float = 0.25,
+    temperature: float = 0.7,
+    cfg: float = 0.5,
 ):
     """
     Generate speech from text using the specified model and provider through HF-Inferoxy.
@@ -56,16 +61,31 @@ def generate_text_to_speech(

         print(f"π TTS: Client created, preparing generation params...")

+        # Get model configuration
+        model_config = TTS_MODEL_CONFIGS.get(model_name, {})
+        extra_body_params = model_config.get("extra_body_params", [])
+
         # Prepare generation parameters
         generation_params = {
             "text": text,
             "model": model_name,
-            "extra_body": {
-                "voice": voice,
-                "speed": speed
-            }
+            "extra_body": {}
         }

+        # Add model-specific parameters to extra_body
+        if "voice" in extra_body_params:
+            generation_params["extra_body"]["voice"] = voice
+        if "speed" in extra_body_params:
+            generation_params["extra_body"]["speed"] = speed
+        if "audio_url" in extra_body_params:
+            generation_params["extra_body"]["audio_url"] = audio_url
+        if "exaggeration" in extra_body_params:
+            generation_params["extra_body"]["exaggeration"] = exaggeration
+        if "temperature" in extra_body_params:
+            generation_params["extra_body"]["temperature"] = temperature
+        if "cfg" in extra_body_params:
+            generation_params["extra_body"]["cfg"] = cfg
+
         print(f"π‘ TTS: Making generation request with {TTS_GENERATION_TIMEOUT}s timeout...")

         # Create generation function for timeout handling
@@ -133,7 +153,7 @@ def generate_text_to_speech(
         return None, format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")


-def handle_text_to_speech_generation(text_val, model_val, provider_val, voice_val, speed_val):
+def handle_text_to_speech_generation(text_val, model_val, provider_val, voice_val, speed_val, audio_url_val, exaggeration_val, temperature_val, cfg_val):
     """
     Handle text-to-speech generation request with validation.
     """
@@ -151,5 +171,9 @@ def handle_text_to_speech_generation(text_val, model_val, provider_val, voice_va
         model_name=model_val,
         provider=provider_val,
         voice=voice_val,
-        speed=speed_val
+        speed=speed_val,
+        audio_url=audio_url_val,
+        exaggeration=exaggeration_val,
+        temperature=temperature_val,
+        cfg=cfg_val
     )
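The per-parameter `if` chain above can also be expressed as a single filtering step over `TTS_MODEL_CONFIGS`; a sketch of that alternative (the `build_extra_body` helper is hypothetical, not part of this commit):

```python
# Hypothetical helper: keep only the values a model's config allows in extra_body.
from utils import TTS_MODEL_CONFIGS

def build_extra_body(model_name: str, **values) -> dict:
    allowed = TTS_MODEL_CONFIGS.get(model_name, {}).get("extra_body_params", [])
    return {key: value for key, value in values.items() if key in allowed}

# Kokoro keeps only voice/speed; Chatterbox would keep audio_url/exaggeration/temperature/cfg.
extra_body = build_extra_body(
    "hexgrad/Kokoro-82M",
    voice="af_bella", speed=1.0,
    audio_url="", exaggeration=0.25, temperature=0.7, cfg=0.5,
)
assert extra_body == {"voice": "af_bella", "speed": 1.0}
```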
ui_components.py
CHANGED
@@ -9,8 +9,8 @@ from utils import (
     DEFAULT_IMAGE_TO_IMAGE_MODEL, DEFAULT_IMAGE_TO_IMAGE_PROVIDER,
     DEFAULT_TTS_MODEL, DEFAULT_TTS_PROVIDER,
     CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
-    IMAGE_TO_IMAGE_MODEL_PRESETS, TTS_MODEL_PRESETS, TTS_VOICES,
-    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS
+    IMAGE_TO_IMAGE_MODEL_PRESETS, TTS_MODEL_PRESETS, TTS_VOICES, TTS_MODEL_CONFIGS,
+    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS, TTS_EXAMPLE_AUDIO_URLS
 )


@@ -412,7 +412,7 @@ def create_image_to_image_tab(handle_image_to_image_generation_fn):

 def create_tts_tab(handle_tts_generation_fn):
     """
-    Create the text-to-speech tab interface.
+    Create the text-to-speech tab interface with dynamic model-specific settings.
     """
     with gr.Tab("π€ Text-to-Speech", id="tts"):
         with gr.Row():
@@ -430,7 +430,7 @@ def create_tts_tab(handle_tts_generation_fn):
                     label="Generated Audio",
                     type="numpy",
                     interactive=False,
-                    autoplay=
+                    autoplay=True,
                     show_download_button=True
                 )
                 status_text = gr.Textbox(
@@ -443,10 +443,11 @@ def create_tts_tab(handle_tts_generation_fn):
                 # Model and provider inputs
                 with gr.Group():
                     gr.Markdown("**π€ Model & Provider**")
-                    tts_model_name = gr.
+                    tts_model_name = gr.Dropdown(
+                        choices=["hexgrad/Kokoro-82M", "ResembleAI/chatterbox"],
                         value=DEFAULT_TTS_MODEL,
-                        label="Model
-
+                        label="Model",
+                        info="Select TTS model"
                     )
                     tts_provider = gr.Dropdown(
                         choices=IMAGE_PROVIDERS,
@@ -455,9 +456,9 @@ def create_tts_tab(handle_tts_generation_fn):
                         interactive=True
                     )

-                #
-                with gr.Group():
-                    gr.Markdown("**π€ Voice Settings**")
+                # Kokoro-specific settings (initially visible)
+                with gr.Group(visible=True) as kokoro_settings:
+                    gr.Markdown("**π€ Kokoro Voice Settings**")
                     tts_voice = gr.Dropdown(
                         choices=list(TTS_VOICES.items()),
                         value="af_bella",
@@ -469,6 +470,28 @@ def create_tts_tab(handle_tts_generation_fn):
                         label="Speed", info="0.5 = slow, 2.0 = fast"
                     )

+                # Chatterbox-specific settings (initially hidden)
+                with gr.Group(visible=False) as chatterbox_settings:
+                    gr.Markdown("**π Chatterbox Style Settings**")
+                    tts_audio_url = gr.Textbox(
+                        value=TTS_EXAMPLE_AUDIO_URLS[0],
+                        label="Reference Audio URL",
+                        placeholder="Enter URL to reference audio file",
+                        info="Audio file to match style and tone"
+                    )
+                    tts_exaggeration = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.25, step=0.05,
+                        label="Exaggeration", info="How much to exaggerate the style"
+                    )
+                    tts_temperature = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.7, step=0.1,
+                        label="Temperature", info="Creativity level"
+                    )
+                    tts_cfg = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.5, step=0.1,
+                        label="CFG", info="Guidance strength"
+                    )
+
                 # Generate and Stop buttons
                 with gr.Row():
                     generate_btn = gr.Button(
@@ -484,6 +507,25 @@ def create_tts_tab(handle_tts_generation_fn):

         # Examples for TTS generation
         create_tts_examples(tts_text)
+
+        # Create Chatterbox audio URL examples
+        create_chatterbox_examples(tts_audio_url)
+
+        # Model change handler to show/hide appropriate settings
+        def on_model_change(model_name):
+            if model_name == "hexgrad/Kokoro-82M":
+                return gr.update(visible=True), gr.update(visible=False)
+            elif model_name == "ResembleAI/chatterbox":
+                return gr.update(visible=False), gr.update(visible=True)
+            else:
+                return gr.update(visible=True), gr.update(visible=False)
+
+        # Connect model change event
+        tts_model_name.change(
+            fn=on_model_change,
+            inputs=[tts_model_name],
+            outputs=[kokoro_settings, chatterbox_settings]
+        )

         # Connect TTS generation events
         # Show stop immediately when starting generation
@@ -497,7 +539,8 @@ def create_tts_tab(handle_tts_generation_fn):
         gen_event = generate_btn.click(
             fn=handle_tts_generation_fn,
             inputs=[
-                tts_text, tts_model_name, tts_provider, tts_voice, tts_speed
+                tts_text, tts_model_name, tts_provider, tts_voice, tts_speed,
+                tts_audio_url, tts_exaggeration, tts_temperature, tts_cfg
             ],
             outputs=[output_audio, status_text]
         )
@@ -561,6 +604,16 @@ def create_tts_examples(tts_text):
     )


+def create_chatterbox_examples(tts_audio_url):
+    """Create example audio URLs for Chatterbox TTS."""
+    with gr.Group():
+        gr.Markdown("**π΅ Example Reference Audio URLs**")
+        chatterbox_examples = gr.Examples(
+            examples=[[url] for url in TTS_EXAMPLE_AUDIO_URLS],
+            inputs=tts_audio_url
+        )
+
+
 def create_image_presets(img_model_name, img_provider):
     """Create quick model presets for image generation."""
     with gr.Group():
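The `on_model_change()` wiring is Gradio's standard visibility-toggle pattern: one `gr.update(visible=...)` per component listed in `outputs`, returned in the same order. A standalone sketch of the same pattern (the `toggle` helper and labels are illustrative, not from the app):

```python
import gradio as gr

with gr.Blocks() as demo:
    model = gr.Dropdown(
        choices=["hexgrad/Kokoro-82M", "ResembleAI/chatterbox"],
        value="hexgrad/Kokoro-82M",
        label="Model",
    )
    with gr.Group(visible=True) as kokoro_group:
        gr.Markdown("Kokoro settings")
    with gr.Group(visible=False) as chatterbox_group:
        gr.Markdown("Chatterbox settings")

    def toggle(model_name):
        is_kokoro = model_name == "hexgrad/Kokoro-82M"
        # One update per component listed in outputs, in the same order.
        return gr.update(visible=is_kokoro), gr.update(visible=not is_kokoro)

    model.change(fn=toggle, inputs=[model], outputs=[kokoro_group, chatterbox_group])

if __name__ == "__main__":
    demo.launch()
```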
utils.py
CHANGED
@@ -72,8 +72,25 @@ IMAGE_TO_IMAGE_MODEL_PRESETS = [
 TTS_MODEL_PRESETS = [
     ("Kokoro (Fal.ai)", "hexgrad/Kokoro-82M", "fal-ai"),
     ("Kokoro (Replicate)", "hexgrad/Kokoro-82M", "replicate"),
+    ("Chatterbox (Fal.ai)", "ResembleAI/chatterbox", "fal-ai"),
 ]

+# Model-specific configurations for TTS
+TTS_MODEL_CONFIGS = {
+    "hexgrad/Kokoro-82M": {
+        "type": "kokoro",
+        "supports_voice": True,
+        "supports_speed": True,
+        "extra_body_params": ["voice", "speed"]
+    },
+    "ResembleAI/chatterbox": {
+        "type": "chatterbox",
+        "supports_voice": False,
+        "supports_speed": False,
+        "extra_body_params": ["audio_url", "exaggeration", "temperature", "cfg"]
+    }
+}
+
 # Voice options for Kokoro TTS (based on the reference app)
 TTS_VOICES = {
     'πΊπΈ πΊ Heart β€οΈ': 'af_heart',
@@ -142,6 +159,14 @@ TTS_EXAMPLE_TEXTS = [
     "Life is what happens when you're busy making other plans. Embrace every moment with gratitude."
 ]

+# Example audio URLs for Chatterbox TTS
+TTS_EXAMPLE_AUDIO_URLS = [
+    "https://github.com/nazdridoy/kokoro-tts/raw/main/previews/demo.mp3",
+    "https://huggingface.co/datasets/hf-internal-testing/fixtures/resolve/main/audio/sample_audio_1.mp3",
+    "https://huggingface.co/datasets/hf-internal-testing/fixtures/resolve/main/audio/sample_audio_2.mp3",
+    "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav"
+]
+

 def get_proxy_key():
     """Get the proxy API key from environment variables."""
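
Since the handler and the UI both key off these tables, a small consistency check can catch a preset added without a matching config; a hypothetical sketch, not part of this commit:

```python
# Hypothetical sanity check: every TTS preset should have a TTS_MODEL_CONFIGS entry.
from utils import TTS_MODEL_PRESETS, TTS_MODEL_CONFIGS

for label, model_name, provider in TTS_MODEL_PRESETS:
    config = TTS_MODEL_CONFIGS.get(model_name)
    assert config is not None, f"{label}: no TTS_MODEL_CONFIGS entry for {model_name}"
    assert config.get("extra_body_params"), f"{label}: extra_body_params is empty"
```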