Spaces:

HumeAI
/

expressive-tts-arena

Running

App Files Files Community

zach committed on Feb 10

Commit

ee8b196

1 Parent(s): 5d6d1ef

Simplify UI logic in app.py, add logic for handling empty character description inputs.

Browse files

Files changed (4) hide show

src/app.py +31 -29
src/assets/styles.css +13 -2
src/integrations/hume_api.py +2 -2
src/utils.py +8 -3

src/app.py CHANGED Viewed

@@ -75,7 +75,7 @@ def generate_text(
         raise gr.Error("Failed to generate text. Please try again later.")
-def text_to_speech(
     character_description: str, text: str, generated_text_state: str
 ) -> Tuple[gr.update, gr.update, dict, str, ComparisonType, str, str, bool, str, str]:
     """
@@ -116,7 +116,9 @@ def text_to_speech(
     # Select 2 TTS providers based on whether the text has been modified.
     text_modified = text != generated_text_state
-    comparison_type, provider_a, provider_b = choose_providers(text_modified)
     try:
         if provider_b == constants.HUME_AI:
@@ -288,25 +290,17 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
     )
-def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button]:
-    """Builds the input section including instructions, sample character description dropdown, character description input, and generate button"""
-    instructions = gr.Markdown(
-        """
-        1. **Enter or Generate Text:** Type directly in the text box—or enter a character description and click “Generate Text” to auto-populate. Edit as needed.
-        2. **Synthesize Speech:** Click “Synthesize Speech” to generate two audio outputs.
-        3. **Listen & Compare:** Play back both audio options to hear the differences.
-        4. **Vote for Your Favorite:** Click “Vote for Option A” or “Vote for Option B” to cast your vote.
-        """
-    )
     sample_character_description_dropdown = gr.Dropdown(
         choices=list(constants.SAMPLE_CHARACTER_DESCRIPTIONS.keys()),
-        label="Choose a sample character description (or enter your own)",
         value=None,
         interactive=True,
     )
     character_description_input = gr.Textbox(
         label="Character description",
-        placeholder="Enter your character description to be used to generate text and a novel voice...",
         lines=3,
         max_lines=8,
         max_length=constants.CHARACTER_DESCRIPTION_MAX_LENGTH,
@@ -314,7 +308,6 @@ def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Butt
     )
     generate_text_button = gr.Button("Generate text", variant="secondary")
     return (
-        instructions,
         sample_character_description_dropdown,
         character_description_input,
         generate_text_button,
@@ -327,7 +320,7 @@ def build_output_section() -> (
     """Builds the output section including generated text, audio players, and vote buttons."""
     text_input = gr.Textbox(
         label="Text",
-        placeholder="Enter text to synthesize speech...",
         interactive=True,
         autoscroll=False,
         lines=3,
@@ -370,12 +363,19 @@ def build_gradio_interface() -> gr.Blocks:
         fill_width=True,
         css_paths="src/assets/styles.css",
     ) as demo:
-        # Title
         gr.Markdown("# Expressive TTS Arena")
         # Build generate text section
         (
-            instructions,
             sample_character_description_dropdown,
             character_description_input,
             generate_text_button,
@@ -393,24 +393,26 @@ def build_gradio_interface() -> gr.Blocks:
         # --- UI state components ---
-        # Track text used for speech synthesis
-        text_state = gr.State("")
         # Track character description used for text and voice generation
         character_description_state = gr.State("")
-        # Track comparison type (which set of providers are being compared)
-        comparison_type_state = gr.State()
         # Track generation ID for Option A
         option_a_generation_id_state = gr.State()
         # Track generation ID for Option B
         option_b_generation_id_state = gr.State()
-        # Track whether text that was used was generated or modified/custom
-        text_modified_state = gr.State()
-        # Track generated text state
-        generated_text_state = gr.State("")
-        # Track generated audio for option B for playing automatically after option 1 audio finishes
-        option_b_audio_state = gr.State()
         # Track option map (option A and option B are randomized)
         option_map_state = gr.State()
         # Track whether the user has voted for an option
         vote_submitted_state = gr.State(False)
@@ -467,7 +469,7 @@ def build_gradio_interface() -> gr.Blocks:
                 vote_submitted_state,
             ],
         ).then(
-            fn=text_to_speech,
             inputs=[character_description_input, text_input, generated_text_state],
             outputs=[
                 option_a_audio_player,

         raise gr.Error("Failed to generate text. Please try again later.")
+def synthesize_speech(
     character_description: str, text: str, generated_text_state: str
 ) -> Tuple[gr.update, gr.update, dict, str, ComparisonType, str, str, bool, str, str]:
     """
     # Select 2 TTS providers based on whether the text has been modified.
     text_modified = text != generated_text_state
+    comparison_type, provider_a, provider_b = choose_providers(
+        text_modified, character_description
+    )
     try:
         if provider_b == constants.HUME_AI:
     )
+def build_input_section() -> Tuple[gr.Dropdown, gr.Textbox, gr.Button]:
+    """Builds the input section including the sample character description dropdown, character description input, and generate text button"""
     sample_character_description_dropdown = gr.Dropdown(
         choices=list(constants.SAMPLE_CHARACTER_DESCRIPTIONS.keys()),
+        label="Choose a sample character description",
         value=None,
         interactive=True,
     )
     character_description_input = gr.Textbox(
         label="Character description",
+        placeholder="Enter a character description...",
         lines=3,
         max_lines=8,
         max_length=constants.CHARACTER_DESCRIPTION_MAX_LENGTH,
     )
     generate_text_button = gr.Button("Generate text", variant="secondary")
     return (
         sample_character_description_dropdown,
         character_description_input,
         generate_text_button,
     """Builds the output section including generated text, audio players, and vote buttons."""
     text_input = gr.Textbox(
         label="Text",
+        placeholder="Generate or enter text...",
         interactive=True,
         autoscroll=False,
         lines=3,
         fill_width=True,
         css_paths="src/assets/styles.css",
     ) as demo:
+        # Title & instructions
         gr.Markdown("# Expressive TTS Arena")
+        gr.Markdown(
+            """
+            1. **Enter or Generate Text:** Type directly in the text box—or enter a character description and click “Generate Text” to auto-populate. Edit as needed.
+            2. **Synthesize Speech:** Click “Synthesize Speech” to generate two audio outputs.
+            3. **Listen & Compare:** Play back both audio options to hear the differences.
+            4. **Vote for Your Favorite:** Click “Vote for Option A” or “Vote for Option B” to cast your vote.
+            """
+        )
         # Build generate text section
         (
             sample_character_description_dropdown,
             character_description_input,
             generate_text_button,
         # --- UI state components ---
         # Track character description used for text and voice generation
         character_description_state = gr.State("")
+        # Track text used for speech synthesis
+        text_state = gr.State("")
+        # Track generated text state
+        generated_text_state = gr.State("")
+        # Track whether text that was used was generated or modified/custom
+        text_modified_state = gr.State()
+        # Track generated audio for option B (for playing automatically after option A audio finishes)
+        option_b_audio_state = gr.State()
         # Track generation ID for Option A
         option_a_generation_id_state = gr.State()
         # Track generation ID for Option B
         option_b_generation_id_state = gr.State()
+        # Track comparison type (which set of providers are being compared)
+        comparison_type_state = gr.State()
         # Track option map (option A and option B are randomized)
         option_map_state = gr.State()
         # Track whether the user has voted for an option
         vote_submitted_state = gr.State(False)
                 vote_submitted_state,
             ],
         ).then(
+            fn=synthesize_speech,
             inputs=[character_description_input, text_input, generated_text_state],
             outputs=[
                 option_a_audio_player,

src/assets/styles.css CHANGED Viewed

@@ -1,3 +1,14 @@
-footer {
-  display:none !important
 }

+/* Remove Gradio footer from UI */
+footer.svelte-sar7eh {
+  display: none !important;
+}
+/*
+  The copy buttons for Gradio Textbox components use the "button_secondary_text_color"
+  theme color, which is currently #FFFFFF (white). This makes the copy SVG icon white, causing
+  it to disappear into the background. Override the class color here to ensure the icon is
+  visible in the UI.
+*/
+.copy-button {
+  color: #7E22CE;
 }

src/integrations/hume_api.py CHANGED Viewed

@@ -135,7 +135,7 @@ def text_to_speech_with_hume(
         raise ValueError("Invalid number of generations specified. Must be 1 or 2.")
     request_body = {
-        "utterances": [{"text": text, "description": character_description}],
         "format": {
             "type": hume_config.file_format,
         },
@@ -173,7 +173,7 @@ def text_to_speech_with_hume(
         if isinstance(e, HTTPError):
             if e.response.status_code >= 400 and e.response.status_code < 500:
                 raise UnretryableHumeError(
-                    message=f'"{e.response.text}"', original_exception=e
                 ) from e
         raise HumeError(message=f"{e}", original_exception=e) from e

         raise ValueError("Invalid number of generations specified. Must be 1 or 2.")
     request_body = {
+        "utterances": [{"text": text, "description": character_description or None}],
         "format": {
             "type": hume_config.file_format,
         },
         if isinstance(e, HTTPError):
             if e.response.status_code >= 400 and e.response.status_code < 500:
                 raise UnretryableHumeError(
+                    message=f"{e.response.text}", original_exception=e
                 ) from e
         raise HumeError(message=f"{e}", original_exception=e) from e

src/utils.py CHANGED Viewed

@@ -205,13 +205,14 @@ def save_base64_audio_to_file(base64_audio: str, filename: str) -> str:
 def choose_providers(
     text_modified: bool,
 ) -> Tuple[ComparisonType, TTSProviderName, TTSProviderName]:
     """
     Select two TTS providers based on whether the text has been modified.
     The first provider is always set to "Hume AI". For the second provider, the function
-    selects "Hume AI" if the text has been modified; otherwise, it randomly chooses one from
-    the TTS_PROVIDERS list.
     Args:
         text_modified (bool): A flag indicating whether the text has been modified.
@@ -223,9 +224,13 @@ def choose_providers(
         where the first is always "Hume AI" and the second is determined by the text_modified
         flag and random selection.
     """
     provider_a = constants.HUME_AI
     provider_b = (
-        constants.HUME_AI if text_modified else random.choice(constants.TTS_PROVIDERS)
     )
     match provider_b:

 def choose_providers(
     text_modified: bool,
+    character_description: str,
 ) -> Tuple[ComparisonType, TTSProviderName, TTSProviderName]:
     """
     Select two TTS providers based on whether the text has been modified.
     The first provider is always set to "Hume AI". For the second provider, the function
+    selects "Hume AI" if the text has been modified or if a character description was
+    not provided; otherwise, it randomly chooses one from the TTS_PROVIDERS list.
     Args:
         text_modified (bool): A flag indicating whether the text has been modified.
         where the first is always "Hume AI" and the second is determined by the text_modified
         flag and random selection.
     """
+    hume_comparison_only = text_modified or not character_description
     provider_a = constants.HUME_AI
     provider_b = (
+        constants.HUME_AI
+        if hume_comparison_only
+        else random.choice(constants.TTS_PROVIDERS)
     )
     match provider_b: