Spaces:
Running
Running
zach
committed on
Commit
·
7f25817
1
Parent(s):
6431bab
Update the Hume integration to use the OCTAVE TTS endpoint and the ElevenLabs integration to use the voice design endpoint; voices are no longer specified because they are now generated from the prompt
Browse files- src/app.py +24 -34
- src/integrations/__init__.py +1 -1
- src/integrations/elevenlabs_api.py +17 -31
- src/integrations/hume_api.py +32 -59
- src/types.py +2 -15
src/app.py
CHANGED
|
@@ -35,7 +35,6 @@ from src.integrations import (
|
|
| 35 |
AnthropicError,
|
| 36 |
ElevenLabsError,
|
| 37 |
generate_text_with_claude,
|
| 38 |
-
get_random_hume_voice_names,
|
| 39 |
HumeError,
|
| 40 |
text_to_speech_with_elevenlabs,
|
| 41 |
text_to_speech_with_hume,
|
|
@@ -114,34 +113,29 @@ def text_to_speech(
|
|
| 114 |
random.random() < 0.5
|
| 115 |
)
|
| 116 |
|
| 117 |
-
# Pre-select two Hume voices pre-emptively in case we compare Hume to Hume to ensure we do not select the same voice twice.
|
| 118 |
-
hume_voice_a, hume_voice_b = get_random_hume_voice_names()
|
| 119 |
-
|
| 120 |
try:
|
| 121 |
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 122 |
provider_a = HUME_AI
|
| 123 |
-
future_audio_a = executor.submit(
|
| 124 |
-
text_to_speech_with_hume, prompt, text, hume_voice_a
|
| 125 |
-
)
|
| 126 |
|
| 127 |
if compare_hume_with_elevenlabs:
|
| 128 |
provider_b = ELEVENLABS
|
| 129 |
-
future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text)
|
| 130 |
-
else:
|
| 131 |
-
provider_b = HUME_AI
|
| 132 |
future_audio_b = executor.submit(
|
| 133 |
-
|
| 134 |
)
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
|
| 139 |
logger.info(
|
| 140 |
f"TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes"
|
| 141 |
)
|
| 142 |
options = [
|
| 143 |
-
(audio_a,
|
| 144 |
-
(audio_b,
|
| 145 |
]
|
| 146 |
random.shuffle(options)
|
| 147 |
option_a_audio, option_b_audio = options[0][0], options[1][0]
|
|
@@ -179,16 +173,16 @@ def vote(
|
|
| 179 |
option_map (OptionMap): A dictionary mapping option labels to their details.
|
| 180 |
Expected structure:
|
| 181 |
{
|
| 182 |
-
'Option A': '
|
| 183 |
-
'Option B': '
|
| 184 |
}
|
| 185 |
selected_button (str): The button that was clicked.
|
| 186 |
|
| 187 |
Returns:
|
| 188 |
A tuple of:
|
| 189 |
- A boolean indicating if the vote was accepted.
|
| 190 |
-
- An update for the selected vote button (showing provider
|
| 191 |
-
- An update for the unselected vote button (showing provider
|
| 192 |
- An update for enabling vote interactions.
|
| 193 |
"""
|
| 194 |
if not option_map or vote_submitted:
|
|
@@ -198,20 +192,12 @@ def vote(
|
|
| 198 |
selected_option, other_option = (
|
| 199 |
(OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
|
| 200 |
)
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
selected_details = option_map.get(selected_option, {})
|
| 204 |
-
selected_provider = selected_details.get("provider", UNKNOWN_PROVIDER)
|
| 205 |
-
selected_voice = selected_details.get("voice", "")
|
| 206 |
-
|
| 207 |
-
# Parse other option details from options map
|
| 208 |
-
other_details = option_map.get(other_option, {})
|
| 209 |
-
other_provider = other_details.get("provider", UNKNOWN_PROVIDER)
|
| 210 |
-
other_voice = other_details.get("voice", "")
|
| 211 |
|
| 212 |
# Build button labels, displaying the provider and voice name, appending the trophy emoji to the selected option.
|
| 213 |
-
selected_label = f"{selected_provider}
|
| 214 |
-
other_label = f"{other_provider}
|
| 215 |
|
| 216 |
return (
|
| 217 |
True,
|
|
@@ -245,7 +231,7 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
|
|
| 245 |
"""
|
| 246 |
return (
|
| 247 |
gr.update(value=None),
|
| 248 |
-
gr.update(value=None),
|
| 249 |
gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
|
| 250 |
gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
|
| 251 |
None,
|
|
@@ -398,9 +384,13 @@ def build_gradio_interface() -> gr.Blocks:
|
|
| 398 |
# 3. Synthesize speech, load audio players, and display vote button
|
| 399 |
# 4. Enable the "Synthesize speech" button and display vote buttons
|
| 400 |
synthesize_speech_button.click(
|
| 401 |
-
fn=lambda:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
inputs=[],
|
| 403 |
-
outputs=[synthesize_speech_button],
|
| 404 |
).then(
|
| 405 |
fn=reset_ui,
|
| 406 |
inputs=[],
|
|
|
|
| 35 |
AnthropicError,
|
| 36 |
ElevenLabsError,
|
| 37 |
generate_text_with_claude,
|
|
|
|
| 38 |
HumeError,
|
| 39 |
text_to_speech_with_elevenlabs,
|
| 40 |
text_to_speech_with_hume,
|
|
|
|
| 113 |
random.random() < 0.5
|
| 114 |
)
|
| 115 |
|
|
|
|
|
|
|
|
|
|
| 116 |
try:
|
| 117 |
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 118 |
provider_a = HUME_AI
|
| 119 |
+
future_audio_a = executor.submit(text_to_speech_with_hume, prompt, text)
|
|
|
|
|
|
|
| 120 |
|
| 121 |
if compare_hume_with_elevenlabs:
|
| 122 |
provider_b = ELEVENLABS
|
|
|
|
|
|
|
|
|
|
| 123 |
future_audio_b = executor.submit(
|
| 124 |
+
text_to_speech_with_elevenlabs, prompt, text
|
| 125 |
)
|
| 126 |
+
else:
|
| 127 |
+
provider_b = HUME_AI
|
| 128 |
+
future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text)
|
| 129 |
|
| 130 |
+
audio_a = future_audio_a.result()
|
| 131 |
+
audio_b = future_audio_b.result()
|
| 132 |
|
| 133 |
logger.info(
|
| 134 |
f"TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes"
|
| 135 |
)
|
| 136 |
options = [
|
| 137 |
+
(audio_a, provider_a),
|
| 138 |
+
(audio_b, provider_b),
|
| 139 |
]
|
| 140 |
random.shuffle(options)
|
| 141 |
option_a_audio, option_b_audio = options[0][0], options[1][0]
|
|
|
|
| 173 |
option_map (OptionMap): A dictionary mapping option labels to their details.
|
| 174 |
Expected structure:
|
| 175 |
{
|
| 176 |
+
'Option A': 'Hume AI',
|
| 177 |
+
'Option B': 'ElevenLabs',
|
| 178 |
}
|
| 179 |
selected_button (str): The button that was clicked.
|
| 180 |
|
| 181 |
Returns:
|
| 182 |
A tuple of:
|
| 183 |
- A boolean indicating if the vote was accepted.
|
| 184 |
+
- An update for the selected vote button (showing provider and trophy emoji).
|
| 185 |
+
- An update for the unselected vote button (showing provider).
|
| 186 |
- An update for enabling vote interactions.
|
| 187 |
"""
|
| 188 |
if not option_map or vote_submitted:
|
|
|
|
| 192 |
selected_option, other_option = (
|
| 193 |
(OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
|
| 194 |
)
|
| 195 |
+
selected_provider = option_map.get(selected_option)
|
| 196 |
+
other_provider = option_map.get(other_option)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
# Build button labels, displaying the provider name and appending the trophy emoji to the selected option.
|
| 199 |
+
selected_label = f"{selected_provider} {TROPHY_EMOJI}"
|
| 200 |
+
other_label = f"{other_provider}"
|
| 201 |
|
| 202 |
return (
|
| 203 |
True,
|
|
|
|
| 231 |
"""
|
| 232 |
return (
|
| 233 |
gr.update(value=None),
|
| 234 |
+
gr.update(value=None, autoplay=False),
|
| 235 |
gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
|
| 236 |
gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
|
| 237 |
None,
|
|
|
|
| 384 |
# 3. Synthesize speech, load audio players, and display vote button
|
| 385 |
# 4. Enable the "Synthesize speech" button and display vote buttons
|
| 386 |
synthesize_speech_button.click(
|
| 387 |
+
fn=lambda: (
|
| 388 |
+
gr.update(interactive=False),
|
| 389 |
+
gr.update(interactive=False),
|
| 390 |
+
gr.update(interactive=False),
|
| 391 |
+
),
|
| 392 |
inputs=[],
|
| 393 |
+
outputs=[synthesize_speech_button, vote_button_a, vote_button_b],
|
| 394 |
).then(
|
| 395 |
fn=reset_ui,
|
| 396 |
inputs=[],
|
src/integrations/__init__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
from .anthropic_api import generate_text_with_claude, AnthropicError
|
| 2 |
from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
|
| 3 |
-
from .hume_api import text_to_speech_with_hume,
|
|
|
|
| 1 |
from .anthropic_api import generate_text_with_claude, AnthropicError
|
| 2 |
from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
|
| 3 |
+
from .hume_api import text_to_speech_with_hume, HumeError
|
src/integrations/elevenlabs_api.py
CHANGED
|
@@ -114,58 +114,44 @@ elevenlabs_config = ElevenLabsConfig()
|
|
| 114 |
after=after_log(logger, logging.DEBUG),
|
| 115 |
reraise=True,
|
| 116 |
)
|
| 117 |
-
def text_to_speech_with_elevenlabs(text: str) ->
|
| 118 |
"""
|
| 119 |
Synthesizes text to speech using the ElevenLabs TTS API.
|
| 120 |
|
| 121 |
Args:
|
|
|
|
| 122 |
text (str): The text to be synthesized to speech.
|
| 123 |
|
| 124 |
Returns:
|
| 125 |
-
|
| 126 |
-
and the raw binary audio data for playback.
|
| 127 |
|
| 128 |
Raises:
|
| 129 |
ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
|
| 130 |
"""
|
| 131 |
logger.debug(
|
| 132 |
-
f"Synthesizing speech
|
| 133 |
)
|
| 134 |
|
| 135 |
-
# Get a random voice as an enum member.
|
| 136 |
-
voice = elevenlabs_config.random_voice
|
| 137 |
-
logger.debug(f"Selected voice: {voice.voice_name}")
|
| 138 |
-
|
| 139 |
try:
|
| 140 |
# Synthesize speech using the ElevenLabs SDK
|
| 141 |
-
|
|
|
|
| 142 |
text=text,
|
| 143 |
-
voice_id=voice.voice_id,
|
| 144 |
-
model_id=elevenlabs_config.model_id,
|
| 145 |
-
output_format=elevenlabs_config.output_format,
|
| 146 |
)
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
logger.error("Invalid audio iterator response.")
|
| 154 |
-
raise ElevenLabsError(
|
| 155 |
-
"Invalid audio iterator received from ElevenLabs API."
|
| 156 |
-
) from iter_error
|
| 157 |
-
|
| 158 |
-
# Validate audio
|
| 159 |
-
if not audio:
|
| 160 |
-
logger.error("No audio data received from ElevenLabs API.")
|
| 161 |
-
raise ElevenLabsError("Empty audio data received from ElevenLabs API.")
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
|
|
|
| 165 |
|
| 166 |
except Exception as e:
|
| 167 |
-
logger.exception(f"Error synthesizing speech
|
| 168 |
raise ElevenLabsError(
|
| 169 |
-
message=f"Failed to synthesize speech
|
| 170 |
original_exception=e,
|
| 171 |
-
)
|
|
|
|
| 114 |
after=after_log(logger, logging.DEBUG),
|
| 115 |
reraise=True,
|
| 116 |
)
|
| 117 |
+
def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
|
| 118 |
"""
|
| 119 |
Synthesizes text to speech using the ElevenLabs TTS API.
|
| 120 |
|
| 121 |
Args:
|
| 122 |
+
prompt (str): The original user prompt used as the voice description.
|
| 123 |
text (str): The text to be synthesized to speech.
|
| 124 |
|
| 125 |
Returns:
|
| 126 |
+
bytes: The raw binary audio data for playback.
|
|
|
|
| 127 |
|
| 128 |
Raises:
|
| 129 |
ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
|
| 130 |
"""
|
| 131 |
logger.debug(
|
| 132 |
+
f"Synthesizing speech with ElevenLabs. Text length: {len(text)} characters."
|
| 133 |
)
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
try:
|
| 136 |
# Synthesize speech using the ElevenLabs SDK
|
| 137 |
+
response = elevenlabs_config.client.text_to_voice.create_previews(
|
| 138 |
+
voice_description=prompt,
|
| 139 |
text=text,
|
|
|
|
|
|
|
|
|
|
| 140 |
)
|
| 141 |
|
| 142 |
+
previews = response.previews
|
| 143 |
+
if not previews:
|
| 144 |
+
msg = "No previews returned by ElevenLabs API."
|
| 145 |
+
logger.error(msg)
|
| 146 |
+
raise ElevenLabsError(message=msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
+
base64_audio = previews[0].audio_base64
|
| 149 |
+
audio = base64.b64decode(base64_audio)
|
| 150 |
+
return audio
|
| 151 |
|
| 152 |
except Exception as e:
|
| 153 |
+
logger.exception(f"Error synthesizing speech with ElevenLabs: {e}")
|
| 154 |
raise ElevenLabsError(
|
| 155 |
+
message=f"Failed to synthesize speech with ElevenLabs: {e}",
|
| 156 |
original_exception=e,
|
| 157 |
+
) from e
|
src/integrations/hume_api.py
CHANGED
|
@@ -19,6 +19,7 @@ Functions:
|
|
| 19 |
"""
|
| 20 |
|
| 21 |
# Standard Library Imports
|
|
|
|
| 22 |
from dataclasses import dataclass
|
| 23 |
import logging
|
| 24 |
import random
|
|
@@ -33,17 +34,12 @@ from src.config import logger
|
|
| 33 |
from src.utils import validate_env_var, truncate_text
|
| 34 |
|
| 35 |
|
| 36 |
-
HumeVoiceName = Literal["ITO", "KORA", "STELLA", "DACHER"]
|
| 37 |
-
|
| 38 |
-
|
| 39 |
@dataclass(frozen=True)
|
| 40 |
class HumeConfig:
|
| 41 |
"""Immutable configuration for interacting with the Hume TTS API."""
|
| 42 |
|
| 43 |
api_key: str = validate_env_var("HUME_API_KEY")
|
| 44 |
-
tts_endpoint_url: str = "https://api.hume.ai/v0/tts"
|
| 45 |
-
voice_names: List[HumeVoiceName] = ("ITO", "KORA", "STELLA", "DACHER")
|
| 46 |
-
audio_format: str = "wav"
|
| 47 |
headers: dict = None
|
| 48 |
|
| 49 |
def __post_init__(self):
|
|
@@ -52,10 +48,6 @@ class HumeConfig:
|
|
| 52 |
raise ValueError("Hume API key is not set.")
|
| 53 |
if not self.tts_endpoint_url:
|
| 54 |
raise ValueError("Hume TTS endpoint URL is not set.")
|
| 55 |
-
if not self.voice_names:
|
| 56 |
-
raise ValueError("Hume voice names list is not set.")
|
| 57 |
-
if not self.audio_format:
|
| 58 |
-
raise ValueError("Hume audio format is not set.")
|
| 59 |
|
| 60 |
# Set headers dynamically after validation
|
| 61 |
object.__setattr__(
|
|
@@ -81,38 +73,31 @@ hume_config = HumeConfig()
|
|
| 81 |
|
| 82 |
|
| 83 |
@retry(
|
| 84 |
-
stop=stop_after_attempt(
|
| 85 |
wait=wait_fixed(2),
|
| 86 |
before=before_log(logger, logging.DEBUG),
|
| 87 |
after=after_log(logger, logging.DEBUG),
|
| 88 |
reraise=True,
|
| 89 |
)
|
| 90 |
-
def text_to_speech_with_hume(
|
| 91 |
-
prompt: str, text: str, voice_name: HumeVoiceName
|
| 92 |
-
) -> bytes:
|
| 93 |
"""
|
| 94 |
Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
|
| 95 |
|
| 96 |
Args:
|
| 97 |
-
prompt (str): The original user prompt
|
| 98 |
text (str): The generated text to be converted to speech.
|
| 99 |
-
voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
|
| 100 |
|
| 101 |
Returns:
|
| 102 |
-
voice_name: The name of the voice used for speech synthesis.
|
| 103 |
bytes: The raw binary audio data for playback.
|
| 104 |
|
| 105 |
Raises:
|
| 106 |
-
HumeError: If there is an error communicating with the Hume TTS API.
|
| 107 |
"""
|
| 108 |
logger.debug(
|
| 109 |
f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
|
| 110 |
)
|
| 111 |
|
| 112 |
-
request_body = {
|
| 113 |
-
"text": text,
|
| 114 |
-
"voice": {"name": voice_name},
|
| 115 |
-
}
|
| 116 |
|
| 117 |
try:
|
| 118 |
# Synthesize speech using the Hume TTS API
|
|
@@ -121,42 +106,30 @@ def text_to_speech_with_hume(
|
|
| 121 |
headers=hume_config.headers,
|
| 122 |
json=request_body,
|
| 123 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
f"Hume TTS API responded with status {response.status_code}: {response.text[:200]}"
|
| 132 |
-
)
|
| 133 |
-
|
| 134 |
-
# Process response audio
|
| 135 |
-
if response.headers.get("Content-Type", "").startswith("audio/"):
|
| 136 |
-
audio = response.content # Raw binary audio data
|
| 137 |
-
logger.info(f"Received audio data from Hume ({len(audio)} bytes).")
|
| 138 |
-
return voice_name, audio
|
| 139 |
-
|
| 140 |
-
raise HumeError(
|
| 141 |
-
f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}'
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
except Exception as e:
|
| 145 |
-
logger.exception(f"Error synthesizing speech from text with Hume: {e}")
|
| 146 |
-
raise HumeError(
|
| 147 |
-
message=f"Failed to synthesize speech from text with Hume: {e}",
|
| 148 |
-
original_exception=e,
|
| 149 |
-
)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
def get_random_hume_voice_names() -> Tuple[HumeVoiceName, HumeVoiceName]:
|
| 153 |
-
"""
|
| 154 |
-
Get two random Hume voice names.
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""
|
| 20 |
|
| 21 |
# Standard Library Imports
|
| 22 |
+
import base64
|
| 23 |
from dataclasses import dataclass
|
| 24 |
import logging
|
| 25 |
import random
|
|
|
|
| 34 |
from src.utils import validate_env_var, truncate_text
|
| 35 |
|
| 36 |
|
|
|
|
|
|
|
|
|
|
| 37 |
@dataclass(frozen=True)
|
| 38 |
class HumeConfig:
|
| 39 |
"""Immutable configuration for interacting with the Hume TTS API."""
|
| 40 |
|
| 41 |
api_key: str = validate_env_var("HUME_API_KEY")
|
| 42 |
+
tts_endpoint_url: str = "https://test-api.hume.ai/v0/tts/octave"
|
|
|
|
|
|
|
| 43 |
headers: dict = None
|
| 44 |
|
| 45 |
def __post_init__(self):
|
|
|
|
| 48 |
raise ValueError("Hume API key is not set.")
|
| 49 |
if not self.tts_endpoint_url:
|
| 50 |
raise ValueError("Hume TTS endpoint URL is not set.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# Set headers dynamically after validation
|
| 53 |
object.__setattr__(
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
@retry(
|
| 76 |
+
stop=stop_after_attempt(3),
|
| 77 |
wait=wait_fixed(2),
|
| 78 |
before=before_log(logger, logging.DEBUG),
|
| 79 |
after=after_log(logger, logging.DEBUG),
|
| 80 |
reraise=True,
|
| 81 |
)
|
| 82 |
+
def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
|
|
|
|
|
|
|
| 83 |
"""
|
| 84 |
Synthesizes text to speech using the Hume TTS API and decodes the base64-encoded audio from the JSON response.
|
| 85 |
|
| 86 |
Args:
|
| 87 |
+
prompt (str): The original user prompt to use as the description for generating the voice.
|
| 88 |
text (str): The generated text to be converted to speech.
|
|
|
|
| 89 |
|
| 90 |
Returns:
|
|
|
|
| 91 |
bytes: The raw binary audio data for playback.
|
| 92 |
|
| 93 |
Raises:
|
| 94 |
+
HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
|
| 95 |
"""
|
| 96 |
logger.debug(
|
| 97 |
f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
|
| 98 |
)
|
| 99 |
|
| 100 |
+
request_body = {"utterances": [{"text": text, "description": prompt}]}
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
try:
|
| 103 |
# Synthesize speech using the Hume TTS API
|
|
|
|
| 106 |
headers=hume_config.headers,
|
| 107 |
json=request_body,
|
| 108 |
)
|
| 109 |
+
response.raise_for_status()
|
| 110 |
+
except requests.RequestException as re:
|
| 111 |
+
logger.exception(f"Error communicating with Hume TTS API: {re}")
|
| 112 |
+
raise HumeError(f"Error communicating with Hume TTS API: {re}") from re
|
| 113 |
|
| 114 |
+
try:
|
| 115 |
+
# Parse JSON response
|
| 116 |
+
response_data = response.json()
|
| 117 |
+
except ValueError as ve:
|
| 118 |
+
logger.exception("Invalid JSON response from Hume TTS API")
|
| 119 |
+
raise HumeError("Invalid JSON response from Hume TTS API") from ve
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
+
try:
|
| 122 |
+
# Safely extract the generation result from the response JSON
|
| 123 |
+
generations = response_data.get("generations", [])
|
| 124 |
+
if not generations or "audio" not in generations[0]:
|
| 125 |
+
logger.error("Missing 'audio' data in the response.")
|
| 126 |
+
raise HumeError("Missing audio data in response from Hume TTS API")
|
| 127 |
+
base64_audio = generations[0]["audio"]
|
| 128 |
+
# Decode base64 encoded audio
|
| 129 |
+
audio = base64.b64decode(base64_audio)
|
| 130 |
+
except (KeyError, TypeError, base64.binascii.Error) as ae:
|
| 131 |
+
logger.exception(f"Error processing audio data: {ae}")
|
| 132 |
+
raise HumeError(f"Error processing audio data from Hume TTS API: {ae}") from ae
|
| 133 |
+
|
| 134 |
+
logger.info(f"Received audio data from Hume ({len(audio)} bytes).")
|
| 135 |
+
return audio
|
src/types.py
CHANGED
|
@@ -9,27 +9,14 @@ has a consistent structure including both the provider and the associated voice.
|
|
| 9 |
from typing import TypedDict, Literal, Dict
|
| 10 |
|
| 11 |
|
| 12 |
-
TTSProviderName = Literal["Hume AI", "ElevenLabs"
|
| 13 |
"""TTSProviderName represents the allowed provider names for TTS services."""
|
| 14 |
|
| 15 |
|
| 16 |
-
class OptionDetails(TypedDict):
|
| 17 |
-
"""
|
| 18 |
-
A typed dictionary representing the details of an option.
|
| 19 |
-
|
| 20 |
-
Attributes:
|
| 21 |
-
provider (TTSProviderName): The name of the provider (either 'Hume AI' or 'ElevenLabs').
|
| 22 |
-
voice (str): The name of the voice associated with the option.
|
| 23 |
-
"""
|
| 24 |
-
|
| 25 |
-
provider: TTSProviderName
|
| 26 |
-
voice: str
|
| 27 |
-
|
| 28 |
-
|
| 29 |
OptionKey = Literal["Option A", "Option B"]
|
| 30 |
"""OptionKey is restricted to the literal values 'Option A' or 'Option B'."""
|
| 31 |
|
| 32 |
|
| 33 |
-
OptionMap = Dict[OptionKey,
|
| 34 |
"""OptionMap defines the structure of the options mapping, where each key is an OptionKey
|
| 35 |
and the value is an OptionDetails dictionary."""
|
|
|
|
| 9 |
from typing import TypedDict, Literal, Dict
|
| 10 |
|
| 11 |
|
| 12 |
+
TTSProviderName = Literal["Hume AI", "ElevenLabs"]
|
| 13 |
"""TTSProviderName represents the allowed provider names for TTS services."""
|
| 14 |
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
OptionKey = Literal["Option A", "Option B"]
|
| 17 |
"""OptionKey is restricted to the literal values 'Option A' or 'Option B'."""
|
| 18 |
|
| 19 |
|
| 20 |
+
OptionMap = Dict[OptionKey, TTSProviderName]
|
| 21 |
"""OptionMap defines the structure of the options mapping, where each key is an OptionKey
|
| 22 |
and the value is the TTSProviderName of the provider that generated that option."""
|