Spaces:
Running
Running
File size: 7,246 Bytes
7f15e1c 3fde6b6 7f15e1c 3fde6b6 7f15e1c 3fde6b6 7f15e1c 3fde6b6 7f15e1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
"""
Text-to-Speech (TTS) Service using Deepgram API
"""
import asyncio
import base64
import functools
import os
import re
from typing import Optional

import requests

from src.utils.logger import logger
class TTSService:
    """Text-to-speech client for the Deepgram ``/v1/speak`` endpoint.

    The service turns chat/markdown-style text into plain prose
    (:meth:`clean_text_for_speech`), posts it to Deepgram and returns the
    synthesized audio as a base64 string together with metadata
    (:meth:`text_to_speech`).
    """

    # Maximum number of characters sent in one request. The ellipsis that
    # marks truncated text is counted inside this budget.
    MAX_TEXT_LENGTH = 2000
    _TRUNCATION_SUFFIX = "..."
    _REQUEST_TIMEOUT_SECONDS = 30

    # Output format -> (extra query parameters, MIME type reported to callers).
    # Parameter names/values follow the Deepgram speak API (`encoding`,
    # `container`); formats not listed here are forwarded as `encoding`.
    _FORMAT_PROFILES = {
        "mp3": ({"encoding": "mp3"}, "audio/mpeg"),
        "wav": ({"encoding": "linear16", "container": "wav"}, "audio/wav"),
    }

    # Markdown markers are unwrapped in this order: bold must run before
    # italic, otherwise the italic rule would consume the inner asterisks.
    _MARKDOWN_RULES = (
        (re.compile(r'\*\*(.*?)\*\*'), r'\1'),       # **bold**
        (re.compile(r'\*(.*?)\*'), r'\1'),           # *italic*
        (re.compile(r'`(.*?)`'), r'\1'),             # `code`
        # Header markers only count when they start a token, so that
        # "C# is" or "F# and" keep their '#'.
        (re.compile(r'(?<!\S)#{1,6}\s+'), ''),       # # / ## / ### ...
        (re.compile(r'\[(.*?)\]\(.*?\)'), r'\1'),    # [text](url) -> text
    )

    _EMOJI_RE = re.compile(
        r'['
        r'\U0001F600-\U0001F64F'  # emoticons
        r'\U0001F300-\U0001F5FF'  # misc symbols and pictographs
        r'\U0001F680-\U0001F6FF'  # transport & map
        r'\U0001F1E0-\U0001F1FF'  # regional indicators
        r'\U00002600-\U000026FF'  # misc symbols
        r'\U00002700-\U000027BF'  # dingbats
        r'\U0000FE00-\U0000FE0F'  # variation selectors
        r'\U0001F900-\U0001F9FF'  # supplemental symbols
        r']'
    )

    # Typographic punctuation -> ASCII. Written with escapes so the table
    # survives editors/scrapers that normalise "smart" characters.
    _PUNCTUATION_MAP = str.maketrans({
        '\u201c': '"', '\u201d': '"',   # curly double quotes
        '\u2018': "'", '\u2019': "'",   # curly single quotes / apostrophe
        '\u2013': '-', '\u2014': '-',   # en / em dash
        '\u2026': '...',                # ellipsis character
        '\u00ab': '"', '\u00bb': '"',   # double angle quotes
        '\u2039': "'", '\u203a': "'",   # single angle quotes
    })

    _WHITESPACE_RE = re.compile(r'\s+')
    # Zero-width characters plus C0/C1 control characters.
    _INVISIBLE_RE = re.compile(
        r'[\u200B-\u200D\uFEFF\u0000-\u001F\u007F-\u009F]'
    )

    _PUNCTUATION_RUN_RULES = (
        (re.compile(r'\.{3,}'), '...'),  # long dot runs -> ellipsis
        (re.compile(r'!{2,}'), '!'),     # !!! -> !
        (re.compile(r'\?{2,}'), '?'),    # ??? -> ?
    )
    _SENTENCE_GAP_RE = re.compile(r'([.!?])\s*([A-Z])')

    def __init__(self):
        """Read the API key from the environment and set endpoint defaults.

        Raises:
            ValueError: If the ``YOUR_DEEPGRAM_API_KEY`` environment
                variable is unset or empty.
        """
        # NOTE(review): the variable name looks like a template placeholder;
        # confirm it matches the secret name configured for the deployment.
        self.api_key = os.getenv("YOUR_DEEPGRAM_API_KEY")
        self.base_url = "https://api.deepgram.com/v1/speak"
        self.default_model = "aura-2-thalia-en"

        if not self.api_key:
            logger.error("Deepgram API key not found in environment variables")
            raise ValueError("Deepgram API key is required")

    def clean_text_for_speech(self, text: str) -> str:
        """Normalise text so it can be read aloud cleanly.

        Strips markdown markers, emoji and invisible characters, converts
        typographic punctuation to ASCII, collapses whitespace and repeated
        punctuation, and puts exactly one space between a sentence
        terminator and a following capital letter.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text, or ``""`` when ``text`` is empty or not a
            string.
        """
        if not text or not isinstance(text, str):
            return ""

        # Markdown is unwrapped while line breaks are still present.
        for pattern, replacement in self._MARKDOWN_RULES:
            text = pattern.sub(replacement, text)

        text = self._EMOJI_RE.sub('', text)
        text = text.translate(self._PUNCTUATION_MAP)

        # Whitespace is folded to plain spaces *before* control characters
        # are stripped: \n, \r and \t are control characters themselves, and
        # deleting them outright would glue neighbouring words together.
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._INVISIBLE_RE.sub('', text)
        # Removing invisible characters can leave doubled spaces behind.
        text = self._WHITESPACE_RE.sub(' ', text).strip()

        for pattern, replacement in self._PUNCTUATION_RUN_RULES:
            text = pattern.sub(replacement, text)

        return self._SENTENCE_GAP_RE.sub(r'\1 \2', text)

    @classmethod
    def _truncate_text(cls, text: str) -> str:
        """Shorten ``text`` to at most ``MAX_TEXT_LENGTH`` characters.

        Truncated text ends with an ellipsis, and the ellipsis is part of
        the length budget so the result never exceeds the limit.
        """
        if len(text) <= cls.MAX_TEXT_LENGTH:
            return text
        keep = cls.MAX_TEXT_LENGTH - len(cls._TRUNCATION_SUFFIX)
        return text[:keep].rstrip() + cls._TRUNCATION_SUFFIX

    @classmethod
    def _resolve_format(cls, format):
        """Return ``(query_params, mime_type)`` for the requested format."""
        profile = cls._FORMAT_PROFILES.get(format)
        if profile is not None:
            params, mime_type = profile
            return dict(params), mime_type
        # Unknown formats are forwarded as-is so the API either produces
        # that encoding or rejects the request, instead of the result being
        # labelled with a format that was never requested.
        return {"encoding": format}, f"audio/{format}"

    async def text_to_speech(
        self,
        text: str,
        model: Optional[str] = None,
        format: str = "mp3"
    ) -> Optional[dict]:
        """Convert text to speech using the Deepgram API.

        The blocking HTTP call runs on the event loop's default executor so
        other coroutines keep running while the request is in flight.

        Args:
            text: The text to convert to speech.
            model: The TTS model to use; defaults to ``self.default_model``.
            format: Audio format to request (default: ``"mp3"``).

        Returns:
            A dict with keys ``audio_data`` (base64 string), ``mime_type``,
            ``format``, ``text`` (the cleaned text actually sent),
            ``model`` and ``size_bytes``; or ``None`` if the input is empty
            or the request fails. Failures are logged, not raised.
        """
        try:
            if not text or not text.strip():
                logger.warning("Empty text provided for TTS conversion")
                return None

            cleaned_text = self.clean_text_for_speech(text)
            if not cleaned_text or not cleaned_text.strip():
                logger.warning("Text became empty after cleaning for TTS")
                return None

            if len(cleaned_text) > self.MAX_TEXT_LENGTH:
                cleaned_text = self._truncate_text(cleaned_text)
                logger.warning(
                    f"Text truncated to {self.MAX_TEXT_LENGTH} characters for TTS"
                )

            model_name = model or self.default_model
            format_params, mime_type = self._resolve_format(format)
            querystring = {"model": model_name}
            querystring.update(format_params)
            payload = {"text": cleaned_text}
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "application/json"
            }

            logger.info(f"Converting text to speech: '{cleaned_text[:100]}...' (original: '{text[:50]}...')")

            # requests is synchronous; hand it to a worker thread.
            send_request = functools.partial(
                requests.post,
                self.base_url,
                json=payload,
                headers=headers,
                params=querystring,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(None, send_request)

            if response.status_code != 200:
                logger.error(f"Deepgram TTS API error: {response.status_code} - {response.text}")
                return None

            audio_data = response.content
            result = {
                "audio_data": base64.b64encode(audio_data).decode('utf-8'),
                "mime_type": mime_type,
                "format": format,
                "text": cleaned_text,
                "model": model_name,
                "size_bytes": len(audio_data)
            }
            logger.info(f"TTS conversion successful: {len(audio_data)} bytes")
            return result

        except requests.exceptions.Timeout:
            logger.error("TTS request timed out")
            return None
        except requests.exceptions.RequestException as e:
            logger.error(f"TTS request failed: {str(e)}")
            return None
        except Exception as e:
            # Boundary handler: callers rely on None rather than exceptions.
            logger.error(f"Unexpected error in TTS conversion: {str(e)}")
            return None

    def is_available(self) -> bool:
        """Return True when an API key has been configured."""
        return bool(self.api_key)
# Global TTS service instance.
# NOTE: constructed at import time, so importing this module raises
# ValueError when the Deepgram API key environment variable is not set.
tts_service = TTSService()