Run_code_api / src /services /tts_service.py
ABAO77's picture
feat: add text cleaning functionality for TTS service to enhance input processing
3fde6b6
raw
history blame
7.25 kB
"""
Text-to-Speech (TTS) Service using Deepgram API
"""
import asyncio
import base64
import functools
import os
import re
from typing import Optional

import requests

from src.utils.logger import logger
class TTSService:
    """Service for handling text-to-speech conversion using the Deepgram API.

    The service reads its API key from the environment on construction,
    normalizes input text so it is safe to synthesize, and returns the
    generated audio as a base64 string together with some metadata.
    """

    # Upper bound on the number of characters sent in a single request,
    # including the truncation suffix.
    MAX_TEXT_LENGTH = 2000
    _TRUNCATION_SUFFIX = "..."

    # Seconds to wait for the HTTP request before giving up.
    REQUEST_TIMEOUT_SECONDS = 30

    # Requested format -> (extra query parameters, MIME type of the result).
    # Parameter names/values follow the Deepgram /v1/speak reference.
    _FORMAT_SETTINGS = {
        "mp3": ({"encoding": "mp3"}, "audio/mpeg"),
        "wav": ({"encoding": "linear16", "container": "wav"}, "audio/wav"),
        "flac": ({"encoding": "flac"}, "audio/flac"),
        "aac": ({"encoding": "aac"}, "audio/aac"),
    }

    # Markdown constructs that are unwrapped (or dropped) before synthesis.
    # Order matters: bold must be handled before italic.
    _MARKDOWN_RULES = (
        (re.compile(r'\*\*(.*?)\*\*'), r'\1'),       # **bold**
        (re.compile(r'\*(.*?)\*'), r'\1'),           # *italic*
        (re.compile(r'`(.*?)`'), r'\1'),             # `code`
        (re.compile(r'#{1,6}\s'), ''),               # # headers
        (re.compile(r'\[(.*?)\]\(.*?\)'), r'\1'),    # [text](url) -> text
    )

    # Emoji and pictographic ranges that are removed entirely.
    _EMOJI_PATTERN = re.compile(
        r'['
        r'\U0001F600-\U0001F64F'  # emoticons
        r'\U0001F300-\U0001F5FF'  # misc symbols and pictographs
        r'\U0001F680-\U0001F6FF'  # transport and map
        r'\U0001F1E0-\U0001F1FF'  # regional indicators
        r'\u2600-\u26FF'          # misc symbols
        r'\u2700-\u27BF'          # dingbats
        r'\uFE00-\uFE0F'          # variation selectors
        r'\U0001F900-\U0001F9FF'  # supplemental symbols
        r']'
    )

    # Single-character substitutions applied in one pass: typographic
    # punctuation becomes its ASCII equivalent, zero-width characters vanish.
    _CHAR_TRANSLATION = str.maketrans({
        '\u201C': '"', '\u201D': '"',    # curly double quotes
        '\u2018': "'", '\u2019': "'",    # curly single quotes / apostrophe
        '\u2013': '-', '\u2014': '-',    # en and em dash
        '\u2026': '...',                 # ellipsis character
        '\u00AB': '"', '\u00BB': '"',    # double angle quotes
        '\u2039': "'", '\u203A': "'",    # single angle quotes
        '\u200B': None, '\u200C': None, '\u200D': None, '\uFEFF': None,
    })

    # Control characters to drop. Tab, newline, vertical tab, form feed and
    # carriage return (U+0009-U+000D) are deliberately NOT listed: they are
    # word separators and must become a space, not disappear.
    _CONTROL_PATTERN = re.compile(r'[\u0000-\u0008\u000E-\u001F\u007F-\u009F]')

    _WHITESPACE_PATTERN = re.compile(r'\s+')
    _REPEATED_DOTS = re.compile(r'\.{3,}')
    _REPEATED_EXCLAMATIONS = re.compile(r'!{2,}')
    _REPEATED_QUESTIONS = re.compile(r'\?{2,}')
    _SENTENCE_BOUNDARY = re.compile(r'([.!?])\s*([A-Z])')

    def __init__(self):
        """Read the API key from the environment and set request defaults.

        Raises:
            ValueError: If the API key environment variable is unset or empty.
        """
        # NOTE(review): the variable name looks like a template placeholder;
        # confirm it matches what the deployment actually exports.
        self.api_key = os.getenv("YOUR_DEEPGRAM_API_KEY")
        self.base_url = "https://api.deepgram.com/v1/speak"
        self.default_model = "aura-2-thalia-en"
        if not self.api_key:
            logger.error("Deepgram API key not found in environment variables")
            raise ValueError("Deepgram API key is required")

    def clean_text_for_speech(self, text: str) -> str:
        """
        Clean text for speech synthesis by removing problematic characters.

        Markdown markers are unwrapped, emoji are removed, typographic
        punctuation is replaced by ASCII equivalents, control and zero-width
        characters are dropped, whitespace (including line breaks) is
        collapsed to single spaces, and runs of ``.``, ``!`` and ``?`` are
        shortened.

        Args:
            text (str): The text to clean

        Returns:
            str: Cleaned text suitable for speech synthesis; an empty string
            when ``text`` is empty or not a string.
        """
        if not text or not isinstance(text, str):
            return ""

        for pattern, replacement in self._MARKDOWN_RULES:
            text = pattern.sub(replacement, text)

        text = self._EMOJI_PATTERN.sub('', text)
        text = text.translate(self._CHAR_TRANSLATION)
        text = self._CONTROL_PATTERN.sub('', text)

        # Line breaks and tabs survive the step above and are folded into a
        # single space here, so adjacent lines do not get glued together.
        text = self._WHITESPACE_PATTERN.sub(' ', text).strip()

        text = self._REPEATED_DOTS.sub('...', text)
        text = self._REPEATED_EXCLAMATIONS.sub('!', text)
        text = self._REPEATED_QUESTIONS.sub('?', text)

        # Guarantee exactly one space between sentence-ending punctuation and
        # a following capital letter. This also splits dotted abbreviations
        # such as "U.S.A" into "U. S. A".
        text = self._SENTENCE_BOUNDARY.sub(r'\1 \2', text)
        return text

    async def text_to_speech(
        self,
        text: str,
        model: Optional[str] = None,
        format: str = "mp3"
    ) -> Optional[dict]:
        """
        Convert text to speech using the Deepgram API.

        The text is cleaned with ``clean_text_for_speech`` and truncated to
        ``MAX_TEXT_LENGTH`` characters (suffix included). The blocking HTTP
        call runs in the event loop's default executor so other coroutines
        keep running while the request is in flight.

        Args:
            text (str): The text to convert to speech
            model (str): The TTS model to use (default: aura-2-thalia-en)
            format (str): Audio format (default: mp3). Formats listed in
                ``_FORMAT_SETTINGS`` are requested explicitly from the API;
                for any other value no encoding is requested and the MIME
                type is reported as ``audio/<format>``.

        Returns:
            dict: ``audio_data`` (base64 string), ``mime_type``, ``format``,
            ``text`` (the cleaned text that was sent), ``model`` and
            ``size_bytes``; or None if the input was empty or the request
            failed. Errors are logged, never raised.
        """
        try:
            if not text or not text.strip():
                logger.warning("Empty text provided for TTS conversion")
                return None

            # Clean and prepare text
            cleaned_text = self.clean_text_for_speech(text)
            if not cleaned_text or not cleaned_text.strip():
                logger.warning("Text became empty after cleaning for TTS")
                return None

            if len(cleaned_text) > self.MAX_TEXT_LENGTH:
                # Reserve room for the suffix so the final string never
                # exceeds the limit.
                keep = self.MAX_TEXT_LENGTH - len(self._TRUNCATION_SUFFIX)
                cleaned_text = cleaned_text[:keep].rstrip() + self._TRUNCATION_SUFFIX
                logger.warning(f"Text truncated to {self.MAX_TEXT_LENGTH} characters for TTS")

            # Resolve the requested audio format
            format_settings = self._FORMAT_SETTINGS.get(format)
            if format_settings is None:
                logger.warning(
                    f"Unrecognized TTS format '{format}'; no encoding requested from the API"
                )
                encoding_params, mime_type = {}, f"audio/{format}"
            else:
                encoding_params, mime_type = format_settings

            # Prepare request
            selected_model = model or self.default_model
            querystring = {"model": selected_model, **encoding_params}
            payload = {"text": cleaned_text}
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "application/json"
            }

            logger.info(f"Converting text to speech: '{cleaned_text[:100]}...' (original: '{text[:50]}...')")

            # requests is synchronous; run it in a worker thread so this
            # coroutine does not stall the event loop for up to the timeout.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                functools.partial(
                    requests.post,
                    self.base_url,
                    json=payload,
                    headers=headers,
                    params=querystring,
                    timeout=self.REQUEST_TIMEOUT_SECONDS,
                ),
            )

            if response.status_code != 200:
                logger.error(f"Deepgram TTS API error: {response.status_code} - {response.text}")
                return None

            # Encode audio data as base64
            audio_data = response.content
            audio_base64 = base64.b64encode(audio_data).decode('utf-8')

            result = {
                "audio_data": audio_base64,
                "mime_type": mime_type,
                "format": format,
                "text": cleaned_text,
                "model": selected_model,
                "size_bytes": len(audio_data)
            }
            logger.info(f"TTS conversion successful: {len(audio_data)} bytes")
            return result

        except requests.exceptions.Timeout:
            logger.error("TTS request timed out")
            return None
        except requests.exceptions.RequestException as e:
            logger.error(f"TTS request failed: {str(e)}")
            return None
        except Exception as e:
            # Boundary handler: callers expect None on any failure.
            logger.error(f"Unexpected error in TTS conversion: {str(e)}")
            return None

    def is_available(self) -> bool:
        """Check if TTS service is available (i.e. an API key is set)."""
        return bool(self.api_key)
# Global TTS service instance, created once when this module is imported.
# NOTE: TTSService.__init__ raises ValueError when the Deepgram API key
# environment variable is unset, so importing this module fails in that case.
tts_service = TTSService()