Spaces:

ABAO77
/

Run_code_api

Sleeping

App Files Files Community

Run_code_api / src /services /tts_service.py

ABAO77

feat: add text cleaning functionality for TTS service to enhance input processing

3fde6b6 2 months ago

raw

history blame

7.25 kB

	"""
	Text-to-Speech (TTS) Service using Deepgram API
	"""

	import requests
	import os
	import base64
	import re
	from src.utils.logger import logger
	from typing import Optional

	class TTSService:
	    """Text-to-speech service backed by the Deepgram ``/v1/speak`` HTTP endpoint.

	    The service cleans free-form text (markdown, emojis, typographic
	    punctuation, control characters) and posts it to Deepgram, returning the
	    synthesized audio as a base64 string plus metadata.
	    """

	    # Upper bound on the number of characters sent in a single request.
	    # NOTE(review): presumably mirrors Deepgram's per-request input limit - confirm.
	    MAX_TEXT_LENGTH = 2000

	    # Marker appended to text that had to be cut; it counts towards the limit.
	    _TRUNCATION_SUFFIX = "..."

	    # Emoji, pictographs and symbol blocks that should never be read aloud.
	    _EMOJI_RE = re.compile(
	        r'['
	        r'\U0001F600-\U0001F64F'  # emoticons
	        r'\U0001F300-\U0001F5FF'  # misc symbols and pictographs
	        r'\U0001F680-\U0001F6FF'  # transport and map symbols
	        r'\U0001F1E0-\U0001F1FF'  # regional indicators (flags)
	        r'\U00002600-\U000026FF'  # misc symbols
	        r'\U00002700-\U000027BF'  # dingbats
	        r'\U0000FE00-\U0000FE0F'  # variation selectors
	        r'\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
	        r']'
	    )

	    def __init__(self):
	        """Read the API key from the environment and set endpoint defaults.

	        Raises:
	            ValueError: If the ``YOUR_DEEPGRAM_API_KEY`` environment variable
	                is missing or empty.
	        """
	        self.api_key = os.getenv("YOUR_DEEPGRAM_API_KEY")
	        self.base_url = "https://api.deepgram.com/v1/speak"
	        self.default_model = "aura-2-thalia-en"

	        if not self.api_key:
	            logger.error("Deepgram API key not found in environment variables")
	            raise ValueError("Deepgram API key is required")

	    def clean_text_for_speech(self, text: str) -> str:
	        """Normalize text so it can be fed to speech synthesis.

	        Strips markdown markup, emojis, zero-width and control characters,
	        replaces typographic punctuation with ASCII equivalents, collapses
	        whitespace and repeated punctuation.

	        Args:
	            text: The text to clean.

	        Returns:
	            The cleaned text, or an empty string when ``text`` is empty or
	            not a ``str``.
	        """
	        if not text or not isinstance(text, str):
	            return ""

	        # Markdown: keep the inner text, drop the markup. Bold must be
	        # handled before italic so that "**x**" is not half-consumed.
	        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)        # **bold**
	        text = re.sub(r'\*(.*?)\*', r'\1', text)            # *italic*
	        text = re.sub(r'`(.*?)`', r'\1', text)              # `code`
	        text = re.sub(r'#{1,6}\s', '', text)                # "# ", "## ", ... headers
	        text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)     # [text](url) -> text

	        # Emojis and pictographic symbols.
	        text = self._EMOJI_RE.sub('', text)

	        # Typographic punctuation -> plain ASCII. Escapes are used instead of
	        # literal characters so the patterns survive any re-encoding of this
	        # file. Curly single quotes become an apostrophe so contractions
	        # such as "it's" stay intact.
	        text = re.sub(r'[\u201C\u201D]', '"', text)         # curly double quotes
	        text = re.sub(r'[\u2018\u2019]', "'", text)         # curly single quotes
	        text = re.sub(r'[\u2013\u2014]', '-', text)         # en / em dash
	        text = re.sub(r'[\u2026]', '...', text)             # ellipsis character
	        text = re.sub(r'[\u00AB\u00BB]', '"', text)         # double angle quotes
	        text = re.sub(r'[\u2039\u203A]', "'", text)         # single angle quotes

	        # Zero-width characters carry no speech content.
	        text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)

	        # Line breaks and tabs are control characters too; turn them into
	        # spaces *before* stripping controls, otherwise the words on either
	        # side would be glued together ("Hello\nworld" -> "Helloworld").
	        text = re.sub(r'[\t\n\r\f\v]', ' ', text)
	        text = re.sub(r'[\u0000-\u001F\u007F-\u009F]', '', text)

	        # Collapse runs of whitespace and trim the ends.
	        text = re.sub(r'\s+', ' ', text).strip()

	        # Repeated punctuation.
	        text = re.sub(r'\.{3,}', '...', text)
	        text = re.sub(r'!{2,}', '!', text)
	        text = re.sub(r'\?{2,}', '?', text)

	        # Exactly one space between a sentence end and the next capital.
	        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

	        return text

	    async def text_to_speech(
	        self,
	        text: str,
	        model: Optional[str] = None,
	        format: str = "mp3"
	    ) -> Optional[dict]:
	        """Convert text to speech using the Deepgram API.

	        Args:
	            text: The text to convert to speech.
	            model: The TTS model to use; falls back to ``self.default_model``.
	            format: Audio format label used for the returned ``format`` and
	                ``mime_type`` fields (default ``"mp3"``). It is not sent to
	                the API by this method.

	        Returns:
	            A dict with keys ``audio_data`` (base64 string), ``mime_type``,
	            ``format``, ``text`` (the cleaned text that was sent), ``model``
	            and ``size_bytes``; or ``None`` when the input is empty, the API
	            responds with a non-200 status, or any error occurs.
	        """
	        # Imported here so the fix is self-contained within this class.
	        import asyncio
	        import functools

	        try:
	            if not text or not text.strip():
	                logger.warning("Empty text provided for TTS conversion")
	                return None

	            cleaned_text = self.clean_text_for_speech(text)
	            if not cleaned_text:
	                logger.warning("Text became empty after cleaning for TTS")
	                return None

	            if len(cleaned_text) > self.MAX_TEXT_LENGTH:
	                # Reserve room for the suffix so the final text really is
	                # at most MAX_TEXT_LENGTH characters long.
	                keep = self.MAX_TEXT_LENGTH - len(self._TRUNCATION_SUFFIX)
	                cleaned_text = cleaned_text[:keep] + self._TRUNCATION_SUFFIX
	                logger.warning("Text truncated to 2000 characters for TTS")

	            selected_model = model or self.default_model
	            querystring = {"model": selected_model}
	            payload = {"text": cleaned_text}
	            headers = {
	                "Authorization": f"Token {self.api_key}",
	                "Content-Type": "application/json"
	            }

	            logger.info(f"Converting text to speech: '{cleaned_text[:100]}...' (original: '{text[:50]}...')")

	            # requests is synchronous; run it in the default executor so the
	            # event loop is not blocked for up to the 30 s timeout.
	            loop = asyncio.get_running_loop()
	            response = await loop.run_in_executor(
	                None,
	                functools.partial(
	                    requests.post,
	                    self.base_url,
	                    json=payload,
	                    headers=headers,
	                    params=querystring,
	                    timeout=30,
	                ),
	            )

	            if response.status_code != 200:
	                logger.error(f"Deepgram TTS API error: {response.status_code} - {response.text}")
	                return None

	            audio_data = response.content
	            audio_base64 = base64.b64encode(audio_data).decode('utf-8')

	            # Well-known formats get their registered MIME type; anything
	            # else falls back to "audio/<format>".
	            known_mime_types = {"mp3": "audio/mpeg", "wav": "audio/wav"}
	            mime_type = known_mime_types.get(format, f"audio/{format}")

	            logger.info(f"TTS conversion successful: {len(audio_data)} bytes")
	            return {
	                "audio_data": audio_base64,
	                "mime_type": mime_type,
	                "format": format,
	                "text": cleaned_text,
	                "model": selected_model,
	                "size_bytes": len(audio_data)
	            }

	        except requests.exceptions.Timeout:
	            logger.error("TTS request timed out")
	            return None
	        except requests.exceptions.RequestException as e:
	            logger.error(f"TTS request failed: {str(e)}")
	            return None
	        except Exception as e:
	            # Boundary handler: callers receive None instead of an exception.
	            logger.error(f"Unexpected error in TTS conversion: {str(e)}")
	            return None

	    def is_available(self) -> bool:
	        """Return True when an API key has been configured."""
	        return bool(self.api_key)

	# Global TTS service instance, created once when this module is imported.
	# NOTE: TTSService() raises ValueError if the Deepgram API key environment
	# variable is not set, so importing this module fails in that case.
	tts_service = TTSService()