File size: 7,246 Bytes
7f15e1c
 
 
 
 
 
 
3fde6b6
7f15e1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fde6b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f15e1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fde6b6
 
 
 
 
 
7f15e1c
 
 
 
 
 
 
 
 
 
 
 
 
3fde6b6
7f15e1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""
Text-to-Speech (TTS) Service using Deepgram API
"""

import requests
import os
import base64
import re
from src.utils.logger import logger
from typing import Optional

class TTSService:
    """Service for handling text-to-speech conversion using Deepgram API.

    The service reads its API key from the ``YOUR_DEEPGRAM_API_KEY``
    environment variable, normalises text so it is safe to synthesise, and
    posts it to Deepgram's ``/v1/speak`` endpoint.
    """

    # Emoji / pictograph ranges that are stripped before synthesis.
    _EMOJI_RE = re.compile(
        r'['
        r'\U0001F600-\U0001F64F'  # Emoticons
        r'\U0001F300-\U0001F5FF'  # Misc symbols and pictographs
        r'\U0001F680-\U0001F6FF'  # Transport & map
        r'\U0001F1E0-\U0001F1FF'  # Regional indicators
        r'\u2600-\u26FF'          # Misc symbols
        r'\u2700-\u27BF'          # Dingbats
        r'\uFE00-\uFE0F'          # Variation selectors
        r'\U0001F900-\U0001F9FF'  # Supplemental symbols
        r']'
    )

    # Typographic punctuation -> plain ASCII. Written with explicit code
    # points so the table cannot be silently flattened to ASCII by an editor
    # or copy/paste (which would turn the replacement into a no-op).
    _PUNCTUATION_MAP = str.maketrans({
        "\u201C": '"',    # left double quotation mark
        "\u201D": '"',    # right double quotation mark
        "\u2018": "'",    # left single quotation mark
        "\u2019": "'",    # right single quotation mark / apostrophe
        "\u2013": "-",    # en dash
        "\u2014": "-",    # em dash
        "\u2026": "...",  # horizontal ellipsis
        "\u00AB": '"',    # left angle quote
        "\u00BB": '"',    # right angle quote
        "\u2039": "'",    # single left angle quote
        "\u203A": "'",    # single right angle quote
    })

    # Audio formats whose MIME type is not simply "audio/<format>".
    _MIME_TYPES = {"mp3": "audio/mpeg", "wav": "audio/wav"}

    def __init__(self):
        """
        Read the Deepgram configuration from the environment.

        Raises:
            ValueError: If the YOUR_DEEPGRAM_API_KEY environment variable is
                unset or empty.
        """
        self.api_key = os.getenv("YOUR_DEEPGRAM_API_KEY")
        self.base_url = "https://api.deepgram.com/v1/speak"
        self.default_model = "aura-2-thalia-en"

        if not self.api_key:
            logger.error("Deepgram API key not found in environment variables")
            raise ValueError("Deepgram API key is required")

    def clean_text_for_speech(self, text: str) -> str:
        """
        Clean text for speech synthesis by removing problematic characters

        Markdown markers, emoji, zero-width and control characters are
        removed; typographic punctuation is replaced by ASCII equivalents;
        whitespace (including newlines and tabs) is collapsed to single
        spaces; runs of repeated punctuation are shortened.

        Args:
            text (str): The text to clean

        Returns:
            str: Cleaned text suitable for speech synthesis. Empty string when
                the input is empty or not a string.
        """
        if not text or not isinstance(text, str):
            return ""

        # Remove markdown formatting
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # Remove bold **text**
        text = re.sub(r'\*(.*?)\*', r'\1', text)      # Remove italic *text*
        text = re.sub(r'`(.*?)`', r'\1', text)        # Remove code `text`
        # Headers only count at the start of a line; an unanchored pattern
        # would also eat "# " in the middle of a sentence (e.g. "C# is").
        text = re.sub(r'^[ \t]*#{1,6}\s', '', text, flags=re.MULTILINE)
        text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)  # [text](url) -> text

        # Remove emojis and special unicode characters
        text = self._EMOJI_RE.sub('', text)

        # Replace smart quotes, dashes, ellipsis and angle quotes
        text = text.translate(self._PUNCTUATION_MAP)

        # Remove zero-width characters
        text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
        # Remove control characters, but keep tab/newline/vertical tab/form
        # feed/carriage return (U+0009-U+000D): they separate words and are
        # turned into spaces below. Deleting them would glue words together.
        text = re.sub(r'[\u0000-\u0008\u000E-\u001F\u007F-\u009F]', '', text)

        # Clean up extra whitespace
        text = re.sub(r'\s+', ' ', text)       # Any whitespace run -> one space
        text = text.strip()                    # Trim leading/trailing spaces

        # Remove multiple consecutive punctuation
        text = re.sub(r'\.{3,}', '...', text)  # Multiple dots to ellipsis
        text = re.sub(r'!{2,}', '!', text)     # Multiple exclamations to single
        text = re.sub(r'\?{2,}', '?', text)    # Multiple questions to single

        # Ensure exactly one space between sentence-ending punctuation and a
        # following capital letter.
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

        return text

    async def text_to_speech(
        self,
        text: str,
        model: Optional[str] = None,
        format: str = "mp3"
    ) -> Optional[dict]:
        """
        Convert text to speech using Deepgram API

        The text is cleaned with clean_text_for_speech and truncated to 2000
        characters (plus a trailing "...") before being sent. The HTTP call
        runs in the event loop's default executor so this coroutine does not
        block the loop while waiting for Deepgram.

        Args:
            text (str): The text to convert to speech
            model (str): The TTS model to use (default: aura-2-thalia-en)
            format (str): Audio format (default: mp3). Only used to label the
                result (mime_type / format keys).

        Returns:
            dict: Contains audio data and metadata (keys: audio_data as a
                base64 string, mime_type, format, text, model, size_bytes),
                or None if failed. Failures are logged, never raised.
        """
        # Local imports keep this block self-contained.
        import asyncio
        import functools

        try:
            if not text or not text.strip():
                logger.warning("Empty text provided for TTS conversion")
                return None

            # Clean and prepare text
            cleaned_text = self.clean_text_for_speech(text)

            if not cleaned_text or not cleaned_text.strip():
                logger.warning("Text became empty after cleaning for TTS")
                return None

            if len(cleaned_text) > 2000:  # Limit text length for TTS
                cleaned_text = cleaned_text[:2000] + "..."
                logger.warning("Text truncated to 2000 characters for TTS")

            # Prepare request
            selected_model = model or self.default_model
            # NOTE(review): `format` is not forwarded to Deepgram, so the
            # audio is whatever the API returns by default; a non-default
            # `format` may therefore mislabel the payload - confirm against
            # the Deepgram encoding/container parameters.
            querystring = {"model": selected_model}
            payload = {"text": cleaned_text}
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "application/json"
            }

            logger.info(f"Converting text to speech: '{cleaned_text[:100]}...' (original: '{text[:50]}...')")

            # requests is synchronous; run it in a worker thread so the event
            # loop stays responsive for the duration of the HTTP round trip.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                functools.partial(
                    requests.post,
                    self.base_url,
                    json=payload,
                    headers=headers,
                    params=querystring,
                    timeout=30,
                ),
            )

            if response.status_code != 200:
                logger.error(f"Deepgram TTS API error: {response.status_code} - {response.text}")
                return None

            # Encode audio data as base64
            audio_data = response.content
            audio_base64 = base64.b64encode(audio_data).decode('utf-8')

            result = {
                "audio_data": audio_base64,
                "mime_type": self._MIME_TYPES.get(format, f"audio/{format}"),
                "format": format,
                "text": cleaned_text,
                "model": selected_model,
                "size_bytes": len(audio_data)
            }

            logger.info(f"TTS conversion successful: {len(audio_data)} bytes")
            return result

        except requests.exceptions.Timeout:
            logger.error("TTS request timed out")
            return None
        except requests.exceptions.RequestException as e:
            logger.error(f"TTS request failed: {str(e)}")
            return None
        except Exception as e:
            # Boundary handler: callers rely on None (not an exception) for
            # any failure, so log and swallow.
            logger.error(f"Unexpected error in TTS conversion: {str(e)}")
            return None

    def is_available(self) -> bool:
        """Check if TTS service is available (i.e. an API key is configured)"""
        return bool(self.api_key)

# Global TTS service instance.
# NOTE: this is constructed at import time. TTSService.__init__ raises
# ValueError when the YOUR_DEEPGRAM_API_KEY environment variable is unset or
# empty, so importing this module fails in that case.
tts_service = TTSService()