""" Voice Cloner Module - Extracted TTS functionality Handles text-to-speech with voice cloning capabilities """ import os import tempfile import hashlib import threading import logging from typing import Optional, Dict logger = logging.getLogger("voicecloner") try: from TTS.api import TTS import torch TTS_AVAILABLE = True except Exception: TTS_AVAILABLE = False torch = None # TTS Configuration TTS_MODEL_NAME = os.environ.get("TTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2") TTS_DEVICE = os.environ.get("TTS_DEVICE", "cuda" if (torch and torch.cuda.is_available()) else "cpu") TTS_USE_HALF = os.environ.get("TTS_USE_HALF", "1") in ("1", "true", "yes") # Global state _tts_model = None _tts_lock = threading.Lock() _speaker_hash_cache: Dict[str, str] = {} _tts_loaded_event = threading.Event() def compute_file_sha256(path: str) -> str: """Compute SHA256 hash of a file""" h = hashlib.sha256() with open(path, "rb") as f: while True: b = f.read(8192) if not b: break h.update(b) return h.hexdigest() def get_tts_model(): """Get or load TTS model (thread-safe) with better error handling""" global _tts_model if not TTS_AVAILABLE: logger.error("[TTS] TTS library not available") raise RuntimeError("TTS.api not available. Please install: pip install TTS") with _tts_lock: if _tts_model is None: try: logger.info(f"[TTS] Loading model {TTS_MODEL_NAME} on device {TTS_DEVICE}") _tts_model = TTS(TTS_MODEL_NAME) if TTS_DEVICE and torch: if TTS_DEVICE.startswith("cuda") and torch.cuda.is_available(): try: _tts_model.to(TTS_DEVICE) torch.backends.cudnn.benchmark = True if TTS_USE_HALF and hasattr(_tts_model, "model"): _tts_model.model.half() logger.info("[TTS] GPU optimization enabled") except Exception as e: logger.warning(f"[TTS] GPU optimization failed, using CPU: {e}") try: _tts_model.to("cpu") except Exception: pass logger.info("[TTS] Model loaded successfully") _tts_loaded_event.set() except Exception as e: logger.error(f"[TTS] Failed to load model: {e}") _tts_model = None raise RuntimeError(f"Failed to load TTS model: {str(e)}") if _tts_model is None: raise RuntimeError("TTS model failed to initialize") return _tts_model def synthesize_speech(text: str, speaker_wav: Optional[str] = None, language: Optional[str] = None, output_path: Optional[str] = None) -> str: """ Synthesize speech from text with robust error handling Args: text: Text to synthesize speaker_wav: Path to speaker sample WAV file (optional) language: Target language code (optional) output_path: Output file path (optional, creates temp file if None) Returns: Path to generated audio file """ if not text or not text.strip(): raise ValueError("Text is required and cannot be empty") try: tts = get_tts_model() except Exception as e: logger.error(f"Failed to get TTS model: {e}") raise RuntimeError(f"TTS model unavailable: {str(e)}") if output_path is None: fd, output_path = tempfile.mkstemp(suffix=".wav", prefix="tts_") os.close(fd) kwargs = {} if speaker_wav and os.path.exists(speaker_wav): kwargs["speaker_wav"] = speaker_wav logger.info(f"Using speaker sample: {speaker_wav}") if language: kwargs["language"] = language logger.info(f"Using language: {language}") try: logger.info(f"Synthesizing speech: '{text[:50]}...'") if torch and torch.cuda.is_available() and TTS_USE_HALF: try: with torch.inference_mode(): with torch.cuda.amp.autocast(): tts.tts_to_file(text=text, file_path=output_path, **kwargs) except Exception as e: logger.warning(f"GPU synthesis failed, trying CPU: {e}") with torch.inference_mode(): tts.tts_to_file(text=text, file_path=output_path, **kwargs) else: if torch: with torch.inference_mode(): tts.tts_to_file(text=text, file_path=output_path, **kwargs) else: tts.tts_to_file(text=text, file_path=output_path, **kwargs) logger.info(f"Speech synthesis successful: {output_path}") except Exception as e: logger.error(f"TTS synthesis failed: {e}") if os.path.exists(output_path): try: os.remove(output_path) except: pass raise RuntimeError(f"TTS synthesis failed: {str(e)}") return output_path def cache_speaker_sample(speaker_path: str) -> str: """Cache speaker sample to avoid reprocessing""" speaker_hash = compute_file_sha256(speaker_path) cached = _speaker_hash_cache.get(speaker_hash) if cached and os.path.exists(cached): return cached _speaker_hash_cache[speaker_hash] = speaker_path return speaker_path def is_available() -> bool: """Check if TTS is available""" return TTS_AVAILABLE def preload_model(): """Preload TTS model in background""" if TTS_AVAILABLE: def safe_preload(): try: get_tts_model() except Exception as e: logger.warning(f"[TTS] Background preload failed: {e}") threading.Thread(target=safe_preload, daemon=True).start() # Auto-preload on import try: preload_model() except Exception as e: logger.warning(f"[TTS] Failed to start preload: {e}")