Princeaka committed on
Commit
4305795
·
verified ·
1 Parent(s): 9cd3579

Update voicecloner.py

Browse files
Files changed (1) hide show
  1. voicecloner.py +50 -25
voicecloner.py CHANGED
@@ -42,34 +42,41 @@ def compute_file_sha256(path: str) -> str:
42
  return h.hexdigest()
43
 
44
  def get_tts_model():
45
- """Get or load TTS model (thread-safe)"""
46
  global _tts_model
47
  if not TTS_AVAILABLE:
48
- raise RuntimeError("TTS.api not available")
49
 
50
  with _tts_lock:
51
  if _tts_model is None:
52
- logger.info(f"[TTS] Loading model {TTS_MODEL_NAME} on device {TTS_DEVICE}")
53
- _tts_model = TTS(TTS_MODEL_NAME)
54
-
55
- if TTS_DEVICE and torch:
56
- if TTS_DEVICE.startswith("cuda") and torch.cuda.is_available():
57
- try:
58
- _tts_model.to(TTS_DEVICE)
59
- torch.backends.cudnn.benchmark = True
60
- if TTS_USE_HALF and hasattr(_tts_model, "model"):
61
- _tts_model.model.half()
62
- except Exception as e:
63
- logger.warning(f"[TTS] GPU optimization failed: {e}")
64
-
65
- logger.info("[TTS] Model loaded successfully")
66
- _tts_loaded_event.set()
 
 
 
 
 
 
 
67
 
68
  return _tts_model
69
 
70
  def synthesize_speech(text: str, speaker_wav: Optional[str] = None, language: Optional[str] = None, output_path: Optional[str] = None) -> str:
71
  """
72
- Synthesize speech from text
73
 
74
  Args:
75
  text: Text to synthesize
@@ -81,24 +88,36 @@ def synthesize_speech(text: str, speaker_wav: Optional[str] = None, language: Op
81
  Path to generated audio file
82
  """
83
  if not text or not text.strip():
84
- raise ValueError("Text is required")
85
 
86
- tts = get_tts_model()
 
 
 
 
87
 
88
  if output_path is None:
89
  fd, output_path = tempfile.mkstemp(suffix=".wav", prefix="tts_")
90
  os.close(fd)
91
 
92
  kwargs = {}
93
- if speaker_wav:
94
  kwargs["speaker_wav"] = speaker_wav
 
95
  if language:
96
  kwargs["language"] = language
 
97
 
98
  try:
 
99
  if torch and torch.cuda.is_available() and TTS_USE_HALF:
100
- with torch.inference_mode():
101
- with torch.cuda.amp.autocast():
 
 
 
 
 
102
  tts.tts_to_file(text=text, file_path=output_path, **kwargs)
103
  else:
104
  if torch:
@@ -106,10 +125,16 @@ def synthesize_speech(text: str, speaker_wav: Optional[str] = None, language: Op
106
  tts.tts_to_file(text=text, file_path=output_path, **kwargs)
107
  else:
108
  tts.tts_to_file(text=text, file_path=output_path, **kwargs)
 
 
109
  except Exception as e:
 
110
  if os.path.exists(output_path):
111
- os.remove(output_path)
112
- raise RuntimeError(f"TTS synthesis failed: {e}")
 
 
 
113
 
114
  return output_path
115
 
 
42
  return h.hexdigest()
43
 
44
  def get_tts_model():
45
+ """Get or load TTS model (thread-safe) with better error handling"""
46
  global _tts_model
47
  if not TTS_AVAILABLE:
48
+ raise RuntimeError("TTS.api not available. Please install: pip install TTS")
49
 
50
  with _tts_lock:
51
  if _tts_model is None:
52
+ try:
53
+ logger.info(f"[TTS] Loading model {TTS_MODEL_NAME} on device {TTS_DEVICE}")
54
+ _tts_model = TTS(TTS_MODEL_NAME)
55
+
56
+ if TTS_DEVICE and torch:
57
+ if TTS_DEVICE.startswith("cuda") and torch.cuda.is_available():
58
+ try:
59
+ _tts_model.to(TTS_DEVICE)
60
+ torch.backends.cudnn.benchmark = True
61
+ if TTS_USE_HALF and hasattr(_tts_model, "model"):
62
+ _tts_model.model.half()
63
+ logger.info("[TTS] GPU optimization enabled")
64
+ except Exception as e:
65
+ logger.warning(f"[TTS] GPU optimization failed, using CPU: {e}")
66
+ _tts_model.to("cpu")
67
+
68
+ logger.info("[TTS] Model loaded successfully")
69
+ _tts_loaded_event.set()
70
+ except Exception as e:
71
+ logger.error(f"[TTS] Failed to load model: {e}")
72
+ _tts_model = None
73
+ raise RuntimeError(f"Failed to load TTS model: {str(e)}")
74
 
75
  return _tts_model
76
 
77
  def synthesize_speech(text: str, speaker_wav: Optional[str] = None, language: Optional[str] = None, output_path: Optional[str] = None) -> str:
78
  """
79
+ Synthesize speech from text with robust error handling
80
 
81
  Args:
82
  text: Text to synthesize
 
88
  Path to generated audio file
89
  """
90
  if not text or not text.strip():
91
+ raise ValueError("Text is required and cannot be empty")
92
 
93
+ try:
94
+ tts = get_tts_model()
95
+ except Exception as e:
96
+ logger.error(f"Failed to get TTS model: {e}")
97
+ raise RuntimeError(f"TTS model unavailable: {str(e)}")
98
 
99
  if output_path is None:
100
  fd, output_path = tempfile.mkstemp(suffix=".wav", prefix="tts_")
101
  os.close(fd)
102
 
103
  kwargs = {}
104
+ if speaker_wav and os.path.exists(speaker_wav):
105
  kwargs["speaker_wav"] = speaker_wav
106
+ logger.info(f"Using speaker sample: {speaker_wav}")
107
  if language:
108
  kwargs["language"] = language
109
+ logger.info(f"Using language: {language}")
110
 
111
  try:
112
+ logger.info(f"Synthesizing speech: '{text[:50]}...'")
113
  if torch and torch.cuda.is_available() and TTS_USE_HALF:
114
+ try:
115
+ with torch.inference_mode():
116
+ with torch.cuda.amp.autocast():
117
+ tts.tts_to_file(text=text, file_path=output_path, **kwargs)
118
+ except Exception as e:
119
+ logger.warning(f"GPU synthesis failed, trying CPU: {e}")
120
+ with torch.inference_mode():
121
  tts.tts_to_file(text=text, file_path=output_path, **kwargs)
122
  else:
123
  if torch:
 
125
  tts.tts_to_file(text=text, file_path=output_path, **kwargs)
126
  else:
127
  tts.tts_to_file(text=text, file_path=output_path, **kwargs)
128
+
129
+ logger.info(f"Speech synthesis successful: {output_path}")
130
  except Exception as e:
131
+ logger.error(f"TTS synthesis failed: {e}")
132
  if os.path.exists(output_path):
133
+ try:
134
+ os.remove(output_path)
135
+ except:
136
+ pass
137
+ raise RuntimeError(f"TTS synthesis failed: {str(e)}")
138
 
139
  return output_path
140