dangthr committed on
Commit
37c0b74
·
verified ·
1 Parent(s): 2c31869

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -72
app.py CHANGED
@@ -8,27 +8,25 @@ import requests
8
  from urllib.parse import urlparse
9
  import warnings
10
 
11
- # --- Suppress Warnings ---
12
  warnings.filterwarnings("ignore", category=FutureWarning)
13
  warnings.filterwarnings("ignore", category=UserWarning)
14
 
15
- # --- System Path Setup ---
16
  current_dir = os.path.dirname(os.path.abspath(__file__))
17
  sys.path.append(current_dir)
18
  sys.path.append(os.path.join(current_dir, "indextts"))
19
 
20
- # --- Local Imports ---
21
  from indextts.infer_v2 import IndexTTS2
22
  from tools.download_files import download_model_from_huggingface
23
 
24
def download_file(url, save_dir="temp_audio"):
    """
    Download a file from a URL, or return the path unchanged if it is local.

    Args:
        url: An http(s) URL or a local filesystem path.
        save_dir: Directory where downloaded files are stored (created if missing).

    Returns:
        The local path of the downloaded (or pre-existing) file.

    Raises:
        FileNotFoundError: If `url` is a local path that does not exist.
        requests.exceptions.RequestException: If the HTTP download fails.
    """
    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Check if the input is a URL or a local path: both a scheme and a
    # network location must be present for it to count as a URL.
    try:
        result = urlparse(url)
        is_url = all([result.scheme, result.netloc])
    # NOTE(review): the except type was elided in the diff view; ValueError
    # is what urlparse raises on malformed input — confirm against the repo.
    except ValueError:
        is_url = False

    if not is_url:
        # It's a local path, check if it exists
        if os.path.exists(url):
            print(f"Using local file: {url}")
            return url
        else:
            raise FileNotFoundError(f"Local file not found: {url}")

    # It's a URL, proceed with download
    filename = os.path.basename(result.path)
    # Handle cases with no filename in URL
    if not filename:
        filename = f"audio_{int(time.time())}.wav"

    save_path = os.path.join(save_dir, filename)

    print(f"Downloading audio from {url} to {save_path}...")
    try:
        # Fix: with stream=True the connection stays open until the body is
        # consumed; use the response as a context manager so it is released
        # even if writing to disk fails mid-stream.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Raise for 4xx/5xx status codes

            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive empty chunks
                        f.write(chunk)

        print("Download complete.")
        return save_path
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        raise
68
 
69
  def main():
70
  """
71
- Main function to run the command-line TTS application.
72
  """
73
- # --- Command-Line Argument Parsing ---
74
  parser = argparse.ArgumentParser(
75
- description="IndexTTS: Command-Line Text-to-Speech Application",
76
  formatter_class=argparse.RawTextHelpFormatter
77
  )
78
 
79
- # Core arguments
80
- parser.add_argument("--prompt", type=str, required=True, help="Text to synthesize.")
81
- parser.add_argument("--input_audio", type=str, required=True, help="URL or local path to the voice reference audio (.wav).")
82
  parser.add_argument("--setting", type=int, choices=[1, 2, 3, 4], required=True,
83
- help="Emotion control method:\n"
84
- "1: Same as the voice reference audio.\n"
85
- "2: Use a separate emotion reference audio.\n"
86
- "3: Use an emotion vector.\n"
87
- "4: Use a text description for emotion.")
88
 
89
- # Emotion-specific arguments
90
- parser.add_argument("--emo_audio", type=str, help="URL or local path to the emotion reference audio (required for setting 2).")
91
- parser.add_argument("--emo_weight", type=float, default=0.8, help="Emotion weight/strength for setting 2 (default: 0.8).")
92
  parser.add_argument("--emo_vectors", type=float, nargs=8,
93
- metavar=('HAPPY', 'ANGRY', 'SAD', 'FEAR', 'DISGUST', 'DEPRESSED', 'SURPRISE', 'NEUTRAL'),
94
- help="Eight emotion vector values separated by spaces (required for setting 3).")
95
- parser.add_argument("--emo_text", type=str, help="Emotion description text, e.g., 'happy', 'sad' (required for setting 4).")
96
-
97
- # Configuration arguments
98
- parser.add_argument("--output_path", type=str, default=None, help="Path to save the output audio. If not provided, a default name will be generated in the 'outputs' directory.")
99
- parser.add_argument("--model_dir", type=str, default="checkpoints", help="Directory for model checkpoints.")
100
- parser.add_argument("--is_fp16", action="store_true", default=False, help="Enable fp16 inference.")
101
- parser.add_argument("--verbose", action="store_true", default=False, help="Enable verbose logging.")
102
 
103
  args = parser.parse_args()
104
 
105
- # --- Model and Asset Download ---
106
- print("Checking for model files...")
107
  download_model_from_huggingface(
108
  os.path.join(current_dir, "checkpoints"),
109
  os.path.join(current_dir, "checkpoints", "hf_cache")
110
  )
111
 
112
- # --- Model Loading ---
113
- print("Loading IndexTTS model...")
114
  tts = IndexTTS2(
115
  model_dir=args.model_dir,
116
  cfg_path=os.path.join(args.model_dir, "config.yaml"),
117
  is_fp16=args.is_fp16,
118
- use_cuda_kernel=False # Set to True if you have compatible environment
119
  )
120
- print("Model loaded successfully.")
121
 
122
- # --- Prepare Paths and Parameters ---
123
  os.makedirs("outputs", exist_ok=True)
124
- output_path = args.output_path or os.path.join("outputs", f"output_{int(time.time())}.wav")
 
 
 
125
 
126
  prompt_audio_path = download_file(args.input_audio)
127
 
128
- # Initialize inference parameters
129
  emo_audio_prompt = None
130
  emo_alpha = 1.0
131
  emo_vector = None
132
  use_emo_text = False
133
  emo_text_val = ""
134
-
135
- # The user provides 1-4, but the internal code uses 0-3
136
  emo_control_method = args.setting - 1
137
 
138
- # --- Configure Emotion Control Based on Setting ---
139
- if emo_control_method == 0: # Setting 1: Same as prompt
140
- print("Using emotion from the main voice reference audio.")
141
- pass # Defaults are correct
142
 
143
- elif emo_control_method == 1: # Setting 2: Use emotion audio
144
- print("Using emotion from a separate reference audio.")
145
  if not args.emo_audio:
146
- parser.error("--emo_audio is required for setting 2.")
147
  emo_audio_prompt = download_file(args.emo_audio)
148
  emo_alpha = args.emo_weight
149
- print(f"Emotion reference: {emo_audio_prompt}, Weight: {emo_alpha}")
150
 
151
- elif emo_control_method == 2: # Setting 3: Use emotion vector
152
- print("Using an emotion vector for control.")
153
  if not args.emo_vectors:
154
- parser.error("--emo_vectors are required for setting 3.")
155
  vec_sum = sum(args.emo_vectors)
156
  if vec_sum > 1.5:
157
- raise ValueError(f"The sum of emotion vectors cannot exceed 1.5. Current sum: {vec_sum}")
158
  emo_vector = args.emo_vectors
159
- print(f"Emotion vector: {emo_vector}")
160
 
161
- elif emo_control_method == 3: # Setting 4: Use emotion text
162
- print("Using a text description for emotion control.")
163
  if not args.emo_text:
164
- parser.error("--emo_text is required for setting 4.")
165
  use_emo_text = True
166
  emo_text_val = args.emo_text
167
- print(f"Emotion text: '{emo_text_val}'")
168
 
169
- # --- Run Inference ---
170
- print("\nStarting TTS inference...")
171
  tts.infer(
172
  spk_audio_prompt=prompt_audio_path,
173
  text=args.prompt,
@@ -178,12 +164,10 @@ def main():
178
  use_emo_text=use_emo_text,
179
  emo_text=emo_text_val,
180
  verbose=args.verbose,
181
- # You can add other advanced generation parameters here if needed
182
- # e.g., top_p=0.8, temperature=0.8, etc.
183
  )
184
 
185
- print(f"\n✨ Inference complete! Audio saved to: {output_path}")
186
-
187
 
188
  if __name__ == "__main__":
189
  main()
 
 
8
  from urllib.parse import urlparse
9
  import warnings
10
 
11
+ # --- 抑制警告信息 ---
12
  warnings.filterwarnings("ignore", category=FutureWarning)
13
  warnings.filterwarnings("ignore", category=UserWarning)
14
 
15
+ # --- 设置系统路径 ---
16
  current_dir = os.path.dirname(os.path.abspath(__file__))
17
  sys.path.append(current_dir)
18
  sys.path.append(os.path.join(current_dir, "indextts"))
19
 
20
+ # --- 导入本地模块 ---
21
  from indextts.infer_v2 import IndexTTS2
22
  from tools.download_files import download_model_from_huggingface
23
 
24
def download_file(url, save_dir="temp_audio"):
    """
    Fetch a file from a URL into `save_dir`, or pass a local path straight through.

    Returns the local filesystem path of the audio file. Raises
    FileNotFoundError for a missing local path, and re-raises any
    requests error if the HTTP download fails.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Classify the input: it is treated as a URL only when both a scheme
    # and a network location are present.
    try:
        parsed = urlparse(url)
        looks_like_url = bool(parsed.scheme) and bool(parsed.netloc)
    # NOTE(review): except type elided in the diff view; ValueError is the
    # standard urlparse failure — confirm against the repo.
    except ValueError:
        looks_like_url = False

    # Local-path branch: return it as-is if it exists.
    if not looks_like_url:
        if not os.path.exists(url):
            raise FileNotFoundError(f"本地文件未找到: {url}")
        print(f"使用本地文件: {url}")
        return url

    # Derive a target filename; fall back to a timestamped name when the
    # URL path carries none.
    target_name = os.path.basename(parsed.path) or f"audio_{int(time.time())}.wav"
    target_path = os.path.join(save_dir, target_name)

    print(f"正在从 {url} 下载音频到 {target_path}...")
    try:
        resp = requests.get(url, stream=True, timeout=30)
        resp.raise_for_status()

        with open(target_path, "wb") as out:
            for piece in resp.iter_content(chunk_size=8192):
                out.write(piece)

        print("下载完成。")
        return target_path
    except requests.exceptions.RequestException as err:
        print(f"下载文件时出错: {err}")
        raise
63
 
64
  def main():
65
  """
66
+ 运行命令行文本转语音应用的主函数。
67
  """
 
68
  parser = argparse.ArgumentParser(
69
+ description="IndexTTS: 命令行文本转语音应用",
70
  formatter_class=argparse.RawTextHelpFormatter
71
  )
72
 
73
+ parser.add_argument("--prompt", type=str, required=True, help="需要合成的文本。")
74
+ parser.add_argument("--input_audio", type=str, required=True, help="音色参考音频的 URL 或本地路径 (.wav)。")
 
75
  parser.add_argument("--setting", type=int, choices=[1, 2, 3, 4], required=True,
76
+ help="情感控制方法:\n"
77
+ "1: 与音色参考音频相同。\n"
78
+ "2: 使用单独的情感参考音频。\n"
79
+ "3: 使用情感向量。\n"
80
+ "4: 使用文本描述来控制情感。")
81
 
82
+ parser.add_argument("--emo_audio", type=str, help="情感参考音频的 URL 或本地路径 (setting 2 必需)。")
83
+ parser.add_argument("--emo_weight", type=float, default=0.8, help="情感权重 (setting 2, 默认: 0.8)")
 
84
  parser.add_argument("--emo_vectors", type=float, nargs=8,
85
+ metavar=('', '', '', '', '厌恶', '低落', '惊喜', '平静'),
86
+ help="八个情感向量值,用空格分隔 (setting 3 必需)")
87
+ parser.add_argument("--emo_text", type=str, help="情感描述文本,例如 '高兴', '伤心' (setting 4 必需)")
88
+
89
+ parser.add_argument("--output_path", type=str, default=None, help="保存输出音频的路径。如果未提供,将使用默认名称。")
90
+ parser.add_argument("--model_dir", type=str, default="checkpoints", help="模型检查点目录。")
91
+ parser.add_argument("--is_fp16", action="store_true", default=False, help="启用 fp16 推理。")
92
+ parser.add_argument("--verbose", action="store_true", default=False, help="启用详细日志记录。")
 
93
 
94
  args = parser.parse_args()
95
 
96
+ print("正在检查模型文件...")
 
97
  download_model_from_huggingface(
98
  os.path.join(current_dir, "checkpoints"),
99
  os.path.join(current_dir, "checkpoints", "hf_cache")
100
  )
101
 
102
+ print("正在加载 IndexTTS 模型...")
 
103
  tts = IndexTTS2(
104
  model_dir=args.model_dir,
105
  cfg_path=os.path.join(args.model_dir, "config.yaml"),
106
  is_fp16=args.is_fp16,
107
+ use_cuda_kernel=False
108
  )
109
+ print("模型加载成功。")
110
 
 
111
  os.makedirs("outputs", exist_ok=True)
112
+
113
+ # --- 主要修改 ---
114
+ # 如果用户没有通过 --output_path 指定路径,则默认使用 'outputs/output.wav'
115
+ output_path = args.output_path or os.path.join("outputs", "output.wav")
116
 
117
  prompt_audio_path = download_file(args.input_audio)
118
 
 
119
  emo_audio_prompt = None
120
  emo_alpha = 1.0
121
  emo_vector = None
122
  use_emo_text = False
123
  emo_text_val = ""
 
 
124
  emo_control_method = args.setting - 1
125
 
126
+ if emo_control_method == 0:
127
+ print("使用音色参考音频中的情感。")
128
+ pass
 
129
 
130
+ elif emo_control_method == 1:
131
+ print("使用独立的情感参考音频。")
132
  if not args.emo_audio:
133
+ parser.error("--emo_audio 参数在 setting 2 中是必需的。")
134
  emo_audio_prompt = download_file(args.emo_audio)
135
  emo_alpha = args.emo_weight
136
+ print(f"情感参考: {emo_audio_prompt}, 权重: {emo_alpha}")
137
 
138
+ elif emo_control_method == 2:
139
+ print("使用情感向量进行控制。")
140
  if not args.emo_vectors:
141
+ parser.error("--emo_vectors 参数在 setting 3 中是必需的。")
142
  vec_sum = sum(args.emo_vectors)
143
  if vec_sum > 1.5:
144
+ raise ValueError(f"情感向量的总和不能超过1.5。当前总和: {vec_sum}")
145
  emo_vector = args.emo_vectors
146
+ print(f"情感向量: {emo_vector}")
147
 
148
+ elif emo_control_method == 3:
149
+ print("使用文本描述来控制情感。")
150
  if not args.emo_text:
151
+ parser.error("--emo_text 参数在 setting 4 中是必需的。")
152
  use_emo_text = True
153
  emo_text_val = args.emo_text
154
+ print(f"情感文本: '{emo_text_val}'")
155
 
156
+ print("\n开始 TTS 推理...")
 
157
  tts.infer(
158
  spk_audio_prompt=prompt_audio_path,
159
  text=args.prompt,
 
164
  use_emo_text=use_emo_text,
165
  emo_text=emo_text_val,
166
  verbose=args.verbose,
 
 
167
  )
168
 
169
+ print(f"\n✨ 推理完成!音频已保存至: {output_path}")
 
170
 
171
  if __name__ == "__main__":
172
  main()
173
+