Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -8,27 +8,25 @@ import requests
from urllib.parse import urlparse
import warnings

# --- Suppress warnings ---
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# --- Set up the system path ---
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

# --- Import local modules ---
from indextts.infer_v2 import IndexTTS2
from tools.download_files import download_model_from_huggingface

def download_file(url, save_dir="temp_audio"):
    """
    Download a file from a URL, or return the path unchanged if it is already a local file.
    """
    os.makedirs(save_dir, exist_ok=True)

    try:
        result = urlparse(url)
        is_url = all([result.scheme, result.netloc])
@@ -36,138 +34,126 @@ def download_file(url, save_dir="temp_audio"):
        is_url = False

    if not is_url:
        if os.path.exists(url):
            print(f"Using local file: {url}")
            return url
        else:
            raise FileNotFoundError(f"Local file not found: {url}")

    filename = os.path.basename(result.path)
    if not filename:
        filename = f"audio_{int(time.time())}.wav"

    save_path = os.path.join(save_dir, filename)

    print(f"Downloading audio from {url} to {save_path}...")
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        print("Download complete.")
        return save_path
    except requests.exceptions.RequestException as e:
        print(f"Error while downloading the file: {e}")
        raise

def main():
    """
    Main entry point for the command-line text-to-speech application.
    """
    parser = argparse.ArgumentParser(
        description="IndexTTS: command-line text-to-speech application",
        formatter_class=argparse.RawTextHelpFormatter
    )

    parser.add_argument("--prompt", type=str, required=True, help="Text to synthesize.")
    parser.add_argument("--input_audio", type=str, required=True, help="URL or local path to the voice reference audio (.wav).")
    parser.add_argument("--setting", type=int, choices=[1, 2, 3, 4], required=True,
                        help="Emotion control method:\n"
                             "1: Same as the voice reference audio.\n"
                             "2: Use a separate emotion reference audio.\n"
                             "3: Use emotion vectors.\n"
                             "4: Use a text description to control the emotion.")

    parser.add_argument("--emo_audio", type=str, help="URL or local path to the emotion reference audio (required for setting 2).")
    parser.add_argument("--emo_weight", type=float, default=0.8, help="Emotion weight/strength for setting 2 (default: 0.8).")
    parser.add_argument("--emo_vectors", type=float, nargs=8,
                        metavar=('JOY', 'ANGER', 'SADNESS', 'FEAR', 'DISGUST', 'LOW', 'SURPRISE', 'CALM'),
                        help="Eight emotion vector values, separated by spaces (required for setting 3).")
    parser.add_argument("--emo_text", type=str, help="Emotion description text, e.g. 'happy' or 'sad' (required for setting 4).")

    parser.add_argument("--output_path", type=str, default=None, help="Path to save the output audio. If not provided, a default name is used.")
    parser.add_argument("--model_dir", type=str, default="checkpoints", help="Model checkpoint directory.")
    parser.add_argument("--is_fp16", action="store_true", default=False, help="Enable fp16 inference.")
    parser.add_argument("--verbose", action="store_true", default=False, help="Enable verbose logging.")

    args = parser.parse_args()

    print("Checking for model files...")
    download_model_from_huggingface(
        os.path.join(current_dir, "checkpoints"),
        os.path.join(current_dir, "checkpoints", "hf_cache")
    )

    print("Loading IndexTTS model...")
    tts = IndexTTS2(
        model_dir=args.model_dir,
        cfg_path=os.path.join(args.model_dir, "config.yaml"),
        is_fp16=args.is_fp16,
        use_cuda_kernel=False
    )
    print("Model loaded successfully.")

    os.makedirs("outputs", exist_ok=True)

    # --- Main change ---
    # If the user did not pass --output_path, default to 'outputs/output.wav'
    output_path = args.output_path or os.path.join("outputs", "output.wav")

    prompt_audio_path = download_file(args.input_audio)

    emo_audio_prompt = None
    emo_alpha = 1.0
    emo_vector = None
    use_emo_text = False
    emo_text_val = ""
    # The user provides 1-4, but the internal code uses 0-3
    emo_control_method = args.setting - 1

    if emo_control_method == 0:
        print("Using the emotion of the voice reference audio.")
        pass

    elif emo_control_method == 1:
        print("Using a separate emotion reference audio.")
        if not args.emo_audio:
            parser.error("--emo_audio is required for setting 2.")
        emo_audio_prompt = download_file(args.emo_audio)
        emo_alpha = args.emo_weight
        print(f"Emotion reference: {emo_audio_prompt}, weight: {emo_alpha}")

    elif emo_control_method == 2:
        print("Using emotion vectors for control.")
        if not args.emo_vectors:
            parser.error("--emo_vectors is required for setting 3.")
        vec_sum = sum(args.emo_vectors)
        if vec_sum > 1.5:
            raise ValueError(f"The emotion vectors must not sum to more than 1.5. Current sum: {vec_sum}")
        emo_vector = args.emo_vectors
        print(f"Emotion vectors: {emo_vector}")

    elif emo_control_method == 3:
        print("Using a text description to control the emotion.")
        if not args.emo_text:
            parser.error("--emo_text is required for setting 4.")
        use_emo_text = True
        emo_text_val = args.emo_text
        print(f"Emotion text: '{emo_text_val}'")

    print("\nStarting TTS inference...")
    tts.infer(
        spk_audio_prompt=prompt_audio_path,
        text=args.prompt,
@@ -178,12 +164,10 @@ def main():
        use_emo_text=use_emo_text,
        emo_text=emo_text_val,
        verbose=args.verbose,
    )

    print(f"\n✨ Inference complete! Audio saved to: {output_path}")

if __name__ == "__main__":
    main()
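For reference, a minimal invocation sketch based only on the flags defined above. The prompt text and audio paths are placeholders (not files in this repo), the script is assumed to run from the Space root with the checkpoints reachable, and omitting --output_path falls back to outputs/output.wav as introduced in this commit.

# Hypothetical example run; file names and text are illustrative only.
import subprocess

subprocess.run(
    [
        "python", "app.py",
        "--prompt", "Hello, this is an IndexTTS2 synthesis test.",
        "--input_audio", "examples/voice_ref.wav",    # placeholder voice reference
        "--setting", "2",
        "--emo_audio", "examples/emotion_ref.wav",    # placeholder emotion reference
        "--emo_weight", "0.6",
        "--verbose",
    ],
    check=True,
)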
|