Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -8,27 +8,25 @@ import requests
from urllib.parse import urlparse
import warnings

# --- Suppress warnings ---
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# --- Set up the system path ---
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

# --- Import local modules ---
from indextts.infer_v2 import IndexTTS2
from tools.download_files import download_model_from_huggingface

def download_file(url, save_dir="temp_audio"):
    """
    Download a file from a URL, or return the path unchanged if it is already a local file.
    """
    os.makedirs(save_dir, exist_ok=True)

    try:
        result = urlparse(url)
        is_url = all([result.scheme, result.netloc])
@@ -36,138 +34,126 @@ def download_file(url, save_dir="temp_audio"):
        is_url = False

    if not is_url:
        if os.path.exists(url):
            print(f"Using local file: {url}")
            return url
        else:
            raise FileNotFoundError(f"Local file not found: {url}")

    filename = os.path.basename(result.path)
    if not filename:
        filename = f"audio_{int(time.time())}.wav"

    save_path = os.path.join(save_dir, filename)

    print(f"Downloading audio from {url} to {save_path}...")
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        print("Download complete.")
        return save_path
    except requests.exceptions.RequestException as e:
        print(f"Error while downloading the file: {e}")
        raise

def main():
    """
    Main entry point for the command-line text-to-speech application.
    """
    parser = argparse.ArgumentParser(
        description="IndexTTS: command-line text-to-speech application",
        formatter_class=argparse.RawTextHelpFormatter
    )

    parser.add_argument("--prompt", type=str, required=True, help="Text to synthesize.")
    parser.add_argument("--input_audio", type=str, required=True, help="URL or local path to the voice reference audio (.wav).")
    parser.add_argument("--setting", type=int, choices=[1, 2, 3, 4], required=True,
                        help="Emotion control method:\n"
                             "1: Same as the voice reference audio.\n"
                             "2: Use a separate emotion reference audio.\n"
                             "3: Use emotion vectors.\n"
                             "4: Use a text description to control the emotion.")

    parser.add_argument("--emo_audio", type=str, help="URL or local path to the emotion reference audio (required for setting 2).")
    parser.add_argument("--emo_weight", type=float, default=0.8, help="Emotion weight/strength for setting 2 (default: 0.8).")
    parser.add_argument("--emo_vectors", type=float, nargs=8,
                        metavar=('JOY', 'ANGER', 'SADNESS', 'FEAR', 'DISGUST', 'LOW', 'SURPRISE', 'CALM'),
                        help="Eight emotion vector values, separated by spaces (required for setting 3).")
    parser.add_argument("--emo_text", type=str, help="Emotion description text, e.g. 'happy' or 'sad' (required for setting 4).")

    parser.add_argument("--output_path", type=str, default=None, help="Path to save the output audio. If not provided, a default name is used.")
    parser.add_argument("--model_dir", type=str, default="checkpoints", help="Model checkpoint directory.")
    parser.add_argument("--is_fp16", action="store_true", default=False, help="Enable fp16 inference.")
    parser.add_argument("--verbose", action="store_true", default=False, help="Enable verbose logging.")

    args = parser.parse_args()

    print("Checking for model files...")
    download_model_from_huggingface(
        os.path.join(current_dir, "checkpoints"),
        os.path.join(current_dir, "checkpoints", "hf_cache")
    )

    print("Loading IndexTTS model...")
    tts = IndexTTS2(
        model_dir=args.model_dir,
        cfg_path=os.path.join(args.model_dir, "config.yaml"),
        is_fp16=args.is_fp16,
        use_cuda_kernel=False
    )
    print("Model loaded successfully.")

    os.makedirs("outputs", exist_ok=True)

    # --- Main change ---
    # If the user did not pass --output_path, default to 'outputs/output.wav'
    output_path = args.output_path or os.path.join("outputs", "output.wav")

    prompt_audio_path = download_file(args.input_audio)

    emo_audio_prompt = None
    emo_alpha = 1.0
    emo_vector = None
    use_emo_text = False
    emo_text_val = ""
    # The user provides 1-4, but the internal code uses 0-3
    emo_control_method = args.setting - 1

    if emo_control_method == 0:
        print("Using the emotion of the voice reference audio.")
        pass

    elif emo_control_method == 1:
        print("Using a separate emotion reference audio.")
        if not args.emo_audio:
            parser.error("--emo_audio is required for setting 2.")
        emo_audio_prompt = download_file(args.emo_audio)
        emo_alpha = args.emo_weight
        print(f"Emotion reference: {emo_audio_prompt}, weight: {emo_alpha}")

    elif emo_control_method == 2:
        print("Using emotion vectors for control.")
        if not args.emo_vectors:
            parser.error("--emo_vectors is required for setting 3.")
        vec_sum = sum(args.emo_vectors)
        if vec_sum > 1.5:
            raise ValueError(f"The emotion vectors must not sum to more than 1.5. Current sum: {vec_sum}")
        emo_vector = args.emo_vectors
        print(f"Emotion vectors: {emo_vector}")

    elif emo_control_method == 3:
        print("Using a text description to control the emotion.")
        if not args.emo_text:
            parser.error("--emo_text is required for setting 4.")
        use_emo_text = True
        emo_text_val = args.emo_text
        print(f"Emotion text: '{emo_text_val}'")

    print("\nStarting TTS inference...")
    tts.infer(
        spk_audio_prompt=prompt_audio_path,
        text=args.prompt,
@@ -178,12 +164,10 @@ def main():
        use_emo_text=use_emo_text,
        emo_text=emo_text_val,
        verbose=args.verbose,
    )

    print(f"\n✨ Inference complete! Audio saved to: {output_path}")

if __name__ == "__main__":
    main()
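For reference, a minimal invocation sketch based only on the flags defined above. The prompt text and audio paths are placeholders (not files in this repo), the script is assumed to run from the Space root with the checkpoints reachable, and omitting --output_path falls back to outputs/output.wav as introduced in this commit.

# Hypothetical example run; file names and text are illustrative only.
import subprocess

subprocess.run(
    [
        "python", "app.py",
        "--prompt", "Hello, this is an IndexTTS2 synthesis test.",
        "--input_audio", "examples/voice_ref.wav",    # placeholder voice reference
        "--setting", "2",
        "--emo_audio", "examples/emotion_ref.wav",    # placeholder emotion reference
        "--emo_weight", "0.6",
        "--verbose",
    ],
    check=True,
)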
|