Spaces:

johnwang2026
/

voice

Sleeping

App Files Files Community

johnwang2026 committed on Oct 29

Commit

f9f1879

verified ·

1 Parent(s): 6dbf71c

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -22

app.py CHANGED Viewed

@@ -1,47 +1,57 @@
 import gradio as gr
-from transformers import AutoModel, AutoTokenizer
 import soundfile as sf
 import torch
 import os
-# Load the model and tokenizer (argument fixed, device_map removed)
-model_name = "Soul-AILab/SoulX-Podcast-1.7B"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(
-    model_name,
-    dtype=torch.float16,  # replaces the deprecated torch_dtype argument
-    # device_map="auto" removed; the device is assigned manually instead (works without accelerate)
-)
-# Manually move the model to the GPU, or to the CPU when no GPU is available. NOTE(review): float16 is kept on the CPU path too — confirm that is intended
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = model.to(device)
-# Speech generation function (device handling added): returns (wav path or None, status message)
 def generate_speech(text):
     if not text.strip():
         return None, "错误：请输入有效文本！"  # empty or whitespace-only input: no audio, error message only
-    # Tokenize the text and move the tensors to the selected device
-    inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
-        audio_output = model.generate(**inputs)
     # Save the audio
     output_path = "output.wav"  # fixed filename, rewritten on every call
-    sf.write(output_path, audio_output[0].cpu().numpy(), samplerate=24000)
-    return output_path, "语音生成成功！"
-# 界面部分不变
-with gr.Blocks(title="SoulX-Podcast-1.7B 中英双语TTS") as demo:
-    gr.Markdown("# 🎤 SoulX-Podcast-1.7B 文本转语音")
-    gr.Markdown("支持中英双语输入，生成自然流畅的语音（采样率24000Hz）")
     with gr.Row():
         text_input = gr.Textbox(
             label="输入文本",
-            placeholder="请输入要转换的文本（建议≤200字），支持中英双语...",
             lines=5
         )
         audio_output = gr.Audio(label="生成的语音", type="filepath")

 import gradio as gr
+from transformers import AutoModelForTextToSpeech, AutoTokenizer, pipeline
 import soundfile as sf
 import torch
 import os
+# Initialise the Chinese/English TTS models (lightweight; total size < 5 GB per the original author)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# English TTS (fastspeech2, ~2 GB). NOTE(review): verify AutoModelForTextToSpeech exists in the installed transformers and that this checkpoint loads through it — TODO confirm
+en_tokenizer = AutoTokenizer.from_pretrained("facebook/fastspeech2-en-ljspeech")
+en_model = AutoModelForTextToSpeech.from_pretrained("facebook/fastspeech2-en-ljspeech").to(device)
+# Chinese TTS (Chinese-FastSpeech2, ~3 GB). NOTE(review): same loadability check applies to this repo id — TODO confirm
+zh_tokenizer = AutoTokenizer.from_pretrained("bakerk1234/Chinese-FastSpeech2")
+zh_model = AutoModelForTextToSpeech.from_pretrained("bakerk1234/Chinese-FastSpeech2").to(device)
+# Speech generation function (detects the language and switches model): returns (wav path or None, status message)
 def generate_speech(text):
     if not text.strip():
         return None, "错误：请输入有效文本！"  # empty or whitespace-only input: no audio, error message only
+    # Simple language detection: any character in U+4E00..U+9FFF selects the Chinese model, so mixed text is treated as Chinese
+    is_chinese = any('\u4e00' <= char <= '\u9fff' for char in text)
+    if is_chinese:
+        tokenizer = zh_tokenizer
+        model = zh_model
+        samplerate = 22050  # Chinese model sample rate
+    else:
+        tokenizer = en_tokenizer
+        model = en_model
+        samplerate = 22050  # English model sample rate; NOTE(review): same value as the Chinese branch — confirm each checkpoint's real output rate
+    # Tokenize the text and generate speech
+    inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
+        audio_output = model.generate(**inputs).cpu().numpy()
     # Save the audio
     output_path = "output.wav"  # fixed filename, rewritten on every call
+    sf.write(output_path, audio_output[0].T, samplerate=samplerate)  # transpose the first output before writing; assumes a layout sf.write accepts — TODO confirm shape
+    return output_path, f"语音生成成功！（使用{'中文' if is_chinese else '英文'}轻量模型）"
+# 界面保持不变
+with gr.Blocks(title="中英双语TTS（轻量版）") as demo:
+    gr.Markdown("# 🎤 轻量中英双语文本转语音")
+    gr.Markdown("基于FastSpeech2模型，体积小（<5GB），适配免费Space，支持中英双语输入")
     with gr.Row():
         text_input = gr.Textbox(
             label="输入文本",
+            placeholder="请输入中文或英文文本（建议≤300字）...",
             lines=5
         )
         audio_output = gr.Audio(label="生成的语音", type="filepath")