积极的屁孩
committed on
Commit
·
a8377f8
1
Parent(s):
980462e
add space
Browse files
app.py
CHANGED
|
@@ -5,12 +5,12 @@ import site
|
|
| 5 |
import json
|
| 6 |
import torch
|
| 7 |
import gradio as gr
|
| 8 |
-
import gradio.spaces as spaces
|
| 9 |
import torchaudio
|
| 10 |
import numpy as np
|
| 11 |
from huggingface_hub import snapshot_download, hf_hub_download
|
| 12 |
import subprocess
|
| 13 |
import re
|
|
|
|
| 14 |
|
| 15 |
def install_espeak():
|
| 16 |
"""检测并安装espeak-ng依赖"""
|
|
@@ -351,6 +351,7 @@ def get_pipeline(pipeline_type):
|
|
| 351 |
return inference_pipeline
|
| 352 |
|
| 353 |
# 实现VEVO功能函数
|
|
|
|
| 354 |
def vevo_style(content_wav, style_wav):
|
| 355 |
temp_content_path = "wav/temp_content.wav"
|
| 356 |
temp_style_path = "wav/temp_style.wav"
|
|
@@ -433,6 +434,7 @@ def vevo_style(content_wav, style_wav):
|
|
| 433 |
traceback.print_exc()
|
| 434 |
raise e
|
| 435 |
|
|
|
|
| 436 |
def vevo_timbre(content_wav, reference_wav):
|
| 437 |
temp_content_path = "wav/temp_content.wav"
|
| 438 |
temp_reference_path = "wav/temp_reference.wav"
|
|
@@ -526,6 +528,7 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 526 |
traceback.print_exc()
|
| 527 |
raise e
|
| 528 |
|
|
|
|
| 529 |
def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
| 530 |
temp_content_path = "wav/temp_content.wav"
|
| 531 |
temp_style_path = "wav/temp_style.wav"
|
|
@@ -647,6 +650,7 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
|
| 647 |
traceback.print_exc()
|
| 648 |
raise e
|
| 649 |
|
|
|
|
| 650 |
def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
|
| 651 |
temp_ref_path = "wav/temp_ref.wav"
|
| 652 |
temp_timbre_path = "wav/temp_timbre.wav"
|
|
@@ -750,98 +754,93 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
|
|
| 750 |
raise e
|
| 751 |
|
| 752 |
# 创建Gradio界面
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
</a>
|
| 773 |
-
</div>
|
| 774 |
-
""")
|
| 775 |
-
|
| 776 |
-
with gr.Tab("Vevo-Timbre"):
|
| 777 |
-
gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
|
| 778 |
-
with gr.Row():
|
| 779 |
-
with gr.Column():
|
| 780 |
-
timbre_content = gr.Audio(label="Source Audio", type="numpy")
|
| 781 |
-
timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
| 782 |
-
timbre_button = gr.Button("Generate")
|
| 783 |
-
with gr.Column():
|
| 784 |
-
timbre_output = gr.Audio(label="Result")
|
| 785 |
-
timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
|
| 786 |
-
|
| 787 |
-
with gr.Tab("Vevo-Style"):
|
| 788 |
-
gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
|
| 789 |
-
with gr.Row():
|
| 790 |
-
with gr.Column():
|
| 791 |
-
style_content = gr.Audio(label="Source Audio", type="numpy")
|
| 792 |
-
style_reference = gr.Audio(label="Style Reference", type="numpy")
|
| 793 |
-
style_button = gr.Button("Generate")
|
| 794 |
-
with gr.Column():
|
| 795 |
-
style_output = gr.Audio(label="Result")
|
| 796 |
-
style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
|
| 797 |
-
|
| 798 |
-
with gr.Tab("Vevo-Voice"):
|
| 799 |
-
gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
|
| 800 |
-
with gr.Row():
|
| 801 |
-
with gr.Column():
|
| 802 |
-
voice_content = gr.Audio(label="Source Audio", type="numpy")
|
| 803 |
-
voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
|
| 804 |
-
voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
| 805 |
-
voice_button = gr.Button("Generate")
|
| 806 |
-
with gr.Column():
|
| 807 |
-
voice_output = gr.Audio(label="Result")
|
| 808 |
-
voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
with gr.Tab("Vevo-TTS"):
|
| 813 |
-
gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
|
| 814 |
-
with gr.Row():
|
| 815 |
-
with gr.Column():
|
| 816 |
-
tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
|
| 817 |
-
tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
|
| 818 |
-
tts_reference = gr.Audio(label="Style Reference", type="numpy")
|
| 819 |
-
tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
|
| 820 |
-
tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
|
| 821 |
-
tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
| 822 |
-
tts_button = gr.Button("Generate")
|
| 823 |
-
with gr.Column():
|
| 824 |
-
tts_output = gr.Audio(label="Result")
|
| 825 |
-
|
| 826 |
-
tts_button.click(
|
| 827 |
-
vevo_tts,
|
| 828 |
-
inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
|
| 829 |
-
outputs=tts_output
|
| 830 |
-
)
|
| 831 |
-
|
| 832 |
-
gr.Markdown("""
|
| 833 |
-
## About VEVO
|
| 834 |
-
VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
|
| 835 |
-
1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
|
| 836 |
-
2. **Vevo-Timbre**: Maintains style but transfers timbre
|
| 837 |
-
3. **Vevo-Voice**: Transfers both style and timbre with separate references
|
| 838 |
-
4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
|
| 839 |
-
|
| 840 |
-
For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
|
| 841 |
""")
|
| 842 |
|
| 843 |
-
|
| 844 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 845 |
|
| 846 |
-
#
|
| 847 |
-
|
|
|
|
| 5 |
import json
|
| 6 |
import torch
|
| 7 |
import gradio as gr
|
|
|
|
| 8 |
import torchaudio
|
| 9 |
import numpy as np
|
| 10 |
from huggingface_hub import snapshot_download, hf_hub_download
|
| 11 |
import subprocess
|
| 12 |
import re
|
| 13 |
+
import spaces
|
| 14 |
|
| 15 |
def install_espeak():
|
| 16 |
"""检测并安装espeak-ng依赖"""
|
|
|
|
| 351 |
return inference_pipeline
|
| 352 |
|
| 353 |
# 实现VEVO功能函数
|
| 354 |
+
@spaces.GPU()
|
| 355 |
def vevo_style(content_wav, style_wav):
|
| 356 |
temp_content_path = "wav/temp_content.wav"
|
| 357 |
temp_style_path = "wav/temp_style.wav"
|
|
|
|
| 434 |
traceback.print_exc()
|
| 435 |
raise e
|
| 436 |
|
| 437 |
+
@spaces.GPU()
|
| 438 |
def vevo_timbre(content_wav, reference_wav):
|
| 439 |
temp_content_path = "wav/temp_content.wav"
|
| 440 |
temp_reference_path = "wav/temp_reference.wav"
|
|
|
|
| 528 |
traceback.print_exc()
|
| 529 |
raise e
|
| 530 |
|
| 531 |
+
@spaces.GPU()
|
| 532 |
def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
| 533 |
temp_content_path = "wav/temp_content.wav"
|
| 534 |
temp_style_path = "wav/temp_style.wav"
|
|
|
|
| 650 |
traceback.print_exc()
|
| 651 |
raise e
|
| 652 |
|
| 653 |
+
@spaces.GPU()
|
| 654 |
def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
|
| 655 |
temp_ref_path = "wav/temp_ref.wav"
|
| 656 |
temp_timbre_path = "wav/temp_timbre.wav"
|
|
|
|
| 754 |
raise e
|
| 755 |
|
| 756 |
# 创建Gradio界面
|
| 757 |
+
def _vevo_tts_from_ui(text, ref_wav, timbre_ref_wav, style_ref_text,
                      src_language, style_ref_text_language):
    """Adapter between the Vevo-TTS tab inputs and ``vevo_tts``.

    Gradio hands the ``inputs`` list to the callback positionally. Wiring the
    six UI components straight to ``vevo_tts`` binds the sixth one (the
    "Style Reference Text Language" dropdown) to ``ref_language`` and leaves
    ``style_ref_text_language`` at its default. Passing the languages by
    keyword makes the binding explicit.

    The dropdown value is forwarded to both ``ref_language`` (which is what
    the direct wiring supplied) and ``style_ref_text_language`` (which is what
    the dropdown label describes).
    NOTE(review): vevo_tts's body is not visible from here; confirm which of
    the two parameters it actually reads.
    """
    return vevo_tts(
        text,
        ref_wav,
        timbre_ref_wav,
        style_ref_text,
        src_language=src_language,
        ref_language=style_ref_text_language,
        style_ref_text_language=style_ref_text_language,
    )


# Top-level Gradio UI: a title, a row of link badges, one tab per Vevo task
# (each wired to the matching vevo_* function), and an "About" section.
with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
    gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
    # Row of link badges
    with gr.Row(elem_id="links_row"):
        gr.HTML("""
            <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
                <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
                    <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
                </a>
                <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
                    <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
                </a>
                <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
                    <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
                </a>
                <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
                    <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
                </a>
            </div>
        """)

    with gr.Tab("Vevo-Timbre"):
        gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
        with gr.Row():
            with gr.Column():
                timbre_content = gr.Audio(label="Source Audio", type="numpy")
                timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
                timbre_button = gr.Button("Generate")
            with gr.Column():
                timbre_output = gr.Audio(label="Result")
        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)

    with gr.Tab("Vevo-Style"):
        gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
        with gr.Row():
            with gr.Column():
                style_content = gr.Audio(label="Source Audio", type="numpy")
                style_reference = gr.Audio(label="Style Reference", type="numpy")
                style_button = gr.Button("Generate")
            with gr.Column():
                style_output = gr.Audio(label="Result")
        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)

    with gr.Tab("Vevo-Voice"):
        gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
        with gr.Row():
            with gr.Column():
                voice_content = gr.Audio(label="Source Audio", type="numpy")
                voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
                voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
                voice_button = gr.Button("Generate")
            with gr.Column():
                voice_output = gr.Audio(label="Result")
        voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)

    with gr.Tab("Vevo-TTS"):
        gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
        with gr.Row():
            with gr.Column():
                tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
                tts_reference = gr.Audio(label="Style Reference", type="numpy")
                tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
                tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
                tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
                tts_button = gr.Button("Generate")
            with gr.Column():
                tts_output = gr.Audio(label="Result")

        # Route through the adapter so each dropdown reaches the vevo_tts
        # parameter it is labelled for, instead of relying on position.
        tts_button.click(
            _vevo_tts_from_ui,
            inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
            outputs=tts_output
        )

    gr.Markdown("""
    ## About VEVO
    VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
    1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
    2. **Vevo-Timbre**: Maintains style but transfers timbre
    3. **Vevo-Voice**: Transfers both style and timbre with separate references
    4. **Vevo-TTS**: Text-to-speech with separate style and timbre references

    For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
    """)

# Launch the app
demo.launch()
|