Spaces: Runtime error

Upload 5 files

- .gitignore +2 -0
- app.py +11 -9
.gitignore ADDED

@@ -0,0 +1,2 @@
+*.pyc
+__pycache__
app.py CHANGED

@@ -4,7 +4,7 @@ import gradio as gr
 from PIL import Image
 from omegaconf import OmegaConf
 from pathlib import Path
-from vocoder.
+from vocoder.bigvgan.models import VocoderBigVGAN
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.util import instantiate_from_config
 from wav_evaluation.models.CLAPWrapper import CLAPWrapper
@@ -29,7 +29,7 @@ def initialize_model(config, ckpt):
     return sampler
 
 sampler = initialize_model('configs/text_to_audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt')
-vocoder =
+vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w',device=device)
 clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
 
 def select_best_audio(prompt,wav_list):
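For context, `initialize_model(config, ckpt)` appears here only as the hunk header; its body, and the `device` variable passed to `VocoderBigVGAN`, are not part of this diff. Below is a minimal sketch of what that loader presumably does, assuming the usual latent-diffusion loading pattern implied by the `OmegaConf`, `instantiate_from_config` and `DDIMSampler` imports:

```python
# Hypothetical reconstruction, not taken from the diff: assumes the stock
# OmegaConf + instantiate_from_config + DDIMSampler loading pattern.
import torch
from omegaconf import OmegaConf
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.util import instantiate_from_config

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def initialize_model(config, ckpt):
    config = OmegaConf.load(config)                # parse the experiment YAML
    model = instantiate_from_config(config.model)  # build the latent diffusion model
    state = torch.load(ckpt, map_location='cpu')   # checkpoints usually wrap weights in 'state_dict'
    model.load_state_dict(state.get('state_dict', state), strict=False)
    model = model.to(device).eval()
    sampler = DDIMSampler(model)                   # DDIM sampling wrapper used by txt2audio below
    return sampler
```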
@@ -52,7 +52,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     uc = None
     if scale != 1.0:
         uc = sampler.model.get_learned_conditioning(n_samples * [""])
-    c = sampler.model.get_learned_conditioning(n_samples * [prompt])
+    c = sampler.model.get_learned_conditioning(n_samples * [prompt])  # shape: [1,77,1280], i.e. not yet a sentence embedding, still one embedding per word
     shape = [sampler.model.first_stage_model.embed_dim, H//8, W//8] # (z_dim, 80//2^x, 848//2^x)
     samples_ddim, _ = sampler.sample(S=ddim_steps,
                                      conditioning=c,
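The `sampler.sample(...)` call is cut off at the hunk boundary and the rest of the function is unchanged, so it is not shown here. A hedged sketch of how the classifier-free-guidance call typically continues with the stock `DDIMSampler.sample` keyword arguments, and how the latent is turned back into audio (the `vocoder.vocode` call and the variable names below are assumptions, not part of this commit):

```python
# Sketch only: argument names follow the standard DDIMSampler.sample API;
# the decode/vocode lines are assumed from the surrounding function.
samples_ddim, _ = sampler.sample(S=ddim_steps,
                                 conditioning=c,                      # text embeddings
                                 batch_size=n_samples,
                                 shape=shape,                         # latent (z_dim, H//8, W//8)
                                 unconditional_guidance_scale=scale,  # classifier-free guidance weight
                                 unconditional_conditioning=uc)       # embedding of the empty prompt
x_samples_ddim = sampler.model.decode_first_stage(samples_ddim)       # latent -> mel spectrogram
wav_list = [vocoder.vocode(mel.squeeze(0).cpu().numpy())              # mel -> waveform (assumed vocoder API)
            for mel in x_samples_ddim]
best_wav = select_best_audio(prompt, wav_list)                        # CLAP-ranked pick from the candidates
```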
@@ -74,7 +74,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     return best_wav
 
 
-def predict(prompt, ddim_steps, num_samples, scale, seed)
+def predict(prompt, ddim_steps, num_samples, scale, seed):  # from experiments, this input_image needs to be 256x256 or 512x512 for the results to look right; it should really be resized and the output resized back, but they use padding instead, not sure why
     melbins,mel_len = 80,624
     with torch.no_grad():
         result = txt2audio(
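`predict` is the wrapper Gradio ultimately calls; the hunk stops right where the `txt2audio(` call opens. A sketch of how the wrapper plausibly finishes, assuming it returns the `(sample_rate, waveform)` tuple a `gr.Audio` output expects; the 16 kHz rate matches the `bigv16k53w` vocoder name, but both it and the exact argument list are assumptions:

```python
# Assumed continuation; only melbins/mel_len and the opening txt2audio( call appear in the diff.
def predict(prompt, ddim_steps, num_samples, scale, seed):
    melbins, mel_len = 80, 624
    with torch.no_grad():
        result = txt2audio(
            sampler=sampler,
            vocoder=vocoder,
            prompt=prompt,
            seed=seed,
            scale=scale,
            ddim_steps=ddim_steps,
            n_samples=num_samples,
            H=melbins,
            W=mel_len,
        )
    # Gradio's Audio component accepts a (sample_rate, numpy array) tuple.
    return (16000, result)
```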
@@ -97,21 +97,23 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your text here
+            prompt = gr.Textbox(label="Prompt: Input your text here. ")
             run_button = gr.Button(label="Run")
 
 
             with gr.Accordion("Advanced options", open=False):
                 num_samples = gr.Slider(
-                    label="
+                    label="Select from audios num.This number control the number of candidates \
+                    (e.g., generate three audios and choose the best to show you). A Larger value usually lead to \
+                    better quality with heavier computation", minimum=1, maximum=10, value=3, step=1)
                 # num_samples = 1
                 ddim_steps = gr.Slider(label="Steps", minimum=1,
                                        maximum=150, value=100, step=1)
                 scale = gr.Slider(
-                    label="Guidance Scale", minimum=0.1, maximum=4.0, value=1.5, step=0.1
+                    label="Guidance Scale:(Large => more relevant to text but the quality may drop)", minimum=0.1, maximum=4.0, value=1.5, step=0.1
                 )
                 seed = gr.Slider(
-                    label="Seed",
+                    label="Seed:Change this value (any integer number) will lead to a different generation result.",
                     minimum=0,
                     maximum=2147483647,
                     step=1,
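The hunk ends inside the advanced-options accordion; the event wiring further down is unchanged and therefore not shown. For completeness, a `gr.Blocks` app like this one is typically wired roughly as follows (the `outaudio` component name is a placeholder, not taken from the diff):

```python
# Assumed wiring, not part of this commit: connects the controls defined above
# to predict() and sends the result to an audio output component.
with gr.Column():
    outaudio = gr.Audio(label="Generated audio")   # placeholder output component

run_button.click(fn=predict,
                 inputs=[prompt, ddim_steps, num_samples, scale, seed],
                 outputs=[outaudio])
```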
@@ -138,4 +140,4 @@ with gr.Blocks() as demo:
         with gr.Column():
             pass
 
-demo.launch()
+demo.launch(share=True)
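Note on the last change: `demo.launch(share=True)` asks Gradio to open a temporary public `*.gradio.live` tunnel in addition to the local server, which is handy for quick sharing during local debugging but is not needed when the app is already hosted as a Space.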