Spaces:
Running
Running
| import torch | |
| import numpy as np | |
| import gradio as gr | |
| from PIL import Image | |
| from omegaconf import OmegaConf | |
| from pathlib import Path | |
| from vocoder.bigvgan.models import VocoderBigVGAN | |
| from ldm.models.diffusion.ddim import DDIMSampler | |
| from ldm.util import instantiate_from_config | |
# Sample rate (Hz) reported alongside every generated waveform.
SAMPLE_RATE = 16000

# This script only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def initialize_model(config, ckpt=None):
    """Instantiate the model described by ``config`` and wrap it in a DDIM sampler.

    Args:
        config: Path to a config file loadable by ``OmegaConf.load``; its
            ``model`` node is handed to ``instantiate_from_config``.
        ckpt: Optional checkpoint path. When given, the checkpoint's
            ``"state_dict"`` entry is loaded non-strictly into the model.
            When ``None``, the freshly instantiated weights are kept.

    Returns:
        A ``DDIMSampler`` wrapping the model, which has been moved to the
        module-level ``device``.
    """
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)

    if ckpt is None:
        print("not load state dict")
    else:
        # Deserialize onto the CPU first; the model is moved to `device` below.
        # NOTE(review): torch.load unpickles the file, so only point this at
        # checkpoints from a trusted source.
        state_dict = torch.load(ckpt, map_location='cpu')["state_dict"]
        # strict=False tolerates missing/unexpected keys in the checkpoint.
        model.load_state_dict(state_dict, strict=False)

    model = model.to(device)
    # Move the conditioning encoder as well and keep its `device` attribute
    # in sync with the model's device.
    model.cond_stage_model.to(model.device)
    model.cond_stage_model.device = model.device
    print(model.device, device, model.cond_stage_model.device)

    return DDIMSampler(model)
# Build the shared sampler and vocoder once at start-up; `predict` reuses
# these module-level objects for every request.
sampler = initialize_model(
    config='configs/img_to_audio/img2audio_args.yaml',
    ckpt='useful_ckpts/ta54_epoch=000216.ckpt',
)
vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w', device=device)
def img2audio(sampler, vocoder, image, seed, scale, ddim_steps, W=624, H=80):
    """Generate one audio clip conditioned on an input image.

    Args:
        sampler: Sampler object exposing ``.model`` and ``.sample(...)``.
        vocoder: Object exposing ``vocode(spec)`` that turns a decoded
            spectrogram into a waveform.
        image: Input image as an array accepted by ``PIL.Image.fromarray``.
        seed: Seed for the initial latent noise; coerced to ``int``.
        scale: Guidance scale. When it equals ``1.0`` no unconditional
            conditioning is computed.
        ddim_steps: Number of sampling steps; coerced to ``int``.
        W: Spectrogram width; the latent width is ``W // 8``.
        H: Spectrogram height; the latent height is ``H // 8``.

    Returns:
        A ``(SAMPLE_RATE, wav)`` tuple for the single generated sample.

    Raises:
        ValueError: If ``image`` is ``None`` (e.g. nothing was uploaded).
    """
    if image is None:
        # Fail early with a readable message instead of an opaque error from
        # deep inside PIL.
        raise ValueError("img2audio requires an input image, but got None")

    # UI sliders may deliver floats (e.g. 44.0); RandomState rejects float
    # seeds and the step count is meant to be a whole number.
    seed = int(seed)
    ddim_steps = int(ddim_steps)

    n_samples = 1  # only 1 sample is supported
    model = sampler.model

    # Latent shape: (z_dim, H // 8, W // 8).
    shape = [model.first_stage_model.embed_dim, H // 8, W // 8]

    # Seeded initial noise so that the same seed reproduces the same result.
    prng = np.random.RandomState(seed)
    start_code = prng.randn(n_samples, *shape)
    start_code = torch.from_numpy(start_code).to(device=device, dtype=torch.float32)

    # The unconditional branch is only needed when guidance is active.
    uc = None
    if scale != 1.0:
        uc = model.get_learned_conditioning(n_samples * [""])

    image = Image.fromarray(image)
    image = model.cond_stage_model.preprocess(image).unsqueeze(0)
    image_embedding = model.cond_stage_model.forward_img(image)
    # Original note (translated): shape [1, 77, 1280], i.e. still per-token
    # embeddings rather than a pooled sentence embedding.
    # NOTE(review): this note reads like it was written for text
    # conditioning; confirm it holds for image embeddings.
    c = image_embedding.repeat(n_samples, 1, 1)

    samples_ddim, _ = sampler.sample(
        S=ddim_steps,
        conditioning=c,
        batch_size=n_samples,
        shape=shape,
        verbose=False,
        unconditional_guidance_scale=scale,
        unconditional_conditioning=uc,
        x_T=start_code,
    )

    x_samples_ddim = model.decode_first_stage(samples_ddim)
    # Map the decoder output from [-1, 1] to [0, 1].
    x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)

    # The batch holds exactly one sample (n_samples == 1), so vocode it directly.
    wav = vocoder.vocode(x_samples_ddim[0])
    return SAMPLE_RATE, wav
def predict(image, ddim_steps, scale, seed):
    """Gradio callback: run image-to-audio generation with the global models.

    Forwards the UI values to ``img2audio`` together with the module-level
    ``sampler`` and ``vocoder``, using a fixed 80 x 624 spectrogram size, and
    returns whatever ``img2audio`` returns.
    """
    # Original note (translated): empirically the input image needs to be
    # 256x256 or 512x512 for results to look right; it should really be
    # resized (and the output resized back), but the upstream code pads
    # instead, for reasons unknown.
    mel_bins = 80
    mel_frames = 624
    with torch.no_grad():
        return img2audio(
            sampler,
            vocoder,
            image,
            seed=seed,
            scale=scale,
            ddim_steps=ddim_steps,
            W=mel_frames,
            H=mel_bins,
        )
# Gradio UI: image upload and sampling controls in the first column, the
# generated audio in the second, and clickable examples underneath.
# NOTE(review): the nesting below was reconstructed from syntax because the
# original indentation was lost; confirm the layout matches the intended one.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Make-An-Audio: Image-to-Audio Generation")

    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Input Image: Select one image to upload ")
            # A button's caption is its `value`; `label` is not a documented
            # Button argument.
            run_button = gr.Button(value="Run")
            with gr.Accordion("Advanced options", open=False):
                ddim_steps = gr.Slider(
                    label="Steps",
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                )
                scale = gr.Slider(
                    label="Guidance Scale:(Large => more relevant to the input image but the quality may drop)",
                    minimum=0.1,
                    maximum=4.0,
                    value=1.5,
                    step=0.1,
                )
                seed = gr.Slider(
                    label="Seed:Change this value (any integer number) will lead to a different generation result.",
                    minimum=0,
                    maximum=2147483647,
                    step=1,
                    value=44,
                )
        with gr.Column():
            outaudio = gr.Audio()

    # `inputs` may only contain gradio components; their order matches the
    # parameter order of `predict`.
    run_button.click(
        fn=predict,
        inputs=[image, ddim_steps, scale, seed],
        outputs=[outaudio],
    )

    with gr.Row():
        with gr.Column():
            gr.Examples(
                examples=[
                    ['./example_imgs/cat.png', 100, 3, 55],
                    ['./example_imgs/violin.png', 100, 3, 55],
                ],
                inputs=[image, ddim_steps, scale, seed],
                outputs=[outaudio],
            )
        with gr.Column():
            # Intentionally empty; presumably a spacer next to the examples.
            pass

demo.launch()