Delete app.py
app.py
DELETED
@@ -1,219 +0,0 @@
-import math
-import fire
-import gradio as gr
-import numpy as np
-import rich
-import torch
-from contextlib import nullcontext
-from einops import rearrange
-from functools import partial
-from ldm.models.diffusion.ddim import DDIMSampler
-from ldm.util import load_and_preprocess, instantiate_from_config
-from omegaconf import OmegaConf
-from PIL import Image
-from rich import print
-from torch import autocast
-from torchvision import transforms
-
-
-_SHOW_INTERMEDIATE = True
-_GPU_INDEX = 0
-# _GPU_INDEX = 2
-
-
-def load_model_from_config(config, ckpt, device, verbose=False):
-    print(f'Loading model from {ckpt}')
-    pl_sd = torch.load(ckpt, map_location=device)
-    if 'global_step' in pl_sd:
-        print(f'Global Step: {pl_sd["global_step"]}')
-    sd = pl_sd['state_dict']
-    model = instantiate_from_config(config.model)
-    m, u = model.load_state_dict(sd, strict=False)
-    if len(m) > 0 and verbose:
-        print('missing keys:')
-        print(m)
-    if len(u) > 0 and verbose:
-        print('unexpected keys:')
-        print(u)
-
-    model.to(device)
-    model.eval()
-    return model
-
-
-@torch.no_grad()
-def sample_model(input_im, model, sampler, precision, h, w, ddim_steps, n_samples, scale,
-                 ddim_eta, x, y, z):
-    precision_scope = autocast if precision == 'autocast' else nullcontext
-    with precision_scope('cuda'):
-        with model.ema_scope():
-            c = model.get_learned_conditioning(input_im).tile(n_samples, 1, 1)
-            T = torch.tensor([math.radians(x), math.sin(
-                math.radians(y)), math.cos(math.radians(y)), z])
-            T = T[None, None, :].repeat(n_samples, 1, 1).to(c.device)
-            c = torch.cat([c, T], dim=-1)
-            c = model.cc_projection(c)
-            cond = {}
-            cond['c_crossattn'] = [c]
-            c_concat = model.encode_first_stage((input_im.to(c.device))).mode().detach()
-            cond['c_concat'] = [model.encode_first_stage((input_im.to(c.device))).mode().detach()
-                                .repeat(n_samples, 1, 1, 1)]
-            if scale != 1.0:
-                uc = {}
-                uc['c_concat'] = [torch.zeros(n_samples, 4, h // 8, w // 8).to(c.device)]
-                uc['c_crossattn'] = [torch.zeros_like(c).to(c.device)]
-            else:
-                uc = None
-
-            shape = [4, h // 8, w // 8]
-            samples_ddim, _ = sampler.sample(S=ddim_steps,
-                                             conditioning=cond,
-                                             batch_size=n_samples,
-                                             shape=shape,
-                                             verbose=False,
-                                             unconditional_guidance_scale=scale,
-                                             unconditional_conditioning=uc,
-                                             eta=ddim_eta,
-                                             x_T=None)
-            print(samples_ddim.shape)
-            # samples_ddim = torch.nn.functional.interpolate(samples_ddim, 64, mode='nearest', antialias=False)
-            x_samples_ddim = model.decode_first_stage(samples_ddim)
-            return torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0).cpu()
-
-
-def main(
-        model,
-        device,
-        input_im,
-        preprocess=True,
-        x=0.,
-        y=0.,
-        z=0.,
-        scale=3.0,
-        n_samples=4,
-        ddim_steps=50,
-        ddim_eta=1.0,
-        precision='fp32',
-        h=256,
-        w=256,
-        ):
-    # input_im[input_im == [0., 0., 0.]] = [1., 1., 1., 1.]
-    print('old input_im:', input_im.size)
-
-    if preprocess:
-        input_im = load_and_preprocess(input_im)
-        input_im = (input_im / 255.0).astype(np.float32)
-        # (H, W, 3) array in [0, 1].
-
-    else:
-        input_im = input_im.resize([256, 256], Image.Resampling.LANCZOS)
-        input_im = np.asarray(input_im, dtype=np.float32) / 255.0
-        # (H, W, 4) array in [0, 1].
-
-    # old method: very important, thresholding background
-    # input_im[input_im[:, :, -1] <= 0.9] = [1., 1., 1., 1.]
-
-    # new method: apply correct method of compositing to avoid sudden transitions / thresholding
-    # (smoothly transition foreground to white background based on alpha values)
-    alpha = input_im[:, :, 3:4]
-    white_im = np.ones_like(input_im)
-    input_im = alpha * input_im + (1.0 - alpha) * white_im
-
-    input_im = input_im[:, :, 0:3]
-    # (H, W, 3) array in [0, 1].
-
-    print('new input_im:', input_im.shape, input_im.dtype, input_im.min(), input_im.max())
-    show_in_im = Image.fromarray((input_im * 255).astype(np.uint8))
-
-    input_im = transforms.ToTensor()(input_im).unsqueeze(0).to(device)
-    input_im = input_im * 2 - 1
-    input_im = transforms.functional.resize(input_im, [h, w])
-
-    sampler = DDIMSampler(model)
-    x_samples_ddim = sample_model(input_im, model, sampler, precision, h, w,
-                                  ddim_steps, n_samples, scale, ddim_eta, x, y, z)
-
-    output_ims = []
-    for x_sample in x_samples_ddim:
-        x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
-        output_ims.append(Image.fromarray(x_sample.astype(np.uint8)))
-
-    if _SHOW_INTERMEDIATE:
-        return (output_ims, show_in_im)
-    else:
-        return output_ims
-
-
-description = '''
-Generate novel viewpoints of an object depicted in one input image using a fine-tuned version of Stable Diffusion.
-'''
-
-article = '''
-## How to use this?
-TBD
-## How does this work?
-TBD
-'''
-
-
-def run_demo(
-        device_idx=_GPU_INDEX,
-        ckpt='last.ckpt',
-        config='configs/sd-objaverse-finetune-c_concat-256.yaml',
-        ):
-
-    device = f'cuda:{device_idx}'
-    config = OmegaConf.load(config)
-    model = load_model_from_config(config, ckpt, device=device)
-
-    inputs = [
-        gr.Image(type='pil', image_mode='RGBA'),  # shape=[512, 512]
-        gr.Checkbox(True, label='Preprocess image (remove background and center)',
-                    info='If enabled, the uploaded image will be preprocessed to remove the background and center the object by cropping and/or padding as necessary. '
-                    'If disabled, the image will be used as-is, *BUT* a fully transparent or white background is required.'),
-        # gr.Number(label='polar (between axis z+)'),
-        # gr.Number(label='azimuth (between axis x+)'),
-        # gr.Number(label='z (distance from center)'),
-        gr.Slider(-90, 90, value=0, step=5, label='Polar angle (vertical rotation in degrees)',
-                  info='Positive values move the camera down, while negative values move the camera up.'),
-        gr.Slider(-90, 90, value=0, step=5, label='Azimuth angle (horizontal rotation in degrees)',
-                  info='Positive values move the camera right, while negative values move the camera left.'),
-        gr.Slider(-2, 2, value=0, step=0.5, label='Radius (distance from center)',
-                  info='Positive values move the camera further away, while negative values move the camera closer.'),
-        gr.Slider(0, 30, value=3, step=1, label='cfg scale'),
-        gr.Slider(1, 8, value=4, step=1, label='Number of samples to generate'),
-        gr.Slider(5, 200, value=100, step=5, label='Number of steps'),
-    ]
-    output = [gr.Gallery(label='Generated images from specified new viewpoint')]
-    output[0].style(grid=2)
-
-    if _SHOW_INTERMEDIATE:
-        output += [gr.Image(type='pil', image_mode='RGB', label='Preprocessed input image')]
-
-    fn_with_model = partial(main, model, device)
-    fn_with_model.__name__ = 'fn_with_model'
-
-    examples = [
-        # ['assets/zero-shot/bear.png', 0, 0, 0, 3, 4, 100],
-        # ['assets/zero-shot/car.png', 0, 0, 0, 3, 4, 100],
-        # ['assets/zero-shot/elephant.png', 0, 0, 0, 3, 4, 100],
-        # ['assets/zero-shot/pikachu.png', 0, 0, 0, 3, 4, 100],
-        # ['assets/zero-shot/spyro.png', 0, 0, 0, 3, 4, 100],
-        # ['assets/zero-shot/taxi.png', 0, 0, 0, 3, 4, 100],
-    ]
-
-    demo = gr.Interface(
-        fn=fn_with_model,
-        title='Demo for Zero-Shot Control of Camera Viewpoints within a Single Image',
-        description=description,
-        article=article,
-        inputs=inputs,
-        outputs=output,
-        examples=examples,
-        allow_flagging='never',
-    )
-    demo.launch(enable_queue=True, share=True)
-
-
-if __name__ == '__main__':
-    fire.Fire(run_demo)
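
For reference, a minimal sketch of how this now-deleted script would have been launched locally. It assumes the repository's ldm package is importable and that a checkpoint and config matching run_demo's defaults (last.ckpt and configs/sd-objaverse-finetune-c_concat-256.yaml) exist on disk; the command-line flags come from fire.Fire exposing run_demo's keyword arguments, and the programmatic import of app is hypothetical since the file has been removed.

# Command-line launch sketch (paths are placeholders, adjust to your setup):
#
#   python app.py --device_idx=0 \
#       --ckpt=last.ckpt \
#       --config=configs/sd-objaverse-finetune-c_concat-256.yaml
#
# Equivalent programmatic launch, assuming app.py is still present:
from app import run_demo

run_demo(device_idx=0,
         ckpt='last.ckpt',
         config='configs/sd-objaverse-finetune-c_concat-256.yaml')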