init
app.py CHANGED
@@ -50,20 +50,31 @@ def center_crop(image):
 
 
 ##### load model
+# base model
 config = "configs/pipelines/stage_2_base.py"
 config = Config.fromfile(config)
 model = BUILDER.build(config.model).cuda().bfloat16().eval()
 checkpoint_path = "checkpoints/Puffin-Base.pth"
 checkpoint = torch.load(checkpoint_path)
-
+model.load_state_dict(checkpoint, strict=False)
 
 checkpoint_path_vae = "checkpoints_vae/vae.pth"
 checkpoint_vae = torch.load(checkpoint_path_vae)
-
+model.vae.load_state_dict(checkpoint_vae, strict=False)
+
+
+# thinking model
+config_thinking = "configs/pipelines/stage_3_thinking.py"
+config_thinking = Config.fromfile(config_thinking)
+model_think = BUILDER.build(config_thinking.model).cuda().bfloat16().eval()
+checkpoint_path = "checkpoints/Puffin-Thinking.pth"
+checkpoint = torch.load(checkpoint_path)
+model_think.load_state_dict(checkpoint, strict=False)
+model_think.vae.load_state_dict(checkpoint_vae, strict=False)
 
 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
-🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
+🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (<b>Puffin</b>) to advance multimodal spatial intelligence.<br>
 🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
 """
 
@@ -96,14 +107,14 @@ img_b64 = base64.b64encode(img_bytes).decode()
 
 html_img = f'''
 <div style="display:flex; justify-content:center; align-items:center; width:100%;">
-    <img src="data:image/png;base64,{img_b64}" style="border:none; width:
+    <img src="data:image/png;base64,{img_b64}" style="border:none; width:150px; height:auto;"/>
 </div>
 '''
 
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
-def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
+def camera_understanding(image_src, thinking_und, question, seed, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache before generating
     torch.cuda.empty_cache()
 
@@ -114,6 +125,8 @@ def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
     print(torch.cuda.is_available())
 
     prompt = ("Describe the image in detail. Then reason its spatial distribution and estimate its camera parameters (roll, pitch, and field-of-view).")
+    if thinking_und:
+        prompt = ("Reason the spatial distribution of this image in a thinking mode, and then estimate its camera parameters (roll, pitch, and field-of-view).")
 
     image = Image.fromarray(image_src).convert('RGB')
     image = center_crop(image)
@@ -124,11 +137,13 @@ def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
     x = rearrange(x, 'h w c -> c h w')
 
     with torch.no_grad():
-
+        if thinking_und:
+            outputs = model_think.understand(prompt=[prompt], pixel_values=[x], progress_bar=False)
+        else:
+            outputs = model.understand(prompt=[prompt], pixel_values=[x], progress_bar=False)
 
     text = outputs[0]
-
-    gen = Cam_Generator(mode="base")
+    gen = Cam_Generator(mode="cot") if thinking_und else Cam_Generator(mode="base")
     cam = gen.get_cam(text)
 
     bgr = np.array(image)[:, :, ::-1].astype(np.float32) / 255.0
@@ -337,7 +352,10 @@ with gr.Blocks(css=custom_css) as demo:
         roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
         pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
         fov = gr.Slider(minimum=0.3491, maximum=1.8326, value=1.5000, step=0.1000, label="fov value")
-
+        with gr.Accordion("Settings", open=True):
+            with gr.Row():
+                thinking_und = gr.Checkbox(label="Thinking", value=False)
+                seed_input = gr.Number(label="Seed (Optional)", precision=0, value=42)
 
         generation_button = gr.Button("Generate Images")
 
@@ -383,7 +401,7 @@
 
     generation_button.click(
        fn=generate_image,
-        inputs=[prompt_input, seed_input, roll, pitch, fov],
+        inputs=[prompt_input, seed_input, roll, pitch, fov, thinking_und],
         outputs=image_output
     )
 
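
Note on wiring: the hunks above give camera_understanding a new thinking_und argument (routing between model and model_think) and append thinking_und to the generation button's inputs, which means generate_image must accept a matching extra parameter; the corresponding .click(...) call for the understanding task is not part of these hunks. Below is a minimal, self-contained sketch of the Gradio pattern involved, where a Checkbox value listed in inputs arrives as an extra positional argument in the handler. All component and handler names in the sketch are illustrative assumptions, not taken from app.py.

import gradio as gr

# Minimal sketch (illustrative names only): a gr.Checkbox value listed in
# `inputs` is passed to the click handler as an extra positional argument,
# in the same order as the handler's parameters.

def describe(image, thinking, question, seed):
    # Stand-in for camera_understanding: just report which branch would run.
    mode = "thinking" if thinking else "base"
    return f"mode={mode}, question={question!r}, seed={seed}"

with gr.Blocks() as demo:
    image_input = gr.Image(label="Input image")
    question_input = gr.Textbox(label="Question")
    with gr.Accordion("Settings", open=True):
        with gr.Row():
            thinking_und = gr.Checkbox(label="Thinking", value=False)
            seed_input = gr.Number(label="Seed (Optional)", precision=0, value=42)
    run_button = gr.Button("Understand")
    text_output = gr.Textbox(label="Result")

    # The checkbox occupies the same position in `inputs` as `thinking`
    # does in the handler signature, mirroring how thinking_und precedes
    # question in the updated camera_understanding.
    run_button.click(
        fn=describe,
        inputs=[image_input, thinking_und, question_input, seed_input],
        outputs=text_output,
    )

if __name__ == "__main__":
    demo.launch()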