init
app.py CHANGED
@@ -75,11 +75,11 @@ model_think.vae.load_state_dict(checkpoint_vae, strict=False)
 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
 🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (<b>Puffin</b>) to advance multimodal spatial intelligence.<br>
-🖼️ Try to switch the
+🖼️ Try to switch the task table and choose different prompts or images to get the generation or understanding results.<br>
 """
 
 article = r"""<h3>
-<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>.
+<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>. Thank you.</b></h3>
 <hr>
 
 **Citation**
@@ -216,6 +216,10 @@ def generate_image(prompt_scene,
         "The camera parameters (roll, pitch, and field-of-view) are: "
         f"{roll:.4f}, {pitch:.4f}, {fov:.4f}."
     )
+
+    prompt_thinking = ("Given a scene description and corresponding camera parameters, "
+                       "merge them into a coherent prompt and generate an accurate visualization "
+                       "that highlights visual cues for spatial reasoning.")
     gen = Cam_Generator()
     cam_map = gen.get_cam(prompt_camera).to(model.device)
     cam_map = cam_map / (math.pi / 2)
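For reference, here is a minimal stand-alone sketch of how the scene text and the camera-parameter sentence from this hunk could be combined into the final prompt. The example values and the simple concatenation are assumptions for illustration, not the demo's exact logic:

import math

# Example inputs (hypothetical values; the real app takes these from the UI sliders).
prompt_scene = "A wide cobblestone plaza at dusk with street lamps."
roll, pitch, fov = 0.1000, -0.2000, math.radians(80)

# Camera-parameter sentence, mirroring the f-string in the hunk above.
prompt_camera = (
    "The camera parameters (roll, pitch, and field-of-view) are: "
    f"{roll:.4f}, {pitch:.4f}, {fov:.4f}."
)

# Assumption: the final prompt simply joins the scene description and the camera sentence.
prompt = f"{prompt_scene} {prompt_camera}"
print(prompt)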
@@ -224,25 +228,41 @@ def generate_image(prompt_scene,
 
     bsz = 4
     with torch.no_grad():
-        [14 lines removed here; their content is not visible in this diff view]
+        if thinking_gen:
+            images, output_reasoning = model_think.generate(
+                prompt=[prompt]*bsz,
+                cfg_prompt=[""]*bsz,
+                pixel_values_init=None,
+                cfg_scale=4.5,
+                num_steps=50,
+                cam_values=[[cam_map]]*bsz,
+                progress_bar=False,
+                reasoning=thinking_gen,
+                prompt_reasoning=[prompt_thinking]*bsz,
+                generator=generator,
+                height=512,
+                width=512
+            )
+        else:
+            images, output_reasoning = model.generate(
+                prompt=[prompt]*bsz,
+                cfg_prompt=[""]*bsz,
+                pixel_values_init=None,
+                cfg_scale=4.5,
+                num_steps=50,
+                cam_values=[[cam_map]]*bsz,
+                progress_bar=False,
+                reasoning=thinking_gen,
+                prompt_reasoning=[prompt_thinking]*bsz,
+                generator=generator,
+                height=512,
+                width=512
+            )
 
     images = rearrange(images, 'b c h w -> b h w c')
     images = torch.clamp(127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
     ret_images = [Image.fromarray(image) for image in images]
-    return ret_images
+    return ret_images, output_reasoning
 
 
 # Gradio interface
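The two branches above pass identical arguments and differ only in which model object they call (model_think versus model), so the post-processing that follows sees the same tensor layout either way. Below is a self-contained sketch of that post-processing step: generator output in roughly [-1, 1] with shape (b, c, h, w) is converted to uint8 PIL images. The random tensor is only a stand-in for the model output so the snippet runs on its own:

import torch
from einops import rearrange
from PIL import Image

# Stand-in for the generator output: 4 images, 3 channels, 512x512, values in [-1, 1].
images = torch.rand(4, 3, 512, 512) * 2 - 1

# Same conversion as in the diff: channels-last, scale to [0, 255], cast to uint8.
images = rearrange(images, 'b c h w -> b h w c')
images = torch.clamp(127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
ret_images = [Image.fromarray(image) for image in images]

print(len(ret_images), ret_images[0].size)  # 4 (512, 512)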
@@ -361,6 +381,8 @@ with gr.Blocks(css=custom_css) as demo:
         generation_button = gr.Button("Generate Images")
 
         image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
+
+        output_reasoning = gr.Textbox(label="Response")
 
         examples_t2i = gr.Examples(
             label="Prompt examples",
@@ -405,7 +427,7 @@ with gr.Blocks(css=custom_css) as demo:
         generation_button.click(
             fn=generate_image,
             inputs=[prompt_input, seed_input, roll, pitch, fov, thinking_gen],
-            outputs=image_output
+            outputs=[image_output, output_reasoning]
         )
 
         understanding_button.click(
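To see the new output wiring in isolation: the callback now returns a pair (list of images, reasoning text), so the click handler lists both a Gallery and a Textbox as outputs. The following is a minimal, self-contained sketch of that pattern; the stub callback, placeholder component names, and random thumbnails are assumptions for illustration, not the demo's own code:

import gradio as gr
import numpy as np

def fake_generate(prompt):
    # Stub: return four random thumbnails plus a text response,
    # mirroring the (gallery, textbox) output pair used above.
    imgs = [np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) for _ in range(4)]
    reasoning = f"Stub reasoning for prompt: {prompt!r}"
    return imgs, reasoning

with gr.Blocks() as demo:
    prompt_input = gr.Textbox(label="Prompt")
    generation_button = gr.Button("Generate Images")
    image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
    output_reasoning = gr.Textbox(label="Response")
    generation_button.click(
        fn=fake_generate,
        inputs=[prompt_input],
        outputs=[image_output, output_reasoning],
    )

if __name__ == "__main__":
    demo.launch()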