KangLiao committed on
Commit a44d21a · 1 Parent(s): ed5b647
Files changed (1)
  1. app.py  +40 -18
app.py CHANGED
@@ -75,11 +75,11 @@ model_think.vae.load_state_dict(checkpoint_vae, strict=False)
 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
 🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (<b>Puffin</b>) to advance multimodal spatial intelligence.<br>
-🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
+🖼️ Try to switch the task table and choose different prompts or images to get the generation or understanding results.<br>
 """
 
 article = r"""<h3>
-<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>. Thanks!</b></h3>
+<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>. Thank you.</b></h3>
 <hr>
 
 📑 **Citation**
@@ -216,6 +216,10 @@ def generate_image(prompt_scene,
         "The camera parameters (roll, pitch, and field-of-view) are: "
         f"{roll:.4f}, {pitch:.4f}, {fov:.4f}."
     )
+
+    prompt_thinking = ("Given a scene description and corresponding camera parameters, "
+                       "merge them into a coherent prompt and generate an accurate visualization "
+                       "that highlights visual cues for spatial reasoning.")
     gen = Cam_Generator()
     cam_map = gen.get_cam(prompt_camera).to(model.device)
     cam_map = cam_map / (math.pi / 2)
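Note on the camera map used above: `Cam_Generator().get_cam(prompt_camera)` converts the textual camera prompt into a camera-map tensor, which the app then normalizes by π/2 (angles in radians mapped roughly into [-1, 1]). The sketch below only illustrates that kind of normalization under the assumption of angle-valued per-pixel maps; `toy_cam_map` and its layout are hypothetical and are not the repository's `Cam_Generator` implementation.

```python
import math
import torch

def toy_cam_map(roll: float, pitch: float, fov: float, size: int = 512) -> torch.Tensor:
    """Hypothetical per-pixel camera map: pitch varies over rows, roll over columns."""
    ys = torch.linspace(-1.0, 1.0, size)
    xs = torch.linspace(-1.0, 1.0, size)
    gy, gx = torch.meshgrid(ys, xs, indexing="ij")
    pitch_map = pitch + gy * (fov / 2.0)                 # radians
    roll_map = roll + gx * (fov / 2.0)                   # radians
    cam_map = torch.stack([pitch_map, roll_map], dim=0)  # (2, H, W)
    return cam_map / (math.pi / 2)                       # same normalization as app.py above

cam = toy_cam_map(roll=0.1, pitch=-0.2, fov=math.radians(85.0))
print(cam.shape)  # torch.Size([2, 512, 512]), values roughly in [-1, 1]
```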
@@ -224,25 +228,41 @@
 
     bsz = 4
     with torch.no_grad():
-        images, output_reasoning = model.generate(
-            prompt=[prompt]*bsz,
-            cfg_prompt=[""]*bsz,
-            pixel_values_init=None,
-            cfg_scale=4.5,
-            num_steps=50,
-            cam_values=[[cam_map]]*bsz,
-            progress_bar=False,
-            reasoning=False,
-            prompt_reasoning=[""]*bsz,
-            generator=generator,
-            height=512,
-            width=512
-        )
+        if thinking_gen:
+            images, output_reasoning = model_think.generate(
+                prompt=[prompt]*bsz,
+                cfg_prompt=[""]*bsz,
+                pixel_values_init=None,
+                cfg_scale=4.5,
+                num_steps=50,
+                cam_values=[[cam_map]]*bsz,
+                progress_bar=False,
+                reasoning=thinking_gen,
+                prompt_reasoning=[prompt_thinking]*bsz,
+                generator=generator,
+                height=512,
+                width=512
+            )
+        else:
+            images, output_reasoning = model.generate(
+                prompt=[prompt]*bsz,
+                cfg_prompt=[""]*bsz,
+                pixel_values_init=None,
+                cfg_scale=4.5,
+                num_steps=50,
+                cam_values=[[cam_map]]*bsz,
+                progress_bar=False,
+                reasoning=thinking_gen,
+                prompt_reasoning=[prompt_thinking]*bsz,
+                generator=generator,
+                height=512,
+                width=512
+            )
 
     images = rearrange(images, 'b c h w -> b h w c')
     images = torch.clamp(127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
     ret_images = [Image.fromarray(image) for image in images]
-    return ret_images
+    return ret_images, output_reasoning
 
 
 # Gradio interface
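The two branches added in this hunk pass identical keyword arguments and differ only in which model object is called (`model_think` when the thinking toggle is on, `model` otherwise). A functionally equivalent, less repetitive form (a sketch only, not part of this commit) would select the model up front:

```python
# Sketch: same behaviour as the if/else above, with the model chosen first.
active_model = model_think if thinking_gen else model
with torch.no_grad():
    images, output_reasoning = active_model.generate(
        prompt=[prompt] * bsz,
        cfg_prompt=[""] * bsz,
        pixel_values_init=None,
        cfg_scale=4.5,
        num_steps=50,
        cam_values=[[cam_map]] * bsz,
        progress_bar=False,
        reasoning=thinking_gen,                    # False on the non-thinking path
        prompt_reasoning=[prompt_thinking] * bsz,
        generator=generator,
        height=512,
        width=512,
    )
```

Either way, `output_reasoning` is now returned alongside the images, which is what the new Gradio outputs in the hunks below consume.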
@@ -361,6 +381,8 @@ with gr.Blocks(css=custom_css) as demo:
     generation_button = gr.Button("Generate Images")
 
     image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
+
+    output_reasoning = gr.Textbox(label="Response")
 
     examples_t2i = gr.Examples(
         label="Prompt examples",
@@ -405,7 +427,7 @@ with gr.Blocks(css=custom_css) as demo:
     generation_button.click(
         fn=generate_image,
         inputs=[prompt_input, seed_input, roll, pitch, fov, thinking_gen],
-        outputs=image_output
+        outputs=[image_output, output_reasoning]
     )
 
     understanding_button.click(
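The wiring change above is the standard Gradio pattern for multiple outputs: the handler returns a tuple and `click()` maps it positionally onto the listed components. A minimal, self-contained sketch of the same pattern (a hypothetical toy demo, not the Puffin app):

```python
import gradio as gr

def fake_generate(prompt):
    # A real handler would return PIL images or file paths for the gallery.
    images = []
    reasoning = f"(reasoning trace for: {prompt})"
    return images, reasoning  # tuple maps onto [gallery, reasoning_box] below

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    run_button = gr.Button("Generate")
    gallery = gr.Gallery(label="Generated images", columns=4, rows=1)
    reasoning_box = gr.Textbox(label="Response")
    run_button.click(fn=fake_generate, inputs=[prompt_box], outputs=[gallery, reasoning_box])

# demo.launch()  # uncomment to run locally
```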
 