KangLiao committed
Commit fb41543 · 1 Parent(s): a72f5f3
Files changed (1)
  1. app.py +28 -10
app.py CHANGED
@@ -50,20 +50,31 @@ def center_crop(image):
 
 
 ##### load model
+# base model
 config = "configs/pipelines/stage_2_base.py"
 config = Config.fromfile(config)
 model = BUILDER.build(config.model).cuda().bfloat16().eval()
 checkpoint_path = "checkpoints/Puffin-Base.pth"
 checkpoint = torch.load(checkpoint_path)
-info = model.load_state_dict(checkpoint, strict=False)
+model.load_state_dict(checkpoint, strict=False)
 
 checkpoint_path_vae = "checkpoints_vae/vae.pth"
 checkpoint_vae = torch.load(checkpoint_path_vae)
-info_vae = model.vae.load_state_dict(checkpoint_vae, strict=False)
+model.vae.load_state_dict(checkpoint_vae, strict=False)
+
+
+# thinking model
+config_thinking = "configs/pipelines/stage_3_thinking.py"
+config_thinking = Config.fromfile(config_thinking)
+model_think = BUILDER.build(config_thinking.model).cuda().bfloat16().eval()
+checkpoint_path = "checkpoints/Puffin-Thinking.pth"
+checkpoint = torch.load(checkpoint_path)
+model_think.load_state_dict(checkpoint, strict=False)
+model_think.vae.load_state_dict(checkpoint_vae, strict=False)
 
 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
-🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
+🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (<b>Puffin</b>) to advance multimodal spatial intelligence.<br>
 🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
 """
 
@@ -96,14 +107,14 @@ img_b64 = base64.b64encode(img_bytes).decode()
 
 html_img = f'''
 <div style="display:flex; justify-content:center; align-items:center; width:100%;">
-    <img src="data:image/png;base64,{img_b64}" style="border:none; width:200px; height:auto;"/>
+    <img src="data:image/png;base64,{img_b64}" style="border:none; width:150px; height:auto;"/>
 </div>
 '''
 
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
-def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
+def camera_understanding(image_src, thinking_und, question, seed, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache before generating
     torch.cuda.empty_cache()
 
@@ -114,6 +125,8 @@ def camera_understanding(image_src, question, seed, progress=gr.Progress(track_t
     print(torch.cuda.is_available())
 
     prompt = ("Describe the image in detail. Then reason its spatial distribution and estimate its camera parameters (roll, pitch, and field-of-view).")
+    if thinking_und:
+        prompt = ("Reason the spatial distribution of this image in a thinking mode, and then estimate its camera parameters (roll, pitch, and field-of-view).")
 
     image = Image.fromarray(image_src).convert('RGB')
     image = center_crop(image)
@@ -124,11 +137,13 @@ def camera_understanding(image_src, question, seed, progress=gr.Progress(track_t
     x = rearrange(x, 'h w c -> c h w')
 
     with torch.no_grad():
-        outputs = model.understand(prompt=[prompt], pixel_values=[x], progress_bar=False)
+        if thinking_und:
+            outputs = model_think.understand(prompt=[prompt], pixel_values=[x], progress_bar=False)
+        else:
+            outputs = model.understand(prompt=[prompt], pixel_values=[x], progress_bar=False)
 
     text = outputs[0]
-
-    gen = Cam_Generator(mode="base")
+    gen = Cam_Generator(mode="cot") if thinking_und else Cam_Generator(mode="base")
     cam = gen.get_cam(text)
 
     bgr = np.array(image)[:, :, ::-1].astype(np.float32) / 255.0
@@ -337,7 +352,10 @@ with gr.Blocks(css=custom_css) as demo:
         roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
         pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
        fov = gr.Slider(minimum=0.3491, maximum=1.8326, value=1.5000, step=0.1000, label="fov value")
-        seed_input = gr.Number(label="Seed (Optional)", precision=0, value=42)
+        with gr.Accordion("Settings", open=True):
+            with gr.Row():
+                thinking_und = gr.Checkbox(label="Thinking", value=False)
+                seed_input = gr.Number(label="Seed (Optional)", precision=0, value=42)
 
         generation_button = gr.Button("Generate Images")
 
@@ -383,7 +401,7 @@ with gr.Blocks(css=custom_css) as demo:
 
     generation_button.click(
         fn=generate_image,
-        inputs=[prompt_input, seed_input, roll, pitch, fov],
+        inputs=[prompt_input, seed_input, roll, pitch, fov, thinking_und],
        outputs=image_output
     )
 
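
Note: the diff adds a thinking_und parameter to camera_understanding but does not show the corresponding event wiring for the understanding tab, so the widget and handler names below (understanding_button, image_input, question_input, und_seed_input, understanding_output) are assumptions for illustration only. A minimal sketch of how a "Thinking" checkbox could be passed through to the updated signature (image_src, thinking_und, question, seed) might look like this:

import gradio as gr

# Hypothetical understanding-tab wiring (not part of this commit); all widget names
# below are assumptions, and the output component depends on what camera_understanding
# actually returns in app.py.
with gr.Blocks() as demo:
    image_input = gr.Image(label="Input Image")
    question_input = gr.Textbox(label="Question")
    thinking_und_box = gr.Checkbox(label="Thinking", value=False)  # mirrors the generation-tab checkbox
    und_seed_input = gr.Number(label="Seed (Optional)", precision=0, value=42)
    understanding_output = gr.Textbox(label="Response")
    understanding_button = gr.Button("Estimate Camera Parameters")

    # The second positional input maps to the new thinking_und argument, which selects
    # model_think (mode="cot") instead of the base model (mode="base") inside the handler.
    understanding_button.click(
        fn=camera_understanding,
        inputs=[image_input, thinking_und_box, question_input, und_seed_input],
        outputs=understanding_output,
    )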