init
app.py CHANGED
@@ -50,20 +50,31 @@ def center_crop(image):
 
 
 ##### load model
+# base model
 config = "configs/pipelines/stage_2_base.py"
 config = Config.fromfile(config)
 model = BUILDER.build(config.model).cuda().bfloat16().eval()
 checkpoint_path = "checkpoints/Puffin-Base.pth"
 checkpoint = torch.load(checkpoint_path)
-
+model.load_state_dict(checkpoint, strict=False)
 
 checkpoint_path_vae = "checkpoints_vae/vae.pth"
 checkpoint_vae = torch.load(checkpoint_path_vae)
-
+model.vae.load_state_dict(checkpoint_vae, strict=False)
+
+
+# thinking model
+config_thinking = "configs/pipelines/stage_3_thinking.py"
+config_thinking = Config.fromfile(config_thinking)
+model_think = BUILDER.build(config_thinking.model).cuda().bfloat16().eval()
+checkpoint_path = "checkpoints/Puffin-Thinking.pth"
+checkpoint = torch.load(checkpoint_path)
+model_think.load_state_dict(checkpoint, strict=False)
+model_think.vae.load_state_dict(checkpoint_vae, strict=False)
 
 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
-🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
+🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (<b>Puffin</b>) to advance multimodal spatial intelligence.<br>
 🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
 """
 
@@ -96,14 +107,14 @@ img_b64 = base64.b64encode(img_bytes).decode()
 
 html_img = f'''
 <div style="display:flex; justify-content:center; align-items:center; width:100%;">
-    <img src="data:image/png;base64,{img_b64}" style="border:none; width:
+    <img src="data:image/png;base64,{img_b64}" style="border:none; width:150px; height:auto;"/>
 </div>
 '''
 
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
-def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
+def camera_understanding(image_src, thinking_und, question, seed, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache before generating
     torch.cuda.empty_cache()
 
@@ -114,6 +125,8 @@ def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
     print(torch.cuda.is_available())
 
     prompt = ("Describe the image in detail. Then reason its spatial distribution and estimate its camera parameters (roll, pitch, and field-of-view).")
+    if thinking_und:
+        prompt = ("Reason the spatial distribution of this image in a thinking mode, and then estimate its camera parameters (roll, pitch, and field-of-view).")
 
     image = Image.fromarray(image_src).convert('RGB')
     image = center_crop(image)
@@ -124,11 +137,13 @@ def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
     x = rearrange(x, 'h w c -> c h w')
 
     with torch.no_grad():
-
+        if thinking_und:
+            outputs = model_think.understand(prompt=[prompt], pixel_values=[x], progress_bar=False)
+        else:
+            outputs = model.understand(prompt=[prompt], pixel_values=[x], progress_bar=False)
 
     text = outputs[0]
-
-    gen = Cam_Generator(mode="base")
+    gen = Cam_Generator(mode="cot") if thinking_und else Cam_Generator(mode="base")
     cam = gen.get_cam(text)
 
     bgr = np.array(image)[:, :, ::-1].astype(np.float32) / 255.0
@@ -337,7 +352,10 @@ with gr.Blocks(css=custom_css) as demo:
         roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
         pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
         fov = gr.Slider(minimum=0.3491, maximum=1.8326, value=1.5000, step=0.1000, label="fov value")
-
+        with gr.Accordion("Settings", open=True):
+            with gr.Row():
+                thinking_und = gr.Checkbox(label="Thinking", value=False)
+                seed_input = gr.Number(label="Seed (Optional)", precision=0, value=42)
 
         generation_button = gr.Button("Generate Images")
 
@@ -383,7 +401,7 @@
 
     generation_button.click(
        fn=generate_image,
-        inputs=[prompt_input, seed_input, roll, pitch, fov],
+        inputs=[prompt_input, seed_input, roll, pitch, fov, thinking_und],
         outputs=image_output
     )
 
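
Note on wiring: the hunks above give camera_understanding a new thinking_und argument (routing between model and model_think) and append thinking_und to the generation button's inputs, which means generate_image must accept a matching extra parameter; the corresponding .click(...) call for the understanding task is not part of these hunks. Below is a minimal, self-contained sketch of the Gradio pattern involved, where a Checkbox value listed in inputs arrives as an extra positional argument in the handler. All component and handler names in the sketch are illustrative assumptions, not taken from app.py.

import gradio as gr

# Minimal sketch (illustrative names only): a gr.Checkbox value listed in
# `inputs` is passed to the click handler as an extra positional argument,
# in the same order as the handler's parameters.

def describe(image, thinking, question, seed):
    # Stand-in for camera_understanding: just report which branch would run.
    mode = "thinking" if thinking else "base"
    return f"mode={mode}, question={question!r}, seed={seed}"

with gr.Blocks() as demo:
    image_input = gr.Image(label="Input image")
    question_input = gr.Textbox(label="Question")
    with gr.Accordion("Settings", open=True):
        with gr.Row():
            thinking_und = gr.Checkbox(label="Thinking", value=False)
            seed_input = gr.Number(label="Seed (Optional)", precision=0, value=42)
    run_button = gr.Button("Understand")
    text_output = gr.Textbox(label="Result")

    # The checkbox occupies the same position in `inputs` as `thinking`
    # does in the handler signature, mirroring how thinking_und precedes
    # question in the updated camera_understanding.
    run_button.click(
        fn=describe,
        inputs=[image_input, thinking_und, question_input, seed_input],
        outputs=text_output,
    )

if __name__ == "__main__":
    demo.launch()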