init
app.py CHANGED
@@ -75,11 +75,11 @@ model_think.vae.load_state_dict(checkpoint_vae, strict=False)
 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
 🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (<b>Puffin</b>) to advance multimodal spatial intelligence.<br>
-🖼️ Try to switch the
+🖼️ Try to switch the task table and choose different prompts or images to get the generation or understanding results.<br>
 """
 
 article = r"""<h3>
-<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>.
+<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>. Thank you.</b></h3>
 <hr>
 
 **Citation**
@@ -216,6 +216,10 @@ def generate_image(prompt_scene,
         "The camera parameters (roll, pitch, and field-of-view) are: "
         f"{roll:.4f}, {pitch:.4f}, {fov:.4f}."
     )
+
+    prompt_thinking = ("Given a scene description and corresponding camera parameters, "
+                       "merge them into a coherent prompt and generate an accurate visualization "
+                       "that highlights visual cues for spatial reasoning.")
     gen = Cam_Generator()
     cam_map = gen.get_cam(prompt_camera).to(model.device)
     cam_map = cam_map / (math.pi / 2)
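For reference, here is a minimal stand-alone sketch of how the scene text and the camera-parameter sentence from this hunk could be combined into the final prompt. The example values and the simple concatenation are assumptions for illustration, not the demo's exact logic:

import math

# Example inputs (hypothetical values; the real app takes these from the UI sliders).
prompt_scene = "A wide cobblestone plaza at dusk with street lamps."
roll, pitch, fov = 0.1000, -0.2000, math.radians(80)

# Camera-parameter sentence, mirroring the f-string in the hunk above.
prompt_camera = (
    "The camera parameters (roll, pitch, and field-of-view) are: "
    f"{roll:.4f}, {pitch:.4f}, {fov:.4f}."
)

# Assumption: the final prompt simply joins the scene description and the camera sentence.
prompt = f"{prompt_scene} {prompt_camera}"
print(prompt)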
@@ -224,25 +228,41 @@ def generate_image(prompt_scene,
 
     bsz = 4
     with torch.no_grad():
-        [14 lines removed here; their content is not visible in this diff view]
+        if thinking_gen:
+            images, output_reasoning = model_think.generate(
+                prompt=[prompt]*bsz,
+                cfg_prompt=[""]*bsz,
+                pixel_values_init=None,
+                cfg_scale=4.5,
+                num_steps=50,
+                cam_values=[[cam_map]]*bsz,
+                progress_bar=False,
+                reasoning=thinking_gen,
+                prompt_reasoning=[prompt_thinking]*bsz,
+                generator=generator,
+                height=512,
+                width=512
+            )
+        else:
+            images, output_reasoning = model.generate(
+                prompt=[prompt]*bsz,
+                cfg_prompt=[""]*bsz,
+                pixel_values_init=None,
+                cfg_scale=4.5,
+                num_steps=50,
+                cam_values=[[cam_map]]*bsz,
+                progress_bar=False,
+                reasoning=thinking_gen,
+                prompt_reasoning=[prompt_thinking]*bsz,
+                generator=generator,
+                height=512,
+                width=512
+            )
 
     images = rearrange(images, 'b c h w -> b h w c')
     images = torch.clamp(127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
     ret_images = [Image.fromarray(image) for image in images]
-    return ret_images
+    return ret_images, output_reasoning
 
 
 # Gradio interface
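The two branches above pass identical arguments and differ only in which model object they call (model_think versus model), so the post-processing that follows sees the same tensor layout either way. Below is a self-contained sketch of that post-processing step: generator output in roughly [-1, 1] with shape (b, c, h, w) is converted to uint8 PIL images. The random tensor is only a stand-in for the model output so the snippet runs on its own:

import torch
from einops import rearrange
from PIL import Image

# Stand-in for the generator output: 4 images, 3 channels, 512x512, values in [-1, 1].
images = torch.rand(4, 3, 512, 512) * 2 - 1

# Same conversion as in the diff: channels-last, scale to [0, 255], cast to uint8.
images = rearrange(images, 'b c h w -> b h w c')
images = torch.clamp(127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
ret_images = [Image.fromarray(image) for image in images]

print(len(ret_images), ret_images[0].size)  # 4 (512, 512)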
@@ -361,6 +381,8 @@ with gr.Blocks(css=custom_css) as demo:
         generation_button = gr.Button("Generate Images")
 
         image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
+
+        output_reasoning = gr.Textbox(label="Response")
 
         examples_t2i = gr.Examples(
             label="Prompt examples",
@@ -405,7 +427,7 @@ with gr.Blocks(css=custom_css) as demo:
         generation_button.click(
             fn=generate_image,
             inputs=[prompt_input, seed_input, roll, pitch, fov, thinking_gen],
-            outputs=image_output
+            outputs=[image_output, output_reasoning]
         )
 
         understanding_button.click(
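To see the new output wiring in isolation: the callback now returns a pair (list of images, reasoning text), so the click handler lists both a Gallery and a Textbox as outputs. The following is a minimal, self-contained sketch of that pattern; the stub callback, placeholder component names, and random thumbnails are assumptions for illustration, not the demo's own code:

import gradio as gr
import numpy as np

def fake_generate(prompt):
    # Stub: return four random thumbnails plus a text response,
    # mirroring the (gallery, textbox) output pair used above.
    imgs = [np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) for _ in range(4)]
    reasoning = f"Stub reasoning for prompt: {prompt!r}"
    return imgs, reasoning

with gr.Blocks() as demo:
    prompt_input = gr.Textbox(label="Prompt")
    generation_button = gr.Button("Generate Images")
    image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
    output_reasoning = gr.Textbox(label="Response")
    generation_button.click(
        fn=fake_generate,
        inputs=[prompt_input],
        outputs=[image_output, output_reasoning],
    )

if __name__ == "__main__":
    demo.launch()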