init
app.py CHANGED
@@ -61,25 +61,31 @@ checkpoint_path_vae = "checkpoints_vae/vae.pth"
 checkpoint_vae = torch.load(checkpoint_path_vae)
 info_vae = model.vae.load_state_dict(checkpoint_vae, strict=False)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+description = r"""
+<b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
+🔥 We make the first attempt to seamlessly integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
+🖼️ Switch between the tasks and choose different prompts or images to see the generation or understanding results.<br>
+"""
+
+article = r"""<h3>
+<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>. Thanks!</b></h3>
+<hr>
+📝 **Citation**
+<br>
+If our work is useful for your research, please consider citing:
+```bibtex
+@article{liao2025puffin,
+  title={Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation},
+  author={Liao, Kang and Wu, Size and Wu, Zhonghua and Jin, Linyi and Wang, Chao and Wang, Yikai and Wang, Fei and Li, Wei and Loy, Chen Change},
+  journal={arXiv preprint arXiv:2510.10777},
+  year={2025}
+}
+```
+📧 **Contact**
+<br>
+If you have any questions, please feel free to reach out to me at <b>kang.liao@ntu.edu.sg</b>.
+<br>
+"""
 
 
 @torch.inference_mode()
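A note on the context lines above: `load_state_dict(..., strict=False)` loads whatever checkpoint keys match the module and returns the unmatched ones, rather than raising a `RuntimeError` as the default `strict=True` would. A minimal self-contained sketch of that behavior (the toy module and key names below are illustrative, not taken from app.py):

```python
import torch
import torch.nn as nn

# Toy stand-in for the model.vae module loaded in app.py above.
vae = nn.Sequential(nn.Linear(8, 8))

# A checkpoint that only partially matches the module's state dict:
# "0.weight" lines up, "0.bias" is absent, "decoder.bias" is stray.
state = {"0.weight": torch.zeros(8, 8), "decoder.bias": torch.zeros(8)}

# strict=False loads the overlapping keys and reports the rest.
info = vae.load_state_dict(state, strict=False)
print(info.missing_keys)     # ['0.bias']
print(info.unexpected_keys)  # ['decoder.bias']
```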
@@ -217,7 +223,8 @@ css = '''
 .gradio-container {max-width: 960px !important}
 '''
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Puffin")
+    #gr.Markdown("# Puffin")
+    gr.Markdown(description)
 
     with gr.Tab("Camera-controllable Image Generation"):
         gr.Markdown(value="## Camera-controllable Image Generation")
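On the hunk above: `gr.Markdown` placed directly inside a `gr.Blocks` context renders at the top of the page, so `gr.Markdown(description)` swaps the bare `# Puffin` heading (left commented out) for the full HTML banner. A minimal sketch of the same layout pattern, with illustrative placeholder strings rather than the app's real ones:

```python
import gradio as gr

# Illustrative stand-ins for the description and article strings above.
description = "<b>Demo banner</b><br>Markdown/HTML rendered at the top of the page."
article = "Citation and contact notes rendered below the tabs."

with gr.Blocks() as demo:
    gr.Markdown(description)        # top-of-page banner
    with gr.Tab("Example Tab"):
        gr.Markdown("## Tab content goes here")
    gr.Markdown(article)            # footer after all tabs

demo.launch()
```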
@@ -284,5 +291,7 @@ with gr.Blocks(css=css) as demo:
         inputs=[image_input, und_seed_input],
         outputs=[understanding_output, camera_map]
     )
+
+    gr.Markdown(article)
 
 demo.launch(share=True)
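Two details of the unchanged context, for completeness: `@torch.inference_mode()` (first hunk) runs the decorated inference functions without building an autograd graph, and `demo.launch(share=True)` requests a temporary public gradio.live URL in addition to the local server. A small sketch of the decorator's effect (the `generate` function is a stand-in, not app.py's):

```python
import torch

@torch.inference_mode()  # disables autograd tracking inside the call
def generate(x: torch.Tensor) -> torch.Tensor:
    return x * 2

out = generate(torch.ones(3))
print(out.requires_grad)  # False: no graph was built for the result
```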