KangLiao committed on
Commit 70207e5 · 1 Parent(s): 008ea35
Files changed (1)
  1. app.py +29 -20
app.py CHANGED
@@ -61,25 +61,31 @@ checkpoint_path_vae = "checkpoints_vae/vae.pth"
  checkpoint_vae = torch.load(checkpoint_path_vae)
  info_vae = model.vae.load_state_dict(checkpoint_vae, strict=False)
 
- def fig_to_image(fig):
-     buf = io.BytesIO()
-     fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
-     buf.seek(0)
-     img = Image.open(buf).convert('RGB')
-     buf.close()
-     return img
-
- def extract_up_lat_figs(fig_dict):
-     fig_up, fig_lat = None, None
-     others = {}
-     for k, fig in fig_dict.items():
-         if ("up_field" in k) and (fig_up is None):
-             fig_up = fig
-         elif ("latitude_field" in k) and (fig_lat is None):
-             fig_lat = fig
-         else:
-             others[k] = fig
-     return fig_up, fig_lat, others
+ description = r"""
+ <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
+ 🔥 We make the first attempt to seamlessly integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
+ 🖼️ Switch between the tasks and choose different prompts or images to get generation or understanding results.<br>
+ """
+
+ article = r"""<h3>
+ <b>If Puffin is helpful, please help star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>GitHub repo</a>. Thanks!</b></h3>
+ <hr>
+ 📑 **Citation**
+ <br>
+ If our work is useful for your research, please consider citing:
+ ```bibtex
+ @article{liao2025puffin,
+   title={Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation},
+   author={Liao, Kang and Wu, Size and Wu, Zhonghua and Jin, Linyi and Wang, Chao and Wang, Yikai and Wang, Fei and Li, Wei and Loy, Chen Change},
+   journal={arXiv preprint arXiv:2510.10777},
+   year={2025}
+ }
+ ```
+ 📧 **Contact**
+ <br>
+ If you have any questions, please feel free to reach out to me at <b>kang.liao@ntu.edu.sg</b>.
+ <br>
+ """
 
 
  @torch.inference_mode()
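For context, the removed `fig_to_image` helper follows the standard Matplotlib-to-PIL round trip: render the figure into an in-memory PNG buffer, then reopen it with Pillow. A minimal self-contained sketch of that pattern, assuming `matplotlib` and `Pillow` are installed (the plot itself is illustrative):

```python
import io

import matplotlib
matplotlib.use("Agg")  # headless backend so the sketch runs without a display
import matplotlib.pyplot as plt
from PIL import Image


def fig_to_image(fig):
    # Render the figure to an in-memory PNG, then reopen it as a PIL image.
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
    buf.seek(0)
    img = Image.open(buf).convert("RGB")  # convert() copies pixels, so the buffer can close
    buf.close()
    return img


fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])
print(fig_to_image(fig).size)  # width/height in pixels; depends on backend and dpi
```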
@@ -217,7 +223,8 @@ css = '''
  .gradio-container {max-width: 960px !important}
  '''
  with gr.Blocks(css=css) as demo:
-     gr.Markdown("# Puffin")
+     #gr.Markdown("# Puffin")
+     gr.Markdown(description)
 
      with gr.Tab("Camera-controllable Image Generation"):
          gr.Markdown(value="## Camera-controllable Image Generation")
@@ -284,5 +291,7 @@ with gr.Blocks(css=css) as demo:
          inputs=[image_input, und_seed_input],
          outputs=[understanding_output, camera_map]
      )
+
+     gr.Markdown(article)
 
  demo.launch(share=True)
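Taken together, the commit lands a common Gradio Blocks layout: module-level HTML/Markdown strings rendered by `gr.Markdown` as a page header above the tabs and as a citation/contact footer below them. A minimal runnable sketch of that layout, assuming `gradio` is installed; the strings and the second tab name are placeholders, not the app's real content:

```python
import gradio as gr

# Placeholder header/footer strings; the real app keeps these in the
# `description` and `article` variables shown in the diff above.
description = "<b>Official Gradio demo</b> for Puffin."
article = "If Puffin is helpful, please star the GitHub repo."

css = ".gradio-container {max-width: 960px !important}"

with gr.Blocks(css=css) as demo:
    gr.Markdown(description)  # page header, rendered above all tabs
    with gr.Tab("Camera-controllable Image Generation"):
        gr.Markdown("## Camera-controllable Image Generation")
    with gr.Tab("Camera Understanding"):  # illustrative second tab
        gr.Markdown("## Camera Understanding")
    gr.Markdown(article)  # citation/contact footer, rendered below the tabs

if __name__ == "__main__":
    demo.launch()  # the Space itself launches with share=True
```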