init
app.py CHANGED
@@ -61,25 +61,31 @@ checkpoint_path_vae = "checkpoints_vae/vae.pth"
 checkpoint_vae = torch.load(checkpoint_path_vae)
 info_vae = model.vae.load_state_dict(checkpoint_vae, strict=False)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+description = r"""
+<b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
+🔥 We make the first attempt to seamlessly integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
+🖼️ Switch between the tasks and choose different prompts or images to see the generation or understanding results.<br>
+"""
+
+article = r"""<h3>
+<b>If Puffin is helpful, please help to star the <a href='https://github.com/KangLiao929/Puffin' target='_blank'>Github Repo</a>. Thanks!</b></h3>
+<hr>
+📝 **Citation**
+<br>
+If our work is useful for your research, please consider citing:
+```bibtex
+@article{liao2025puffin,
+  title={Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation},
+  author={Liao, Kang and Wu, Size and Wu, Zhonghua and Jin, Linyi and Wang, Chao and Wang, Yikai and Wang, Fei and Li, Wei and Loy, Chen Change},
+  journal={arXiv preprint arXiv:2510.10777},
+  year={2025}
+}
+```
+📧 **Contact**
+<br>
+If you have any questions, please feel free to reach out to me at <b>kang.liao@ntu.edu.sg</b>.
+<br>
+"""
 
 
 @torch.inference_mode()
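A note on the context lines above: `load_state_dict(..., strict=False)` loads whatever checkpoint keys match the module and returns the unmatched ones, rather than raising a `RuntimeError` as the default `strict=True` would. A minimal self-contained sketch of that behavior (the toy module and key names below are illustrative, not taken from app.py):

```python
import torch
import torch.nn as nn

# Toy stand-in for the model.vae module loaded in app.py above.
vae = nn.Sequential(nn.Linear(8, 8))

# A checkpoint that only partially matches the module's state dict:
# "0.weight" lines up, "0.bias" is absent, "decoder.bias" is stray.
state = {"0.weight": torch.zeros(8, 8), "decoder.bias": torch.zeros(8)}

# strict=False loads the overlapping keys and reports the rest.
info = vae.load_state_dict(state, strict=False)
print(info.missing_keys)     # ['0.bias']
print(info.unexpected_keys)  # ['decoder.bias']
```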
@@ -217,7 +223,8 @@ css = '''
 .gradio-container {max-width: 960px !important}
 '''
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Puffin")
+    #gr.Markdown("# Puffin")
+    gr.Markdown(description)
 
     with gr.Tab("Camera-controllable Image Generation"):
         gr.Markdown(value="## Camera-controllable Image Generation")
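On the hunk above: `gr.Markdown` placed directly inside a `gr.Blocks` context renders at the top of the page, so `gr.Markdown(description)` swaps the bare `# Puffin` heading (left commented out) for the full HTML banner. A minimal sketch of the same layout pattern, with illustrative placeholder strings rather than the app's real ones:

```python
import gradio as gr

# Illustrative stand-ins for the description and article strings above.
description = "<b>Demo banner</b><br>Markdown/HTML rendered at the top of the page."
article = "Citation and contact notes rendered below the tabs."

with gr.Blocks() as demo:
    gr.Markdown(description)        # top-of-page banner
    with gr.Tab("Example Tab"):
        gr.Markdown("## Tab content goes here")
    gr.Markdown(article)            # footer after all tabs

demo.launch()
```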
@@ -284,5 +291,7 @@ with gr.Blocks(css=css) as demo:
         inputs=[image_input, und_seed_input],
         outputs=[understanding_output, camera_map]
     )
+
+    gr.Markdown(article)
 
 demo.launch(share=True)
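Two details of the unchanged context, for completeness: `@torch.inference_mode()` (first hunk) runs the decorated inference functions without building an autograd graph, and `demo.launch(share=True)` requests a temporary public gradio.live URL in addition to the local server. A small sketch of the decorator's effect (the `generate` function is a stand-in, not app.py's):

```python
import torch

@torch.inference_mode()  # disables autograd tracking inside the call
def generate(x: torch.Tensor) -> torch.Tensor:
    return x * 2

out = generate(torch.ones(3))
print(out.requires_grad)  # False: no graph was built for the result
```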