KangLiao committed
Commit 2d1f86e · 1 Parent(s): da23342
Files changed (1)
  1. app.py  +15 -25

app.py CHANGED
@@ -47,7 +47,7 @@ else:
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
-def multimodal_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
+def camera_understanding(image_src, question, seed, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache before generating
     torch.cuda.empty_cache()
 
@@ -84,19 +84,15 @@ def multimodal_understanding(image_src, question, seed, progress=gr.Progress(tra
     single_batch["latitude_field"] = cam[2:].unsqueeze(0)
 
     figs = make_perspective_figures(single_batch, single_batch, n_pairs=1)
+    imgs = []
     for k, fig in figs.items():
-        if "up_field" in k:
-            suffix = "_up"
-        elif "latitude_field" in k:
-            suffix = "_lat"
-        else:
-            suffix = f"_{k}"
-        out_path = os.path.join(save_dir, f"{stem}_camera_map_vis{suffix}.png")
-        plt.tight_layout()
-        fig.savefig(out_path, dpi=200, bbox_inches='tight', pad_inches=0)
+        fig.canvas.draw()
+        img = np.array(fig.canvas.renderer.buffer_rgba())
+        imgs.append(img)
         plt.close(fig)
+    merged_imgs = np.concatenate(imgs, axis=1)
 
-    return text
+    return text, merged_imgs
 
 
 @torch.inference_mode()
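For reference, the replacement loop renders each Matplotlib figure off-screen and concatenates the resulting RGBA buffers instead of saving PNGs to disk. Below is a minimal standalone sketch of that pattern; it uses the canvas-level `buffer_rgba()` of the Agg backend rather than reaching into `fig.canvas.renderer`, and the two dummy figures stand in for the app's perspective-field plots.

```python
import matplotlib
matplotlib.use("Agg")   # off-screen backend; no display required
import matplotlib.pyplot as plt
import numpy as np

# Stand-in figures for the perspective-field plots (same figsize so they concatenate cleanly).
figs = {}
for name in ("up_field", "latitude_field"):
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(np.random.rand(32, 32))
    ax.set_title(name)
    figs[name] = fig

imgs = []
for k, fig in figs.items():
    fig.canvas.draw()                           # rasterize the figure
    img = np.asarray(fig.canvas.buffer_rgba())  # (H, W, 4) uint8 view of the Agg buffer
    imgs.append(img.copy())                     # copy before the figure is closed
    plt.close(fig)

merged = np.concatenate(imgs, axis=1)           # panels side by side
print(merged.shape)                             # e.g. (400, 800, 4)
```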
@@ -190,11 +186,11 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab("Camera Understanding"):
         gr.Markdown(value="## Camera Understanding")
         image_input = gr.Image()
-        with gr.Column():
-            question_input = gr.Textbox(label="Question")
 
         understanding_button = gr.Button("Chat")
         understanding_output = gr.Textbox(label="Response")
+
+        image_output = gr.Gallery(label="Camera Maps", columns=1, rows=1)
 
         with gr.Accordion("Advanced options", open=False):
             und_seed_input = gr.Number(label="Seed", precision=0, value=42)
@@ -202,16 +198,10 @@ with gr.Blocks(css=css) as demo:
         examples_inpainting = gr.Examples(
             label="Camera Understanding examples",
             examples=[
-                [
-                    "Is the picture taken in winter?",
-                    "view.jpg",
-                ],
-                [
-                    "Briefly describe the image.",
-                    "view.jpg",
-                ],
+                "assets/1.jpg",
+                "assets/2.jpg",
             ],
-            inputs=[question_input, image_input],
+            inputs=image_input,
         )
 
     generation_button.click(
@@ -221,9 +211,9 @@ with gr.Blocks(css=css) as demo:
     )
 
     understanding_button.click(
-        multimodal_understanding,
-        inputs=[image_input, question_input, und_seed_input],
-        outputs=understanding_output
+        camera_understanding,
+        inputs=[image_input, und_seed_input],
+        outputs=[understanding_output, image_output]
     )
 
 demo.launch(share=True)
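Taken together, the new wiring has the handler return a `(text, images)` pair that Gradio fans out to the Textbox and the new Gallery, while the examples now feed only the image input. A minimal, self-contained sketch of that layout follows (Gradio 4.x assumed; the handler body, the generated placeholder assets, and the two-argument signature are illustrative stand-ins, not the Space's actual code):

```python
import os
import numpy as np
from PIL import Image as PILImage
import gradio as gr

# Placeholder example images so gr.Examples has files to point at
# (the real Space ships assets/1.jpg and assets/2.jpg).
os.makedirs("assets", exist_ok=True)
for p in ("assets/1.jpg", "assets/2.jpg"):
    if not os.path.exists(p):
        PILImage.fromarray((np.random.rand(64, 64, 3) * 255).astype(np.uint8)).save(p)

def camera_understanding(image_src, seed):
    # Placeholder handler: the real app runs the model and renders camera maps.
    shape = None if image_src is None else image_src.shape
    text = f"seed={int(seed)}, input image shape={shape}"
    fake_map = (np.random.rand(128, 256, 3) * 255).astype(np.uint8)
    # gr.Gallery expects a list of images, so wrap the merged map in a list.
    return text, [fake_map]

with gr.Blocks() as demo:
    with gr.Tab("Camera Understanding"):
        gr.Markdown("## Camera Understanding")
        image_input = gr.Image()
        understanding_button = gr.Button("Chat")
        understanding_output = gr.Textbox(label="Response")
        image_output = gr.Gallery(label="Camera Maps", columns=1, rows=1)
        with gr.Accordion("Advanced options", open=False):
            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
        gr.Examples(
            label="Camera Understanding examples",
            examples=["assets/1.jpg", "assets/2.jpg"],
            inputs=image_input,
        )
    understanding_button.click(
        camera_understanding,
        inputs=[image_input, und_seed_input],
        outputs=[understanding_output, image_output],
    )

demo.launch()
```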
 