Show inference time for both models (#2)
Commit 089817df1694b76f37a405a84faf136be48f86b8
Co-authored-by: Vik Korrapati <vikhyatk@users.noreply.huggingface.co>
app.py
CHANGED
@@ -2,6 +2,7 @@ import random
 import requests
 import json
 import ast
+import time
 
 import matplotlib.pyplot as plt
 from PIL import Image, ImageDraw, ImageFont
@@ -156,6 +157,7 @@ def detect_qwen(image, prompt):
         }
     ]
 
+    t0 = time.perf_counter()
     text = processor_qwen.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor_qwen(
@@ -173,37 +175,41 @@ def detect_qwen(image, prompt):
     output_text = processor_qwen.batch_decode(
         generated_ids_trimmed, do_sample=True, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
+    elapsed_ms = (time.perf_counter() - t0) * 1_000
 
     input_height = inputs['image_grid_thw'][0][1] * 14
     input_width = inputs['image_grid_thw'][0][2] * 14
 
     annotated_image = create_annotated_image(image, output_text, input_height, input_width)
 
-    return annotated_image, output_text
+    time_taken = f"**Inference time ({model_qwen_name}):** {elapsed_ms:.0f} ms"
+    return annotated_image, output_text, time_taken
 
 
 @GPU
 def detect_moondream(image, prompt, category_input):
+    t0 = time.perf_counter()
     if category_input in ["Object Detection", "Visual Grounding + Object Detection"]:
         output_text = model_moondream.detect(image=image, object=prompt)
     elif category_input == "Visual Grounding + Keypoint Detection":
         output_text = model_moondream.point(image=image, object=prompt)
     else:
         output_text = model_moondream.query(image=image, question=prompt, reasoning=True)
+    elapsed_ms = (time.perf_counter() - t0) * 1_000
 
     annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object", explicit_color=None)
-
-    return annotated_image, output_text
 
-
+    time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
+    return annotated_image, output_text, time_taken
+
 def detect(image, prompt_model_1, prompt_model_2, category_input):
     STANDARD_SIZE = (1024, 1024)
     image.thumbnail(STANDARD_SIZE)
 
-    annotated_image_model_1, output_text_model_1 = detect_qwen(image, prompt_model_1)
-    annotated_image_model_2, output_text_model_2 = detect_moondream(image, prompt_model_2, category_input)
+    annotated_image_model_1, output_text_model_1, timing_1 = detect_qwen(image, prompt_model_1)
+    annotated_image_model_2, output_text_model_2, timing_2 = detect_moondream(image, prompt_model_2, category_input)
 
-    return annotated_image_model_1, output_text_model_1, annotated_image_model_2, output_text_model_2
+    return annotated_image_model_1, output_text_model_1, timing_1, annotated_image_model_2, output_text_model_2, timing_2
 
 css_hide_share = """
 button#gradio-share-link-button-0 {
@@ -253,10 +259,12 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
         with gr.Column(scale=1):
             output_image_model_1 = gr.Image(type="pil", label=f"Annotated image for {model_qwen_name}", height=400)
             output_textbox_model_1 = gr.Textbox(label=f"Model response for {model_qwen_name}", lines=10)
+            output_time_model_1 = gr.Markdown()
 
         with gr.Column(scale=1):
             output_image_model_2 = gr.Image(type="pil", label=f"Annotated image for {model_moondream_name}", height=400)
             output_textbox_model_2 = gr.Textbox(label=f"Model response for {model_moondream_name}", lines=10)
+            output_time_model_2 = gr.Markdown()
 
     gr.Markdown("### Examples")
     example_prompts = [
@@ -276,8 +284,15 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
         label="Click an example to populate the input"
     )
 
-    generate_btn.click(
-
+    generate_btn.click(
+        fn=detect,
+        inputs=[image_input, prompt_input_model_1, prompt_input_model_2, category_input],
+        outputs=[
+            output_image_model_1, output_textbox_model_1, output_time_model_1,
+            output_image_model_2, output_textbox_model_2, output_time_model_2
+        ]
+    )
+
 if __name__ == "__main__":
     demo.launch()
 
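
Both detectors now follow the same pattern: record time.perf_counter() before the model call, take the difference after decoding, and report the result in milliseconds. perf_counter() is the right clock here because it is monotonic and high-resolution, so the measurement is not affected by system clock adjustments the way time.time() can be. A minimal sketch of the same idea as a reusable helper; the timed() helper and run_model below are illustrative, not part of the Space:

import time

def timed(fn, *args, **kwargs):
    # Run fn and return (result, elapsed wall-clock time in milliseconds).
    t0 = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed_ms = (time.perf_counter() - t0) * 1_000
    return result, elapsed_ms

# Hypothetical usage; run_model stands in for any inference call:
# output, ms = timed(run_model, image, prompt)
# print(f"Inference time: {ms:.0f} ms")

On the Qwen path, t0 is captured before apply_chat_template and the stop mark comes after batch_decode, so the reported figure is end-to-end wall-clock time (prompt preparation, generation, and decoding) rather than GPU time alone.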
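
On the UI side, each column gains an empty gr.Markdown() that the click handler fills with the formatted timing string; Gradio assigns detect's six return values to the six components listed in outputs positionally. A stripped-down sketch of that wiring with a stand-in function (component names here are illustrative, not the Space's full layout):

import time
import gradio as gr

def fake_detect(prompt):
    # Stand-in for a model call; returns a result plus a Markdown timing string.
    t0 = time.perf_counter()
    result = prompt.upper()
    elapsed_ms = (time.perf_counter() - t0) * 1_000
    return result, f"**Inference time:** {elapsed_ms:.0f} ms"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    run_btn = gr.Button("Run")
    answer = gr.Textbox(label="Model response")
    timing = gr.Markdown()  # stays empty until the first click
    # Return values map onto outputs in order: result -> answer, timing string -> timing.
    run_btn.click(fn=fake_detect, inputs=prompt, outputs=[answer, timing])

if __name__ == "__main__":
    demo.launch()

Keeping the timing in its own Markdown component, rather than appending it to the response textbox, leaves the raw model output untouched and lets the bold **...** formatting actually render.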