# Hugging Face Space (running on ZeroGPU): Zero-Shot Object Detection Arena
import time
from dataclasses import dataclass
from typing import List, Tuple

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoModelForZeroShotObjectDetection,
)
# ---------------------------
# Setup
# ---------------------------
def extract_model_short_name(model_id: str) -> str:
    return model_id.split("/")[-1].replace("-", " ").replace("_", " ")


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Model bundles for cleaner wiring
@dataclass
class ZSDetBundle:
    model_id: str
    model_name: str
    processor: AutoProcessor
    model: AutoModelForZeroShotObjectDetection
# LLMDet
model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
processor_llmdet = AutoProcessor.from_pretrained(model_llmdet_id)
model_llmdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_llmdet_id)
bundle_llmdet = ZSDetBundle(
    model_id=model_llmdet_id,
    model_name=extract_model_short_name(model_llmdet_id),
    processor=processor_llmdet,
    model=model_llmdet,
)

# MM GroundingDINO
model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
processor_mm_grounding = AutoProcessor.from_pretrained(model_mm_grounding_id)
model_mm_grounding = AutoModelForZeroShotObjectDetection.from_pretrained(model_mm_grounding_id)
bundle_mm_grounding = ZSDetBundle(
    model_id=model_mm_grounding_id,
    model_name=extract_model_short_name(model_mm_grounding_id),
    processor=processor_mm_grounding,
    model=model_mm_grounding,
)

# OMDet Turbo
model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
processor_omdet = AutoProcessor.from_pretrained(model_omdet_id)
model_omdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_omdet_id)
bundle_omdet = ZSDetBundle(
    model_id=model_omdet_id,
    model_name=extract_model_short_name(model_omdet_id),
    processor=processor_omdet,
    model=model_omdet,
)

# OWLv2
model_owlv2_id = "google/owlv2-large-patch14-ensemble"
processor_owlv2 = AutoProcessor.from_pretrained(model_owlv2_id)
model_owlv2 = AutoModelForZeroShotObjectDetection.from_pretrained(model_owlv2_id)
bundle_owlv2 = ZSDetBundle(
    model_id=model_owlv2_id,
    model_name=extract_model_short_name(model_owlv2_id),
    processor=processor_owlv2,
    model=model_owlv2,
)
# ---------------------------
# Inference
# ---------------------------
def detect(
    bundle: ZSDetBundle,
    image: Image.Image,
    prompts: List[str],
    threshold: float,
) -> Tuple[List[Tuple[Tuple[int, int, int, int], str]], str]:
    """
    Returns [(bbox, label_score_str), ...], time_str
    """
    t0 = time.perf_counter()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # HF zero-shot OD expects list-of-list text
    texts = [prompts]
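    # e.g. prompts = ["a cat", "a remote control"]  ->  texts = [["a cat", "a remote control"]]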
    inputs = bundle.processor(images=image, text=texts, return_tensors="pt").to(device)
    model = bundle.model.to(device).eval()
    with torch.inference_mode():
        outputs = model(**inputs)
    results = bundle.processor.post_process_grounded_object_detection(
        outputs, threshold=threshold, target_sizes=[image.size[::-1]], text_labels=texts,
    )[0]
    annotations = []
    for box, score, label_name in zip(results["boxes"], results["scores"], results["text_labels"]):
        if float(score) < threshold:
            continue
        xmin, ymin, xmax, ymax = map(int, box.tolist())
        annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {float(score):.2f}"))
    elapsed_ms = (time.perf_counter() - t0) * 1000
    time_taken = f"**Inference time ({bundle.model_name}):** {elapsed_ms:.0f} ms"
    return annotations, time_taken
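# Illustrative single-model call (values are made up, not from a real run):
#   anns, msg = detect(bundle_llmdet, img, ["a cat"], 0.3)
#   anns -> [((12, 34, 200, 220), "a cat 0.87")]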
def parse_prompts(prompts_str: str) -> List[str]:
    return [p.strip() for p in prompts_str.split(",") if p.strip()]
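# e.g. parse_prompts("a cat, , a remote control ") -> ["a cat", "a remote control"]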
@spaces.GPU  # ZeroGPU: request a GPU for the duration of each detection run
def run_detection(
    image: Image.Image,
    prompts_str: str,
    threshold_llm: float,
    threshold_mm: float,
    threshold_owlv2: float,
    threshold_omdet: float,
):
    prompts = parse_prompts(prompts_str)
    ann_llm, time_llm = detect(bundle_llmdet, image, prompts, threshold_llm)
    ann_mm, time_mm = detect(bundle_mm_grounding, image, prompts, threshold_mm)
    ann_owlv2, time_owlv2 = detect(bundle_owlv2, image, prompts, threshold_owlv2)
    ann_omdet, time_omdet = detect(bundle_omdet, image, prompts, threshold_omdet)
    return (
        (image, ann_llm), time_llm,
        (image, ann_mm), time_mm,
        (image, ann_owlv2), time_owlv2,
        (image, ann_omdet), time_omdet,
    )
# ---------------------------
# Compact Description
# ---------------------------
description_md = """
# Zero-Shot Object Detection Arena

Compare **four zero-shot object detectors** on the same image and prompts.
Upload an image (or pick an example), add **comma-separated prompts**, tweak the per-model **thresholds**, and hit **Detect**.
You'll see bounding boxes, scores, and **per-model inference time**.

**Models**
- LLMDet Tiny – [`iSEE-Laboratory/llmdet_tiny`](https://huggingface.co/iSEE-Laboratory/llmdet_tiny)
- MM GroundingDINO Tiny O365v1 GoldG – [`rziga/mm_grounding_dino_tiny_o365v1_goldg`](https://huggingface.co/rziga/mm_grounding_dino_tiny_o365v1_goldg)
- OMDet Turbo Swin Tiny – [`omlab/omdet-turbo-swin-tiny-hf`](https://huggingface.co/omlab/omdet-turbo-swin-tiny-hf)
- OWLv2 Large Patch14 Ensemble – [`google/owlv2-large-patch14-ensemble`](https://huggingface.co/google/owlv2-large-patch14-ensemble)

**Tip:** Lower thresholds increase recall but may also increase false positives.
"""
# ---------------------------
# UI
# ---------------------------
with gr.Blocks() as app:
    gr.Markdown(description_md)
    with gr.Row():
        with gr.Column(scale=1):
            image = gr.Image(type="pil", label="Upload an image", height=400)
            prompts = gr.Textbox(
                label="Prompts (comma-separated)",
                value="a cat, a remote control",
                placeholder="e.g., a cat, a remote control",
            )
            with gr.Accordion("Per-model confidence thresholds", open=True):
                threshold_llm = gr.Slider(label=f"Threshold – {bundle_llmdet.model_name}", minimum=0.0, maximum=1.0, value=0.3)
                threshold_mm = gr.Slider(label=f"Threshold – {bundle_mm_grounding.model_name}", minimum=0.0, maximum=1.0, value=0.3)
                threshold_owlv2 = gr.Slider(label=f"Threshold – {bundle_owlv2.model_name}", minimum=0.0, maximum=1.0, value=0.1)
                threshold_omdet = gr.Slider(label=f"Threshold – {bundle_omdet.model_name}", minimum=0.0, maximum=1.0, value=0.2)
            generate_btn = gr.Button(value="Detect")
    with gr.Row():
        with gr.Column(scale=2):
            output_image_llm = gr.AnnotatedImage(label=f"Annotated – {bundle_llmdet.model_name}", height=400)
            output_time_llm = gr.Markdown()
        with gr.Column(scale=2):
            output_image_mm = gr.AnnotatedImage(label=f"Annotated – {bundle_mm_grounding.model_name}", height=400)
            output_time_mm = gr.Markdown()
    with gr.Row():
        with gr.Column(scale=2):
            output_image_owlv2 = gr.AnnotatedImage(label=f"Annotated – {bundle_owlv2.model_name}", height=400)
            output_time_owlv2 = gr.Markdown()
        with gr.Column(scale=2):
            output_image_omdet = gr.AnnotatedImage(label=f"Annotated – {bundle_omdet.model_name}", height=400)
            output_time_omdet = gr.Markdown()
    gr.Markdown("### Examples")
    example_data = [
        ["http://images.cocodataset.org/val2017/000000039769.jpg", "a cat, a remote control", 0.30, 0.30, 0.10, 0.30],
        ["http://images.cocodataset.org/val2017/000000000139.jpg", "a person, a tv, a remote", 0.35, 0.30, 0.12, 0.30],
    ]
    gr.Examples(
        examples=example_data,
        inputs=[image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet],
        label="Click an example to populate the inputs",
    )
    inputs = [image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet]
    outputs = [
        output_image_llm, output_time_llm,
        output_image_mm, output_time_mm,
        output_image_owlv2, output_time_owlv2,
        output_image_omdet, output_time_omdet,
    ]
    generate_btn.click(fn=run_detection, inputs=inputs, outputs=outputs)
    image.upload(fn=run_detection, inputs=inputs, outputs=outputs)

# Optional: queue to handle multiple users gracefully (tune as needed)
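# A minimal sketch (left commented out); max_size=16 is an arbitrary example value, not a tuned setting.
# app.queue(max_size=16)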
app.launch()