Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,11 +1,10 @@
 import os
-import io
 import time
-import json
 import glob
 import tempfile
 from uuid import uuid4
 from typing import List, Tuple, Dict
+
 import gradio as gr
 from PIL import Image
 import numpy as np
@@ -20,27 +19,21 @@ import spaces
 MODELS = {
     "ViT-B/16 LVD-1689M": "facebook/dinov3-vitb16-pretrain-lvd1689m",
     "ViT-L/16 LVD-1689M": "facebook/dinov3-vitl16-pretrain-lvd1689m",
-    "ConvNeXt-Tiny LVD-1689M": "facebook/dinov3-convnext-tiny-pretrain-lvd1689m",
     "ViT-7B/16 LVD-1689M": "facebook/dinov3-vit7b16-pretrain-lvd1689m",
 }
 DEFAULT_MODEL = "ViT-B/16 LVD-1689M"
 HF_TOKEN = os.getenv("HF_TOKEN", None)  # set in Space Secrets after requesting gated access
 
+# Force fp32 everywhere.
+try:
+    torch.set_default_dtype(torch.float32)
+except Exception:
+    pass
+
 # ---------------------------
 # ZeroGPU booking helpers
 # ---------------------------
 
-def _gpu_duration_single(image: Image.Image, *_args, **_kwargs) -> int:
-    """Return a booking duration in seconds for a single image job.
-
-    Args:
-        image: PIL Image used only to estimate cost. Unused otherwise.
-
-    Returns:
-        Number of seconds to reserve GPU time for the task.
-    """
-    return 120  # 120 seconds is plenty for single image feature extraction
-
 def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     """Return a booking duration for a gallery job based on file count.
 
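Note: the `duration` argument of `@spaces.GPU` accepts either a fixed number of seconds or a callable that receives the same arguments as the decorated function, which is how the helpers above are wired up later in the file. A minimal sketch of the pattern (the `demo_job` name is illustrative, not part of this Space):

    import spaces  # ZeroGPU helper available on Hugging Face Spaces

    def _duration(n_items: int, *_args, **_kwargs) -> int:
        # Same convention as the helpers above: per-item cost plus a buffer, capped.
        return min(600, 35 * n_items + 30)

    @spaces.GPU(duration=_duration)  # booked for _duration(n_items) seconds
    def demo_job(n_items: int) -> int:
        # CUDA is only guaranteed to be available inside the decorated call.
        return n_items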
@@ -53,6 +46,7 @@ def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     n = max(1, len(files) if files else 1)
     return min(600, 35 * n + 30)  # 35s per image plus 30s buffer capped at 10 minutes
 
+
 def _gpu_duration_classify(*_args, **_kwargs) -> int:
     """Return a small booking duration for classification runs.
 
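For scale: with the gallery formula above, 4 images book min(600, 35 * 4 + 30) = 170 seconds, and the 600-second cap takes over from 17 images onward (35 * 17 + 30 = 625).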
@@ -62,11 +56,11 @@ def _gpu_duration_classify(*_args, **_kwargs) -> int:
     return 90  # small buffer for 1 query plus a handful of centroids
 
 # ---------------------------
-# Model loading and
+# Model loading and embedding extraction (fp32 only)
 # ---------------------------
 
-def _load(model_id: str):
-    """Load processor and model then move to CUDA eval.
+def _load(model_id: str) -> Tuple[AutoImageProcessor, AutoModel]:
+    """Load processor and model then move to CUDA eval in float32.
 
     Args:
         model_id: Hugging Face model id to load.
@@ -78,12 +72,12 @@ def _load(model_id: str):
         model_id, use_fast=True, token=HF_TOKEN if HF_TOKEN else None,
     )
     model = AutoModel.from_pretrained(
-        model_id,
-        token=HF_TOKEN if HF_TOKEN else None,
+        model_id, low_cpu_mem_usage=True, token=HF_TOKEN if HF_TOKEN else None,
     )
-    model.to("cuda").eval()
+    model.to("cuda").to(torch.float32).eval()
    return processor, model
 
+
 def _to_cuda_batchfeature(bf):
     """Move a BatchFeature or dict of tensors to CUDA.
 
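DINOv3 checkpoints are gated, so `from_pretrained` needs an access token, and `low_cpu_mem_usage=True` avoids materializing a second full copy of the weights in host RAM while loading, which matters most for the 7B backbone. A standalone sketch of the same loading pattern (model id taken from MODELS above; assumes a CUDA machine):

    import os
    import torch
    from transformers import AutoImageProcessor, AutoModel

    MODEL_ID = "facebook/dinov3-vitb16-pretrain-lvd1689m"
    TOKEN = os.getenv("HF_TOKEN")  # request gated access first, then set the secret

    processor = AutoImageProcessor.from_pretrained(MODEL_ID, use_fast=True, token=TOKEN)
    model = AutoModel.from_pretrained(MODEL_ID, low_cpu_mem_usage=True, token=TOKEN)
    model.to("cuda").to(torch.float32).eval()  # fp32 weights on GPU, inference only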
@@ -97,25 +91,23 @@ def _to_cuda_batchfeature(bf):
         return bf.to("cuda")
     return {k: v.to("cuda") for k, v in bf.items()}
 
-def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
-    """
+
+def _embed(image: Image.Image, model_id: str, pooling: str) -> np.ndarray:
+    """Extract a single-image DINOv3 embedding.
 
     Args:
         image: Input PIL image.
         model_id: Backbone id from MODELS.
-        pooling: Either CLS or Mean of patch tokens.
-        want_overlay: If True produce a light heat overlay.
+        pooling: Either "CLS" or "Mean of patch tokens".
 
     Returns:
-
+        1D NumPy vector in float32.
     """
-    t0 = time.time()
     processor, model = _load(model_id)
     bf = processor(images=image, return_tensors="pt")
     bf = _to_cuda_batchfeature(bf)
-    pixel_values = bf["pixel_values"]
 
-    with torch.
+    with torch.inference_mode():
         out = model(**bf)
 
     if pooling == "CLS":
@@ -124,40 +116,15 @@ def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
         else:
             emb = out.last_hidden_state[0, 0]
     else:
-        if out.last_hidden_state.ndim == 3:
+        if out.last_hidden_state.ndim == 3:  # ViT tokens
             num_regs = getattr(model.config, "num_register_tokens", 0)
             patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
             emb = patch_tokens.mean(dim=0)
-        else:
-            feat = out.last_hidden_state[0]
+        else:  # Conv/backbone feature map [C,H,W]
+            feat = out.last_hidden_state[0]
             emb = feat.mean(dim=(1, 2))
 
-
-
-    overlay = None
-    if want_overlay and getattr(out, "last_hidden_state", None) is not None and out.last_hidden_state.ndim == 3:
-        num_regs = getattr(model.config, "num_register_tokens", 0)
-        patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
-        num_patches = patch_tokens.shape[0]
-        h = int(num_patches ** 0.5)
-        w = h
-        if h * w != num_patches:
-            patch = getattr(model.config, "patch_size", 16)
-            h = int(pixel_values.shape[-2] // patch)
-            w = int(pixel_values.shape[-1] // patch)
-        mags = patch_tokens.norm(dim=1).reshape(h, w)
-        mags = (mags - mags.min()) / max(1e-8, (mags.max() - mags.min()))
-        m = (mags.cpu().numpy() * 255).astype(np.uint8)
-        heat = Image.fromarray(m).resize(image.size, Image.BILINEAR).convert("RGB")
-        overlay = Image.blend(image.convert("RGB"), heat, 0.35)
-
-    meta = {
-        "model_id": model_id,
-        "dtype": "fp16 on cuda",
-        "dim": int(emb.shape[0]),
-        "seconds": round(time.time() - t0, 3),
-    }
-    return emb, overlay, meta
+    return emb.float().cpu().numpy().astype(np.float32)
 
 # ---------------------------
 # Utilities
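The pooling branch depends on DINOv3's ViT token layout: `last_hidden_state` is `[batch, 1 + num_register_tokens + num_patches, dim]`, so mean pooling has to skip both the CLS token and the register tokens. A shape-only sketch with a dummy tensor (sizes illustrative):

    import torch

    B, regs, patches, dim = 1, 4, 196, 768            # e.g. 224px input, patch size 16
    tokens = torch.randn(B, 1 + regs + patches, dim)  # [CLS | registers | patches]

    cls_emb = tokens[0, 0]                       # "CLS" pooling
    mean_emb = tokens[0, 1 + regs:].mean(dim=0)  # "Mean of patch tokens" pooling
    assert cls_emb.shape == mean_emb.shape == (dim,)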
@@ -181,6 +148,7 @@ def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
             pass
     return imgs
 
+
 def _to_html_table(S: np.ndarray, names: List[str]) -> str:
     """Render a cosine similarity matrix as an HTML table.
 
@@ -200,15 +168,16 @@ def _to_html_table(S: np.ndarray, names: List[str]) -> str:
         cells = "".join(f"<td style='padding:6px 8px;text-align:center'>{v:.3f}</td>" for v in r)
         rows.append(f"<tr><th style='padding:6px 8px;text-align:left'>{names_safe[i]}</th>{cells}</tr>")
     table = f"""
-
-
-
-
-
-
-
+    <div style="overflow:auto; max-height:380px; border:1px solid #ddd">
+      <table style="border-collapse:collapse; width:100%">
+        {header}
+        {''.join(rows)}
+      </table>
+    </div>
+    """
     return table
 
+
 def _normalize_rows(X: np.ndarray) -> np.ndarray:
     """Normalize rows to unit norm with safe clipping.
 
@@ -221,41 +190,6 @@ def _normalize_rows(X: np.ndarray) -> np.ndarray:
     n = np.linalg.norm(X, axis=1, keepdims=True)
     return X / np.clip(n, 1e-8, None)
 
-# ---------------------------
-# Single image API ZeroGPU
-# ---------------------------
-
-@spaces.GPU(duration=_gpu_duration_single)
-def extract_embedding(image: Image.Image, model_name: str, pooling: str, want_overlay: bool):
-    """Compute an embedding for one image and return preview outputs.
-
-    Args:
-        image: Input PIL image.
-        model_name: Key from MODELS.
-        pooling: Either CLS or Mean of patch tokens.
-        want_overlay: If True return a blended overlay.
-
-    Returns:
-        overlay_or_input: PIL image for display.
-        preview_head: String with first 16 values of the embedding.
-        meta: Dict with model info and timing.
-        npy_path: Path to a saved .npy embedding file.
-    """
-    if image is None:
-        return None, "[]", {"error": "No image"}, None
-    if not torch.cuda.is_available():
-        raise RuntimeError("CUDA not available. Ensure Space hardware is ZeroGPU.")
-
-    model_id = MODELS[model_name]
-    emb, overlay, meta = _extract_core(image, model_id, pooling, want_overlay)
-
-    head = ", ".join(f"{x:.4f}" for x in emb[:16])
-    preview = f"[{head}{', ...' if emb.size > 16 else ''}]"
-    out_path = os.path.join(tempfile.gettempdir(), f"embedding_{uuid4().hex}.npy")
-    np.save(out_path, emb.astype(np.float32), allow_pickle=False)
-
-    return overlay if overlay else image, preview, meta, out_path
-
 # ---------------------------
 # Multi image similarity ZeroGPU
 # ---------------------------
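Once rows are unit-normalized, the cosine similarity matrix used by the Similarity tab is a single matrix product. A sketch with random stand-ins for embeddings:

    import numpy as np

    def _normalize_rows(X: np.ndarray) -> np.ndarray:
        n = np.linalg.norm(X, axis=1, keepdims=True)
        return X / np.clip(n, 1e-8, None)

    X = np.random.randn(5, 768).astype(np.float32)  # five embeddings
    Xn = _normalize_rows(X)
    S = Xn @ Xn.T  # S[i, j] is the cosine similarity; the diagonal is ~1.0
    assert np.allclose(np.diag(S), 1.0, atol=1e-5)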
@@ -283,7 +217,7 @@ def batch_similarity(files: List[str], model_name: str, pooling: str):
     imgs = _open_images_from_paths(paths)
     embs = []
     for img in imgs:
-        e
+        e = _embed(img, model_id, pooling)
         embs.append(e)
 
     if len(embs) < 2:
@@ -310,6 +244,7 @@ def _init_state() -> Dict:
     """
     return {"model_id": "", "pooling": "", "classes": {}}
 
+
 def _summarize_state(state: Dict) -> Dict:
     """Summarize counts in the classifier state.
 
@@ -327,6 +262,7 @@ def _summarize_state(state: Dict) -> Dict:
         "total_examples": int(sum(v.get("count", 0) for v in state.get("classes", {}).values())),
     }
 
+
 @spaces.GPU(duration=_gpu_duration_gallery)
 def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
     """Add images to a labeled class and update embeddings.
@@ -360,8 +296,8 @@ def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
 
     embs = []
     for im in imgs:
-        e
-        embs.append(e
+        e = _embed(im, model_id, pooling).astype(np.float32)
+        embs.append(e)
     X = np.vstack(embs)
 
     if class_name not in state["classes"]:
@@ -373,6 +309,7 @@ def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
     state["classes"][class_name]["count"] = new.shape[0]
     return _summarize_state(state), state
 
+
 @spaces.GPU(duration=_gpu_duration_classify)
 def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
     """Predict a class using cosine to class centroids.
@@ -398,8 +335,7 @@ def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
     if state.get("model_id") != model_id or state.get("pooling") != pooling:
         return {"error": "Model or pooling changed after building classes. Clear and rebuild."}, {}, None
 
-    q
-    q = q.astype(np.float32)[None, :]
+    q = _embed(image, model_id, pooling).astype(np.float32)[None, :]
     qn = _normalize_rows(q)
 
     names = []
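`predict_class` reduces to a nearest-centroid rule: normalize the query and each class centroid, take dot products, and rank. A self-contained sketch (class names and dimensions illustrative):

    import numpy as np

    centroids = {"cats": np.random.randn(768), "dogs": np.random.randn(768)}
    q = np.random.randn(768)

    names = list(centroids)
    C = np.vstack([centroids[n] / np.linalg.norm(centroids[n]) for n in names])
    qn = q / np.linalg.norm(q)

    scores = C @ qn              # one cosine score per class
    order = np.argsort(-scores)  # best match first
    print(names[order[0]], float(scores[order[0]]))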
@@ -427,6 +363,7 @@ def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
     ) + "</ol>"
     return {"top_k": top_k, "prediction": names[order[0]]}, result_dict, full_table
 
+
 def clear_classes(_state: Dict):
     """Reset the classifier state to empty.
 
@@ -443,7 +380,7 @@ def clear_classes(_state: Dict):
 # ---------------------------
 
 with gr.Blocks() as app:
-    gr.Markdown("# DINOv3 -
+    gr.Markdown("# DINOv3 - Similarity, Classification")
 
     with gr.Accordion("Paper and Citation", open=False):
         gr.Markdown("""
@@ -462,35 +399,9 @@ with gr.Blocks() as app:
     }
     """)
 
-    # ------------- Embeddings -------------
-    with gr.Tab("Embeddings"):
-        with gr.Row():
-            with gr.Column():
-                img = gr.Image(type="pil", label="Image", height=360)
-                model_dd = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
-                pooling = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
-                overlay = gr.Checkbox(value=True, label="Show overlay")
-                run_btn = gr.Button("Extract")
-            with gr.Column():
-                out_img = gr.Image(type="pil", label="Overlay or input", height=360)
-                preview = gr.Textbox(label="Embedding head", max_lines=2)
-                meta = gr.JSON(label="Meta")
-                download = gr.File(label="embedding.npy")
-        run_btn.click(extract_embedding, [img, model_dd, pooling, overlay], [out_img, preview, meta, download])
-
-        ex_single = []
-        for p in sorted(glob.glob("examples/*.*"))[:6]:
-            ex_single.append([p, DEFAULT_MODEL, "CLS", False])
-        if ex_single:
-            gr.Examples(
-                label="Examples",
-                examples=ex_single,
-                inputs=[img, model_dd, pooling, overlay],
-            )
-
     # ------------- Similarity -------------
     with gr.Tab("Similarity"):
-        gr.Markdown("Upload multiple images to compute a cosine similarity matrix and
+        gr.Markdown("Upload multiple images to compute a cosine similarity matrix and download a CSV.")
         files_in = gr.Files(label="Upload images", file_types=["image"], file_count="multiple", type="filepath")
         gallery_preview = gr.Gallery(label="Preview", columns=4, height=300)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
@@ -568,4 +479,4 @@ with gr.Blocks() as app:
     )
 
 if __name__ == "__main__":
-    app.queue().launch(mcp_server=True)
+    app.queue().launch(mcp_server=True)
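Two details worth noting at the launch site: `queue()` routes requests through Gradio's queue, which ZeroGPU relies on to schedule GPU bookings, and `launch(mcp_server=True)` (available in recent Gradio releases) additionally exposes the app's endpoints as MCP tools, so agents can call the similarity and classification functions directly.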