Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,11 +1,10 @@
 import os
-import io
 import time
-import json
 import glob
 import tempfile
 from uuid import uuid4
 from typing import List, Tuple, Dict
+
 import gradio as gr
 from PIL import Image
 import numpy as np
@@ -20,27 +19,21 @@ import spaces
 MODELS = {
     "ViT-B/16 LVD-1689M": "facebook/dinov3-vitb16-pretrain-lvd1689m",
     "ViT-L/16 LVD-1689M": "facebook/dinov3-vitl16-pretrain-lvd1689m",
-    "ConvNeXt-Tiny LVD-1689M": "facebook/dinov3-convnext-tiny-pretrain-lvd1689m",
     "ViT-7B/16 LVD-1689M": "facebook/dinov3-vit7b16-pretrain-lvd1689m",
 }
 DEFAULT_MODEL = "ViT-B/16 LVD-1689M"
 HF_TOKEN = os.getenv("HF_TOKEN", None)  # set in Space Secrets after requesting gated access
 
+# Force fp32 everywhere.
+try:
+    torch.set_default_dtype(torch.float32)
+except Exception:
+    pass
+
 # ---------------------------
 # ZeroGPU booking helpers
 # ---------------------------
 
-def _gpu_duration_single(image: Image.Image, *_args, **_kwargs) -> int:
-    """Return a booking duration in seconds for a single image job.
-
-    Args:
-        image: PIL Image used only to estimate cost. Unused otherwise.
-
-    Returns:
-        Number of seconds to reserve GPU time for the task.
-    """
-    return 120  # 120 seconds is plenty for single image feature extraction
-
 def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     """Return a booking duration for a gallery job based on file count.
 
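Note: the `duration` argument of `@spaces.GPU` accepts either a fixed number of seconds or a callable that receives the same arguments as the decorated function, which is how the helpers above are wired up later in the file. A minimal sketch of the pattern (the `demo_job` name is illustrative, not part of this Space):

    import spaces  # ZeroGPU helper available on Hugging Face Spaces

    def _duration(n_items: int, *_args, **_kwargs) -> int:
        # Same convention as the helpers above: per-item cost plus a buffer, capped.
        return min(600, 35 * n_items + 30)

    @spaces.GPU(duration=_duration)  # booked for _duration(n_items) seconds
    def demo_job(n_items: int) -> int:
        # CUDA is only guaranteed to be available inside the decorated call.
        return n_items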
@@ -53,6 +46,7 @@ def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     n = max(1, len(files) if files else 1)
     return min(600, 35 * n + 30)  # 35s per image plus 30s buffer capped at 10 minutes
 
+
 def _gpu_duration_classify(*_args, **_kwargs) -> int:
     """Return a small booking duration for classification runs.
 
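For scale: with the gallery formula above, 4 images book min(600, 35 * 4 + 30) = 170 seconds, and the 600-second cap takes over from 17 images onward (35 * 17 + 30 = 625).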
@@ -62,11 +56,11 @@ def _gpu_duration_classify(*_args, **_kwargs) -> int:
     return 90  # small buffer for 1 query plus a handful of centroids
 
 # ---------------------------
-# Model loading and
+# Model loading and embedding extraction (fp32 only)
 # ---------------------------
 
-def _load(model_id: str):
-    """Load processor and model then move to CUDA eval.
+def _load(model_id: str) -> Tuple[AutoImageProcessor, AutoModel]:
+    """Load processor and model then move to CUDA eval in float32.
 
     Args:
         model_id: Hugging Face model id to load.
@@ -78,12 +72,12 @@ def _load(model_id: str):
         model_id, use_fast=True, token=HF_TOKEN if HF_TOKEN else None,
     )
     model = AutoModel.from_pretrained(
-        model_id,
-        token=HF_TOKEN if HF_TOKEN else None,
+        model_id, low_cpu_mem_usage=True, token=HF_TOKEN if HF_TOKEN else None,
     )
-    model.to("cuda").eval()
+    model.to("cuda").to(torch.float32).eval()
    return processor, model
 
+
 def _to_cuda_batchfeature(bf):
     """Move a BatchFeature or dict of tensors to CUDA.
 
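DINOv3 checkpoints are gated, so `from_pretrained` needs an access token, and `low_cpu_mem_usage=True` avoids materializing a second full copy of the weights in host RAM while loading, which matters most for the 7B backbone. A standalone sketch of the same loading pattern (model id taken from MODELS above; assumes a CUDA machine):

    import os
    import torch
    from transformers import AutoImageProcessor, AutoModel

    MODEL_ID = "facebook/dinov3-vitb16-pretrain-lvd1689m"
    TOKEN = os.getenv("HF_TOKEN")  # request gated access first, then set the secret

    processor = AutoImageProcessor.from_pretrained(MODEL_ID, use_fast=True, token=TOKEN)
    model = AutoModel.from_pretrained(MODEL_ID, low_cpu_mem_usage=True, token=TOKEN)
    model.to("cuda").to(torch.float32).eval()  # fp32 weights on GPU, inference only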
@@ -97,25 +91,23 @@ def _to_cuda_batchfeature(bf):
         return bf.to("cuda")
     return {k: v.to("cuda") for k, v in bf.items()}
 
-def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
-    """
+
+def _embed(image: Image.Image, model_id: str, pooling: str) -> np.ndarray:
+    """Extract a single-image DINOv3 embedding.
 
     Args:
         image: Input PIL image.
         model_id: Backbone id from MODELS.
-        pooling: Either CLS or Mean of patch tokens.
-        want_overlay: If True produce a light heat overlay.
+        pooling: Either "CLS" or "Mean of patch tokens".
 
     Returns:
-
+        1D NumPy vector in float32.
     """
-    t0 = time.time()
     processor, model = _load(model_id)
     bf = processor(images=image, return_tensors="pt")
     bf = _to_cuda_batchfeature(bf)
-    pixel_values = bf["pixel_values"]
 
-    with torch.
+    with torch.inference_mode():
         out = model(**bf)
 
     if pooling == "CLS":
@@ -124,40 +116,15 @@ def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
         else:
             emb = out.last_hidden_state[0, 0]
     else:
-        if out.last_hidden_state.ndim == 3:
+        if out.last_hidden_state.ndim == 3:  # ViT tokens
             num_regs = getattr(model.config, "num_register_tokens", 0)
             patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
             emb = patch_tokens.mean(dim=0)
-        else:
-            feat = out.last_hidden_state[0]
+        else:  # Conv/backbone feature map [C,H,W]
+            feat = out.last_hidden_state[0]
             emb = feat.mean(dim=(1, 2))
 
-
-
-    overlay = None
-    if want_overlay and getattr(out, "last_hidden_state", None) is not None and out.last_hidden_state.ndim == 3:
-        num_regs = getattr(model.config, "num_register_tokens", 0)
-        patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
-        num_patches = patch_tokens.shape[0]
-        h = int(num_patches ** 0.5)
-        w = h
-        if h * w != num_patches:
-            patch = getattr(model.config, "patch_size", 16)
-            h = int(pixel_values.shape[-2] // patch)
-            w = int(pixel_values.shape[-1] // patch)
-        mags = patch_tokens.norm(dim=1).reshape(h, w)
-        mags = (mags - mags.min()) / max(1e-8, (mags.max() - mags.min()))
-        m = (mags.cpu().numpy() * 255).astype(np.uint8)
-        heat = Image.fromarray(m).resize(image.size, Image.BILINEAR).convert("RGB")
-        overlay = Image.blend(image.convert("RGB"), heat, 0.35)
-
-    meta = {
-        "model_id": model_id,
-        "dtype": "fp16 on cuda",
-        "dim": int(emb.shape[0]),
-        "seconds": round(time.time() - t0, 3),
-    }
-    return emb, overlay, meta
+    return emb.float().cpu().numpy().astype(np.float32)
 
 # ---------------------------
 # Utilities
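The pooling branch depends on DINOv3's ViT token layout: `last_hidden_state` is `[batch, 1 + num_register_tokens + num_patches, dim]`, so mean pooling has to skip both the CLS token and the register tokens. A shape-only sketch with a dummy tensor (sizes illustrative):

    import torch

    B, regs, patches, dim = 1, 4, 196, 768            # e.g. 224px input, patch size 16
    tokens = torch.randn(B, 1 + regs + patches, dim)  # [CLS | registers | patches]

    cls_emb = tokens[0, 0]                       # "CLS" pooling
    mean_emb = tokens[0, 1 + regs:].mean(dim=0)  # "Mean of patch tokens" pooling
    assert cls_emb.shape == mean_emb.shape == (dim,)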
@@ -181,6 +148,7 @@ def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
             pass
     return imgs
 
+
 def _to_html_table(S: np.ndarray, names: List[str]) -> str:
     """Render a cosine similarity matrix as an HTML table.
 
@@ -200,15 +168,16 @@ def _to_html_table(S: np.ndarray, names: List[str]) -> str:
         cells = "".join(f"<td style='padding:6px 8px;text-align:center'>{v:.3f}</td>" for v in r)
         rows.append(f"<tr><th style='padding:6px 8px;text-align:left'>{names_safe[i]}</th>{cells}</tr>")
     table = f"""
-
-
-
-
-
-
-
+    <div style="overflow:auto; max-height:380px; border:1px solid #ddd">
+      <table style="border-collapse:collapse; width:100%">
+        {header}
+        {''.join(rows)}
+      </table>
+    </div>
+    """
     return table
 
+
 def _normalize_rows(X: np.ndarray) -> np.ndarray:
     """Normalize rows to unit norm with safe clipping.
 
@@ -221,41 +190,6 @@ def _normalize_rows(X: np.ndarray) -> np.ndarray:
     n = np.linalg.norm(X, axis=1, keepdims=True)
     return X / np.clip(n, 1e-8, None)
 
-# ---------------------------
-# Single image API ZeroGPU
-# ---------------------------
-
-@spaces.GPU(duration=_gpu_duration_single)
-def extract_embedding(image: Image.Image, model_name: str, pooling: str, want_overlay: bool):
-    """Compute an embedding for one image and return preview outputs.
-
-    Args:
-        image: Input PIL image.
-        model_name: Key from MODELS.
-        pooling: Either CLS or Mean of patch tokens.
-        want_overlay: If True return a blended overlay.
-
-    Returns:
-        overlay_or_input: PIL image for display.
-        preview_head: String with first 16 values of the embedding.
-        meta: Dict with model info and timing.
-        npy_path: Path to a saved .npy embedding file.
-    """
-    if image is None:
-        return None, "[]", {"error": "No image"}, None
-    if not torch.cuda.is_available():
-        raise RuntimeError("CUDA not available. Ensure Space hardware is ZeroGPU.")
-
-    model_id = MODELS[model_name]
-    emb, overlay, meta = _extract_core(image, model_id, pooling, want_overlay)
-
-    head = ", ".join(f"{x:.4f}" for x in emb[:16])
-    preview = f"[{head}{', ...' if emb.size > 16 else ''}]"
-    out_path = os.path.join(tempfile.gettempdir(), f"embedding_{uuid4().hex}.npy")
-    np.save(out_path, emb.astype(np.float32), allow_pickle=False)
-
-    return overlay if overlay else image, preview, meta, out_path
-
 # ---------------------------
 # Multi image similarity ZeroGPU
 # ---------------------------
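Once rows are unit-normalized, the cosine similarity matrix used by the Similarity tab is a single matrix product. A sketch with random stand-ins for embeddings:

    import numpy as np

    def _normalize_rows(X: np.ndarray) -> np.ndarray:
        n = np.linalg.norm(X, axis=1, keepdims=True)
        return X / np.clip(n, 1e-8, None)

    X = np.random.randn(5, 768).astype(np.float32)  # five embeddings
    Xn = _normalize_rows(X)
    S = Xn @ Xn.T  # S[i, j] is the cosine similarity; the diagonal is ~1.0
    assert np.allclose(np.diag(S), 1.0, atol=1e-5)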
@@ -283,7 +217,7 @@ def batch_similarity(files: List[str], model_name: str, pooling: str):
     imgs = _open_images_from_paths(paths)
     embs = []
     for img in imgs:
-        e
+        e = _embed(img, model_id, pooling)
         embs.append(e)
 
     if len(embs) < 2:
@@ -310,6 +244,7 @@ def _init_state() -> Dict:
     """
     return {"model_id": "", "pooling": "", "classes": {}}
 
+
 def _summarize_state(state: Dict) -> Dict:
     """Summarize counts in the classifier state.
 
@@ -327,6 +262,7 @@ def _summarize_state(state: Dict) -> Dict:
         "total_examples": int(sum(v.get("count", 0) for v in state.get("classes", {}).values())),
     }
 
+
 @spaces.GPU(duration=_gpu_duration_gallery)
 def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
     """Add images to a labeled class and update embeddings.
@@ -360,8 +296,8 @@ def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
 
     embs = []
     for im in imgs:
-        e
-        embs.append(e
+        e = _embed(im, model_id, pooling).astype(np.float32)
+        embs.append(e)
     X = np.vstack(embs)
 
     if class_name not in state["classes"]:
@@ -373,6 +309,7 @@ def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
     state["classes"][class_name]["count"] = new.shape[0]
     return _summarize_state(state), state
 
+
 @spaces.GPU(duration=_gpu_duration_classify)
 def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
     """Predict a class using cosine to class centroids.
@@ -398,8 +335,7 @@ def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
     if state.get("model_id") != model_id or state.get("pooling") != pooling:
         return {"error": "Model or pooling changed after building classes. Clear and rebuild."}, {}, None
 
-    q
-    q = q.astype(np.float32)[None, :]
+    q = _embed(image, model_id, pooling).astype(np.float32)[None, :]
     qn = _normalize_rows(q)
 
     names = []
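`predict_class` reduces to a nearest-centroid rule: normalize the query and each class centroid, take dot products, and rank. A self-contained sketch (class names and dimensions illustrative):

    import numpy as np

    centroids = {"cats": np.random.randn(768), "dogs": np.random.randn(768)}
    q = np.random.randn(768)

    names = list(centroids)
    C = np.vstack([centroids[n] / np.linalg.norm(centroids[n]) for n in names])
    qn = q / np.linalg.norm(q)

    scores = C @ qn              # one cosine score per class
    order = np.argsort(-scores)  # best match first
    print(names[order[0]], float(scores[order[0]]))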
@@ -427,6 +363,7 @@ def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
     ) + "</ol>"
     return {"top_k": top_k, "prediction": names[order[0]]}, result_dict, full_table
 
+
 def clear_classes(_state: Dict):
     """Reset the classifier state to empty.
 
@@ -443,7 +380,7 @@ def clear_classes(_state: Dict):
 # ---------------------------
 
 with gr.Blocks() as app:
-    gr.Markdown("# DINOv3 -
+    gr.Markdown("# DINOv3 - Similarity, Classification")
 
     with gr.Accordion("Paper and Citation", open=False):
         gr.Markdown("""
@@ -462,35 +399,9 @@ with gr.Blocks() as app:
     }
     """)
 
-    # ------------- Embeddings -------------
-    with gr.Tab("Embeddings"):
-        with gr.Row():
-            with gr.Column():
-                img = gr.Image(type="pil", label="Image", height=360)
-                model_dd = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
-                pooling = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
-                overlay = gr.Checkbox(value=True, label="Show overlay")
-                run_btn = gr.Button("Extract")
-            with gr.Column():
-                out_img = gr.Image(type="pil", label="Overlay or input", height=360)
-                preview = gr.Textbox(label="Embedding head", max_lines=2)
-                meta = gr.JSON(label="Meta")
-                download = gr.File(label="embedding.npy")
-        run_btn.click(extract_embedding, [img, model_dd, pooling, overlay], [out_img, preview, meta, download])
-
-        ex_single = []
-        for p in sorted(glob.glob("examples/*.*"))[:6]:
-            ex_single.append([p, DEFAULT_MODEL, "CLS", False])
-        if ex_single:
-            gr.Examples(
-                label="Examples",
-                examples=ex_single,
-                inputs=[img, model_dd, pooling, overlay],
-            )
-
     # ------------- Similarity -------------
     with gr.Tab("Similarity"):
-        gr.Markdown("Upload multiple images to compute a cosine similarity matrix and
+        gr.Markdown("Upload multiple images to compute a cosine similarity matrix and download a CSV.")
         files_in = gr.Files(label="Upload images", file_types=["image"], file_count="multiple", type="filepath")
         gallery_preview = gr.Gallery(label="Preview", columns=4, height=300)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
@@ -568,4 +479,4 @@ with gr.Blocks() as app:
     )
 
 if __name__ == "__main__":
-    app.queue().launch(mcp_server=True)
+    app.queue().launch(mcp_server=True)
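Two details worth noting at the launch site: `queue()` routes requests through Gradio's queue, which ZeroGPU relies on to schedule GPU bookings, and `launch(mcp_server=True)` (available in recent Gradio releases) additionally exposes the app's endpoints as MCP tools, so agents can call the similarity and classification functions directly.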