RedHotTensors committed on
Commit d62ba4b · 1 Parent(s): 6884ab9

JTP-3 Hydra Release

Files changed (7)
  1. README.md +5 -5
  2. app.py +434 -0
  3. glu.py +40 -0
  4. hydra_pool.py +581 -0
  5. image.py +271 -0
  6. model.py +192 -0
  7. requirements.txt +8 -0
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: JTP 3 Demo
- emoji: 🌖
- colorFrom: blue
- colorTo: purple
+ title: JTP 3 Hydra Demo
+ emoji: 🚀
+ colorFrom: red
+ colorTo: blue
  sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
@@ -11,4 +11,4 @@ license: apache-2.0
  short_description: JTP-3 Hydra Demo
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <a href="https://huggingface.co/RedRocket/JTP-3">JTP-3 Hydra Main Repository</a>
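For anyone following the repository link, a minimal sketch of fetching the checkpoint this Space loads; the repo_id and filename are taken from the hf_hub_download call in app.py below.

```python
# Minimal sketch: download the JTP-3 Hydra checkpoint used by app.py.
from huggingface_hub import hf_hub_download

checkpoint = hf_hub_download(
    repo_id="RedRocket/JTP-3",
    filename="models/jtp-3-hydra.safetensors",
)
print(checkpoint)  # local cache path of the safetensors file
```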
app.py ADDED
@@ -0,0 +1,434 @@
1
+ from io import BytesIO
2
+ from threading import Lock
3
+
4
+ import numpy as np
5
+
6
+ import torch
7
+ from torch import Tensor
8
+ from torch.nn import Parameter
9
+
10
+ import spaces
11
+ from huggingface_hub import hf_hub_download
12
+ import gradio as gr
13
+
14
+ from PIL import Image, ImageDraw, ImageFont
15
+
16
+ import requests
17
+
18
+ from model import load_model, process_image, patchify_image
19
+ from image import unpatchify
20
+
21
+ device = "cuda" if torch.cuda.is_available() else "cpu"
22
+
23
+ PATCH_SIZE = 16
24
+ MAX_SEQ_LEN = 1024
25
+
26
+
27
+ model_lock = Lock()
28
+ model, tag_list = load_model(
29
+ hf_hub_download(repo_id="RedRocket/JTP-3", filename="models/jtp-3-hydra.safetensors"),
30
+ device=device
31
+ )
32
+ model.requires_grad_(False)
33
+
34
+ tags = {
35
+ tag.replace("_", " ").replace("vulva", "pussy"): idx
36
+ for idx, tag in enumerate(tag_list)
37
+ }
38
+ tag_list = list(tags.keys())
39
+
40
+ FONT = ImageFont.load_default(24)
41
+
42
+ @spaces.GPU(duration=5)
43
+ @torch.no_grad()
44
+ def run_classifier(image: Image.Image, cam_depth: int):
45
+ patches, patch_coords, patch_valid = patchify_image(image, PATCH_SIZE, MAX_SEQ_LEN)
46
+ patches = patches.unsqueeze(0).to(device=device, non_blocking=True)
47
+ patch_coords = patch_coords.unsqueeze(0).to(device=device, non_blocking=True)
48
+ patch_valid = patch_valid.unsqueeze(0).to(device=device, non_blocking=True)
49
+
50
+ patches = patches.to(dtype=torch.bfloat16).div_(127.5).sub_(1.0)
51
+ patch_coords = patch_coords.to(dtype=torch.int32)
52
+
53
+ with model_lock:
54
+ features = model.forward_intermediates(
55
+ patches,
56
+ patch_coord=patch_coords,
57
+ patch_valid=patch_valid,
58
+ indices=cam_depth,
59
+ output_dict=True,
60
+ output_fmt='NLC'
61
+ )
62
+
63
+ logits = model.forward_head(features["image_features"], patch_valid=patch_valid)
64
+ del features["image_features"]
65
+
66
+ features["patch_coords"] = patch_coords
67
+ features["patch_valid"] = patch_valid
68
+ del patches, patch_coords, patch_valid
69
+
70
+ probits = logits[0].float().sigmoid_().mul_(2.0).sub_(1.0) # scale to -1 to 1
71
+
72
+ values, indices = probits.cpu().topk(250)
73
+ predictions = {
74
+ tag_list[idx.item()]: val.item()
75
+ for idx, val in sorted(
76
+ zip(indices, values),
77
+ key=lambda item: item[1].item(),
78
+ reverse=True
79
+ )
80
+ }
81
+
82
+ return features, predictions
83
+
84
+ @spaces.GPU(duration=5)
85
+ @torch.no_grad()
86
+ def run_cam(
87
+ display_image: Image.Image,
88
+ image: Image.Image, features: dict[str, Tensor],
89
+ tag_idx: int, cam_depth: int
90
+ ):
91
+ intermediates = features["image_intermediates"]
92
+ if len(intermediates) < cam_depth:
93
+ features, _ = run_classifier(image, cam_depth)
94
+ intermediates = features["image_intermediates"]
95
+ elif len(intermediates) > cam_depth:
96
+ intermediates = intermediates[-cam_depth:]
97
+
98
+ patch_coords = features["patch_coords"]
99
+ patch_valid = features["patch_valid"]
100
+
101
+ with model_lock:
102
+ saved_q = model.attn_pool.q
103
+ saved_p = model.attn_pool.out_proj.weight
104
+
105
+ try:
106
+ model.attn_pool.q = Parameter(saved_q[:, [tag_idx], :], requires_grad=False)
107
+ model.attn_pool.out_proj.weight = Parameter(saved_p[[tag_idx], :, :], requires_grad=False)
108
+
109
+ with torch.enable_grad():
110
+ for intermediate in intermediates:
111
+ intermediate.requires_grad_(True).retain_grad()
112
+ model.forward_head(intermediate, patch_valid=patch_valid)[0, 0].backward()
113
+ finally:
114
+ model.attn_pool.q = saved_q
115
+ model.attn_pool.out_proj.weight = saved_p
116
+
117
+ cam_1d: Tensor | None = None
118
+ for intermediate in intermediates:
119
+ patch_grad = (intermediate.grad.float() * intermediate.sign()).sum(dim=(0, 2))
120
+ intermediate.grad = None
121
+
122
+ if cam_1d is None:
123
+ cam_1d = patch_grad
124
+ else:
125
+ cam_1d.add_(patch_grad)
126
+
127
+ assert cam_1d is not None
128
+
129
+ cam_2d = unpatchify(cam_1d, patch_coords, patch_valid).cpu().numpy()
130
+ return cam_composite(display_image, cam_2d), features
131
+
132
+ def cam_composite(image: Image.Image, cam: np.ndarray):
133
+ """
134
+ Overlays CAM on image and returns a PIL image.
135
+ Args:
136
+ image: PIL Image (RGB)
137
+ cam: 2D numpy array (activation map)
138
+
139
+ Returns:
140
+ PIL.Image.Image with overlay
141
+ """
142
+
143
+ cam_abs = np.abs(cam)
144
+ cam_scale = cam_abs.max()
145
+
146
+ cam_rgba = np.dstack((
147
+ (cam < 0).astype(np.float32),
148
+ (cam > 0).astype(np.float32),
149
+ np.zeros_like(cam, dtype=np.float32),
150
+ cam_abs * (0.5 / cam_scale),
151
+ )) # Shape: (H, W, 4)
152
+
153
+ cam_pil = Image.fromarray((cam_rgba * 255).astype(np.uint8))
154
+ cam_pil = cam_pil.resize(image.size, resample=Image.Resampling.NEAREST)
155
+
156
+ image = Image.blend(
157
+ image.convert('RGBA'),
158
+ image.convert('L').convert('RGBA'),
159
+ 0.33
160
+ )
161
+
162
+ image = Image.alpha_composite(image, cam_pil)
163
+
164
+ draw = ImageDraw.Draw(image)
165
+ draw.text(
166
+ (image.width - 7, image.height - 7),
167
+ f"{cam_scale.item():.4g}",
168
+ anchor="rd", font=FONT, fill=(32, 32, 255, 255)
169
+ )
170
+
171
+ return image
172
+
173
+ def filter_tags(predictions: dict[str, float], threshold: float):
174
+ predictions = {
175
+ key: value
176
+ for key, value in predictions.items()
177
+ if value >= threshold
178
+ }
179
+
180
+ tag_str = ", ".join(predictions.keys())
181
+ return tag_str, predictions
182
+
183
+ def resize_image(image: Image.Image) -> Image.Image:
184
+ longest_side = max(image.height, image.width)
185
+ if longest_side < 1080:
186
+ return image
187
+
188
+ scale = 1080 / longest_side
189
+ return image.resize(
190
+ (
191
+ int(round(image.width * scale)),
192
+ int(round(image.height * scale)),
193
+ ),
194
+ resample=Image.Resampling.LANCZOS,
195
+ reducing_gap=3.0
196
+ )
197
+
198
+ def image_upload(image: Image.Image):
199
+ display_image = resize_image(image)
200
+ processed_image = process_image(image, PATCH_SIZE, MAX_SEQ_LEN)
201
+
202
+ if display_image is not image and processed_image is not image:
203
+ image.close()
204
+
205
+ return (
206
+ "", {}, "None", "",
207
+ gr.skip() if display_image is image else display_image, display_image,
208
+ processed_image,
209
+ )
210
+
211
+ def url_submit(url: str):
212
+ resp = requests.get(url, timeout=10)
213
+ resp.raise_for_status()
214
+
215
+ image = Image.open(BytesIO(resp.content))
216
+ display_image = resize_image(image)
217
+ processed_image = process_image(image, PATCH_SIZE, MAX_SEQ_LEN)
218
+
219
+ if display_image is not image and processed_image is not image:
220
+ image.close()
221
+
222
+ return (
223
+ "", {}, "None",
224
+ display_image, display_image,
225
+ processed_image,
226
+ )
227
+
228
+ def image_changed(image: Image.Image, threshold: float, cam_depth: int):
229
+ features, predictions = run_classifier(image, cam_depth)
230
+ return *filter_tags(predictions, threshold), features, predictions
231
+
232
+ def image_clear():
233
+ return (
234
+ "", {}, "None", "",
235
+ None, None,
236
+ None, None, {},
237
+ )
238
+
239
+ def cam_changed(
240
+ display_image: Image.Image,
241
+ image: Image.Image, features: dict[str, Tensor],
242
+ tag: str, cam_depth: int
243
+ ):
244
+ if tag == "None":
245
+ return display_image, features
246
+
247
+ return run_cam(display_image, image, features, tags[tag], cam_depth)
248
+
249
+ def tag_box_select(evt: gr.SelectData):
250
+ return evt.value
251
+
252
+ custom_css = """
253
+ .output-class { display: none; }
254
+ .inferno-slider input[type=range] {
255
+ background: linear-gradient(to right,
256
+ #000004, #1b0c41, #4a0c6b, #781c6d,
257
+ #a52c60, #cf4446, #ed6925, #fb9b06,
258
+ #f7d13d, #fcffa4
259
+ ) !important;
260
+ background-size: 100% 100% !important;
261
+ }
262
+ #image_container-image {
263
+ width: 100%;
264
+ aspect-ratio: 1 / 1;
265
+ max-height: 100%;
266
+ }
267
+ #image_container img {
268
+ object-fit: contain !important;
269
+ }
270
+ .show-api, .show-api-divider {
271
+ display: none !important;
272
+ }
273
+ """
274
+
275
+ with gr.Blocks(
276
+ title="RedRocket JTP-3 Hydra Demo",
277
+ css=custom_css,
278
+ analytics_enabled=False,
279
+ ) as demo:
280
+ display_image_state = gr.State()
281
+ image_state = gr.State()
282
+ features_state = gr.State()
283
+ predictions_state = gr.State(value={})
284
+
285
+ gr.HTML(
286
+ "<h1 style='display:flex; flex-flow: row nowrap; align-items: center;'>"
287
+ "<a href='https://huggingface.co/RedRocket' target='_blank'>"
288
+ "<img src='https://huggingface.co/spaces/RedRocket/README/resolve/main/RedRocket.png' style='width: 2em; margin-right: 0.5em;'>"
289
+ "</a>"
290
+ "<span><a href='https://huggingface.co/RedRocket' target='_blank'>RedRocket</a> &ndash; JTP-3 Hydra Demo</span>"
291
+ "<span style='font-weight: normal;'>&nbsp;&bull;&nbsp;<a href='https://huggingface.co/RedRocket/JTP-3' target='_blank'>Download</a></span>"
292
+ "</h1>"
293
+ )
294
+
295
+ with gr.Row():
296
+ with gr.Column():
297
+ with gr.Column():
298
+ image = gr.Image(
299
+ sources=['upload', 'clipboard'], type='pil',
300
+ show_label=False,
301
+ show_download_button=False,
302
+ show_share_button=False,
303
+ elem_id="image_container"
304
+ )
305
+
306
+ url = gr.Textbox(
307
+ label="Upload Image via Url:",
308
+ placeholder="https://example.com/image.jpg",
309
+ max_lines=1,
310
+ submit_btn="⮝",
311
+ )
312
+
313
+ with gr.Column():
314
+ cam_tag = gr.Dropdown(
315
+ value="None", choices=["None"] + tag_list,
316
+ label="CAM Attention Overlay (You can also click a tag on the right.)", show_label=True
317
+ )
318
+ cam_depth = gr.Slider(
319
+ minimum=1, maximum=27, step=1, value=1,
320
+ label="CAM Depth (1=fastest, more precise; 27=slowest, more general)"
321
+ )
322
+
323
+ with gr.Column():
324
+ threshold_slider = gr.Slider(minimum=0.00, maximum=1.00, step=0.01, value=0.30, label="Tag Threshold")
325
+ tag_string = gr.Textbox(lines=3, label="Tags", show_label=True, show_copy_button=True)
326
+ tag_box = gr.Label(num_top_classes=250, show_label=False, show_heading=False)
327
+
328
+ image.upload(
329
+ fn=image_upload,
330
+ inputs=[image],
331
+ outputs=[
332
+ tag_string, tag_box, cam_tag, url,
333
+ image, display_image_state,
334
+ image_state,
335
+ ],
336
+ show_progress='minimal',
337
+ show_progress_on=[image]
338
+ ).then(
339
+ fn=image_changed,
340
+ inputs=[image_state, threshold_slider, cam_depth],
341
+ outputs=[
342
+ tag_string, tag_box,
343
+ features_state, predictions_state,
344
+ ],
345
+ show_progress='minimal',
346
+ show_progress_on=[tag_box]
347
+ )
348
+
349
+ url.submit(
350
+ fn=url_submit,
351
+ inputs=[url],
352
+ outputs=[
353
+ tag_string, tag_box, cam_tag,
354
+ image, display_image_state,
355
+ image_state,
356
+ ],
357
+ show_progress='minimal',
358
+ show_progress_on=[url]
359
+ ).then(
360
+ fn=image_changed,
361
+ inputs=[image_state, threshold_slider, cam_depth],
362
+ outputs=[
363
+ tag_string, tag_box,
364
+ features_state, predictions_state,
365
+ ],
366
+ show_progress='minimal',
367
+ show_progress_on=[tag_box]
368
+ )
369
+
370
+ image.clear(
371
+ fn=image_clear,
372
+ inputs=[],
373
+ outputs=[
374
+ tag_string, tag_box, cam_tag, url,
375
+ image, display_image_state,
376
+ image_state, features_state, predictions_state,
377
+ ],
378
+ show_progress='hidden'
379
+ )
380
+
381
+ threshold_slider.input(
382
+ fn=filter_tags,
383
+ inputs=[predictions_state, threshold_slider],
384
+ outputs=[tag_string, tag_box],
385
+ trigger_mode='always_last',
386
+ show_progress='hidden'
387
+ )
388
+
389
+ cam_tag.input(
390
+ fn=cam_changed,
391
+ inputs=[
392
+ display_image_state,
393
+ image_state, features_state,
394
+ cam_tag, cam_depth,
395
+ ],
396
+ outputs=[image, features_state],
397
+ trigger_mode='always_last',
398
+ show_progress='minimal',
399
+ show_progress_on=[cam_tag]
400
+ )
401
+
402
+ cam_depth.input(
403
+ fn=cam_changed,
404
+ inputs=[
405
+ display_image_state,
406
+ image_state, features_state,
407
+ cam_tag, cam_depth,
408
+ ],
409
+ outputs=[image, features_state],
410
+ trigger_mode='always_last',
411
+ show_progress='minimal',
412
+ show_progress_on=[cam_depth]
413
+ )
414
+
415
+ tag_box.select(
416
+ fn=tag_box_select,
417
+ inputs=[],
418
+ outputs=[cam_tag],
419
+ trigger_mode='always_last',
420
+ show_progress='hidden',
421
+ ).then(
422
+ fn=cam_changed,
423
+ inputs=[
424
+ display_image_state,
425
+ image_state, features_state,
426
+ cam_tag, cam_depth,
427
+ ],
428
+ outputs=[image, features_state],
429
+ show_progress='minimal',
430
+ show_progress_on=[cam_tag]
431
+ )
432
+
433
+ if __name__ == "__main__":
434
+ demo.launch()
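For reference, a headless sketch of the same classification path app.py wires into Gradio. The checkpoint and image paths are placeholders, and the forward_intermediates/forward_head calls mirror run_classifier above; note the demo additionally rescales sigmoids to [-1, 1] and remaps a few tag names.

```python
# Headless sketch of the classifier path above (no Gradio, no CAM).
# "jtp-3-hydra.safetensors" and "example.jpg" are placeholder paths.
import torch
from PIL import Image
from model import load_model, process_image, patchify_image

PATCH_SIZE, MAX_SEQ_LEN = 16, 1024
device = "cuda" if torch.cuda.is_available() else "cpu"

model, tag_list = load_model("jtp-3-hydra.safetensors", device=device)
model.requires_grad_(False)

img = process_image(Image.open("example.jpg"), PATCH_SIZE, MAX_SEQ_LEN)
patches, coords, valid = patchify_image(img, PATCH_SIZE, MAX_SEQ_LEN)

with torch.no_grad():
    patches = patches.unsqueeze(0).to(device, torch.bfloat16).div_(127.5).sub_(1.0)
    coords = coords.unsqueeze(0).to(device, torch.int32)
    valid = valid.unsqueeze(0).to(device)

    feats = model.forward_intermediates(
        patches, patch_coord=coords, patch_valid=valid,
        indices=1, output_dict=True, output_fmt="NLC",
    )
    probits = model.forward_head(feats["image_features"], patch_valid=valid)[0].float().sigmoid()

# 0.65 on the raw sigmoid corresponds to the demo's default 0.30 after its 2x-1 rescale
for idx in (probits >= 0.65).nonzero(as_tuple=True)[0].tolist():
    print(tag_list[idx], f"{probits[idx].item():.3f}")
```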
glu.py ADDED
@@ -0,0 +1,40 @@
+ from abc import abstractmethod
+ from typing import Literal
+
+ from torch import Tensor
+ from torch.nn import Module
+ from torch.nn.functional import silu, gelu
+
+ class GatedUnit(Module):
+     def __init__(self, dim: int = -1) -> None:
+         super().__init__()
+
+         self.dim = dim
+
+     @abstractmethod
+     def _activation(self, x: Tensor) -> Tensor:
+         ...
+
+     def forward(self, x: Tensor) -> Tensor:
+         f, g = x.chunk(2, dim=self.dim)
+         return self._activation(f) * g
+
+ class SwiGLU(GatedUnit):
+     def __init__(self, dim: int = -1) -> None:
+         super().__init__(dim)
+
+     def _activation(self, x: Tensor) -> Tensor:
+         return silu(x)
+
+ class GeGLU(GatedUnit):
+     def __init__(
+         self,
+         dim: int = -1,
+         approximate: Literal["tanh", "none"] = "tanh"
+     ) -> None:
+         super().__init__(dim)
+
+         self.approximate = approximate
+
+     def _activation(self, x: Tensor) -> Tensor:
+         return gelu(x, approximate=self.approximate)
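A quick usage sketch for the gated units: the gate halves the dimension it chunks over, so the projection in front of SwiGLU must emit twice the desired hidden width (as ff_in does in hydra_pool.py). Sizes here mirror the 1152-wide, 3x-ratio feedforward used elsewhere in this release, but any width works.

```python
# SwiGLU feedforward sketch: Linear -> 2x hidden, gate -> hidden, Linear -> back.
import torch
from torch.nn import Linear, Sequential
from glu import SwiGLU

ff = Sequential(
    Linear(1152, 3456 * 2, bias=False),  # project to 2 * hidden_dim
    SwiGLU(),                            # silu(f) * g -> hidden_dim features
    Linear(3456, 1152, bias=False),
)
x = torch.randn(4, 256, 1152)
print(ff(x).shape)  # torch.Size([4, 256, 1152])
```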
hydra_pool.py ADDED
@@ -0,0 +1,581 @@
1
+ import re
2
+ from collections import defaultdict
3
+ from math import sqrt
4
+ from typing import Any, Iterable, Self, cast
5
+
6
+ import torch
7
+ from torch import Tensor
8
+ from torch.nn import (
9
+ Module, ModuleList, Parameter, Buffer,
10
+ Linear, LayerNorm, RMSNorm, Dropout, Flatten,
11
+ init
12
+ )
13
+ from torch.nn.functional import pad, scaled_dot_product_attention
14
+
15
+ from einops import rearrange
16
+
17
+ from glu import SwiGLU
18
+
19
+ class IndexedAdd(Module):
20
+ def __init__(
21
+ self,
22
+ n_indices: int,
23
+ dim: int,
24
+ weight_shape: tuple[int, ...] | None = None,
25
+ *,
26
+ inplace: bool = False,
27
+ device: torch.device | str | None = None,
28
+ dtype: torch.dtype | None = None,
29
+ ) -> None:
30
+ super().__init__()
31
+
32
+ self.dim = dim
33
+ self.inplace = inplace
34
+
35
+ self.index = Buffer(torch.empty(
36
+ 2, n_indices,
37
+ device=device, dtype=torch.int32
38
+ ))
39
+
40
+ self.weight = Parameter(torch.ones(
41
+ *(sz if sz != -1 else n_indices for sz in weight_shape),
42
+ device=device, dtype=dtype
43
+ )) if weight_shape is not None else None
44
+
45
+ def _save_to_state_dict(
46
+ self,
47
+ destination: dict[str, Any],
48
+ prefix: str,
49
+ keep_vars: bool
50
+ ) -> None:
51
+ super()._save_to_state_dict(destination, prefix, keep_vars)
52
+
53
+ if keep_vars:
54
+ return
55
+
56
+ with torch.no_grad():
57
+ index_key = f"{prefix}index"
58
+ index = destination[index_key]
59
+
60
+ min_index = index.amin(None).item()
61
+ if min_index >= 0:
62
+ max_index = index.amax(None).item()
63
+ if max_index < (1 << 8):
64
+ destination[index_key] = index.to(dtype=torch.uint8)
65
+ elif max_index < (1 << 16):
66
+ destination[index_key] = index.to(dtype=torch.uint16)
67
+
68
+ @torch.no_grad()
69
+ def load_indices(self, indices: Iterable[tuple[int, int]], *, mean: bool = False) -> None:
70
+ if mean:
71
+ if self.weight is None:
72
+ raise ValueError("No weights to initialize with means.")
73
+
74
+ groups: dict[int, list[int]] = defaultdict(list)
75
+
76
+ idx = -1
77
+ for idx, (src, dst) in enumerate(indices):
78
+ self.index[0, idx] = src
79
+ self.index[1, idx] = dst
80
+
81
+ if mean:
82
+ groups[dst].append(idx)
83
+
84
+ if (idx + 1) != self.index.size(1):
85
+ raise IndexError(f"Expected {self.index.size(1)} indices, but got {idx + 1}.")
86
+
87
+ if not mean:
88
+ return
89
+
90
+ assert self.weight is not None
91
+
92
+ for idxs in groups.values():
93
+ if len(idxs) < 2:
94
+ continue
95
+
96
+ self.weight.index_fill_(
97
+ self.dim,
98
+ torch.tensor(idxs, device=self.weight.device, dtype=torch.int64),
99
+ 1.0 / len(idxs)
100
+ )
101
+
102
+ def forward(self, dst: Tensor, src: Tensor) -> Tensor:
103
+ src = src.index_select(self.dim, self.index[0])
104
+
105
+ if self.weight is not None:
106
+ src.mul_(self.weight)
107
+
108
+ return (
109
+ dst.index_add_(self.dim, self.index[1], src)
110
+ if self.inplace else
111
+ dst.index_add(self.dim, self.index[1], src)
112
+ )
113
+
114
+ class BatchLinear(Module):
115
+ def __init__(
116
+ self,
117
+ batch_shape: tuple[int, ...] | int,
118
+ in_features: int,
119
+ out_features: int,
120
+ *,
121
+ bias: bool = False,
122
+ flatten: bool = False,
123
+ bias_inplace: bool = True,
124
+ device: torch.device | str | None = None,
125
+ dtype: torch.dtype | None = None,
126
+ ) -> None:
127
+ super().__init__()
128
+
129
+ if isinstance(batch_shape, int):
130
+ batch_shape = (batch_shape,)
131
+ elif not batch_shape:
132
+ raise ValueError("At least one batch dimension is required.")
133
+
134
+ self.flatten = -(len(batch_shape) + 1) if flatten else 0
135
+
136
+ self.weight = Parameter(torch.empty(
137
+ *batch_shape, in_features, out_features,
138
+ device=device, dtype=dtype
139
+ ))
140
+
141
+ bt = self.weight.flatten(end_dim=-3).mT
142
+ for idx in range(bt.size(0)):
143
+ init.kaiming_uniform_(bt[idx], a=sqrt(5))
144
+
145
+ self.bias = Parameter(torch.zeros(
146
+ *batch_shape, out_features,
147
+ device=device, dtype=dtype
148
+ )) if bias else None
149
+
150
+ self.bias_inplace = bias_inplace
151
+
152
+ def forward(self, x: Tensor) -> Tensor:
153
+ # ... B... 1 I @ B... I O -> ... B... O
154
+ x = torch.matmul(x.unsqueeze(-2), self.weight).squeeze(-2)
155
+
156
+ if self.bias is not None:
157
+ if self.bias_inplace:
158
+ x.add_(self.bias)
159
+ else:
160
+ x = x + self.bias
161
+
162
+ if self.flatten:
163
+ x = x.flatten(self.flatten)
164
+
165
+ return x
166
+
167
+ class Mean(Module):
168
+ def __init__(self, dim: tuple[int, ...] | int = -1, *, keepdim: bool = False) -> None:
169
+ super().__init__()
170
+
171
+ self.dim = dim
172
+ self.keepdim = keepdim
173
+
174
+ def forward(self, x: Tensor) -> Tensor:
175
+ return x.mean(self.dim, self.keepdim)
176
+
177
+ class _MidBlock(Module):
178
+ def __init__(
179
+ self,
180
+ attn_dim: int,
181
+ head_dim: int,
182
+ n_classes: int,
183
+ *,
184
+ ff_ratio: float,
185
+ ff_dropout: float,
186
+ q_cls_inplace: bool = True,
187
+ device: torch.device | str | None,
188
+ dtype: torch.dtype | None,
189
+ ) -> None:
190
+ super().__init__()
191
+
192
+ self.head_dim = head_dim
193
+ self.q_cls_inplace = q_cls_inplace
194
+
195
+ hidden_dim = int(attn_dim * ff_ratio)
196
+
197
+ self.q_proj = Linear(
198
+ attn_dim, attn_dim, bias=False,
199
+ device=device, dtype=dtype
200
+ )
201
+
202
+ self.q_cls = Parameter(torch.zeros(
203
+ n_classes, attn_dim,
204
+ device=device, dtype=dtype
205
+ ))
206
+
207
+ self.q_norm = RMSNorm(head_dim, eps=1e-5, elementwise_affine=False)
208
+
209
+ self.attn_out = Linear(
210
+ attn_dim, attn_dim, bias=False,
211
+ device=device, dtype=dtype
212
+ )
213
+
214
+ self.ff_norm = LayerNorm(
215
+ attn_dim,
216
+ device=device, dtype=dtype
217
+ )
218
+ self.ff_in = Linear(
219
+ attn_dim, hidden_dim * 2, bias=False,
220
+ device=device, dtype=dtype
221
+ )
222
+ self.ff_act = SwiGLU()
223
+ self.ff_drop = Dropout(ff_dropout)
224
+ self.ff_out = Linear(
225
+ hidden_dim, attn_dim, bias=False,
226
+ device=device, dtype=dtype
227
+ )
228
+
229
+ def _forward_q(self, x: Tensor) -> Tensor:
230
+ x = self.q_proj(x)
231
+
232
+ if self.q_cls_inplace:
233
+ x.add_(self.q_cls)
234
+ else:
235
+ x = x + self.q_cls
236
+
237
+ x = self.q_norm(x)
238
+ x = rearrange(x, "... s (h e) -> ... h s e", e=self.head_dim)
239
+ return x
240
+
241
+ def _forward_attn(self, x: Tensor, k: Tensor, v: Tensor, attn_mask: Tensor | None) -> Tensor:
242
+ a = scaled_dot_product_attention(
243
+ self._forward_q(x), k, v,
244
+ attn_mask=attn_mask
245
+ )
246
+ a = rearrange(a, "... h s e -> ... s (h e)")
247
+ a = self.attn_out(a)
248
+ return x + a
249
+
250
+ def _forward_ff(self, x: Tensor) -> Tensor:
251
+ f = self.ff_norm(x)
252
+ f = self.ff_in(f)
253
+ f = self.ff_act(f)
254
+ f = self.ff_drop(f)
255
+ f = self.ff_out(f)
256
+ return x + f
257
+
258
+ def forward(self, x: Tensor, k: Tensor, v: Tensor, attn_mask: Tensor | None = None) -> Tensor:
259
+ x = self._forward_attn(x, k, v, attn_mask)
260
+ x = self._forward_ff(x)
261
+ return x
262
+
263
+ class HydraPool(Module):
264
+ def __init__(
265
+ self,
266
+ attn_dim: int,
267
+ head_dim: int,
268
+ n_classes: int,
269
+ *,
270
+ mid_blocks: int = 0,
271
+ roots: tuple[int, int, int] = (0, 0, 0),
272
+ ff_ratio: float = 3.0,
273
+ ff_dropout: float = 0.0,
274
+ input_dim: int = -1,
275
+ output_dim: int = 1,
276
+ device: torch.device | str | None = None,
277
+ dtype: torch.dtype | None = None,
278
+ ) -> None:
279
+ super().__init__()
280
+
281
+ if input_dim < 0:
282
+ input_dim = attn_dim
283
+
284
+ assert attn_dim % head_dim == 0
285
+ n_heads = attn_dim // head_dim
286
+
287
+ self.n_classes = n_classes
288
+ self.head_dim = head_dim
289
+ self.output_dim = output_dim
290
+
291
+ self._has_roots = False
292
+ self._has_ff = False
293
+
294
+ self.q: Parameter | Buffer
295
+ self._q_normed: bool | None
296
+
297
+ if roots != (0, 0, 0):
298
+ self._has_roots = True
299
+ n_roots, n_classroots, n_subclasses = roots
300
+
301
+ if n_classroots < n_roots:
302
+ raise ValueError("Number of classroots cannot be less than the number of roots.")
303
+
304
+ self.cls = Parameter(torch.randn(
305
+ n_heads, n_classes, head_dim,
306
+ device=device, dtype=dtype
307
+ ))
308
+
309
+ self.roots = Parameter(torch.randn(
310
+ n_heads, n_roots, head_dim,
311
+ device=device, dtype=dtype
312
+ )) if n_roots > 0 else None
313
+
314
+ self.clsroots = IndexedAdd(
315
+ n_classroots, dim=-2, weight_shape=(n_heads, -1, 1),
316
+ device=device, dtype=dtype
317
+ ) if n_classroots > 0 else None
318
+
319
+ self.clscls = IndexedAdd(
320
+ n_subclasses, dim=-2, weight_shape=(n_heads, -1, 1),
321
+ inplace=True, device=device, dtype=dtype
322
+ ) if n_subclasses > 0 else None
323
+
324
+ self.q = Buffer(torch.empty(
325
+ n_heads, n_classes, head_dim,
326
+ device=device, dtype=dtype
327
+ ))
328
+ self._q_normed = None
329
+ else:
330
+ self.q = Parameter(torch.randn(
331
+ n_heads, n_classes, head_dim,
332
+ device=device, dtype=dtype
333
+ ))
334
+ self._q_normed = False
335
+
336
+ self.kv = Linear(
337
+ input_dim, attn_dim * 2, bias=False,
338
+ device=device, dtype=dtype
339
+ )
340
+ self.qk_norm = RMSNorm(
341
+ head_dim, eps=1e-5, elementwise_affine=False
342
+ )
343
+
344
+ if ff_ratio > 0.0:
345
+ self._has_ff = True
346
+ hidden_dim = int(attn_dim * ff_ratio)
347
+
348
+ self.ff_norm = LayerNorm(
349
+ attn_dim,
350
+ device=device, dtype=dtype
351
+ )
352
+ self.ff_in = Linear(
353
+ attn_dim, hidden_dim * 2, bias=False,
354
+ device=device, dtype=dtype
355
+ )
356
+ self.ff_act = SwiGLU()
357
+ self.ff_drop = Dropout(ff_dropout)
358
+ self.ff_out = Linear(
359
+ hidden_dim, attn_dim, bias=False,
360
+ device=device, dtype=dtype
361
+ )
362
+ elif mid_blocks > 0:
363
+ raise ValueError("Feedforward required with mid blocks.")
364
+
365
+ self.mid_blocks = ModuleList(
366
+ _MidBlock(
367
+ attn_dim, head_dim, n_classes,
368
+ ff_ratio=ff_ratio, ff_dropout=ff_dropout,
369
+ device=device, dtype=dtype
370
+ ) for _ in range(mid_blocks)
371
+ )
372
+
373
+ self.out_proj = BatchLinear(
374
+ n_classes, attn_dim, output_dim * 2,
375
+ device=device, dtype=dtype
376
+ )
377
+ self.out_act = SwiGLU()
378
+
379
+ @property
380
+ def has_roots(self) -> bool:
381
+ return self._has_roots
382
+
383
+ def get_extra_state(self) -> dict[str, Any]:
384
+ return { "q_normed": self._q_normed }
385
+
386
+ def set_extra_state(self, state: dict[str, Any]) -> None:
387
+ self._q_normed = state["q_normed"]
388
+
389
+ def create_head(self) -> Module:
390
+ if self.output_dim == 1:
391
+ return Flatten(-2)
392
+
393
+ return Mean(-1)
394
+
395
+ def train(self, mode: bool = True) -> Self:
396
+ super().train(mode)
397
+
398
+ if mode:
399
+ if self._has_roots:
400
+ self._q_normed = None
401
+ else:
402
+ self._q_normed = False
403
+ else:
404
+ if self._has_roots:
405
+ self._cache_query()
406
+
407
+ return self
408
+
409
+ def inference(self) -> Self:
410
+ super().train(False)
411
+ self._cache_query()
412
+
413
+ if self._has_roots:
414
+ self._has_roots = False
415
+ self.q = Parameter(self.q)
416
+
417
+ del self.cls, self.roots, self.clsroots, self.clscls
418
+
419
+ return self
420
+
421
+ def _cache_query(self) -> None:
422
+ assert not self.training
423
+
424
+ if self._q_normed:
425
+ return
426
+
427
+ with torch.no_grad():
428
+ self.q.to(device=self.kv.weight.device)
429
+ self.q.copy_(self._forward_q())
430
+ self._q_normed = True
431
+
432
+ def _forward_q(self) -> Tensor:
433
+ match self._q_normed:
434
+ case None:
435
+ assert self._has_roots
436
+
437
+ if self.roots is not None:
438
+ q = self.qk_norm(self.roots)
439
+ q = self.clsroots(self.cls, q)
440
+ else:
441
+ q = self.cls
442
+
443
+ if self.clscls is not None:
444
+ q = self.clscls(q, q.detach())
445
+
446
+ q = self.qk_norm(q)
447
+ return q
448
+
449
+ case False:
450
+ assert not self._has_roots
451
+ return self.qk_norm(self.q)
452
+
453
+ case True:
454
+ return self.q
455
+
456
+ def _forward_attn(self, x: Tensor, attn_mask: Tensor | None) -> tuple[Tensor, Tensor, Tensor]:
457
+ q = self._forward_q().expand(*x.shape[:-2], -1, -1, -1)
458
+
459
+ x = self.kv(x)
460
+ k, v = rearrange(x, "... s (n h e) -> n ... h s e", n=2, e=self.head_dim).unbind(0)
461
+ k = self.qk_norm(k)
462
+
463
+ x = scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
464
+ return rearrange(x, "... h s e -> ... s (h e)"), k, v
465
+
466
+ def _forward_ff(self, x: Tensor) -> Tensor:
467
+ if not self._has_ff:
468
+ return x
469
+
470
+ f = self.ff_norm(x)
471
+ f = self.ff_in(f)
472
+ f = self.ff_act(f)
473
+ f = self.ff_drop(f)
474
+ f = self.ff_out(f)
475
+ return x + f
476
+
477
+ def _forward_out(self, x: Tensor) -> Tensor:
478
+ x = self.out_proj(x)
479
+ x = self.out_act(x)
480
+ return x
481
+
482
+ def forward(self, x: Tensor, attn_mask: Tensor | None = None) -> Tensor:
483
+ x, k, v = self._forward_attn(x, attn_mask)
484
+ x = self._forward_ff(x)
485
+
486
+ for block in self.mid_blocks:
487
+ x = block(x, k, v, attn_mask)
488
+
489
+ x = self._forward_out(x)
490
+ return x
491
+
492
+ def prune_roots(self, retain_classes: set[int]) -> tuple[list[int], list[int]]:
493
+ if not self._has_roots or self.roots is None:
494
+ raise TypeError("No roots to prune.")
495
+
496
+ if self.clscls is not None:
497
+ raise TypeError("Subclass roots cannot be pruned.")
498
+
499
+ used_roots: set[int] = set()
500
+ used_clsroots: list[int] = []
501
+
502
+ assert self.clsroots is not None
503
+ clsroots = [
504
+ cast(list[int], clsroot.tolist())
505
+ for clsroot in self.clsroots.index.cpu().unbind(1)
506
+ ]
507
+
508
+ for idx, (src, dest) in enumerate(clsroots):
509
+ if dest in retain_classes:
510
+ used_roots.add(src)
511
+ used_clsroots.append(idx)
512
+
513
+ sorted_roots = sorted(used_roots)
514
+ del used_roots
515
+
516
+ rootmap = {
517
+ root: idx
518
+ for idx, root in enumerate(sorted_roots)
519
+ }
520
+
521
+ clsmap = {
522
+ cls: idx
523
+ for idx, cls in enumerate(sorted(retain_classes))
524
+ }
525
+
526
+ for idx in used_clsroots:
527
+ src, dest = clsroots[idx]
528
+ self.clsroots.index[0, idx] = rootmap[src]
529
+ self.clsroots.index[1, idx] = clsmap[dest]
530
+
531
+ return sorted_roots, used_clsroots
532
+
533
+ @staticmethod
534
+ def for_state(
535
+ state_dict: dict[str, Any],
536
+ prefix: str = "",
537
+ *,
538
+ ff_dropout: float = 0.0,
539
+ device: torch.device | str | None = None,
540
+ dtype: torch.dtype | None = None,
541
+ ) -> "HydraPool":
542
+ n_heads, n_classes, head_dim = state_dict[f"{prefix}q"].shape
543
+ attn_dim = n_heads * head_dim
544
+
545
+ roots_t = state_dict.get(f"{prefix}roots")
546
+ clsroots_t = state_dict.get(f"{prefix}clsroots.index")
547
+ clscls_t = state_dict.get(f"{prefix}clscls.index")
548
+ roots = (
549
+ roots_t.size(1) if roots_t is not None else 0,
550
+ clsroots_t.size(1) if clsroots_t is not None else 0,
551
+ clscls_t.size(1) if clscls_t is not None else 0
552
+ )
553
+
554
+ input_dim = state_dict[f"{prefix}kv.weight"].size(1)
555
+ output_dim = state_dict[f"{prefix}out_proj.weight"].size(2) // 2
556
+
557
+ # avoid off-by-one issue due to truncation
558
+ ffout_t = state_dict.get(f"{prefix}ff_out.weight")
559
+ hidden_dim = ffout_t.size(1) + 0.5 if ffout_t is not None else 0
560
+ ff_ratio = hidden_dim / attn_dim
561
+
562
+ pattern = re.compile(rf"^{re.escape(prefix)}mid_blocks\.([0-9]+)\.")
563
+ mid_blocks = max([-1, *(
564
+ int(match[1])
565
+ for key in state_dict
566
+ if (match := pattern.match(key)) is not None
567
+ )]) + 1
568
+
569
+ return HydraPool(
570
+ attn_dim,
571
+ head_dim,
572
+ n_classes,
573
+ mid_blocks=mid_blocks,
574
+ roots=roots,
575
+ ff_ratio=ff_ratio,
576
+ ff_dropout=ff_dropout,
577
+ input_dim=input_dim,
578
+ output_dim=output_dim,
579
+ device=device,
580
+ dtype=dtype
581
+ )
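A shape sketch for HydraPool with toy sizes (assumed for illustration, not the shipped JTP-3 configuration): each class owns a learned query, attention pools the patch sequence once per class, and create_head() flattens the per-class SwiGLU output into a single logit per class when output_dim == 1.

```python
# HydraPool shape sketch with toy dimensions.
import torch
from hydra_pool import HydraPool

pool = HydraPool(attn_dim=256, head_dim=64, n_classes=10, output_dim=1)
head = pool.create_head()        # Flatten(-2) when output_dim == 1

x = torch.randn(2, 196, 256)     # (batch, seq_len, attn_dim) patch features
logits = head(pool(x))           # (2, 10): one logit per class
print(logits.shape)
```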
image.py ADDED
@@ -0,0 +1,271 @@
1
+ from io import BytesIO
2
+ from typing import Any, Callable, cast
3
+ from warnings import warn, catch_warnings, filterwarnings
4
+
5
+ import numpy as np
6
+ from torch import Tensor
7
+
8
+ from einops import rearrange
9
+
10
+ import PIL.Image as image
11
+ import PIL.ImageCms as image_cms
12
+
13
+ from PIL.Image import Image, Resampling
14
+ from PIL.ImageCms import (
15
+ Direction, Intent, ImageCmsProfile, PyCMSError,
16
+ createProfile, getDefaultIntent, isIntentSupported, profileToProfile
17
+ )
18
+ from PIL.ImageOps import exif_transpose
19
+
20
+ try:
21
+ import pillow_jxl
22
+ except ImportError:
23
+ pass
24
+
25
+ image.MAX_IMAGE_PIXELS = None
26
+
27
+ _SRGB = createProfile(colorSpace='sRGB')
28
+
29
+ _INTENT_FLAGS = {
30
+ Intent.PERCEPTUAL: image_cms.FLAGS["HIGHRESPRECALC"],
31
+ Intent.RELATIVE_COLORIMETRIC: (
32
+ image_cms.FLAGS["HIGHRESPRECALC"] |
33
+ image_cms.FLAGS["BLACKPOINTCOMPENSATION"]
34
+ ),
35
+ Intent.ABSOLUTE_COLORIMETRIC: image_cms.FLAGS["HIGHRESPRECALC"]
36
+ }
37
+
38
+ class CMSWarning(UserWarning):
39
+ def __init__(
40
+ self,
41
+ message: str,
42
+ *,
43
+ path: str | None = None,
44
+ cms_info: dict[str, Any] | None = None,
45
+ cause: Exception | None = None,
46
+ ):
47
+ super().__init__(message)
48
+ self.__cause__ = cause
49
+
50
+ self.path = path
51
+ self.cms_info = cms_info
52
+
53
+ self.add_note(f"path: {path}")
54
+ self.add_note(f"info: {cms_info}")
55
+
56
+ def _coalesce_intent(intent: Intent | int) -> Intent:
57
+ if isinstance(intent, Intent):
58
+ return intent
59
+
60
+ match intent:
61
+ case 0:
62
+ return Intent.PERCEPTUAL
63
+ case 1:
64
+ return Intent.RELATIVE_COLORIMETRIC
65
+ case 2:
66
+ return Intent.SATURATION
67
+ case 3:
68
+ return Intent.ABSOLUTE_COLORIMETRIC
69
+ case _:
70
+ raise ValueError("invalid intent")
71
+
72
+ def _add_info(info: dict[str, Any], source: object, key: str) -> None:
73
+ try:
74
+ if (value := getattr(source, key, None)) is not None:
75
+ info[key] = value
76
+ except Exception:
77
+ pass
78
+
79
+ def open_srgb(
80
+ path: str,
81
+ *,
82
+ resize: Callable[[tuple[int, int]], tuple[int, int] | None] | tuple[int, int] | None = None,
83
+ crop: Callable[[tuple[int, int]], tuple[int, int, int, int] | None] | tuple[int, int, int, int] | None = None,
84
+ expect: tuple[int, int] | None = None,
85
+ ) -> Image:
86
+ with open(path, "rb", buffering=(1024 * 1024)) as file:
87
+ img: Image = image.open(file)
88
+
89
+ try:
90
+ out = process_srgb(img, resize=resize, crop=crop, expect=expect)
91
+ except:
92
+ img.close()
93
+ raise
94
+
95
+ if img is not out:
96
+ img.close()
97
+
98
+ return out
99
+
100
+ def process_srgb(
101
+ img: Image,
102
+ *,
103
+ resize: Callable[[tuple[int, int]], tuple[int, int] | None] | tuple[int, int] | None = None,
104
+ crop: Callable[[tuple[int, int]], tuple[int, int, int, int] | None] | tuple[int, int, int, int] | None = None,
105
+ expect: tuple[int, int] | None = None,
106
+ ) -> Image:
107
+ img.load()
108
+
109
+ try:
110
+ exif_transpose(img, in_place=True)
111
+ except Exception:
112
+ pass # corrupt EXIF metadata is fine
113
+
114
+ size = (img.width, img.height)
115
+
116
+ if expect is not None and size != expect:
117
+ raise RuntimeError(
118
+ f"Image is {size[0]}x{size[1]}, "
119
+ f"but expected {expect[0]}x{expect[1]}."
120
+ )
121
+
122
+ if (icc_raw := img.info.get("icc_profile")) is not None:
123
+ cms_info: dict[str, Any] = {
124
+ "native_mode": img.mode,
125
+ "transparency": img.has_transparency_data,
126
+ }
127
+
128
+ try:
129
+ profile = ImageCmsProfile(BytesIO(icc_raw))
130
+ _add_info(cms_info, profile.profile, "profile_description")
131
+ _add_info(cms_info, profile.profile, "target")
132
+ _add_info(cms_info, profile.profile, "xcolor_space")
133
+ _add_info(cms_info, profile.profile, "connection_space")
134
+ _add_info(cms_info, profile.profile, "colorimetric_intent")
135
+ _add_info(cms_info, profile.profile, "rendering_intent")
136
+
137
+ working_mode = img.mode
138
+ if img.mode.startswith(("RGB", "BGR", "P")):
139
+ working_mode = "RGBA" if img.has_transparency_data else "RGB"
140
+ elif img.mode.startswith(("L", "I", "F")) or img.mode == "1":
141
+ working_mode = "LA" if img.has_transparency_data else "L"
142
+
143
+ if img.mode != working_mode:
144
+ cms_info["working_mode"] = working_mode
145
+ img = img.convert(working_mode)
146
+
147
+ mode = "RGBA" if img.has_transparency_data else "RGB"
148
+
149
+ intent = Intent.RELATIVE_COLORIMETRIC
150
+ if isIntentSupported(profile, intent, Direction.INPUT) != 1:
151
+ intent = _coalesce_intent(getDefaultIntent(profile))
152
+
153
+ cms_info["conversion_intent"] = intent
154
+
155
+ if (flags := _INTENT_FLAGS.get(intent)) is None:
156
+ raise RuntimeError("Unsupported intent")
157
+
158
+ if img.mode == mode:
159
+ profileToProfile(
160
+ img,
161
+ profile,
162
+ _SRGB,
163
+ renderingIntent=intent,
164
+ inPlace=True,
165
+ flags=flags
166
+ )
167
+ else:
168
+ img = cast(Image, profileToProfile(
169
+ img,
170
+ profile,
171
+ _SRGB,
172
+ renderingIntent=intent,
173
+ outputMode=mode,
174
+ flags=flags
175
+ ))
176
+ except Exception as ex:
177
+ pass
178
+
179
+ if img.has_transparency_data:
180
+ if img.mode != "RGBa":
181
+ try:
182
+ img = img.convert("RGBa")
183
+ except ValueError:
184
+ img = img.convert("RGBA").convert("RGBa")
185
+ elif img.mode != "RGB":
186
+ img = img.convert("RGB")
187
+
188
+ if crop is not None and not isinstance(crop, tuple):
189
+ crop = crop(size)
190
+
191
+ if crop is not None:
192
+ left, top, right, bottom = crop
193
+ size = (right - left, bottom - top)
194
+
195
+ if resize is not None and not isinstance(resize, tuple):
196
+ resize = resize(size)
197
+
198
+ if resize is not None and size != resize:
199
+ img = img.resize(
200
+ resize,
201
+ Resampling.LANCZOS,
202
+ box=crop,
203
+ reducing_gap=3.0
204
+ )
205
+ crop = None
206
+
207
+ if crop is not None:
208
+ img = img.crop(crop)
209
+
210
+ return img
211
+
212
+ def put_srgb(img: Image, tensor: Tensor) -> None:
213
+ if img.mode not in ("RGB", "RGBA", "RGBa"):
214
+ raise ValueError(f"Image has non-RGB mode {img.mode}.")
215
+
216
+ np.copyto(tensor.numpy(), np.asarray(img)[:, :, :3], casting="no")
217
+
218
+ def put_srgb_patch(
219
+ img: Image,
220
+ patch_data: Tensor,
221
+ patch_coord: Tensor,
222
+ patch_valid: Tensor,
223
+ patch_size: int
224
+ ) -> None:
225
+ if img.mode not in ("RGB", "RGBA", "RGBa"):
226
+ raise ValueError(f"Image has non-RGB mode {img.mode}.")
227
+
228
+ patches = rearrange(
229
+ np.asarray(img)[:, :, :3],
230
+ "(h p1) (w p2) c -> h w (p1 p2 c)",
231
+ p1=patch_size, p2=patch_size
232
+ )
233
+
234
+ coords = np.stack(np.meshgrid(
235
+ np.arange(patches.shape[0], dtype=np.int16),
236
+ np.arange(patches.shape[1], dtype=np.int16),
237
+ indexing="ij"
238
+ ), axis=-1)
239
+
240
+ coords = rearrange(coords, "h w c -> (h w) c")
241
+ patches = rearrange(patches, "h w p -> (h w) p")
242
+ n = patches.shape[0]
243
+
244
+ np.copyto(patch_data[:n].numpy(), patches, casting="no")
245
+ np.copyto(patch_coord[:n].numpy(), coords, casting="no")
246
+ patch_valid[:n] = True
247
+
248
+ def unpatchify(input: Tensor, coords: Tensor, valid: Tensor) -> Tensor:
249
+ """
250
+ Scatter valid patches from (seqlen, ...) to (H, W, ...), using coords and valid mask.
251
+
252
+ Args:
253
+ input: Tensor of shape (seqlen, ...), patch data.
254
+ coords: Tensor of shape (seqlen, 2), spatial coordinates [y, x] for each patch.
255
+ valid: Tensor of shape (seqlen,), boolean mask for valid patches.
256
+
257
+ Returns:
258
+ Tensor of shape (H, W, ...), with valid patches scattered to their spatial locations.
259
+ """
260
+
261
+ valid_coords = coords[0, valid[0]] # (n_valid, 2)
262
+ valid_patches = input[valid[0]] # (n_valid, ...)
263
+
264
+ h = int(valid_coords[:, 0].max().item()) + 1
265
+ w = int(valid_coords[:, 1].max().item()) + 1
266
+
267
+ output_shape = (h, w) + input.shape[1:]
268
+ output = input.new_zeros(output_shape)
269
+
270
+ output[valid_coords[:, 0], valid_coords[:, 1]] = valid_patches
271
+ return output
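A small round-trip sketch for unpatchify, using the patchify_image helper from model.py below: coords and valid are passed with a leading batch dimension (and coords cast to int32, as app.py does) while the per-patch data is not.

```python
# Patchify a 256x160 image (16x10 patches), then scatter one value per patch
# back onto the (H/16, W/16) grid with unpatchify.
import torch
from PIL import Image
from model import patchify_image
from image import unpatchify

img = Image.new("RGB", (256, 160))
patches, coords, valid = patchify_image(img, 16, 1024)   # (1024, 768), (1024, 2), (1024,)

per_patch = patches.float().mean(dim=-1)                 # one scalar per patch slot
coords = coords.unsqueeze(0).to(torch.int32)             # (1, 1024, 2), int32 for indexing
valid = valid.unsqueeze(0)                               # (1, 1024)

grid = unpatchify(per_patch, coords, valid)
print(grid.shape)                                        # torch.Size([10, 16])
```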
model.py ADDED
@@ -0,0 +1,192 @@
1
+ from math import ceil
2
+
3
+ import torch
4
+ from torch import Tensor
5
+ from torch.nn import Identity
6
+
7
+ import timm
8
+ from timm.models import NaFlexVit
9
+
10
+ from PIL import Image
11
+
12
+ from safetensors import safe_open
13
+
14
+ from image import process_srgb, put_srgb_patch
15
+
16
+ def sdpa_attn_mask(
17
+ patch_valid: Tensor,
18
+ num_prefix_tokens: int = 0,
19
+ symmetric: bool = True,
20
+ q_len: int | None = None,
21
+ dtype: torch.dtype | None = None,
22
+ ) -> Tensor:
23
+ mask = patch_valid.unflatten(-1, (1, 1, -1))
24
+
25
+ if num_prefix_tokens:
26
+ mask = torch.cat((
27
+ torch.ones(
28
+ *mask.shape[:-1], num_prefix_tokens,
29
+ device=patch_valid.device, dtype=torch.bool
30
+ ), mask
31
+ ), dim=-1)
32
+
33
+ return mask
34
+
35
+ timm.models.naflexvit.create_attention_mask = sdpa_attn_mask
36
+
37
+ def get_image_size_for_seq(
38
+ image_hw: tuple[int, int],
39
+ patch_size: int = 16,
40
+ max_seq_len: int = 1024,
41
+ max_ratio: float = 1.0,
42
+ eps: float = 1e-5,
43
+ ) -> tuple[int, int]:
44
+ """Determine image size for sequence length constraint."""
45
+
46
+ assert max_ratio >= 1.0
47
+ assert eps * 2 < max_ratio
48
+
49
+ h, w = image_hw
50
+ max_py = int(max((h * max_ratio) // patch_size, 1))
51
+ max_px = int(max((w * max_ratio) // patch_size, 1))
52
+
53
+ if (max_py * max_px) <= max_seq_len:
54
+ return max_py * patch_size, max_px * patch_size
55
+
56
+ def patchify(ratio: float) -> tuple[int, int]:
57
+ return (
58
+ min(int(ceil((h * ratio) / patch_size)), max_py),
59
+ min(int(ceil((w * ratio) / patch_size)), max_px)
60
+ )
61
+
62
+ py, px = patchify(eps)
63
+ if (py * px) > max_seq_len:
64
+ raise ValueError(f"Image of size {w}x{h} is too large.")
65
+
66
+ ratio = eps
67
+ while (max_ratio - ratio) >= eps:
68
+ mid = (ratio + max_ratio) / 2.0
69
+
70
+ mpy, mpx = patchify(mid)
71
+ seq_len = mpy * mpx
72
+
73
+ if seq_len > max_seq_len:
74
+ max_ratio = mid
75
+ continue
76
+
77
+ ratio = mid
78
+ py = mpy
79
+ px = mpx
80
+
81
+ if seq_len == max_seq_len:
82
+ break
83
+
84
+ assert py >= 1 and px >= 1
85
+ return py * patch_size, px * patch_size
86
+
87
+ def process_image(img: Image.Image, patch_size: int, max_seq_len: int) -> Image.Image:
88
+ def compute_resize(wh: tuple[int, int]) -> tuple[int, int]:
89
+ h, w = get_image_size_for_seq((wh[1], wh[0]), patch_size, max_seq_len)
90
+ return w, h
91
+
92
+ return process_srgb(img, resize=compute_resize)
93
+
94
+ def patchify_image(img: Image.Image, patch_size: int, max_seq_len: int, share_memory: bool = False) -> tuple[Tensor, Tensor, Tensor]:
95
+ patches = torch.zeros(max_seq_len, patch_size * patch_size * 3, device="cpu", dtype=torch.uint8)
96
+ patch_coords = torch.zeros(max_seq_len, 2, device="cpu", dtype=torch.int16)
97
+ patch_valid = torch.zeros(max_seq_len, device="cpu", dtype=torch.bool)
98
+
99
+ if share_memory:
100
+ patches.share_memory_()
101
+ patch_coords.share_memory_()
102
+ patch_valid.share_memory_()
103
+
104
+ put_srgb_patch(img, patches, patch_coords, patch_valid, patch_size)
105
+ return patches, patch_coords, patch_valid
106
+
107
+ def load_image(
108
+ path: str,
109
+ patch_size: int = 16,
110
+ max_seq_len: int = 1024,
111
+ share_memory: bool = False
112
+ ) -> tuple[Tensor, Tensor, Tensor]:
113
+ with open(path, "rb", buffering=(1024 * 1024)) as file:
114
+ img: Image.Image = Image.open(file)
115
+
116
+ try:
117
+ processed = process_image(img, patch_size, max_seq_len)
118
+ except:
119
+ img.close()
120
+ raise
121
+
122
+ if img is not processed:
123
+ img.close()
124
+
125
+ return patchify_image(processed, patch_size, max_seq_len, share_memory)
126
+
127
+ def load_model(path: str, device: torch.device | str | None = None) -> tuple[NaFlexVit, list[str]]:
128
+ with safe_open(path, framework="pt", device="cpu") as file:
129
+ metadata = file.metadata()
130
+
131
+ state_dict = {
132
+ key: file.get_tensor(key)
133
+ for key in file.keys()
134
+ }
135
+
136
+ arch = metadata["modelspec.architecture"]
137
+ if not arch.startswith("naflexvit_so400m_patch16_siglip"):
138
+ raise ValueError(f"Unrecognized model architecture: {arch}")
139
+
140
+ tags = metadata["classifier.labels"].split("\n")
141
+
142
+ model = timm.create_model(
143
+ 'naflexvit_so400m_patch16_siglip',
144
+ pretrained=False, num_classes=0,
145
+ pos_embed_interp_mode="bilinear",
146
+ weight_init="skip", fix_init=False,
147
+ device="cpu", dtype=torch.bfloat16,
148
+ )
149
+
150
+ match arch[31:]:
151
+ case "": # vanilla
152
+ model.reset_classifier(len(tags))
153
+
154
+ case "+rr_slim":
155
+ model.reset_classifier(len(tags))
156
+
157
+ if "attn_pool.q.weight" not in state_dict:
158
+ model.attn_pool.q = Identity()
159
+
160
+ if "head.bias" not in state_dict:
161
+ model.head.bias = None
162
+
163
+ case "+rr_chonker":
164
+ from chonker_pool import ChonkerPool
165
+
166
+ model.attn_pool = ChonkerPool(
167
+ 2, 1152, 72,
168
+ device=device, dtype=torch.bfloat16
169
+ )
170
+ model.head = model.attn_pool.create_head(len(tags))
171
+ model.num_classes = len(tags)
172
+
173
+ case "+rr_hydra":
174
+ from hydra_pool import HydraPool
175
+
176
+ model.attn_pool = HydraPool.for_state(
177
+ state_dict, "attn_pool.",
178
+ device=device, dtype=torch.bfloat16
179
+ )
180
+ model.head = model.attn_pool.create_head()
181
+ model.num_classes = len(tags)
182
+
183
+ state_dict["attn_pool._extra_state"] = { "q_normed": True }
184
+
185
+ case _:
186
+ raise ValueError(f"Unrecognized model architecture: {arch}")
187
+
188
+ model.eval().to(dtype=torch.bfloat16)
189
+ model.load_state_dict(state_dict, strict=True)
190
+ model.to(device=device)
191
+
192
+ return model, tags
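A short sketch of the sizing logic in get_image_size_for_seq: it searches for the largest scale whose patch grid still fits within max_seq_len and returns a height/width rounded to whole patches. The example sizes are arbitrary.

```python
# Sequence-length budgeting sketch for get_image_size_for_seq.
from model import get_image_size_for_seq

for hw in [(512, 512), (1080, 1920), (300, 2000)]:
    h, w = get_image_size_for_seq(hw, patch_size=16, max_seq_len=1024)
    print(hw, "->", (h, w), "patches:", (h // 16) * (w // 16))
```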
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ timm
+ numpy
+ pillow
+ einops
+ safetensors
+ gradio
+ requests