Spaces:

lorebianchi98
/

Talk2DINO

Running on Zero

App Files Files Community

lorebianchi98 committed on Oct 8

Commit

593b176

1 Parent(s): bd34a5b

First commit

Browse files

Files changed (8) hide show

.gitattributes +1 -0
app.py +237 -0
assets/overview.png +3 -0
examples/0_pikachu.png +3 -0
examples/1_jurassic.png +3 -0
examples/2_falcon.png +3 -0
requirements.txt +20 -0
src/plot.py +57 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import gradio as gr
+import torch
+import numpy as np
+from transformers import AutoModel
+import os
+import torchvision.transforms.functional as F
+from src.plot import plot_qualitative
+from PIL import Image
+from io import BytesIO
+import base64
+from pathlib import Path
+# --- Setup ---
+# Point Gradio's temp directory (GRADIO_TEMP_DIR) at a local "tmp" folder and make sure it exists.
+os.environ["GRADIO_TEMP_DIR"] = "tmp"
+os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)
+# Use the GPU when torch can see one, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- Load Models ---
+# Both checkpoints are loaded eagerly at import time, moved to `device` and switched to eval mode.
+# NOTE(review): trust_remote_code=True executes Python code shipped with the Hub repositories —
+# confirm they are trusted (and consider pinning a revision).
+model_B = AutoModel.from_pretrained("lorebianchi98/Talk2DINO-ViTB", trust_remote_code=True).to(device).eval()
+model_L = AutoModel.from_pretrained("lorebianchi98/Talk2DINO-ViTL", trust_remote_code=True).to(device).eval()
+# Maps the model name chosen in the UI to the loaded model instance.
+MODELS = {"ViT-B": model_B, "ViT-L": model_L}
+# --- Example Setup ---
+# Absolute paths (as strings) of the bundled example PNGs, in sorted order.
+EXAMPLE_IMAGES_DIR = Path("examples").resolve()
+example_images = sorted([str(p) for p in EXAMPLE_IMAGES_DIR.glob("*.png")])
+# Suggested comma-separated class prompts, keyed by example file name.
+DEFAULT_CLASSES = {
+    "0_pikachu.png": "pikachu,traffic_sign,forest,road,cap",
+    "1_jurassic.png": "dinosaur,smoke,vegetation,person",
+    "2_falcon.png": "millenium_falcon,space"
+}
+# Values the background controls are reset to whenever "Include Background" is toggled.
+DEFAULT_BG_THRESH = 0.55
+DEFAULT_BG_CLEAN = False
+# --- Inference Function ---
+def talk2dino_infer(input_image, class_text, selected_model="ViT-B",
+                    apply_pamr=True, with_background=False, bg_thresh=0.55, apply_bg_clean=False):
+    if input_image is None:
+        raise gr.Error("No image detected. Please select or upload an image first.")
+    model = MODELS[selected_model]
+    text = [t.strip() for t in class_text.replace("_", " ").split(",") if t.strip()]
+    if len(text) == 0:
+        raise gr.Error("Please provide at least one class name before generating segmentation.")
+    img = F.to_tensor(input_image).unsqueeze(0).float().to(device) * 255.0
+    # Generate color palette
+    palette = [
+        [255, 0, 0],
+        [255, 255, 0],
+        [0, 255, 0],
+        [0, 255, 255],
+        [0, 0, 255],
+        [128, 128, 128]
+    ]
+    if len(text) > len(palette):
+        for _ in range(len(text) - len(palette)):
+            palette.append([np.random.randint(0, 255) for _ in range(3)])
+    if with_background:
+        palette.insert(0, [0, 0, 0])
+        model.with_bg_clean = apply_bg_clean
+    with torch.no_grad():
+        text_emb = model.build_dataset_class_tokens("sub_imagenet_template", text)
+        text_emb = model.build_text_embedding(text_emb)
+        mask, _ = model.generate_masks(img, img_metas=None, text_emb=text_emb,
+                                       classnames=text, apply_pamr=apply_pamr)
+        if with_background:
+            background = torch.ones_like(mask[:, :1]) * bg_thresh
+            mask = torch.cat([background, mask], dim=1)
+        mask = mask.argmax(dim=1)
+    if with_background:
+        text = ["background"] + text
+    img_out = plot_qualitative(
+        img.cpu()[0].permute(1, 2, 0).int().numpy(),
+        mask.cpu()[0].numpy(),
+        palette,
+        texts=text
+    )
+    return img_out
+# --- Gradio Interface ---
+with gr.Blocks(title="Talk2DINO Demo") as demo:
+    # Overview Section
+    overview_img = Image.open("assets/overview.png").convert("RGB")
+    overview_img = overview_img.resize((int(overview_img.width * 0.7), int(overview_img.height * 0.7)))
+    buffered = BytesIO()
+    overview_img.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    gr.Markdown(f"""
+    # 🦖 Talk2DINO Demo
+    ![Overview](data:image/png;base64,{img_str})
+    <div style="font-size: x-large; white-space: nowrap; display: flex; align-items: center; gap: 10px;">
+        <a href="https://lorebianchi98.github.io/Talk2DINO/" target="_blank">Project page</a>
+        <span>|</span>
+        <a href="http://arxiv.org/abs/2411.19331" target="_blank">
+            <img src="https://img.shields.io/badge/arXiv-2411.19331-b31b1b.svg" style="height:28px; vertical-align:middle;">
+        </a>
+        <span>|</span>
+        <a href="https://huggingface.co/papers/2411.19331" target="_blank">
+            <img src="https://img.shields.io/badge/HuggingFace-Paper-yellow.svg" style="height:28px; vertical-align:middle;">
+        </a>
+    </div>
+    ---
+    This demo allows you to **perform open-vocabulary semantic segmentation** on images using Talk2DINO.
+    **How to use:**
+    1. Upload an image or select one from the example gallery.
+    2. Enter a comma-separated list of class names you want to segment (e.g., `pikachu, forest, road`).
+    3. Adjust optional parameters:
+    - **Model**: choose between ViT-B and ViT-L
+    - **Apply PAMR**: refine masks after initial prediction
+    - **Include Background**: visualize background areas
+    - **Background Threshold**: threshold for background intensity
+    - **Apply Background Cleaning**: remove background noise when enabled
+    4. Click **Generate Segmentation** to see the segmentation overlay.
+    """)
+    with gr.Row():
+        # Left column: the input image and, when example files were found, a gallery to pick from.
+        with gr.Column():
+            input_image = gr.Image(type="pil", label="Input Image", value=None)
+            # example_gallery only exists when at least one example PNG was found.
+            if example_images:
+                example_gallery = gr.Gallery(
+                    value=example_images,
+                    label="Or select from example images",
+                    show_label=True,
+                    columns=3,
+                    object_fit="contain",
+                    height="auto"
+                )
+        # Right column: inference options, the run button and the result image.
+        with gr.Column():
+            model_selector = gr.Dropdown(
+                label="Select Model",
+                choices=["ViT-B", "ViT-L"],
+                value="ViT-B"
+            )
+            class_text = gr.Textbox(
+                label="Comma-separated Classes",
+                value="",
+                placeholder="e.g. pikachu, road, tree"
+            )
+            apply_pamr = gr.Checkbox(label="Apply PAMR", value=True)
+            with_background = gr.Checkbox(label="Include Background", value=False)
+            # The two background controls start disabled; they are enabled by the
+            # with_background change handler when "Include Background" is ticked.
+            bg_thresh = gr.Slider(
+                label="Background Threshold",
+                minimum=0.0,
+                maximum=1.0,
+                value=DEFAULT_BG_THRESH,
+                step=0.01,
+                interactive=False
+            )
+            apply_bg_clean = gr.Checkbox(
+                label="Apply Background Cleaning",
+                value=False,
+                interactive=False
+            )
+            # Starts disabled; the class_text / gallery handlers enable it once class names are present.
+            generate_button = gr.Button("🚀 Generate Segmentation", interactive=False)
+            output_image = gr.Image(type="numpy", label="Segmentation Overlay")
+    # --- Background Option Toggle ---
+    def toggle_bg_options(with_bg):
+        if with_bg:
+            return gr.update(interactive=True, value=DEFAULT_BG_THRESH), gr.update(interactive=True, value=DEFAULT_BG_CLEAN)
+        else:
+            return gr.update(interactive=False, value=DEFAULT_BG_THRESH), gr.update(interactive=False, value=DEFAULT_BG_CLEAN)
+    with_background.change(
+        fn=toggle_bg_options,
+        inputs=[with_background],
+        outputs=[bg_thresh, apply_bg_clean]
+    )
+    # --- Enable Button Only When Classes Exist ---
+    def enable_generate_button(text):
+        return gr.update(interactive=bool(text.strip()))
+    class_text.change(fn=enable_generate_button, inputs=[class_text], outputs=[generate_button])
+    # --- Example Image Loader ---
+    def load_example_image(evt: gr.SelectData):
+        selected = evt.value["image"]
+        if isinstance(selected, str):
+            img = Image.open(selected).convert("RGB")
+            filename = Path(selected).name
+        elif isinstance(selected, dict):
+            img = Image.open(selected["path"]).convert("RGB")
+            filename = Path(selected["path"]).name
+        else:
+            img = Image.fromarray(selected)
+            filename = None
+        class_val = DEFAULT_CLASSES.get(filename, "")
+        return img, class_val, gr.update(interactive=bool(class_val.strip()))
+    if example_images:
+        example_gallery.select(
+            fn=load_example_image,
+            inputs=[],
+            outputs=[input_image, class_text, generate_button]
+        )
+    # --- User Upload Reset ---
+    def on_upload_image(img):
+        if img is None:
+            return None, "", gr.update(interactive=False)
+        return img, "", gr.update(interactive=False)
+    input_image.upload(
+        fn=on_upload_image,
+        inputs=[input_image],
+        outputs=[input_image, class_text, generate_button]
+    )
+    # --- Generate Segmentation ---
+    # Run talk2dino_infer on the current widget values (in the order of its parameters)
+    # and show the returned overlay in output_image.
+    generate_button.click(
+        talk2dino_infer,
+        inputs=[input_image, class_text, model_selector, apply_pamr, with_background, bg_thresh, apply_bg_clean],
+        outputs=output_image
+    )
+# Start the Gradio server on port 7870 without requesting a public share link.
+demo.launch(server_port=7870, share=False)

assets/overview.png ADDED Viewed

Git LFS Details

SHA256: fcefc8c68cf95a966f769852ea51e7efa7ea2398b21936cacaa2eb5c6fff0358
Pointer size: 130 Bytes
Size of remote file: 89.5 kB

examples/0_pikachu.png ADDED Viewed

Git LFS Details

SHA256: 7a5efcbce11e4a293ebb743c8857c0654c6bce0b89beb59f6ca71d64311c4106
Pointer size: 131 Bytes
Size of remote file: 377 kB

examples/1_jurassic.png ADDED Viewed

Git LFS Details

SHA256: 804a011b7b5e312dda9a6a57ccb32947d6f74413b6311170dd96fcf10b792705
Pointer size: 131 Bytes
Size of remote file: 364 kB

examples/2_falcon.png ADDED Viewed

Git LFS Details

SHA256: 80a818fbce8acd2bda1e570dc6c0775d2100d0a227c768c5d7ff83275870709c
Pointer size: 131 Bytes
Size of remote file: 297 kB

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+git+https://github.com/openai/CLIP.git
+matplotlib
+opencv-python
+pyyaml
+requests
+scikit-image
+tqdm
+omegaconf
+einops
+timm
+transformers
+webdataset
+numpy==1.24.1
+jaxtyping
+rich
+scikit-learn
+safetensors==0.4.3
+gradio
+torch
+torchvision

src/plot.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import numpy as np
+from matplotlib import pyplot as plt
+from matplotlib.patches import Rectangle
+def plot_qualitative(image, sim, palette, texts, alpha=0.6, legend_height=0.1):
+    """
+    image: HxWx3 uint8 image
+    sim: HxW segmentation mask with integer class IDs
+    palette: list of [R,G,B] colors
+    texts: list of class names corresponding to IDs
+    alpha: transparency for overlay
+    legend_height: fraction of figure height reserved for legend
+    """
+    qualitative_plot = np.zeros((sim.shape[0], sim.shape[1], 3), dtype=np.uint8)
+    for j in np.unique(sim):
+        qualitative_plot[sim == j] = np.array(palette[j])
+    # Normalize images for alpha blending
+    img_float = image.astype(np.float32) / 255.0
+    overlay_float = qualitative_plot.astype(np.float32) / 255.0
+    # Figure with space for legend
+    fig_height = img_float.shape[0] / 100
+    fig_width = img_float.shape[1] / 100
+    fig = plt.figure(figsize=(fig_width, fig_height + legend_height * fig_height), dpi=100)
+    # Main image axis
+    ax_img = fig.add_axes([0, legend_height, 1, 1 - legend_height])
+    ax_img.imshow(img_float)
+    ax_img.imshow(overlay_float, alpha=alpha)
+    ax_img.axis("off")
+    # Legend axis
+    ax_legend = fig.add_axes([0, 0, 1, legend_height])
+    ax_legend.axis("off")
+    # Draw legend rectangles
+    unique_classes = np.unique(sim)
+    num_classes = len(unique_classes)
+    for idx, cls in enumerate(unique_classes):
+        color = np.array(palette[cls]) / 255.0
+        # Rectangle: (x, y), width, height
+        rect_width = 1 / num_classes * 0.8
+        rect = Rectangle((idx / num_classes, 0.1), rect_width, 0.6, facecolor=color)
+        ax_legend.add_patch(rect)
+        # Add text label centered on rectangle
+        ax_legend.text(idx / num_classes + rect_width / 2, 0.8, texts[cls],
+                       ha='center', va='bottom', fontsize=10)
+    # Extract as NumPy array
+    fig.canvas.draw()
+    buf = np.asarray(fig.canvas.renderer.buffer_rgba())
+    img_array = (buf[:, :, :3]).copy()  # drop alpha
+    plt.close(fig)
+    return img_array