Spaces:

fkonrad
/

ViT-Visualizer

Sleeping

ViT-Visualizer / app.py

Felix Konrad

Please please work.

55226e5 2 months ago

7.9 kB

	import os
	import matplotlib.pyplot as plt
	import matplotlib.cm as cm
	import numpy as np
	import gradio as gr
	from transformers import AutoModel, AutoImageProcessor
	from PIL import Image
	import torch

	# Explicitly switch the Hugging Face Hub "offline" flag off for this process.
	# NOTE(review): this assignment runs after `transformers` has already been
	# imported above; if the hub client reads the variable at import time the
	# assignment has no effect — confirm, and move it above the imports if so.
	os.environ.update({"HF_HUB_OFFLINE": "0"})

	# Module-wide holder for the currently loaded model and its image processor.
	# Every slot starts as None; load_model() fills them in on success.
	state = dict.fromkeys(("model_type", "model", "processor", "repo_id"))

	def similarity_heatmap(image):
	    """
	    Compute the cosine similarity between the CLS token and each patch token.

	    The model and processor are read from the module-level ``state`` dict,
	    so a model must have been loaded before this is called.

	    Args:
	        image: Input image in a form accepted by the loaded processor
	            (the Gradio UI passes a PIL image).

	    Returns:
	        A numpy array of shape (H_patch, W_patch) with one cosine
	        similarity value per patch of the preprocessed image.

	    Raises:
	        ValueError: If the model output does not contain a CLS token plus
	            one token per patch of the preprocessed image.
	    """
	    model, processor = state["model"], state["processor"]

	    inputs = processor(images=image, return_tensors="pt")
	    pixel_values = inputs["pixel_values"].to(model.device)  # shape: (1, 3, H, W)

	    # patch_size is usually an int (e.g. 16); also accept an (h, w) pair.
	    patch_size = model.config.patch_size
	    if isinstance(patch_size, (tuple, list)):
	        patch_h, patch_w = patch_size
	    else:
	        patch_h = patch_w = patch_size

	    # Patch grid of the preprocessed image (needed to reshape the result).
	    H_patch = pixel_values.shape[2] // patch_h
	    W_patch = pixel_values.shape[3] // patch_w
	    num_patches = H_patch * W_patch

	    with torch.no_grad():
	        outputs = model(pixel_values)
	        last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden_dim)

	        seq_len = last_hidden_state.shape[1]
	        if num_patches == 0 or seq_len <= num_patches:
	            raise ValueError(
	                f"Expected a CLS token plus {num_patches} patch tokens, "
	                f"but the model returned a sequence of length {seq_len}"
	            )

	        cls_token = last_hidden_state[:, 0, :]  # (1, hidden_dim)
	        # Use the trailing num_patches tokens instead of everything after
	        # index 0: models that insert extra special tokens between CLS and
	        # the patches (e.g. a distillation token) would otherwise yield
	        # num_patches + k tokens and break the reshape below.
	        patch_tokens = last_hidden_state[:, -num_patches:, :]  # (1, num_patches, hidden_dim)

	        # cosine_similarity guards against zero-norm vectors with an eps.
	        cos_sim = torch.nn.functional.cosine_similarity(
	            cls_token.unsqueeze(1), patch_tokens, dim=-1
	        )  # (1, num_patches)

	    cos_grid = cos_sim[0].reshape(H_patch, W_patch)
	    # Move to host memory before converting: numpy cannot read CUDA tensors.
	    # The float() cast covers reduced-precision dtypes numpy does not support.
	    return cos_grid.detach().float().cpu().numpy()

	def overlay_cosine_grid_on_image(cos_grid: np.ndarray, image: Image.Image, alpha=0.5, colormap="viridis"):
	    """
	    Render a patch-level similarity grid as a heatmap blended over an image.

	    Args:
	        cos_grid: (H_patch, W_patch) numpy array of cosine similarities.
	        image: PIL image the heatmap is stretched over and blended with.
	        alpha: Blending factor passed to ``Image.blend`` (0 keeps only the
	            image, 1 keeps only the heatmap).
	        colormap: Matplotlib colormap name.

	    Returns:
	        An RGBA PIL image with the same size as ``image``.
	    """
	    # Min-max normalise to [0, 1]; the epsilon avoids division by zero when
	    # every value in the grid is identical.
	    lowest, highest = cos_grid.min(), cos_grid.max()
	    norm_grid = (cos_grid - lowest) / (highest - lowest + 1e-8)

	    # matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in
	    # 3.9; pyplot.get_cmap performs the same lookup by name.
	    cmap = plt.get_cmap(colormap)
	    heatmap_rgba = cmap(norm_grid)  # shape: (H_patch, W_patch, 4), floats in [0, 1]

	    # Drop the alpha channel and convert to 8-bit RGB.
	    heatmap_rgb = (heatmap_rgba[:, :, :3] * 255).astype(np.uint8)
	    heatmap_img = Image.fromarray(heatmap_rgb)

	    # Stretch the patch-resolution heatmap to the original image size.
	    heatmap_resized = heatmap_img.resize(image.size, resample=Image.BILINEAR)

	    # Image.blend needs both inputs in the same mode, hence the RGBA converts.
	    return Image.blend(image.convert("RGBA"), heatmap_resized.convert("RGBA"), alpha=alpha)

	def load_model(repo_id: str, revision: str = None):
	    """
	    Load a Hugging Face model + image processor from the Hub into ``state``.

	    Works with any public repo_id whose config exposes ``patch_size``.

	    Args:
	        repo_id: Hub repository ID, e.g. ``google/vit-base-patch16-224``.
	        revision: Optional branch, tag, or commit hash. ``None``, an empty
	            string, or whitespace all mean "use the default revision".

	    Returns:
	        A human-readable status message. Failures are reported through the
	        returned message instead of being raised.
	    """
	    try:
	        # Clean up inputs. An empty textbox arrives as "", which must be
	        # turned into None rather than forwarded as a literal revision.
	        repo_id = (repo_id or "").strip()
	        if not repo_id:
	            return "Please enter a model repo ID"
	        revision = (revision or "").strip() or None

	        # SECURITY NOTE: trust_remote_code=True lets the selected repository
	        # run its own Python code in this process, and repo_id is user input.
	        # NOTE(review): newer transformers releases appear to prefer `token=`
	        # over `use_auth_token=` — confirm against the pinned version.
	        default_kwargs = {
	            "revision": revision,
	            "trust_remote_code": True,
	            "use_auth_token": False,  # Explicitly no auth for public models
	        }
	        # Fallback: explicit cache under /tmp (better write permissions) and
	        # downloads explicitly allowed.
	        fallback_kwargs = {
	            **default_kwargs,
	            "cache_dir": "/tmp/model_cache",
	            "local_files_only": False,
	        }

	        # First try without cache_dir to avoid permission issues.
	        try:
	            model = AutoModel.from_pretrained(repo_id, **default_kwargs)
	            processor = AutoImageProcessor.from_pretrained(repo_id, **default_kwargs)
	        except Exception as first_error:
	            # Keep the first failure visible in the logs before retrying.
	            print(f"Default load failed ({first_error}); retrying with /tmp/model_cache")
	            model = AutoModel.from_pretrained(repo_id, **fallback_kwargs)
	            processor = AutoImageProcessor.from_pretrained(repo_id, **fallback_kwargs)

	        # Validate it's a Vision Transformer before spending time moving it
	        # to the target device.
	        if not hasattr(model.config, "patch_size"):
	            return f"Model '{repo_id}' doesn't appear to be a Vision Transformer (no patch_size in config)"

	        # Move to appropriate device and switch to inference mode.
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        model.to(device)
	        model.eval()

	        # Update global state only once everything above has succeeded.
	        state["model"] = model
	        state["processor"] = processor
	        state["repo_id"] = repo_id
	        state["model_type"] = "custom"

	        patch_size = model.config.patch_size
	        return f"Successfully loaded ViT model '{repo_id}' (patch size: {patch_size}) on {device}"

	    except Exception as e:
	        # Map common failure modes to friendlier messages for the status box.
	        error_str = str(e).lower()
	        if "repository not found" in error_str or "404" in error_str:
	            return f"Repository '{repo_id}' not found. Please check the repo ID."
	        elif "connection" in error_str or "network" in error_str or "offline" in error_str:
	            return f"Network error: {e}"
	        elif "permission" in error_str or "forbidden" in error_str:
	            return "Permission denied. This might be a private repository."
	        else:
	            return f"Error loading model: {e}"

	def display_image(image: Image.Image):
	    """
	    Return the uploaded image unchanged.

	    Used as the callback that mirrors the upload widget into the preview
	    widget.

	    Args:
	        image: The uploaded image (a PIL image when coming from the UI).

	    Returns:
	        The same ``image`` object that was passed in.
	    """
	    # `Image` is the PIL module here, so the class is spelled Image.Image.
	    return image

	def visualize_cosine_heatmap(image: Image.Image):
	    """
	    Generate and overlay the cosine similarity heatmap on the input image.

	    Args:
	        image: The uploaded PIL image, or None if nothing has been uploaded.

	    Returns:
	        The blended RGBA PIL image, or None when no model is loaded, no
	        image was supplied, or heatmap generation failed.
	    """
	    # Nothing to compute without both a loaded model and an input image.
	    if state["model"] is None or image is None:
	        return None

	    try:
	        cos_grid = similarity_heatmap(image)
	        return overlay_cosine_grid_on_image(cos_grid, image)
	    except Exception as e:
	        # Best-effort UI callback: log the failure and leave the output
	        # empty instead of raising.
	        print(f"Error generating heatmap: {e}")
	        return None

	# ---------------------------------------------------------------------------
	# Gradio interface: model loader row, status box, and image / heatmap columns.
	# ---------------------------------------------------------------------------
	with gr.Blocks(title="ViT CLS Visualizer") as demo:
	    # Static text shown at the top of the page.
	    _default_repo = "google/vit-base-patch16-224"
	    _intro_text = (
	        "Enter the Hugging Face model repo ID (must be public), upload an image, "
	        "and visualize the cosine similarity between the CLS token and patches."
	    )
	    _suggested_models = "\n".join(
	        f"- `{name}`"
	        for name in (
	            "google/vit-base-patch16-224",
	            "facebook/deit-base-distilled-patch16-224",
	            "microsoft/dit-base",
	        )
	    )

	    gr.Markdown("# ViT CLS-Visualizer")
	    gr.Markdown(_intro_text)

	    gr.Markdown("### Popular Vision Transformer models to try:")
	    gr.Markdown(_suggested_models)

	    # Model selection controls.
	    with gr.Row():
	        repo_input = gr.Textbox(
	            value=_default_repo,
	            label="Hugging Face Model Repo ID",
	            placeholder=f"e.g. {_default_repo}",
	        )
	        revision_input = gr.Textbox(
	            placeholder="branch, tag, or commit hash",
	            label="Revision (optional)",
	        )
	        load_btn = gr.Button("Load Model", variant="primary")

	    # Read-only box that shows the message returned by load_model.
	    load_status = gr.Textbox(label="Model Status", interactive=False)

	    # Left column: upload + preview. Right column: trigger + result.
	    with gr.Row():
	        with gr.Column():
	            image_input = gr.Image(type="pil", label="Upload Image")
	            image_output = gr.Image(label="Uploaded Image")

	        with gr.Column():
	            compute_btn = gr.Button("Compute Heatmap", variant="primary")
	            heatmap_output = gr.Image(label="Cosine Similarity Heatmap")

	    # Event wiring (callback, inputs, outputs).
	    load_btn.click(load_model, [repo_input, revision_input], load_status)
	    image_input.change(display_image, image_input, image_output)
	    compute_btn.click(visualize_cosine_heatmap, image_input, heatmap_output)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()