rollback to old ui
app.py
CHANGED
@@ -11,29 +11,35 @@ import urllib.request
 import json
 import cv2
 
+# ---------------------------
 # Model Configs
+# ---------------------------
 MODEL_CONFIGS = {
-    "DeiT-Tiny": {"type": "hf", "id": "facebook/deit-tiny-patch16-224"
-    "DeiT-Small": {"type": "hf", "id": "facebook/deit-small-patch16-224"
-    "ViT-Base": {"type": "hf", "id": "google/vit-base-patch16-224"
-    "ConvNeXt-Tiny": {"type": "timm", "id": "convnext_tiny"
-    "ConvNeXt-Nano": {"type": "timm", "id": "convnext_nano"
-    "EfficientNet-B0": {"type": "efficientnet", "id": "efficientnet-b0"
-    "EfficientNet-B1": {"type": "efficientnet", "id": "efficientnet-b1"
-    "ResNet-50": {"type": "timm", "id": "resnet50"
-    "MobileNet-V2": {"type": "timm", "id": "mobilenetv2_100"
-    "MaxViT-Tiny": {"type": "timm", "id": "maxvit_tiny_tf_224"
-    "MobileViT-Small": {"type": "timm", "id": "mobilevit_s"
-    "EdgeNeXt-Small": {"type": "timm", "id": "edgenext_small"
-    "RegNetY-002": {"type": "timm", "id": "regnety_002"
+    "DeiT-Tiny": {"type": "hf", "id": "facebook/deit-tiny-patch16-224"},
+    "DeiT-Small": {"type": "hf", "id": "facebook/deit-small-patch16-224"},
+    "ViT-Base": {"type": "hf", "id": "google/vit-base-patch16-224"},
+    "ConvNeXt-Tiny": {"type": "timm", "id": "convnext_tiny"},
+    "ConvNeXt-Nano": {"type": "timm", "id": "convnext_nano"},
+    "EfficientNet-B0": {"type": "efficientnet", "id": "efficientnet-b0"},
+    "EfficientNet-B1": {"type": "efficientnet", "id": "efficientnet-b1"},
+    "ResNet-50": {"type": "timm", "id": "resnet50"},
+    "MobileNet-V2": {"type": "timm", "id": "mobilenetv2_100"},
+    "MaxViT-Tiny": {"type": "timm", "id": "maxvit_tiny_tf_224"},
+    "MobileViT-Small": {"type": "timm", "id": "mobilevit_s"},
+    "EdgeNeXt-Small": {"type": "timm", "id": "edgenext_small"},
+    "RegNetY-002": {"type": "timm", "id": "regnety_002"}
 }
 
+# ---------------------------
 # ImageNet Labels
+# ---------------------------
 IMAGENET_URL = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
 with urllib.request.urlopen(IMAGENET_URL) as url:
     IMAGENET_LABELS = json.load(url)
 
+# ---------------------------
 # Lazy Load
+# ---------------------------
 loaded_models = {}
 
 def load_model(model_name):
@@ -45,17 +51,20 @@ def load_model(model_name):
         extractor = AutoFeatureExtractor.from_pretrained(config["id"])
         model = AutoModelForImageClassification.from_pretrained(config["id"], output_attentions=True)
         model.eval()
+        # Enable gradients for class-specific attention
         for param in model.parameters():
             param.requires_grad = True
     elif config["type"] == "timm":
         model = timm.create_model(config["id"], pretrained=True)
         model.eval()
+        # Enable gradients for class-specific attention
         for param in model.parameters():
             param.requires_grad = True
         extractor = None
     elif config["type"] == "efficientnet":
         model = EfficientNet.from_pretrained(config["id"])
         model.eval()
+        # Enable gradients for class-specific attention
         for param in model.parameters():
             param.requires_grad = True
         extractor = None
@@ -63,14 +72,21 @@ def load_model(model_name):
     loaded_models[model_name] = (model, extractor)
     return model, extractor
 
+
+# ---------------------------
 # Adversarial Noise
+# ---------------------------
 def add_adversarial_noise(image, epsilon):
+    """Add random noise to image"""
     img_array = np.array(image).astype(np.float32) / 255.0
     noise = np.random.randn(*img_array.shape) * epsilon
     noisy_img = np.clip(img_array + noise, 0, 1)
     return Image.fromarray((noisy_img * 255).astype(np.uint8))
 
+
+# ---------------------------
 # Grad-CAM for Class-Specific Attention
+# ---------------------------
 def get_gradcam_for_class(model, image_tensor, class_idx):
     grad = None
     fmap = None
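A note on the add_adversarial_noise helper in the hunk above: despite the name, it perturbs the image with random Gaussian noise scaled by epsilon rather than computing a true adversarial example. For comparison, a minimal FGSM-style sketch (hypothetical, not part of app.py), assuming a plain PyTorch classifier that returns logits and an input tensor in [0, 1]:

import torch
import torch.nn.functional as F

def fgsm_perturb(model, x, class_idx, epsilon):
    # x: (1, 3, H, W) tensor in [0, 1]; class_idx: int label to attack (hypothetical helper)
    x = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x), torch.tensor([class_idx]))
    loss.backward()
    # Step in the direction that increases the loss, then clamp back to the valid pixel range
    return (x + epsilon * x.grad.sign()).clamp(0, 1).detach()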
@@ -83,6 +99,7 @@ def get_gradcam_for_class(model, image_tensor, class_idx):
         nonlocal grad
         grad = grad_out[0].detach()
 
+    # Find last conv layer
     last_conv = None
     for name, module in reversed(list(model.named_modules())):
         if isinstance(module, torch.nn.Conv2d):
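The middle of get_gradcam_for_class (old lines 89-110) falls between the hunks shown here; it presumably follows the standard Grad-CAM recipe of weighting the hooked feature maps by their average-pooled gradients before the resize/normalize step in the next hunk. A sketch of that step under that assumption (variable names follow the hooks above):

weights = grad.mean(dim=(2, 3), keepdim=True)    # global-average-pool the gradients per channel
cam = torch.relu((weights * fmap).sum(dim=1))    # channel-weighted sum, keep positive evidence only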
@@ -111,10 +128,15 @@ def get_gradcam_for_class(model, image_tensor, class_idx):
     cam = cam.squeeze().cpu().numpy()
     cam = cv2.resize(cam, (224, 224))
     cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
+
     return cam
 
+
+# ---------------------------
 # ViT Attention for Class-Specific
+# ---------------------------
 def vit_attention_for_class(model, extractor, image, class_idx):
+    """Get attention map for specific class in ViT"""
     inputs = extractor(images=image, return_tensors="pt")
     inputs['pixel_values'].requires_grad = True
     outputs = model(**inputs)
@@ -123,6 +145,7 @@ def vit_attention_for_class(model, extractor, image, class_idx):
     model.zero_grad()
     score.backward()
 
+    # Use last layer attention
     if hasattr(outputs, 'attentions') and outputs.attentions is not None:
         attn = outputs.attentions[-1]
         attn = attn.mean(1)
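How attn becomes the 14x14 map returned in the next hunk is likewise outside the lines shown. For a 224x224 input with 16x16 patches, the usual reduction is to take the CLS token's attention over the 196 patch tokens and reshape it to a 14x14 grid; a sketch under that assumption:

cls_attn = attn[0, 0, 1:]    # attention from CLS to each of the 196 patch tokens
att_map = cls_attn.reshape(14, 14).detach().cpu().numpy()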
@@ -134,7 +157,10 @@ def vit_attention_for_class(model, extractor, image, class_idx):
 
     return np.ones((14, 14))
 
+
+# ---------------------------
 # Grad-CAM Helper for CNNs (Top Prediction)
+# ---------------------------
 def get_gradcam(model, image_tensor):
     grad = None
     fmap = None
@@ -176,9 +202,13 @@ def get_gradcam(model, image_tensor):
     cam = cam.squeeze().cpu().numpy()
     cam = cv2.resize(cam, (224, 224))
     cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
+
     return cam
 
+
+# ---------------------------
 # ViT Attention Rollout Helper
+# ---------------------------
 def vit_attention_rollout(outputs):
     if not hasattr(outputs, 'attentions') or outputs.attentions is None:
         return np.ones((14, 14))
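The rollout computation itself (old lines 185-190) also sits between hunks. Attention rollout in the usual sense multiplies the head-averaged attention matrices of all layers, with the identity added to account for residual connections, and then reads off the CLS row. A sketch of that idea, assuming outputs.attentions is a tuple of (1, heads, tokens, tokens) tensors:

import torch

rollout = None
for layer_attn in outputs.attentions:
    a = layer_attn.mean(1)[0]                        # average heads -> (tokens, tokens)
    a = a + torch.eye(a.size(0))                     # add identity for the residual connection
    a = a / a.sum(dim=-1, keepdim=True)              # re-normalize rows
    rollout = a if rollout is None else a @ rollout
attn_map = rollout[0, 1:].detach().reshape(14, 14).numpy()   # CLS attention over the 14x14 patch grid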
@@ -191,12 +221,18 @@ def vit_attention_rollout(outputs):
     attn_map = (attn_map - attn_map.min()) / (attn_map.max() - attn_map.min() + 1e-8)
     return attn_map
 
+
+# ---------------------------
 # Create Gradient Legend
+# ---------------------------
 def create_gradient_legend():
+    """Create a gradient legend image showing attention scale"""
     width, height = 400, 60
     gradient = np.zeros((height, width, 3), dtype=np.uint8)
 
+    # Create gradient from blue to red (matching COLORMAP_JET)
     for i in range(width):
+        # OpenCV's COLORMAP_JET: blue (low) -> cyan -> green -> yellow -> red (high)
         value = int(255 * i / width)
         color_single = np.array([[[value]]], dtype=np.uint8)
         color_rgb = cv2.applyColorMap(color_single, cv2.COLORMAP_JET)
@@ -204,20 +240,22 @@ def create_gradient_legend():
 
     gradient = cv2.cvtColor(gradient, cv2.COLOR_BGR2RGB)
 
+    # Convert to PIL and add text
     from PIL import ImageDraw, ImageFont
     gradient_pil = Image.fromarray(gradient)
     draw = ImageDraw.Draw(gradient_pil)
 
+    # Use default font
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 14)
     except:
         font = ImageFont.load_default()
 
+    # Add text labels
     draw.text((10, 20), "Low Attention", fill=(255, 255, 255), font=font)
     draw.text((width - 120, 20), "High Attention", fill=(255, 255, 255), font=font)
 
     return gradient_pil
-
 def overlay_attention(pil_img, attention_map):
     heatmap = (attention_map * 255).astype(np.uint8)
     heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
@@ -227,7 +265,10 @@ def overlay_attention(pil_img, attention_map):
     blended = Image.blend(pil_img.convert("RGB"), heatmap_pil, alpha=0.4)
     return blended
 
+
+# ---------------------------
 # Main Prediction Function
+# ---------------------------
 def predict(image, model_name, noise_level):
     try:
         if image is None:
@@ -236,9 +277,7 @@ def predict(image, model_name, noise_level):
         if model_name is None:
             return {"Error": "Please select a model"}, None, None
 
-        #
-        model_name = model_name.split(" - ")[0]
-
+        # Apply adversarial noise if requested
         if noise_level > 0:
             image = add_adversarial_noise(image, noise_level)
 
@@ -246,7 +285,8 @@ def predict(image, model_name, noise_level):
         transform = T.Compose([
             T.Resize((224, 224)),
             T.ToTensor(),
-            T.Normalize(mean=[0.485, 0.456, 0.406],
+            T.Normalize(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225])
         ])
 
         if MODEL_CONFIGS[model_name]["type"] == "hf":
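The forward pass and top-5 ranking inside predict (old lines 253-279) are not part of this diff. Based on the helpers visible elsewhere in the file, the timm/EfficientNet branch presumably does something along these lines (a hedged reconstruction, not the committed code):

x = transform(image).unsqueeze(0)
x.requires_grad = True
probs = torch.softmax(model(x), dim=1)[0]
top5 = torch.topk(probs, 5)
results = {IMAGENET_LABELS[int(i)]: float(p) for p, i in zip(top5.values, top5.indices)}
cam = get_gradcam(model, x)                          # Grad-CAM for the top prediction
overlay = overlay_attention(image.resize((224, 224)), cam)
return results, overlay, image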
@@ -280,7 +320,10 @@ def predict(image, model_name, noise_level):
         print(error_msg)
         return {"Error": str(e)}, None, None
 
+
+# ---------------------------
 # Class-Specific Attention
+# ---------------------------
 def get_class_specific_attention(image, model_name, class_query):
     try:
         if image is None:
@@ -289,9 +332,7 @@ def get_class_specific_attention(image, model_name, class_query):
         if not class_query or class_query.strip() == "":
             return None, None, "Please enter a class name"
 
-        #
-        model_name = model_name.split(" - ")[0]
-
+        # Find matching class
         class_query_lower = class_query.lower().strip()
         matching_idx = None
         matched_label = None
@@ -299,6 +340,7 @@ def get_class_specific_attention(image, model_name, class_query):
         model, extractor = load_model(model_name)
 
         if MODEL_CONFIGS[model_name]["type"] == "hf":
+            # Search in HF model labels
             for idx, label in model.config.id2label.items():
                 if class_query_lower in label.lower():
                     matching_idx = idx
@@ -308,9 +350,11 @@ def get_class_specific_attention(image, model_name, class_query):
             if matching_idx is None:
                 return None, None, f"Class '{class_query}' not found in model labels. Try a different class name or check sample classes."
 
+            # Get attention for this class
             att_map = vit_attention_for_class(model, extractor, image, matching_idx)
 
         else:
+            # Search in ImageNet labels
             for idx, label in enumerate(IMAGENET_LABELS):
                 if class_query_lower in label.lower():
                     matching_idx = idx
@@ -320,10 +364,12 @@ def get_class_specific_attention(image, model_name, class_query):
             if matching_idx is None:
                 return None, None, f"Class '{class_query}' not found in ImageNet labels. Try a different class name or check sample classes."
 
+            # Get Grad-CAM for this class
             transform = T.Compose([
                 T.Resize((224, 224)),
                 T.ToTensor(),
-                T.Normalize(mean=[0.485, 0.456, 0.406],
+                T.Normalize(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225])
             ])
             x = transform(image).unsqueeze(0)
             x.requires_grad = True
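The T.Normalize call added here applies the standard ImageNet statistics channel-wise, x' = (x - mean) / std; for example, a red-channel value of 0.5 maps to (0.5 - 0.485) / 0.229 ≈ 0.066.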
@@ -339,7 +385,10 @@ def get_class_specific_attention(image, model_name, class_query):
         print(error_trace)
         return None, None, f"Error generating attention map: {str(e)}"
 
+
+# ---------------------------
 # Sample Classes
+# ---------------------------
 SAMPLE_CLASSES = [
     "cat", "dog", "tiger", "lion", "elephant",
     "car", "truck", "airplane", "ship", "train",
@@ -348,122 +397,91 @@ SAMPLE_CLASSES = [
     "person", "bicycle", "building", "tree", "flower"
 ]
 
-# Improved Gradio UI
-with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="gray", font=["Inter", "sans-serif"])) as demo:
-    gr.Markdown("""
-    # 🧠 Advanced Image Classification Studio
-    Explore state-of-the-art image classification with multiple models, adversarial testing, and attention visualization.
-    """, elem_classes=["header-text"])
-
-    with gr.Tabs() as tabs:
-        with gr.TabItem("Predict & Analyze"):
-            with gr.Row(variant="panel"):
-                with gr.Column(scale=1, min_width=300):
-                    gr.Markdown("### 📷 Input")
-                    input_image = gr.Image(type="pil", label="Upload Image", height=300, interactive=True)
-                    model_dropdown = gr.Dropdown(
-                        choices=[f"{name} - {MODEL_CONFIGS[name]['desc']}" for name in MODEL_CONFIGS.keys()],
-                        label="Select Model",
-                        value="DeiT-Tiny - Lightweight Vision Transformer",
-                        interactive=True,
-                        info="Choose from various architectures (Transformers, CNNs, Hybrids)"
-                    )
-                    with gr.Group():
-                        gr.Markdown("### Adversarial Testing")
-                        noise_slider = gr.Slider(
-                            minimum=0, maximum=0.3, value=0, step=0.01,
-                            label="Noise Level (ε)",
-                            info="Add noise to test model robustness",
-                            interactive=True
-                        )
-                    run_button = gr.Button("Run Prediction", variant="primary")
-
-                with gr.Column(scale=2):
-                    gr.Markdown("### Results")
-                    output_label = gr.Label(num_top_classes=5, label="Top 5 Predictions", show_label=True)
-                    with gr.Row():
-                        output_image = gr.Image(label="Attention Map (Top Prediction)", height=300)
-                        processed_image = gr.Image(label="Processed Image (with noise)", height=300, visible=False)
-
-        with gr.TabItem("🎨 Class-Specific Attention"):
-            gr.Markdown("### Visualize Model Attention for Specific Classes")
-            with gr.Row(variant="panel"):
-                with gr.Column(scale=1, min_width=300):
-                    class_input = gr.Textbox(
-                        label="Enter Class Name",
-                        placeholder="e.g., cat, dog, car, pizza...",
-                        info="Type any ImageNet class name",
-                        interactive=True
-                    )
-                    class_button = gr.Button("🎯 Generate Attention Map", variant="primary")
-                    with gr.Accordion("💡 Sample Classes", open=False):
-                        sample_buttons = gr.CheckboxGroup(
-                            choices=SAMPLE_CLASSES,
-                            label="Select or click to auto-fill",
-                            interactive=True
-                        )
-
-                with gr.Column(scale=2):
-                    class_output_image = gr.Image(label="Class-Specific Attention Map", height=300)
-                    gradient_legend = gr.Image(label="Attention Scale", interactive=False)
-                    class_status = gr.Textbox(label="Status", interactive=False, lines=2)
-
-        with gr.TabItem("ℹ️ About Models"):
-            gr.Markdown("""
-            ### Available Models
-            Explore different architectures and their strengths:
-            """)
-            for model_name, config in MODEL_CONFIGS.items():
-                with gr.Accordion(f"{model_name}", open=False):
-                    gr.Markdown(f"- **Type**: {config['type'].upper()}")
-                    gr.Markdown(f"- **Description**: {config['desc']}")
-                    gr.Markdown(f"- **Model ID**: {config['id']}")
-
+
+# ---------------------------
+# Gradio UI
+# ---------------------------
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🧠 Enhanced Multi-Model Image Classifier")
+    gr.Markdown("### Features: Adversarial Examples | Class-Specific Attention | 13+ Models")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_image = gr.Image(type="pil", label="📸 Upload Image")
+            model_dropdown = gr.Dropdown(
+                choices=list(MODEL_CONFIGS.keys()),
+                label="🤖 Select Model",
+                value="DeiT-Tiny"
+            )
+
+            gr.Markdown("### Adversarial Noise")
+            noise_slider = gr.Slider(
+                minimum=0,
+                maximum=0.3,
+                value=0,
+                step=0.01,
+                label="Noise Level (ε)",
+                info="Add random noise to test model robustness"
+            )
+
+            run_button = gr.Button("Run Model", variant="primary")
+
+        with gr.Column(scale=2):
+            output_label = gr.Label(num_top_classes=5, label="🎯 Top 5 Predictions")
+            output_image = gr.Image(label="Attention Map (Top Prediction)")
+            processed_image = gr.Image(label="🖼️ Processed Image (with noise if applied)", visible=False)
+
+    gr.Markdown("---")
+    gr.Markdown("### 🎨 Class-Specific Attention Visualization")
+    gr.Markdown("*Type any class name to see where the model looks for that specific object*")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            class_input = gr.Textbox(
+                label="Enter Class Name",
+                placeholder="e.g., cat, dog, car, pizza...",
+                info="Type any ImageNet class name"
+            )
+            class_button = gr.Button("🎯 Generate Class-Specific Attention", variant="primary")
+            gr.Markdown("**💡 Sample classes to try:**")
+            sample_buttons = gr.Radio(
+                choices=SAMPLE_CLASSES,
+                label="Click to auto-fill",
+                interactive=True
+            )
+
+        with gr.Column(scale=2):
+            class_output_image = gr.Image(label="Class-Specific Attention Map")
+            class_status = gr.Textbox(label="Status", interactive=False)
+
+    gr.Markdown("---")
     gr.Markdown("""
-
-
-
-
-
-
-
-
-    # Event Handlers
-    def update_class_input(selected_classes):
-        return selected_classes[0] if selected_classes else ""
-
+    ### 💡 Tips:
+    - **Adversarial Noise**: Adjust the slider to add random noise and see how robust the model is
+    - **Class-Specific Attention**: Type any ImageNet class to visualize what the model looks for (e.g., "tiger", "sports car", "pizza")
+    - **Model Variety**: Try different architectures (ViT, CNN, Hybrid) to compare their behavior
+    """)
+
+    # Event handlers
     run_button.click(
-
+        predict,
         inputs=[input_image, model_dropdown, noise_slider],
-        outputs=[output_label, output_image, processed_image]
-        show_progress=True
+        outputs=[output_label, output_image, processed_image]
     )
-
+
+    # When user selects a sample class, update the text input
     sample_buttons.change(
-
+        lambda x: x,
        inputs=[sample_buttons],
        outputs=[class_input]
    )
-
+
+    # Generate attention map
    class_button.click(
-
+        get_class_specific_attention,
        inputs=[input_image, model_dropdown, class_input],
-        outputs=[class_output_image, gradient_legend, class_status]
-        show_progress=True
+        outputs=[class_output_image, gradient_legend, class_status]
    )
 
-    # Add custom CSS for improved styling
-    gr.HTML("""
-    <style>
-    .header-text { font-size: 2rem; font-weight: bold; color: #1E3A8A; margin-bottom: 1rem; }
-    .footer-text { font-size: 0.9rem; color: #4B5563; }
-    .gr-button { transition: all 0.3s ease; }
-    .gr-button:hover { transform: scale(1.05); }
-    .gr-panel { border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }
-    .gr-image { border-radius: 8px; }
-    .gr-accordion { margin-bottom: 1rem; }
-    </style>
-    """)
-
 if __name__ == "__main__":
     demo.launch()
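One wiring detail worth flagging in the rolled-back UI: class_button.click still lists gradient_legend among its outputs (new line 483), but the component of that name was only created in the removed tabbed layout (old line 408); the new layout defines only class_output_image and class_status. A minimal sketch of one way to keep the handler's three outputs valid, assuming the legend image is simply re-added next to the status box (a suggestion, not part of this commit):

        with gr.Column(scale=2):
            class_output_image = gr.Image(label="Class-Specific Attention Map")
            gradient_legend = gr.Image(label="Attention Scale", interactive=False)
            class_status = gr.Textbox(label="Status", interactive=False)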