Spaces:

sagar007
/

SegmentVision

Sleeping

App Files Files Community

sagar007 committed on Mar 25

Commit

b066832

verified ·

1 Parent(s): e31b682

Update app.py

Browse files

Files changed (1) hide show

app.py +237 -180

app.py CHANGED Viewed

@@ -1,220 +1,277 @@
 import gradio as gr
 import torch
-from PIL import Image
-import cv2
 import numpy as np
-from transformers import CLIPProcessor, CLIPModel
-from ultralytics import FastSAM
-import supervision as sv
 import os
-import requests
-from tqdm.auto import tqdm  # For a nice progress bar
-# --- Constants and Model Initialization ---
-# CLIP
-CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
-# FastSAM
-# *Corrected* HuggingFace link for the weights
-FASTSAM_WEIGHTS_URL = "https://huggingface.co/spaces/An-619/FastSAM/resolve/6f76f474c656d2cb29599f49c296a8784b02d04b/weights/FastSAM-s.pt"
-FASTSAM_WEIGHTS_NAME = "FastSAM-s.pt"
-# Default FastSAM parameters
-DEFAULT_IMGSZ = 640
-DEFAULT_CONFIDENCE = 0.4
-DEFAULT_IOU = 0.9
-DEFAULT_RETINA_MASKS = False
-# --- Helper Functions ---
-def download_file(url, filename):
-    """Downloads a file from a URL with a progress bar."""
-    response = requests.get(url, stream=True)
-    response.raise_for_status()  # Raise an exception for bad status codes
-    total_size = int(response.headers.get('content-length', 0))
-    block_size = 1024  # 1 KB
-    progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
-    with open(filename, 'wb') as file:
-        for data in response.iter_content(block_size):
-            progress_bar.update(len(data))
-            file.write(data)
-    progress_bar.close()
-    if total_size != 0 and progress_bar.n != total_size:
-        raise ValueError("Error: Download failed.")
-# --- Model Loading ---
-# Load CLIP model (this part is correct in your original code)
-model = CLIPModel.from_pretrained(CLIP_MODEL_NAME)
-processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
-# Load FastSAM model with dynamic device handling
-if not os.path.exists(FASTSAM_WEIGHTS_NAME):
-    print(f"Downloading FastSAM weights from {FASTSAM_WEIGHTS_URL}...")
-    try:
-        download_file(FASTSAM_WEIGHTS_URL, FASTSAM_WEIGHTS_NAME)
-        print("FastSAM weights downloaded successfully.")
     except Exception as e:
-        print(f"Error downloading FastSAM weights: {e}")
-        raise  # Re-raise the exception to stop execution
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-fast_sam = FastSAM(FASTSAM_WEIGHTS_NAME)
-fast_sam.to(device)
-print(f"FastSAM loaded on device: {device}")
-# --- Processing Functions ---
-def process_image_clip(image, text_input):
-    # ... (Your CLIP processing function remains the same) ...
-    if image is None:
-        return "Please upload an image first."
-    if not text_input:
-        return "Please enter some text to check in the image."
     try:
-        # Convert numpy array to PIL Image if needed
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-        # Create a list of candidate labels
-        candidate_labels = [text_input, f"not {text_input}"]
-        # Process image and text
-        inputs = processor(
-            images=image,
-            text=candidate_labels,
-            return_tensors="pt",
-            padding=True
-        )
-        # Get model predictions
-        outputs = model(**{k: v for k, v in inputs.items()})
-        logits_per_image = outputs.logits_per_image
-        probs = logits_per_image.softmax(dim=1)
-        # Get confidence for the positive label
-        confidence = float(probs[0][0])
-        return f"Confidence that the image contains '{text_input}': {confidence:.2%}"
-    except Exception as e:
-        return f"Error processing image: {str(e)}"
-def process_image_fastsam(image, imgsz, conf, iou, retina_masks):
-    if image is None:
-        return None, "Please upload an image to segment."
-    try:
-        # Convert PIL image to numpy array if needed
-        if isinstance(image, Image.Image):
-            image_np = np.array(image)
-        else:
-            image_np = image
-        # Run FastSAM inference
-        results = fast_sam(image_np, device=device, retina_masks=retina_masks, imgsz=imgsz, conf=conf, iou=iou)
-        # Check if results are valid
-        if results is None or len(results) == 0 or results[0] is None:
-          return None, "FastSAM did not return valid results. Try adjusting parameters or using a different image."
-        # Get detections
-        detections = sv.Detections.from_ultralytics(results[0])
-          # Check if detections are valid
-        if detections is None or len(detections) == 0:
-          return None, "No objects detected in the image. Try lowering the confidence threshold."
-        # Create annotator
-        box_annotator = sv.BoxAnnotator()
-        mask_annotator = sv.MaskAnnotator()
-        # Annotate image
-        annotated_image = mask_annotator.annotate(scene=image_np.copy(), detections=detections)
-        annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
-        return Image.fromarray(annotated_image), None # Return None for the error message since there's no error
-    except RuntimeError as re:
-      if "out of memory" in str(re).lower():
-          return None, "Error: Out of memory. Try reducing the image size (imgsz) or disabling retina masks."
-      else:
-          return None, f"Runtime error during FastSAM processing: {str(re)}"
     except Exception as e:
-        return None, f"Error processing image with FastSAM: {str(e)}"
 # --- Gradio Interface ---
-with gr.Blocks(css="footer {visibility: hidden}") as demo:
-    # ... (Your Markdown and CLIP tab remain mostly the same) ...
-    gr.Markdown("""
-    # CLIP and FastSAM Demo
-    This demo combines two powerful AI models:
-    - **CLIP**: For zero-shot image classification
-    - **FastSAM**: For automatic image segmentation
-    Try uploading an image and use either of the tabs below!
-    """)
-    with gr.Tab("CLIP Zero-Shot Classification"):
-        with gr.Row():
-            image_input = gr.Image(label="Input Image")
-            text_input = gr.Textbox(
-                label="What do you want to check in the image?",
-                placeholder="e.g., 'a dog', 'sunset', 'people playing'",
-                info="Enter any concept you want to check in the image"
             )
-        output_text = gr.Textbox(label="Result")
-        classify_btn = gr.Button("Classify")
-        classify_btn.click(fn=process_image_clip, inputs=[image_input, text_input], outputs=output_text)
-        gr.Examples(
-            examples=[
-                ["https://raw.githubusercontent.com/gradio-app/gradio/main/demo/kitchen/kitchen.png", "kitchen"],
-                ["https://raw.githubusercontent.com/gradio-app/gradio/main/demo/calculator/calculator.jpg", "calculator"],
-            ],
-            inputs=[image_input, text_input],
-        )
-    with gr.Tab("FastSAM Segmentation"):
-        with gr.Row():
-            image_input_sam = gr.Image(label="Input Image")
-            with gr.Column():
-                imgsz_slider = gr.Slider(minimum=320, maximum=1920, step=32, value=DEFAULT_IMGSZ, label="Image Size (imgsz)")
-                conf_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=DEFAULT_CONFIDENCE, label="Confidence Threshold")
-                iou_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=DEFAULT_IOU, label="IoU Threshold")
-                retina_checkbox = gr.Checkbox(label="Retina Masks", value=DEFAULT_RETINA_MASKS)
-        with gr.Row():
-          image_output = gr.Image(label="Segmentation Result")
-          error_output = gr.Textbox(label="Error Message", type="text") # Added for displaying errors
-        segment_btn = gr.Button("Segment")
-        segment_btn.click(
-            fn=process_image_fastsam,
-            inputs=[image_input_sam, imgsz_slider, conf_slider, iou_slider, retina_checkbox],
-            outputs=[image_output, error_output] # Output to both image and error textboxes
-        )
-        gr.Examples(
-            examples=[
-                ["https://raw.githubusercontent.com/gradio-app/gradio/main/demo/kitchen/kitchen.png"],
-                ["https://raw.githubusercontent.com/gradio-app/gradio/main/demo/calculator/calculator.jpg"],
-            ],
-            inputs=[image_input_sam],
-        )
-    # ... (Your final Markdown remains the same) ...
-    gr.Markdown("""
-    ### How to use:
-    1. **CLIP Classification**: Upload an image and enter text to check if that concept exists in the image
-    2. **FastSAM Segmentation**: Upload an image to get automatic segmentation with bounding boxes and masks
-    ### Note:
-    - The models run on CPU by default, so processing might take a few seconds. If you have a GPU, it will be used automatically.
-    - For best results, use clear images with good lighting.
-    - You can adjust FastSAM parameters (Image Size, Confidence, IoU, Retina Masks) in the Segmentation tab.
-    """)
-demo.launch(share=True)

 import gradio as gr
 import torch
+from transformers import AutoProcessor, AutoModel
+from PIL import Image, ImageDraw, ImageFont
 import numpy as np
+import random
 import os
+import wget # To download weights
+# --- Configuration & Model Loading ---
+# Device Selection: "cuda" when torch reports CUDA as available, otherwise "cpu".
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {DEVICE}")
+# --- CLIP Setup ---
+# Model id handed to AutoProcessor / AutoModel in load_clip_model().
+CLIP_MODEL_ID = "openai/clip-vit-base-patch32"
+# Module-level caches; both stay None until load_clip_model() assigns them.
+clip_processor = None
+clip_model = None
+def load_clip_model():
+    """Fill the module-level CLIP processor and model if they are still unset.
+
+    Each object is created at most once; whatever is already loaded is left
+    alone. The model is moved to ``DEVICE`` right after it is created.
+    """
+    global clip_processor, clip_model
+    if clip_processor is None:
+        print(f"Loading CLIP processor: {CLIP_MODEL_ID}...")
+        clip_processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)
+        print("CLIP processor loaded.")
+    if clip_model is not None:
+        # Model already cached; nothing left to do.
+        return
+    print(f"Loading CLIP model: {CLIP_MODEL_ID}...")
+    loaded_model = AutoModel.from_pretrained(CLIP_MODEL_ID)
+    clip_model = loaded_model.to(DEVICE)
+    print(f"CLIP model loaded to {DEVICE}.")
+# --- FastSAM Setup ---
+# Use a smaller model suitable for Spaces CPU/basic GPU if needed
+FASTSAM_CHECKPOINT = "FastSAM-s.pt"
+# The checkpoint sits under weights/ in the An-619/FastSAM Space. This is the
+# same pinned revision the previous version of app.py downloaded from; a URL
+# without the weights/ segment does not point at the file.
+FASTSAM_CHECKPOINT_URL = (
+    "https://huggingface.co/spaces/An-619/FastSAM/resolve/"
+    "6f76f474c656d2cb29599f49c296a8784b02d04b/weights/"
+    f"{FASTSAM_CHECKPOINT}"
+)
+# Module-level cache; stays None until load_fastsam_model() succeeds.
+fastsam_model = None
+def download_fastsam_weights():
+    """Make sure the FastSAM checkpoint file is present in the working directory.
+
+    Returns:
+        True when ``FASTSAM_CHECKPOINT`` exists on disk (already there or just
+        downloaded), False when the download raised or the file is missing.
+    """
+    if os.path.exists(FASTSAM_CHECKPOINT):
+        return True
+    print(f"Downloading FastSAM weights: {FASTSAM_CHECKPOINT}...")
+    try:
+        wget.download(FASTSAM_CHECKPOINT_URL, FASTSAM_CHECKPOINT)
+    except Exception as e:
+        print(f"Error downloading FastSAM weights: {e}")
+        print("Please ensure the URL is correct and reachable, or manually place the weights file.")
+        return False
+    print("FastSAM weights downloaded.")
+    # Re-check the disk rather than trusting the download call.
+    return os.path.exists(FASTSAM_CHECKPOINT)
+def load_fastsam_model():
+    """Create the module-level FastSAM model once, downloading weights if needed.
+
+    Failures are reported with ``print`` and leave ``fastsam_model`` as None;
+    no exception escapes from the import or the model construction.
+    """
+    global fastsam_model
+    if fastsam_model is not None:
+        return
+    if not download_fastsam_weights():
+        print("FastSAM weights not found. Cannot load model.")
+        return
+    try:
+        # Imported lazily so a missing 'fastsam' package is reported here
+        # instead of failing the whole module import.
+        from fastsam import FastSAM, FastSAMPrompt
+        print(f"Loading FastSAM model: {FASTSAM_CHECKPOINT}...")
+        fastsam_model = FastSAM(FASTSAM_CHECKPOINT)
+        print("FastSAM model loaded.")
+    except ImportError:
+        print("Error: 'fastsam' library not found. Please install it (pip install fastsam).")
+    except Exception as e:
+        print(f"Error loading FastSAM model: {e}")
+# --- Processing Functions ---
+# CLIP Zero-Shot Classification Function
+def run_clip_zero_shot(image: Image.Image, text_labels: str):
+    """Score an image against comma-separated candidate labels with CLIP.
+
+    Args:
+        image: PIL image to classify; converted to RGB when in another mode.
+        text_labels: Comma-separated candidate labels, e.g. "cat, dog, car".
+
+    Returns:
+        A ``(result, image)`` pair. On success ``result`` maps each label to
+        its softmax probability and ``image`` is the RGB image that was
+        scored. On any failure ``result`` is a message string and ``image``
+        is None.
+    """
+    if clip_model is None or clip_processor is None:
+        load_clip_model()  # Attempt to load if not already loaded
+        if clip_model is None or clip_processor is None:
+            return "Error: CLIP Model not loaded. Check logs.", None
+    if not text_labels:
+        return "Please provide comma-separated text labels.", None
+    if image is None:
+        return "Please upload an image.", None
+    # Drop blank entries ("cat,,dog", a trailing comma, ",") and repeated
+    # labels. str.split always yields at least one item, so without the
+    # filter the emptiness check below could never fire; duplicates would
+    # collapse into one dict key and the shown probabilities would no longer
+    # sum to 1.
+    stripped = (part.strip() for part in text_labels.split(','))
+    labels = list(dict.fromkeys(label for label in stripped if label))
+    if not labels:
+        return "No valid labels provided.", None
+    print(f"Running CLIP zero-shot classification with labels: {labels}")
+    try:
+        # Ensure image is RGB
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(DEVICE)
+        with torch.no_grad():
+            outputs = clip_model(**inputs)
+            # logits_per_image holds the image-text similarity scores;
+            # softmax over the label axis turns them into probabilities.
+            probs = outputs.logits_per_image.softmax(dim=1)
+        print("CLIP processing complete.")
+        # Format output for Gradio Label: {label: probability}
+        confidences = dict(zip(labels, probs[0].tolist()))
+        return confidences, image
     except Exception as e:
+        print(f"Error during CLIP processing: {e}")
+        return f"An error occurred: {e}", None
+# FastSAM Segmentation Function
+def _fastsam_masks_to_numpy(ann):
+    """Normalise the result of ``everything_prompt()`` to an (N, H, W) array.
+
+    Accepts the ``[{'masks': ...}]`` layout this app was written against and
+    also a bare tensor/array of masks.
+    NOTE(review): upstream FastSAM appears to return the mask tensor directly;
+    confirm against the installed ``fastsam`` package.
+
+    Returns None when there are no masks.
+    """
+    if ann is None:
+        return None
+    masks = ann
+    if isinstance(ann, (list, tuple)):
+        if len(ann) == 0 or ann[0] is None:
+            return None
+        if isinstance(ann[0], dict):
+            masks = ann[0].get('masks')
+            if masks is None:
+                return None
+    if hasattr(masks, "cpu"):
+        # Torch tensor: bring it to host memory before converting.
+        masks = masks.cpu().numpy()
+    masks = np.asarray(masks)
+    if masks.ndim == 2:
+        masks = masks[np.newaxis, ...]
+    if masks.ndim != 3 or masks.shape[0] == 0:
+        return None
+    return masks
+def run_fastsam_segmentation(image_pil: Image.Image, conf_threshold: float = 0.4, iou_threshold: float = 0.9):
+    """Segment everything in an image with FastSAM and overlay the masks.
+
+    Args:
+        image_pil: PIL image to segment; converted to RGB when needed.
+        conf_threshold: Confidence threshold passed to FastSAM as ``conf``.
+        iou_threshold: IoU threshold passed to FastSAM as ``iou``.
+
+    Returns:
+        One RGB PIL image with a random semi-transparent colour per mask.
+        A single value is returned because the click handler and the examples
+        bind this function to exactly one ``gr.Image`` output.
+
+    Raises:
+        gr.Error: When no image was given, the model could not be loaded, or
+            inference/plotting failed. Gradio shows the message to the user;
+            returning the text instead would be taken for an image value.
+    """
+    if image_pil is None:
+        raise gr.Error("Please upload an image.")
+    if fastsam_model is None:
+        load_fastsam_model()  # Attempt to load if not already loaded
+        if fastsam_model is None:
+            raise gr.Error("Error: FastSAM Model not loaded. Check logs.")
+    print("Running FastSAM segmentation...")
     try:
+        # Ensure image is RGB
+        if image_pil.mode != "RGB":
+            image_pil = image_pil.convert("RGB")
+        image_np_rgb = np.array(image_pil)
+        # Run FastSAM inference. Higher imgsz = more detail, slower.
+        everything_results = fastsam_model(
+            image_np_rgb,
+            device=DEVICE,
+            retina_masks=True,
+            imgsz=640,  # Smaller size for faster inference on limited hardware
+            conf=conf_threshold,
+            iou=iou_threshold,
+        )
+        # Process results using FastSAMPrompt
+        from fastsam import FastSAMPrompt
+        prompt_process = FastSAMPrompt(image_np_rgb, everything_results, device=DEVICE)
+        masks = _fastsam_masks_to_numpy(prompt_process.everything_prompt())
+        print(f"FastSAM found {0 if masks is None else len(masks)} masks.")
+        # --- Plotting Masks on Image (Manual) ---
+        output_image = image_pil.copy()
+        if masks is not None:
+            overlay = Image.new('RGBA', output_image.size, (0, 0, 0, 0))
+            draw = ImageDraw.Draw(overlay)
+            for mask in masks:
+                # Random colour with half transparency (RGBA)
+                color = (random.randint(50, 255), random.randint(50, 255), random.randint(50, 255), 128)
+                # 2-D uint8 array -> single-channel ('L') stencil image
+                mask_image = Image.fromarray((mask > 0).astype(np.uint8) * 255)
+                if mask_image.size != output_image.size:
+                    # Keep the stencil aligned if masks come back at another resolution.
+                    mask_image = mask_image.resize(output_image.size, Image.NEAREST)
+                draw.bitmap((0, 0), mask_image, fill=color)
+            # Composite the overlay onto the original image
+            output_image = Image.alpha_composite(output_image.convert('RGBA'), overlay).convert('RGB')
+        print("FastSAM processing and plotting complete.")
+        return output_image
     except Exception as e:
+        print(f"Error during FastSAM processing: {e}")
+        import traceback
+        traceback.print_exc()  # Print detailed traceback
+        raise gr.Error(f"An error occurred: {e}") from e
 # --- Gradio Interface ---
+# Pre-load models on startup (optional but good for performance)
+# load_fastsam_model() reports its own failures with print and leaves the
+# model unset; the processing functions retry loading on first use.
+print("Attempting to preload models...")
+load_clip_model()
+load_fastsam_model()
+print("Preloading finished (or attempted).")
+# Example images are prepared BEFORE the UI is built, and only rows whose
+# image file exists are offered. Building gr.Examples from missing files
+# would otherwise break the app on a fresh checkout.
+def _prepare_example_images():
+    """Create 'examples/' on a fresh environment and fetch sample images (best effort)."""
+    if os.path.exists("examples"):
+        return
+    os.makedirs("examples")
+    print("Created 'examples' directory. Please add some images (e.g., astronaut.jpg, dog_bike.jpg) for the examples to work.")
+    try:
+        print("Downloading example images...")
+        wget.download("https://huggingface.co/spaces/gradio/image-segmentation/resolve/main/images/lion.jpg", "examples/lion.jpg")
+        wget.download("https://raw.githubusercontent.com/openai/CLIP/main/CLIP.png", "examples/clip_logo.png")
+        wget.download("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/gradio-logo.png", "examples/gradio_logo.png")
+        print("Example images downloaded (or attempted). Please verify.")
+    except Exception as e:
+        print(f"Could not download example images: {e}")
+def _existing_examples(rows):
+    """Keep only the example rows whose first item (the image path) exists."""
+    return [row for row in rows if os.path.exists(row[0])]
+_prepare_example_images()
+CLIP_EXAMPLES = _existing_examples([
+    ["examples/astronaut.jpg", "astronaut, moon, rover, mountain"],
+    ["examples/dog_bike.jpg", "dog, bicycle, person, park, grass"],
+    ["examples/lion.jpg", "lion, tiger, dog, cat"],
+])
+FASTSAM_EXAMPLES = _existing_examples([
+    ["examples/dogs.jpg", 0.4, 0.9],
+    ["examples/fruits.jpg", 0.5, 0.8],
+    ["examples/lion.jpg", 0.4, 0.9],
+])
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# CLIP & FastSAM Demo")
+    gr.Markdown("Explore Zero-Shot Classification with CLIP and 'Segment Anything' with FastSAM.")
+    with gr.Tabs():
+        # --- CLIP Tab ---
+        with gr.TabItem("CLIP Zero-Shot Classification"):
+            gr.Markdown("Upload an image and provide comma-separated candidate labels (e.g., 'cat, dog, car'). CLIP will predict the probability of the image matching each label.")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    clip_input_image = gr.Image(type="pil", label="Input Image")
+                    clip_text_labels = gr.Textbox(label="Comma-Separated Labels", placeholder="e.g., astronaut, mountain, dog playing fetch")
+                    clip_button = gr.Button("Run CLIP Classification", variant="primary")
+                with gr.Column(scale=1):
+                    clip_output_label = gr.Label(label="Classification Probabilities")
+                    clip_output_image_display = gr.Image(type="pil", label="Input Image Preview") # Show input for context
+            clip_button.click(
+                run_clip_zero_shot,
+                inputs=[clip_input_image, clip_text_labels],
+                outputs=[clip_output_label, clip_output_image_display]
+            )
+            # Skip the examples widget entirely when no example image is on disk.
+            if CLIP_EXAMPLES:
+                gr.Examples(
+                    examples=CLIP_EXAMPLES,
+                    inputs=[clip_input_image, clip_text_labels],
+                    outputs=[clip_output_label, clip_output_image_display],
+                    fn=run_clip_zero_shot,
+                    cache_examples=False, # Re-run for live demo
+                )
+        # --- FastSAM Tab ---
+        with gr.TabItem("FastSAM Segmentation"):
+            gr.Markdown("Upload an image. FastSAM will attempt to segment all objects/regions in the image.")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    fastsam_input_image = gr.Image(type="pil", label="Input Image")
+                    with gr.Row():
+                        fastsam_conf = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold")
+                        fastsam_iou = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="IoU Threshold")
+                    fastsam_button = gr.Button("Run FastSAM Segmentation", variant="primary")
+                with gr.Column(scale=1):
+                    fastsam_output_image = gr.Image(type="pil", label="Segmented Image")
+            fastsam_button.click(
+                run_fastsam_segmentation,
+                inputs=[fastsam_input_image, fastsam_conf, fastsam_iou],
+                outputs=[fastsam_output_image]
+            )
+            if FASTSAM_EXAMPLES:
+                gr.Examples(
+                    examples=FASTSAM_EXAMPLES,
+                    inputs=[fastsam_input_image, fastsam_conf, fastsam_iou],
+                    outputs=[fastsam_output_image],
+                    fn=run_fastsam_segmentation,
+                    cache_examples=False, # Re-run for live demo
+                )
+# Launch the Gradio app
+# Guarded so that importing this module does not start the server.
+if __name__ == "__main__":
+    demo.launch(debug=True) # Set debug=False for deployment