# Source: Hugging Face Space akhaliq/moondream3-preview ("Running on Zero").
# File size: 10,429 bytes

import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from PIL import Image
import numpy as np
from io import BytesIO
import spaces

# Module-level cache for the model; stays None until load_model() first runs.
model = None


def load_model():
    """Return the shared model instance, creating it on the first call.

    On first use this loads "moondream/moondream3-preview" through
    ``AutoModelForCausalLM.from_pretrained`` (remote code trusted, bfloat16,
    whole model mapped to "cuda"), stores it in the module-level ``model``
    variable, and calls ``model.compile()``. Later calls return the stored
    object without loading again.
    """
    global model

    # Fast path: a previous call already populated the cache.
    if model is not None:
        return model

    load_options = {
        "trust_remote_code": True,
        "dtype": torch.bfloat16,
        "device_map": {"": "cuda"},
    }
    model = AutoModelForCausalLM.from_pretrained(
        "moondream/moondream3-preview",
        **load_options,
    )
    model.compile()
    return model

def _annotate_points(image, points, radius=10, color=(255, 0, 0)):
    """Return a copy of *image* with a filled disc drawn at every point.

    Args:
        image: Numpy image array indexed as ``[row, column, ...]``.
        points: Iterable of mappings with ``"x"`` and ``"y"`` entries, each
            scaled by the image width/height to get pixel coordinates.
        radius: Disc radius in pixels, used for both drawing back-ends.
        color: Value written into each covered pixel.
    """
    canvas = image.copy()
    height, width = canvas.shape[:2]

    # OpenCV is optional: resolve it once instead of once per point, and only
    # treat a missing package (not arbitrary errors) as "unavailable".
    try:
        import cv2
    except ImportError:
        cv2 = None

    for point in points:
        cx = int(point["x"] * width)
        cy = int(point["y"] * height)

        if cv2 is not None:
            cv2.circle(canvas, (cx, cy), radius, color, -1)
            continue

        # Numpy fallback: paint a disc inside the window clipped to the image.
        top, bottom = max(cy - radius, 0), min(cy + radius + 1, height)
        left, right = max(cx - radius, 0), min(cx + radius + 1, width)
        if top >= bottom or left >= right:
            continue  # the whole disc lies outside the image
        rows, cols = np.ogrid[top:bottom, left:right]
        disc = (cols - cx) ** 2 + (rows - cy) ** 2 <= radius ** 2
        canvas[top:bottom, left:right][disc] = color

    return canvas


def _annotate_boxes(image, objects, thickness=3, color=(0, 255, 0)):
    """Return a copy of *image* with a rectangular outline per detected object.

    Args:
        image: Numpy image array indexed as ``[row, column, ...]``.
        objects: Iterable of mappings with ``"x_min"``, ``"y_min"``,
            ``"x_max"`` and ``"y_max"`` entries, each scaled by the image
            width/height to get pixel coordinates.
        thickness: Border thickness in pixels.
        color: Value written into each border pixel.
    """
    canvas = image.copy()
    height, width = canvas.shape[:2]

    for obj in objects:
        # Clamp to the image so a slightly out-of-range coordinate cannot
        # become a negative index (which would wrap around in a slice).
        x0 = min(max(int(obj["x_min"] * width), 0), width)
        y0 = min(max(int(obj["y_min"] * height), 0), height)
        x1 = min(max(int(obj["x_max"] * width), 0), width)
        y1 = min(max(int(obj["y_max"] * height), 0), height)
        if x1 <= x0 or y1 <= y0:
            continue  # degenerate box: nothing to draw

        # Top and bottom borders
        canvas[y0:min(y0 + thickness, y1), x0:x1] = color
        canvas[max(y1 - thickness, y0):y1, x0:x1] = color
        # Left and right borders
        canvas[y0:y1, x0:min(x0 + thickness, x1)] = color
        canvas[y0:y1, max(x1 - thickness, x0):x1] = color

    return canvas


@spaces.GPU(duration=120)
def process_image(image, task, question, caption_length, object_query, reasoning, temperature, top_p, max_tokens):
    """Run one task on the model and return text plus optional visualizations.

    Args:
        image: Numpy image array, or None. Required for every task except
            "Query".
        task: One of "Query", "Caption", "Point" or "Detect".
        question: Question text, used by "Query".
        caption_length: Caption length label; lower-cased before being passed
            to the model. Used by "Caption".
        object_query: Description of the object to locate, used by "Point"
            and "Detect".
        reasoning: Forwarded to ``model.query`` as ``reasoning``.
        temperature: Sampling setting forwarded in ``settings``.
        top_p: Sampling setting forwarded in ``settings``.
        max_tokens: Sampling setting forwarded in ``settings``.

    Returns:
        A 3-tuple ``(text, points_image, boxes_image)``. ``points_image`` is
        set only for "Point" and ``boxes_image`` only for "Detect"; the
        unused slots are None. Missing inputs and unknown tasks produce an
        explanatory message in ``text`` with both images None.
    """
    model = load_model()

    settings = {
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
    }

    if task == "Query":
        if not (question or "").strip():
            return "Please enter a question", None, None
        query_kwargs = {
            "question": question,
            "reasoning": reasoning,
            "settings": settings,
        }
        # The image is optional for queries; pass it only when one was given.
        if image is not None:
            query_kwargs["image"] = Image.fromarray(image)
        result = model.query(**query_kwargs)
        return result["answer"], None, None

    if task == "Caption":
        if image is None:
            return "Please upload an image for captioning", None, None
        result = model.caption(
            Image.fromarray(image),
            length=caption_length.lower(),
            settings=settings,
        )
        return result["caption"], None, None

    if task == "Point":
        if image is None:
            return "Please upload an image for point detection", None, None
        if not (object_query or "").strip():
            return "Please enter an object to find", None, None
        result = model.point(Image.fromarray(image), object_query)
        points = result.get("points", [])

        lines = ["Points found:"]
        lines.extend(
            f"Point {i}: x={point['x']:.3f}, y={point['y']:.3f}"
            for i, point in enumerate(points, start=1)
        )
        points_text = "\n".join(lines) + "\n"

        return points_text, _annotate_points(image, points), None

    if task == "Detect":
        if image is None:
            return "Please upload an image for object detection", None, None
        if not (object_query or "").strip():
            return "Please enter an object to find", None, None

        detect_settings = settings.copy()
        detect_settings["max_objects"] = 10

        result = model.detect(Image.fromarray(image), object_query, settings=detect_settings)
        objects = result.get("objects", [])

        lines = ["Objects detected:"]
        lines.extend(
            f"Object {i}: x_min={obj['x_min']:.3f}, y_min={obj['y_min']:.3f}, x_max={obj['x_max']:.3f}, y_max={obj['y_max']:.3f}"
            for i, obj in enumerate(objects, start=1)
        )
        boxes_text = "\n".join(lines) + "\n"

        return boxes_text, None, _annotate_boxes(image, objects)

    # Always hand back a 3-tuple so callers can unpack the result safely.
    return f"Unknown task: {task}", None, None

# Build the Gradio UI. Components are created inside nested Row/Column
# contexts, so the order of the statements below defines the page layout.
with gr.Blocks(title="Moondream 3 Preview", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
        # 🌙 Moondream 3 Preview - Vision Language Model
        
        Experience the power of Moondream 3, a state-of-the-art vision language model with mixture-of-experts architecture.
        This demo showcases all four skills: Query, Caption, Point, and Detect.
        
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )
    
    with gr.Row():
        # Left column: image input, task selector, per-task options, settings.
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image (optional for Query)", type="numpy")
            task_type = gr.Radio(
                choices=["Query", "Caption", "Point", "Detect"],
                value="Query",
                label="Select Task"
            )
            
            # Option group for the "Query" task (visible initially, matching
            # the default task above).
            with gr.Column(visible=True) as query_options:
                question_input = gr.Textbox(
                    label="Question",
                    placeholder="Ask anything about the image or enter a text-only question",
                    lines=2
                )
                reasoning_toggle = gr.Checkbox(
                    label="Enable Reasoning (better for complex questions)",
                    value=True
                )
            
            # Option group for the "Caption" task (hidden initially).
            with gr.Column(visible=False) as caption_options:
                caption_length = gr.Radio(
                    choices=["Short", "Normal", "Long"],
                    value="Normal",
                    label="Caption Length"
                )
            
            # Option group shared by the "Point" and "Detect" tasks (hidden
            # initially).
            with gr.Column(visible=False) as point_detect_options:
                object_query_input = gr.Textbox(
                    label="Object to Find",
                    placeholder="e.g., 'person wearing red shirt', 'car', 'dog'",
                    lines=1
                )
            
            gr.Markdown("### Advanced Settings")
            # Generation parameters; passed to process_image on submit.
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p"
                )
                max_tokens = gr.Slider(
                    minimum=50,
                    maximum=2048,
                    value=512,
                    step=50,
                    label="Max Tokens"
                )
            
            submit_btn = gr.Button("🚀 Process", variant="primary")
        
        # Right column: text output plus two image outputs that start hidden.
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Output",
                lines=10,
                show_copy_button=True
            )
            output_image_points = gr.Image(
                label="Visualization (Points)",
                visible=False
            )
            output_image_boxes = gr.Image(
                label="Visualization (Bounding Boxes)",
                visible=False
            )
    
    def update_interface(task):
        """Return visibility updates for the option groups given the task.

        Exactly the group matching ``task`` is made visible ("Point" and
        "Detect" share one group). Both visualization images are hidden.
        """
        return {
            query_options: gr.Column(visible=(task == "Query")),
            caption_options: gr.Column(visible=(task == "Caption")),
            point_detect_options: gr.Column(visible=(task in ["Point", "Detect"])),
            # Hide both visualizations whenever the task changes.
            output_image_points: gr.Image(visible=False),
            output_image_boxes: gr.Image(visible=False)
        }
    
    def process_and_update_visibility(image, task, question, caption_length, object_query, reasoning, temperature, top_p, max_tokens):
        """Call process_image and map its 3-tuple onto the output components.

        Each visualization image is made visible only when process_image
        returned a non-None image for it.
        """
        text_output, points_img, boxes_img = process_image(
            image, task, question, caption_length, object_query, reasoning,
            temperature, top_p, max_tokens
        )
        
        return {
            output_text: text_output,
            output_image_points: gr.Image(value=points_img, visible=(points_img is not None)),
            output_image_boxes: gr.Image(value=boxes_img, visible=(boxes_img is not None))
        }
    
    # Switch the visible option group whenever the selected task changes.
    task_type.change(
        update_interface,
        inputs=[task_type],
        outputs=[query_options, caption_options, point_detect_options, output_image_points, output_image_boxes]
    )
    
    # Run the selected task; the inputs are listed in the same order as the
    # parameters of process_and_update_visibility.
    submit_btn.click(
        process_and_update_visibility,
        inputs=[
            input_image, task_type, question_input, caption_length,
            object_query_input, reasoning_toggle, temperature, top_p, max_tokens
        ],
        outputs=[output_text, output_image_points, output_image_boxes]
    )
    
    # Text-only example queries (no image). Each row lists values in the same
    # order as `inputs`. NOTE(review): no fn/outputs are given, so presumably
    # selecting an example only fills in the inputs - confirm.
    gr.Examples(
        examples=[
            [None, "Query", "Explain the concept of neural networks", "Normal", "", True, 0.7, 0.95, 512],
            [None, "Query", "What is the capital of France?", "Normal", "", False, 0.3, 0.95, 256],
        ],
        inputs=[
            input_image, task_type, question_input, caption_length,
            object_query_input, reasoning_toggle, temperature, top_p, max_tokens
        ],
        label="Example Queries"
    )
    
    # Page footer: model summary and usage tips.
    gr.Markdown(
        """
        ### About Moondream 3
        
        - **Architecture**: 9B total parameters, 2B active, with mixture-of-experts
        - **Skills**: Query (Q&A), Caption, Point detection, Object detection
        - **Features**: 32K context length, multi-crop high resolution processing
        - **Model**: [moondream/moondream3-preview](https://huggingface.co/moondream/moondream3-preview)
        
        ### Tips:
        - **Query**: Ask open-ended questions about images or use for text-only tasks
        - **Caption**: Generate short, normal, or long descriptions of images
        - **Point**: Find specific objects and get their coordinates
        - **Detect**: Get bounding boxes for objects in images
        - Enable reasoning for complex visual understanding tasks
        """
    )

# Start the app; runs at module level with no __main__ guard.
demo.launch()