File size: 9,736 Bytes
57a1dfe
 
d32faf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57a1dfe
d32faf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a429fc9
 
d32faf0
 
 
 
0b1561b
d32faf0
 
f5357fe
d32faf0
 
ba82d5c
d32faf0
a429fc9
d32faf0
 
 
 
a429fc9
d32faf0
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import spaces

import re
from typing import Tuple, Optional

import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from smolvlm_inference import TransformersModel

from prompt import OS_SYSTEM_PROMPT

# --- Configuration ---
# Hugging Face model repo for the GUI-agent checkpoint served by this demo.
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

# --- Model and Processor Loading (Load once) ---
# Loaded at import time so Gradio requests reuse a single model instance.
print(f"Loading model and processor for {MODEL_ID}...")
model = None
processor = None
model_loaded = False
load_error_message = ""
# NOTE(review): `processor`, `model_loaded` and `load_error_message` are never
# read again in this file — confirm they are unused before removing.



# Wrapper around the transformers model; runs on the first CUDA device.
model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)


# UI copy shown in the Gradio page header.
title = "Smol2Operator Demo"

description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face: 
"""



# System prompt that defines the agent's action space (imported from prompt.py).
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT


def get_navigation_prompt(task, image, step=1):
    """Build the chat-format prompt for a single navigation step.

    Args:
        task: The instruction the agent should complete.
        image: The current screenshot of the UI (PIL image).
        step: The current step of the task (unused in this POC; reserved).

    Returns:
        A two-message chat list (system + user) in the multimodal
        content format expected by the model.
    """
    instruction_text = (
        "Please generate the next move according to the UI screenshot, "
        "instruction and previous actions.\n\n"
        f"Instruction: {task}\n\nPrevious actions:\nNone"
    )
    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": SYSTEM_PROMPT},
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": instruction_text},
        ],
    }
    return [system_message, user_message]


def array_to_image(image_array: np.ndarray) -> Image.Image:
    """Convert a numpy image array into a PIL Image.

    Raises:
        ValueError: If *image_array* is None (i.e. no image was uploaded).
    """
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Cast to uint8 before handing the buffer to PIL.
    return Image.fromarray(np.uint8(image_array))


def parse_actions_from_response(response: str) -> list[str]:
    """Extract every ``<code>...</code>`` body from a model response.

    Returns the code snippets in order of appearance (possibly empty).
    """
    code_block_re = r"<code>\n(.*?)\n</code>"
    # DOTALL lets a snippet span multiple lines.
    return re.findall(code_block_re, response, re.DOTALL)


def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract normalized coordinates from action code for localization actions.

    Scans *action_code* for click / double_click / move_mouse / drag calls
    and returns one dict per point with keys 'type', 'x', 'y', 'action'
    (x and y are floats; the model emits them normalized to [0, 1]).
    A drag contributes two entries: 'drag_from' followed by 'drag_to'.
    """
    localization_actions = []

    # Patterns for different action types.
    # FIX: 'click' carries a negative lookbehind so it no longer also matches
    # the 'click(...)' substring inside 'double_click(...)', which previously
    # produced a spurious duplicate 'click' marker for every double-click.
    patterns = {
        'click': r'(?<![a-z_])click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }

    for action_type, pattern in patterns.items():
        for match in re.finditer(pattern, action_code):
            if action_type == 'drag':
                # Drag has from and to coordinates.
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type
                })
            else:
                # Single coordinate actions.
                x_val = match.group(1)
                # Fallback: a single-argument call reuses x as y.
                y_val = match.group(2) if match.group(2) else x_val
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type
                    })

    return localization_actions


def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create a copy of *original_image* with localization markers drawn on it.

    Each entry in *coordinates* is a dict with 'type' (action name), and
    normalized 'x'/'y' in [0, 1] (as produced by
    extract_coordinates_from_action). Returns None when there is nothing
    to draw; the input image is never modified.
    """
    if not coordinates:
        return None
    
    # Create a copy of the original image so the caller's image stays intact.
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    
    # Get image dimensions (used to scale normalized coords to pixels).
    width, height = img_copy.size
    
    # Try to load a font, fallback to default if not available
    font = ImageFont.load_default()

    
    # Color scheme for different actions
    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple'
    }
    
    for i, coord in enumerate(coordinates):
        # Convert normalized coordinates to pixel coordinates
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)
        
        # Get color for this action type (red when the type is unknown).
        color = colors.get(coord['type'], 'red')
        
        # Draw a circle at the coordinate
        circle_radius = 8
        draw.ellipse([
            pixel_x - circle_radius, pixel_y - circle_radius,
            pixel_x + circle_radius, pixel_y + circle_radius
        ], fill=color, outline='white', width=2)
        
        # Add text label with the action type and normalized coordinates.
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        if font:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
        else:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color)
        
        # For drag actions, draw an arrow from the drag_from point to the
        # immediately following drag_to point (relies on the pair ordering
        # produced by extract_coordinates_from_action).
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)
            
            # Draw arrow line
            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
            
            # Draw arrowhead: two points offset from the tip along the
            # normalized direction vector, forming a small triangle.
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2)**0.5
            if length > 0:  # guard against zero-length drags
                dx_norm = dx / length
                dy_norm = dy / length
                
                # Arrowhead points
                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
                
                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')
    
    return img_copy


# --- Gradio processing function ---
# --- Gradio processing function ---
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    """Run one agent step: predict the next action for *task* on the screenshot.

    Args:
        input_numpy_image: Screenshot as a numpy array (from the gr.Image input).
        task: Natural-language instruction for the agent.

    Returns:
        Tuple of (raw model response text, annotated image with action
        markers, or None when no coordinates were found in the response).

    Raises:
        ValueError: If no image was provided or the model failed to load.
    """
    input_pil_image = array_to_image(input_numpy_image)
    assert isinstance(input_pil_image, Image.Image)

    prompt = get_navigation_prompt(task, input_pil_image)


    if model is None:
        raise ValueError("Model not loaded")
    
    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Parse actions from the response (<code>...</code> blocks).
    actions = parse_actions_from_response(navigation_str)
    
    # Extract coordinates from all actions
    all_coordinates = []
    for action_code in actions:
        coordinates = extract_coordinates_from_action(action_code)
        all_coordinates.extend(coordinates)
    
    # Create localized image if there are coordinates
    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(input_pil_image, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")
    
    return navigation_str, localized_image


# --- Load Example Data ---
# Bundled example screenshots, opened once at import time.
# (Previously these names carried a misleading `: str` annotation and were
# immediately rebound from the path string to a PIL image.)
example_1_image = Image.open("./assets/google.png")
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image = Image.open("./assets/huggingface.png")
example_2_task = "Find the most trending model."


# Build the Gradio UI: an image input, a task textbox, and the agent output.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    # gr.Markdown(description)

    with gr.Row():
        input_image_component = gr.Image(label="UI Image", height=500)
    with gr.Row():
        with gr.Column():
            task_component = gr.Textbox(
                label="task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Call Agent", variant="primary")

        with gr.Column():
            output_coords_component = gr.Textbox(label="Agent Output", lines=10)

    # The annotated image is written back into the *input* image component,
    # so the markers appear directly over the uploaded screenshot.
    submit_button.click(navigate, [input_image_component, task_component], [output_coords_component, input_image_component])

    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, input_image_component],
        fn=navigate,
        cache_examples=True,  # runs the examples once and caches the results
    )

# Disable the public API surface; launch in debug mode for verbose logs.
demo.queue(api_open=False)
demo.launch(debug=True)