File size: 9,736 Bytes
57a1dfe
 
d32faf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57a1dfe
d32faf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a429fc9
 
d32faf0
 
 
 
0b1561b
d32faf0
 
f5357fe
d32faf0
 
ba82d5c
d32faf0
a429fc9
d32faf0
 
 
 
a429fc9
d32faf0
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import spaces

import re
from typing import Tuple, Optional

import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from smolvlm_inference import TransformersModel

from prompt import OS_SYSTEM_PROMPT

# --- Configuration ---
# Hugging Face model repo for the GUI-agent checkpoint served by this demo.
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

# --- Model and Processor Loading (Load once) ---
# Loaded at import time so Gradio requests reuse a single model instance.
print(f"Loading model and processor for {MODEL_ID}...")
model = None
processor = None
model_loaded = False
load_error_message = ""
# NOTE(review): `processor`, `model_loaded` and `load_error_message` are never
# read again in this file — confirm they are unused before removing.



# Wrapper around the transformers model; runs on the first CUDA device.
model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)


# UI copy shown in the Gradio page header.
title = "Smol2Operator Demo"

description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face: 
"""



# System prompt that defines the agent's action space (imported from prompt.py).
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT


def get_navigation_prompt(task, image, step=1):
    """Build the chat-format prompt for a single navigation step.

    Args:
        task: The instruction the agent should complete.
        image: The current screenshot of the UI (PIL image).
        step: The current step of the task (unused in this POC; reserved).

    Returns:
        A two-message chat list (system + user) in the multimodal
        content format expected by the model.
    """
    instruction_text = (
        "Please generate the next move according to the UI screenshot, "
        "instruction and previous actions.\n\n"
        f"Instruction: {task}\n\nPrevious actions:\nNone"
    )
    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": SYSTEM_PROMPT},
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": instruction_text},
        ],
    }
    return [system_message, user_message]


def array_to_image(image_array: np.ndarray) -> Image.Image:
    """Convert a numpy image array into a PIL Image.

    Raises:
        ValueError: If *image_array* is None (i.e. no image was uploaded).
    """
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Cast to uint8 before handing the buffer to PIL.
    return Image.fromarray(np.uint8(image_array))


def parse_actions_from_response(response: str) -> list[str]:
    """Extract every ``<code>...</code>`` body from a model response.

    Returns the code snippets in order of appearance (possibly empty).
    """
    code_block_re = r"<code>\n(.*?)\n</code>"
    # DOTALL lets a snippet span multiple lines.
    return re.findall(code_block_re, response, re.DOTALL)


def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract normalized coordinates from action code for localization actions.

    Scans *action_code* for click / double_click / move_mouse / drag calls
    and returns one dict per point with keys 'type', 'x', 'y', 'action'
    (x and y are floats; the model emits them normalized to [0, 1]).
    A drag contributes two entries: 'drag_from' followed by 'drag_to'.
    """
    localization_actions = []

    # Patterns for different action types.
    # FIX: 'click' carries a negative lookbehind so it no longer also matches
    # the 'click(...)' substring inside 'double_click(...)', which previously
    # produced a spurious duplicate 'click' marker for every double-click.
    patterns = {
        'click': r'(?<![a-z_])click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }

    for action_type, pattern in patterns.items():
        for match in re.finditer(pattern, action_code):
            if action_type == 'drag':
                # Drag has from and to coordinates.
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type
                })
            else:
                # Single coordinate actions.
                x_val = match.group(1)
                # Fallback: a single-argument call reuses x as y.
                y_val = match.group(2) if match.group(2) else x_val
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type
                    })

    return localization_actions


def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create a copy of *original_image* with localization markers drawn on it.

    Each entry in *coordinates* is a dict with 'type' (action name), and
    normalized 'x'/'y' in [0, 1] (as produced by
    extract_coordinates_from_action). Returns None when there is nothing
    to draw; the input image is never modified.
    """
    if not coordinates:
        return None
    
    # Create a copy of the original image so the caller's image stays intact.
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    
    # Get image dimensions (used to scale normalized coords to pixels).
    width, height = img_copy.size
    
    # Try to load a font, fallback to default if not available
    font = ImageFont.load_default()

    
    # Color scheme for different actions
    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple'
    }
    
    for i, coord in enumerate(coordinates):
        # Convert normalized coordinates to pixel coordinates
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)
        
        # Get color for this action type (red when the type is unknown).
        color = colors.get(coord['type'], 'red')
        
        # Draw a circle at the coordinate
        circle_radius = 8
        draw.ellipse([
            pixel_x - circle_radius, pixel_y - circle_radius,
            pixel_x + circle_radius, pixel_y + circle_radius
        ], fill=color, outline='white', width=2)
        
        # Add text label with the action type and normalized coordinates.
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        if font:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
        else:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color)
        
        # For drag actions, draw an arrow from the drag_from point to the
        # immediately following drag_to point (relies on the pair ordering
        # produced by extract_coordinates_from_action).
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)
            
            # Draw arrow line
            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
            
            # Draw arrowhead: two points offset from the tip along the
            # normalized direction vector, forming a small triangle.
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2)**0.5
            if length > 0:  # guard against zero-length drags
                dx_norm = dx / length
                dy_norm = dy / length
                
                # Arrowhead points
                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
                
                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')
    
    return img_copy


# --- Gradio processing function ---
# --- Gradio processing function ---
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    """Run one agent step: predict the next action for *task* on the screenshot.

    Args:
        input_numpy_image: Screenshot as a numpy array (from the gr.Image input).
        task: Natural-language instruction for the agent.

    Returns:
        Tuple of (raw model response text, annotated image with action
        markers, or None when no coordinates were found in the response).

    Raises:
        ValueError: If no image was provided or the model failed to load.
    """
    input_pil_image = array_to_image(input_numpy_image)
    assert isinstance(input_pil_image, Image.Image)

    prompt = get_navigation_prompt(task, input_pil_image)


    if model is None:
        raise ValueError("Model not loaded")
    
    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Parse actions from the response (<code>...</code> blocks).
    actions = parse_actions_from_response(navigation_str)
    
    # Extract coordinates from all actions
    all_coordinates = []
    for action_code in actions:
        coordinates = extract_coordinates_from_action(action_code)
        all_coordinates.extend(coordinates)
    
    # Create localized image if there are coordinates
    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(input_pil_image, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")
    
    return navigation_str, localized_image


# --- Load Example Data ---
# Bundled example screenshots, opened once at import time.
# (Previously these names carried a misleading `: str` annotation and were
# immediately rebound from the path string to a PIL image.)
example_1_image = Image.open("./assets/google.png")
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image = Image.open("./assets/huggingface.png")
example_2_task = "Find the most trending model."


# Build the Gradio UI: an image input, a task textbox, and the agent output.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    # gr.Markdown(description)

    with gr.Row():
        input_image_component = gr.Image(label="UI Image", height=500)
    with gr.Row():
        with gr.Column():
            task_component = gr.Textbox(
                label="task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Call Agent", variant="primary")

        with gr.Column():
            output_coords_component = gr.Textbox(label="Agent Output", lines=10)

    # The annotated image is written back into the *input* image component,
    # so the markers appear directly over the uploaded screenshot.
    submit_button.click(navigate, [input_image_component, task_component], [output_coords_component, input_image_component])

    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, input_image_component],
        fn=navigate,
        cache_examples=True,  # runs the examples once and caches the results
    )

# Disable the public API surface; launch in debug mode for verbose logs.
demo.queue(api_open=False)
demo.launch(debug=True)