# Smol2Operator demo: Gradio app for a Hugging Face Space (running on ZeroGPU).
import spaces
import re
from typing import Tuple, Optional
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from smolvlm_inference import TransformersModel
from prompt import OS_SYSTEM_PROMPT
# --- Configuration ---
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

# --- Model Loading (load once at startup) ---
print(f"Loading model and processor for {MODEL_ID}...")
model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)
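# Note: TransformersModel is a thin wrapper defined in the local
# smolvlm_inference module; based on how it is used below, its generate()
# takes a chat-format message list and returns the decoded response string.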
title = "Smol2Operator Demo"
description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT
def get_navigation_prompt(task, image, step=1):
    """
    Build the chat prompt for the navigation task.
    - task: the task to complete
    - image: the current screenshot of the web page
    - step: the current step of the task (unused in this single-step POC)
    """
    system_prompt = SYSTEM_PROMPT
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_prompt},
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {
                    "type": "text",
                    "text": (
                        "Please generate the next move according to the UI screenshot, "
                        f"instruction and previous actions.\n\nInstruction: {task}\n\n"
                        "Previous actions:\nNone"
                    ),
                },
            ],
        },
    ]
def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    return img
def parse_actions_from_response(response: str) -> list[str]:
    """Parse actions from the model response using a regex pattern."""
    pattern = r"<code>\n(.*?)\n</code>"
    matches = re.findall(pattern, response, re.DOTALL)
    return matches
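# Illustrative sketch of the response format the regex above expects (the
# exact wording of real model output may differ):
#
#   I will click the search field first.
#   <code>
#   click(x=0.46, y=0.22)
#   </code>
#
# For that response, parse_actions_from_response returns
# ["click(x=0.46, y=0.22)"].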
def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract coordinates from action code for localization actions."""
    localization_actions = []
    # Regex patterns for the different action types; x/y are normalized [0, 1] floats.
    # The negative lookbehind on 'click' keeps "double_click(...)" from also
    # being counted as a plain "click".
    patterns = {
        'click': r'(?<!_)click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }
    for action_type, pattern in patterns.items():
        matches = re.finditer(pattern, action_code)
        for match in matches:
            if action_type == 'drag':
                # Drag has "from" and "to" coordinates
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type
                })
            else:
                # Single-coordinate actions
                x_val = match.group(1)
                y_val = match.group(2) if match.group(2) else x_val  # Fall back to x when only one coordinate is given
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type
                    })
    return localization_actions
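# Illustrative example (hand-written, not real model output):
#   extract_coordinates_from_action("drag([0.2, 0.3], [0.7, 0.8])")
# yields two entries, one per endpoint:
#   [{'type': 'drag_from', 'x': 0.2, 'y': 0.3, 'action': 'drag'},
#    {'type': 'drag_to',   'x': 0.7, 'y': 0.8, 'action': 'drag'}]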
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create an image with localization markers drawn on it."""
    if not coordinates:
        return None
    # Draw on a copy so the original image is left untouched
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    width, height = img_copy.size
    # PIL's default bitmap font is always available
    font = ImageFont.load_default()
    # Color scheme for the different action types
    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple'
    }
    for i, coord in enumerate(coordinates):
        # Convert normalized coordinates to pixel coordinates
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)
        color = colors.get(coord['type'], 'red')
        # Draw a circle at the coordinate
        circle_radius = 8
        draw.ellipse([
            pixel_x - circle_radius, pixel_y - circle_radius,
            pixel_x + circle_radius, pixel_y + circle_radius
        ], fill=color, outline='white', width=2)
        # Add a text label next to the marker
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
        # For drag actions, draw an arrow from the start to the end point
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)
            # Draw the arrow shaft
            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
            # Arrowhead: two points offset back from the tip along the
            # normalized direction vector and spread along the perpendicular
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2) ** 0.5
            if length > 0:
                dx_norm = dx / length
                dy_norm = dy / length
                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')
    return img_copy
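# Quick local sanity check (illustrative; any blank PIL image works):
#   img = Image.new("RGB", (200, 100), "white")
#   out = create_localized_image(img, [{'type': 'click', 'x': 0.5, 'y': 0.5, 'action': 'click'}])
#   out.save("debug_click.png")  # red circle + label at the image center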
# --- Gradio processing function ---
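# On ZeroGPU Spaces, @spaces.GPU allocates a GPU only for the duration of each
# decorated call, so the generate() work below runs on hardware requested per
# request rather than held permanently.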
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    input_pil_image = array_to_image(input_numpy_image)
    assert isinstance(input_pil_image, Image.Image)
    prompt = get_navigation_prompt(task, input_pil_image)
    if model is None:
        raise ValueError("Model not loaded")
    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()
    # Parse actions from the response
    actions = parse_actions_from_response(navigation_str)
    # Extract coordinates from all actions
    all_coordinates = []
    for action_code in actions:
        coordinates = extract_coordinates_from_action(action_code)
        all_coordinates.extend(coordinates)
    # Create an annotated image if any coordinates were found
    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(input_pil_image, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")
    return navigation_str, localized_image
# --- Load Example Data ---
example_1_image: str = "./assets/google.png"
example_1_image = Image.open(example_1_image)
example_1_task = "Search for the name of the current UK Prime Minister."
example_2_image: str = "./assets/huggingface.png"
example_2_image = Image.open(example_2_image)
example_2_task = "Find the most trending model."
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    # gr.Markdown(description)
    with gr.Row():
        input_image_component = gr.Image(label="UI Image", height=500)
    with gr.Row():
        with gr.Column():
            task_component = gr.Textbox(
                label="Task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Call Agent", variant="primary")
        with gr.Column():
            output_coords_component = gr.Textbox(label="Agent Output", lines=10)
    # The annotated screenshot is written back into the input image component
    submit_button.click(
        navigate,
        [input_image_component, task_component],
        [output_coords_component, input_image_component],
    )
    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, input_image_component],
        fn=navigate,
        cache_examples=True,
    )
demo.queue(api_open=False)
demo.launch(debug=True)
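# When run locally (assuming the assets/ screenshots and a CUDA device are
# available), `python app.py` starts the same demo that the Space runtime launches.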