Spaces:
Runtime error
Runtime error
| import spaces | |
| import re | |
| from typing import Tuple, Optional | |
| import gradio as gr | |
| import numpy as np | |
| from PIL import Image, ImageDraw, ImageFont | |
| from smolvlm_inference import TransformersModel | |
| from prompt import OS_SYSTEM_PROMPT | |
# --- Configuration ---
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

# --- Model and Processor Loading (Load once) ---
# The model is built a single time at import so every request reuses it.
# `processor` and `load_error_message` are never reassigned in this file;
# they are kept so that any external code reading them keeps working.
print(f"Loading model and processor for {MODEL_ID}...")
processor = None
model_loaded = False
load_error_message = ""
model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)
# Reaching this line means the constructor above did not raise, so the
# flag can now reflect the real state instead of staying False forever.
model_loaded = True

title = "Smol2Operator Demo"
# Markdown blurb for the page (its gr.Markdown call is currently commented out).
description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model's core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""
# System prompt placed in the first message of every model request.
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT
def get_navigation_prompt(task, image, step=1):
    """
    Build the chat messages sent to the model for one navigation request.

    - task: The instruction the agent should carry out
    - image: The current UI screenshot, attached to the user turn
    - step: Accepted for interface compatibility; not used when building
      the messages

    Returns a two-element list: a system message carrying SYSTEM_PROMPT,
    followed by a user message carrying the screenshot and the instruction.
    """
    instruction_text = f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"

    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": instruction_text},
        ],
    }
    return [system_message, user_message]
def array_to_image(image_array: np.ndarray) -> Image.Image:
    """Turn an uploaded image array into a PIL image.

    Args:
        image_array: Pixel data of the uploaded image, or None when nothing
            was uploaded.

    Returns:
        A PIL image built from the array after casting it to 8-bit unsigned.

    Raises:
        ValueError: If ``image_array`` is None.
    """
    if image_array is not None:
        # PIL receives the pixels as uint8.
        return Image.fromarray(np.uint8(image_array))
    raise ValueError("No image provided. Please upload an image before submitting.")
def parse_actions_from_response(response: str) -> list[str]:
    """Return the body of every ``<code>`` section found in *response*.

    A section is recognised only when a newline directly follows the opening
    tag and directly precedes the closing tag. Bodies may span several lines
    and are returned in order of appearance; an empty list means no section
    was found.
    """
    code_section = re.compile(r"<code>\n(.*?)\n</code>", re.DOTALL)
    return [found.group(1) for found in code_section.finditer(response)]
# A non-negative decimal literal that float() always accepts ("1", "0.5",
# ".5", "1."). The looser "[0-9.]+" also matched "." and "1.2.3", which made
# float() raise ValueError further down.
_COORD = r"(\d+(?:\.\d*)?|\.\d+)"

# (action name, compiled pattern), tried in this order. The leading "\b"
# keeps "click(" from also matching inside "double_click(": "_" is a word
# character, so there is no word boundary between "_" and "c".
_LOCALIZATION_PATTERNS = (
    ("click", re.compile(rf"\bclick\((?:x=)?{_COORD}(?:,\s*(?:y=)?{_COORD})?\)")),
    ("double_click", re.compile(rf"\bdouble_click\((?:x=)?{_COORD}(?:,\s*(?:y=)?{_COORD})?\)")),
    ("move_mouse", re.compile(rf"\bmove_mouse\((?:self,\s*)?(?:x=)?{_COORD}(?:,\s*(?:y=)?{_COORD})\)")),
    # Both lists may carry an optional keyword, e.g. from_coord=[...].
    (
        "drag",
        re.compile(
            rf"\bdrag\((?:\w+\s*=\s*)?\[{_COORD},\s*{_COORD}\],"
            rf"\s*(?:\w+\s*=\s*)?\[{_COORD},\s*{_COORD}\]\)"
        ),
    ),
)


def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract coordinates from action code for localization actions.

    Recognises ``click``, ``double_click``, ``move_mouse`` and ``drag`` calls
    whose coordinates are written as plain non-negative decimal literals.

    Args:
        action_code: Source text of one or more action calls.

    Returns:
        A list of dicts with keys ``type``, ``x``, ``y`` and ``action``.
        Results are grouped by action kind (click, double_click, move_mouse,
        drag) and, within a kind, follow order of appearance. A drag yields
        two consecutive entries typed ``drag_from`` and ``drag_to``, both
        with ``action`` set to ``"drag"``. A call given a single coordinate
        reuses that value for ``y``. Calls that do not match any pattern are
        ignored, so the list may be empty.
    """
    localization_actions = []
    for action_type, pattern in _LOCALIZATION_PATTERNS:
        for match in pattern.finditer(action_code):
            if action_type == 'drag':
                # Drag has from and to coordinates
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type,
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type,
                })
                continue

            # Single coordinate actions
            x_val, y_val = match.group(1), match.group(2)
            if y_val is None:
                # Only one coordinate was written: reuse it for y.
                y_val = x_val
            localization_actions.append({
                'type': action_type,
                'x': float(x_val),
                'y': float(y_val),
                'action': action_type,
            })
    return localization_actions
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Return a copy of *original_image* with one marker drawn per coordinate.

    Each entry of *coordinates* supplies ``type``, ``x`` and ``y``; ``x`` and
    ``y`` are multiplied by the image width and height to obtain the pixel
    position. Every entry gets a filled circle and a text label coloured by
    its type (red when the type is not in the colour table). A ``drag_from``
    entry immediately followed by a ``drag_to`` entry is additionally
    connected to it by an orange arrow.

    Returns None when *coordinates* is empty; the original image is never
    modified.
    """
    if not coordinates:
        return None

    # Work on a copy so the caller's image stays untouched.
    annotated = original_image.copy()
    canvas = ImageDraw.Draw(annotated)
    img_w, img_h = annotated.size

    label_font = ImageFont.load_default()
    # Pass the font only when one was obtained.
    text_options = {'font': label_font} if label_font else {}

    marker_colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple',
    }
    marker_radius = 8
    head_size = 10

    for index, point in enumerate(coordinates):
        kind = point['type']
        # Scale the stored coordinates to pixel positions.
        px = int(point['x'] * img_w)
        py = int(point['y'] * img_h)
        marker_color = marker_colors.get(kind, 'red')

        # Marker circle centred on the point.
        bounding_box = [
            px - marker_radius, py - marker_radius,
            px + marker_radius, py + marker_radius,
        ]
        canvas.ellipse(bounding_box, fill=marker_color, outline='white', width=2)

        # Label placed up and to the right of the marker.
        caption = f"{kind}({point['x']:.2f},{point['y']:.2f})"
        canvas.text((px + 10, py - 10), caption, fill=marker_color, **text_options)

        # Only a drag start directly followed by its drag end gets an arrow.
        if kind != 'drag_from':
            continue
        if index + 1 >= len(coordinates):
            continue
        target = coordinates[index + 1]
        if target['type'] != 'drag_to':
            continue

        tip_x = int(target['x'] * img_w)
        tip_y = int(target['y'] * img_h)
        canvas.line([px, py, tip_x, tip_y], fill='orange', width=3)

        delta_x = tip_x - px
        delta_y = tip_y - py
        span = (delta_x**2 + delta_y**2)**0.5
        if span > 0:
            # Unit vector along the drag; the arrowhead wings sit half a
            # head-size to either side of it, one head-size behind the tip.
            unit_x = delta_x / span
            unit_y = delta_y / span
            wing1_x = tip_x - head_size * unit_x + head_size * unit_y * 0.5
            wing1_y = tip_y - head_size * unit_y - head_size * unit_x * 0.5
            wing2_x = tip_x - head_size * unit_x - head_size * unit_y * 0.5
            wing2_y = tip_y - head_size * unit_y + head_size * unit_x * 0.5
            canvas.polygon([tip_x, tip_y, wing1_x, wing1_y, wing2_x, wing2_y], fill='orange')

    return annotated
# --- Gradio processing function ---
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    """Ask the model for the next GUI action and mark its targets on the screenshot.

    Args:
        input_numpy_image: The uploaded screenshot as delivered by the image
            component.
        task: The instruction the agent should carry out.

    Returns:
        A tuple ``(response_text, image)``. ``response_text`` is the model
        output with surrounding whitespace stripped. ``image`` is the
        screenshot with localization markers when the response contains
        recognised coordinates, otherwise the unmodified screenshot.

    Raises:
        ValueError: If no image was provided or the model is not loaded.
    """
    input_pil_image = array_to_image(input_numpy_image)
    if model is None:
        raise ValueError("Model not loaded")

    prompt = get_navigation_prompt(task, input_pil_image)
    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Collect the coordinates of every action found in the response.
    all_coordinates = []
    for action_code in parse_actions_from_response(navigation_str):
        all_coordinates.extend(extract_coordinates_from_action(action_code))

    if not all_coordinates:
        # The image output is wired back to the input image component, so
        # returning None here would blank the user's screenshot. Hand the
        # screenshot back unchanged instead.
        return navigation_str, input_pil_image

    localized_image = create_localized_image(input_pil_image, all_coordinates)
    print(f"Found {len(all_coordinates)} localization actions")
    return navigation_str, localized_image
# --- Load Example Data ---
# Each path lives in its own name so that `example_N_image` only ever holds
# a PIL image; previously the same variable was annotated `str` and then
# rebound to an image. The images stay open because the examples table
# references them later.
example_1_image_path: str = "./assets/google.png"
example_1_image = Image.open(example_1_image_path)
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image_path: str = "./assets/huggingface.png"
example_2_image = Image.open(example_2_image_path)
example_2_task = "Find the most trending model."
# --- Gradio UI ---
# Layout: a heading, the screenshot on its own row, then a row with the task
# input and button on the left and the agent's text output on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    # gr.Markdown(description)

    with gr.Row():
        input_image_component = gr.Image(label="UI Image", height=500)

    with gr.Row():
        with gr.Column():
            task_component = gr.Textbox(
                label="task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Call Agent", variant="primary")
        with gr.Column():
            output_coords_component = gr.Textbox(label="Agent Output", lines=10)

    # The image component is both an input and an output: the annotated
    # screenshot returned by `navigate` replaces the uploaded one.
    agent_inputs = [input_image_component, task_component]
    agent_outputs = [output_coords_component, input_image_component]

    submit_button.click(navigate, agent_inputs, agent_outputs)

    # Examples are cached, so `navigate` is run on them ahead of time.
    gr.Examples(
        examples=[
            [example_1_image, example_1_task],
            [example_2_image, example_2_task],
        ],
        inputs=agent_inputs,
        outputs=agent_outputs,
        fn=navigate,
        cache_examples=True,
    )

demo.queue(api_open=False)
demo.launch(debug=True)