# File size: 4,826 Bytes
# d32faf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Python stubs (signature + docstring only) describing the desktop actions.
# This text is interpolated verbatim into the system prompts, so every
# docstring must document exactly the parameters its signature declares.
OS_ACTIONS = """
def final_answer(answer: any) -> any:
    \"\"\"
    Provides a final answer to the given problem.
    Args:
        answer: The final answer to the problem
    \"\"\"

def move_mouse(x: float, y: float) -> str:
    \"\"\"
    Moves the mouse cursor to the specified coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a left-click at the specified normalized coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def double_click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a double-click at the specified normalized coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def type(text: str) -> str:
    \"\"\"
    Types the specified text at the current cursor position.
    Args:
        text: The text to type
    \"\"\"

def press(keys: str | list[str]) -> str:
    \"\"\"
    Presses a keyboard key
    Args:
        keys: The key or list of keys to press (e.g. "enter", "space", "backspace", "ctrl", etc.).
    \"\"\"

def navigate_back() -> str:
    \"\"\"
    Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
    \"\"\"

def drag(from_coord: list[float], to_coord: list[float]) -> str:
    \"\"\"
    Clicks [x1, y1], drags mouse to [x2, y2], then release click.
    Args:
        from_coord: The origin coordinates [x1, y1]
        to_coord: The end coordinates [x2, y2]
    \"\"\"

def scroll(direction: Literal["up", "down"] = "down", amount: int = 1) -> str:
    \"\"\"
    Uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
    Args:
        direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
        amount: The amount to scroll. A good amount is 1 or 2.
    \"\"\"

def wait(seconds: float) -> str:
    \"\"\"
    Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
    Args:
        seconds: Number of seconds to wait, generally 2 is enough.
    \"\"\"
"""

# Python stubs (signature + docstring only) describing the mobile-specific
# actions. Delimited with ''' so the embedded docstring quotes need no escaping.
MOBILE_ACTIONS = '''
def navigate_back() -> str:
    """
    Return to home page
    """

def open_app(app_name: str) -> str:
    """
    Launches the specified application.
    Args:
        app_name: the name of the application to launch
    """

def swipe(from_coord: list[str], to_coord: list[str]) -> str:
    """
    swipe from 'from_coord' to 'to_coord'
    Args:
        from_coord: origin coordinates
        to_coord: end coordinates
    """

def long_press(x: int, y: int) -> str:
    """
    Performs a long-press at the specified coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position) 
    """
'''

# System prompt for the desktop GUI agent. The OS_ACTIONS stubs are
# interpolated between the <code> tags when this module is imported.
# The step bullets are real U+2022 characters (previously mis-decoded bytes).
OS_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and a screenshot of the screen. Complete the task using Python function calls.

For each step:
	•	First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
	•	Then, use <code></code> to perform the action. it will be executed in a stateful environment.

The following functions are exposed to the Python interpreter:
<code>
{OS_ACTIONS}
</code>

The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
"""

# System prompt for the mobile GUI agent. Both the OS_ACTIONS and the
# MOBILE_ACTIONS stubs are interpolated between the <code> tags at import time.
# The step bullets are real U+2022 characters (previously mis-decoded bytes).
MOBILE_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and a screenshot of the screen. Complete the task using Python function calls.

For each step:
	•	First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
	•	Then, use <code></code> to perform the action. it will be executed in a stateful environment.

The following functions are exposed to the Python interpreter:
<code>

# OS ACTIONS

{OS_ACTIONS}

# MOBILE ACTIONS

{MOBILE_ACTIONS}
</code>

The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
"""