Dev/uitars 15 (#181)

* debug uitars1.0, add uitars1.5

* update pyautogui parser

* modify function name

* update parser

* update prompt
This commit is contained in:
Shihao Liang
2025-04-21 13:44:08 +08:00
committed by GitHub
parent bd2e980666
commit b92c716df7

View File

@@ -1195,43 +1195,55 @@ AGUVIS_GROUNDING_APPEND_PROMPT = """<|recipient|>os
pyautogui.{function_name}""" pyautogui.{function_name}"""
UITARS_ACTION_SPACE = """ UITARS_ACTION_SPACE = """
click(start_box='[x1, y1, x2, y2]') click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='[x1, y1, x2, y2]') left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='[x1, y1, x2, y2]') right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='') hotkey(key='')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`. type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes. wait() #Sleep for 5s and take a screenshot to check for any changes.
finished() finished()
""" """
UITARS_CALL_USR_ACTION_SPACE = """ UITARS_CALL_USR_ACTION_SPACE = """
click(start_box='[x1, y1, x2, y2]') click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='[x1, y1, x2, y2]') left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='[x1, y1, x2, y2]') right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='') hotkey(key='')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`. type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes. wait() #Sleep for 5s and take a screenshot to check for any changes.
finished() finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
""" """
# Action-space listing with one action signature per line. Box arguments are
# written as `<|box_start|>(x,y)<|box_end|>` points, and `finished` carries a
# `content` argument. The text starts and ends with a newline. Backslashes are
# doubled in the source so the rendered text contains the literal two-character
# sequences \n, \' and \" rather than real control characters or bare quotes.
UITARS_NORMAL_ACTION_SPACE = (
    "\n"
    "click(start_box='<|box_start|>(x1,y1)<|box_end|>')\n"
    "left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')\n"
    "right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')\n"
    "drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')\n"
    "hotkey(key='')\n"
    "type(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\n"
    "scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')\n"
    "wait() #Sleep for 5s and take a screenshot to check for any changes.\n"
    "finished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n"
)
UITARS_USR_PROMPT_NOTHOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. UITARS_USR_PROMPT_NOTHOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format ## Output Format
``` ```
Action: ... Action: ...
``` ```
## Action Space ## Action Space
click(start_box='[x1, y1, x2, y2]') click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='[x1, y1, x2, y2]') left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='[x1, y1, x2, y2]') right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='') hotkey(key='')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`. type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes. wait() #Sleep for 5s and take a screenshot to check for any changes.
finished() finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.