From b92c716df74e35f95b8cc9aff1b9a7676de525b0 Mon Sep 17 00:00:00 2001 From: Shihao Liang <1017512024@qq.com> Date: Mon, 21 Apr 2025 13:44:08 +0800 Subject: [PATCH] Dev/uitars 15 (#181) * debug uitars1.0, add uitars1.5 * update pyautogui parser * modify function name * update parser * update prompt --- mm_agents/prompts.py | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py index 37609c4..2305aaa 100644 --- a/mm_agents/prompts.py +++ b/mm_agents/prompts.py @@ -1195,43 +1195,55 @@ AGUVIS_GROUNDING_APPEND_PROMPT = """<|recipient|>os pyautogui.{function_name}""" UITARS_ACTION_SPACE = """ -click(start_box='[x1, y1, x2, y2]') -left_double(start_box='[x1, y1, x2, y2]') -right_single(start_box='[x1, y1, x2, y2]') -drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') hotkey(key='') type(content='') #If you want to submit your input, use "\\n" at the end of `content`. -scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') wait() #Sleep for 5s and take a screenshot to check for any changes. finished() """ UITARS_CALL_USR_ACTION_SPACE = """ -click(start_box='[x1, y1, x2, y2]') -left_double(start_box='[x1, y1, x2, y2]') -right_single(start_box='[x1, y1, x2, y2]') -drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') hotkey(key='') type(content='') #If you want to submit your input, use "\\n" at the end of `content`. -scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') wait() #Sleep for 5s and take a screenshot to check for any changes. finished() call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. """ +UITARS_NORMAL_ACTION_SPACE = """ +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') +hotkey(key='') +type(content='') #If you want to submit your input, use "\\n" at the end of `content`. +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') +wait() #Sleep for 5s and take a screenshot to check for any changes. +finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. +""" + UITARS_USR_PROMPT_NOTHOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ``` Action: ... ``` ## Action Space -click(start_box='[x1, y1, x2, y2]') -left_double(start_box='[x1, y1, x2, y2]') -right_single(start_box='[x1, y1, x2, y2]') -drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') hotkey(key='') type(content='') #If you want to submit your input, use "\\n" at the end of `content`. -scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') wait() #Sleep for 5s and take a screenshot to check for any changes. finished() call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.