diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py index 37609c4..2305aaa 100644 --- a/mm_agents/prompts.py +++ b/mm_agents/prompts.py @@ -1195,43 +1195,55 @@ AGUVIS_GROUNDING_APPEND_PROMPT = """<|recipient|>os pyautogui.{function_name}""" UITARS_ACTION_SPACE = """ -click(start_box='[x1, y1, x2, y2]') -left_double(start_box='[x1, y1, x2, y2]') -right_single(start_box='[x1, y1, x2, y2]') -drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') hotkey(key='') type(content='') #If you want to submit your input, use "\\n" at the end of `content`. -scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') wait() #Sleep for 5s and take a screenshot to check for any changes. finished() """ UITARS_CALL_USR_ACTION_SPACE = """ -click(start_box='[x1, y1, x2, y2]') -left_double(start_box='[x1, y1, x2, y2]') -right_single(start_box='[x1, y1, x2, y2]') -drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') hotkey(key='') type(content='') #If you want to submit your input, use "\\n" at the end of `content`. -scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') wait() #Sleep for 5s and take a screenshot to check for any changes. finished() call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. """ +UITARS_NORMAL_ACTION_SPACE = """ +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') +hotkey(key='') +type(content='') #If you want to submit your input, use "\\n" at the end of `content`. +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') +wait() #Sleep for 5s and take a screenshot to check for any changes. +finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. +""" + UITARS_USR_PROMPT_NOTHOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ``` Action: ... ``` ## Action Space -click(start_box='[x1, y1, x2, y2]') -left_double(start_box='[x1, y1, x2, y2]') -right_single(start_box='[x1, y1, x2, y2]') -drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') hotkey(key='') type(content='') #If you want to submit your input, use "\\n" at the end of `content`. -scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') wait() #Sleep for 5s and take a screenshot to check for any changes. finished() call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.