From 862d704b8c82da2c6eb6429036a01bf3144ee1b5 Mon Sep 17 00:00:00 2001
From: Xinyuan Wang <109049819+XinyuanWangCS@users.noreply.github.com>
Date: Thu, 31 Jul 2025 08:53:49 +0800
Subject: [PATCH] Wxy/opencua (#290)

* OpenCUA Agent code base
* update url
* debug, modify url input
* debug opencua
* show result
* debug agent history overlap
* modify opencua agent; add comment lines
* update parallel; clean code; use sleep 3s
* ui-tars-0717
* update detail
* add system password to system prompt
* add running command
---
 mm_agents/opencua_agent.py | 28 ++++++++++++++++++++--------
 run_multienv_opencua.py    | 41 +++++++++++++++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/mm_agents/opencua_agent.py b/mm_agents/opencua_agent.py
index 4e1b371..bb3de9a 100644
--- a/mm_agents/opencua_agent.py
+++ b/mm_agents/opencua_agent.py
@@ -14,21 +14,33 @@ import re
 import os
 import ast
 import time
-import json
 import math
-import copy
 import httpx
 import base64
 import backoff
-from io import BytesIO
 from loguru import logger
-from PIL import Image
 from typing import Dict, List, Tuple, Optional
 
 
+# System prompts used in the training data
 AGNET_SYS_PROMPT_L1 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
-AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
+# AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
 AGNET_SYS_PROMPT_L3 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nObservation:\n - Describe the current computer state based on the full screenshot in detail. \n - Application Context:\n - The active application\n - The active window or page\n - Overall layout and visible interface\n - Key Elements:\n - Menu items and toolbars \n - Buttons and controls\n - Text fields and content\n - Dialog boxes or popups\n - Error messages or notifications\n - Loading states\n - Other key elements\n - Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n".strip()
 
+# Testing prompt on OSWorld-Verified
+AGNET_SYS_PROMPT_L2 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. The password of the computer is "osworld-public-evaluation". If the task is not possible to do, output the action computer.terminate(status='failure').
+
+For each step, provide your response in this format:
+
+Thought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning
+
+Action:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize "—", maximize "□", close "X")\n - if the action involves keyboard actions like \'press\', \'write\', \'hotkey\':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions
+
+Finally, output the action as PyAutoGUI code or the following functions:
+- {"name": "computer.triple_click", "description": "Triple click on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "The x coordinate of the triple click"}, "y": {"type": "number", "description": "The y coordinate of the triple click"}}, "required": ["x", "y"]}}
+- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, "required": ["status"]}}
+""".strip()
+
+
 STEP_TEMPLATE = "# Step {step_num}:\n"
 INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"
@@ -638,12 +650,12 @@ class OpenCUAAgent:
         logger.info(f"Model Output: \n{response}")
         if not response:
             logger.error("No response found in the response.")
-            return "ERROR", [], {}
+            return "ERROR", ["DONE"], {}
 
         low_level_instruction, pyautogui_actions, other_cot = parse_response_to_cot_and_action(response, self.screen_size, self.coordinate_type)
 
-        if not pyautogui_actions:
+        if not pyautogui_actions or len(pyautogui_actions) == 0:
             logger.error("No pyautogui actions found in the response.")
-            return response, [], {}
+            return response, ["FAIL"], {}
         pyautogui_actions = [
             self._scale_scroll_for_windows(code) for code in pyautogui_actions
diff --git a/run_multienv_opencua.py b/run_multienv_opencua.py
index 6f8af63..3ca1f40 100644
--- a/run_multienv_opencua.py
+++ b/run_multienv_opencua.py
@@ -1,3 +1,35 @@
+"""
+    This is the script to run OpenCUA agents on OSWorld tasks using the AWS provider.
+
+    You should first host the OpenCUA model on your local machine or a server.
+
+    Command for OpenCUA-7B and OpenCUA-32B:
+    ```
+    python run_multienv_opencua.py \
+        --headless \
+        --observation_type screenshot \
+        --model OpenCUA-32B \
+        --result_dir ./results --test_all_meta_path evaluation_examples/test_all_no_gdrive.json \
+        --max_steps 100 \
+        --num_envs 30 \
+        --coordinate_type qwen25
+    ```
+
+    Command for OpenCUA-Qwen2-7B and OpenCUA-A3B:
+    ```
+    python run_multienv_opencua.py \
+        --headless \
+        --observation_type screenshot \
+        --model OpenCUA-A3B \
+        --result_dir ./results \
+        --test_all_meta_path evaluation_examples/test_nogdrive.json \
+        --max_steps 100 \
+        --num_envs 10 \
+        --coordinate_type relative
+    ```
+
+"""
+
 from __future__ import annotations
 import argparse
 import datetime
@@ -7,9 +39,7 @@ import os
 import sys
 import signal
 import time
-from typing import List, Dict
-import math
-from tqdm import tqdm
+from typing import List
 from multiprocessing import Process, Manager
 from multiprocessing import current_process
 import lib_run_single
@@ -26,7 +56,7 @@ if os.path.exists(".env"):
     from dotenv import load_dotenv
     load_dotenv()
 
-# Logger Configs {{{ #
+# Logger Configs
 def config() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Run end-to-end evaluation on the benchmark"
@@ -58,7 +88,7 @@ def config() -> argparse.Namespace:
     parser.add_argument("--model", type=str, default="opencua")
     parser.add_argument("--temperature", type=float, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)
-    parser.add_argument("--max_tokens", type=int, default=8196)
+    parser.add_argument("--max_tokens", type=int, default=2048)
     parser.add_argument("--stop_token", type=str, default=None)
 
     # OpenCUAagent config
@@ -129,7 +159,6 @@ stdout_handler.addFilter(logging.Filter("desktopenv"))
 logger.addHandler(file_handler)
 logger.addHandler(debug_handler)
 logger.addHandler(stdout_handler)
 
-# }}} Logger Configs #
 
 logger = logging.getLogger("desktopenv.experiment")
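Note on the agent-side changes: the second hunk of mm_agents/opencua_agent.py maps an empty model response onto the runner's "DONE" sentinel and an unparseable one onto "FAIL", while run_multienv_opencua.py selects between `qwen25` and `relative` coordinate handling. The sketch below illustrates one plausible shape for that post-processing step. It is illustrative only: the `extract_actions` helper, its regex, and the assumption that `relative` means coordinates normalized to [0, 1] are hypothetical, not the repository's `parse_response_to_cot_and_action`.

```python
import re
from typing import List, Tuple


def extract_actions(response: str, screen_size: Tuple[int, int],
                    coordinate_type: str = "relative") -> List[str]:
    """Hypothetical post-processing sketch -- not the repository's parser.

    Assumes "relative" means the model emits x/y normalized to [0, 1];
    the "qwen25" coordinate space is not handled here.
    """
    width, height = screen_size
    actions: List[str] = []
    # Collect calls such as pyautogui.click(x=0.53, y=0.2) from the response.
    for m in re.finditer(r"pyautogui\.(\w+)\(x=([\d.]+),\s*y=([\d.]+)\)", response):
        func, x, y = m.group(1), float(m.group(2)), float(m.group(3))
        if coordinate_type == "relative":
            # Scale normalized coordinates to absolute screen pixels.
            x, y = round(x * width), round(y * height)
        actions.append(f"pyautogui.{func}(x={int(x)}, y={int(y)})")
    if "computer.terminate" in response:
        # Mirror the patch: terminate maps onto the runner's sentinel actions.
        actions.append("DONE" if "success" in response else "FAIL")
    return actions


# Example: a click at the screen center of a 1920x1080 display.
print(extract_actions("pyautogui.click(x=0.5, y=0.5)", (1920, 1080)))
# -> ['pyautogui.click(x=960, y=540)']
```

The `qwen25` branch, which presumably rescales from the model's native image resolution rather than from normalized units, is deliberately omitted above. One further observation on the patch itself: the added guard `or len(pyautogui_actions) == 0` is redundant with `not pyautogui_actions`, since an empty list is already falsy in Python.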