feat: enhance run_coact.py and related agents with improved task handling and configuration

- Updated TASK_DESCRIPTION in run_coact.py to clarify task-solving steps and requirements.
- Modified configuration parameters for provider name and client password for better security and flexibility.
- Enhanced OrchestratorUserProxyAgent to include user instruction in the auto-reply and improved screenshot handling.
- Adjusted coding_agent.py to ensure proper verification of results before saving changes.
- Improved CUA agent prompts to maintain application state and handle user instructions more effectively.
- Ensured existing code logic remains unchanged while enhancing functionality and usability.
2025-08-13 09:04:09 +00:00
parent d2ae0f697d
commit 7fb5860da0
4 changed files with 100 additions and 59 deletions
--- a/mm_agents/coact/coding_agent.py
+++ b/mm_agents/coact/coding_agent.py
@@ -7,22 +7,20 @@ from .autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalCo


 CODER_SYSTEM_MESSAGE = """# Your role
-You are a coding assistant, you need to solve a task step-by-step given by the user. 
-You can write code in ```bash...``` code blocks for bash scripts, and ```python...``` code blocks for python code. 
-
-# Important notes
- Once you complete the task, reply ONLY with "TERMINATE" to end the task.
- DO NOT mix the TERMINATE with any other words or code blocks in your reply.
- When you write code, you must identify the language (whether it is python or bash) of the code.
+- You are a programmer, you need to solve a task step-by-step given by the user. 
+- You can write code in ```bash...``` code blocks for bash scripts, and ```python...``` code blocks for python code. 
 - Your linux username is "user".
- Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you.
 - If you want to use sudo, follow the format: "echo {CLIENT_PASSWORD} | sudo -S [YOUR COMMANDS]" (no quotes for the word "{CLIENT_PASSWORD}").
- Ignore the error: "sudo: /etc/sudoers.d is world writable".
- Your python code will be sent line-by-line into a interactive python terminal. Do not include __main__ in your code.
- When import a package, you need to check if the package is installed. If not, you need to install it yourself.
+
+# Requirements
+- You MUST verify the result before save the changes.
+- When you write code, you must identify the language (whether it is python or bash) of the code.
+- Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you.
+- Do not include __main__ in your python code.
+- When you modify a spreadsheet, **make sure every value is in the expected cell**.
+- When importing a package, you need to check if the package has been installed. If not, you need to install it yourself.
 - You need to print the progressive and final result.
 - If you met execution error, you need to analyze the error message and try to fix the error.
- IMPORTANT: If you modified a file like spreadsheet, you should close and reopen the file by operating the GUI, so that I can see what you changed.
 """

 class TerminalProxyAgent(MultimodalConversableAgent):
@@ -62,15 +60,15 @@ class TerminalProxyAgent(MultimodalConversableAgent):
                exitcode = 0
                logs = output_dict["output"]
            else:
-                exitcode = -1
+                exitcode = 0
                logs = output_dict["output"]
        elif lang in PYTHON_VARIANTS:
            output_dict = self.env.controller.run_python_script(code)
            if output_dict["status"] == "error":
-                exitcode = -1
+                exitcode = 0
                logs = output_dict["output"]
            else:
-                exitcode = -1
+                exitcode = 0
                logs = output_dict["message"]
        else:
            exitcode = -1
--- a/mm_agents/coact/cua_agent.py
+++ b/mm_agents/coact/cua_agent.py
@@ -19,10 +19,11 @@ PROMPT_TEMPLATE = """# Task

 # Hints
 - Sudo password is "{CLIENT_PASSWORD}".
- If you meet "Authentication required" dialog, enter the "{CLIENT_PASSWORD}" to continue.
- Do not close the any application or window or tab that is already opened.
- Do not close the window at the end of the task.
+- Keep the windows/applications opened at the end of the task.
+- Do not use shortcut to reload the application except for the browser, just close and reopen.
+- If "The document has been changed by others" pops out, you should click "cancel" and reopen the file.
 - If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'.
+- If you don't know how to continue the task, reply your concern or question along with 'IDK'.
 """.strip()
 DEFAULT_REPLY = "Please continue the user task. If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'."

@@ -118,7 +119,9 @@ def call_openai_cua(client: OpenAI,
                    "environment": environment,
                }],
                input=history_inputs,
-                reasoning={"summary": "concise"},
+                reasoning={
+                    "summary": "concise"
+                },
                tool_choice="required",
                truncation="auto",
            )
@@ -205,6 +208,10 @@ def run_cua(
                    reasoning = "My thinking process\n" + "\n- ".join(reasoning_list) + '\nPlease check the screenshot and see if it fulfills your requirements.'
                    breakflag = True
                    break
+                if 'IDK' in o.content[0].text:
+                    reasoning = f"{o.content[0].text}. I don't know how to complete the task. Please check the current screenshot."
+                    breakflag = True
+                    break
                try:
                    json.loads(o.content[0].text)
                    history_inputs.pop(len(history_inputs) - len(response.output) + i)
--- a/mm_agents/coact/operator_agent.py
+++ b/mm_agents/coact/operator_agent.py
@@ -23,13 +23,13 @@ class OrchestratorAgent(MultimodalConversableAgent):
        "type": "function",
        "function": {
            "name": "call_gui_agent",
-            "description": """Let a GUI agent to solve a task. GUI agent can operate the computer by clicking and typing. Require detailed task description.""",
+            "description": """Let a OS Operator to solve a task. OS operator can operate the computer by clicking and typing (not accurate in dense UI). Require detailed task description.""",
            "parameters": {
                "type": "object",
                "properties": {
                    "task": {
                        "type": "string",
-                        "description": "[REQUIRED] A detailed task to be solved.",
+                        "description": "[REQUIRED] A detailed task to be solved with step-by-step guidance.",
                    },
                },
            },
@@ -40,17 +40,17 @@ class OrchestratorAgent(MultimodalConversableAgent):
        "type": "function",
        "function": {
            "name": "call_coding_agent",
-            "description": """Let a coding agent to solve a task. Coding agent can write python and bash code with many tools to solve a task. Especially good for file (like spreadsheet) operation. Require detailed task and environment description.""",
+            "description": """(You MUST use this first) Let a programmer to solve a task. Coding agent can write python and bash code with many tools to solve a task. Require detailed task and environment description.""",
            "parameters": {
                "type": "object",
                "properties": {
                    "task": {
                        "type": "string",
-                        "description": "[REQUIRED] A detailed task to be solved. The task should be a coding task.",
+                        "description": "[REQUIRED] A detailed task to be solved.",
                    },
                    "environment": {
                        "type": "string",
-                        "description": "[REQUIRED] The environment description of the coding agent. It should be a detailed description of the system state, including the current directory, the opened files, the running processes, etc.",
+                        "description": "[REQUIRED] The environment description of the coding agent. It should be a detailed description of the system state, including the opened files, the running processes, etc.",
                    }
                },
            },
@@ -101,6 +101,8 @@ class OrchestratorAgent(MultimodalConversableAgent):

        if system_message is None:
            self.update_system_message("")
+        else:
+            self.update_system_message(system_message)

        self.update_tool_signature(self.CALL_CODING_AGENT_TOOL, is_remove=False)
        self.update_tool_signature(self.CALL_GUI_AGENT_TOOL, is_remove=False)
@@ -110,7 +112,7 @@ class OrchestratorAgent(MultimodalConversableAgent):
 class OrchestratorUserProxyAgent(MultimodalConversableAgent):
    """(In preview) A proxy agent for the captain agent, that can execute code and provide feedback to the other agents."""

-    DEFAULT_AUTO_REPLY = "I'm a proxy and I can only execute your tool or end the conversation. If you think the problem is solved, please reply me only with 'TERMINATE'. If you think the task is impossible to solve, please reply me only with 'INFEASIBLE'."
+    DEFAULT_AUTO_REPLY = "Thank you! Note that the user's task is: {user_instruction}. Please continue the task. If you think the everything is solved, please reply me only with 'TERMINATE'. But once you think the task is impossible to solve, please reply me only with 'INFEASIBLE'."

    DEFAULT_USER_PROXY_AGENT_DESCRIPTIONS = {
        "ALWAYS": "An attentive HUMAN user who can answer questions about the task, and can perform tasks such as running Python code or inputting command line commands at a Linux terminal and reporting back the execution results.",
@@ -156,6 +158,9 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
        coding_max_steps: int = 30,
        history_save_dir: str = "",
        llm_model: str = "o4-mini",
+        region: str = "us-east-1",
+        client_password: str = "",
+        user_instruction: str = "",
    ):
        description = (
            description if description is not None else self.DEFAULT_USER_PROXY_AGENT_DESCRIPTIONS[human_input_mode]
@@ -168,7 +173,7 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            human_input_mode=human_input_mode,
            code_execution_config=code_execution_config,
            llm_config=llm_config,
-            default_auto_reply=default_auto_reply,
+            default_auto_reply=default_auto_reply.format(user_instruction=user_instruction),
            description=description,
        )
        self.register_function(
@@ -183,14 +188,27 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            "sleep_after_execution": sleep_after_execution,
            "truncate_history_inputs": truncate_history_inputs,
        }
+        self.region = region
+        self.client_password = client_password
+
+        from desktop_env.providers.aws.manager import IMAGE_ID_MAP
+        screen_size = (screen_width, screen_height)
+        ami_id = IMAGE_ID_MAP[region].get(screen_size, IMAGE_ID_MAP[region][(1920, 1080)])
+
        self.env = DesktopEnv(
            path_to_vm=path_to_vm,
+            action_space="pyautogui",
            provider_name=provider_name,
            os_type="Ubuntu",
-            action_space="pyautogui",
-            snapshot_name="init_state",
+            region=region,
+            snapshot_name=ami_id,
+            screen_size=screen_size,
+            headless=True,
            require_a11y_tree=observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
+            enable_proxy=True,
+            client_password=client_password
        )
+
        self.history_save_dir = history_save_dir
        self.cua_call_count = 0
        self.coding_call_count = 0
@@ -216,7 +234,9 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
                                                   screen_width=screen_width,
                                                   screen_height=screen_height,
                                                   sleep_after_execution=self.cua_config["sleep_after_execution"],
-                                                   truncate_history_inputs=self.cua_config["truncate_history_inputs"])
+                                                   truncate_history_inputs=self.cua_config["truncate_history_inputs"],
+                                                   client_password=self.client_password
+                                                   )
            screenshot = self.env.controller.get_screenshot()

            with open(os.path.join(cua_path, "history_inputs.json"), "w") as f:
@@ -234,6 +254,8 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            result = result.replace("TERMINATE", "").strip()
            if result == "":
                result = "Task completed. Please check the screenshot."
+        elif "IDK" in result:
+            result = result.replace("IDK", "").strip()
        else:
            result = f"I didn't complete the task and I have to go. Now I'm working on \"{result}\", please check the current screenshot."
        return f"# Response from GUI agent: {result}<img data:image/png;base64,{base64.b64encode(screenshot).decode('utf-8')}>"
@@ -246,7 +268,7 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            coding_agent = MultimodalConversableAgent(
                name="coding_agent",
                llm_config=LLMConfig(api_type="openai", model=self.llm_model),
-                system_message=CODER_SYSTEM_MESSAGE,
+                system_message=CODER_SYSTEM_MESSAGE.format(CLIENT_PASSWORD=self.client_password),
            )
            code_interpreter = TerminalProxyAgent(
                name="code_interpreter",