feat: enhance run_coact.py and related agents with improved task handling and configuration

- Updated TASK_DESCRIPTION in run_coact.py to clarify task-solving steps and requirements.
- Modified configuration parameters for provider name and client password for better security and flexibility.
- Enhanced OrchestratorUserProxyAgent to include user instruction in the auto-reply and improved screenshot handling.
- Adjusted coding_agent.py to ensure proper verification of results before saving changes.
- Improved CUA agent prompts to maintain application state and handle user instructions more effectively.
- Ensured existing code logic remains unchanged while enhancing functionality and usability.
2025-08-13 09:04:09 +00:00
parent d2ae0f697d
commit 7fb5860da0
4 changed files with 100 additions and 59 deletions
--- a/mm_agents/coact/operator_agent.py
+++ b/mm_agents/coact/operator_agent.py
@@ -23,13 +23,13 @@ class OrchestratorAgent(MultimodalConversableAgent):
        "type": "function",
        "function": {
            "name": "call_gui_agent",
-            "description": """Let a GUI agent to solve a task. GUI agent can operate the computer by clicking and typing. Require detailed task description.""",
+            "description": """Let a OS Operator to solve a task. OS operator can operate the computer by clicking and typing (not accurate in dense UI). Require detailed task description.""",
            "parameters": {
                "type": "object",
                "properties": {
                    "task": {
                        "type": "string",
-                        "description": "[REQUIRED] A detailed task to be solved.",
+                        "description": "[REQUIRED] A detailed task to be solved with step-by-step guidance.",
                    },
                },
            },
@@ -40,17 +40,17 @@ class OrchestratorAgent(MultimodalConversableAgent):
        "type": "function",
        "function": {
            "name": "call_coding_agent",
-            "description": """Let a coding agent to solve a task. Coding agent can write python and bash code with many tools to solve a task. Especially good for file (like spreadsheet) operation. Require detailed task and environment description.""",
+            "description": """(You MUST use this first) Let a programmer to solve a task. Coding agent can write python and bash code with many tools to solve a task. Require detailed task and environment description.""",
            "parameters": {
                "type": "object",
                "properties": {
                    "task": {
                        "type": "string",
-                        "description": "[REQUIRED] A detailed task to be solved. The task should be a coding task.",
+                        "description": "[REQUIRED] A detailed task to be solved.",
                    },
                    "environment": {
                        "type": "string",
-                        "description": "[REQUIRED] The environment description of the coding agent. It should be a detailed description of the system state, including the current directory, the opened files, the running processes, etc.",
+                        "description": "[REQUIRED] The environment description of the coding agent. It should be a detailed description of the system state, including the opened files, the running processes, etc.",
                    }
                },
            },
@@ -101,6 +101,8 @@ class OrchestratorAgent(MultimodalConversableAgent):

        if system_message is None:
            self.update_system_message("")
+        else:
+            self.update_system_message(system_message)

        self.update_tool_signature(self.CALL_CODING_AGENT_TOOL, is_remove=False)
        self.update_tool_signature(self.CALL_GUI_AGENT_TOOL, is_remove=False)
@@ -110,7 +112,7 @@ class OrchestratorAgent(MultimodalConversableAgent):
 class OrchestratorUserProxyAgent(MultimodalConversableAgent):
    """(In preview) A proxy agent for the captain agent, that can execute code and provide feedback to the other agents."""

-    DEFAULT_AUTO_REPLY = "I'm a proxy and I can only execute your tool or end the conversation. If you think the problem is solved, please reply me only with 'TERMINATE'. If you think the task is impossible to solve, please reply me only with 'INFEASIBLE'."
+    DEFAULT_AUTO_REPLY = "Thank you! Note that the user's task is: {user_instruction}. Please continue the task. If you think the everything is solved, please reply me only with 'TERMINATE'. But once you think the task is impossible to solve, please reply me only with 'INFEASIBLE'."

    DEFAULT_USER_PROXY_AGENT_DESCRIPTIONS = {
        "ALWAYS": "An attentive HUMAN user who can answer questions about the task, and can perform tasks such as running Python code or inputting command line commands at a Linux terminal and reporting back the execution results.",
@@ -156,6 +158,9 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
        coding_max_steps: int = 30,
        history_save_dir: str = "",
        llm_model: str = "o4-mini",
+        region: str = "us-east-1",
+        client_password: str = "",
+        user_instruction: str = "",
    ):
        description = (
            description if description is not None else self.DEFAULT_USER_PROXY_AGENT_DESCRIPTIONS[human_input_mode]
@@ -168,7 +173,7 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            human_input_mode=human_input_mode,
            code_execution_config=code_execution_config,
            llm_config=llm_config,
-            default_auto_reply=default_auto_reply,
+            default_auto_reply=default_auto_reply.format(user_instruction=user_instruction),
            description=description,
        )
        self.register_function(
@@ -183,14 +188,27 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            "sleep_after_execution": sleep_after_execution,
            "truncate_history_inputs": truncate_history_inputs,
        }
+        self.region = region
+        self.client_password = client_password
+
+        from desktop_env.providers.aws.manager import IMAGE_ID_MAP
+        screen_size = (screen_width, screen_height)
+        ami_id = IMAGE_ID_MAP[region].get(screen_size, IMAGE_ID_MAP[region][(1920, 1080)])
+
        self.env = DesktopEnv(
            path_to_vm=path_to_vm,
+            action_space="pyautogui",
            provider_name=provider_name,
            os_type="Ubuntu",
-            action_space="pyautogui",
-            snapshot_name="init_state",
+            region=region,
+            snapshot_name=ami_id,
+            screen_size=screen_size,
+            headless=True,
            require_a11y_tree=observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
+            enable_proxy=True,
+            client_password=client_password
        )
+
        self.history_save_dir = history_save_dir
        self.cua_call_count = 0
        self.coding_call_count = 0
@@ -216,7 +234,9 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
                                                   screen_width=screen_width,
                                                   screen_height=screen_height,
                                                   sleep_after_execution=self.cua_config["sleep_after_execution"],
-                                                   truncate_history_inputs=self.cua_config["truncate_history_inputs"])
+                                                   truncate_history_inputs=self.cua_config["truncate_history_inputs"],
+                                                   client_password=self.client_password
+                                                   )
            screenshot = self.env.controller.get_screenshot()

            with open(os.path.join(cua_path, "history_inputs.json"), "w") as f:
@@ -234,6 +254,8 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            result = result.replace("TERMINATE", "").strip()
            if result == "":
                result = "Task completed. Please check the screenshot."
+        elif "IDK" in result:
+            result = result.replace("IDK", "").strip()
        else:
            result = f"I didn't complete the task and I have to go. Now I'm working on \"{result}\", please check the current screenshot."
        return f"# Response from GUI agent: {result}<img data:image/png;base64,{base64.b64encode(screenshot).decode('utf-8')}>"
@@ -246,7 +268,7 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
            coding_agent = MultimodalConversableAgent(
                name="coding_agent",
                llm_config=LLMConfig(api_type="openai", model=self.llm_model),
-                system_message=CODER_SYSTEM_MESSAGE,
+                system_message=CODER_SYSTEM_MESSAGE.format(CLIENT_PASSWORD=self.client_password),
            )
            code_interpreter = TerminalProxyAgent(
                name="code_interpreter",