diff --git a/mm_agents/coact/coding_agent.py b/mm_agents/coact/coding_agent.py
index f180bf4..cea9c8c 100644
--- a/mm_agents/coact/coding_agent.py
+++ b/mm_agents/coact/coding_agent.py
@@ -7,22 +7,20 @@ from .autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalCo
CODER_SYSTEM_MESSAGE = """# Your role
-You are a coding assistant, you need to solve a task step-by-step given by the user.
-You can write code in ```bash...``` code blocks for bash scripts, and ```python...``` code blocks for python code.
-
-# Important notes
-- Once you complete the task, reply ONLY with "TERMINATE" to end the task.
-- DO NOT mix the TERMINATE with any other words or code blocks in your reply.
-- When you write code, you must identify the language (whether it is python or bash) of the code.
+- You are a programmer, you need to solve a task step-by-step given by the user.
+- You can write code in ```bash...``` code blocks for bash scripts, and ```python...``` code blocks for python code.
- Your linux username is "user".
-- Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you.
- If you want to use sudo, follow the format: "echo {CLIENT_PASSWORD} | sudo -S [YOUR COMMANDS]" (no quotes for the word "{CLIENT_PASSWORD}").
-- Ignore the error: "sudo: /etc/sudoers.d is world writable".
-- Your python code will be sent line-by-line into a interactive python terminal. Do not include __main__ in your code.
-- When import a package, you need to check if the package is installed. If not, you need to install it yourself.
+
+# Requirements
+- You MUST verify the result before saving the changes.
+- When you write code, you must identify the language (whether it is python or bash) of the code.
+- Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you.
+- Do not include __main__ in your python code.
+- When you modify a spreadsheet, **make sure every value is in the expected cell**.
+- When importing a package, you need to check if the package has been installed. If not, you need to install it yourself.
- You need to print the progressive and final result.
- If you met execution error, you need to analyze the error message and try to fix the error.
-- IMPORTANT: If you modified a file like spreadsheet, you should close and reopen the file by operating the GUI, so that I can see what you changed.
"""
class TerminalProxyAgent(MultimodalConversableAgent):
@@ -62,15 +60,15 @@ class TerminalProxyAgent(MultimodalConversableAgent):
exitcode = 0
logs = output_dict["output"]
else:
- exitcode = -1
+ exitcode = 0
logs = output_dict["output"]
elif lang in PYTHON_VARIANTS:
output_dict = self.env.controller.run_python_script(code)
if output_dict["status"] == "error":
- exitcode = -1
+ exitcode = 0
logs = output_dict["output"]
else:
- exitcode = -1
+ exitcode = 0
logs = output_dict["message"]
else:
exitcode = -1
diff --git a/mm_agents/coact/cua_agent.py b/mm_agents/coact/cua_agent.py
index c0c6522..33848eb 100644
--- a/mm_agents/coact/cua_agent.py
+++ b/mm_agents/coact/cua_agent.py
@@ -19,10 +19,11 @@ PROMPT_TEMPLATE = """# Task
# Hints
- Sudo password is "{CLIENT_PASSWORD}".
-- If you meet "Authentication required" dialog, enter the "{CLIENT_PASSWORD}" to continue.
-- Do not close the any application or window or tab that is already opened.
-- Do not close the window at the end of the task.
+- Keep the windows/applications opened at the end of the task.
+- Do not use a shortcut to reload an application (except for the browser); just close and reopen it.
+- If "The document has been changed by others" pops up, you should click "cancel" and reopen the file.
- If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'.
+- If you don't know how to continue the task, reply with your concern or question along with 'IDK'.
""".strip()
DEFAULT_REPLY = "Please continue the user task. If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'."
@@ -118,7 +119,9 @@ def call_openai_cua(client: OpenAI,
"environment": environment,
}],
input=history_inputs,
- reasoning={"summary": "concise"},
+ reasoning={
+ "summary": "concise"
+ },
tool_choice="required",
truncation="auto",
)
@@ -205,6 +208,10 @@ def run_cua(
reasoning = "My thinking process\n" + "\n- ".join(reasoning_list) + '\nPlease check the screenshot and see if it fulfills your requirements.'
breakflag = True
break
+ if 'IDK' in o.content[0].text:
+ reasoning = f"{o.content[0].text}. I don't know how to complete the task. Please check the current screenshot."
+ breakflag = True
+ break
try:
json.loads(o.content[0].text)
history_inputs.pop(len(history_inputs) - len(response.output) + i)
diff --git a/mm_agents/coact/operator_agent.py b/mm_agents/coact/operator_agent.py
index e300ffd..9779df0 100644
--- a/mm_agents/coact/operator_agent.py
+++ b/mm_agents/coact/operator_agent.py
@@ -23,13 +23,13 @@ class OrchestratorAgent(MultimodalConversableAgent):
"type": "function",
"function": {
"name": "call_gui_agent",
- "description": """Let a GUI agent to solve a task. GUI agent can operate the computer by clicking and typing. Require detailed task description.""",
+ "description": """Let a OS Operator to solve a task. OS operator can operate the computer by clicking and typing (not accurate in dense UI). Require detailed task description.""",
"parameters": {
"type": "object",
"properties": {
"task": {
"type": "string",
- "description": "[REQUIRED] A detailed task to be solved.",
+ "description": "[REQUIRED] A detailed task to be solved with step-by-step guidance.",
},
},
},
@@ -40,17 +40,17 @@ class OrchestratorAgent(MultimodalConversableAgent):
"type": "function",
"function": {
"name": "call_coding_agent",
- "description": """Let a coding agent to solve a task. Coding agent can write python and bash code with many tools to solve a task. Especially good for file (like spreadsheet) operation. Require detailed task and environment description.""",
+ "description": """(You MUST use this first) Let a programmer to solve a task. Coding agent can write python and bash code with many tools to solve a task. Require detailed task and environment description.""",
"parameters": {
"type": "object",
"properties": {
"task": {
"type": "string",
- "description": "[REQUIRED] A detailed task to be solved. The task should be a coding task.",
+ "description": "[REQUIRED] A detailed task to be solved.",
},
"environment": {
"type": "string",
- "description": "[REQUIRED] The environment description of the coding agent. It should be a detailed description of the system state, including the current directory, the opened files, the running processes, etc.",
+ "description": "[REQUIRED] The environment description of the coding agent. It should be a detailed description of the system state, including the opened files, the running processes, etc.",
}
},
},
@@ -101,6 +101,8 @@ class OrchestratorAgent(MultimodalConversableAgent):
if system_message is None:
self.update_system_message("")
+ else:
+ self.update_system_message(system_message)
self.update_tool_signature(self.CALL_CODING_AGENT_TOOL, is_remove=False)
self.update_tool_signature(self.CALL_GUI_AGENT_TOOL, is_remove=False)
@@ -110,7 +112,7 @@ class OrchestratorAgent(MultimodalConversableAgent):
class OrchestratorUserProxyAgent(MultimodalConversableAgent):
"""(In preview) A proxy agent for the captain agent, that can execute code and provide feedback to the other agents."""
- DEFAULT_AUTO_REPLY = "I'm a proxy and I can only execute your tool or end the conversation. If you think the problem is solved, please reply me only with 'TERMINATE'. If you think the task is impossible to solve, please reply me only with 'INFEASIBLE'."
+ DEFAULT_AUTO_REPLY = "Thank you! Note that the user's task is: {user_instruction}. Please continue the task. If you think everything is solved, please reply to me only with 'TERMINATE'. But once you think the task is impossible to solve, please reply to me only with 'INFEASIBLE'."
DEFAULT_USER_PROXY_AGENT_DESCRIPTIONS = {
"ALWAYS": "An attentive HUMAN user who can answer questions about the task, and can perform tasks such as running Python code or inputting command line commands at a Linux terminal and reporting back the execution results.",
@@ -156,6 +158,9 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
coding_max_steps: int = 30,
history_save_dir: str = "",
llm_model: str = "o4-mini",
+ region: str = "us-east-1",
+ client_password: str = "",
+ user_instruction: str = "",
):
description = (
description if description is not None else self.DEFAULT_USER_PROXY_AGENT_DESCRIPTIONS[human_input_mode]
@@ -168,7 +173,7 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
human_input_mode=human_input_mode,
code_execution_config=code_execution_config,
llm_config=llm_config,
- default_auto_reply=default_auto_reply,
+ default_auto_reply=default_auto_reply.format(user_instruction=user_instruction),
description=description,
)
self.register_function(
@@ -183,14 +188,27 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
"sleep_after_execution": sleep_after_execution,
"truncate_history_inputs": truncate_history_inputs,
}
+ self.region = region
+ self.client_password = client_password
+
+ from desktop_env.providers.aws.manager import IMAGE_ID_MAP
+ screen_size = (screen_width, screen_height)
+ ami_id = IMAGE_ID_MAP[region].get(screen_size, IMAGE_ID_MAP[region][(1920, 1080)])
+
self.env = DesktopEnv(
path_to_vm=path_to_vm,
+ action_space="pyautogui",
provider_name=provider_name,
os_type="Ubuntu",
- action_space="pyautogui",
- snapshot_name="init_state",
+ region=region,
+ snapshot_name=ami_id,
+ screen_size=screen_size,
+ headless=True,
require_a11y_tree=observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
+ enable_proxy=True,
+ client_password=client_password
)
+
self.history_save_dir = history_save_dir
self.cua_call_count = 0
self.coding_call_count = 0
@@ -216,7 +234,9 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
screen_width=screen_width,
screen_height=screen_height,
sleep_after_execution=self.cua_config["sleep_after_execution"],
- truncate_history_inputs=self.cua_config["truncate_history_inputs"])
+ truncate_history_inputs=self.cua_config["truncate_history_inputs"],
+ client_password=self.client_password
+ )
screenshot = self.env.controller.get_screenshot()
with open(os.path.join(cua_path, "history_inputs.json"), "w") as f:
@@ -234,6 +254,8 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
result = result.replace("TERMINATE", "").strip()
if result == "":
result = "Task completed. Please check the screenshot."
+ elif "IDK" in result:
+ result = result.replace("IDK", "").strip()
else:
result = f"I didn't complete the task and I have to go. Now I'm working on \"{result}\", please check the current screenshot."
return f"# Response from GUI agent: {result}\n"
@@ -246,7 +268,7 @@ class OrchestratorUserProxyAgent(MultimodalConversableAgent):
coding_agent = MultimodalConversableAgent(
name="coding_agent",
llm_config=LLMConfig(api_type="openai", model=self.llm_model),
- system_message=CODER_SYSTEM_MESSAGE,
+ system_message=CODER_SYSTEM_MESSAGE.format(CLIENT_PASSWORD=self.client_password),
)
code_interpreter = TerminalProxyAgent(
name="code_interpreter",
diff --git a/run_coact.py b/run_coact.py
index 79b35cd..1461b19 100644
--- a/run_coact.py
+++ b/run_coact.py
@@ -6,6 +6,7 @@ import shutil
import traceback
from typing import Dict, List
import json
+import time
import os
from mm_agents.coact.operator_agent import OrchestratorAgent, OrchestratorUserProxyAgent
from mm_agents.coact.autogen import LLMConfig
@@ -16,30 +17,34 @@ import sys
TASK_DESCRIPTION = """# Your role
-You are a task solver, you need to try your best to complete a computer-using task step-by-step.
-- Based on the task description AND the screenshot, provide a detailed plan. The screenshot includes the current state of the computer, and it includes a lot of hints.
-- Do not do anything else out of the user's instruction, like discover the file out of the user specific location or imagine the conditions. This will affect the judgement of the infeasible task.
-- When you see the interactable element in the screenshot is dense (like a spreadsheet is opening), you MUST try coding agent first.
-- When you let coding agent to modify an existing file, you MUST let it check the file content first.
-- After coding agent is done, you MUST check the final result carefully either by looking into the screenshot or by GUI agent and make sure EVERY value is in the desired position (e.g., the required cells in the spreadsheet are filled).
-- When you call GUI agent, it will have a **20-step** budget to complete your task. Each step is a one-time interaction with OS like mouse click or keyboard typing. Please take this into account when you plan the actions.
-- Remember to save the file (if applicable) before completing the task.
+You are a task solver, you need to complete a computer-using task step-by-step.
+1. Describe the screenshot.
+2. Provide a detailed plan, including a list of user requirements like specific file name, file path, etc.
+3. Follow the following instructions and complete the task with your skills.
+ - If you think the task is impossible to complete (no file, wrong environment, etc.), reply with "INFEASIBLE" to end the conversation.
+ - **Do not** do (or let the coding/GUI agent do) anything beyond the user's instruction, such as changing the file name. This will make the task fail.
+ - Check every screenshot carefully and see if it fulfills the task requirement.
+ - You MUST try the Coding Agent first for file operation tasks like spreadsheet modification.
+4. Verify the result and see if it fulfills the user's requirement.
-# Your skills
+# Your helpers
You can use the following tools to solve the task. You can only call one of gui agent or coding agent per reply:
-- call_coding_agent: Let a coding agent to solve a task. Coding agent can write python or bash code to modify everything on the computer. It requires a environment description and a detailed task description.
-- call_gui_agent: Let a GUI agent to solve a task. GUI agent can operate the computer by clicking and typing (not that accurate). It will have a **20-step** budget to complete your task. Require a detailed task description.
-# About the task
-- Check every screenshot carefully and see if it fulfills the task requirement.
-- If the task is completed, reply with "TERMINATE" to end the conversation.
-- If you think the task is impossible to complete (no file, wrong environment, etc.), reply with "INFEASIBLE" to end the conversation.
-- TERMINATE and INFEASIBLE are used to determine if the task is completed. Therefore, do not use it in your response unless the task is completed.
+## Programmer
+Let a programmer solve a subtask you assign.
+The Programmer can write python or bash code to modify almost everything in the computer, like files, apps, system settings, etc.
+It requires an environment description and a detailed task description. Be as detailed as possible.
+It can use any python package you instruct it to use.
+It will return a summary with the output of the code.
+When you let the Programmer modify a spreadsheet, after the task is completed, you MUST have the GUI Operator make sure EVERY modified value in the spreadsheet is in the desired position (e.g., filled in the expected cell).
+After that, if anything is wrong, tell the programmer to modify it.
-# User task
-{instruction}
-Please first check carefully if my task is possible to complete. If not, reply with "INFEASIBLE".
-If possible to complete, please complete this task on my computer. I will not provide further information to you.
+## GUI Operator
+Let a GUI agent solve a subtask you assign.
+The GUI agent can operate the computer by clicking and typing (but not accurately).
+It requires a detailed task description.
+When you call the GUI agent, it will only have a **20-step** budget to complete your task. Each step is a one-time interaction with the OS, like a mouse click or keyboard typing. Please take this into account when you plan the actions.
+If you let the GUI Operator check the result, you MUST let it close and reopen the file, because the programmer's result will NOT be updated on the screen.
"""
@@ -50,22 +55,22 @@ def config() -> argparse.Namespace:
# environment config
parser.add_argument("--path_to_vm", type=str, default=None)
- parser.add_argument("--provider_name", type=str, default="docker")
+ parser.add_argument("--provider_name", type=str, default="aws")
parser.add_argument("--screen_width", type=int, default=1920)
parser.add_argument("--screen_height", type=int, default=1080)
parser.add_argument("--sleep_after_execution", type=float, default=0.5)
parser.add_argument("--region", type=str, default="us-east-1")
- parser.add_argument("--client_password", type=str, default="")
+ parser.add_argument("--client_password", type=str, default="osworld-public-evaluation")
# agent config
- parser.add_argument("--oai_config_path", type=str, default="OAI_CONFIG_LIST")
+ parser.add_argument("--oai_config_path", type=str, default="/home/ubuntu/OSWorld/mm_agents/coact/OAI_CONFIG_LIST")
parser.add_argument("--orchestrator_model", type=str, default="o3")
parser.add_argument("--coding_model", type=str, default="o4-mini")
parser.add_argument("--cua_model", type=str, default="computer-use-preview")
parser.add_argument("--orchestrator_max_steps", type=int, default=15)
parser.add_argument("--coding_max_steps", type=int, default=20)
parser.add_argument("--cua_max_steps", type=int, default=25)
- parser.add_argument("--cut_off_steps", type=int, default=150)
+ parser.add_argument("--cut_off_steps", type=int, default=200)
# example config
parser.add_argument("--domain", type=str, default="all")
@@ -77,7 +82,7 @@ def config() -> argparse.Namespace:
)
# logging related
- parser.add_argument("--result_dir", type=str, default="./results")
+ parser.add_argument("--result_dir", type=str, default="./results_coact")
parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel")
parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
default='INFO', help="Set the logging level")
@@ -158,10 +163,11 @@ def process_task(task_info,
with llm_config:
orchestrator = OrchestratorAgent(
name="orchestrator",
+ system_message=TASK_DESCRIPTION
)
orchestrator_proxy = OrchestratorUserProxyAgent(
name="orchestrator_proxy",
- is_termination_msg=lambda x: x.get("content", "") and ("terminate" in x.get("content", "")[0]["text"].lower() or "infeasible" in x.get("content", "")[0]["text"].lower()),
+ is_termination_msg=lambda x: x.get("content", "") and (x.get("content", "")[0]["text"].lower() == "terminate" or x.get("content", "")[0]["text"].lower() == "infeasible"),
human_input_mode="NEVER",
provider_name=provider_name,
path_to_vm=path_to_vm,
@@ -175,14 +181,22 @@ def process_task(task_info,
cua_max_steps=cua_max_steps,
coding_max_steps=coding_max_steps,
region=region,
- client_password=client_password
+ client_password=client_password,
+ user_instruction=task_config["instruction"]
)
- obs = orchestrator_proxy.reset(task_config=task_config)
+ orchestrator_proxy.reset(task_config=task_config)
+ time.sleep(60)
+ screenshot = orchestrator_proxy.env.controller.get_screenshot()
+ with open(os.path.join(history_save_dir, f'initial_screenshot_orchestrator.png'), "wb") as f:
+ f.write(screenshot)
+
orchestrator_proxy.initiate_chat(
recipient=orchestrator,
- message=TASK_DESCRIPTION.format(instruction=task_config["instruction"]) + "\n",
+ message=f"""{task_config["instruction"]}
+Check my computer screenshot and describe it first. If this task is possible to complete, please complete it on my computer. If not, reply with "INFEASIBLE" to end the conversation.
+I will not provide further information to you.""" + "\n",
max_turns=orchestrator_max_steps
)