From 27319ce1e359fecf530d3e23619a46d609f178db Mon Sep 17 00:00:00 2001 From: Yuan Mengqi <100453613+yuanmengqi@users.noreply.github.com> Date: Sun, 13 Jul 2025 00:25:37 +0800 Subject: [PATCH 1/5] fix password&resolution (#251) * fix chrome * fix: fix proxy setup * feat&fix: add proxy support in setup and remove hardcoded proxy from example * fix tasks * fix chrome finished * fix * clean chrome_fix code * clean chrome_fix code * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix multiapps * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix some multi_apps tasks * fix some multi_apps tasks * fix password&resolution * fix password&resolution --------- Co-authored-by: adlsdztony --- desktop_env/controllers/setup.py | 24 ++++++++++++++++++- desktop_env/desktop_env.py | 2 +- desktop_env/providers/aws/manager.py | 15 ++++++++---- .../21760ecb-8f62-40d2-8d85-0cee5725cb72.json | 2 +- .../550ce7e7-747b-495f-b122-acdc4d0b8e54.json | 2 +- .../a669ef01-ded5-4099-9ea9-25e99b569840.json | 2 +- .../ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json | 2 +- .../02ce9a50-7af2-47ed-8596-af0c230501f8.json | 2 +- .../36037439-2044-4b50-b9d1-875b5a332143.json | 2 +- .../48d05431-6cd5-4e76-82eb-12b60d823f7d.json | 2 +- .../a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json | 2 +- .../b337d106-053f-4d37-8da0-7f9c4043a66b.json | 2 +- .../13584542-872b-42d8-b299-866967b5c3ef.json | 2 +- .../23393935-50c7-4a86-aeea-2b78fd089c5c.json | 2 +- .../28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json | 2 +- .../37887e8c-da15-4192-923c-08fa390a176d.json | 2 +- .../3ce045a0-877b-42aa-8d2c-b4a863336ab8.json | 2 +- .../4783cc41-c03c-4e1b-89b4-50658f642bd5.json | 2 +- .../4d117223-a354-47fb-8b45-62ab1390a95f.json | 2 +- .../5c1075ca-bb34-46a3-a7a0-029bd7463e79.json | 2 +- .../5ced85fc-fa1a-4217-95fd-0fb530545ce2.json | 2 +- .../5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json | 2 +- .../6f56bf42-85b8-4fbb-8e06-6c44960184ba.json | 2 +- .../94d95f96-9699-4208-98ba-3c3119edf9c2.json | 2 +- 
.../a462a795-fdc7-4b23-b689-e8b6df786b78.json | 2 +- .../a4d98375-215b-4a4d-aee9-3d4370fccc41.json | 2 +- .../b6781586-6346-41cd-935a-a6b1487918fc.json | 2 +- .../bedcedc4-4d72-425e-ad62-21960b11fe0d.json | 2 +- .../e0df059f-28a6-4169-924f-b9623e7184cc.json | 2 +- .../ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json | 2 +- .../f9be0997-4b7c-45c5-b05c-4612b44a6118.json | 2 +- .../215dfd39-f493-4bc3-a027-8a97d72c61bf.json | 2 +- .../386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json | 2 +- .../59f21cfb-0120-4326-b255-a5b827b38967.json | 2 +- .../8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json | 2 +- .../8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json | 2 +- .../8f080098-ddb1-424c-b438-4e96e5e4786e.json | 2 +- .../9195653c-f4aa-453d-aa95-787f6ccfaae9.json | 2 +- .../a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json | 2 +- .../aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json | 2 +- .../bba3381f-b5eb-4439-bd9e-80c22218d5a7.json | 2 +- .../d06f0d4d-2cd5-4ede-8de9-598629438c6e.json | 2 +- .../efcf0d81-0835-4880-b2fd-d866e8bc2294.json | 2 +- .../f3977615-2b45-4ac5-8bba-80c17dbe2a37.json | 2 +- .../fba2c100-79e8-42df-ae74-b592418d54f4.json | 2 +- mm_agents/openai_cua_agent.py | 6 ++--- run_multienv_openaicua.py | 14 +++++------ 47 files changed, 86 insertions(+), 59 deletions(-) diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index 140a0a0..83f9212 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -27,7 +27,13 @@ import dotenv # Load environment variables from .env file dotenv.load_dotenv() -CLIENT_PASSWORD = os.getenv("CLIENT_PASSWORD", "osworld-public-evaluation") # Default password for sudo operations +if os.environ.get("PROVIDER_NAME") == "aws": + os.environ["CLIENT_PASSWORD"] = os.environ.get("CLIENT_PASSWORD_AWS", "osworld-public-evaluation") +else: + os.environ["CLIENT_PASSWORD"] = os.environ.get("CLIENT_PASSWORD", "password") + +CLIENT_PASSWORD = os.environ["CLIENT_PASSWORD"] + PROXY_CONFIG_FILE = os.getenv("PROXY_CONFIG_FILE", 
"evaluation_examples/settings/proxy/dataimpulse.json") # Default proxy config file logger = logging.getLogger("desktopenv.setup") @@ -298,6 +304,22 @@ class SetupController: terminates: bool = False nb_failings = 0 + def replace_screen_env_in_command(command_list): + width = int(os.environ.get("SCREEN_WIDTH", 1920)) + height = int(os.environ.get("SCREEN_HEIGHT", 1080)) + width_half = str(width // 2) + height_half = str(height // 2) + new_command_list = [] + for item in command_list: + if isinstance(item, str): + item = item.replace("{SCREEN_WIDTH_HALF}", width_half) + item = item.replace("{SCREEN_HEIGHT_HALF}", height_half) + item = item.replace("{SCREEN_WIDTH}", str(width)) + item = item.replace("{SCREEN_HEIGHT}", str(height)) + new_command_list.append(item) + return new_command_list + if isinstance(command, list): + command = replace_screen_env_in_command(command) payload = json.dumps({"command": command, "shell": shell}) headers = {"Content-Type": "application/json"} diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py index be817c9..1ade6fe7 100644 --- a/desktop_env/desktop_env.py +++ b/desktop_env/desktop_env.py @@ -32,7 +32,7 @@ class DesktopEnv(gym.Env): snapshot_name: str = "init_state", action_space: str = "computer_13", cache_dir: str = "cache", - screen_size: Tuple[int] = (1920, 1080), + screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))), headless: bool = False, require_a11y_tree: bool = True, require_terminal: bool = False, diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 287327d..4b53e1f 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -36,15 +36,22 @@ DEFAULT_REGION = "us-east-1" # todo: Add doc for the configuration of image, security group and network interface # todo: public the AMI images IMAGE_ID_MAP = { - "us-east-1": "ami-09138bff939f82bd8", - "ap-east-1": 
"ami-0c092a5b8be4116f5", + "us-east-1": { + (1920, 1080): "ami-09138bff939f82bd8" + }, + "ap-east-1": { + (1920, 1080): "ami-0c092a5b8be4116f5" + } } -def _allocate_vm(region=DEFAULT_REGION): +def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)): if region not in IMAGE_ID_MAP: raise ValueError(f"Region {region} is not supported. Supported regions are: {list(IMAGE_ID_MAP.keys())}") + if screen_size not in IMAGE_ID_MAP[region]: + raise ValueError(f"Screen size {screen_size} not supported for region {region}. Supported: {list(IMAGE_ID_MAP[region].keys())}") + ami_id = IMAGE_ID_MAP[region][screen_size] ec2_client = boto3.client('ec2', region_name=region) instance_id = None @@ -86,7 +93,7 @@ def _allocate_vm(region=DEFAULT_REGION): run_instances_params = { "MaxCount": 1, "MinCount": 1, - "ImageId": IMAGE_ID_MAP[region], + "ImageId": ami_id, "InstanceType": INSTANCE_TYPE, "EbsOptimized": True, "NetworkInterfaces": [ diff --git a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json index 82a830b..57e1122 100644 --- a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json +++ b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(4); pyautogui.doubleClick(x=960, y=540); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);" + "import pyautogui; import time; time.sleep(4); pyautogui.doubleClick(x={SCREEN_WIDTH_HALF}, y={SCREEN_HEIGHT_HALF}); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json index 9080319..d4e204f 100644 
--- a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json +++ b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(4); pyautogui.doubleClick(x=200, y=650); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);" + "import pyautogui; import time; time.sleep(4); pyautogui.click(170, 250); time.sleep(1);pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); " ] } } diff --git a/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json b/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json index a628b82..afbc6b5 100644 --- a/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json +++ b/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json @@ -27,7 +27,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json b/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json index 9bd9603..a12b904 100644 --- a/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json +++ b/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json @@ -19,7 +19,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(5); pyautogui.click(960, 540); time.sleep(5); pyautogui.press('esc'); time.sleep(0.3); pyautogui.press('f10'); time.sleep(0.3); pyautogui.press('right', 
presses=2, interval=0.1); time.sleep(0.3); pyautogui.press('down', presses=11, interval=0.1); pyautogui.press('enter')" + "import pyautogui; import time; time.sleep(5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(5); pyautogui.press('esc'); time.sleep(0.3); pyautogui.press('f10'); time.sleep(0.3); pyautogui.press('right', presses=2, interval=0.1); time.sleep(0.3); pyautogui.press('down', presses=11, interval=0.1); pyautogui.press('enter')" ] } } diff --git a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json index 329f223..23f71af 100644 --- a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json +++ b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-20)" + "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-20)" ] } } diff --git a/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json b/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json index 247c8f0..f858f39 100644 --- a/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json +++ b/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-40)" + "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-40)" ] } } diff 
--git a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json index bf3f492..a30dc07 100644 --- a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json +++ b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json @@ -29,7 +29,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json index a3ad108..37af93b 100644 --- a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json +++ b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json @@ -58,7 +58,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-20)" + "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-20)" ] } } diff --git a/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json b/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json index a2c7154..e27c1ef 100644 --- a/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json +++ b/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json @@ -29,7 +29,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; 
time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json b/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json index 48256d3..6cb0215 100644 --- a/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json +++ b/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json b/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json index e70da1a..0ac6801 100644 --- a/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json +++ b/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json @@ -87,7 +87,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json b/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json index 83184b6..0e7c261 100644 --- a/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json +++ b/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; 
import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json b/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json index 207cbf0..6e6b8a5 100644 --- a/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json +++ b/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json @@ -35,7 +35,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json b/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json index c29d3f8..b44f697 100644 --- a/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json +++ b/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json b/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json index 298cf81..98f82f9 100644 --- a/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json +++ b/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, 
{SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } }, diff --git a/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json b/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json index 282f755..63293fd 100644 --- a/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json +++ b/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json @@ -36,7 +36,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json b/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json index ed99498..810109e 100644 --- a/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json +++ b/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json @@ -53,7 +53,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json b/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json index fad4457..2b00214 100644 --- a/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json +++ b/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json @@ -10,7 +10,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); 
pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json index 9fb58af..5da9b3d 100644 --- a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json +++ b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json @@ -29,7 +29,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json b/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json index bf9286d..5d574a1 100644 --- a/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json +++ b/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json @@ -36,7 +36,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json b/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json index dfbb050..747f26c 100644 --- a/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json +++ b/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import 
pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json b/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json index 38223a6..c6bafd5 100644 --- a/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json +++ b/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json b/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json index 3868a5d..d473b75 100644 --- a/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json +++ b/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json @@ -23,7 +23,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json b/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json index f1879c1..2d1de91 100644 --- a/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json +++ b/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git 
a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json index 7ce0048..48e233c 100644 --- a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json +++ b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json @@ -23,7 +23,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json index 9c3234f..94b1a4f 100644 --- a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json +++ b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json @@ -18,7 +18,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json b/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json index 71d86fc..fe5496c 100644 --- a/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json +++ b/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json @@ -30,7 +30,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json b/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json index cda3ca2..94ebacb 100644 --- 
a/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json +++ b/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json b/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json index bb09d9d..aabaa0f 100644 --- a/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json +++ b/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json b/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json index 261c16e..14eaafa 100644 --- a/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json +++ b/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json @@ -68,7 +68,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json index af7d904..650f586 100644 --- a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json +++ b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json @@ -28,7 +28,7 @@ "command": [ "python", 
"-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json index aa3057e..35f4e58 100644 --- a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json +++ b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json b/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json index c398268..0bd5ed7 100644 --- a/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json +++ b/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json index 96641a3..01e886b 100644 --- a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json +++ b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" 
] } } diff --git a/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json b/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json index a82831c..ec512fe 100644 --- a/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json +++ b/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json b/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json index 81fde78..36e6d0f 100644 --- a/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json +++ b/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json index 58eaf58..55290dd 100644 --- a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json +++ b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json index f4b85b6..8f2e9c7 
100644 --- a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json +++ b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json b/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json index b36cba9..ca63fd5 100644 --- a/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json +++ b/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json b/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json index a5f6367..0019cd6 100644 --- a/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json +++ b/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json index 4261d38..73bbac2 100644 --- a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json +++ b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json @@ -17,7 +17,7 @@ 
"command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json b/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json index 7f383fc..bde2b5e 100644 --- a/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json +++ b/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json @@ -40,7 +40,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(500, 500); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index 15db312..064a2d8 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -33,7 +33,7 @@ class_ns_windows = "https://accessibility.windows.example.org/ns/class" import ast from typing import Dict, Any, Optional, Union -OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"osworld-public-evaluation\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. 
I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ +OPERATOR_PROMPT = f"""\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{os.environ["CLIENT_PASSWORD"]}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. 
Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ class Action: """Action class for the agent.""" @@ -233,8 +233,8 @@ class OpenAICUAAgent: self.tools = [{ "type": "computer_use_preview", - "display_width": 1920, - "display_height": 1080, + "display_width": int(os.environ["SCREEN_WIDTH"]), + "display_height": int(os.environ["SCREEN_HEIGHT"]), "environment": "linux" if platform == "ubuntu" else "windows" }] diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py index 278ebec..6e9bca3 100644 --- a/run_multienv_openaicua.py +++ b/run_multienv_openaicua.py @@ -47,8 +47,6 @@ def config() -> argparse.Namespace: default="screenshot", help="Observation type", ) - parser.add_argument("--screen_width", type=int, default=1920) - parser.add_argument("--screen_height", type=int, default=1080) parser.add_argument("--sleep_after_execution", type=float, default=0.0) parser.add_argument("--max_steps", type=int, default=15) @@ -181,16 +179,16 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share signal.signal(signal.SIGTERM, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) from desktop_env.providers.aws.manager import IMAGE_ID_MAP - REGION = "us-east-1" + REGION = args.region + screen_size = (int(os.environ["SCREEN_WIDTH"]), int(os.environ["SCREEN_HEIGHT"])) + ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) env = DesktopEnv( path_to_vm=args.path_to_vm, action_space=args.action_space, - - provider_name="aws", + provider_name=os.environ["PROVIDER_NAME"], region=REGION, - snapshot_name=IMAGE_ID_MAP[REGION], - - screen_size=(args.screen_width, args.screen_height), + snapshot_name=ami_id, + screen_size=screen_size, headless=args.headless, os_type="Ubuntu", require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", 
"som"], From 877e75a013a1e11b59161f7e20bf0844cf3b419f Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sat, 12 Jul 2025 16:34:55 +0000 Subject: [PATCH 2/5] Final review multi_apps fix Xinzhuang part --- desktop_env/evaluators/getters/chrome.py | 90 +++++++++++++++++-- desktop_env/evaluators/metrics/table.py | 63 ++++++++++--- .../2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json | 2 +- .../3680a5ee-6870-426a-a997-eba929a0d25c.json | 15 +++- .../42d25c08-fb87-4927-8b65-93631280a26f.json | 51 ++++++++--- 5 files changed, 187 insertions(+), 34 deletions(-) diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index bff1b92..724e2de 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -52,6 +52,11 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: - attribute (str): optional for 'attribute' and 'click_and_attribute', the attribute to be extracted. - backups (Any): The backup information to be returned if the extraction fails. 
""" + # 添加函数开始日志 + logger.info(f"[INFO_FROM_WEBSITE] Starting to get information from website: {config.get('url', 'N/A')}") + logger.info(f"[INFO_FROM_WEBSITE] Total info operations to perform: {len(config.get('infos', []))}") + logger.debug(f"[INFO_FROM_WEBSITE] Full config: {config}") + try: host = env.vm_ip port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file @@ -59,11 +64,18 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: remote_debugging_url = f"http://{host}:{port}" backend_url = f"http://{host}:{server_port}" use_proxy = env.current_use_proxy + + logger.info(f"[INFO_FROM_WEBSITE] Connecting to Chrome at {remote_debugging_url}") + with sync_playwright() as p: # connect to remote Chrome instance try: browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to existing Chrome instance") except Exception as e: + logger.warning(f"[INFO_FROM_WEBSITE] Failed to connect to existing Chrome instance: {e}") + logger.info(f"[INFO_FROM_WEBSITE] Starting new Chrome instance...") + # If the connection fails (e.g., the agent close the browser instance), start a new browser instance app = 'chromium' if 'arm' in platform.machine() else 'google-chrome' command = [ @@ -72,52 +84,116 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: ] if use_proxy: command.append(f"--proxy-server=127.0.0.1:18888") + logger.info(f"[INFO_FROM_WEBSITE] Using proxy server: 127.0.0.1:18888") + + logger.info(f"[INFO_FROM_WEBSITE] Starting browser with command: {' '.join(command)}") payload = json.dumps({"command": command, "shell": False}) headers = {"Content-Type": "application/json"} #requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) requests.post(backend_url + "/setup" + "/launch", headers=headers, data=payload) time.sleep(5) browser = p.chromium.connect_over_cdp(remote_debugging_url) + 
logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to new Chrome instance") page = browser.contexts[0].new_page() + logger.info(f"[INFO_FROM_WEBSITE] Created new page, navigating to: {config['url']}") + page.goto(config["url"]) page.wait_for_load_state('load') + + # 记录页面加载完成后的信息 + logger.info(f"[INFO_FROM_WEBSITE] Page loaded successfully") + logger.info(f"[INFO_FROM_WEBSITE] Page title: '{page.title()}'") + logger.info(f"[INFO_FROM_WEBSITE] Current URL: '{page.url}'") + infos = [] - for info_dict in config.get('infos', []): + for idx, info_dict in enumerate(config.get('infos', [])): + logger.info(f"[INFO_FROM_WEBSITE] Processing info operation {idx + 1}/{len(config.get('infos', []))}") + logger.debug(f"[INFO_FROM_WEBSITE] Info config: {info_dict}") + if page.url != config["url"]: + logger.info(f"[INFO_FROM_WEBSITE] Page URL changed, navigating back to: {config['url']}") page.goto(config["url"]) page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Back to original page") + action = info_dict.get('action', 'inner_text') + selector = info_dict.get('selector') + logger.info(f"[INFO_FROM_WEBSITE] Action: {action}, Selector: {selector}") + if action == "inner_text": + logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}") ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000) - infos.append(ele.inner_text()) + extracted_text = ele.inner_text() + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text: '{extracted_text}'") + infos.append(extracted_text) + elif action == "attribute": + attribute = info_dict.get('attribute') + logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}") + logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute: {attribute}") ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000) - infos.append(ele.get_attribute(info_dict['attribute'])) + extracted_attr = ele.get_attribute(info_dict['attribute']) + 
logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}': '{extracted_attr}'") + infos.append(extracted_attr) + elif action == 'click_and_inner_text': + logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_inner_text with {len(info_dict['selector'])} selectors") for idx, sel in enumerate(info_dict['selector']): + logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}") if idx != len(info_dict['selector']) - 1: + logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}") link = page.wait_for_selector(sel, state='attached', timeout=10000) link.click() page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded") + logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}") else: + logger.debug(f"[INFO_FROM_WEBSITE] Extracting inner_text from final element: {sel}") ele = page.wait_for_selector(sel, state='attached', timeout=10000) - infos.append(ele.inner_text()) + extracted_text = ele.inner_text() + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text after clicks: '{extracted_text}'") + infos.append(extracted_text) + elif action == 'click_and_attribute': + attribute = info_dict.get('attribute') + logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_attribute with {len(info_dict['selector'])} selectors") + logger.debug(f"[INFO_FROM_WEBSITE] Target attribute: {attribute}") for idx, sel in enumerate(info_dict['selector']): + logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}") if idx != len(info_dict['selector']) - 1: + logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}") link = page.wait_for_selector(sel, state='attached', timeout=10000) link.click() page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded") + logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}") else: + 
logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute from final element: {sel}") ele = page.wait_for_selector(sel, state='attached') - infos.append(ele.get_attribute(info_dict['attribute'])) + extracted_attr = ele.get_attribute(info_dict['attribute']) + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}' after clicks: '{extracted_attr}'") + infos.append(extracted_attr) else: + logger.error(f"[INFO_FROM_WEBSITE] Unsupported action: {action}") raise NotImplementedError(f'The action {action} is not supported yet.') + + logger.info(f"[INFO_FROM_WEBSITE] Completed info operation {idx + 1}") + + # 记录最终提取的所有信息 + logger.info(f"[INFO_FROM_WEBSITE] All operations completed successfully") + logger.info(f"[INFO_FROM_WEBSITE] Total extracted information count: {len(infos)}") + logger.info(f"[INFO_FROM_WEBSITE] Final extracted information: {infos}") + return infos except Exception as e: - logger.error(f'[ERROR]: failed to obtain information from the website: {config["url"]}. 
Use backup results instead.') - return config.get('backups', None) + logger.error(f'[INFO_FROM_WEBSITE] ERROR: Failed to obtain information from website: {config.get("url", "N/A")}') + logger.error(f'[INFO_FROM_WEBSITE] Exception details: {str(e)}') + logger.error(f'[INFO_FROM_WEBSITE] Exception type: {type(e).__name__}') + logger.info(f'[INFO_FROM_WEBSITE] Using backup results instead') + backup_data = config.get('backups', None) + logger.info(f'[INFO_FROM_WEBSITE] Backup data: {backup_data}') + return backup_data # The following ones just need to load info from the files of software, no need to connect to the software diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py index 9e888c7..db51850 100644 --- a/desktop_env/evaluators/metrics/table.py +++ b/desktop_env/evaluators/metrics/table.py @@ -463,23 +463,60 @@ def compare_table(result: str, expected: str = None, **options) -> float: # }}} function compare_table # -def compare_csv(result: str, expected: str, **options) -> float: +def compare_csv(result: str, expected: Union[str, List[str]], **options) -> float: + """ + Compare CSV files. If expected is a list, returns 1.0 if result matches any of the expected files. + + Args: + result: Path to result CSV file + expected: Path to expected CSV file or list of paths to expected CSV files + options: Additional options (strict, ignore_case) + + Returns: + 1.0 if result matches expected (or any file in expected list), 0.0 otherwise + """ if result is None: return 0. 
- with open(result) as f: - result_lines: Iterable[str] = f.read().splitlines() - with open(expected) as f: - expected_lines: Iterable[str] = f.read().splitlines() - if not options.get("strict", True): - result_lines = map(str.strip, result_lines) - expected_lines = map(str.strip, expected_lines) - if options.get("ignore_case", False): - result_lines = map(str.lower, result_lines) - expected_lines = map(str.lower, expected_lines) + try: + with open(result) as f: + result_lines: Iterable[str] = f.read().splitlines() + except (FileNotFoundError, IOError): + return 0. - metric: bool = list(result_lines) == list(expected_lines) - return float(metric) + # Convert expected to list if it's a single string (for backward compatibility) + if isinstance(expected, str): + expected_files = [expected] + else: + expected_files = expected + + # Try to match against each expected file + for expected_file in expected_files: + try: + with open(expected_file) as f: + expected_lines: Iterable[str] = f.read().splitlines() + + # Process lines based on options + current_result_lines = result_lines + current_expected_lines = expected_lines + + if not options.get("strict", True): + current_result_lines = map(str.strip, current_result_lines) + current_expected_lines = map(str.strip, current_expected_lines) + if options.get("ignore_case", False): + current_result_lines = map(str.lower, current_result_lines) + current_expected_lines = map(str.lower, current_expected_lines) + + # Check if this expected file matches + if list(current_result_lines) == list(current_expected_lines): + return 1.0 + + except (FileNotFoundError, IOError): + # If this expected file doesn't exist, continue to next one + continue + + # No match found + return 0.0 def compare_conference_city_in_order(actual_city_list_path, expected_city): diff --git a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json index 
d2c2d10..beb9bb2 100644 --- a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json +++ b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json @@ -1,7 +1,7 @@ { "id": "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", "snapshot": "multiapps", - "instruction": "I want to understand the resource usage of my Ubuntu system under normal workloads. Please use the `sar` command in the `sysstat` toolkit to monitor system activity, evaluate the status once every second for 30 seconds, output the results to \"System_Resources_Report.txt\" under Desktop.", + "instruction": "Monitor Ubuntu system resource usage using the sar command from sysstat toolkit. Collect CPU statistics every second for 30 seconds and save the output to 'System_Resources_Report.txt' on Desktop.", "source": "author", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json index 3ad3704..2cb77d1 100644 --- a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json +++ b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json @@ -37,6 +37,7 @@ "check_include_exclude", "compare_csv" ], + "conj": "and", "result": [ { "type": "vm_command_line", @@ -63,8 +64,18 @@ }, { "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv", - "dest": "output_gold.csv" + "multi": true, + "path": [ + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv", + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold2.csv", + 
"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold3.csv" + ], + "dest": [ + "output_gold.csv", + "output_gold2.csv", + "output_gold3.csv" + ], + "gives": [0, 1, 2] } ] }, diff --git a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json index 710ac31..59091fd 100644 --- a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json +++ b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json @@ -144,17 +144,46 @@ "os" ], "evaluator": { - "func": "compare_epub", - "result": { - "type": "vm_file", - "dest": "Pass Through.epub", - "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub" - }, - "expected": { - "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", - "dest": "Pass Through Gold.epub" - } + "func": [ + "compare_epub", + "compare_epub", + "compare_epub" + ], + "conj": "or", + "result": [ + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub" + }, + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/Pass_Through.epub" + }, + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/pass_through.epub" + } + ], + "expected": [ + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + }, + { + "type": "cloud_file", + "path": 
"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + } + ] }, "proxy": true } \ No newline at end of file From 97ed6f99b0039c76eb82d1210b299c7fef87d85b Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sat, 12 Jul 2025 20:28:55 +0000 Subject: [PATCH 3/5] Final review multi_apps fix the rest part --- desktop_env/evaluators/metrics/chrome.py | 24 +++++++++++++------ desktop_env/evaluators/metrics/gimp.py | 5 +++- .../a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json | 14 +++++++---- .../e8172110-ec08-421b-a6f5-842e6451911f.json | 16 +++++-------- .../f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json | 2 +- 5 files changed, 38 insertions(+), 23 deletions(-) diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index 632c53e..6c3811f 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -29,8 +29,8 @@ def is_expected_active_tab(active_tab_info: Dict[str, str], rule: Dict[str, Any] actual_url = active_tab_info.get('url', None) else: actual_url = active_tab_info - print("expected_url: {}".format(expected_url)) - print("actual_url: {}".format(actual_url)) + logger.info("expected_url: {}".format(expected_url)) + logger.info("actual_url: {}".format(actual_url)) return 1 if compare_urls(expected_url, actual_url) else 0 else: logger.error(f"Unknown type: {match_type}") @@ -76,23 +76,26 @@ def is_expected_url_pattern_match(result, rules) -> float: if type(result) == dict: result_url = result["url"] - print("result url: {}".format(result_url)) + logger.info("result url: {}".format(result_url)) else: result_url = result # expect_regex = 
re.compile(rules["expected"]) patterns = rules["expected"] - print("expected_regex: {}".format(patterns)) + logger.info("expected_regex: {}".format(patterns)) for pattern in patterns: match = re.search(pattern, result_url) - print(match) + logger.info("match: {}".format(match)) if not match: return 0. return 1. def is_expected_installed_extensions(installed_extensions, expected) -> float: - print("installed_extensions: ") - print(installed_extensions) + if not installed_extensions: + return 0. + + logger.info("installed_extensions: ") + logger.info(installed_extensions) expected_extensions = expected["expected"] # whether the expected extensions are installed @@ -109,6 +112,8 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f """ Checks if the expected tabs are open in Chrome. """ + if not open_tabs: + return 0. match_type = rule['type'] @@ -146,8 +151,10 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float: bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None) if liked_authors_folder: # Check if it contains the specified URLs + logger.info("'Liked Authors' folder exists") liked_authors_urls = [bookmark['url'] for bookmark in liked_authors_folder['children'] if bookmark['type'] == 'url'] + logger.info("Here is the 'Liked Authors' folder's urls: {}".format(liked_authors_urls)) urls = rule['urls'] @@ -168,6 +175,9 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float: def is_expected_search_query(active_tab_info: Dict[str, str], rules: Dict[str, Any]) -> float: + if not active_tab_info: + return 0. 
+ expected = rules['expect'] pattern = expected['pattern'] matched = re.search(pattern, active_tab_info['url']) diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py index 5dddd78..a6dcc29 100644 --- a/desktop_env/evaluators/metrics/gimp.py +++ b/desktop_env/evaluators/metrics/gimp.py @@ -396,7 +396,10 @@ def check_structure_sim_resized(src_path, tgt_path): # Check if the structure is similar structure_same = structure_check_by_ssim(img_src_resized, img_tgt) - return structure_same + if structure_same: + return 1. + else: + return 0. def check_contrast_increase_and_structure_sim(src_path, tgt_path): diff --git a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json index 37af93b..a612d38 100644 --- a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json +++ b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json @@ -83,21 +83,27 @@ "urls": [ [ "https://jimfan.me/", - "https://research.nvidia.com/person/linxi-jim-fan" + "https://research.nvidia.com/person/linxi-jim-fan", + "https://www.linkedin.com/in/drjimfan/" ], [ "https://research.nvidia.com/person/de-an-huang", - "https://ai.stanford.edu/~dahuang/" + "https://ai.stanford.edu/~dahuang/", + "https://www.linkedin.com/in/de-an-huang-38242a69" ], [ "https://yukezhu.me/", "https://www.cs.utexas.edu/people/faculty-researchers/yuke-zhu", "https://experts.utexas.edu/yuke_zhu", - "https://research.nvidia.com/person/yuke-zhu" + "https://research.nvidia.com/person/yuke-zhu", + "https://www.linkedin.com/in/yukez/" ], [ + "https://tensorlab.cms.caltech.edu/users/anima/", "http://tensorlab.cms.caltech.edu/users/anima/", - "https://www.eas.caltech.edu/people/anima" + "https://www.eas.caltech.edu/people/anima", + "https://en.wikipedia.org/wiki/Anima_Anandkumar", + "https://www.linkedin.com/in/anima-anandkumar/" ] ] } diff 
--git a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json index 8506dda..9f5f924 100644 --- a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json +++ b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json @@ -11,10 +11,6 @@ { "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character.png", "path": "/home/user/Desktop/character.png" - }, - { - "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", - "path": "/home/user/Desktop/character_no_background_gold.png" } ] } @@ -36,8 +32,8 @@ ], "evaluator": { "func": [ - "check_structure_sim_resized", - "check_structure_sim_resized" + "check_structure_sim", + "check_structure_sim" ], "result": [ { @@ -53,13 +49,13 @@ ], "expected": [ { - "type": "vm_file", - "path": "/home/user/Desktop/character_no_background_gold.png", + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", "dest": "character_no_background_gold.png" }, { - "type": "vm_file", - "path": "/home/user/Desktop/character_no_background_gold.png", + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", "dest": "character_no_background_gold.png" } ] diff --git a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json index 7dd4f83..2441c3e 100644 --- 
a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json +++ b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json @@ -65,7 +65,7 @@ "type": "rule", "rules": { "expect": { - "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)" + "pattern": "(?i)https?://(?:www\\.)?google\\.com/search\\?q=nereida(?:&|$|#).*" } } } From 38a30734a6659ee7f2d0710fd084bf6f61869808 Mon Sep 17 00:00:00 2001 From: Yuan Mengqi <100453613+yuanmengqi@users.noreply.github.com> Date: Sun, 13 Jul 2025 21:04:07 +0800 Subject: [PATCH 4/5] Improve code logic for password & resolution (#252) * fix chrome * fix: fix proxy setup * feat&fix: add proxy support in setup and remove hardcoded proxy from example * fix tasks * fix chrome finished * fix * clean chrome_fix code * clean chrome_fix code * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix multiapps * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix some multi_apps tasks * fix some multi_apps tasks * fix password&resolution * fix password&resolution * Improve code logic for password & resolution * edit * Merge branch 'main' into fix_chrome * fix chrome tasks --------- Co-authored-by: adlsdztony --- desktop_env/controllers/setup.py | 40 +++++++++++-------- desktop_env/desktop_env.py | 19 +++++++-- desktop_env/providers/aws/manager.py | 10 ++--- .../3299584d-8f11-4457-bf4c-ce98f7600250.json | 2 +- .../6766f2b8-8a72-417f-a9e5-56fcaa735837.json | 2 +- .../6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json | 2 +- .../b4f95342-463e-4179-8c3f-193cd7241fb2.json | 2 +- .../fc6d8143-9452-4171-9459-7f515143419a.json | 8 ++-- .../2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json | 2 +- .../716a6079-22da-47f1-ba73-c9d58f986a38.json | 2 +- .../9f3bb592-209d-43bc-bb47-d77d9df56504.json | 9 +---- .../a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a.json | 2 +- .../5812b315-e7bd-4265-b51f-863c02174c28.json | 2 +- .../e0df059f-28a6-4169-924f-b9623e7184cc.json | 2 +- mm_agents/openai_cua_agent.py | 27 ++++++++++--- 
run_multienv_openaicua.py | 22 +++++++++- 16 files changed, 98 insertions(+), 55 deletions(-) diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index 83f9212..fa4f4f8 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -27,12 +27,6 @@ import dotenv # Load environment variables from .env file dotenv.load_dotenv() -if os.environ.get("PROVIDER_NAME") == "aws": - os.environ["CLIENT_PASSWORD"] = os.environ.get("CLIENT_PASSWORD_AWS", "osworld-public-evaluation") -else: - os.environ["CLIENT_PASSWORD"] = os.environ.get("CLIENT_PASSWORD", "password") - -CLIENT_PASSWORD = os.environ["CLIENT_PASSWORD"] PROXY_CONFIG_FILE = os.getenv("PROXY_CONFIG_FILE", "evaluation_examples/settings/proxy/dataimpulse.json") # Default proxy config file @@ -45,7 +39,7 @@ init_proxy_pool(PROXY_CONFIG_FILE) # initialize the global proxy pool MAX_RETRIES = 20 class SetupController: - def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache"): + def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080): self.vm_ip: str = vm_ip self.server_port: int = server_port self.chromium_port: int = chromium_port @@ -54,6 +48,9 @@ class SetupController: self.http_server_setup_root: str = f"http://{vm_ip}:{server_port}/setup" self.cache_dir: str = cache_dir self.use_proxy: bool = False + self.client_password: str = client_password + self.screen_width: int = screen_width + self.screen_height: int = screen_height def reset_cache_dir(self, cache_dir: str): self.cache_dir = cache_dir @@ -304,22 +301,31 @@ class SetupController: terminates: bool = False nb_failings = 0 - def replace_screen_env_in_command(command_list): - width = int(os.environ.get("SCREEN_WIDTH", 1920)) - height = int(os.environ.get("SCREEN_HEIGHT", 1080)) + def 
replace_screen_env_in_command(command): + password = self.client_password + width = self.screen_width + height = self.screen_height width_half = str(width // 2) height_half = str(height // 2) new_command_list = [] - for item in command_list: - if isinstance(item, str): + new_command = "" + if isinstance(command, str): + new_command = command.replace("{CLIENT_PASSWORD}", password) + new_command = new_command.replace("{SCREEN_WIDTH_HALF}", width_half) + new_command = new_command.replace("{SCREEN_HEIGHT_HALF}", height_half) + new_command = new_command.replace("{SCREEN_WIDTH}", str(width)) + new_command = new_command.replace("{SCREEN_HEIGHT}", str(height)) + return new_command + else: + for item in command: + item = item.replace("{CLIENT_PASSWORD}", password) item = item.replace("{SCREEN_WIDTH_HALF}", width_half) item = item.replace("{SCREEN_HEIGHT_HALF}", height_half) item = item.replace("{SCREEN_WIDTH}", str(width)) item = item.replace("{SCREEN_HEIGHT}", str(height)) - new_command_list.append(item) - return new_command_list - if isinstance(command, list): - command = replace_screen_env_in_command(command) + new_command_list.append(item) + return new_command_list + command = replace_screen_env_in_command(command) payload = json.dumps({"command": command, "shell": shell}) headers = {"Content-Type": "application/json"} @@ -467,7 +473,7 @@ class SetupController: except requests.exceptions.RequestException as e: logger.error("An error occurred while trying to send the request: %s", e) - def _proxy_setup(self, client_password: str = CLIENT_PASSWORD): + def _proxy_setup(self, client_password: str = ""): """Setup system-wide proxy configuration using proxy pool Args: diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py index 1ade6fe7..39803b7 100644 --- a/desktop_env/desktop_env.py +++ b/desktop_env/desktop_env.py @@ -26,7 +26,7 @@ class DesktopEnv(gym.Env): """ def __init__( self, - provider_name: str = "vmware", + provider_name: str = "aws", region: str = 
None, path_to_vm: str = None, snapshot_name: str = "init_state", @@ -38,6 +38,7 @@ class DesktopEnv(gym.Env): require_terminal: bool = False, os_type: str = "Ubuntu", enable_proxy: bool = False, + client_password: str = "", ): """ Args: @@ -59,6 +60,16 @@ class DesktopEnv(gym.Env): self.region = region self.provider_name = provider_name self.enable_proxy = enable_proxy # Store proxy enablement setting + if client_password == "": + if self.provider_name == "aws": + self.client_password = "osworld-public-evaluation" + else: + self.client_password = "password" + else: + self.client_password = client_password + + self.screen_width = screen_size[0] + self.screen_height = screen_size[1] # Default self.server_port = 5000 @@ -88,7 +99,7 @@ class DesktopEnv(gym.Env): if provider_name in {"vmware", "virtualbox"} else path_to_vm else: - self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=region) + self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=region, screen_size=(self.screen_width, self.screen_height)) try: self.snapshot_name = snapshot_name self.cache_dir_base: str = cache_dir @@ -136,7 +147,7 @@ class DesktopEnv(gym.Env): self.vnc_port = int(vm_ip_ports[3]) self.vlc_port = int(vm_ip_ports[4]) self.controller = PythonController(vm_ip=self.vm_ip, server_port=self.server_port) - self.setup_controller = SetupController(vm_ip=self.vm_ip, server_port=self.server_port, chromium_port=self.chromium_port, vlc_port=self.vlc_port, cache_dir=self.cache_dir_base) + self.setup_controller = SetupController(vm_ip=self.vm_ip, server_port=self.server_port, chromium_port=self.chromium_port, vlc_port=self.vlc_port, cache_dir=self.cache_dir_base, client_password=self.client_password, screen_width=self.screen_width, screen_height=self.screen_height) def _revert_to_snapshot(self): # Revert to certain snapshot of the virtual machine, and refresh the path to vm and ip of vm @@ -197,7 +208,7 @@ class DesktopEnv(gym.Env): if task_config is not None: if 
task_config.get("proxy", False) and self.enable_proxy: # If using proxy and proxy is enabled, set up the proxy configuration - self.setup_controller._proxy_setup() + self.setup_controller._proxy_setup(self.client_password) self._set_task_info(task_config) self.setup_controller.reset_cache_dir(self.cache_dir) logger.info("Setting up environment...") diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 4b53e1f..2d80380 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -164,11 +164,11 @@ def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)): return instance_id -def _allocate_vm_with_proxy(region=DEFAULT_REGION, proxy_config_file=None): +def _allocate_vm_with_proxy(region=DEFAULT_REGION, proxy_config_file=None, screen_size=(1920, 1080)): """Allocate a VM with proxy configuration""" if not PROXY_SUPPORT_AVAILABLE: logger.warning("Proxy support not available, falling back to regular VM allocation") - return _allocate_vm(region) + return _allocate_vm(region, screen_size=screen_size) from desktop_env.providers.aws.provider_with_proxy import AWSProviderWithProxy @@ -268,11 +268,11 @@ class AWSVMManager(VMManager): def _list_free_vms(self, region=DEFAULT_REGION): pass - def get_vm_path(self, region=DEFAULT_REGION, **kwargs): + def get_vm_path(self, region=DEFAULT_REGION, screen_size=(1920, 1080), **kwargs): if self.proxy_config_file: logger.info("Allocating a new VM with proxy configuration in region: {}".format(region)) - new_vm_path = _allocate_vm_with_proxy(region, self.proxy_config_file) + new_vm_path = _allocate_vm_with_proxy(region, self.proxy_config_file, screen_size=screen_size) else: logger.info("Allocating a new VM in region: {}".format(region)) - new_vm_path = _allocate_vm(region) + new_vm_path = _allocate_vm(region, screen_size=screen_size) return new_vm_path \ No newline at end of file diff --git 
a/evaluation_examples/examples/chrome/3299584d-8f11-4457-bf4c-ce98f7600250.json b/evaluation_examples/examples/chrome/3299584d-8f11-4457-bf4c-ce98f7600250.json index 6b9c6bb..56e42b2 100644 --- a/evaluation_examples/examples/chrome/3299584d-8f11-4457-bf4c-ce98f7600250.json +++ b/evaluation_examples/examples/chrome/3299584d-8f11-4457-bf4c-ce98f7600250.json @@ -7,7 +7,7 @@ { "type": "execute", "parameters": { - "command": "echo password | sudo -S apt update -y && echo password | sudo -S apt install jq -y", + "command": "echo {CLIENT_PASSWORD} | sudo -S apt update -y && echo {CLIENT_PASSWORD} | sudo -S apt install jq -y", "shell": true } }, diff --git a/evaluation_examples/examples/chrome/6766f2b8-8a72-417f-a9e5-56fcaa735837.json b/evaluation_examples/examples/chrome/6766f2b8-8a72-417f-a9e5-56fcaa735837.json index a33fb4c..8c05157 100644 --- a/evaluation_examples/examples/chrome/6766f2b8-8a72-417f-a9e5-56fcaa735837.json +++ b/evaluation_examples/examples/chrome/6766f2b8-8a72-417f-a9e5-56fcaa735837.json @@ -18,7 +18,7 @@ { "type": "execute", "parameters": { - "command": "echo password | sudo -S apt-get update -y && echo password | sudo -S apt-get install unzip -y && unzip /home/user/Desktop/helloExtension.zip -d /home/user/Desktop/ && rm /home/user/Desktop/helloExtension.zip", + "command": "echo {CLIENT_PASSWORD} | sudo -S apt-get update -y && echo {CLIENT_PASSWORD} | sudo -S apt-get install unzip -y && unzip /home/user/Desktop/helloExtension.zip -d /home/user/Desktop/ && rm /home/user/Desktop/helloExtension.zip", "shell": true } }, diff --git a/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json b/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json index d63ec3f..dd6a5c9 100644 --- a/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json +++ b/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json @@ -74,5 +74,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at 
end of file diff --git a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json index b4a7214..7773484 100644 --- a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json +++ b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json @@ -1,7 +1,7 @@ { "id": "b4f95342-463e-4179-8c3f-193cd7241fb2", "snapshot": "chrome", - "instruction": "Find the next available date for Diamond.", + "instruction": "List as many of the next available dates for Diamond Campground as possible.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json index 985200a..7fea695 100644 --- a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json +++ b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json @@ -1,7 +1,7 @@ { "id": "fc6d8143-9452-4171-9459-7f515143419a", "snapshot": "chrome", - "instruction": "Find the status of tomorrow flights from New York airports to Columbus in Ohio.", + "instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.", "source": "test_task_0", "config": [ { @@ -65,12 +65,12 @@ "from": "tomorrow" }, "expected": { - "start": "NYC", - "end": "CMH", + "start": "JFK", + "end": "ORD", "time": "{DoW}, {Month} {Day0D}, {Year}" } } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json index beb9bb2..574d506 100644 --- a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json +++ b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json @@ 
-7,7 +7,7 @@ { "type": "command", "parameters": { - "command": "echo password | sudo -S apt-get update && echo password | sudo -S apt-get install sysstat", + "command": "echo {CLIENT_PASSWORD} | sudo -S apt-get update && echo {CLIENT_PASSWORD} | sudo -S apt-get install sysstat", "shell": "true" } } diff --git a/evaluation_examples/examples/multi_apps/716a6079-22da-47f1-ba73-c9d58f986a38.json b/evaluation_examples/examples/multi_apps/716a6079-22da-47f1-ba73-c9d58f986a38.json index 30738d3..02170d9 100644 --- a/evaluation_examples/examples/multi_apps/716a6079-22da-47f1-ba73-c9d58f986a38.json +++ b/evaluation_examples/examples/multi_apps/716a6079-22da-47f1-ba73-c9d58f986a38.json @@ -25,7 +25,7 @@ { "type": "command", "parameters": { - "command": "echo password | sudo -S apt install xsel && xsel -bc", + "command": "echo {CLIENT_PASSWORD} | sudo -S apt install xsel && xsel -bc", "shell": "true" } } diff --git a/evaluation_examples/examples/multi_apps/9f3bb592-209d-43bc-bb47-d77d9df56504.json b/evaluation_examples/examples/multi_apps/9f3bb592-209d-43bc-bb47-d77d9df56504.json index b1ad263..c1b5bad 100644 --- a/evaluation_examples/examples/multi_apps/9f3bb592-209d-43bc-bb47-d77d9df56504.json +++ b/evaluation_examples/examples/multi_apps/9f3bb592-209d-43bc-bb47-d77d9df56504.json @@ -61,14 +61,7 @@ { "type": "command", "parameters": { - "command": "echo password | sudo -S pip install pysrt", - "shell": "true" - } - }, - { - "type": "command", - "parameters": { - "command": "echo osworld-public-evaluation | sudo -S pip install pysrt", + "command": "echo {CLIENT_PASSWORD} | sudo -S pip install pysrt", "shell": "true" } } diff --git a/evaluation_examples/examples/multi_apps/a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a.json b/evaluation_examples/examples/multi_apps/a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a.json index 9536a92..dd77b9e 100644 --- a/evaluation_examples/examples/multi_apps/a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a.json +++ 
b/evaluation_examples/examples/multi_apps/a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a.json @@ -18,7 +18,7 @@ { "type": "execute", "parameters": { - "command": "echo password | sudo -S apt-get update -y && echo password | sudo -S apt-get install unzip -y && unzip /home/user/Desktop/helloExtension.zip -d /home/user/Desktop/ && rm /home/user/Desktop/helloExtension.zip", + "command": "echo {CLIENT_PASSWORD} | sudo -S apt-get update -y && echo {CLIENT_PASSWORD} | sudo -S apt-get install unzip -y && unzip /home/user/Desktop/helloExtension.zip -d /home/user/Desktop/ && rm /home/user/Desktop/helloExtension.zip", "shell": true } }, diff --git a/evaluation_examples/examples/os/5812b315-e7bd-4265-b51f-863c02174c28.json b/evaluation_examples/examples/os/5812b315-e7bd-4265-b51f-863c02174c28.json index 67ccb29..f2fe7bf 100644 --- a/evaluation_examples/examples/os/5812b315-e7bd-4265-b51f-863c02174c28.json +++ b/evaluation_examples/examples/os/5812b315-e7bd-4265-b51f-863c02174c28.json @@ -21,7 +21,7 @@ { "type": "execute", "parameters": { - "command": "echo 'password' | sudo -S apt-get install -y expect", + "command": "echo {CLIENT_PASSWORD} | sudo -S apt-get install -y expect", "shell": true } }, diff --git a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json index 94b1a4f..5df5998 100644 --- a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json +++ b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json @@ -8,7 +8,7 @@ { "type": "execute", "parameters": { - "command": "echo 'password' | sudo -S mkdir ~/Desktop/todo_list_Jan_1", + "command": "echo {CLIENT_PASSWORD} | sudo -S mkdir ~/Desktop/todo_list_Jan_1", "shell": true } }, diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index 064a2d8..f653a62 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -33,7 +33,7 @@ class_ns_windows = 
"https://accessibility.windows.example.org/ns/class" import ast from typing import Dict, Any, Optional, Union -OPERATOR_PROMPT = f"""\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{os.environ["CLIENT_PASSWORD"]}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. 
Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ +OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{CLIENT_PASSWORD}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. 
Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ class Action: """Action class for the agent.""" @@ -213,7 +213,11 @@ class OpenAICUAAgent: observation_type="screenshot_a11y_tree", # observation_type can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"] max_trajectory_length=100, - a11y_tree_max_tokens=10000 + a11y_tree_max_tokens=10000, + client_password="", + provider_name="aws", + screen_width=1920, + screen_height=1080 ): self.env = env self.platform = platform @@ -231,12 +235,22 @@ class OpenAICUAAgent: self.actions = [] self.observations = [] + self.screen_width = screen_width + self.screen_height = screen_height + self.tools = [{ "type": "computer_use_preview", - "display_width": int(os.environ["SCREEN_WIDTH"]), - "display_height": int(os.environ["SCREEN_HEIGHT"]), + "display_width": self.screen_width, + "display_height": self.screen_height, "environment": "linux" if platform == "ubuntu" else "windows" }] + if client_password == "": + if provider_name == "aws": + self.client_password = "osworld-public-evaluation" + else: + self.client_password = "password" + else: + self.client_password = client_password if observation_type == "screenshot": if action_space == "computer_13": @@ -630,7 +644,8 @@ class OpenAICUAAgent: """ Predict the next action(s) based on the current observation. 
""" - + prompt = OPERATOR_PROMPT.replace("{CLIENT_PASSWORD}", self.client_password) + base64_image = encode_image(obs["screenshot"]) if self.cua_messages == []: self.cua_messages.append({ @@ -642,7 +657,7 @@ class OpenAICUAAgent: }, { "type": "input_text", - "text": "\n " + instruction + OPERATOR_PROMPT, + "text": "\n " + instruction + prompt, } ] }) diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py index 6e9bca3..c4eb18c 100644 --- a/run_multienv_openaicua.py +++ b/run_multienv_openaicua.py @@ -78,6 +78,18 @@ def config() -> argparse.Namespace: parser.add_argument( "--region", type=str, default="us-east-1", help="AWS region for the VM" ) + parser.add_argument( + "--provider_name", type=str, default="aws", choices=["aws", "virtualbox", "vmware", "docker", "azure"], help="Provider name" + ) + parser.add_argument( + "--client_password", type=str, default="", help="Client password" + ) + parser.add_argument( + "--screen_width", type=int, default=1920, help="Screen width" + ) + parser.add_argument( + "--screen_height", type=int, default=1080, help="Screen height" + ) args = parser.parse_args() return args @@ -180,18 +192,20 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share from desktop_env.providers.aws.manager import IMAGE_ID_MAP REGION = args.region - screen_size = (int(os.environ["SCREEN_WIDTH"]), int(os.environ["SCREEN_HEIGHT"])) + screen_size = (args.screen_width, args.screen_height) ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) env = DesktopEnv( path_to_vm=args.path_to_vm, action_space=args.action_space, - provider_name=os.environ["PROVIDER_NAME"], + provider_name=args.provider_name, region=REGION, snapshot_name=ami_id, screen_size=screen_size, headless=args.headless, os_type="Ubuntu", require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], + enable_proxy=True, + client_password=args.client_password ) active_environments.append(env) agent = 
OpenAICUAAgent( @@ -203,6 +217,10 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share action_space=args.action_space, observation_type=args.observation_type, max_trajectory_length=args.max_trajectory_length, + client_password=args.client_password, + provider_name=args.provider_name, + screen_width=args.screen_width, + screen_height=args.screen_height ) logger.info(f"Executing tasks in environment {env_idx + 1}/{args.num_envs}") From 349f2fd9fed76a5b174853aff9e71994aa16a22a Mon Sep 17 00:00:00 2001 From: Zilong Zhou Date: Sun, 13 Jul 2025 21:10:49 +0800 Subject: [PATCH 5/5] Feat/claude cua support (#253) * feat: add claude support * feat: add script for end-to-end evaluation with logging and task distribution * feat&fix: add tool result handling and update model default in evaluation script * chore: remove run_test_env.py script * feat&fix: implement action parsing for tool calls and update default action space * fix: update text formatting in action parsing and replace logger import * feat&fix: implement action parsing for tool calls and add screen size handling * feat: add setup instructions for Anthropic API integration * feat: add notice about image size limitations for Anthropic API * Delete test_env/logger.py * Delete test_env/utils.py --- desktop_env/desktop_env.py | 13 +- mm_agents/anthropic/README.md | 18 + mm_agents/anthropic/__init__.py | 23 ++ mm_agents/anthropic/main.py | 442 ++++++++++++++++++++++++ mm_agents/anthropic/tools/__init__.py | 14 + mm_agents/anthropic/tools/base.py | 69 ++++ mm_agents/anthropic/tools/bash.py | 144 ++++++++ mm_agents/anthropic/tools/collection.py | 34 ++ mm_agents/anthropic/tools/computer.py | 260 ++++++++++++++ mm_agents/anthropic/tools/edit.py | 290 ++++++++++++++++ mm_agents/anthropic/tools/run.py | 42 +++ mm_agents/anthropic/utils.py | 246 +++++++++++++ run_multienv_claude.py | 384 ++++++++++++++++++++ 13 files changed, 1975 insertions(+), 4 deletions(-) create mode 100644 
mm_agents/anthropic/README.md create mode 100644 mm_agents/anthropic/__init__.py create mode 100644 mm_agents/anthropic/main.py create mode 100644 mm_agents/anthropic/tools/__init__.py create mode 100644 mm_agents/anthropic/tools/base.py create mode 100644 mm_agents/anthropic/tools/bash.py create mode 100644 mm_agents/anthropic/tools/collection.py create mode 100644 mm_agents/anthropic/tools/computer.py create mode 100644 mm_agents/anthropic/tools/edit.py create mode 100644 mm_agents/anthropic/tools/run.py create mode 100644 mm_agents/anthropic/utils.py create mode 100644 run_multienv_claude.py diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py index 39803b7..833e54e 100644 --- a/desktop_env/desktop_env.py +++ b/desktop_env/desktop_env.py @@ -19,6 +19,8 @@ Metric = Callable[[Any, Any], float] Getter = Callable[[gym.Env, Dict[str, Any]], Any] MAX_RETRIES = 5 # Maximum retries for environment setup + + class DesktopEnv(gym.Env): """ @@ -115,7 +117,7 @@ class DesktopEnv(gym.Env): # mode: human or machine self.instruction = None - assert action_space in ["computer_13", "pyautogui"] + assert action_space in ["computer_13", "pyautogui", "claude_computer_use"] self.action_space = action_space # todo: refactor it to the ActType # episodic stuffs, like counters, will be updated or reset @@ -318,7 +320,7 @@ class DesktopEnv(gym.Env): reward = 0 # todo: Define reward calculation for each example done = False # todo: Define episode termination condition for each example info = {} - + logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}") # handle the special actions if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action['action_type'] in ['WAIT', 'FAIL', 'DONE']): if action == 'WAIT': @@ -333,12 +335,15 @@ class DesktopEnv(gym.Env): if self.action_space == "computer_13": # the set of all possible actions defined in the action representation self.controller.execute_action(action) - elif self.action_space == 
"pyautogui": + elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use": if action in ['WAIT', 'FAIL', 'DONE']: self.controller.execute_action(action) else: # the set of all possible python commands insides `pyautogui` - self.controller.execute_python_command(action) + if type(action) == str: + self.controller.execute_python_command(action) + elif type(action) == dict: + self.controller.execute_python_command(action['command']) time.sleep(pause) observation = self._get_obs() diff --git a/mm_agents/anthropic/README.md b/mm_agents/anthropic/README.md new file mode 100644 index 0000000..e58ee9b --- /dev/null +++ b/mm_agents/anthropic/README.md @@ -0,0 +1,18 @@ +# Anthropic Agent Integration +> Notice: Because the Anthropic API only supports images whose long edge is less than 1568 pixels and whose size is less than ~1,600 tokens, we resize the screenshot to 1280x720. +## Setup +To run with the Anthropic API, you need to set up your environment with the necessary API keys and configurations. Follow these steps: +1. **Install Dependencies**: Ensure you have the required Python packages installed. You can do this by running: +```bash +pip install anthropic +``` +2. **Set Environment Variables**: You need to set the environment variable with your API key. 
You can do this in .env: +For AWS Bedrock: +```.env +AWS_ACCESS_KEY_ID=your_access_key_id +AWS_SECRET_ACCESS_KEY=your_secret_access_key +``` +For Anthropic, you need to set APIProvider to `anthropic` and set the API key: +```.env +ANTHROPIC_API_KEY=your_anthropic_api_key +``` \ No newline at end of file diff --git a/mm_agents/anthropic/__init__.py b/mm_agents/anthropic/__init__.py new file mode 100644 index 0000000..2fd2845 --- /dev/null +++ b/mm_agents/anthropic/__init__.py @@ -0,0 +1,23 @@ +""" +Anthropic agent implementation +""" + +from .main import AnthropicAgent +from .tools import ( + BashTool, + CLIResult, + ComputerTool, + EditTool, + ToolCollection, + ToolResult +) + +__all__ = [ + 'AnthropicAgent', + 'BashTool', + 'CLIResult', + 'ComputerTool', + 'EditTool', + 'ToolCollection', + 'ToolResult' +] \ No newline at end of file diff --git a/mm_agents/anthropic/main.py b/mm_agents/anthropic/main.py new file mode 100644 index 0000000..4cffc16 --- /dev/null +++ b/mm_agents/anthropic/main.py @@ -0,0 +1,442 @@ +import base64 +import os +import time +from typing import Any, cast, Optional, Dict +from PIL import Image +import io + +from anthropic import ( + Anthropic, + AnthropicBedrock, + AnthropicVertex, + APIError, + APIResponseValidationError, + APIStatusError, +) +from anthropic.types.beta import ( + BetaMessageParam, + BetaTextBlockParam, +) +from .utils import COMPUTER_USE_BETA_FLAG, PROMPT_CACHING_BETA_FLAG,SYSTEM_PROMPT, SYSTEM_PROMPT_WINDOWS, APIProvider, PROVIDER_TO_DEFAULT_MODEL_NAME +from .utils import _response_to_params, _inject_prompt_caching, _maybe_filter_to_n_most_recent_images + +import logging +logger = logging.getLogger("desktopenv.agent") + +class AnthropicAgent: + def __init__(self, + platform: str = "Ubuntu", + model: str = "claude-3-5-sonnet-20241022", + provider: APIProvider = APIProvider.BEDROCK, + max_tokens: int = 4096, + api_key: str = os.environ.get("ANTHROPIC_API_KEY", None), + system_prompt_suffix: str = "", + 
only_n_most_recent_images: Optional[int] = 10, + action_space: str = "claude_computer_use", + screen_size: tuple[int, int] = (1920, 1080), + *args, **kwargs + ): + self.platform = platform + self.action_space = action_space + self.logger = logger + self.class_name = self.__class__.__name__ + self.model_name = model + self.provider = provider + self.max_tokens = max_tokens + self.api_key = api_key + self.system_prompt_suffix = system_prompt_suffix + self.only_n_most_recent_images = only_n_most_recent_images + self.messages: list[BetaMessageParam] = [] + self.screen_size = screen_size + self.resize_factor = ( + screen_size[0] / 1280, # Assuming 1280 is the base width + screen_size[1] / 720 # Assuming 720 is the base height + ) + + def add_tool_result(self, tool_call_id: str, result: str, screenshot: bytes = None): + """Add tool result to message history""" + tool_result_content = [ + { + "type": "tool_result", + "tool_use_id": tool_call_id, + "content": [{"type": "text", "text": result}] + } + ] + + # Add screenshot if provided + if screenshot is not None: + screenshot_base64 = base64.b64encode(screenshot).decode('utf-8') + tool_result_content[0]["content"].append({ + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": screenshot_base64 + } + }) + + self.messages.append({ + "role": "user", + "content": tool_result_content + }) + + def parse_actions_from_tool_call(self, tool_call: Dict) -> str: + result = "" + function_args = ( + tool_call["input"] + ) + + action = function_args.get("action") + if not action: + action = tool_call.function.name + action_conversion = { + "left click": "click", + "right click": "right_click" + } + action = action_conversion.get(action, action) + + text = function_args.get("text") + coordinate = function_args.get("coordinate") + scroll_direction = function_args.get("scroll_direction") + scroll_amount = function_args.get("scroll_amount") + duration = function_args.get("duration") + + # resize 
coordinates if resize_factor is set + if coordinate and self.resize_factor: + coordinate = ( + int(coordinate[0] * self.resize_factor[0]), + int(coordinate[1] * self.resize_factor[1]) + ) + + # Handle mouse move and drag actions + if action in ("mouse_move", "left_click_drag"): + if coordinate is None: + raise ValueError(f"coordinate is required for {action}") + if text is not None: + raise ValueError(f"text is not accepted for {action}") + if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2: + raise ValueError(f"{coordinate} must be a tuple of length 2") + if not all(isinstance(i, int) for i in coordinate): + raise ValueError(f"{coordinate} must be a tuple of ints") + + x, y = coordinate[0], coordinate[1] + if action == "mouse_move": + result += ( + f"pyautogui.moveTo({x}, {y}, duration={duration or 0.5})\n" + ) + expected_outcome = f"Mouse moved to ({x},{y})." + elif action == "left_click_drag": + result += ( + f"pyautogui.dragTo({x}, {y}, duration={duration or 0.5})\n" + ) + expected_outcome = f"Cursor dragged to ({x},{y})." + + # Handle keyboard actions + elif action in ("key", "type"): + if text is None: + raise ValueError(f"text is required for {action}") + if coordinate is not None: + raise ValueError(f"coordinate is not accepted for {action}") + if not isinstance(text, str): + raise ValueError(f"{text} must be a string") + + if action == "key": + key_conversion = { + "page_down": "pagedown", + "page_up": "pageup", + "super_l": "win", + "super": "command", + "escape": "esc" + } + keys = text.split('+') + for key in keys: + key = key.strip().lower() + key = key_conversion.get(key, key) + result += (f"pyautogui.keyDown('{key}')\n") + for key in reversed(keys): + key = key.strip().lower() + key = key_conversion.get(key, key) + result += (f"pyautogui.keyUp('{key}')\n") + expected_outcome = f"Key {key} pressed." 
+ elif action == "type": + result += ( + f"pyautogui.typewrite(\"\"\"{text}\"\"\", interval=0.01)\n" + ) + expected_outcome = f"Text {text} written." + + # Handle scroll actions + elif action == "scroll": + if coordinate is None: + if scroll_direction in ("up", "down"): + result += ( + f"pyautogui.scroll({scroll_amount if scroll_direction == 'up' else -scroll_amount})\n" + ) + elif scroll_direction in ("left", "right"): + result += ( + f"pyautogui.hscroll({scroll_amount if scroll_direction == 'right' else -scroll_amount})\n" + ) + else: + if scroll_direction in ("up", "down"): + x, y = coordinate[0], coordinate[1] + result += ( + f"pyautogui.scroll({scroll_amount if scroll_direction == 'up' else -scroll_amount}, {x}, {y})\n" + ) + elif scroll_direction in ("left", "right"): + x, y = coordinate[0], coordinate[1] + result += ( + f"pyautogui.hscroll({scroll_amount if scroll_direction == 'right' else -scroll_amount}, {x}, {y})\n" + ) + expected_outcome = "Scroll action finished" + + # Handle click actions + elif action in ("left_click", "right_click", "double_click", "middle_click", "left_press"): + if coordinate is not None: + x, y = coordinate + if action == "left_click": + result += (f"pyautogui.click({x}, {y})\n") + elif action == "right_click": + result += (f"pyautogui.rightClick({x}, {y})\n") + elif action == "double_click": + result += (f"pyautogui.doubleClick({x}, {y})\n") + elif action == "middle_click": + result += (f"pyautogui.middleClick({x}, {y})\n") + elif action == "left_press": + result += (f"pyautogui.mouseDown({x}, {y})\n") + result += ("time.sleep(1)\n") + result += (f"pyautogui.mouseUp({x}, {y})\n") + else: + if action == "left_click": + result += ("pyautogui.click()\n") + elif action == "right_click": + result += ("pyautogui.rightClick()\n") + elif action == "double_click": + result += ("pyautogui.doubleClick()\n") + elif action == "middle_click": + result += ("pyautogui.middleClick()\n") + elif action == "left_press": + result += 
("pyautogui.mouseDown()\n") + result += ("time.sleep(1)\n") + result += ("pyautogui.mouseUp()\n") + expected_outcome = "Click action finished" + + elif action == "wait": + result += "pyautogui.sleep(0.5)\n" + expected_outcome = "Wait for 0.5 seconds" + elif action == "fail": + result += "FAIL" + expected_outcome = "Finished" + elif action == "done": + result += "DONE" + expected_outcome = "Finished" + elif action == "call_user": + result += "CALL_USER" + expected_outcome = "Call user" + elif action == "screenshot": + result += "pyautogui.sleep(0.1)\n" + expected_outcome = "Screenshot taken" + else: + raise ValueError(f"Invalid action: {action}") + + return result + + def predict(self, task_instruction: str, obs: Dict = None, system: Any = None): + system = BetaTextBlockParam( + type="text", + text=f"{SYSTEM_PROMPT_WINDOWS if self.platform == 'Windows' else SYSTEM_PROMPT}{' ' + self.system_prompt_suffix if self.system_prompt_suffix else ''}" + ) + + # resize screenshot if resize_factor is set + if obs and "screenshot" in obs: + # Convert bytes to PIL Image + screenshot_bytes = obs["screenshot"] + screenshot_image = Image.open(io.BytesIO(screenshot_bytes)) + + # Calculate new size based on resize factor + new_width, new_height = 1280, 720 + + # Resize the image + resized_image = screenshot_image.resize((new_width, new_height), Image.Resampling.LANCZOS) + + # Convert back to bytes + output_buffer = io.BytesIO() + resized_image.save(output_buffer, format='PNG') + obs["screenshot"] = output_buffer.getvalue() + + + if not self.messages: + + init_screenshot = obs + init_screenshot_base64 = base64.b64encode(init_screenshot["screenshot"]).decode('utf-8') + self.messages.append({ + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": init_screenshot_base64, + }, + }, + {"type": "text", "text": task_instruction}, + ] + }) + + if self.messages and "tool_use" in [content_block["type"] for content_block in 
self.messages[-1]["content"]]: + self.add_tool_result( + self.messages[-1]["content"][-1]["id"], + f"Success", + screenshot=obs.get("screenshot") if obs else None + ) + + enable_prompt_caching = False + betas = ["computer-use-2025-01-24"] + if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514": + betas = ["computer-use-2025-01-24"] + elif self.model_name == "claude-3-5-sonnet-20241022": + betas = [COMPUTER_USE_BETA_FLAG] + + image_truncation_threshold = 10 + if self.provider == APIProvider.ANTHROPIC: + client = Anthropic(api_key=self.api_key, max_retries=4) + enable_prompt_caching = True + elif self.provider == APIProvider.VERTEX: + client = AnthropicVertex() + elif self.provider == APIProvider.BEDROCK: + client = AnthropicBedrock( + # Authenticate by either providing the keys below or use the default AWS credential providers, such as + # using ~/.aws/credentials or the "AWS_SECRET_ACCESS_KEY" and "AWS_ACCESS_KEY_ID" environment variables. + aws_access_key=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + # aws_region changes the aws region to which the request is made. By default, we read AWS_REGION, + # and if that's not present, we default to us-east-1. Note that we do not read ~/.aws/config for the region. 
+ aws_region=os.getenv('AWS_DEFAULT_REGION'), + ) + + if enable_prompt_caching: + betas.append(PROMPT_CACHING_BETA_FLAG) + _inject_prompt_caching(self.messages) + image_truncation_threshold = 50 + system["cache_control"] = {"type": "ephemeral"} + + if self.only_n_most_recent_images: + _maybe_filter_to_n_most_recent_images( + self.messages, + self.only_n_most_recent_images, + min_removal_threshold=image_truncation_threshold, + ) + + try: + + if self.model_name == "claude-3-5-sonnet-20241022": + tools = [ + {'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1}, + # {'type': 'bash_20241022', 'name': 'bash'}, + # {'name': 'str_replace_editor', 'type': 'text_editor_20241022'} + ] if self.platform == 'Ubuntu' else [ + {'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1}, + ] + elif self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514": + tools = [ + {'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1}, + # {'type': 'bash_20250124', 'name': 'bash'}, + # {'name': 'str_replace_editor', 'type': 'text_editor_20250124'} + ] if self.platform == 'Ubuntu' else [ + {'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1}, + ] + extra_body = { + "thinking": {"type": "enabled", "budget_tokens": 1024} + } + response = None + if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514": + response = client.beta.messages.create( + max_tokens=self.max_tokens, + messages=self.messages, + model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name], + system=[system], + tools=tools, + betas=betas, + extra_body=extra_body + ) + elif 
self.model_name == "claude-3-5-sonnet-20241022": + response = client.beta.messages.create( + max_tokens=self.max_tokens, + messages=self.messages, + model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name], + system=[system], + tools=tools, + betas=betas, + ) + + except (APIError, APIStatusError, APIResponseValidationError) as e: + self.logger.exception(f"Anthropic API error: {str(e)}") + try: + self.logger.warning("Retrying with backup API key...") + backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4) + + if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514": + response = backup_client.beta.messages.create( + max_tokens=self.max_tokens, + messages=self.messages, + model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name], + system=[system], + tools=tools, + betas=betas, + extra_body=extra_body + ) + elif self.model_name == "claude-3-5-sonnet-20241022": + response = backup_client.beta.messages.create( + max_tokens=self.max_tokens, + messages=self.messages, + model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name], + system=[system], + tools=tools, + betas=betas, + ) + self.logger.info("Successfully used backup API key") + except Exception as backup_e: + self.logger.exception(f"Backup API call also failed: {str(backup_e)}") + return None, None + + except Exception as e: + self.logger.exception(f"Error in Anthropic API: {str(e)}") + return None, None + + response_params = _response_to_params(response) + logger.info(f"Received response params: {response_params}") + + # Store response in message history + self.messages.append({ + "role": "assistant", + "content": response_params + }) + + actions: list[Any] = [] + reasonings: list[str] = [] + for content_block in response_params: + if content_block["type"] == "tool_use": + actions.append({ + "name": content_block["name"], + "input": 
cast(dict[str, Any], content_block["input"]), + "id": content_block["id"], + "action_type": content_block.get("type"), + "command": self.parse_actions_from_tool_call(content_block) + }) + elif content_block["type"] == "text": + reasonings.append(content_block["text"]) + if isinstance(reasonings, list) and len(reasonings) > 0: + reasonings = reasonings[0] + else: + reasonings = "" + logger.info(f"Received actions: {actions}") + logger.info(f"Received reasonings: {reasonings}") + if len(actions) == 0: + actions = ["DONE"] + return reasonings, actions + + def reset(self, *args, **kwargs): + """ + Reset the agent's state. + """ + self.messages = [] + self.logger.info(f"{self.class_name} reset.") \ No newline at end of file diff --git a/mm_agents/anthropic/tools/__init__.py b/mm_agents/anthropic/tools/__init__.py new file mode 100644 index 0000000..3e9210e --- /dev/null +++ b/mm_agents/anthropic/tools/__init__.py @@ -0,0 +1,14 @@ +from .base import CLIResult, ToolResult +from .bash import BashTool +from .collection import ToolCollection +from .computer import ComputerTool +from .edit import EditTool + +__ALL__ = [ + BashTool, + CLIResult, + ComputerTool, + EditTool, + ToolCollection, + ToolResult, +] \ No newline at end of file diff --git a/mm_agents/anthropic/tools/base.py b/mm_agents/anthropic/tools/base.py new file mode 100644 index 0000000..4d6690f --- /dev/null +++ b/mm_agents/anthropic/tools/base.py @@ -0,0 +1,69 @@ +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass, fields, replace +from typing import Any, Optional + +from anthropic.types.beta import BetaToolUnionParam + + +class BaseAnthropicTool(metaclass=ABCMeta): + """Abstract base class for Anthropic-defined tools.""" + + @abstractmethod + def __call__(self, **kwargs) -> Any: + """Executes the tool with the given arguments.""" + ... 
+ + @abstractmethod + def to_params( + self, + ) -> BetaToolUnionParam: + raise NotImplementedError + + +@dataclass(frozen=True) #kw_only=True, +class ToolResult: + """Represents the result of a tool execution.""" + + output: Optional[str] = None + error: Optional[str] = None + base64_image: Optional[str] = None + system: Optional[str] = None + + def __bool__(self): + return any(getattr(self, field.name) for field in fields(self)) + + def __add__(self, other: "ToolResult"): + def combine_fields( + field: Optional[str], other_field: Optional[str], concatenate: bool = True + ): + if field and other_field: + if concatenate: + return field + other_field + raise ValueError("Cannot combine tool results") + return field or other_field + + return ToolResult( + output=combine_fields(self.output, other.output), + error=combine_fields(self.error, other.error), + base64_image=combine_fields(self.base64_image, other.base64_image, False), + system=combine_fields(self.system, other.system), + ) + + def replace(self, **kwargs): + """Returns a new ToolResult with the given fields replaced.""" + return replace(self, **kwargs) + + +class CLIResult(ToolResult): + """A ToolResult that can be rendered as a CLI output.""" + + +class ToolFailure(ToolResult): + """A ToolResult that represents a failure.""" + + +class ToolError(Exception): + """Raised when a tool encounters an error.""" + + def __init__(self, message): + self.message = message \ No newline at end of file diff --git a/mm_agents/anthropic/tools/bash.py b/mm_agents/anthropic/tools/bash.py new file mode 100644 index 0000000..bbd92e3 --- /dev/null +++ b/mm_agents/anthropic/tools/bash.py @@ -0,0 +1,144 @@ +import asyncio +import os +from typing import ClassVar, Literal, Optional + +from anthropic.types.beta import BetaToolBash20241022Param + +from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult + + +class _BashSession: + """A session of a bash shell.""" + + _started: bool + _process: asyncio.subprocess.Process + 
+ command: str = "/bin/bash" + _output_delay: float = 0.2 # seconds + _timeout: float = 120.0 # seconds + _sentinel: str = "<>" + + def __init__(self): + self._started = False + self._timed_out = False + + async def start(self): + if self._started: + return + + self._process = await asyncio.create_subprocess_shell( + self.command, + preexec_fn=os.setsid, + shell=True, + bufsize=0, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + self._started = True + + def stop(self): + """Terminate the bash shell.""" + if not self._started: + raise ToolError("Session has not started.") + if self._process.returncode is not None: + return + self._process.terminate() + + async def run(self, command: str): + """Execute a command in the bash shell.""" + if not self._started: + raise ToolError("Session has not started.") + if self._process.returncode is not None: + return ToolResult( + system="tool must be restarted", + error=f"bash has exited with returncode {self._process.returncode}", + ) + if self._timed_out: + raise ToolError( + f"timed out: bash has not returned in {self._timeout} seconds and must be restarted", + ) + + # we know these are not None because we created the process with PIPEs + assert self._process.stdin + assert self._process.stdout + assert self._process.stderr + + # send command to the process + self._process.stdin.write( + command.encode() + f"; echo '{self._sentinel}'\n".encode() + ) + await self._process.stdin.drain() + + # read output from the process, until the sentinel is found + try: + async with asyncio.timeout(self._timeout): + while True: + await asyncio.sleep(self._output_delay) + # if we read directly from stdout/stderr, it will wait forever for + # EOF. use the StreamReader buffer directly instead. 
+ output = self._process.stdout._buffer.decode() # pyright: ignore[reportAttributeAccessIssue] + if self._sentinel in output: + # strip the sentinel and break + output = output[: output.index(self._sentinel)] + break + except asyncio.TimeoutError: + self._timed_out = True + raise ToolError( + f"timed out: bash has not returned in {self._timeout} seconds and must be restarted", + ) from None + + if output.endswith("\n"): + output = output[:-1] + + error = self._process.stderr._buffer.decode() # pyright: ignore[reportAttributeAccessIssue] + if error.endswith("\n"): + error = error[:-1] + + # clear the buffers so that the next output can be read correctly + self._process.stdout._buffer.clear() # pyright: ignore[reportAttributeAccessIssue] + self._process.stderr._buffer.clear() # pyright: ignore[reportAttributeAccessIssue] + + return CLIResult(output=output, error=error) + + +class BashTool(BaseAnthropicTool): + """ + A tool that allows the agent to run bash commands. + The tool parameters are defined by Anthropic and are not editable. 
+ """ + + _session: Optional[_BashSession] + name: ClassVar[Literal["bash"]] = "bash" + api_type: ClassVar[Literal["bash_20241022"]] = "bash_20241022" + + def __init__(self): + self._session = None + super().__init__() + + async def __call__( + self, command: Optional[str] = None, restart: bool = False, **kwargs + ): + if restart: + if self._session: + self._session.stop() + self._session = _BashSession() + await self._session.start() + + return ToolResult(system="tool has been restarted.") + + if self._session is None: + self._session = _BashSession() + await self._session.start() + + if command is not None: + return await self._session.run(command) + + raise ToolError("no command provided.") + + def to_params(self) -> BetaToolBash20241022Param: + return { + "type": self.api_type, + "name": self.name, + } \ No newline at end of file diff --git a/mm_agents/anthropic/tools/collection.py b/mm_agents/anthropic/tools/collection.py new file mode 100644 index 0000000..7b9e0dc --- /dev/null +++ b/mm_agents/anthropic/tools/collection.py @@ -0,0 +1,34 @@ +"""Collection classes for managing multiple tools.""" + +from typing import Any + +from anthropic.types.beta import BetaToolUnionParam + +from .base import ( + BaseAnthropicTool, + ToolError, + ToolFailure, + ToolResult, +) + + +class ToolCollection: + """A collection of anthropic-defined tools.""" + + def __init__(self, *tools: BaseAnthropicTool): + self.tools = tools + self.tool_map = {tool.to_params()["name"]: tool for tool in tools} + + def to_params( + self, + ) -> list[BetaToolUnionParam]: + return [tool.to_params() for tool in self.tools] + + async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult: + tool = self.tool_map.get(name) + if not tool: + return ToolFailure(error=f"Tool {name} is invalid") + try: + return await tool(**tool_input) + except ToolError as e: + return ToolFailure(error=e.message) \ No newline at end of file diff --git a/mm_agents/anthropic/tools/computer.py 
b/mm_agents/anthropic/tools/computer.py new file mode 100644 index 0000000..fa9d2dc --- /dev/null +++ b/mm_agents/anthropic/tools/computer.py @@ -0,0 +1,260 @@ +import asyncio +import base64 +import os +import shlex +import shutil +from enum import Enum +from pathlib import Path +from typing import Literal, TypedDict, Optional, Tuple +from uuid import uuid4 + +from anthropic.types.beta import BetaToolComputerUse20241022Param + +from .base import BaseAnthropicTool, ToolError, ToolResult +from .run import run + +OUTPUT_DIR = "/tmp/outputs" + +TYPING_DELAY_MS = 12 +TYPING_GROUP_SIZE = 50 + +Action = Literal[ + "key", + "type", + "mouse_move", + "left_click", + "left_click_drag", + "right_click", + "middle_click", + "double_click", + "screenshot", + "cursor_position", +] + + +class Resolution(TypedDict): + width: int + height: int + + +# sizes above XGA/WXGA are not recommended (see README.md) +# scale down to one of these targets if ComputerTool._scaling_enabled is set +MAX_SCALING_TARGETS: dict[str, Resolution] = { + "XGA": Resolution(width=1024, height=768), # 4:3 + "WXGA": Resolution(width=1280, height=800), # 16:10 + "FWXGA": Resolution(width=1366, height=768), # ~16:9 +} + + +class ScalingSource(Enum): + COMPUTER = "computer" + API = "api" + + +class ComputerToolOptions(TypedDict): + display_height_px: int + display_width_px: int + display_number: Optional[int] + + +def chunks(s: str, chunk_size: int) -> list[str]: + return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)] + + +class ComputerTool(BaseAnthropicTool): + """ + A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer. + The tool parameters are defined by Anthropic and are not editable. 
+ """ + + name: Literal["computer"] = "computer" + api_type: Literal["computer_20241022"] = "computer_20241022" + width: int + height: int + display_num: Optional[int] + + _screenshot_delay = 2.0 + _scaling_enabled = True + + @property + def options(self) -> ComputerToolOptions: + width, height = self.scale_coordinates( + ScalingSource.COMPUTER, self.width, self.height + ) + return { + "display_width_px": width, + "display_height_px": height, + "display_number": self.display_num, + } + + def to_params(self) -> BetaToolComputerUse20241022Param: + return {"name": self.name, "type": self.api_type, **self.options} + + def __init__(self): + super().__init__() + + self.width = int(os.getenv("WIDTH") or 0) + self.height = int(os.getenv("HEIGHT") or 0) + assert self.width and self.height, "WIDTH, HEIGHT must be set" + if (display_num := os.getenv("DISPLAY_NUM")) is not None: + self.display_num = int(display_num) + self._display_prefix = f"DISPLAY=:{self.display_num} " + else: + self.display_num = None + self._display_prefix = "" + + self.xdotool = f"{self._display_prefix}xdotool" + + async def __call__( + self, + *, + action: Action, + text: Optional[str] = None, + coordinate: Optional[Tuple[int, int]] = None, + **kwargs, + ): + if action in ("mouse_move", "left_click_drag"): + if coordinate is None: + raise ToolError(f"coordinate is required for {action}") + if text is not None: + raise ToolError(f"text is not accepted for {action}") + if not isinstance(coordinate, list) or len(coordinate) != 2: + raise ToolError(f"{coordinate} must be a tuple of length 2") + if not all(isinstance(i, int) and i >= 0 for i in coordinate): + raise ToolError(f"{coordinate} must be a tuple of non-negative ints") + + x, y = self.scale_coordinates( + ScalingSource.API, coordinate[0], coordinate[1] + ) + + if action == "mouse_move": + return await self.shell(f"{self.xdotool} mousemove --sync {x} {y}") + elif action == "left_click_drag": + return await self.shell( + f"{self.xdotool} mousedown 1 
mousemove --sync {x} {y} mouseup 1" + ) + + if action in ("key", "type"): + if text is None: + raise ToolError(f"text is required for {action}") + if coordinate is not None: + raise ToolError(f"coordinate is not accepted for {action}") + if not isinstance(text, str): + raise ToolError(output=f"{text} must be a string") + + if action == "key": + return await self.shell(f"{self.xdotool} key -- {text}") + elif action == "type": + results: list[ToolResult] = [] + for chunk in chunks(text, TYPING_GROUP_SIZE): + cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}" + results.append(await self.shell(cmd, take_screenshot=False)) + screenshot_base64 = (await self.screenshot()).base64_image + return ToolResult( + output="".join(result.output or "" for result in results), + error="".join(result.error or "" for result in results), + base64_image=screenshot_base64, + ) + + if action in ( + "left_click", + "right_click", + "double_click", + "middle_click", + "screenshot", + "cursor_position", + ): + if text is not None: + raise ToolError(f"text is not accepted for {action}") + if coordinate is not None: + raise ToolError(f"coordinate is not accepted for {action}") + + if action == "screenshot": + return await self.screenshot() + elif action == "cursor_position": + result = await self.shell( + f"{self.xdotool} getmouselocation --shell", + take_screenshot=False, + ) + output = result.output or "" + x, y = self.scale_coordinates( + ScalingSource.COMPUTER, + int(output.split("X=")[1].split("\n")[0]), + int(output.split("Y=")[1].split("\n")[0]), + ) + return result.replace(output=f"X={x},Y={y}") + else: + click_arg = { + "left_click": "1", + "right_click": "3", + "middle_click": "2", + "double_click": "--repeat 2 --delay 500 1", + }[action] + return await self.shell(f"{self.xdotool} click {click_arg}") + + raise ToolError(f"Invalid action: {action}") + + async def screenshot(self): + """Take a screenshot of the current screen and return the base64 encoded 
image.""" + output_dir = Path(OUTPUT_DIR) + output_dir.mkdir(parents=True, exist_ok=True) + path = output_dir / f"screenshot_{uuid4().hex}.png" + + # Try gnome-screenshot first + if shutil.which("gnome-screenshot"): + screenshot_cmd = f"{self._display_prefix}gnome-screenshot -f {path} -p" + else: + # Fall back to scrot if gnome-screenshot isn't available + screenshot_cmd = f"{self._display_prefix}scrot -p {path}" + + result = await self.shell(screenshot_cmd, take_screenshot=False) + if self._scaling_enabled: + x, y = self.scale_coordinates( + ScalingSource.COMPUTER, self.width, self.height + ) + await self.shell( + f"convert {path} -resize {x}x{y}! {path}", take_screenshot=False + ) + + if path.exists(): + return result.replace( + base64_image=base64.b64encode(path.read_bytes()).decode() + ) + raise ToolError(f"Failed to take screenshot: {result.error}") + + async def shell(self, command: str, take_screenshot=True) -> ToolResult: + """Run a shell command and return the output, error, and optionally a screenshot.""" + _, stdout, stderr = await run(command) + base64_image = None + + if take_screenshot: + # delay to let things settle before taking a screenshot + await asyncio.sleep(self._screenshot_delay) + base64_image = (await self.screenshot()).base64_image + + return ToolResult(output=stdout, error=stderr, base64_image=base64_image) + + def scale_coordinates(self, source: ScalingSource, x: int, y: int): + """Scale coordinates to a target maximum resolution.""" + if not self._scaling_enabled: + return x, y + ratio = self.width / self.height + target_dimension = None + for dimension in MAX_SCALING_TARGETS.values(): + # allow some error in the aspect ratio - not ratios are exactly 16:9 + if abs(dimension["width"] / dimension["height"] - ratio) < 0.02: + if dimension["width"] < self.width: + target_dimension = dimension + break + if target_dimension is None: + return x, y + # should be less than 1 + x_scaling_factor = target_dimension["width"] / self.width + 
y_scaling_factor = target_dimension["height"] / self.height + if source == ScalingSource.API: + if x > self.width or y > self.height: + raise ToolError(f"Coordinates {x}, {y} are out of bounds") + # scale up + return round(x / x_scaling_factor), round(y / y_scaling_factor) + # scale down + return round(x * x_scaling_factor), round(y * y_scaling_factor) \ No newline at end of file diff --git a/mm_agents/anthropic/tools/edit.py b/mm_agents/anthropic/tools/edit.py new file mode 100644 index 0000000..2e1cf6a --- /dev/null +++ b/mm_agents/anthropic/tools/edit.py @@ -0,0 +1,290 @@ +from collections import defaultdict +from pathlib import Path +from typing import Literal, get_args, Optional, List + +from anthropic.types.beta import BetaToolTextEditor20241022Param + +from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult +from .run import maybe_truncate, run + +Command = Literal[ + "view", + "create", + "str_replace", + "insert", + "undo_edit", +] +SNIPPET_LINES: int = 4 + + +class EditTool(BaseAnthropicTool): + """ + An filesystem editor tool that allows the agent to view, create, and edit files. + The tool parameters are defined by Anthropic and are not editable. 
+ """ + + api_type: Literal["text_editor_20241022"] = "text_editor_20241022" + name: Literal["str_replace_editor"] = "str_replace_editor" + + _file_history: dict[Path, list[str]] + + def __init__(self): + self._file_history = defaultdict(list) + super().__init__() + + def to_params(self) -> BetaToolTextEditor20241022Param: + return { + "name": self.name, + "type": self.api_type, + } + + async def __call__( + self, + *, + command: Command, + path: str, + file_text: Optional[str] = None, + view_range: Optional[list[int]] = None, + old_str: Optional[str] = None, + new_str: Optional[str] = None, + insert_line: Optional[int] = None, + **kwargs, + ): + _path = Path(path) + self.validate_path(command, _path) + if command == "view": + return await self.view(_path, view_range) + elif command == "create": + if file_text is None: + raise ToolError("Parameter `file_text` is required for command: create") + self.write_file(_path, file_text) + self._file_history[_path].append(file_text) + return ToolResult(output=f"File created successfully at: {_path}") + elif command == "str_replace": + if old_str is None: + raise ToolError( + "Parameter `old_str` is required for command: str_replace" + ) + return self.str_replace(_path, old_str, new_str) + elif command == "insert": + if insert_line is None: + raise ToolError( + "Parameter `insert_line` is required for command: insert" + ) + if new_str is None: + raise ToolError("Parameter `new_str` is required for command: insert") + return self.insert(_path, insert_line, new_str) + elif command == "undo_edit": + return self.undo_edit(_path) + raise ToolError( + f'Unrecognized command {command}. The allowed commands for the {self.name} tool are: {", ".join(get_args(Command))}' + ) + + def validate_path(self, command: str, path: Path): + """ + Check that the path/command combination is valid. 
+ """ + # Check if its an absolute path + if not path.is_absolute(): + suggested_path = Path("") / path + raise ToolError( + f"The path {path} is not an absolute path, it should start with `/`. Maybe you meant {suggested_path}?" + ) + # Check if path exists + if not path.exists() and command != "create": + raise ToolError( + f"The path {path} does not exist. Please provide a valid path." + ) + if path.exists() and command == "create": + raise ToolError( + f"File already exists at: {path}. Cannot overwrite files using command `create`." + ) + # Check if the path points to a directory + if path.is_dir(): + if command != "view": + raise ToolError( + f"The path {path} is a directory and only the `view` command can be used on directories" + ) + + async def view(self, path: Path, view_range: Optional[List[int]] = None): + """Implement the view command""" + if path.is_dir(): + if view_range: + raise ToolError( + "The `view_range` parameter is not allowed when `path` points to a directory." + ) + + _, stdout, stderr = await run( + rf"find {path} -maxdepth 2 -not -path '*/\.*'" + ) + if not stderr: + stdout = f"Here's the files and directories up to 2 levels deep in {path}, excluding hidden items:\n{stdout}\n" + return CLIResult(output=stdout, error=stderr) + + file_content = self.read_file(path) + init_line = 1 + if view_range: + if len(view_range) != 2 or not all(isinstance(i, int) for i in view_range): + raise ToolError( + "Invalid `view_range`. It should be a list of two integers." + ) + file_lines = file_content.split("\n") + n_lines_file = len(file_lines) + init_line, final_line = view_range + if init_line < 1 or init_line > n_lines_file: + raise ToolError( + f"Invalid `view_range`: {view_range}. Its first element `{init_line}` should be within the range of lines of the file: {[1, n_lines_file]}" + ) + if final_line > n_lines_file: + raise ToolError( + f"Invalid `view_range`: {view_range}. 
Its second element `{final_line}` should be smaller than the number of lines in the file: `{n_lines_file}`" + ) + if final_line != -1 and final_line < init_line: + raise ToolError( + f"Invalid `view_range`: {view_range}. Its second element `{final_line}` should be larger or equal than its first `{init_line}`" + ) + + if final_line == -1: + file_content = "\n".join(file_lines[init_line - 1 :]) + else: + file_content = "\n".join(file_lines[init_line - 1 : final_line]) + + return CLIResult( + output=self._make_output(file_content, str(path), init_line=init_line) + ) + + def str_replace(self, path: Path, old_str: str, new_str: Optional[str]): + """Implement the str_replace command, which replaces old_str with new_str in the file content""" + # Read the file content + file_content = self.read_file(path).expandtabs() + old_str = old_str.expandtabs() + new_str = new_str.expandtabs() if new_str is not None else "" + + # Check if old_str is unique in the file + occurrences = file_content.count(old_str) + if occurrences == 0: + raise ToolError( + f"No replacement was performed, old_str `{old_str}` did not appear verbatim in {path}." + ) + elif occurrences > 1: + file_content_lines = file_content.split("\n") + lines = [ + idx + 1 + for idx, line in enumerate(file_content_lines) + if old_str in line + ] + raise ToolError( + f"No replacement was performed. Multiple occurrences of old_str `{old_str}` in lines {lines}. 
Please ensure it is unique" + ) + + # Replace old_str with new_str + new_file_content = file_content.replace(old_str, new_str) + + # Write the new content to the file + self.write_file(path, new_file_content) + + # Save the content to history + self._file_history[path].append(file_content) + + # Create a snippet of the edited section + replacement_line = file_content.split(old_str)[0].count("\n") + start_line = max(0, replacement_line - SNIPPET_LINES) + end_line = replacement_line + SNIPPET_LINES + new_str.count("\n") + snippet = "\n".join(new_file_content.split("\n")[start_line : end_line + 1]) + + # Prepare the success message + success_msg = f"The file {path} has been edited. " + success_msg += self._make_output( + snippet, f"a snippet of {path}", start_line + 1 + ) + success_msg += "Review the changes and make sure they are as expected. Edit the file again if necessary." + + return CLIResult(output=success_msg) + + def insert(self, path: Path, insert_line: int, new_str: str): + """Implement the insert command, which inserts new_str at the specified line in the file content.""" + file_text = self.read_file(path).expandtabs() + new_str = new_str.expandtabs() + file_text_lines = file_text.split("\n") + n_lines_file = len(file_text_lines) + + if insert_line < 0 or insert_line > n_lines_file: + raise ToolError( + f"Invalid `insert_line` parameter: {insert_line}. 
It should be within the range of lines of the file: {[0, n_lines_file]}" + ) + + new_str_lines = new_str.split("\n") + new_file_text_lines = ( + file_text_lines[:insert_line] + + new_str_lines + + file_text_lines[insert_line:] + ) + snippet_lines = ( + file_text_lines[max(0, insert_line - SNIPPET_LINES) : insert_line] + + new_str_lines + + file_text_lines[insert_line : insert_line + SNIPPET_LINES] + ) + + new_file_text = "\n".join(new_file_text_lines) + snippet = "\n".join(snippet_lines) + + self.write_file(path, new_file_text) + self._file_history[path].append(file_text) + + success_msg = f"The file {path} has been edited. " + success_msg += self._make_output( + snippet, + "a snippet of the edited file", + max(1, insert_line - SNIPPET_LINES + 1), + ) + success_msg += "Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc). Edit the file again if necessary." + return CLIResult(output=success_msg) + + def undo_edit(self, path: Path): + """Implement the undo_edit command.""" + if not self._file_history[path]: + raise ToolError(f"No edit history found for {path}.") + + old_text = self._file_history[path].pop() + self.write_file(path, old_text) + + return CLIResult( + output=f"Last edit to {path} undone successfully. 
def _make_output(
    self,
    file_content: str,
    file_descriptor: str,
    init_line: int = 1,
    expand_tabs: bool = True,
):
    """Render ``file_content`` as a ``cat -n`` style numbered listing.

    The text is first passed through ``maybe_truncate``; tabs are then
    expanded unless ``expand_tabs`` is false.

    Args:
        file_content: Text to render.
        file_descriptor: Label interpolated into the header line.
        init_line: Number given to the first rendered line.
        expand_tabs: Whether to expand tabs before numbering.

    Returns:
        The header line followed by the numbered lines and a trailing newline.
    """
    text = maybe_truncate(file_content)
    if expand_tabs:
        text = text.expandtabs()

    numbered_lines = []
    for line_number, line in enumerate(text.split("\n"), start=init_line):
        # Line number right-aligned in six columns, then a tab, then the text.
        numbered_lines.append(f"{line_number:6}\t{line}")

    header = f"Here's the result of running `cat -n` on {file_descriptor}:\n"
    return header + "\n".join(numbered_lines) + "\n"
async def run(
    cmd: str,
    timeout: Optional[float] = 120.0,  # seconds
    truncate_after: Optional[int] = MAX_RESPONSE_LEN,
):
    """Run a shell command asynchronously with a timeout.

    Args:
        cmd: Command line handed to the shell.
        timeout: Seconds to wait for the command to finish; ``None`` waits
            indefinitely.
        truncate_after: Maximum length of each captured stream before
            ``maybe_truncate`` shortens it.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple. A ``None`` return code is
        reported as ``0``; both streams are decoded to ``str``.

    Raises:
        TimeoutError: If the command does not finish within ``timeout``
            seconds. The process is sent a kill signal first.
    """
    process = await asyncio.create_subprocess_shell(
        cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )

    try:
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
    except asyncio.TimeoutError as exc:
        try:
            process.kill()
        except ProcessLookupError:
            # The process exited between the timeout firing and the kill.
            pass
        raise TimeoutError(
            f"Command '{cmd}' timed out after {timeout} seconds"
        ) from exc

    # Commands may emit arbitrary bytes (binary files, other encodings).
    # Replace undecodable sequences instead of failing with
    # UnicodeDecodeError after the command has already completed.
    return (
        process.returncode or 0,
        maybe_truncate(stdout.decode(errors="replace"), truncate_after=truncate_after),
        maybe_truncate(stderr.decode(errors="replace"), truncate_after=truncate_after),
    )
class APIProvider(Enum):
    """API backends a model name can be resolved for."""

    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"


# Maps a (provider, model name) pair to the model identifier string used with
# that provider. Keys are 2-tuples, so the key type is spelled
# tuple[APIProvider, str]; the previous `dict[(APIProvider, str), str]`
# spelling is not a valid type expression.
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[tuple[APIProvider, str], str] = {
    (APIProvider.ANTHROPIC, "claude-3-5-sonnet-20241022"): "claude-3-5-sonnet-20241022",
    (APIProvider.BEDROCK, "claude-3-5-sonnet-20241022"): "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    (APIProvider.VERTEX, "claude-3-5-sonnet-20241022"): "claude-3-5-sonnet-v1@20241022",
    (APIProvider.ANTHROPIC, "claude-3-7-sonnet-20250219"): "claude-3-7-sonnet-20250219",
    (APIProvider.BEDROCK, "claude-3-7-sonnet-20250219"): "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    (APIProvider.VERTEX, "claude-3-7-sonnet-20250219"): "claude-3-7-sonnet-v1@20250219",
    (APIProvider.ANTHROPIC, "claude-4-opus-20250514"): "claude-4-opus-20250514",
    (APIProvider.BEDROCK, "claude-4-opus-20250514"): "us.anthropic.claude-opus-4-20250514-v1:0",
    (APIProvider.VERTEX, "claude-4-opus-20250514"): "claude-4-opus-v1@20250514",
    (APIProvider.ANTHROPIC, "claude-4-sonnet-20250514"): "claude-4-sonnet-20250514",
    (APIProvider.BEDROCK, "claude-4-sonnet-20250514"): "us.anthropic.claude-sonnet-4-20250514-v1:0",
    (APIProvider.VERTEX, "claude-4-sonnet-20250514"): "claude-sonnet-4-v1@20250514",
}
enabled. +# We encourage modifying this system prompt to ensure the model has context for the +# environment it is running in, and to provide any additional information that may be +# helpful for the task at hand. +SYSTEM_PROMPT = f""" +* You are utilising an Ubuntu virtual machine using x86_64 architecture with internet access. +* You can feel free to install Ubuntu applications with your bash tool. Use curl instead of wget. +* To open browser, please just click on the Chrome icon. Note, Chrome is what is installed on your system. +* Using bash tool you can start GUI applications, but you need to set export DISPLAY=:1 and use a subshell. For example "(DISPLAY=:1 xterm &)". GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did. +* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B -A ` to confirm output. +* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. +* DO NOT ask users for clarification during task execution. DO NOT stop to request more information from users. Always take action using available tools. +* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. +* The current date is {datetime.today().strftime('%A, %B %d, %Y')}. +* Home directory of this Ubuntu system is '/home/user'. 
+ + + +* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool. +""" + +SYSTEM_PROMPT_WINDOWS = f""" +* You are utilising a Windows virtual machine using x86_64 architecture with internet access. +* To open browser, please just click on the Chrome icon. Note, Chrome is what is installed on your system. +* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. +* The current date is {datetime.today().strftime('%A, %B %d, %Y')}. +* Home directory of this Windows system is 'C:\\Users\\user'. +* When you want to open some applications on Windows, please use Double Click on it instead of clicking once. 
def _make_api_tool_result(
    result: ToolResult, tool_use_id: str
) -> BetaToolResultBlockParam:
    """Build the API ``tool_result`` block for an agent tool result.

    A falsy ``result``, or one whose ``'error'`` entry is neither ``None`` nor
    the empty string, yields a single text block flagged as an error.
    Otherwise the block carries the ``'output'`` text and/or the
    ``'base64_image'`` PNG, or a generic success message when neither is
    present.

    Args:
        result: Tool result, read through ``.get`` with the keys ``'error'``,
            ``'output'``, ``'base64_image'`` and (via the helper) ``'system'``.
        tool_use_id: Identifier of the tool call this result answers.

    Returns:
        A dict with ``type``, ``content``, ``tool_use_id`` and ``is_error``.
    """
    if result:
        error_value = result.get('error')
        failed = error_value is not None and error_value != ""
    else:
        # No result at all is reported as a failure.
        failed = True

    blocks: Union[List[Union[BetaTextBlockParam,
                             BetaImageBlockParam]], str] = []

    if failed:
        if result:
            message = str(result.get('error', 'Unknown error occurred'))
        else:
            message = 'No result received'
        blocks = [{
            "type": "text",
            "text": _maybe_prepend_system_tool_result(result, message)
        }]
    else:
        if result.get('output'):
            output_text = str(result.get('output', ''))
            blocks.append({
                "type": "text",
                "text": _maybe_prepend_system_tool_result(result, output_text),
            })

        if result.get('base64_image'):
            image_source = {
                "type": "base64",
                "media_type": "image/png",
                "data": result.get('base64_image', ''),
            }
            blocks.append({"type": "image", "source": image_source})

        if not blocks:
            # Nothing to show: still give the model an explicit acknowledgement.
            blocks.append({
                "type": "text",
                "text": "Action completed successfully"
            })

    return {
        "type": "tool_result",
        "content": blocks,
        "tool_use_id": tool_use_id,
        "is_error": failed,
    }
until SDK types are updated + content[-1]["cache_control"] = BetaCacheControlEphemeralParam( # type: ignore + {"type": "ephemeral"} + ) + else: + content[-1].pop("cache_control", None) + # we'll only every have one extra turn per loop + break + + +def _maybe_filter_to_n_most_recent_images( + messages: list[BetaMessageParam], + images_to_keep: int, + min_removal_threshold: int, +): + """ + With the assumption that images are screenshots that are of diminishing value as + the conversation progresses, remove all but the final `images_to_keep` tool_result + images in place, with a chunk of min_removal_threshold to reduce the amount we + break the implicit prompt cache. + """ + if images_to_keep is None: + return messages + + tool_result_blocks = cast( + list[BetaToolResultBlockParam], + [ + item + for message in messages + for item in ( + message["content"] if isinstance(message["content"], list) else [] + ) + if isinstance(item, dict) and item.get("type") == "tool_result" + ], + ) + + total_images = sum( + 1 + for tool_result in tool_result_blocks + for content in tool_result.get("content", []) + if isinstance(content, dict) and content.get("type") == "image" + ) + + images_to_remove = total_images - images_to_keep + # for better cache behavior, we want to remove in chunks + images_to_remove -= images_to_remove % min_removal_threshold + + for tool_result in tool_result_blocks: + if isinstance(tool_result.get("content"), list): + new_content = [] + for content in tool_result.get("content", []): + if isinstance(content, dict) and content.get("type") == "image": + if images_to_remove > 0: + images_to_remove -= 1 + continue + new_content.append(content) + tool_result["content"] = new_content + + +def _response_to_params( + response: BetaMessage, +) -> list[BetaContentBlockParam]: + res: list[BetaContentBlockParam] = [] + if response.content: + for block in response.content: + if isinstance(block, BetaTextBlock): + if block.text: + 
# Logger Configs {{{ #
# The root logger gets four handlers: two files that receive every logger's
# records (INFO and DEBUG), plus stdout and a third file that only receive
# records from the "desktopenv" logger hierarchy.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")

# FileHandler opens its file on construction and does not create missing
# directories, so make sure the target directory exists first.
os.makedirs("logs", exist_ok=True)

file_handler = logging.FileHandler(
    os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
)
debug_handler = logging.FileHandler(
    os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
)
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(
    os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8"
)

file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)

# ANSI colour codes wrap the timestamp, level and source location.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
)
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)

# Restrict these two handlers to the "desktopenv" logger hierarchy.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))

logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #

# From here on `logger` is this script's own logger, not the root logger.
logger = logging.getLogger("desktopenv.experiment")
def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]:
    """Distribute tasks evenly across environments.

    Tasks are flattened in the iteration order of ``test_all_meta`` and cut
    into ``num_envs`` contiguous slices whose sizes differ by at most one, so
    no environment is left empty while another holds two or more tasks.

    Args:
        test_all_meta: Mapping of domain name to a list of example ids.
        num_envs: Number of environments to split the tasks over.

    Returns:
        A list of ``num_envs`` dicts, each mapping domain name to the example
        ids assigned to that environment. Entries are empty dicts only when
        there are fewer tasks than environments.

    Raises:
        ValueError: If ``num_envs`` is smaller than 1.
    """
    if num_envs < 1:
        raise ValueError(f"num_envs must be a positive integer, got {num_envs}")

    # Flatten the tasks into a single list
    all_tasks = [
        (domain, example_id)
        for domain, examples in test_all_meta.items()
        for example_id in examples
    ]

    # The first `remainder` environments take one extra task each.
    base_size, remainder = divmod(len(all_tasks), num_envs)

    distributed_tasks = []
    start_idx = 0
    for env_idx in range(num_envs):
        end_idx = start_idx + base_size + (1 if env_idx < remainder else 0)

        env_tasks = {}
        for domain, example_id in all_tasks[start_idx:end_idx]:
            env_tasks.setdefault(domain, []).append(example_id)

        distributed_tasks.append(env_tasks)
        start_idx = end_idx

    return distributed_tasks
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
    """Create the environments and agents, then run all tasks in parallel.

    One agent and one AWS-backed ``DesktopEnv`` are created per environment.
    Each (environment, agent, task subset) triple is then handed to
    ``run_env_tasks`` in its own process, and the average of the collected
    scores is logged once every process has finished.

    Args:
        args: Parsed command-line options (see ``config``).
        test_all_meta: Mapping of domain name to the example ids to run.
    """
    logger.info("Args: %s", args)

    distributed_tasks = distribute_tasks(test_all_meta, args.num_envs)

    from desktop_env.providers.aws.manager import IMAGE_ID_MAP

    # Honour the --region option instead of a hard-coded region. The fallback
    # keeps namespaces without a `region` attribute working. Resolving the
    # image before the loop makes an unknown region fail before any
    # environment is created.
    region = getattr(args, "region", "us-east-1")
    snapshot_name = IMAGE_ID_MAP[region]

    # First, set up all environments
    logger.info("Setting up all environments...")
    envs = []
    agents = []

    for env_idx in range(args.num_envs):
        logger.info(f"Setting up environment {env_idx + 1}/{args.num_envs}")

        agent = PromptAgent(
            model=args.model,
            max_tokens=args.max_tokens,
            top_p=args.top_p,
            temperature=args.temperature,
            action_space=args.action_space,
            observation_type=args.observation_type,
            max_trajectory_length=args.max_trajectory_length,
            screen_size=(args.screen_width, args.screen_height),
        )
        agents.append(agent)

        env = DesktopEnv(
            path_to_vm=args.path_to_vm,
            action_space=agent.action_space,
            provider_name="aws",
            region=region,
            snapshot_name=snapshot_name,
            screen_size=(args.screen_width, args.screen_height),
            headless=args.headless,
            os_type="Ubuntu",
            require_a11y_tree=args.observation_type
            in ["a11y_tree", "screenshot_a11y_tree", "som"],
        )
        envs.append(env)

    logger.info("All environments are ready. Starting parallel task execution...")

    # Create a shared list for scores across processes
    with Manager() as manager:
        shared_scores = manager.list()

        # Create and start processes for each environment
        processes = []
        for env_idx, (env, agent, env_tasks) in enumerate(zip(envs, agents, distributed_tasks)):
            p = Process(
                target=run_env_tasks,
                args=(env_idx, env, agent, env_tasks, args, shared_scores)
            )
            processes.append(p)
            p.start()

        # Wait for all processes to complete
        for p in processes:
            p.join()

        # Copy the scores out while the manager is still alive.
        scores = list(shared_scores)

    logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}")
def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
    """Collect the scores recorded so far and print the current success rate.

    Scans ``result_dir/action_space/observation_type/use_model/<domain>/<example>/``
    for ``result.txt`` files and parses each one as a float. A file that
    cannot be read or parsed counts as ``0.0``.

    Args:
        action_space: Action space name (path component).
        use_model: Model name (path component).
        observation_type: Observation type name (path component).
        result_dir: Root directory of all results.
        total_file_json: Unused; kept so existing callers keep working.

    Returns:
        The list of scores, or ``None`` when the target directory does not
        exist or contains no ``result.txt`` yet.
    """
    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
    if not os.path.exists(target_dir):
        print("New experiment, no result yet.")
        return None

    all_result = []

    for domain in os.listdir(target_dir):
        domain_path = os.path.join(target_dir, domain)
        if not os.path.isdir(domain_path):
            continue
        for example_id in os.listdir(domain_path):
            example_path = os.path.join(domain_path, example_id)
            if not os.path.isdir(example_path):
                continue
            if "result.txt" not in os.listdir(example_path):
                continue
            try:
                # The context manager closes the handle even when parsing fails.
                with open(os.path.join(example_path, "result.txt"), "r") as f:
                    all_result.append(float(f.read()))
            except (OSError, ValueError):
                # Unreadable or malformed score: count the example as failed,
                # but let KeyboardInterrupt/SystemExit propagate.
                all_result.append(0.0)

    if not all_result:
        print("New experiment, no result yet.")
        return None

    print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
    return all_result