From 27319ce1e359fecf530d3e23619a46d609f178db Mon Sep 17 00:00:00 2001 From: Yuan Mengqi <100453613+yuanmengqi@users.noreply.github.com> Date: Sun, 13 Jul 2025 00:25:37 +0800 Subject: [PATCH 1/3] fix password&resolution (#251) * fix chrome * fix: fix proxy setup * feat&fix: add proxy support in setup and remove hardcoded proxy from example * fix tasks * fix chrome finished * fix * clean chrome_fix code * clean chrome_fix code * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix multiapps * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix some multi_apps tasks * fix some multi_apps tasks * fix password&resolution * fix password&resolution --------- Co-authored-by: adlsdztony --- desktop_env/controllers/setup.py | 24 ++++++++++++++++++- desktop_env/desktop_env.py | 2 +- desktop_env/providers/aws/manager.py | 15 ++++++++---- .../21760ecb-8f62-40d2-8d85-0cee5725cb72.json | 2 +- .../550ce7e7-747b-495f-b122-acdc4d0b8e54.json | 2 +- .../a669ef01-ded5-4099-9ea9-25e99b569840.json | 2 +- .../ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json | 2 +- .../02ce9a50-7af2-47ed-8596-af0c230501f8.json | 2 +- .../36037439-2044-4b50-b9d1-875b5a332143.json | 2 +- .../48d05431-6cd5-4e76-82eb-12b60d823f7d.json | 2 +- .../a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json | 2 +- .../b337d106-053f-4d37-8da0-7f9c4043a66b.json | 2 +- .../13584542-872b-42d8-b299-866967b5c3ef.json | 2 +- .../23393935-50c7-4a86-aeea-2b78fd089c5c.json | 2 +- .../28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json | 2 +- .../37887e8c-da15-4192-923c-08fa390a176d.json | 2 +- .../3ce045a0-877b-42aa-8d2c-b4a863336ab8.json | 2 +- .../4783cc41-c03c-4e1b-89b4-50658f642bd5.json | 2 +- .../4d117223-a354-47fb-8b45-62ab1390a95f.json | 2 +- .../5c1075ca-bb34-46a3-a7a0-029bd7463e79.json | 2 +- .../5ced85fc-fa1a-4217-95fd-0fb530545ce2.json | 2 +- .../5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json | 2 +- .../6f56bf42-85b8-4fbb-8e06-6c44960184ba.json | 2 +- .../94d95f96-9699-4208-98ba-3c3119edf9c2.json | 2 +- .../a462a795-fdc7-4b23-b689-e8b6df786b78.json | 2 +- .../a4d98375-215b-4a4d-aee9-3d4370fccc41.json | 2 +- .../b6781586-6346-41cd-935a-a6b1487918fc.json | 2 +- .../bedcedc4-4d72-425e-ad62-21960b11fe0d.json | 2 +- .../e0df059f-28a6-4169-924f-b9623e7184cc.json | 2 +- .../ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json | 2 +- .../f9be0997-4b7c-45c5-b05c-4612b44a6118.json | 2 +- .../215dfd39-f493-4bc3-a027-8a97d72c61bf.json | 2 +- .../386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json | 2 +- .../59f21cfb-0120-4326-b255-a5b827b38967.json | 2 +- .../8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json | 2 +- .../8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json | 2 +- .../8f080098-ddb1-424c-b438-4e96e5e4786e.json | 2 +- .../9195653c-f4aa-453d-aa95-787f6ccfaae9.json | 2 +- .../a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json | 2 +- .../aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json | 2 +- .../bba3381f-b5eb-4439-bd9e-80c22218d5a7.json | 2 +- .../d06f0d4d-2cd5-4ede-8de9-598629438c6e.json | 2 +- .../efcf0d81-0835-4880-b2fd-d866e8bc2294.json | 2 +- .../f3977615-2b45-4ac5-8bba-80c17dbe2a37.json | 2 +- .../fba2c100-79e8-42df-ae74-b592418d54f4.json | 2 +- mm_agents/openai_cua_agent.py | 6 ++--- run_multienv_openaicua.py | 14 +++++------ 47 files changed, 86 insertions(+), 59 deletions(-) diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index 140a0a0..83f9212 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -27,7 +27,13 @@ import dotenv # Load environment variables from .env file dotenv.load_dotenv() -CLIENT_PASSWORD = os.getenv("CLIENT_PASSWORD", "osworld-public-evaluation") # Default password for sudo operations +if os.environ.get("PROVIDER_NAME") == "aws": + os.environ["CLIENT_PASSWORD"] = os.environ.get("CLIENT_PASSWORD_AWS", "osworld-public-evaluation") +else: + os.environ["CLIENT_PASSWORD"] = os.environ.get("CLIENT_PASSWORD", "password") + +CLIENT_PASSWORD = os.environ["CLIENT_PASSWORD"] + PROXY_CONFIG_FILE = os.getenv("PROXY_CONFIG_FILE", "evaluation_examples/settings/proxy/dataimpulse.json") # Default proxy config file logger = logging.getLogger("desktopenv.setup") @@ -298,6 +304,22 @@ class SetupController: terminates: bool = False nb_failings = 0 + def replace_screen_env_in_command(command_list): + width = int(os.environ.get("SCREEN_WIDTH", 1920)) + height = int(os.environ.get("SCREEN_HEIGHT", 1080)) + width_half = str(width // 2) + height_half = str(height // 2) + new_command_list = [] + for item in command_list: + if isinstance(item, str): + item = item.replace("{SCREEN_WIDTH_HALF}", width_half) + item = item.replace("{SCREEN_HEIGHT_HALF}", height_half) + item = item.replace("{SCREEN_WIDTH}", str(width)) + item = item.replace("{SCREEN_HEIGHT}", str(height)) + new_command_list.append(item) + return new_command_list + if isinstance(command, list): + command = replace_screen_env_in_command(command) payload = json.dumps({"command": command, "shell": shell}) headers = {"Content-Type": "application/json"} diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py index be817c9..1ade6fe7 100644 --- a/desktop_env/desktop_env.py +++ b/desktop_env/desktop_env.py @@ -32,7 +32,7 @@ class DesktopEnv(gym.Env): snapshot_name: str = "init_state", action_space: str = "computer_13", cache_dir: str = "cache", - screen_size: Tuple[int] = (1920, 1080), + screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))), headless: bool = False, require_a11y_tree: bool = True, require_terminal: bool = False, diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 287327d..4b53e1f 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -36,15 +36,22 @@ DEFAULT_REGION = "us-east-1" # todo: Add doc for the configuration of image, security group and network interface # todo: public the AMI images IMAGE_ID_MAP = { - "us-east-1": "ami-09138bff939f82bd8", - "ap-east-1": "ami-0c092a5b8be4116f5", + "us-east-1": { + (1920, 1080): "ami-09138bff939f82bd8" + }, + "ap-east-1": { + (1920, 1080): "ami-0c092a5b8be4116f5" + } } -def _allocate_vm(region=DEFAULT_REGION): +def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)): if region not in IMAGE_ID_MAP: raise ValueError(f"Region {region} is not supported. Supported regions are: {list(IMAGE_ID_MAP.keys())}") + if screen_size not in IMAGE_ID_MAP[region]: + raise ValueError(f"Screen size {screen_size} not supported for region {region}. Supported: {list(IMAGE_ID_MAP[region].keys())}") + ami_id = IMAGE_ID_MAP[region][screen_size] ec2_client = boto3.client('ec2', region_name=region) instance_id = None @@ -86,7 +93,7 @@ def _allocate_vm(region=DEFAULT_REGION): run_instances_params = { "MaxCount": 1, "MinCount": 1, - "ImageId": IMAGE_ID_MAP[region], + "ImageId": ami_id, "InstanceType": INSTANCE_TYPE, "EbsOptimized": True, "NetworkInterfaces": [ diff --git a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json index 82a830b..57e1122 100644 --- a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json +++ b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(4); pyautogui.doubleClick(x=960, y=540); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);" + "import pyautogui; import time; time.sleep(4); pyautogui.doubleClick(x={SCREEN_WIDTH_HALF}, y={SCREEN_HEIGHT_HALF}); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json index 9080319..d4e204f 100644 --- a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json +++ b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(4); pyautogui.doubleClick(x=200, y=650); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);" + "import pyautogui; import time; time.sleep(4); pyautogui.click(170, 250); time.sleep(1);pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); " ] } } diff --git a/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json b/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json index a628b82..afbc6b5 100644 --- a/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json +++ b/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json @@ -27,7 +27,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json b/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json index 9bd9603..a12b904 100644 --- a/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json +++ b/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json @@ -19,7 +19,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(5); pyautogui.click(960, 540); time.sleep(5); pyautogui.press('esc'); time.sleep(0.3); pyautogui.press('f10'); time.sleep(0.3); pyautogui.press('right', presses=2, interval=0.1); time.sleep(0.3); pyautogui.press('down', presses=11, interval=0.1); pyautogui.press('enter')" + "import pyautogui; import time; time.sleep(5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(5); pyautogui.press('esc'); time.sleep(0.3); pyautogui.press('f10'); time.sleep(0.3); pyautogui.press('right', presses=2, interval=0.1); time.sleep(0.3); pyautogui.press('down', presses=11, interval=0.1); pyautogui.press('enter')" ] } } diff --git a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json index 329f223..23f71af 100644 --- a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json +++ b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-20)" + "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-20)" ] } } diff --git a/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json b/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json index 247c8f0..f858f39 100644 --- a/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json +++ b/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json @@ -33,7 +33,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-40)" + "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-40)" ] } } diff --git a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json index bf3f492..a30dc07 100644 --- a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json +++ b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json @@ -29,7 +29,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json index a3ad108..37af93b 100644 --- a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json +++ b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json @@ -58,7 +58,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-20)" + "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-20)" ] } } diff --git a/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json b/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json index a2c7154..e27c1ef 100644 --- a/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json +++ b/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json @@ -29,7 +29,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json b/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json index 48256d3..6cb0215 100644 --- a/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json +++ b/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json b/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json index e70da1a..0ac6801 100644 --- a/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json +++ b/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json @@ -87,7 +87,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json b/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json index 83184b6..0e7c261 100644 --- a/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json +++ b/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json b/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json index 207cbf0..6e6b8a5 100644 --- a/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json +++ b/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json @@ -35,7 +35,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json b/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json index c29d3f8..b44f697 100644 --- a/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json +++ b/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json b/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json index 298cf81..98f82f9 100644 --- a/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json +++ b/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } }, diff --git a/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json b/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json index 282f755..63293fd 100644 --- a/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json +++ b/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json @@ -36,7 +36,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json b/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json index ed99498..810109e 100644 --- a/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json +++ b/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json @@ -53,7 +53,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json b/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json index fad4457..2b00214 100644 --- a/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json +++ b/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json @@ -10,7 +10,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json index 9fb58af..5da9b3d 100644 --- a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json +++ b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json @@ -29,7 +29,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json b/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json index bf9286d..5d574a1 100644 --- a/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json +++ b/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json @@ -36,7 +36,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" + "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)" ] } }, diff --git a/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json b/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json index dfbb050..747f26c 100644 --- a/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json +++ b/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json b/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json index 38223a6..c6bafd5 100644 --- a/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json +++ b/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json b/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json index 3868a5d..d473b75 100644 --- a/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json +++ b/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json @@ -23,7 +23,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json b/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json index f1879c1..2d1de91 100644 --- a/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json +++ b/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json @@ -11,7 +11,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json index 7ce0048..48e233c 100644 --- a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json +++ b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json @@ -23,7 +23,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json index 9c3234f..94b1a4f 100644 --- a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json +++ b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json @@ -18,7 +18,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json b/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json index 71d86fc..fe5496c 100644 --- a/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json +++ b/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json @@ -30,7 +30,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json b/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json index cda3ca2..94ebacb 100644 --- a/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json +++ b/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json b/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json index bb09d9d..aabaa0f 100644 --- a/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json +++ b/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json b/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json index 261c16e..14eaafa 100644 --- a/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json +++ b/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json @@ -68,7 +68,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json index af7d904..650f586 100644 --- a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json +++ b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json index aa3057e..35f4e58 100644 --- a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json +++ b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json b/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json index c398268..0bd5ed7 100644 --- a/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json +++ b/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json index 96641a3..01e886b 100644 --- a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json +++ b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json b/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json index a82831c..ec512fe 100644 --- a/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json +++ b/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json b/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json index 81fde78..36e6d0f 100644 --- a/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json +++ b/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json index 58eaf58..55290dd 100644 --- a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json +++ b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json index f4b85b6..8f2e9c7 100644 --- a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json +++ b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json b/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json index b36cba9..ca63fd5 100644 --- a/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json +++ b/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json b/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json index a5f6367..0019cd6 100644 --- a/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json +++ b/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json @@ -28,7 +28,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json index 4261d38..73bbac2 100644 --- a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json +++ b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json @@ -17,7 +17,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json b/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json index 7f383fc..bde2b5e 100644 --- a/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json +++ b/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json @@ -40,7 +40,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.click(500, 500); time.sleep(0.5);" + "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);" ] } } diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index 15db312..064a2d8 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -33,7 +33,7 @@ class_ns_windows = "https://accessibility.windows.example.org/ns/class" import ast from typing import Dict, Any, Optional, Union -OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"osworld-public-evaluation\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ +OPERATOR_PROMPT = f"""\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{os.environ["CLIENT_PASSWORD"]}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ class Action: """Action class for the agent.""" @@ -233,8 +233,8 @@ class OpenAICUAAgent: self.tools = [{ "type": "computer_use_preview", - "display_width": 1920, - "display_height": 1080, + "display_width": int(os.environ["SCREEN_WIDTH"]), + "display_height": int(os.environ["SCREEN_HEIGHT"]), "environment": "linux" if platform == "ubuntu" else "windows" }] diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py index 278ebec..6e9bca3 100644 --- a/run_multienv_openaicua.py +++ b/run_multienv_openaicua.py @@ -47,8 +47,6 @@ def config() -> argparse.Namespace: default="screenshot", help="Observation type", ) - parser.add_argument("--screen_width", type=int, default=1920) - parser.add_argument("--screen_height", type=int, default=1080) parser.add_argument("--sleep_after_execution", type=float, default=0.0) parser.add_argument("--max_steps", type=int, default=15) @@ -181,16 +179,16 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share signal.signal(signal.SIGTERM, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) from desktop_env.providers.aws.manager import IMAGE_ID_MAP - REGION = "us-east-1" + REGION = args.region + screen_size = (int(os.environ["SCREEN_WIDTH"]), int(os.environ["SCREEN_HEIGHT"])) + ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) env = DesktopEnv( path_to_vm=args.path_to_vm, action_space=args.action_space, - - provider_name="aws", + provider_name=os.environ["PROVIDER_NAME"], region=REGION, - snapshot_name=IMAGE_ID_MAP[REGION], - - screen_size=(args.screen_width, args.screen_height), + snapshot_name=ami_id, + screen_size=screen_size, headless=args.headless, os_type="Ubuntu", require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], From 877e75a013a1e11b59161f7e20bf0844cf3b419f Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sat, 12 Jul 2025 16:34:55 +0000 Subject: [PATCH 2/3] Final review multi_apps fix Xinzhuang part --- desktop_env/evaluators/getters/chrome.py | 90 +++++++++++++++++-- desktop_env/evaluators/metrics/table.py | 63 ++++++++++--- .../2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json | 2 +- .../3680a5ee-6870-426a-a997-eba929a0d25c.json | 15 +++- .../42d25c08-fb87-4927-8b65-93631280a26f.json | 51 ++++++++--- 5 files changed, 187 insertions(+), 34 deletions(-) diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index bff1b92..724e2de 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -52,6 +52,11 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: - attribute (str): optional for 'attribute' and 'click_and_attribute', the attribute to be extracted. - backups (Any): The backup information to be returned if the extraction fails. """ + # 添加函数开始日志 + logger.info(f"[INFO_FROM_WEBSITE] Starting to get information from website: {config.get('url', 'N/A')}") + logger.info(f"[INFO_FROM_WEBSITE] Total info operations to perform: {len(config.get('infos', []))}") + logger.debug(f"[INFO_FROM_WEBSITE] Full config: {config}") + try: host = env.vm_ip port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file @@ -59,11 +64,18 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: remote_debugging_url = f"http://{host}:{port}" backend_url = f"http://{host}:{server_port}" use_proxy = env.current_use_proxy + + logger.info(f"[INFO_FROM_WEBSITE] Connecting to Chrome at {remote_debugging_url}") + with sync_playwright() as p: # connect to remote Chrome instance try: browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to existing Chrome instance") except Exception as e: + logger.warning(f"[INFO_FROM_WEBSITE] Failed to connect to existing Chrome instance: {e}") + logger.info(f"[INFO_FROM_WEBSITE] Starting new Chrome instance...") + # If the connection fails (e.g., the agent close the browser instance), start a new browser instance app = 'chromium' if 'arm' in platform.machine() else 'google-chrome' command = [ @@ -72,52 +84,116 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: ] if use_proxy: command.append(f"--proxy-server=127.0.0.1:18888") + logger.info(f"[INFO_FROM_WEBSITE] Using proxy server: 127.0.0.1:18888") + + logger.info(f"[INFO_FROM_WEBSITE] Starting browser with command: {' '.join(command)}") payload = json.dumps({"command": command, "shell": False}) headers = {"Content-Type": "application/json"} #requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) requests.post(backend_url + "/setup" + "/launch", headers=headers, data=payload) time.sleep(5) browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to new Chrome instance") page = browser.contexts[0].new_page() + logger.info(f"[INFO_FROM_WEBSITE] Created new page, navigating to: {config['url']}") + page.goto(config["url"]) page.wait_for_load_state('load') + + # 记录页面加载完成后的信息 + logger.info(f"[INFO_FROM_WEBSITE] Page loaded successfully") + logger.info(f"[INFO_FROM_WEBSITE] Page title: '{page.title()}'") + logger.info(f"[INFO_FROM_WEBSITE] Current URL: '{page.url}'") + infos = [] - for info_dict in config.get('infos', []): + for idx, info_dict in enumerate(config.get('infos', [])): + logger.info(f"[INFO_FROM_WEBSITE] Processing info operation {idx + 1}/{len(config.get('infos', []))}") + logger.debug(f"[INFO_FROM_WEBSITE] Info config: {info_dict}") + if page.url != config["url"]: + logger.info(f"[INFO_FROM_WEBSITE] Page URL changed, navigating back to: {config['url']}") page.goto(config["url"]) page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Back to original page") + action = info_dict.get('action', 'inner_text') + selector = info_dict.get('selector') + logger.info(f"[INFO_FROM_WEBSITE] Action: {action}, Selector: {selector}") + if action == "inner_text": + logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}") ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000) - infos.append(ele.inner_text()) + extracted_text = ele.inner_text() + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text: '{extracted_text}'") + infos.append(extracted_text) + elif action == "attribute": + attribute = info_dict.get('attribute') + logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}") + logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute: {attribute}") ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000) - infos.append(ele.get_attribute(info_dict['attribute'])) + extracted_attr = ele.get_attribute(info_dict['attribute']) + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}': '{extracted_attr}'") + infos.append(extracted_attr) + elif action == 'click_and_inner_text': + logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_inner_text with {len(info_dict['selector'])} selectors") for idx, sel in enumerate(info_dict['selector']): + logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}") if idx != len(info_dict['selector']) - 1: + logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}") link = page.wait_for_selector(sel, state='attached', timeout=10000) link.click() page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded") + logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}") else: + logger.debug(f"[INFO_FROM_WEBSITE] Extracting inner_text from final element: {sel}") ele = page.wait_for_selector(sel, state='attached', timeout=10000) - infos.append(ele.inner_text()) + extracted_text = ele.inner_text() + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text after clicks: '{extracted_text}'") + infos.append(extracted_text) + elif action == 'click_and_attribute': + attribute = info_dict.get('attribute') + logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_attribute with {len(info_dict['selector'])} selectors") + logger.debug(f"[INFO_FROM_WEBSITE] Target attribute: {attribute}") for idx, sel in enumerate(info_dict['selector']): + logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}") if idx != len(info_dict['selector']) - 1: + logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}") link = page.wait_for_selector(sel, state='attached', timeout=10000) link.click() page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded") + logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}") else: + logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute from final element: {sel}") ele = page.wait_for_selector(sel, state='attached') - infos.append(ele.get_attribute(info_dict['attribute'])) + extracted_attr = ele.get_attribute(info_dict['attribute']) + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}' after clicks: '{extracted_attr}'") + infos.append(extracted_attr) else: + logger.error(f"[INFO_FROM_WEBSITE] Unsupported action: {action}") raise NotImplementedError(f'The action {action} is not supported yet.') + + logger.info(f"[INFO_FROM_WEBSITE] Completed info operation {idx + 1}") + + # 记录最终提取的所有信息 + logger.info(f"[INFO_FROM_WEBSITE] All operations completed successfully") + logger.info(f"[INFO_FROM_WEBSITE] Total extracted information count: {len(infos)}") + logger.info(f"[INFO_FROM_WEBSITE] Final extracted information: {infos}") + return infos except Exception as e: - logger.error(f'[ERROR]: failed to obtain information from the website: {config["url"]}. Use backup results instead.') - return config.get('backups', None) + logger.error(f'[INFO_FROM_WEBSITE] ERROR: Failed to obtain information from website: {config.get("url", "N/A")}') + logger.error(f'[INFO_FROM_WEBSITE] Exception details: {str(e)}') + logger.error(f'[INFO_FROM_WEBSITE] Exception type: {type(e).__name__}') + logger.info(f'[INFO_FROM_WEBSITE] Using backup results instead') + backup_data = config.get('backups', None) + logger.info(f'[INFO_FROM_WEBSITE] Backup data: {backup_data}') + return backup_data # The following ones just need to load info from the files of software, no need to connect to the software diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py index 9e888c7..db51850 100644 --- a/desktop_env/evaluators/metrics/table.py +++ b/desktop_env/evaluators/metrics/table.py @@ -463,23 +463,60 @@ def compare_table(result: str, expected: str = None, **options) -> float: # }}} function compare_table # -def compare_csv(result: str, expected: str, **options) -> float: +def compare_csv(result: str, expected: Union[str, List[str]], **options) -> float: + """ + Compare CSV files. If expected is a list, returns 1.0 if result matches any of the expected files. + + Args: + result: Path to result CSV file + expected: Path to expected CSV file or list of paths to expected CSV files + options: Additional options (strict, ignore_case) + + Returns: + 1.0 if result matches expected (or any file in expected list), 0.0 otherwise + """ if result is None: return 0. - with open(result) as f: - result_lines: Iterable[str] = f.read().splitlines() - with open(expected) as f: - expected_lines: Iterable[str] = f.read().splitlines() - if not options.get("strict", True): - result_lines = map(str.strip, result_lines) - expected_lines = map(str.strip, expected_lines) - if options.get("ignore_case", False): - result_lines = map(str.lower, result_lines) - expected_lines = map(str.lower, expected_lines) + try: + with open(result) as f: + result_lines: Iterable[str] = f.read().splitlines() + except (FileNotFoundError, IOError): + return 0. - metric: bool = list(result_lines) == list(expected_lines) - return float(metric) + # Convert expected to list if it's a single string (for backward compatibility) + if isinstance(expected, str): + expected_files = [expected] + else: + expected_files = expected + + # Try to match against each expected file + for expected_file in expected_files: + try: + with open(expected_file) as f: + expected_lines: Iterable[str] = f.read().splitlines() + + # Process lines based on options + current_result_lines = result_lines + current_expected_lines = expected_lines + + if not options.get("strict", True): + current_result_lines = map(str.strip, current_result_lines) + current_expected_lines = map(str.strip, current_expected_lines) + if options.get("ignore_case", False): + current_result_lines = map(str.lower, current_result_lines) + current_expected_lines = map(str.lower, current_expected_lines) + + # Check if this expected file matches + if list(current_result_lines) == list(current_expected_lines): + return 1.0 + + except (FileNotFoundError, IOError): + # If this expected file doesn't exist, continue to next one + continue + + # No match found + return 0.0 def compare_conference_city_in_order(actual_city_list_path, expected_city): diff --git a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json index d2c2d10..beb9bb2 100644 --- a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json +++ b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json @@ -1,7 +1,7 @@ { "id": "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", "snapshot": "multiapps", - "instruction": "I want to understand the resource usage of my Ubuntu system under normal workloads. Please use the `sar` command in the `sysstat` toolkit to monitor system activity, evaluate the status once every second for 30 seconds, output the results to \"System_Resources_Report.txt\" under Desktop.", + "instruction": "Monitor Ubuntu system resource usage using the sar command from sysstat toolkit. Collect CPU statistics every second for 30 seconds and save the output to 'System_Resources_Report.txt' on Desktop.", "source": "author", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json index 3ad3704..2cb77d1 100644 --- a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json +++ b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json @@ -37,6 +37,7 @@ "check_include_exclude", "compare_csv" ], + "conj": "and", "result": [ { "type": "vm_command_line", @@ -63,8 +64,18 @@ }, { "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv", - "dest": "output_gold.csv" + "multi": true, + "path": [ + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv", + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold2.csv", + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold3.csv" + ], + "dest": [ + "output_gold.csv", + "output_gold2.csv", + "output_gold3.csv" + ], + "gives": [0, 1, 2] } ] }, diff --git a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json index 710ac31..59091fd 100644 --- a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json +++ b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json @@ -144,17 +144,46 @@ "os" ], "evaluator": { - "func": "compare_epub", - "result": { - "type": "vm_file", - "dest": "Pass Through.epub", - "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub" - }, - "expected": { - "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", - "dest": "Pass Through Gold.epub" - } + "func": [ + "compare_epub", + "compare_epub", + "compare_epub" + ], + "conj": "or", + "result": [ + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub" + }, + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/Pass_Through.epub" + }, + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/pass_through.epub" + } + ], + "expected": [ + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + } + ] }, "proxy": true } \ No newline at end of file From 97ed6f99b0039c76eb82d1210b299c7fef87d85b Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sat, 12 Jul 2025 20:28:55 +0000 Subject: [PATCH 3/3] Final review multi_apps fix the rest part --- desktop_env/evaluators/metrics/chrome.py | 24 +++++++++++++------ desktop_env/evaluators/metrics/gimp.py | 5 +++- .../a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json | 14 +++++++---- .../e8172110-ec08-421b-a6f5-842e6451911f.json | 16 +++++-------- .../f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json | 2 +- 5 files changed, 38 insertions(+), 23 deletions(-) diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index 632c53e..6c3811f 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -29,8 +29,8 @@ def is_expected_active_tab(active_tab_info: Dict[str, str], rule: Dict[str, Any] actual_url = active_tab_info.get('url', None) else: actual_url = active_tab_info - print("expected_url: {}".format(expected_url)) - print("actual_url: {}".format(actual_url)) + logger.info("expected_url: {}".format(expected_url)) + logger.info("actual_url: {}".format(actual_url)) return 1 if compare_urls(expected_url, actual_url) else 0 else: logger.error(f"Unknown type: {match_type}") @@ -76,23 +76,26 @@ def is_expected_url_pattern_match(result, rules) -> float: if type(result) == dict: result_url = result["url"] - print("result url: {}".format(result_url)) + logger.info("result url: {}".format(result_url)) else: result_url = result # expect_regex = re.compile(rules["expected"]) patterns = rules["expected"] - print("expected_regex: {}".format(patterns)) + logger.info("expected_regex: {}".format(patterns)) for pattern in patterns: match = re.search(pattern, result_url) - print(match) + logger.info("match: {}".format(match)) if not match: return 0. return 1. def is_expected_installed_extensions(installed_extensions, expected) -> float: - print("installed_extensions: ") - print(installed_extensions) + if not installed_extensions: + return 0. + + logger.info("installed_extensions: ") + logger.info(installed_extensions) expected_extensions = expected["expected"] # whether the expected extensions are installed @@ -109,6 +112,8 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f """ Checks if the expected tabs are open in Chrome. """ + if not open_tabs: + return 0. match_type = rule['type'] @@ -146,8 +151,10 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float: bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None) if liked_authors_folder: # Check if it contains the specified URLs + logger.info("'Liked Authors' folder exists") liked_authors_urls = [bookmark['url'] for bookmark in liked_authors_folder['children'] if bookmark['type'] == 'url'] + logger.info("Here is the 'Liked Authors' folder's urls: {}".format(liked_authors_urls)) urls = rule['urls'] @@ -168,6 +175,9 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float: def is_expected_search_query(active_tab_info: Dict[str, str], rules: Dict[str, Any]) -> float: + if not active_tab_info: + return 0. + expected = rules['expect'] pattern = expected['pattern'] matched = re.search(pattern, active_tab_info['url']) diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py index 5dddd78..a6dcc29 100644 --- a/desktop_env/evaluators/metrics/gimp.py +++ b/desktop_env/evaluators/metrics/gimp.py @@ -396,7 +396,10 @@ def check_structure_sim_resized(src_path, tgt_path): # Check if the structure is similar structure_same = structure_check_by_ssim(img_src_resized, img_tgt) - return structure_same + if structure_same: + return 1. + else: + return 0. def check_contrast_increase_and_structure_sim(src_path, tgt_path): diff --git a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json index 37af93b..a612d38 100644 --- a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json +++ b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json @@ -83,21 +83,27 @@ "urls": [ [ "https://jimfan.me/", - "https://research.nvidia.com/person/linxi-jim-fan" + "https://research.nvidia.com/person/linxi-jim-fan", + "https://www.linkedin.com/in/drjimfan/" ], [ "https://research.nvidia.com/person/de-an-huang", - "https://ai.stanford.edu/~dahuang/" + "https://ai.stanford.edu/~dahuang/", + "https://www.linkedin.com/in/de-an-huang-38242a69" ], [ "https://yukezhu.me/", "https://www.cs.utexas.edu/people/faculty-researchers/yuke-zhu", "https://experts.utexas.edu/yuke_zhu", - "https://research.nvidia.com/person/yuke-zhu" + "https://research.nvidia.com/person/yuke-zhu", + "https://www.linkedin.com/in/yukez/" ], [ + "https://tensorlab.cms.caltech.edu/users/anima/", "http://tensorlab.cms.caltech.edu/users/anima/", - "https://www.eas.caltech.edu/people/anima" + "https://www.eas.caltech.edu/people/anima", + "https://en.wikipedia.org/wiki/Anima_Anandkumar", + "https://www.linkedin.com/in/anima-anandkumar/" ] ] } diff --git a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json index 8506dda..9f5f924 100644 --- a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json +++ b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json @@ -11,10 +11,6 @@ { "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character.png", "path": "/home/user/Desktop/character.png" - }, - { - "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", - "path": "/home/user/Desktop/character_no_background_gold.png" } ] } @@ -36,8 +32,8 @@ ], "evaluator": { "func": [ - "check_structure_sim_resized", - "check_structure_sim_resized" + "check_structure_sim", + "check_structure_sim" ], "result": [ { @@ -53,13 +49,13 @@ ], "expected": [ { - "type": "vm_file", - "path": "/home/user/Desktop/character_no_background_gold.png", + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", "dest": "character_no_background_gold.png" }, { - "type": "vm_file", - "path": "/home/user/Desktop/character_no_background_gold.png", + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", "dest": "character_no_background_gold.png" } ] diff --git a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json index 7dd4f83..2441c3e 100644 --- a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json +++ b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json @@ -65,7 +65,7 @@ "type": "rule", "rules": { "expect": { - "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)" + "pattern": "(?i)https?://(?:www\\.)?google\\.com/search\\?q=nereida(?:&|$|#).*" } } }