From 27319ce1e359fecf530d3e23619a46d609f178db Mon Sep 17 00:00:00 2001
From: Yuan Mengqi <100453613+yuanmengqi@users.noreply.github.com>
Date: Sun, 13 Jul 2025 00:25:37 +0800
Subject: [PATCH 1/3] fix password&resolution (#251)

* fix chrome

* fix: fix proxy setup

* feat&fix: add proxy support in setup and remove hardcoded proxy from example

* fix tasks

* fix chrome finished

* fix

* clean chrome_fix code

* clean chrome_fix code

* fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774

* fix multiapps

* fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774

* fix some multi_apps tasks

* fix some multi_apps tasks

* fix password&resolution

* fix password&resolution

---------

Co-authored-by: adlsdztony <zzl0712@connect.hku.hk>
---
 desktop_env/controllers/setup.py              | 24 ++++++++++++++++++-
 desktop_env/desktop_env.py                    |  2 +-
 desktop_env/providers/aws/manager.py          | 15 ++++++++----
 .../21760ecb-8f62-40d2-8d85-0cee5725cb72.json |  2 +-
 .../550ce7e7-747b-495f-b122-acdc4d0b8e54.json |  2 +-
 .../a669ef01-ded5-4099-9ea9-25e99b569840.json |  2 +-
 .../ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json |  2 +-
 .../02ce9a50-7af2-47ed-8596-af0c230501f8.json |  2 +-
 .../36037439-2044-4b50-b9d1-875b5a332143.json |  2 +-
 .../48d05431-6cd5-4e76-82eb-12b60d823f7d.json |  2 +-
 .../a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json |  2 +-
 .../b337d106-053f-4d37-8da0-7f9c4043a66b.json |  2 +-
 .../13584542-872b-42d8-b299-866967b5c3ef.json |  2 +-
 .../23393935-50c7-4a86-aeea-2b78fd089c5c.json |  2 +-
 .../28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json |  2 +-
 .../37887e8c-da15-4192-923c-08fa390a176d.json |  2 +-
 .../3ce045a0-877b-42aa-8d2c-b4a863336ab8.json |  2 +-
 .../4783cc41-c03c-4e1b-89b4-50658f642bd5.json |  2 +-
 .../4d117223-a354-47fb-8b45-62ab1390a95f.json |  2 +-
 .../5c1075ca-bb34-46a3-a7a0-029bd7463e79.json |  2 +-
 .../5ced85fc-fa1a-4217-95fd-0fb530545ce2.json |  2 +-
 .../5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json |  2 +-
 .../6f56bf42-85b8-4fbb-8e06-6c44960184ba.json |  2 +-
 .../94d95f96-9699-4208-98ba-3c3119edf9c2.json |  2 +-
 .../a462a795-fdc7-4b23-b689-e8b6df786b78.json |  2 +-
 .../a4d98375-215b-4a4d-aee9-3d4370fccc41.json |  2 +-
 .../b6781586-6346-41cd-935a-a6b1487918fc.json |  2 +-
 .../bedcedc4-4d72-425e-ad62-21960b11fe0d.json |  2 +-
 .../e0df059f-28a6-4169-924f-b9623e7184cc.json |  2 +-
 .../ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json |  2 +-
 .../f9be0997-4b7c-45c5-b05c-4612b44a6118.json |  2 +-
 .../215dfd39-f493-4bc3-a027-8a97d72c61bf.json |  2 +-
 .../386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json |  2 +-
 .../59f21cfb-0120-4326-b255-a5b827b38967.json |  2 +-
 .../8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json |  2 +-
 .../8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json |  2 +-
 .../8f080098-ddb1-424c-b438-4e96e5e4786e.json |  2 +-
 .../9195653c-f4aa-453d-aa95-787f6ccfaae9.json |  2 +-
 .../a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json |  2 +-
 .../aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json |  2 +-
 .../bba3381f-b5eb-4439-bd9e-80c22218d5a7.json |  2 +-
 .../d06f0d4d-2cd5-4ede-8de9-598629438c6e.json |  2 +-
 .../efcf0d81-0835-4880-b2fd-d866e8bc2294.json |  2 +-
 .../f3977615-2b45-4ac5-8bba-80c17dbe2a37.json |  2 +-
 .../fba2c100-79e8-42df-ae74-b592418d54f4.json |  2 +-
 mm_agents/openai_cua_agent.py                 |  6 ++---
 run_multienv_openaicua.py                     | 14 +++++------
 47 files changed, 86 insertions(+), 59 deletions(-)

diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py
index 140a0a0..83f9212 100644
--- a/desktop_env/controllers/setup.py
+++ b/desktop_env/controllers/setup.py
@@ -27,7 +27,13 @@ import dotenv
 # Load environment variables from .env file
 dotenv.load_dotenv()
 
-CLIENT_PASSWORD = os.getenv("CLIENT_PASSWORD", "osworld-public-evaluation")  # Default password for sudo operations
+# Resolve the client password once: the AWS provider reads its own variable and default.
+if os.environ.get("PROVIDER_NAME") == "aws":
+    CLIENT_PASSWORD = os.environ.get("CLIENT_PASSWORD_AWS", "osworld-public-evaluation")
+else:
+    CLIENT_PASSWORD = os.environ.get("CLIENT_PASSWORD", "password")
+os.environ["CLIENT_PASSWORD"] = CLIENT_PASSWORD  # keep the env var in sync with the resolved value
+
 PROXY_CONFIG_FILE = os.getenv("PROXY_CONFIG_FILE", "evaluation_examples/settings/proxy/dataimpulse.json")  # Default proxy config file
 
 logger = logging.getLogger("desktopenv.setup")
@@ -298,6 +304,22 @@ class SetupController:
         terminates: bool = False
         nb_failings = 0
 
+        def replace_screen_env_in_command(command_list):
+            """Fill {SCREEN_*} placeholders in a command given as a str or a list of args."""
+            width = int(os.environ.get("SCREEN_WIDTH", 1920))
+            height = int(os.environ.get("SCREEN_HEIGHT", 1080))
+            substitutions = (
+                ("{SCREEN_WIDTH_HALF}", str(width // 2)), ("{SCREEN_HEIGHT_HALF}", str(height // 2)),
+                ("{SCREEN_WIDTH}", str(width)), ("{SCREEN_HEIGHT}", str(height)),
+            )
+            def fill(entry):
+                if isinstance(entry, str):  # non-string entries pass through unchanged
+                    for token, value in substitutions:
+                        entry = entry.replace(token, value)
+                return entry
+            return [fill(e) for e in command_list] if isinstance(command_list, list) else fill(command_list)
+        # Plain-string commands (e.g. with shell=True) may carry placeholders too, so always substitute.
+        command = replace_screen_env_in_command(command)
         payload = json.dumps({"command": command, "shell": shell})
         headers = {"Content-Type": "application/json"}
 
diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py
index be817c9..1ade6fe7 100644
--- a/desktop_env/desktop_env.py
+++ b/desktop_env/desktop_env.py
@@ -32,7 +32,7 @@ class DesktopEnv(gym.Env):
             snapshot_name: str = "init_state",
             action_space: str = "computer_13",
             cache_dir: str = "cache",
-            screen_size: Tuple[int] = (1920, 1080),
+            screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))),
             headless: bool = False,
             require_a11y_tree: bool = True,
             require_terminal: bool = False,
diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py
index 287327d..4b53e1f 100644
--- a/desktop_env/providers/aws/manager.py
+++ b/desktop_env/providers/aws/manager.py
@@ -36,15 +36,22 @@ DEFAULT_REGION = "us-east-1"
 # todo: Add doc for the configuration of image, security group and network interface
 # todo: public the AMI images
 IMAGE_ID_MAP = {
-    "us-east-1": "ami-09138bff939f82bd8",
-    "ap-east-1": "ami-0c092a5b8be4116f5",
+    "us-east-1": {  # region name -> {screen_size tuple: AMI ID}
+        (1920, 1080): "ami-09138bff939f82bd8"  # keyed by the screen_size tuple given to _allocate_vm
+    },
+    "ap-east-1": {
+        (1920, 1080): "ami-0c092a5b8be4116f5"
+    }
 }
 
 
-def _allocate_vm(region=DEFAULT_REGION):
+def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)):
     
     if region not in IMAGE_ID_MAP:
         raise ValueError(f"Region {region} is not supported. Supported regions are: {list(IMAGE_ID_MAP.keys())}")
+    if screen_size not in IMAGE_ID_MAP[region]:
+        raise ValueError(f"Screen size {screen_size} not supported for region {region}. Supported: {list(IMAGE_ID_MAP[region].keys())}")
+    ami_id = IMAGE_ID_MAP[region][screen_size]
 
     ec2_client = boto3.client('ec2', region_name=region)
     instance_id = None
@@ -86,7 +93,7 @@ def _allocate_vm(region=DEFAULT_REGION):
         run_instances_params = {
             "MaxCount": 1,
             "MinCount": 1,
-            "ImageId": IMAGE_ID_MAP[region],
+            "ImageId": ami_id,
             "InstanceType": INSTANCE_TYPE,
             "EbsOptimized": True,
             "NetworkInterfaces": [
diff --git a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json
index 82a830b..57e1122 100644
--- a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json
+++ b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json
@@ -33,7 +33,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time;  time.sleep(4); pyautogui.doubleClick(x=960, y=540); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);"
+          "import pyautogui; import time;  time.sleep(4); pyautogui.doubleClick(x={SCREEN_WIDTH_HALF}, y={SCREEN_HEIGHT_HALF}); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json
index 9080319..d4e204f 100644
--- a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json
+++ b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json
@@ -33,7 +33,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time;  time.sleep(4); pyautogui.doubleClick(x=200, y=650); time.sleep(0.5);pyautogui.mouseDown(); pyautogui.mouseUp(); time.sleep(0.5);"
+          "import pyautogui; import time;  time.sleep(4); pyautogui.click(170, 250); time.sleep(1);pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); pyautogui.press('down'); time.sleep(1); "
         ]
       }
     }
diff --git a/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json b/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json
index a628b82..afbc6b5 100644
--- a/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json
+++ b/evaluation_examples/examples/libreoffice_impress/a669ef01-ded5-4099-9ea9-25e99b569840.json
@@ -27,7 +27,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json b/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json
index 9bd9603..a12b904 100644
--- a/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json
+++ b/evaluation_examples/examples/libreoffice_impress/ef9d12bd-bcee-4ba0-a40e-918400f43ddf.json
@@ -19,7 +19,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(5); pyautogui.click(960, 540); time.sleep(5); pyautogui.press('esc'); time.sleep(0.3); pyautogui.press('f10'); time.sleep(0.3); pyautogui.press('right', presses=2, interval=0.1); time.sleep(0.3); pyautogui.press('down', presses=11, interval=0.1); pyautogui.press('enter')"
+          "import pyautogui; import time; time.sleep(5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(5); pyautogui.press('esc'); time.sleep(0.3); pyautogui.press('f10'); time.sleep(0.3); pyautogui.press('right', presses=2, interval=0.1); time.sleep(0.3); pyautogui.press('down', presses=11, interval=0.1); pyautogui.press('enter')"
         ]
       }
     }
diff --git a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json
index 329f223..23f71af 100644
--- a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json
+++ b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json
@@ -33,7 +33,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-20)"
+          "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-20)"
         ]
       }
     }
diff --git a/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json b/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json
index 247c8f0..f858f39 100644
--- a/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json
+++ b/evaluation_examples/examples/multi_apps/36037439-2044-4b50-b9d1-875b5a332143.json
@@ -33,7 +33,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-40)"
+          "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-40)"
         ]
       }
     }
diff --git a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json
index bf3f492..a30dc07 100644
--- a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json
+++ b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json
@@ -29,7 +29,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
+          "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
         ]
       }
     },
diff --git a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json
index a3ad108..37af93b 100644
--- a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json
+++ b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json
@@ -58,7 +58,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.scroll(-20)"
+          "import pyautogui; import time; pyautogui.hotkey('f11'); time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.scroll(-20)"
         ]
       }
     }
diff --git a/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json b/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json
index a2c7154..e27c1ef 100644
--- a/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json
+++ b/evaluation_examples/examples/multi_apps/b337d106-053f-4d37-8da0-7f9c4043a66b.json
@@ -29,7 +29,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
+          "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
         ]
       }
     },
diff --git a/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json b/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json
index 48256d3..6cb0215 100644
--- a/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json
+++ b/evaluation_examples/examples/os/13584542-872b-42d8-b299-866967b5c3ef.json
@@ -11,7 +11,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json b/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json
index e70da1a..0ac6801 100644
--- a/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json
+++ b/evaluation_examples/examples/os/23393935-50c7-4a86-aeea-2b78fd089c5c.json
@@ -87,7 +87,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json b/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json
index 83184b6..0e7c261 100644
--- a/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json
+++ b/evaluation_examples/examples/os/28cc3b7e-b194-4bc9-8353-d04c0f4d56d2.json
@@ -11,7 +11,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json b/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json
index 207cbf0..6e6b8a5 100644
--- a/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json
+++ b/evaluation_examples/examples/os/37887e8c-da15-4192-923c-08fa390a176d.json
@@ -35,7 +35,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
+          "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
         ]
       }
     },
diff --git a/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json b/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json
index c29d3f8..b44f697 100644
--- a/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json
+++ b/evaluation_examples/examples/os/3ce045a0-877b-42aa-8d2c-b4a863336ab8.json
@@ -11,7 +11,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json b/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json
index 298cf81..98f82f9 100644
--- a/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json
+++ b/evaluation_examples/examples/os/4783cc41-c03c-4e1b-89b4-50658f642bd5.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     },
diff --git a/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json b/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json
index 282f755..63293fd 100644
--- a/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json
+++ b/evaluation_examples/examples/os/4d117223-a354-47fb-8b45-62ab1390a95f.json
@@ -36,7 +36,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
+          "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
         ]
       }
     },
diff --git a/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json b/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json
index ed99498..810109e 100644
--- a/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json
+++ b/evaluation_examples/examples/os/5c1075ca-bb34-46a3-a7a0-029bd7463e79.json
@@ -53,7 +53,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
         ]
       }
     },
diff --git a/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json b/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json
index fad4457..2b00214 100644
--- a/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json
+++ b/evaluation_examples/examples/os/5ced85fc-fa1a-4217-95fd-0fb530545ce2.json
@@ -10,7 +10,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
+          "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
         ]
       }
     },
diff --git a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json
index 9fb58af..5da9b3d 100644
--- a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json
+++ b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json
@@ -29,7 +29,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json b/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json
index bf9286d..5d574a1 100644
--- a/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json
+++ b/evaluation_examples/examples/os/6f56bf42-85b8-4fbb-8e06-6c44960184ba.json
@@ -36,7 +36,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(0.5); pyautogui.click(960, 540); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
+          "import pyautogui; import time; time.sleep(0.5); pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5)"
         ]
       }
     },
diff --git a/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json b/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json
index dfbb050..747f26c 100644
--- a/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json
+++ b/evaluation_examples/examples/os/94d95f96-9699-4208-98ba-3c3119edf9c2.json
@@ -11,7 +11,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json b/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json
index 38223a6..c6bafd5 100644
--- a/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json
+++ b/evaluation_examples/examples/os/a462a795-fdc7-4b23-b689-e8b6df786b78.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json b/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json
index 3868a5d..d473b75 100644
--- a/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json
+++ b/evaluation_examples/examples/os/a4d98375-215b-4a4d-aee9-3d4370fccc41.json
@@ -23,7 +23,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json b/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json
index f1879c1..2d1de91 100644
--- a/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json
+++ b/evaluation_examples/examples/os/b6781586-6346-41cd-935a-a6b1487918fc.json
@@ -11,7 +11,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json
index 7ce0048..48e233c 100644
--- a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json
+++ b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json
@@ -23,7 +23,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json
index 9c3234f..94b1a4f 100644
--- a/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json
+++ b/evaluation_examples/examples/os/e0df059f-28a6-4169-924f-b9623e7184cc.json
@@ -18,7 +18,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json b/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json
index 71d86fc..fe5496c 100644
--- a/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json
+++ b/evaluation_examples/examples/os/ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3.json
@@ -30,7 +30,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json b/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json
index cda3ca2..94ebacb 100644
--- a/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json
+++ b/evaluation_examples/examples/os/f9be0997-4b7c-45c5-b05c-4612b44a6118.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json b/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json
index bb09d9d..aabaa0f 100644
--- a/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json
+++ b/evaluation_examples/examples/vlc/215dfd39-f493-4bc3-a027-8a97d72c61bf.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json b/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json
index 261c16e..14eaafa 100644
--- a/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json
+++ b/evaluation_examples/examples/vlc/386dbd0e-0241-4a0a-b6a2-6704fba26b1c.json
@@ -68,7 +68,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json
index af7d904..650f586 100644
--- a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json
+++ b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json
@@ -28,7 +28,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json
index aa3057e..35f4e58 100644
--- a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json
+++ b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json b/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json
index c398268..0bd5ed7 100644
--- a/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json
+++ b/evaluation_examples/examples/vlc/8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f.json
@@ -28,7 +28,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json
index 96641a3..01e886b 100644
--- a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json
+++ b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json
@@ -28,7 +28,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json b/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json
index a82831c..ec512fe 100644
--- a/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json
+++ b/evaluation_examples/examples/vlc/9195653c-f4aa-453d-aa95-787f6ccfaae9.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json b/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json
index 81fde78..36e6d0f 100644
--- a/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json
+++ b/evaluation_examples/examples/vlc/a5bbbcd5-b398-4c91-83d4-55e1e31bbb81.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json
index 58eaf58..55290dd 100644
--- a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json
+++ b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json
@@ -28,7 +28,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json
index f4b85b6..8f2e9c7 100644
--- a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json
+++ b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json b/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json
index b36cba9..ca63fd5 100644
--- a/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json
+++ b/evaluation_examples/examples/vlc/d06f0d4d-2cd5-4ede-8de9-598629438c6e.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json b/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json
index a5f6367..0019cd6 100644
--- a/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json
+++ b/evaluation_examples/examples/vlc/efcf0d81-0835-4880-b2fd-d866e8bc2294.json
@@ -28,7 +28,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json
index 4261d38..73bbac2 100644
--- a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json
+++ b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json
@@ -17,7 +17,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json b/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json
index 7f383fc..bde2b5e 100644
--- a/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json
+++ b/evaluation_examples/examples/vlc/fba2c100-79e8-42df-ae74-b592418d54f4.json
@@ -40,7 +40,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; pyautogui.click(500, 500); time.sleep(0.5);"
+          "import pyautogui; import time; pyautogui.click({SCREEN_WIDTH_HALF}, {SCREEN_HEIGHT_HALF}); time.sleep(0.5);"
         ]
       }
     }
diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py
index 15db312..064a2d8 100644
--- a/mm_agents/openai_cua_agent.py
+++ b/mm_agents/openai_cua_agent.py
@@ -33,7 +33,7 @@ class_ns_windows = "https://accessibility.windows.example.org/ns/class"
 import ast
 from typing import Dict, Any, Optional, Union
 
-OPERATOR_PROMPT = """\n\n        Here are some helpful tips:\n        - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n        - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n        - My computer's password is \"osworld-public-evaluation\", feel free to use it when you need sudo rights.\n        - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n        - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n        - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n        - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n        - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n        - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n    """
+OPERATOR_PROMPT = f"""\n\n        Here are some helpful tips:\n        - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n        - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n        - My computer's password is \"{os.environ["CLIENT_PASSWORD"]}\", feel free to use it when you need sudo rights.\n        - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n        - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n        - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n        - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n        - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n        - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n    """
 
 class Action:
     """Action class for the agent."""
@@ -233,8 +233,8 @@ class OpenAICUAAgent:
 
         self.tools = [{
             "type": "computer_use_preview",
-            "display_width": 1920,
-            "display_height": 1080,
+            "display_width": int(os.environ["SCREEN_WIDTH"]),
+            "display_height": int(os.environ["SCREEN_HEIGHT"]),
             "environment": "linux" if platform == "ubuntu" else "windows"
         }]
 
diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py
index 278ebec..6e9bca3 100644
--- a/run_multienv_openaicua.py
+++ b/run_multienv_openaicua.py
@@ -47,8 +47,6 @@ def config() -> argparse.Namespace:
         default="screenshot",
         help="Observation type",
     )
-    parser.add_argument("--screen_width", type=int, default=1920)
-    parser.add_argument("--screen_height", type=int, default=1080)
     parser.add_argument("--sleep_after_execution", type=float, default=0.0)
     parser.add_argument("--max_steps", type=int, default=15)
 
@@ -181,16 +179,16 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share
     signal.signal(signal.SIGTERM, lambda signum, frame: process_signal_handler(signum, frame, env_idx))
     
     from desktop_env.providers.aws.manager import IMAGE_ID_MAP
-    REGION = "us-east-1"
+    REGION = args.region
+    screen_size = (int(os.environ["SCREEN_WIDTH"]), int(os.environ["SCREEN_HEIGHT"]))
+    ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)])
     env = DesktopEnv(
         path_to_vm=args.path_to_vm,
         action_space=args.action_space,
-
-        provider_name="aws",
+        provider_name=os.environ["PROVIDER_NAME"],
         region=REGION,
-        snapshot_name=IMAGE_ID_MAP[REGION],
-
-        screen_size=(args.screen_width, args.screen_height),
+        snapshot_name=ami_id,
+        screen_size=screen_size,
         headless=args.headless,
         os_type="Ubuntu",
         require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],

From 877e75a013a1e11b59161f7e20bf0844cf3b419f Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Sat, 12 Jul 2025 16:34:55 +0000
Subject: [PATCH 2/3] Final review multi_apps fix Xinzhuang part

---
 desktop_env/evaluators/getters/chrome.py      | 90 +++++++++++++++++--
 desktop_env/evaluators/metrics/table.py       | 63 ++++++++++---
 .../2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json |  2 +-
 .../3680a5ee-6870-426a-a997-eba929a0d25c.json | 15 +++-
 .../42d25c08-fb87-4927-8b65-93631280a26f.json | 51 ++++++++---
 5 files changed, 187 insertions(+), 34 deletions(-)

diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py
index bff1b92..724e2de 100644
--- a/desktop_env/evaluators/getters/chrome.py
+++ b/desktop_env/evaluators/getters/chrome.py
@@ -52,6 +52,11 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
                 - attribute (str): optional for 'attribute' and 'click_and_attribute', the attribute to be extracted.
             - backups (Any): The backup information to be returned if the extraction fails.
     """
+    # Log the start of the function call
+    logger.info(f"[INFO_FROM_WEBSITE] Starting to get information from website: {config.get('url', 'N/A')}")
+    logger.info(f"[INFO_FROM_WEBSITE] Total info operations to perform: {len(config.get('infos', []))}")
+    logger.debug(f"[INFO_FROM_WEBSITE] Full config: {config}")
+    
     try:
         host = env.vm_ip
         port = env.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
@@ -59,11 +64,18 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
         remote_debugging_url = f"http://{host}:{port}"
         backend_url = f"http://{host}:{server_port}"
         use_proxy = env.current_use_proxy
+        
+        logger.info(f"[INFO_FROM_WEBSITE] Connecting to Chrome at {remote_debugging_url}")
+        
         with sync_playwright() as p:
             # connect to remote Chrome instance
             try:
                 browser = p.chromium.connect_over_cdp(remote_debugging_url)
+                logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to existing Chrome instance")
             except Exception as e:
+                logger.warning(f"[INFO_FROM_WEBSITE] Failed to connect to existing Chrome instance: {e}")
+                logger.info(f"[INFO_FROM_WEBSITE] Starting new Chrome instance...")
+                
                 # If the connection fails (e.g., the agent close the browser instance), start a new browser instance
                 app = 'chromium' if 'arm' in platform.machine() else 'google-chrome'
                 command = [
@@ -72,52 +84,116 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
                 ]
                 if use_proxy:
                     command.append(f"--proxy-server=127.0.0.1:18888")
+                    logger.info(f"[INFO_FROM_WEBSITE] Using proxy server: 127.0.0.1:18888")
+                
+                logger.info(f"[INFO_FROM_WEBSITE] Starting browser with command: {' '.join(command)}")
                 payload = json.dumps({"command": command, "shell": False})
                 headers = {"Content-Type": "application/json"}
                 #requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
                 requests.post(backend_url + "/setup" + "/launch", headers=headers, data=payload)
                 time.sleep(5)
                 browser = p.chromium.connect_over_cdp(remote_debugging_url)
+                logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to new Chrome instance")
 
             page = browser.contexts[0].new_page()
+            logger.info(f"[INFO_FROM_WEBSITE] Created new page, navigating to: {config['url']}")
+            
             page.goto(config["url"])
             page.wait_for_load_state('load')
+            
+            # Log page details once the page has finished loading
+            logger.info(f"[INFO_FROM_WEBSITE] Page loaded successfully")
+            logger.info(f"[INFO_FROM_WEBSITE] Page title: '{page.title()}'")
+            logger.info(f"[INFO_FROM_WEBSITE] Current URL: '{page.url}'")
+            
             infos = []
-            for info_dict in config.get('infos', []):
+            for idx, info_dict in enumerate(config.get('infos', [])):
+                logger.info(f"[INFO_FROM_WEBSITE] Processing info operation {idx + 1}/{len(config.get('infos', []))}")
+                logger.debug(f"[INFO_FROM_WEBSITE] Info config: {info_dict}")
+                
                 if page.url != config["url"]:
+                    logger.info(f"[INFO_FROM_WEBSITE] Page URL changed, navigating back to: {config['url']}")
                     page.goto(config["url"])
                     page.wait_for_load_state('load')
+                    logger.info(f"[INFO_FROM_WEBSITE] Back to original page")
+                
                 action = info_dict.get('action', 'inner_text')
+                selector = info_dict.get('selector')
+                logger.info(f"[INFO_FROM_WEBSITE] Action: {action}, Selector: {selector}")
+                
                 if action == "inner_text":
+                    logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}")
                     ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
-                    infos.append(ele.inner_text())
+                    extracted_text = ele.inner_text()
+                    logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text: '{extracted_text}'")
+                    infos.append(extracted_text)
+                    
                 elif action == "attribute":
+                    attribute = info_dict.get('attribute')
+                    logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}")
+                    logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute: {attribute}")
                     ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
-                    infos.append(ele.get_attribute(info_dict['attribute']))
+                    extracted_attr = ele.get_attribute(info_dict['attribute'])
+                    logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}': '{extracted_attr}'")
+                    infos.append(extracted_attr)
+                    
                 elif action == 'click_and_inner_text':
+                    logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_inner_text with {len(info_dict['selector'])} selectors")
                     for idx, sel in enumerate(info_dict['selector']):
+                        logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}")
                         if idx != len(info_dict['selector']) - 1:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}")
                             link = page.wait_for_selector(sel, state='attached', timeout=10000)
                             link.click()
                             page.wait_for_load_state('load')
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded")
+                            logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}")
                         else:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Extracting inner_text from final element: {sel}")
                             ele = page.wait_for_selector(sel, state='attached', timeout=10000)
-                            infos.append(ele.inner_text())
+                            extracted_text = ele.inner_text()
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text after clicks: '{extracted_text}'")
+                            infos.append(extracted_text)
+                            
                 elif action == 'click_and_attribute':
+                    attribute = info_dict.get('attribute')
+                    logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_attribute with {len(info_dict['selector'])} selectors")
+                    logger.debug(f"[INFO_FROM_WEBSITE] Target attribute: {attribute}")
                     for idx, sel in enumerate(info_dict['selector']):
+                        logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}")
                         if idx != len(info_dict['selector']) - 1:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}")
                             link = page.wait_for_selector(sel, state='attached', timeout=10000)
                             link.click()
                             page.wait_for_load_state('load')
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded")
+                            logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}")
                         else:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute from final element: {sel}")
                             ele = page.wait_for_selector(sel, state='attached')
-                            infos.append(ele.get_attribute(info_dict['attribute']))
+                            extracted_attr = ele.get_attribute(info_dict['attribute'])
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}' after clicks: '{extracted_attr}'")
+                            infos.append(extracted_attr)
                 else:
+                    logger.error(f"[INFO_FROM_WEBSITE] Unsupported action: {action}")
                     raise NotImplementedError(f'The action {action} is not supported yet.')
+                
+                logger.info(f"[INFO_FROM_WEBSITE] Completed info operation {idx + 1}")
+            
+            # Log everything that was extracted across all operations
+            logger.info(f"[INFO_FROM_WEBSITE] All operations completed successfully")
+            logger.info(f"[INFO_FROM_WEBSITE] Total extracted information count: {len(infos)}")
+            logger.info(f"[INFO_FROM_WEBSITE] Final extracted information: {infos}")
+            
         return infos
     except Exception as e:
-        logger.error(f'[ERROR]: failed to obtain information from the website: {config["url"]}. Use backup results instead.')
-        return config.get('backups', None)
+        logger.error(f'[INFO_FROM_WEBSITE] ERROR: Failed to obtain information from website: {config.get("url", "N/A")}')
+        logger.error(f'[INFO_FROM_WEBSITE] Exception details: {str(e)}')
+        logger.error(f'[INFO_FROM_WEBSITE] Exception type: {type(e).__name__}')
+        logger.info(f'[INFO_FROM_WEBSITE] Using backup results instead')
+        backup_data = config.get('backups', None)
+        logger.info(f'[INFO_FROM_WEBSITE] Backup data: {backup_data}')
+        return backup_data
 
 
 # The following ones just need to load info from the files of software, no need to connect to the software
diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py
index 9e888c7..db51850 100644
--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -463,23 +463,60 @@ def compare_table(result: str, expected: str = None, **options) -> float:
     #  }}} function compare_table # 
 
 
-def compare_csv(result: str, expected: str, **options) -> float:
+def compare_csv(result: str, expected: Union[str, List[str]], **options) -> float:
+    """
+    Compare CSV files. If expected is a list, returns 1.0 if result matches any of the expected files.
+    
+    Args:
+        result: Path to result CSV file
+        expected: Path to expected CSV file or list of paths to expected CSV files
+        options: Additional options (strict, ignore_case)
+    
+    Returns:
+        1.0 if result matches expected (or any file in expected list), 0.0 otherwise
+    """
     if result is None:
         return 0.
 
-    with open(result) as f:
-        result_lines: Iterable[str] = f.read().splitlines()
-    with open(expected) as f:
-        expected_lines: Iterable[str] = f.read().splitlines()
-    if not options.get("strict", True):
-        result_lines = map(str.strip, result_lines)
-        expected_lines = map(str.strip, expected_lines)
-    if options.get("ignore_case", False):
-        result_lines = map(str.lower, result_lines)
-        expected_lines = map(str.lower, expected_lines)
+    try:
+        with open(result) as f:
+            result_lines: Iterable[str] = f.read().splitlines()
+    except (FileNotFoundError, IOError):
+        return 0.
 
-    metric: bool = list(result_lines) == list(expected_lines)
-    return float(metric)
+    # Convert expected to list if it's a single string (for backward compatibility)
+    if isinstance(expected, str):
+        expected_files = [expected]
+    else:
+        expected_files = expected
+
+    # Try to match against each expected file
+    for expected_file in expected_files:
+        try:
+            with open(expected_file) as f:
+                expected_lines: Iterable[str] = f.read().splitlines()
+            
+            # Process lines based on options
+            current_result_lines = result_lines
+            current_expected_lines = expected_lines
+            
+            if not options.get("strict", True):
+                current_result_lines = map(str.strip, current_result_lines)
+                current_expected_lines = map(str.strip, current_expected_lines)
+            if options.get("ignore_case", False):
+                current_result_lines = map(str.lower, current_result_lines)
+                current_expected_lines = map(str.lower, current_expected_lines)
+
+            # Check if this expected file matches
+            if list(current_result_lines) == list(current_expected_lines):
+                return 1.0
+                
+        except (FileNotFoundError, IOError):
+            # If this expected file doesn't exist, continue to next one
+            continue
+
+    # No match found
+    return 0.0
 
 
 def compare_conference_city_in_order(actual_city_list_path, expected_city):
diff --git a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json
index d2c2d10..beb9bb2 100644
--- a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json
+++ b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json
@@ -1,7 +1,7 @@
 {
   "id": "2373b66a-092d-44cb-bfd7-82e86e7a3b4d",
   "snapshot": "multiapps",
-  "instruction": "I want to understand the resource usage of my Ubuntu system under normal workloads. Please use the `sar` command in the `sysstat` toolkit to monitor system activity, evaluate the status once every second for 30 seconds, output the results to \"System_Resources_Report.txt\" under Desktop.",
+  "instruction": "Monitor Ubuntu system resource usage using the sar command from sysstat toolkit. Collect CPU statistics every second for 30 seconds and save the output to 'System_Resources_Report.txt' on Desktop.",
   "source": "author",
   "config": [
     {
diff --git a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json
index 3ad3704..2cb77d1 100644
--- a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json
+++ b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json
@@ -37,6 +37,7 @@
       "check_include_exclude",
       "compare_csv"
     ],
+    "conj": "and",
     "result": [
       {
         "type": "vm_command_line",
@@ -63,8 +64,18 @@
       },
       {
         "type": "cloud_file",
-        "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv",
-        "dest": "output_gold.csv"
+        "multi": true,
+        "path": [
+          "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv",
+          "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold2.csv",
+          "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold3.csv"
+        ],
+        "dest": [
+          "output_gold.csv",
+          "output_gold2.csv", 
+          "output_gold3.csv"
+        ],
+        "gives": [0, 1, 2]
       }
     ]
   },
diff --git a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json
index 710ac31..59091fd 100644
--- a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json
+++ b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json
@@ -144,17 +144,46 @@
     "os"
   ],
   "evaluator": {
-    "func": "compare_epub",
-    "result": {
-      "type": "vm_file",
-      "dest": "Pass Through.epub",
-      "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub"
-    },
-    "expected": {
-      "type": "cloud_file",
-      "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
-      "dest": "Pass Through Gold.epub"
-    }
+    "func": [
+      "compare_epub",
+      "compare_epub",
+      "compare_epub"
+    ],
+    "conj": "or",
+    "result": [
+      {
+        "type": "vm_file",
+        "dest": "Pass Through.epub",
+        "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub"
+      },
+      {
+        "type": "vm_file",
+        "dest": "Pass Through.epub",
+        "path": "/home/user/Documents/Novels/Pass Through/Pass_Through.epub"
+      },
+      {
+        "type": "vm_file",
+        "dest": "Pass Through.epub",
+        "path": "/home/user/Documents/Novels/Pass Through/pass_through.epub"
+      }
+    ],
+    "expected": [
+      {
+        "type": "cloud_file",
+        "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
+        "dest": "Pass Through Gold.epub"
+      },
+      {
+        "type": "cloud_file",
+        "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
+        "dest": "Pass Through Gold.epub"
+      },
+      {
+        "type": "cloud_file",
+        "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
+        "dest": "Pass Through Gold.epub"
+      }
+    ]
   },
   "proxy": true
 }
\ No newline at end of file

From 97ed6f99b0039c76eb82d1210b299c7fef87d85b Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Sat, 12 Jul 2025 20:28:55 +0000
Subject: [PATCH 3/3] Final review multi_apps fix the rest part

---
 desktop_env/evaluators/metrics/chrome.py      | 24 +++++++++++++------
 desktop_env/evaluators/metrics/gimp.py        |  5 +++-
 .../a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json | 14 +++++++----
 .../e8172110-ec08-421b-a6f5-842e6451911f.json | 16 +++++--------
 .../f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json |  2 +-
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py
index 632c53e..6c3811f 100644
--- a/desktop_env/evaluators/metrics/chrome.py
+++ b/desktop_env/evaluators/metrics/chrome.py
@@ -29,8 +29,8 @@ def is_expected_active_tab(active_tab_info: Dict[str, str], rule: Dict[str, Any]
             actual_url = active_tab_info.get('url', None)
         else:
             actual_url = active_tab_info
-        print("expected_url: {}".format(expected_url))
-        print("actual_url: {}".format(actual_url))
+        logger.info("expected_url: {}".format(expected_url))
+        logger.info("actual_url: {}".format(actual_url))
         return 1 if compare_urls(expected_url, actual_url) else 0
     else:
         logger.error(f"Unknown type: {match_type}")
@@ -76,23 +76,26 @@ def is_expected_url_pattern_match(result, rules) -> float:
 
     if type(result) == dict:
         result_url = result["url"]
-        print("result url: {}".format(result_url))
+        logger.info("result url: {}".format(result_url))
     else:
         result_url = result
     # expect_regex = re.compile(rules["expected"])
     patterns = rules["expected"]
-    print("expected_regex: {}".format(patterns))
+    logger.info("expected_regex: {}".format(patterns))
     for pattern in patterns:
         match = re.search(pattern, result_url)
-        print(match)
+        logger.info("match: {}".format(match))
         if not match:
             return 0.
     return 1.
 
 
 def is_expected_installed_extensions(installed_extensions, expected) -> float:
-    print("installed_extensions: ")
-    print(installed_extensions)
+    if not installed_extensions:
+        return 0.
+
+    logger.info("installed_extensions: ")
+    logger.info(installed_extensions)
     expected_extensions = expected["expected"]
 
     # whether the expected extensions are installed
@@ -109,6 +112,8 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f
     """
     Checks if the expected tabs are open in Chrome.
     """
+    if not open_tabs:
+        return 0.
 
     match_type = rule['type']
 
@@ -146,8 +151,10 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:
                                      bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None)
         if liked_authors_folder:
             # Check if it contains the specified URLs
+            logger.info("'Liked Authors' folder exists")
             liked_authors_urls = [bookmark['url'] for bookmark in liked_authors_folder['children'] if
                                   bookmark['type'] == 'url']
+            logger.info("Here is the 'Liked Authors' folder's urls: {}".format(liked_authors_urls))
 
             urls = rule['urls']
 
@@ -168,6 +175,9 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:
 
 
 def is_expected_search_query(active_tab_info: Dict[str, str], rules: Dict[str, Any]) -> float:
+    if not active_tab_info:
+        return 0.
+
     expected = rules['expect']
     pattern = expected['pattern']
     matched = re.search(pattern, active_tab_info['url'])
diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py
index 5dddd78..a6dcc29 100644
--- a/desktop_env/evaluators/metrics/gimp.py
+++ b/desktop_env/evaluators/metrics/gimp.py
@@ -396,7 +396,10 @@ def check_structure_sim_resized(src_path, tgt_path):
 
     # Check if the structure is similar
     structure_same = structure_check_by_ssim(img_src_resized, img_tgt)
-    return structure_same
+    if structure_same:
+        return 1.
+    else:
+        return 0.
 
 
 def check_contrast_increase_and_structure_sim(src_path, tgt_path):
diff --git a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json
index 37af93b..a612d38 100644
--- a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json
+++ b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json
@@ -83,21 +83,27 @@
         "urls": [
           [
             "https://jimfan.me/",
-            "https://research.nvidia.com/person/linxi-jim-fan"
+            "https://research.nvidia.com/person/linxi-jim-fan",
+            "https://www.linkedin.com/in/drjimfan/"
           ],
           [
             "https://research.nvidia.com/person/de-an-huang",
-            "https://ai.stanford.edu/~dahuang/"
+            "https://ai.stanford.edu/~dahuang/",
+            "https://www.linkedin.com/in/de-an-huang-38242a69"
           ],
           [
             "https://yukezhu.me/",
             "https://www.cs.utexas.edu/people/faculty-researchers/yuke-zhu",
             "https://experts.utexas.edu/yuke_zhu",
-            "https://research.nvidia.com/person/yuke-zhu"
+            "https://research.nvidia.com/person/yuke-zhu",
+            "https://www.linkedin.com/in/yukez/"
           ],
           [
+            "https://tensorlab.cms.caltech.edu/users/anima/",
             "http://tensorlab.cms.caltech.edu/users/anima/",
-            "https://www.eas.caltech.edu/people/anima"
+            "https://www.eas.caltech.edu/people/anima",
+            "https://en.wikipedia.org/wiki/Anima_Anandkumar",
+            "https://www.linkedin.com/in/anima-anandkumar/"
           ]
         ]
       }
diff --git a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json
index 8506dda..9f5f924 100644
--- a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json
+++ b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json
@@ -11,10 +11,6 @@
           {
             "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character.png",
             "path": "/home/user/Desktop/character.png"
-          },
-          {
-            "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png",
-            "path": "/home/user/Desktop/character_no_background_gold.png"
           }
         ]
       }
@@ -36,8 +32,8 @@
   ],
   "evaluator": {
     "func": [
-      "check_structure_sim_resized",
-      "check_structure_sim_resized"
+      "check_structure_sim",
+      "check_structure_sim"
     ],
     "result": [
       {
@@ -53,13 +49,13 @@
     ],
     "expected": [
       {
-        "type": "vm_file",
-        "path": "/home/user/Desktop/character_no_background_gold.png",
+        "type": "cloud_file",
+        "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png",
         "dest": "character_no_background_gold.png"
       },
       {
-        "type": "vm_file",
-        "path": "/home/user/Desktop/character_no_background_gold.png",
+        "type": "cloud_file",
+        "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png",
         "dest": "character_no_background_gold.png"
       }
     ]
diff --git a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json
index 7dd4f83..2441c3e 100644
--- a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json
+++ b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json
@@ -65,7 +65,7 @@
       "type": "rule",
       "rules": {
         "expect": {
-          "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)"
+          "pattern": "(?i)https?://(?:www\\.)?google\\.com/search\\?q=nereida(?:&|$|#).*"
         }
       }
     }