Merge branch 'main' into fix_chrome
@@ -32,7 +32,7 @@ class DesktopEnv(gym.Env):
         snapshot_name: str = "init_state",
         action_space: str = "computer_13",
         cache_dir: str = "cache",
-        screen_size: Tuple[int] = (1920, 1080),
+        screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))),
         headless: bool = False,
         require_a11y_tree: bool = True,
         require_terminal: bool = False,
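
The new default reads the screen geometry from the environment, so a deployment can override it without code changes. A minimal sketch of the behavior (the override values below are illustrative):

    import os

    # Hypothetical overrides; without them the defaults 1920/1080 apply.
    os.environ["SCREEN_WIDTH"] = "1280"
    os.environ["SCREEN_HEIGHT"] = "720"

    screen_size = (int(os.environ.get("SCREEN_WIDTH", 1920)),
                   int(os.environ.get("SCREEN_HEIGHT", 1080)))
    print(screen_size)  # (1280, 720)
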
@@ -52,6 +52,11 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
         - attribute (str): optional for 'attribute' and 'click_and_attribute', the attribute to be extracted.
         - backups (Any): The backup information to be returned if the extraction fails.
     """
+    # Log function entry
+    logger.info(f"[INFO_FROM_WEBSITE] Starting to get information from website: {config.get('url', 'N/A')}")
+    logger.info(f"[INFO_FROM_WEBSITE] Total info operations to perform: {len(config.get('infos', []))}")
+    logger.debug(f"[INFO_FROM_WEBSITE] Full config: {config}")
+
     try:
         host = env.vm_ip
         port = env.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
@@ -59,11 +64,18 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
         remote_debugging_url = f"http://{host}:{port}"
         backend_url = f"http://{host}:{server_port}"
         use_proxy = env.current_use_proxy
+
+        logger.info(f"[INFO_FROM_WEBSITE] Connecting to Chrome at {remote_debugging_url}")
+
         with sync_playwright() as p:
             # connect to remote Chrome instance
             try:
                 browser = p.chromium.connect_over_cdp(remote_debugging_url)
+                logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to existing Chrome instance")
             except Exception as e:
+                logger.warning(f"[INFO_FROM_WEBSITE] Failed to connect to existing Chrome instance: {e}")
+                logger.info(f"[INFO_FROM_WEBSITE] Starting new Chrome instance...")
+
                 # If the connection fails (e.g., the agent close the browser instance), start a new browser instance
                 app = 'chromium' if 'arm' in platform.machine() else 'google-chrome'
                 command = [
@@ -72,52 +84,116 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
                 ]
                 if use_proxy:
                     command.append(f"--proxy-server=127.0.0.1:18888")
+                    logger.info(f"[INFO_FROM_WEBSITE] Using proxy server: 127.0.0.1:18888")
+
+                logger.info(f"[INFO_FROM_WEBSITE] Starting browser with command: {' '.join(command)}")
                 payload = json.dumps({"command": command, "shell": False})
                 headers = {"Content-Type": "application/json"}
                 #requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
                 requests.post(backend_url + "/setup" + "/launch", headers=headers, data=payload)
                 time.sleep(5)
                 browser = p.chromium.connect_over_cdp(remote_debugging_url)
+                logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to new Chrome instance")
+
             page = browser.contexts[0].new_page()
+            logger.info(f"[INFO_FROM_WEBSITE] Created new page, navigating to: {config['url']}")
+
             page.goto(config["url"])
             page.wait_for_load_state('load')
+
+            # Log page state after the load completes
+            logger.info(f"[INFO_FROM_WEBSITE] Page loaded successfully")
+            logger.info(f"[INFO_FROM_WEBSITE] Page title: '{page.title()}'")
+            logger.info(f"[INFO_FROM_WEBSITE] Current URL: '{page.url}'")
+
             infos = []
-            for info_dict in config.get('infos', []):
+            for idx, info_dict in enumerate(config.get('infos', [])):
+                logger.info(f"[INFO_FROM_WEBSITE] Processing info operation {idx + 1}/{len(config.get('infos', []))}")
+                logger.debug(f"[INFO_FROM_WEBSITE] Info config: {info_dict}")
+
                 if page.url != config["url"]:
+                    logger.info(f"[INFO_FROM_WEBSITE] Page URL changed, navigating back to: {config['url']}")
                     page.goto(config["url"])
                     page.wait_for_load_state('load')
+                    logger.info(f"[INFO_FROM_WEBSITE] Back to original page")
+
                 action = info_dict.get('action', 'inner_text')
+                selector = info_dict.get('selector')
+                logger.info(f"[INFO_FROM_WEBSITE] Action: {action}, Selector: {selector}")
+
                 if action == "inner_text":
+                    logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}")
                     ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
-                    infos.append(ele.inner_text())
+                    extracted_text = ele.inner_text()
+                    logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text: '{extracted_text}'")
+                    infos.append(extracted_text)
+
                 elif action == "attribute":
+                    attribute = info_dict.get('attribute')
+                    logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}")
+                    logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute: {attribute}")
                     ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
-                    infos.append(ele.get_attribute(info_dict['attribute']))
+                    extracted_attr = ele.get_attribute(info_dict['attribute'])
+                    logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}': '{extracted_attr}'")
+                    infos.append(extracted_attr)
+
                 elif action == 'click_and_inner_text':
+                    logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_inner_text with {len(info_dict['selector'])} selectors")
                     for idx, sel in enumerate(info_dict['selector']):
+                        logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}")
                         if idx != len(info_dict['selector']) - 1:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}")
                             link = page.wait_for_selector(sel, state='attached', timeout=10000)
                             link.click()
                             page.wait_for_load_state('load')
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded")
+                            logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}")
                         else:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Extracting inner_text from final element: {sel}")
                             ele = page.wait_for_selector(sel, state='attached', timeout=10000)
-                            infos.append(ele.inner_text())
+                            extracted_text = ele.inner_text()
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text after clicks: '{extracted_text}'")
+                            infos.append(extracted_text)
+
                 elif action == 'click_and_attribute':
+                    attribute = info_dict.get('attribute')
+                    logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_attribute with {len(info_dict['selector'])} selectors")
+                    logger.debug(f"[INFO_FROM_WEBSITE] Target attribute: {attribute}")
                     for idx, sel in enumerate(info_dict['selector']):
+                        logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}")
                         if idx != len(info_dict['selector']) - 1:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}")
                             link = page.wait_for_selector(sel, state='attached', timeout=10000)
                             link.click()
                             page.wait_for_load_state('load')
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded")
+                            logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}")
                         else:
+                            logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute from final element: {sel}")
                             ele = page.wait_for_selector(sel, state='attached')
-                            infos.append(ele.get_attribute(info_dict['attribute']))
+                            extracted_attr = ele.get_attribute(info_dict['attribute'])
+                            logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}' after clicks: '{extracted_attr}'")
+                            infos.append(extracted_attr)
                 else:
+                    logger.error(f"[INFO_FROM_WEBSITE] Unsupported action: {action}")
                     raise NotImplementedError(f'The action {action} is not supported yet.')
+
+                logger.info(f"[INFO_FROM_WEBSITE] Completed info operation {idx + 1}")
+
+            # Log all extracted information
+            logger.info(f"[INFO_FROM_WEBSITE] All operations completed successfully")
+            logger.info(f"[INFO_FROM_WEBSITE] Total extracted information count: {len(infos)}")
+            logger.info(f"[INFO_FROM_WEBSITE] Final extracted information: {infos}")
+
             return infos
     except Exception as e:
-        logger.error(f'[ERROR]: failed to obtain information from the website: {config["url"]}. Use backup results instead.')
-        return config.get('backups', None)
+        logger.error(f'[INFO_FROM_WEBSITE] ERROR: Failed to obtain information from website: {config.get("url", "N/A")}')
+        logger.error(f'[INFO_FROM_WEBSITE] Exception details: {str(e)}')
+        logger.error(f'[INFO_FROM_WEBSITE] Exception type: {type(e).__name__}')
+        logger.info(f'[INFO_FROM_WEBSITE] Using backup results instead')
+        backup_data = config.get('backups', None)
+        logger.info(f'[INFO_FROM_WEBSITE] Backup data: {backup_data}')
+        return backup_data


 # The following ones just need to load info from the files of software, no need to connect to the software
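
For reference, a config of the shape this function consumes, inferred from the docstring and the action dispatch above; the URL and selectors below are hypothetical placeholders:

    # Hypothetical task config exercising the actions dispatched above
    # (the URL and selectors are placeholders, not from the repository).
    config = {
        "url": "https://example.com/authors",
        "infos": [
            {"action": "inner_text", "selector": "h1.title"},
            {"action": "attribute", "selector": "a.profile", "attribute": "href"},
            # click through all but the last selector, then extract from the last
            {"action": "click_and_inner_text", "selector": ["a.first", "div.bio"]},
        ],
        "backups": ["fallback text"],  # returned if any extraction fails
    }
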
@@ -29,8 +29,8 @@ def is_expected_active_tab(active_tab_info: Dict[str, str], rule: Dict[str, Any]
             actual_url = active_tab_info.get('url', None)
         else:
             actual_url = active_tab_info
-        print("expected_url: {}".format(expected_url))
-        print("actual_url: {}".format(actual_url))
+        logger.info("expected_url: {}".format(expected_url))
+        logger.info("actual_url: {}".format(actual_url))
         return 1 if compare_urls(expected_url, actual_url) else 0
     else:
         logger.error(f"Unknown type: {match_type}")
@@ -76,23 +76,26 @@ def is_expected_url_pattern_match(result, rules) -> float:

     if type(result) == dict:
         result_url = result["url"]
-        print("result url: {}".format(result_url))
+        logger.info("result url: {}".format(result_url))
     else:
         result_url = result
     # expect_regex = re.compile(rules["expected"])
     patterns = rules["expected"]
-    print("expected_regex: {}".format(patterns))
+    logger.info("expected_regex: {}".format(patterns))
     for pattern in patterns:
         match = re.search(pattern, result_url)
-        print(match)
+        logger.info("match: {}".format(match))
         if not match:
             return 0.
     return 1.


 def is_expected_installed_extensions(installed_extensions, expected) -> float:
-    print("installed_extensions: ")
-    print(installed_extensions)
+    if not installed_extensions:
+        return 0.
+
+    logger.info("installed_extensions: ")
+    logger.info(installed_extensions)
     expected_extensions = expected["expected"]

     # whether the expected extensions are installed
@@ -109,6 +112,8 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f
     """
     Checks if the expected tabs are open in Chrome.
     """
+    if not open_tabs:
+        return 0.

     match_type = rule['type']

@@ -146,8 +151,10 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:
                                      bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None)
         if liked_authors_folder:
             # Check if it contains the specified URLs
+            logger.info("'Liked Authors' folder exists")
             liked_authors_urls = [bookmark['url'] for bookmark in liked_authors_folder['children'] if
                                   bookmark['type'] == 'url']
+            logger.info("Here is the 'Liked Authors' folder's urls: {}".format(liked_authors_urls))

             urls = rule['urls']

@@ -168,6 +175,9 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:


 def is_expected_search_query(active_tab_info: Dict[str, str], rules: Dict[str, Any]) -> float:
+    if not active_tab_info:
+        return 0.
+
     expected = rules['expect']
     pattern = expected['pattern']
     matched = re.search(pattern, active_tab_info['url'])

@@ -396,7 +396,10 @@ def check_structure_sim_resized(src_path, tgt_path):

     # Check if the structure is similar
     structure_same = structure_check_by_ssim(img_src_resized, img_tgt)
-    return structure_same
+    if structure_same:
+        return 1.
+    else:
+        return 0.


 def check_contrast_increase_and_structure_sim(src_path, tgt_path):
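
The change above normalizes the SSIM check's bool into the float score convention the evaluator functions use, so that multi-check aggregation can combine results numerically. A minimal sketch of that convention (the helper name is illustrative, not from the repository):

    # Hypothetical helper showing the score convention evaluator funcs follow.
    def as_score(passed: bool) -> float:
        return 1. if passed else 0.
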
@@ -463,23 +463,60 @@ def compare_table(result: str, expected: str = None, **options) -> float:
 # }}} function compare_table #


-def compare_csv(result: str, expected: str, **options) -> float:
+def compare_csv(result: str, expected: Union[str, List[str]], **options) -> float:
+    """
+    Compare CSV files. If expected is a list, returns 1.0 if result matches any of the expected files.
+
+    Args:
+        result: Path to result CSV file
+        expected: Path to expected CSV file or list of paths to expected CSV files
+        options: Additional options (strict, ignore_case)
+
+    Returns:
+        1.0 if result matches expected (or any file in expected list), 0.0 otherwise
+    """
     if result is None:
         return 0.

-    with open(result) as f:
-        result_lines: Iterable[str] = f.read().splitlines()
-    with open(expected) as f:
-        expected_lines: Iterable[str] = f.read().splitlines()
-    if not options.get("strict", True):
-        result_lines = map(str.strip, result_lines)
-        expected_lines = map(str.strip, expected_lines)
-    if options.get("ignore_case", False):
-        result_lines = map(str.lower, result_lines)
-        expected_lines = map(str.lower, expected_lines)
+    try:
+        with open(result) as f:
+            result_lines: Iterable[str] = f.read().splitlines()
+    except (FileNotFoundError, IOError):
+        return 0.

-    metric: bool = list(result_lines) == list(expected_lines)
-    return float(metric)
+    # Convert expected to list if it's a single string (for backward compatibility)
+    if isinstance(expected, str):
+        expected_files = [expected]
+    else:
+        expected_files = expected
+
+    # Try to match against each expected file
+    for expected_file in expected_files:
+        try:
+            with open(expected_file) as f:
+                expected_lines: Iterable[str] = f.read().splitlines()
+
+            # Process lines based on options
+            current_result_lines = result_lines
+            current_expected_lines = expected_lines
+
+            if not options.get("strict", True):
+                current_result_lines = map(str.strip, current_result_lines)
+                current_expected_lines = map(str.strip, current_expected_lines)
+            if options.get("ignore_case", False):
+                current_result_lines = map(str.lower, current_result_lines)
+                current_expected_lines = map(str.lower, current_expected_lines)
+
+            # Check if this expected file matches
+            if list(current_result_lines) == list(current_expected_lines):
+                return 1.0
+
+        except (FileNotFoundError, IOError):
+            # If this expected file doesn't exist, continue to next one
+            continue
+
+    # No match found
+    return 0.0


 def compare_conference_city_in_order(actual_city_list_path, expected_city):
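
The widened signature stays backward compatible: a single path behaves exactly as before, while a list passes if any gold file matches. A usage sketch (the file names are hypothetical):

    # Old call style still works:
    score = compare_csv("output.csv", "output_gold.csv")

    # New call style: passes if the result matches any of the gold files.
    score = compare_csv(
        "output.csv",
        ["output_gold.csv", "output_gold2.csv", "output_gold3.csv"],
        ignore_case=True,
    )
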
@@ -1,7 +1,7 @@
 {
     "id": "2373b66a-092d-44cb-bfd7-82e86e7a3b4d",
     "snapshot": "multiapps",
-    "instruction": "I want to understand the resource usage of my Ubuntu system under normal workloads. Please use the `sar` command in the `sysstat` toolkit to monitor system activity, evaluate the status once every second for 30 seconds, output the results to \"System_Resources_Report.txt\" under Desktop.",
+    "instruction": "Monitor Ubuntu system resource usage using the sar command from sysstat toolkit. Collect CPU statistics every second for 30 seconds and save the output to 'System_Resources_Report.txt' on Desktop.",
     "source": "author",
     "config": [
         {
@@ -37,6 +37,7 @@
                 "check_include_exclude",
+                "compare_csv"
             ],
             "conj": "and",
             "result": [
                 {
                     "type": "vm_command_line",
@@ -63,8 +64,18 @@
         },
         {
             "type": "cloud_file",
-            "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv",
-            "dest": "output_gold.csv"
+            "multi": true,
+            "path": [
+                "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv",
+                "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold2.csv",
+                "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold3.csv"
+            ],
+            "dest": [
+                "output_gold.csv",
+                "output_gold2.csv",
+                "output_gold3.csv"
+            ],
+            "gives": [0, 1, 2]
         }
     ]
 },
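
The semantics assumed by the new "multi" entry: each URL in "path" is fetched to the matching name in "dest", and "gives" selects which of the downloaded files are handed to the paired evaluator function as its expected inputs. A sketch of that assumption (the fetch helper is hypothetical):

    # Assumed handling of a "multi" cloud_file entry (fetch_to is a
    # hypothetical helper, not an API from the repository).
    for url, dest in zip(entry["path"], entry["dest"]):
        fetch_to(url, dest)                                   # download each gold file
    expected = [entry["dest"][i] for i in entry["gives"]]     # -> all three gold CSVs

Together with the compare_csv change above, this lets one task accept any of several gold CSVs.
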
@@ -144,17 +144,46 @@
         "os"
     ],
     "evaluator": {
-        "func": "compare_epub",
-        "result": {
-            "type": "vm_file",
-            "dest": "Pass Through.epub",
-            "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub"
-        },
-        "expected": {
-            "type": "cloud_file",
-            "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
-            "dest": "Pass Through Gold.epub"
-        }
+        "func": [
+            "compare_epub",
+            "compare_epub",
+            "compare_epub"
+        ],
+        "conj": "or",
+        "result": [
+            {
+                "type": "vm_file",
+                "dest": "Pass Through.epub",
+                "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub"
+            },
+            {
+                "type": "vm_file",
+                "dest": "Pass Through.epub",
+                "path": "/home/user/Documents/Novels/Pass Through/Pass_Through.epub"
+            },
+            {
+                "type": "vm_file",
+                "dest": "Pass Through.epub",
+                "path": "/home/user/Documents/Novels/Pass Through/pass_through.epub"
+            }
+        ],
+        "expected": [
+            {
+                "type": "cloud_file",
+                "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
+                "dest": "Pass Through Gold.epub"
+            },
+            {
+                "type": "cloud_file",
+                "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
+                "dest": "Pass Through Gold.epub"
+            },
+            {
+                "type": "cloud_file",
+                "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub",
+                "dest": "Pass Through Gold.epub"
+            }
+        ]
     },
     "proxy": true
 }
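
The single check becomes three checks joined by "conj": "or", one per plausible output filename ("Pass Through.epub", "Pass_Through.epub", "pass_through.epub"). Assuming the usual aggregation semantics for this evaluator framework (an assumption, not confirmed by this diff), "or" keeps the best score across the three compare_epub runs; a sketch with illustrative names:

    # Assumed "conj" aggregation: "or" takes the best score, "and" the worst.
    scores = [compare_epub(result, expected)
              for result, expected in zip(results, expecteds)]
    final_score = max(scores) if conj == "or" else min(scores)
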
@@ -83,21 +83,27 @@
         "urls": [
             [
                 "https://jimfan.me/",
-                "https://research.nvidia.com/person/linxi-jim-fan"
+                "https://research.nvidia.com/person/linxi-jim-fan",
+                "https://www.linkedin.com/in/drjimfan/"
             ],
             [
                 "https://research.nvidia.com/person/de-an-huang",
-                "https://ai.stanford.edu/~dahuang/"
+                "https://ai.stanford.edu/~dahuang/",
+                "https://www.linkedin.com/in/de-an-huang-38242a69"
             ],
             [
                 "https://yukezhu.me/",
                 "https://www.cs.utexas.edu/people/faculty-researchers/yuke-zhu",
                 "https://experts.utexas.edu/yuke_zhu",
-                "https://research.nvidia.com/person/yuke-zhu"
+                "https://research.nvidia.com/person/yuke-zhu",
+                "https://www.linkedin.com/in/yukez/"
            ],
            [
                 "https://tensorlab.cms.caltech.edu/users/anima/",
                 "http://tensorlab.cms.caltech.edu/users/anima/",
-                "https://www.eas.caltech.edu/people/anima"
+                "https://www.eas.caltech.edu/people/anima",
+                "https://en.wikipedia.org/wiki/Anima_Anandkumar",
+                "https://www.linkedin.com/in/anima-anandkumar/"
             ]
         ]
     }
@@ -11,10 +11,6 @@
         {
             "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character.png",
             "path": "/home/user/Desktop/character.png"
-        },
-        {
-            "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png",
-            "path": "/home/user/Desktop/character_no_background_gold.png"
-        }
+        }
     ]
 }
@@ -36,8 +32,8 @@
     ],
     "evaluator": {
         "func": [
-            "check_structure_sim_resized",
-            "check_structure_sim_resized"
+            "check_structure_sim",
+            "check_structure_sim"
         ],
         "result": [
             {
@@ -53,13 +49,13 @@
         ],
         "expected": [
             {
-                "type": "vm_file",
-                "path": "/home/user/Desktop/character_no_background_gold.png",
+                "type": "cloud_file",
+                "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png",
                 "dest": "character_no_background_gold.png"
             },
             {
-                "type": "vm_file",
-                "path": "/home/user/Desktop/character_no_background_gold.png",
+                "type": "cloud_file",
+                "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png",
                 "dest": "character_no_background_gold.png"
             }
         ]
@@ -65,7 +65,7 @@
             "type": "rule",
             "rules": {
                 "expect": {
-                    "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)"
+                    "pattern": "(?i)https?://(?:www\\.)?google\\.com/search\\?q=nereida(?:&|$|#).*"
                 }
             }
         }
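
The new pattern adds case-insensitive matching and accepts a trailing fragment after the query term. A quick check of what it accepts that the old one rejected (the sample URL is illustrative):

    import re

    old = r"https?://(www\.?)?google\.com/search\?q=nereida(&|$)"
    new = r"(?i)https?://(?:www\.)?google\.com/search\?q=nereida(?:&|$|#).*"

    url = "https://www.google.com/search?q=Nereida#top"
    print(bool(re.search(old, url)))  # False: case-sensitive, no '#' alternative
    print(bool(re.search(new, url)))  # True: (?i) plus the '#' terminator
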
@@ -1,50 +0,0 @@
-{
-    "chrome": [
-        "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
-        "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
-        "06fe7178-4491-4589-810f-2e2bc9502122",
-        "e1e75309-3ddb-4d09-92ec-de869c928143",
-        "35253b65-1c19-4304-8aa4-6884b8218fc0",
-        "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
-        "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
-        "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
-        "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3",
-        "480bcfea-d68f-4aaa-a0a9-2589ef319381",
-        "af630914-714e-4a24-a7bb-f9af687d3b91",
-        "3720f614-37fd-4d04-8a6b-76f54f8c222d",
-        "99146c54-4f37-4ab8-9327-5f3291665e1e",
-        "12086550-11c0-466b-b367-1d9e75b3910e",
-        "6766f2b8-8a72-417f-a9e5-56fcaa735837",
-        "93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9",
-        "ae78f875-5b98-4907-bbb5-9c737fc68c03",
-        "3299584d-8f11-4457-bf4c-ce98f7600250",
-        "030eeff7-b492-4218-b312-701ec99ee0cc",
-        "9656a811-9b5b-4ddf-99c7-5117bcef0626",
-        "fc6d8143-9452-4171-9459-7f515143419a",
-        "a96b564e-dbe9-42c3-9ccf-b4498073438a",
-        "1704f00f-79e6-43a7-961b-cedd3724d5fd",
-        "f3b19d1e-2d48-44e9-b4e1-defcae1a0197",
-        "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a",
-        "47543840-672a-467d-80df-8f7c3b9788c9",
-        "c1fa57f3-c3db-4596-8f09-020701085416",
-        "da46d875-6b82-4681-9284-653b0c7ae241",
-        "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc",
-        "f79439ad-3ee8-4f99-a518-0eb60e5652b0",
-        "b7895e80-f4d1-4648-bee0-4eb45a6f1fa8",
-        "9f3f70fc-5afc-4958-a7b7-3bb4fcb01805",
-        "7f52cab9-535c-4835-ac8c-391ee64dc930",
-        "82279c77-8fc6-46f6-9622-3ba96f61b477",
-        "2888b4e6-5b47-4b57-8bf5-c73827890774",
-        "b4f95342-463e-4179-8c3f-193cd7241fb2",
-        "f5d96daf-83a8-4c86-9686-bada31fc66ab",
-        "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba",
-        "368d9ba4-203c-40c1-9fa3-da2f1430ce63",
-        "59155008-fe71-45ec-8a8f-dc35497b6aa8",
-        "a728a36e-8bf1-4bb6-9a03-ef039a5233f0",
-        "b070486d-e161-459b-aa2b-ef442d973b92",
-        "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217",
-        "9f935cce-0a9f-435f-8007-817732bfc0a5",
-        "f0b971a1-6831-4b9b-a50e-22a6e47f45ba",
-        "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825"
-    ]
-}
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.

 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_fix_chrome.json
+TASK_CONFIG_PATH=../evaluation_examples/test_all.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_chrome_operator
+RESULTS_BASE_PATH=../results_all
 ACTION_SPACE=pyautogui
 OBSERVATION_TYPE=screenshot
 MODEL_NAME=computer-use-preview
@@ -1,9 +0,0 @@
-python manual_examine.py \
-    --headless \
-    --observation_type screenshot \
-    --result_dir ./results_human_examine_chrome_fix_1 \
-    --test_all_meta_path evaluation_examples/test_fix_chrome.json \
-    --region us-east-1 \
-    --domain chrome \
-    --example_id 030eeff7-b492-4218-b312-701ec99ee0cc \
-    --max_steps 3
@@ -1,9 +0,0 @@
-python run_multienv_openaicua.py \
-    --headless \
-    --observation_type screenshot \
-    --model computer-use-preview \
-    --result_dir ./results_multiapps_operator \
-    --test_all_meta_path evaluation_examples/test_multiapps.json \
-    --region us-east-1 \
-    --max_steps 150 \
-    --num_envs 5