From 81863b26ddf4473eca819fdd331bdb587e6a1056 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Fri, 23 Feb 2024 11:57:50 +0800 Subject: [PATCH] Improve on eval script on web browsing tasks; Add one setup example --- desktop_env/envs/desktop_env.py | 3 + desktop_env/evaluators/metrics/chrome.py | 2 +- desktop_env/evaluators/metrics/utils.py | 3 + .../a96b564e-dbe9-42c3-9ccf-b4498073438a.json | 105 +++++++++++------- 4 files changed, 72 insertions(+), 41 deletions(-) diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index fbfb739..2454ce6 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -316,6 +316,9 @@ class DesktopEnv(gym.Env): return 1 else: return 0 + else: + if len(self.action_history) > 0 and self.action_history[-1] == "FAIL": + return 0 if type(self.metric) == list: results = [] diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index 69d3dcd..0517c1d 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -17,7 +17,7 @@ def is_expected_active_tab(active_tab_info: Dict[str, str], rule: Dict[str, Any] if match_type == "url": expected_url = rule['url'] - actual_url = active_tab_info['url'] + actual_url = active_tab_info.get('url', None) print("expected_url: {}".format(expected_url)) print("actual_url: {}".format(actual_url)) return 1 if compare_urls(expected_url, actual_url) else 0 diff --git a/desktop_env/evaluators/metrics/utils.py b/desktop_env/evaluators/metrics/utils.py index 0747726..00deb09 100644 --- a/desktop_env/evaluators/metrics/utils.py +++ b/desktop_env/evaluators/metrics/utils.py @@ -625,6 +625,9 @@ def are_lists_equal(list1, list2, comparison_func): def compare_urls(url1, url2): + if url1 is None or url2 is None: + return url1 == url2 + def normalize_url(url): # Parse the URL parsed_url = urlparse(url) diff --git a/evaluation_examples/examples/chrome/a96b564e-dbe9-42c3-9ccf-b4498073438a.json b/evaluation_examples/examples/chrome/a96b564e-dbe9-42c3-9ccf-b4498073438a.json index d0a46b2..b643e8a 100644 --- a/evaluation_examples/examples/chrome/a96b564e-dbe9-42c3-9ccf-b4498073438a.json +++ b/evaluation_examples/examples/chrome/a96b564e-dbe9-42c3-9ccf-b4498073438a.json @@ -1,45 +1,70 @@ { - "id": "a96b564e-dbe9-42c3-9ccf-b4498073438a", - "snapshot": "chrome", - "instruction": "Find discussions of community and open one with most replies.", - "source": "test_task_0", - "config": [ - { - "type": "launch", - "parameters": { - "command": [ - "google-chrome", - "--remote-debugging-port=1337" - ] - } - }, - { - "type": "launch", - "parameters": { - "command": [ - "socat", - "tcp-listen:9222,fork", - "tcp:localhost:1337" - ] - } + "id": "a96b564e-dbe9-42c3-9ccf-b4498073438a", + "snapshot": "chrome", + "instruction": "Find discussions of community and open one with most replies.", + "change_possibility": "low", + "source": "test_task_0", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "google-chrome", + "--remote-debugging-port=1337" + ] + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "socat", + "tcp-listen:9222,fork", + "tcp:localhost:1337" + ] + } + }, + { + "type": "chrome_open_tabs", + "parameters": { + "urls_to_open": [ + "https://www.flightaware.com/" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Google Chrome" + } + }, + { + "type": "execute", + "parameters": { + "command": [ + "python", + "-c", + "import pyautogui; import time; pyautogui.hotkey('alt', 'f10'); time.sleep(0.5);" + ] + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "func": "is_expected_active_tab", + "result": { + "type": "active_tab_info" + }, + "expected": { + "type": "rule", + "rules": { + "type": "url", + "url": "https://discussions.flightaware.com/t/the-banter-thread/4412" } - ], - "trajectory": "trajectories/", - "related_apps": [ - "chrome" - ], - "evaluator": { - "func":"is_expected_active_tab", - "result": { - "type": "active_tab_info" - }, - "expected":{ - "type": "rule", - "rules":{ - "type": "url", - "url": "https://discussions.flightaware.com/t/graphs-for-dump1090-my-version-with-install-script/46263" - } - } } } +} \ No newline at end of file