From 0c9c2f214aea805136a4ef4e30452a0b456a98ae Mon Sep 17 00:00:00 2001
From: David Chang
Date: Mon, 11 Mar 2024 22:45:16 +0800
Subject: [PATCH 1/5] ver Mar11thv2 minor adjustment

---
 at_processing/filter.anal                          |  4 +++-
 .../accessibility_tree_wrap/heuristic_retrieve.py  | 12 ++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/at_processing/filter.anal b/at_processing/filter.anal
index 98fc1fe..9e3ba1d 100644
--- a/at_processing/filter.anal
+++ b/at_processing/filter.anal
@@ -231,6 +231,8 @@ worker-window 1
 xldesk 1
 xlmain 1
 
+TODO: For Windows elements, filtering by the win:class attribute might be closer to UFO's approach.
+
 |3. Filtered attributes
 
 Element attributes that UFO filters on:
@@ -238,7 +240,7 @@ P: is_visible, is_enabled, title_list, class_name_list
 
 For Ubuntu, following the same recipe, filter on:
 
 P: visible & showing
-P: enabled
+P: enabled | editable | expandable | checkable
 P: name
 P: text
diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
index 337b402..34a1d76 100644
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -55,12 +55,12 @@ def judge_node(node: ET, platform="ubuntu") -> bool:
            or platform=="windows"\
           and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
            )\
-      and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
-         or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
-         or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
-         or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
-          )\
-      and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)
+       and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
+          or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
+          or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
+          or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
+           )\
+       and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)
 
     coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
     sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))

From 3a0ab526c8f981066b96d3825ed5e1900a38162d Mon Sep 17 00:00:00 2001
From: David Chang
Date: Wed, 13 Mar 2024 12:25:46 +0800
Subject: [PATCH 2/5] ver Mar13th service file for server/main.py

---
 desktop_env/server/osbench_server.service | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 desktop_env/server/osbench_server.service

diff --git a/desktop_env/server/osbench_server.service b/desktop_env/server/osbench_server.service
new file mode 100644
index 0000000..d0fa216
--- /dev/null
+++ b/desktop_env/server/osbench_server.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=OSBench Server
+StartLimitIntervalSec=60
+StartLimitBurst=4
+After=network.target auditd.service
+
+[Service]
+ExecStart=/usr/bin/python3 /home/user/main.py
+User=user
+WorkingDirectory=/home/user
+Restart=on-failure
+RestartSec=1
+Environment="DISPLAY=:1"
+
+[Install]
+WantedBy=graphical.target
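Review note on PATCH 1/5: judge_node now keeps a node when it is visible, when at
least one of enabled/editable/expandable/checkable is set, and when it carries a
name or non-empty text, mirroring the attribute list in filter.anal. A minimal
standalone sketch of that predicate, not the committed code (it assumes `node` is
an xml.etree.ElementTree element and `state_ns` is the accessibility "state"
namespace URI used by the tree dump, and it ignores the per-platform visible vs.
showing distinction):

    import xml.etree.ElementTree as ET

    def is_interactable(node: ET.Element, state_ns: str) -> bool:
        def state(attr: str) -> bool:
            # States are serialized as "true"/"false" string attributes.
            return node.get("{{{:}}}{:}".format(state_ns, attr), "false") == "true"

        visible = state("visible") and state("showing")
        # PATCH 1/5 widens the old enabled-only test to any of these states:
        actionable = (state("enabled") or state("editable")
                      or state("expandable") or state("checkable"))
        has_label = node.get("name", "") != "" or bool(node.text)
        return visible and actionable and has_label

On PATCH 2/5 above: once the unit file is installed under /etc/systemd/system,
`systemctl enable --now osbench_server.service` should start the server and keep
it enabled at boot; the templated variant added in PATCH 3/5 below binds DISPLAY
to the systemd instance name via the %i specifier, so an instance named ":1"
reproduces DISPLAY=:1.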
From 587a89fa7f8d8237c70f15e539765835eb9307ee Mon Sep 17 00:00:00 2001
From: David Chang
Date: Wed, 13 Mar 2024 19:42:49 +0800
Subject: [PATCH 3/5] ver Mar13thv2 added a new service file with replaceable arguments

---
 desktop_env/server/osbench_server@.service | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 desktop_env/server/osbench_server@.service

diff --git a/desktop_env/server/osbench_server@.service b/desktop_env/server/osbench_server@.service
new file mode 100644
index 0000000..87fc59f
--- /dev/null
+++ b/desktop_env/server/osbench_server@.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=OSBench Server
+StartLimitIntervalSec=60
+StartLimitBurst=4
+After=network.target auditd.service
+
+[Service]
+ExecStart=/usr/bin/python3 /home/user/main.py
+User=user
+WorkingDirectory=/home/user
+Restart=on-failure
+RestartSec=1
+Environment="DISPLAY=%i"
+
+[Install]
+WantedBy=graphical.target

From 2b9772174e34bb510a0a4dce624cab1a92f43659 Mon Sep 17 00:00:00 2001
From: David Chang
Date: Fri, 15 Mar 2024 12:25:41 +0800
Subject: [PATCH 4/5] ver Mar15th fixed bugs about infeasible task evaluation

---
 desktop_env/envs/desktop_env.py                    | 4 ++--
 .../2bd59342-0664-4ccb-ba87-79379096cc08.json      | 8 ++------
 .../7b802dad-6e0f-4204-9815-d4e3f57627d8.json      | 8 ++------
 3 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index 603ed3c..fee3f37 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -174,7 +174,7 @@ class DesktopEnv(gym.Env):
             if isinstance(self.evaluator["func"], list) \
             else getattr(metrics, self.evaluator["func"])
         self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics
-        if "result" in self.evaluator:
+        if "result" in self.evaluator and len(self.evaluator["result"])>0:
             self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in self.evaluator["result"]] \
                 if isinstance(self.evaluator["result"], list) \
@@ -184,7 +184,7 @@ class DesktopEnv(gym.Env):
                 if isinstance(self.metric, list) \
                 else None
 
-        if "expected" in self.evaluator:
+        if "expected" in self.evaluator and len(self.evaluator["expected"])>0:
             self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in self.evaluator["expected"]] \
                 if isinstance(self.evaluator["expected"], list) \
diff --git a/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json b/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json
index aba58cd..d4bbb32 100644
--- a/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json
+++ b/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json
@@ -10,10 +10,6 @@
     "libreoffice_calc"
   ],
   "evaluator": {
-    "func": "infeasible",
-    "expected": {
-    },
-    "result": {
-    }
+    "func": "infeasible"
   }
-}
\ No newline at end of file
+}
diff --git a/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json b/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
index 0ebfeaf..46d6e7c 100644
--- a/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
+++ b/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
@@ -10,10 +10,6 @@
     "libreoffice_calc"
   ],
   "evaluator": {
-    "func": "infeasible",
-    "expected": {
-    },
-    "result": {
-    }
+    "func": "infeasible"
   }
-}
\ No newline at end of file
+}
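Review note on PATCH 4/5: the two infeasible-task JSONs used to ship empty
"result" and "expected" objects, so the old membership tests in DesktopEnv
("result" in self.evaluator) passed even though there was no getter to build,
which broke evaluation. The added len(...) > 0 guards treat an empty dict (or
list) as absent. A small illustration of the guard, using hypothetical minimal
evaluator sections shaped like the task JSONs above:

    old_style = {"func": "infeasible", "result": {}, "expected": {}}
    new_style = {"func": "infeasible"}

    for ev in (old_style, new_style):
        # Old test: "result" in ev -- True for old_style despite the empty dict.
        # The patched test also requires a non-empty value, so both configs now
        # skip result_getter construction, as an infeasible task should.
        needs_result_getter = "result" in ev and len(ev["result"]) > 0
        print(needs_result_getter)  # -> False for both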
From 815c7ab67cf6e1184917110eae6bb0f365d0e5d9 Mon Sep 17 00:00:00 2001
From: Jason Lee
Date: Fri, 15 Mar 2024 16:52:17 +0800
Subject: [PATCH 5/5] filter unfinished examples and add a timer to enforce an
 upper time limit on each example

---
 .vscode/launch.json                              |  19 +++
 demo.py                                          |  16 ++
 evaluation_examples/examples/multi_apps/demo.py  |  19 +++
 mm_agents/agent.py                               |  15 +-
 run.py                                           | 156 +++++++++++-------
 5 files changed, 166 insertions(+), 59 deletions(-)
 create mode 100644 .vscode/launch.json
 create mode 100644 demo.py
 create mode 100644 evaluation_examples/examples/multi_apps/demo.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..bc0f472
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,19 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File with Arguments",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "args": [
+                "--path_to_vm", "/Users/lxc/Virtual Machines.localized/DesktopEnv-Ubuntu 64-bit Arm.vmwarevm/DesktopEnv-Ubuntu 64-bit Arm.vmx",
+                "--example_time_limit", "60"
+            ]
+        }
+    ]
+}
\ No newline at end of file
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000..736adfe
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,16 @@
+import signal
+import time
+
+def handler(signo, frame):
+    raise RuntimeError("Timeout")
+
+signal.signal(signal.SIGALRM, handler)
+
+while True:
+    try:
+        signal.alarm(5)  # seconds
+        time.sleep(10)
+        print("Working...")
+    except Exception as e:
+        print(e)
+        continue
\ No newline at end of file
diff --git a/evaluation_examples/examples/multi_apps/demo.py b/evaluation_examples/examples/multi_apps/demo.py
new file mode 100644
index 0000000..ffa2b85
--- /dev/null
+++ b/evaluation_examples/examples/multi_apps/demo.py
@@ -0,0 +1,19 @@
+import pandas as pd
+
+file_path = "/Users/lxc/Downloads/Speedtest.csv"
+# Find the value in the second data cell of the second row of the CSV.
+# with open(file_path, "r") as f:
+#     for i, line in enumerate(f):
+#         if i == 1:
+#             data = line.split(",")[1]
+#             break
+# print(data)
+
+with open(file_path, "r") as f:
+    reader = pd.read_csv(f, sep=',')
+    # for column in reader.columns:
+    #     if column.startswith("TEST_DATE"):
+    #         data_col = column
+    #         break
+    for data in reader['TEST_DATE']:
+        print(data)
\ No newline at end of file
diff --git a/mm_agents/agent.py b/mm_agents/agent.py
index 85db78b..5229aba 100644
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -5,10 +5,12 @@ import os
 import re
 import time
 import uuid
+import openai
 import xml.etree.ElementTree as ET
 from http import HTTPStatus
 from io import BytesIO
 from typing import Dict, List
+from google.api_core.exceptions import InvalidArgument
 
 import backoff
 import dashscope
@@ -513,7 +515,7 @@ class PromptAgent:
         try:
             actions = self.parse_actions(response, masks)
             self.thoughts.append(response)
-        except Exception as e:
+        except ValueError as e:
             print("Failed to parse action from response", e)
             actions = None
             self.thoughts.append("")
@@ -522,9 +524,16 @@ class PromptAgent:
 
     @backoff.on_exception(
         backoff.expo,
-        (Exception),
+        # Add more model-specific exceptions here as needed, but never the bare
+        # Exception class: the outer loop must be able to catch generic
+        # exceptions itself so that no example exceeds its time limit.
+        (openai.RateLimitError,
+         openai.BadRequestError,
+         openai.InternalServerError,
+         InvalidArgument),
         max_tries=5
     )
+
     def call_llm(self, payload):
 
         if self.model.startswith("gpt"):
@@ -532,7 +541,7 @@ class PromptAgent:
             "Content-Type": "application/json",
             "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
         }
-        logger.info("Generating content with GPT model: %s", self.model)
+        # logger.info("Generating content with GPT model: %s", self.model)
 
         response = requests.post(
             "https://api.openai.com/v1/chat/completions",
             headers=headers,
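Review note on the mm_agents/agent.py hunk above: retrying on the bare Exception
class would also retry the RuntimeError raised by the SIGALRM handler that run.py
installs below, so an example could hang far past its time limit. Restricting
backoff to concrete API errors lets the timeout propagate. A minimal sketch of
that interaction (the Timeout class and flaky_call are hypothetical; backoff is
the same library used in the diff):

    import backoff

    class Timeout(RuntimeError):
        """Stands in for the alarm-handler exception from run.py."""

    @backoff.on_exception(backoff.expo, (ConnectionError,), max_tries=5)
    def flaky_call():
        # A ConnectionError would be retried with exponential backoff;
        # Timeout is not in the tuple, so it propagates immediately --
        # exactly what the per-example time limit needs.
        raise Timeout("Time limit exceeded!")

    try:
        flaky_call()
    except Timeout as e:
        print("aborted without retries:", e)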
"https://api.openai.com/v1/chat/completions", headers=headers, diff --git a/run.py b/run.py index 908d479..16c2bba 100644 --- a/run.py +++ b/run.py @@ -7,6 +7,7 @@ import json import logging import os import sys +import signal from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.agent import PromptAgent @@ -45,6 +46,10 @@ logger.addHandler(sdebug_handler) logger = logging.getLogger("desktopenv.experiment") +# make sure each example won't exceed the time limit +def handler(signo, frame): + raise RuntimeError("Time limit exceeded!") +signal.signal(signal.SIGALRM, handler) def config() -> argparse.Namespace: parser = argparse.ArgumentParser( @@ -77,6 +82,7 @@ def config() -> argparse.Namespace: # agent config parser.add_argument("--max_trajectory_length", type=int, default=3) parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples") + parser.add_argument("--example_time_limit", type=int, default=600) # lm config parser.add_argument("--model", type=str, default="gpt-4-vision-preview") @@ -98,6 +104,7 @@ def test( ) -> None: scores = [] max_steps = args.max_steps + time_limit = args.example_time_limit # log args logger.info("Args: %s", args) @@ -119,6 +126,7 @@ def test( for domain in test_all_meta: for example_id in test_all_meta[domain]: + # example setting config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json") with open(config_file, "r", encoding="utf-8") as f: example = json.load(f) @@ -140,79 +148,115 @@ def test( ) os.makedirs(example_result_dir, exist_ok=True) - agent.reset() - obs = env.reset(task_config=example) - done = False - step_idx = 0 - env.controller.start_recording() + # example start running + try: + signal.alarm(time_limit) + agent.reset() + obs = env.reset(task_config=example) + done = False + step_idx = 0 + env.controller.start_recording() - while not done and step_idx < max_steps: - actions = agent.predict( - instruction, - obs - ) + while not done and step_idx < max_steps: + actions = agent.predict( + instruction, + obs + ) + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_idx + 1, action) - for action in actions: + observation, reward, done, info = env.step(action, args.sleep_after_execution) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) + + # Save screenshot and trajectory information + with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), + "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: + f.write(json.dumps({ + "step_num": step_idx + 1, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break step_idx += 1 - # Capture the timestamp before executing the action - action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - logger.info("Step %d: %s", step_idx + 1, action) - - observation, reward, done, info = env.step(action, args.sleep_after_execution) - - logger.info("Reward: %.2f", reward) - logger.info("Done: %s", done) - logger.info("Info: %s", info) - - # Save screenshot and trajectory information - with 
open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), - "wb") as _f: - with open(observation['screenshot'], "rb") as __f: - screenshot = __f.read() - _f.write(screenshot) - - with open(os.path.join(example_result_dir, "traj.json"), "a") as f: + + result = env.evaluate() + logger.info("Result: %.2f", result) + scores.append(result) + env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) + except RuntimeError as e: + logger.error(f"Error in example {domain}/{example_id}: {e}") + # save info of this example and then continue + try: + env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) + with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: f.write(json.dumps({ - "step_num": step_idx + 1, - "action_timestamp": action_timestamp, - "action": action, - "reward": reward, - "done": done, - "info": info, - "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png" + "Error": f"Error in example {domain}/{example_id}: {e}", + "step": step_idx + 1, })) f.write("\n") - - if done: - logger.info("The episode is done.") - break - - result = env.evaluate() - logger.info("Result: %.2f", result) - scores.append(result) - env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) - + except Exception as new_e: + with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: + f.write(json.dumps({ + "Error": f"Error in example {domain}/{example_id}: {e} and {new_e}", + "step": "before start recording", + })) + f.write("\n") + continue env.close() logger.info(f"Average score: {sum(scores) / len(scores)}") -def get_unfinished(test_file_list, result_dir): - finished = [] - for domain in os.listdir(result_dir): - for example_id in os.listdir(os.path.join(result_dir, domain)): - finished.append(f"{domain}/{example_id}") - return [x for x in test_file_list if x not in finished] +def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json): + target_dir = os.path.join(result_dir, action_space, observation_type, use_model) + + if not os.path.exists(target_dir): + return total_file_json + + finished = {} + for domain in os.listdir(target_dir): + domain_path = os.path.join(target_dir, domain) + if os.path.isdir(domain_path): + finished[domain] = os.listdir(domain_path) + if not finished: + return total_file_json + + for domain, examples in finished.items(): + if domain in total_file_json: + total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples] + + return total_file_json if __name__ == '__main__': ####### The complete version of the list of examples ####### os.environ["TOKENIZERS_PARALLELISM"] = "false" args = config() - # test_file_list = get_unfinished(args.test, args.result_dir) - # logger.info(f"Total {len(test_file_list)} tasks left") - with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f: test_all_meta = json.load(f) - test(args, test_all_meta) + test_file_list = get_unfinished(args.action_space, args.model, args.observation_type, args.result_dir, test_all_meta) + left_info = "" + for domain in test_file_list: + left_info += f"{domain}: {len(test_file_list[domain])}\n" + logger.info(f"Left tasks:\n{left_info}") + + test(args, test_all_meta) \ No newline at end of file
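Review note on get_unfinished: the rewritten helper resumes an interrupted run by
pruning every example whose result directory already exists under
results/<action_space>/<observation_type>/<model>. Note that it prunes
total_file_json in place and also returns it, so passing the returned dict to
test(...) is what keeps the intent explicit. A compact equivalent using pathlib
(a sketch under the same directory-layout assumption, not the committed code):

    from pathlib import Path

    def get_unfinished(action_space, model, observation_type, result_dir, tasks):
        """Drop every example that already has a result directory (sketch)."""
        target = Path(result_dir) / action_space / observation_type / model
        if not target.exists():
            return tasks
        for domain_dir in target.iterdir():
            if domain_dir.is_dir() and domain_dir.name in tasks:
                done = {p.name for p in domain_dir.iterdir()}
                tasks[domain_dir.name] = [x for x in tasks[domain_dir.name]
                                          if x not in done]
        return tasks

One design caveat: a result directory is created before an example finishes, so
a run killed mid-example counts that example as done on resume; treating only
examples with a final result line in traj.jsonl as finished would be stricter.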