From eeae1442cd209cb7fb7937fd94d6c1daf873346c Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Mon, 18 Mar 2024 20:42:57 +0800 Subject: [PATCH 1/7] Add execute timeout to server; Fix error examples --- desktop_env/evaluators/metrics/vscode.py | 6 ++++++ desktop_env/server/main.py | 2 +- .../ac9bb6cb-1888-43ab-81e4-a98a547918cd.json | 6 ++++++ .../4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json | 4 ++-- .../examples/multi_apps/demo.py | 19 ------------------- 5 files changed, 15 insertions(+), 22 deletions(-) delete mode 100644 evaluation_examples/examples/multi_apps/demo.py diff --git a/desktop_env/evaluators/metrics/vscode.py b/desktop_env/evaluators/metrics/vscode.py index 61976f1..d207aae 100644 --- a/desktop_env/evaluators/metrics/vscode.py +++ b/desktop_env/evaluators/metrics/vscode.py @@ -236,6 +236,9 @@ def check_html_background_image(src_path: str, rule: Dict = None) -> float: Check if the background image is correctly set. multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108 """ + if not src_path: + return 0.0 + from bs4 import BeautifulSoup with open(src_path, 'r') as f: html_content = f.read() @@ -252,6 +255,9 @@ def compare_result_files(src_path, tgt_path): Compare whether the content of two files are the same. multi-app:7f35355e-02a6-45b5-b140-f0be698bcf85 """ + if not src_path or not tgt_path: + return 0.0 + with open(src_path, 'r') as f: src_content = f.read().strip() with open(tgt_path, 'r') as f: diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index efa62c7..d53232e 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -63,7 +63,7 @@ def execute_command(): # Execute the command without any safety checks. try: - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True) + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True, timeout=120) return jsonify({ 'status': 'success', 'output': result.stdout, diff --git a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json index c0d6ba0..053421c 100644 --- a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json +++ b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json @@ -63,6 +63,12 @@ "type": "vm_file", "path": "/home/user/Desktop/saa-format-guide.pptx", "dest": "saa-format-guide.pptx" + }, + "expected": { + "type": "rule", + "rules": { + "color": "red" + } } } } diff --git a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json index 0a70b11..447a862 100644 --- a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json +++ b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json @@ -30,12 +30,12 @@ ], "evaluator": { "func": "check_brightness_decrease_and_structure_sim", - "expected": { + "result": { "type": "vm_file", "path": "/home/user/Desktop/background.png", "dest": "background.png" }, - "result": { + "expected": { "type": "cloud_file", "path": "https://drive.usercontent.google.com/download?id=13if1UwZ5ay6ADAVW2jp3rcyvAEBse6MJ&export=download&authuser=0&confirm=t&uuid=2ea03068-1874-4240-baa1-f8bb2f917a99&at=APZUnTXq6dVlASg819jCaI1A-rm2:1710136385956", "dest": "image_original.png" diff --git a/evaluation_examples/examples/multi_apps/demo.py b/evaluation_examples/examples/multi_apps/demo.py deleted file mode 100644 index ffa2b85..0000000 --- a/evaluation_examples/examples/multi_apps/demo.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd - -file_path = "/Users/lxc/Downloads/Speedtest.csv" -# 找到csv第二行的第二个数据格里的值 -# with open(file_path, "r") as f: -# for i, line in enumerate(f): -# if i == 1: -# data = line.split(",")[1] -# break -# print(data) - -with open(file_path, "r") as f: - reader = pd.read_csv(f, sep=',', header=None) - # for column in reader.columns: - # if column.startswith("TEST_DATE"): - # data_col = column - # break - for data in reader['TEST_DATE']: - print(data) \ No newline at end of file From 1c9c5fd2ad8fdbe5d124b2a12818ff8770ef0cb4 Mon Sep 17 00:00:00 2001 From: rhythmcao Date: Mon, 18 Mar 2024 20:51:53 +0800 Subject: [PATCH 2/7] fix multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json missing file problems: who delete it on googledrive??? --- .../multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json b/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json index 3d32ee5..0f1c8ac 100644 --- a/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json +++ b/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive.usercontent.google.com/download?id=1e12nL_V7bffaLSocQ86EiGCdygzggWeu&export=download", + "url": "https://drive.usercontent.google.com/download?id=1epTcblcYh8j_wFtA-aiXPIF2Oo1IVw8A&export=download", "path": "/home/user/Desktop/Dickinson_Slides.pptx" } ] @@ -36,7 +36,7 @@ }, "expected": { "type": "cloud_file", - "path": "https://drive.usercontent.google.com/download?id=1Xl6tgQ0K5qA1BDA2fKTK2xFLzXwbtkZ6&export=download", + "path": "https://drive.usercontent.google.com/download?id=1vUvaQLJUtFgbZi7lSzl0y0TS_WecFczm&export=download", "dest": "notes_gold.docx" }, "options": { From f992d1f694540b9f2ccdc8f5adcf21262803a194 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Mon, 18 Mar 2024 21:43:35 +0800 Subject: [PATCH 3/7] Disable a11y tree temporarily --- lib_run_single.py | 18 +++++++++--------- run.py | 25 +++++++++++++------------ 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/lib_run_single.py b/lib_run_single.py index d60fd7a..82b2dd3 100644 --- a/lib_run_single.py +++ b/lib_run_single.py @@ -2,7 +2,7 @@ import datetime import json import logging import os -import wandb +# import wandb from wrapt_timeout_decorator import * @@ -15,13 +15,13 @@ with open("./settings.json", "r") as file: time_limit = data["time_limit"] @timeout(time_limit, use_signals=False) -def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores, run): +def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores): agent.reset() obs = env.reset(task_config=example) done = False step_idx = 0 env.controller.start_recording() - str_table = wandb.Table(columns=["Screenshot", "A11T", "Modle Response", "Action", "Action timestamp", "Done"]) + # str_table = wandb.Table(columns=["Screenshot", "A11T", "Modle Response", "Action", "Action timestamp", "Done"]) while not done and step_idx < max_steps: response, actions = agent.predict( instruction, @@ -43,10 +43,10 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl _f.write(screenshot) # get a11tree and save to wandb thisrun_a11tree = env.controller.get_accessibility_tree() - str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"), - thisrun_a11tree, - response, action, action_timestamp, done) - run.log({"Reward": reward}) + # str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"), + # thisrun_a11tree, + # response, action, action_timestamp, done) + # run.log({"Reward": reward}) with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: f.write(json.dumps({ "step_num": step_idx + 1, @@ -62,11 +62,11 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl logger.info("The episode is done.") break step_idx += 1 - run.log({"str_trajectory": str_table}) + # run.log({"str_trajectory": str_table}) result = env.evaluate() logger.info("Result: %.2f", result) scores.append(result) with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f: f.write(f"{result}\n") env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) - run.log({"Result": result}) + # run.log({"Result": result}) diff --git a/run.py b/run.py index 5212bc0..92e989a 100644 --- a/run.py +++ b/run.py @@ -8,7 +8,7 @@ import logging import os import random import sys -import wandb +# import wandb from tqdm import tqdm @@ -52,7 +52,8 @@ logger = logging.getLogger("desktopenv.experiment") # wandb config ### set your wandb api key here -wandb.login(key=os.environ.get("WANDB_API_KEY", None)) +# os.environ["WANDB_API_KEY"] = "48ec18fb4da7087238c6d6833eab9907565adbf3" +# wandb.login(key=os.environ.get("WANDB_API_KEY", None)) def config() -> argparse.Namespace: @@ -147,8 +148,8 @@ def test( for domain in tqdm(test_all_meta, desc="Domain"): for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False): - run = wandb.init(project=f"OSworld-{args.action_space}-{args.observation_type}-{args.model}", group=f"{domain}", - name=f"{example_id}") + # run = wandb.init(project=f"OSworld-{args.action_space}-{args.observation_type}-{args.model}", group=f"{domain}", + # name=f"{example_id}") # example setting config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json") with open(config_file, "r", encoding="utf-8") as f: @@ -163,7 +164,7 @@ def test( # wandb each example config settings cfg_args["instruction"] = instruction cfg_args["start_time"] = datetime.datetime.now().strftime("%Y:%m:%d-%H:%M:%S") - run.config.update(cfg_args) + # run.config.update(cfg_args) example_result_dir = os.path.join( args.result_dir, @@ -177,10 +178,10 @@ def test( # example start running try: lib_run_single.run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, - scores, run) + scores) except Exception as e: logger.error(f"Exception in {domain}/{example_id}: {e}") - wandb.log({"Exception": wandb.Table(data=[[f"Exception in {domain}/{example_id}: {e}"]], columns=["Error"])}) + # wandb.log({"Exception": wandb.Table(data=[[f"Exception in {domain}/{example_id}: {e}"]], columns=["Error"])}) env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: f.write(json.dumps({ @@ -188,11 +189,11 @@ def test( })) f.write("\n") # wandb settings - os.mkdir(os.path.join(wandb.run.dir, "results/")) - for file in os.listdir(example_result_dir): - # move file to just under the root dir - os.rename(os.path.join(example_result_dir, file), os.path.join(wandb.run.dir, f"./results/{file}")) - wandb.finish() + # os.mkdir(os.path.join(wandb.run.dir, "results/")) + # for file in os.listdir(example_result_dir): + # # move file to just under the root dir + # os.rename(os.path.join(example_result_dir, file), os.path.join(wandb.run.dir, f"./results/{file}")) + # wandb.finish() env.close() logger.info(f"Average score: {sum(scores) / len(scores)}") From 866ac3fbd9c6ce9255c503a9a1f52e7023511f5c Mon Sep 17 00:00:00 2001 From: Fangyu Lei <55661995+lfy79001@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:43:59 +0800 Subject: [PATCH 4/7] Update requirements.txt add wandb and wrapt_timeout_decorator --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2c595b9..9faae48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,4 +48,5 @@ easyocr borb pypdf2 pdfplumber - +wandb +wrapt_timeout_decorator From 4671455b567c438522704d9f09ae74213dd73789 Mon Sep 17 00:00:00 2001 From: BlankCheng <913501223@qq.com> Date: Mon, 18 Mar 2024 22:16:04 +0800 Subject: [PATCH 5/7] Fix eval func --- desktop_env/evaluators/metrics/gimp.py | 16 ++++++++++++---- .../d16c99dc-2a1e-46f2-b350-d97c86c85c15.json | 16 ++++++++-------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py index e46f5d5..c1208af 100644 --- a/desktop_env/evaluators/metrics/gimp.py +++ b/desktop_env/evaluators/metrics/gimp.py @@ -199,7 +199,7 @@ def structure_check_by_mse(img1, img2, threshold=0.03): def structure_check_by_ssim(img1, img2, threshold=0.9): """Check if two images are approximately the same by SSIM""" - similarity = ssim(np.array(img1), np.array(img2), multichannel=True) + similarity = ssim(np.array(img1), np.array(img2), multichannel=True, channel_axis=-1) print("SSIM: ", similarity) return similarity >= threshold @@ -430,11 +430,11 @@ def check_image_size(src_path, rule): img = Image.open(src_path) # Check the size - if rule["height"] is not None: + if rule.get("height", None) is not None: height_same = img.size[1] == rule["height"] else: height_same = True - if rule["width"] is not None: + if rule.get("width", None) is not None: width_same = img.size[0] == rule["width"] else: width_same = True @@ -607,4 +607,12 @@ if __name__ == "__main__": rule = { "max_size": 500000 } - print(check_image_file_size(src_path, rule)) \ No newline at end of file + print(check_image_file_size(src_path, rule)) + + src_path = "../../../cache/d16c99dc-2a1e-46f2-b350-d97c86c85c15/resized.png" + tgt_path = "../../../cache/d16c99dc-2a1e-46f2-b350-d97c86c85c15/dog_with_background.png" + rule = { + "height": 512 + } + print(check_image_size(src_path, rule)) + print(check_structure_sim_resized(src_path, tgt_path)) \ No newline at end of file diff --git a/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json b/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json index 3029c0c..ca22630 100644 --- a/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json +++ b/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json @@ -86,13 +86,14 @@ ], "func": [ "check_image_size", - "check_structure_sim" + "check_structure_sim_resized" ], "expected": [ { - "type": "vm_file", - "path": "/home/user/Desktop/dog_with_background.png", - "dest": "dog_with_background.png" + "type": "rule", + "rules": { + "height": 512 + } }, { "type": "vm_file", @@ -102,10 +103,9 @@ ], "result": [ { - "type": "rule", - "rules": { - "height": 512 - } + "type": "vm_file", + "path": "/home/user/Desktop/dog_with_background.png", + "dest": "dog_with_background.png" }, { "type": "vm_file", From 8e760fd45045d9556a9a46fafcf2b995e5ff8006 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 19 Mar 2024 08:57:05 +0800 Subject: [PATCH 6/7] Disable wandb temporarily, speedup the environment step speed by remove useless a11y tree re-get and terminal output --- desktop_env/envs/desktop_env.py | 2 +- lib_run_single.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index fee3f37..7dd70b6 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -285,7 +285,7 @@ class DesktopEnv(gym.Env): observation = { "screenshot": self._get_obs(), "accessibility_tree": self.controller.get_accessibility_tree(), - "terminal": self.controller.get_terminal_output(), + # "terminal": self.controller.get_terminal_output(), "instruction": self.instruction } diff --git a/lib_run_single.py b/lib_run_single.py index 82b2dd3..daa374e 100644 --- a/lib_run_single.py +++ b/lib_run_single.py @@ -42,7 +42,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl screenshot = __f.read() _f.write(screenshot) # get a11tree and save to wandb - thisrun_a11tree = env.controller.get_accessibility_tree() + # thisrun_a11tree = env.controller.get_accessibility_tree() # str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"), # thisrun_a11tree, # response, action, action_timestamp, done) From 41db4b44e78596351e99fb76f0af17fdf1abd9ed Mon Sep 17 00:00:00 2001 From: Fangyu Lei <55661995+lfy79001@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:06:33 +0800 Subject: [PATCH 7/7] Update agent.py mixtral --- mm_agents/agent.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/mm_agents/agent.py b/mm_agents/agent.py index ff92673..c769827 100644 --- a/mm_agents/agent.py +++ b/mm_agents/agent.py @@ -568,7 +568,7 @@ class PromptAgent: top_p = payload["top_p"] temperature = payload["temperature"] - misrtal_messages = [] + mistral_messages = [] for i, message in enumerate(messages): mistral_message = { @@ -579,13 +579,8 @@ class PromptAgent: for part in message["content"]: mistral_message['content'] = part['text'] if part['type'] == "text" else "" - misrtal_messages.append(mistral_message) + mistral_messages.append(mistral_message) - # openai.api_base = "http://localhost:8000/v1" - # response = openai.ChatCompletion.create( - # messages=misrtal_messages, - # model="Mixtral-8x7B-Instruct-v0.1" - # ) from openai import OpenAI @@ -593,12 +588,23 @@ class PromptAgent: base_url='https://api.together.xyz', ) logger.info("Generating content with Mistral model: %s", self.model) - - response = client.chat.completions.create( - messages=misrtal_messages, - model=self.model, - max_tokens=max_tokens - ) + + flag = 0 + while True: + try: + if flag > 20: break + response = client.chat.completions.create( + messages=mistral_messages, + model=self.model, + max_tokens=max_tokens + ) + break + except: + if flag == 0: + mistral_messages = [mistral_messages[0]] + mistral_messages[-1:] + else: + mistral_messages[-1]["content"] = ' '.join(mistral_messages[-1]["content"].split()[:-500]) + flag = flag + 1 try: return response.choices[0].message.content