From eeae1442cd209cb7fb7937fd94d6c1daf873346c Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Mon, 18 Mar 2024 20:42:57 +0800
Subject: [PATCH 1/7] Add execute timeout to server; Fix error examples

---
 desktop_env/evaluators/metrics/vscode.py      |  6 ++++++
 desktop_env/server/main.py                    |  2 +-
 .../ac9bb6cb-1888-43ab-81e4-a98a547918cd.json |  6 ++++++
 .../4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json |  4 ++--
 .../examples/multi_apps/demo.py               | 19 -------------------
 5 files changed, 15 insertions(+), 22 deletions(-)
 delete mode 100644 evaluation_examples/examples/multi_apps/demo.py

diff --git a/desktop_env/evaluators/metrics/vscode.py b/desktop_env/evaluators/metrics/vscode.py
index 61976f1..d207aae 100644
--- a/desktop_env/evaluators/metrics/vscode.py
+++ b/desktop_env/evaluators/metrics/vscode.py
@@ -236,6 +236,9 @@ def check_html_background_image(src_path: str, rule: Dict = None) -> float:
     Check if the background image is correctly set.
     multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108
     """
+    if not src_path:
+        return 0.0
+
     from bs4 import BeautifulSoup
     with open(src_path, 'r') as f:
         html_content = f.read()
@@ -252,6 +255,9 @@ def compare_result_files(src_path, tgt_path):
     Compare whether the content of two files are the same.
     multi-app:7f35355e-02a6-45b5-b140-f0be698bcf85
     """
+    if not src_path or not tgt_path:
+        return 0.0
+
     with open(src_path, 'r') as f:
         src_content = f.read().strip()
     with open(tgt_path, 'r') as f:
diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py
index efa62c7..d53232e 100644
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -63,7 +63,7 @@ def execute_command():
 
     # Execute the command without any safety checks.
     try:
-        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True)
+        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True, timeout=120)
         return jsonify({
             'status': 'success',
             'output': result.stdout,
diff --git a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json
index c0d6ba0..053421c 100644
--- a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json
+++ b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json
@@ -63,6 +63,12 @@
       "type": "vm_file",
       "path": "/home/user/Desktop/saa-format-guide.pptx",
       "dest": "saa-format-guide.pptx"
+    },
+    "expected": {
+      "type": "rule",
+      "rules": {
+        "color": "red"
+      }
     }
   }
 }
diff --git a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json
index 0a70b11..447a862 100644
--- a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json
+++ b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json
@@ -30,12 +30,12 @@
     ],
     "evaluator": {
         "func": "check_brightness_decrease_and_structure_sim",
-        "expected": {
+        "result": {
             "type": "vm_file",
             "path": "/home/user/Desktop/background.png",
             "dest": "background.png"
         },
-        "result": {
+        "expected": {
             "type": "cloud_file",
             "path": "https://drive.usercontent.google.com/download?id=13if1UwZ5ay6ADAVW2jp3rcyvAEBse6MJ&export=download&authuser=0&confirm=t&uuid=2ea03068-1874-4240-baa1-f8bb2f917a99&at=APZUnTXq6dVlASg819jCaI1A-rm2:1710136385956",
             "dest": "image_original.png"
diff --git a/evaluation_examples/examples/multi_apps/demo.py b/evaluation_examples/examples/multi_apps/demo.py
deleted file mode 100644
index ffa2b85..0000000
--- a/evaluation_examples/examples/multi_apps/demo.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pandas as pd
-
-file_path = "/Users/lxc/Downloads/Speedtest.csv"
-# 找到csv第二行的第二个数据格里的值
-# with open(file_path, "r") as f:
-#     for i, line in enumerate(f):
-#         if i == 1:
-#             data = line.split(",")[1]
-#             break
-# print(data)
-
-with open(file_path, "r") as f:
-    reader = pd.read_csv(f, sep=',', header=None)
-    # for column in reader.columns:
-    #     if column.startswith("TEST_DATE"):
-    #         data_col = column
-    #         break
-    for data in reader['TEST_DATE']:
-        print(data)
\ No newline at end of file

From 1c9c5fd2ad8fdbe5d124b2a12818ff8770ef0cb4 Mon Sep 17 00:00:00 2001
From: rhythmcao <ruishengcao@gmail.com>
Date: Mon, 18 Mar 2024 20:51:53 +0800
Subject: [PATCH 2/7] Fix multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json
 missing file problem: who deleted it on Google Drive?

---
 .../multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json b/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json
index 3d32ee5..0f1c8ac 100644
--- a/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json
+++ b/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json
@@ -9,7 +9,7 @@
             "parameters": {
               "files": [
                 {
-                  "url": "https://drive.usercontent.google.com/download?id=1e12nL_V7bffaLSocQ86EiGCdygzggWeu&export=download",
+                  "url": "https://drive.usercontent.google.com/download?id=1epTcblcYh8j_wFtA-aiXPIF2Oo1IVw8A&export=download",
                   "path": "/home/user/Desktop/Dickinson_Slides.pptx"
                 }
               ]
@@ -36,7 +36,7 @@
         },
         "expected": {
             "type": "cloud_file",
-            "path": "https://drive.usercontent.google.com/download?id=1Xl6tgQ0K5qA1BDA2fKTK2xFLzXwbtkZ6&export=download",
+            "path": "https://drive.usercontent.google.com/download?id=1vUvaQLJUtFgbZi7lSzl0y0TS_WecFczm&export=download",
             "dest": "notes_gold.docx"
         },
         "options": {

From f992d1f694540b9f2ccdc8f5adcf21262803a194 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Mon, 18 Mar 2024 21:43:35 +0800
Subject: [PATCH 3/7] Disable wandb logging temporarily

---
 lib_run_single.py | 18 +++++++++---------
 run.py            | 25 +++++++++++++------------
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/lib_run_single.py b/lib_run_single.py
index d60fd7a..82b2dd3 100644
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -2,7 +2,7 @@ import datetime
 import json
 import logging
 import os
-import wandb
+# import wandb
 
 from wrapt_timeout_decorator import *
 
@@ -15,13 +15,13 @@ with open("./settings.json", "r") as file:
 time_limit = data["time_limit"]
 
 @timeout(time_limit, use_signals=False)
-def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores, run):
+def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
     agent.reset()
     obs = env.reset(task_config=example)
     done = False
     step_idx = 0
     env.controller.start_recording()
-    str_table = wandb.Table(columns=["Screenshot", "A11T", "Modle Response", "Action", "Action timestamp", "Done"])
+    # str_table = wandb.Table(columns=["Screenshot", "A11T", "Modle Response", "Action", "Action timestamp", "Done"])
     while not done and step_idx < max_steps:
         response, actions = agent.predict(
             instruction,
@@ -43,10 +43,10 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
                 _f.write(screenshot)
             # get a11tree and save to wandb
             thisrun_a11tree = env.controller.get_accessibility_tree()
-            str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"),
-                            thisrun_a11tree,
-                            response, action, action_timestamp, done)
-            run.log({"Reward": reward})
+            # str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"),
+            #                 thisrun_a11tree,
+            #                 response, action, action_timestamp, done)
+            # run.log({"Reward": reward})
             with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                 f.write(json.dumps({
                     "step_num": step_idx + 1,
@@ -62,11 +62,11 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
                 logger.info("The episode is done.")
                 break
         step_idx += 1
-    run.log({"str_trajectory": str_table})
+    # run.log({"str_trajectory": str_table})
     result = env.evaluate()
     logger.info("Result: %.2f", result)
     scores.append(result)
     with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
         f.write(f"{result}\n")
     env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
-    run.log({"Result": result})
+    # run.log({"Result": result})
diff --git a/run.py b/run.py
index 5212bc0..92e989a 100644
--- a/run.py
+++ b/run.py
@@ -8,7 +8,7 @@ import logging
 import os
 import random
 import sys
-import wandb
+# import wandb
 
 from tqdm import tqdm
 
@@ -52,7 +52,8 @@ logger = logging.getLogger("desktopenv.experiment")
 
 # wandb config
 ### set your wandb api key here
-wandb.login(key=os.environ.get("WANDB_API_KEY", None))
+# os.environ["WANDB_API_KEY"] = "<your-wandb-api-key>"
+# wandb.login(key=os.environ.get("WANDB_API_KEY", None))
 
 
 def config() -> argparse.Namespace:
@@ -147,8 +148,8 @@ def test(
 
     for domain in tqdm(test_all_meta, desc="Domain"):
         for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False):
-            run = wandb.init(project=f"OSworld-{args.action_space}-{args.observation_type}-{args.model}", group=f"{domain}",
-                    name=f"{example_id}")
+            # run = wandb.init(project=f"OSworld-{args.action_space}-{args.observation_type}-{args.model}", group=f"{domain}",
+            #         name=f"{example_id}")
             # example setting
             config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json")
             with open(config_file, "r", encoding="utf-8") as f:
@@ -163,7 +164,7 @@ def test(
             # wandb each example config settings
             cfg_args["instruction"] = instruction
             cfg_args["start_time"] = datetime.datetime.now().strftime("%Y:%m:%d-%H:%M:%S")
-            run.config.update(cfg_args)
+            # run.config.update(cfg_args)
 
             example_result_dir = os.path.join(
                 args.result_dir,
@@ -177,10 +178,10 @@ def test(
             # example start running
             try:
                 lib_run_single.run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir,
-                                                  scores, run)
+                                                  scores)
             except Exception as e:
                 logger.error(f"Exception in {domain}/{example_id}: {e}")
-                wandb.log({"Exception": wandb.Table(data=[[f"Exception in {domain}/{example_id}: {e}"]], columns=["Error"])})
+                # wandb.log({"Exception": wandb.Table(data=[[f"Exception in {domain}/{example_id}: {e}"]], columns=["Error"])})
                 env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
                 with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                     f.write(json.dumps({
@@ -188,11 +189,11 @@ def test(
                     }))
                     f.write("\n")
             # wandb settings
-            os.mkdir(os.path.join(wandb.run.dir, "results/"))
-            for file in os.listdir(example_result_dir):
-                # move file to just under the root dir
-                os.rename(os.path.join(example_result_dir, file), os.path.join(wandb.run.dir, f"./results/{file}"))
-            wandb.finish()
+            # os.mkdir(os.path.join(wandb.run.dir, "results/"))
+            # for file in os.listdir(example_result_dir):
+            #     # move file to just under the root dir
+            #     os.rename(os.path.join(example_result_dir, file), os.path.join(wandb.run.dir, f"./results/{file}"))
+            # wandb.finish()
 
     env.close()
     logger.info(f"Average score: {sum(scores) / len(scores)}")

From 866ac3fbd9c6ce9255c503a9a1f52e7023511f5c Mon Sep 17 00:00:00 2001
From: Fangyu Lei <55661995+lfy79001@users.noreply.github.com>
Date: Mon, 18 Mar 2024 21:43:59 +0800
Subject: [PATCH 4/7] Update requirements.txt: add wandb and
 wrapt_timeout_decorator

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 2c595b9..9faae48 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -48,4 +48,5 @@ easyocr
 borb
 pypdf2
 pdfplumber
-
+wandb
+wrapt_timeout_decorator

From 4671455b567c438522704d9f09ae74213dd73789 Mon Sep 17 00:00:00 2001
From: BlankCheng <913501223@qq.com>
Date: Mon, 18 Mar 2024 22:16:04 +0800
Subject: [PATCH 5/7] Fix eval func

---
 desktop_env/evaluators/metrics/gimp.py           | 16 ++++++++++++----
 .../d16c99dc-2a1e-46f2-b350-d97c86c85c15.json    | 16 ++++++++--------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py
index e46f5d5..c1208af 100644
--- a/desktop_env/evaluators/metrics/gimp.py
+++ b/desktop_env/evaluators/metrics/gimp.py
@@ -199,7 +199,7 @@ def structure_check_by_mse(img1, img2, threshold=0.03):
 
 def structure_check_by_ssim(img1, img2, threshold=0.9):
     """Check if two images are approximately the same by SSIM"""
-    similarity = ssim(np.array(img1), np.array(img2), multichannel=True)
+    similarity = ssim(np.array(img1), np.array(img2), multichannel=True, channel_axis=-1)
     print("SSIM: ", similarity)
     return similarity >= threshold
 
@@ -430,11 +430,11 @@ def check_image_size(src_path, rule):
     img = Image.open(src_path)
 
     # Check the size
-    if rule["height"] is not None:
+    if rule.get("height", None) is not None:
         height_same = img.size[1] == rule["height"]
     else:
         height_same = True
-    if rule["width"] is not None:
+    if rule.get("width", None) is not None:
         width_same = img.size[0] == rule["width"]
     else:
         width_same = True
@@ -607,4 +607,12 @@ if __name__ == "__main__":
     rule = {
         "max_size": 500000
     }
-    print(check_image_file_size(src_path, rule))
\ No newline at end of file
+    print(check_image_file_size(src_path, rule))
+
+    src_path = "../../../cache/d16c99dc-2a1e-46f2-b350-d97c86c85c15/resized.png"
+    tgt_path = "../../../cache/d16c99dc-2a1e-46f2-b350-d97c86c85c15/dog_with_background.png"
+    rule = {
+        "height": 512
+    }
+    print(check_image_size(src_path, rule))
+    print(check_structure_sim_resized(src_path, tgt_path))
\ No newline at end of file
diff --git a/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json b/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json
index 3029c0c..ca22630 100644
--- a/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json
+++ b/evaluation_examples/examples/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15.json
@@ -86,13 +86,14 @@
     ],
     "func": [
       "check_image_size",
-      "check_structure_sim"
+      "check_structure_sim_resized"
     ],
     "expected": [
       {
-        "type": "vm_file",
-        "path": "/home/user/Desktop/dog_with_background.png",
-        "dest": "dog_with_background.png"
+        "type": "rule",
+        "rules": {
+          "height": 512
+        }
       },
       {
         "type": "vm_file",
@@ -102,10 +103,9 @@
     ],
     "result": [
       {
-        "type": "rule",
-        "rules": {
-          "height": 512
-        }
+        "type": "vm_file",
+        "path": "/home/user/Desktop/dog_with_background.png",
+        "dest": "dog_with_background.png"
       },
       {
         "type": "vm_file",

From 8e760fd45045d9556a9a46fafcf2b995e5ff8006 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Tue, 19 Mar 2024 08:57:05 +0800
Subject: [PATCH 6/7] Disable wandb temporarily; speed up the environment
 step by removing the redundant a11y tree re-fetch and terminal output

---
 desktop_env/envs/desktop_env.py | 2 +-
 lib_run_single.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index fee3f37..7dd70b6 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -285,7 +285,7 @@ class DesktopEnv(gym.Env):
         observation = {
             "screenshot": self._get_obs(),
             "accessibility_tree": self.controller.get_accessibility_tree(),
-            "terminal": self.controller.get_terminal_output(),
+            # "terminal": self.controller.get_terminal_output(),
             "instruction": self.instruction
         }
 
diff --git a/lib_run_single.py b/lib_run_single.py
index 82b2dd3..daa374e 100644
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -42,7 +42,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
                     screenshot = __f.read()
                 _f.write(screenshot)
             # get a11tree and save to wandb
-            thisrun_a11tree = env.controller.get_accessibility_tree()
+            # thisrun_a11tree = env.controller.get_accessibility_tree()
             # str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"),
             #                 thisrun_a11tree,
             #                 response, action, action_timestamp, done)

From 41db4b44e78596351e99fb76f0af17fdf1abd9ed Mon Sep 17 00:00:00 2001
From: Fangyu Lei <55661995+lfy79001@users.noreply.github.com>
Date: Tue, 19 Mar 2024 12:06:33 +0800
Subject: [PATCH 7/7] Update agent.py mixtral

---
 mm_agents/agent.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/mm_agents/agent.py b/mm_agents/agent.py
index ff92673..c769827 100644
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -568,7 +568,7 @@ class PromptAgent:
             top_p = payload["top_p"]
             temperature = payload["temperature"]
 
-            misrtal_messages = []
+            mistral_messages = []
 
             for i, message in enumerate(messages):
                 mistral_message = {
@@ -579,13 +579,8 @@ class PromptAgent:
                 for part in message["content"]:
                     mistral_message['content'] = part['text'] if part['type'] == "text" else ""
 
-                misrtal_messages.append(mistral_message)
+                mistral_messages.append(mistral_message)
 
-            # openai.api_base = "http://localhost:8000/v1"
-            # response = openai.ChatCompletion.create(
-            #     messages=misrtal_messages,
-            #     model="Mixtral-8x7B-Instruct-v0.1"
-            # )
 
             from openai import OpenAI
 
@@ -593,12 +588,23 @@ class PromptAgent:
                             base_url='https://api.together.xyz',
                             )
             logger.info("Generating content with Mistral model: %s", self.model)
-
-            response = client.chat.completions.create(
-                messages=misrtal_messages,
-                model=self.model,
-                max_tokens=max_tokens
-            )
+            
+            flag = 0
+            while True:
+                try:
+                    if flag > 20: break
+                    response = client.chat.completions.create(
+                        messages=mistral_messages,
+                        model=self.model,
+                        max_tokens=max_tokens
+                    )
+                    break
+                except:
+                    if flag == 0:
+                        mistral_messages = [mistral_messages[0]] + mistral_messages[-1:]
+                    else:
+                        mistral_messages[-1]["content"] = ' '.join(mistral_messages[-1]["content"].split()[:-500])
+                    flag = flag + 1
 
             try:
                 return response.choices[0].message.content