Merge branch 'main' of https://github.com/xlang-ai/DesktopEnv
.vscode/launch.json (vendored, new file, 19 lines)
@@ -0,0 +1,19 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File with Arguments",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "args": [
                "--path_to_vm", "/Users/lxc/Virtual Machines.localized/DesktopEnv-Ubuntu 64-bit Arm.vmwarevm/DesktopEnv-Ubuntu 64-bit Arm.vmx",
                "--example_time_limit", "60"
            ]
        }
    ]
}
@@ -21,10 +21,12 @@
Please refer to [guidance](https://docs.google.com/document/d/1KBdeZwmZs2Vi_Wsnngb3Wf1-RiwMMpXTftwMqP2Ztak/edit#heading=h.uh0x0tkl7fuw)

2. Install the environment package, download the examples and the virtual machine image.
For x86_64 Linux or Windows, you can install the environment package and download the examples and the virtual machine image by running the following commands:
```bash
pip install desktop-env
gdown xxxx
gdown xxxx
vmrun -T ws start "Ubuntu/Ubuntu.vmx" nogui
vmrun -T ws snapshot "Ubuntu/Ubuntu.vmx" "init_state"
```
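Once the snapshot has been taken, a quick way to confirm that the installed package can drive the image is to construct the environment from Python. The sketch below is illustrative and not part of this commit: the `DesktopEnv` constructor argument mirrors run.py's `--path_to_vm` flag and the example JSON path is a placeholder, so treat both as assumptions rather than documented API.

```python
import json

from desktop_env.envs.desktop_env import DesktopEnv

# Assumption: DesktopEnv takes the .vmx path via `path_to_vm`,
# mirroring run.py's --path_to_vm argument.
env = DesktopEnv(path_to_vm="Ubuntu/Ubuntu.vmx")

# run.py resets the environment with a task config loaded from JSON;
# the path below is a placeholder for any downloaded example.
with open("evaluation_examples/examples/multi_apps/example.json", encoding="utf-8") as f:
    task_config = json.load(f)

obs = env.reset(task_config=task_config)  # revert the VM and get the first observation
env.close()
```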

## Quick Start

16
demo.py
Normal file
16
demo.py
Normal file
@@ -0,0 +1,16 @@
import signal
import time


def handler(signo, frame):
    raise RuntimeError("Timeout")


signal.signal(signal.SIGALRM, handler)


while True:
    try:
        signal.alarm(5)  # seconds
        time.sleep(10)
        print("Working...")
    except Exception as e:
        print(e)
        continue
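A note on the snippet above: the alarm is armed for 5 seconds while the guarded work sleeps for 10, so the handler always fires first, `print("Working...")` is never reached, and the loop prints "Timeout" every 5 seconds. When the guarded work can finish in time, the pending alarm should also be cancelled afterwards so it cannot fire later; a minimal standard-library sketch, not part of this commit:

```python
import signal
import time


def handler(signo, frame):
    raise RuntimeError("Timeout")


signal.signal(signal.SIGALRM, handler)

try:
    signal.alarm(5)      # raise RuntimeError if the block runs longer than 5 seconds
    time.sleep(2)        # work that finishes within the limit
    print("Working...")  # reached, because the work beat the alarm
finally:
    signal.alarm(0)      # cancel the pending alarm so it cannot interrupt later code
```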
evaluation_examples/examples/multi_apps/demo.py (new file, 19 lines)
@@ -0,0 +1,19 @@
import pandas as pd


file_path = "/Users/lxc/Downloads/Speedtest.csv"
# Find the value in the second cell of the second row of the csv
# with open(file_path, "r") as f:
#     for i, line in enumerate(f):
#         if i == 1:
#             data = line.split(",")[1]
#             break
# print(data)

with open(file_path, "r") as f:
    # header=0 so the first row supplies column names such as TEST_DATE
    # (header=None would leave integer column labels and break the lookup below)
    reader = pd.read_csv(f, sep=',', header=0)
    # for column in reader.columns:
    #     if column.startswith("TEST_DATE"):
    #         data_col = column
    #         break
    for data in reader['TEST_DATE']:
        print(data)
@@ -5,21 +5,17 @@ import os
import re
import time
import uuid
import openai
import xml.etree.ElementTree as ET
from http import HTTPStatus
from io import BytesIO
from typing import Dict, List

from google.api_core.exceptions import InvalidArgument
import backoff
import dashscope
import google.generativeai as genai
import requests
from PIL import Image
from vertexai.preview.generative_models import (
    HarmBlockThreshold,
    HarmCategory,
    Image,
)

from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes, draw_bounding_boxes
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
@@ -43,7 +39,7 @@ def linearize_accessibility_tree(accessibility_tree):
    # leaf_nodes = find_leaf_nodes(accessibility_tree)
    filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))

-    linearized_accessibility_tree = "tag\tname\ttext\tposition\tsize\n"
+    linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
    # Linearize the accessibility tree nodes into a table format

    for node in filtered_nodes:
@@ -205,7 +201,7 @@ class PromptAgent:
                self.system_message = SYS_PROMPT_IN_A11Y_OUT_CODE
            else:
                raise ValueError("Invalid action space: " + action_space)
-        elif observation_type == "both":
+        elif observation_type == "screenshot_a11y_tree":
            if action_space == "computer_13":
                self.system_message = SYS_PROMPT_IN_BOTH_OUT_ACTION
            elif action_space == "pyautogui":
@@ -233,8 +229,7 @@ class PromptAgent:
        """
        Predict the next action(s) based on the current observation.
        """
-        self.system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(
-            instruction)
+        system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(instruction)

        # Prepare the payload for the API call
        messages = []
@@ -245,7 +240,7 @@ class PromptAgent:
            "content": [
                {
                    "type": "text",
-                    "text": self.system_message
+                    "text": system_message
                },
            ]
        })
@@ -266,7 +261,7 @@ class PromptAgent:
        for previous_obs, previous_action, previous_thought in zip(_observations, _actions, _thoughts):

            # {{{1
-            if self.observation_type == "both":
+            if self.observation_type == "screenshot_a11y_tree":
                _screenshot = previous_obs["screenshot"]
                _linearized_accessibility_tree = previous_obs["accessibility_tree"]
                logger.debug("LINEAR AT: %s", _linearized_accessibility_tree)
@@ -356,11 +351,11 @@ class PromptAgent:
            })

        # {{{1
-        if self.observation_type in ["screenshot", "both"]:
+        if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
            base64_image = encode_image(obs["screenshot"])
            linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])

-            if self.observation_type == "both":
+            if self.observation_type == "screenshot_a11y_tree":
                self.observations.append({
                    "screenshot": base64_image,
                    "accessibility_tree": linearized_accessibility_tree
@@ -473,7 +468,9 @@ class PromptAgent:
        response = self.call_llm({
            "model": self.model,
            "messages": messages,
-            "max_tokens": self.max_tokens
+            "max_tokens": self.max_tokens,
+            "top_p": self.top_p,
+            "temperature": self.temperature
        })

        logger.info("RESPONSE: %s", response)
@@ -513,7 +510,7 @@ class PromptAgent:
        try:
            actions = self.parse_actions(response, masks)
            self.thoughts.append(response)
-        except Exception as e:
+        except ValueError as e:
            print("Failed to parse action from response", e)
            actions = None
            self.thoughts.append("")
@@ -522,9 +519,16 @@ class PromptAgent:

    @backoff.on_exception(
        backoff.expo,
-        (Exception),
+        # Add further model-specific exceptions here as needed,
+        # but do not add the generic Exception class: generic failures must propagate
+        # to the outer handler so each example stays within its time limit.
+        (openai.RateLimitError,
+         openai.BadRequestError,
+         openai.InternalServerError,
+         InvalidArgument),
        max_tries=5
    )

    def call_llm(self, payload):

        if self.model.startswith("gpt"):
@@ -532,7 +536,7 @@ class PromptAgent:
                "Content-Type": "application/json",
                "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
            }
-            logger.info("Generating content with GPT model: %s", self.model)
+            # logger.info("Generating content with GPT model: %s", self.model)
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
@@ -542,14 +546,14 @@ class PromptAgent:
            if response.status_code != 200:
                if response.json()['error']['code'] == "context_length_exceeded":
                    logger.error("Context length exceeded. Retrying with a smaller context.")
-                    payload["messages"] = payload["messages"][-1:]
+                    payload["messages"] = [payload["messages"][0]] + payload["messages"][-1:]
                    retry_response = requests.post(
                        "https://api.openai.com/v1/chat/completions",
                        headers=headers,
                        json=payload
                    )
                    if retry_response.status_code != 200:
-                        logger.error("Failed to call LLM: " + retry_response.text)
+                        logger.error("Failed to call LLM even after shortening the history: " + retry_response.text)
                        return ""

                logger.error("Failed to call LLM: " + response.text)
@@ -656,8 +660,9 @@ class PromptAgent:
            for message in gemini_messages:
                message_history_str += "<|" + message['role'] + "|>\n" + message['parts'][0] + "\n"
            gemini_messages = [{"role": "user", "parts": [message_history_str, gemini_messages[-1]['parts'][1]]}]
+            # gemini_messages[-1]['parts'][1].save("output.png", "PNG")

-            print(gemini_messages)
+            # print(gemini_messages)
            api_key = os.environ.get("GENAI_API_KEY")
            assert api_key is not None, "Please set the GENAI_API_KEY environment variable"
            genai.configure(api_key=api_key)
@@ -671,11 +676,10 @@ class PromptAgent:
                    "temperature": temperature
                },
                safety_settings={
-                    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+                    "harassment": "block_none",
+                    "hate": "block_none",
+                    "sex": "block_none",
+                    "danger": "block_none"
                }
            )

@@ -726,7 +730,7 @@ class PromptAgent:

    def parse_actions(self, response: str, masks=None):

-        if self.observation_type in ["screenshot", "a11y_tree", "both"]:
+        if self.observation_type in ["screenshot", "a11y_tree", "screenshot_a11y_tree"]:
            # parse from the response
            if self.action_space == "computer_13":
                actions = parse_actions_from_string(response)

run.py (165 lines changed)
@@ -6,6 +6,7 @@ import datetime
import json
import logging
import os
+import signal
import sys

from desktop_env.envs.desktop_env import DesktopEnv
@@ -46,6 +47,14 @@ logger.addHandler(sdebug_handler)
logger = logging.getLogger("desktopenv.experiment")


+# make sure each example won't exceed the time limit
+def handler(signo, frame):
+    raise RuntimeError("Time limit exceeded!")


+signal.signal(signal.SIGALRM, handler)


def config() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run end-to-end evaluation on the benchmark"
@@ -66,7 +75,7 @@ def config() -> argparse.Namespace:
            "screenshot_a11y_tree",
            "som"
        ],
-        default="a11y_tree",
+        default="som",
        help="Observation type",
    )
    parser.add_argument("--screen_width", type=int, default=1920)
@@ -77,6 +86,7 @@ def config() -> argparse.Namespace:
    # agent config
    parser.add_argument("--max_trajectory_length", type=int, default=3)
    parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples")
+    parser.add_argument("--example_time_limit", type=int, default=600)

    # lm config
    parser.add_argument("--model", type=str, default="gpt-4-vision-preview")
@@ -98,6 +108,7 @@ def test(
) -> None:
    scores = []
    max_steps = args.max_steps
+    time_limit = args.example_time_limit

    # log args
    logger.info("Args: %s", args)
@@ -119,6 +130,7 @@ def test(

    for domain in test_all_meta:
        for example_id in test_all_meta[domain]:
            # example setting
            config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json")
            with open(config_file, "r", encoding="utf-8") as f:
                example = json.load(f)
@@ -140,68 +152,102 @@ def test(
)
os.makedirs(example_result_dir, exist_ok=True)

agent.reset()
obs = env.reset(task_config=example)
done = False
step_idx = 0
env.controller.start_recording()
# example start running
try:
signal.alarm(time_limit)
agent.reset()
obs = env.reset(task_config=example)
done = False
step_idx = 0
env.controller.start_recording()

while not done and step_idx < max_steps:
actions = agent.predict(
instruction,
obs
)
while not done and step_idx < max_steps:
actions = agent.predict(
instruction,
obs
)
for action in actions:
# Capture the timestamp before executing the action
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
logger.info("Step %d: %s", step_idx + 1, action)

for action in actions:
obs, reward, done, info = env.step(action, args.sleep_after_execution)

logger.info("Reward: %.2f", reward)
logger.info("Done: %s", done)
logger.info("Info: %s", info)

# Save screenshot and trajectory information
with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
"wb") as _f:
with open(obs['screenshot'], "rb") as __f:
screenshot = __f.read()
_f.write(screenshot)

with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(json.dumps({
"step_num": step_idx + 1,
"action_timestamp": action_timestamp,
"action": action,
"reward": reward,
"done": done,
"info": info,
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
}))
f.write("\n")

if done:
logger.info("The episode is done.")
break
step_idx += 1
# Capture the timestamp before executing the action
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
logger.info("Step %d: %s", step_idx + 1, action)

observation, reward, done, info = env.step(action, args.sleep_after_execution)

logger.info("Reward: %.2f", reward)
logger.info("Done: %s", done)
logger.info("Info: %s", info)

# Save screenshot and trajectory information
with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
"wb") as _f:
with open(observation['screenshot'], "rb") as __f:
screenshot = __f.read()
_f.write(screenshot)

with open(os.path.join(example_result_dir, "traj.json"), "a") as f:
result = env.evaluate()
logger.info("Result: %.2f", result)
scores.append(result)
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
except RuntimeError as e:
logger.error(f"Error in example {domain}/{example_id}: {e}")
# save info of this example and then continue
try:
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(json.dumps({
"step_num": step_idx + 1,
"action_timestamp": action_timestamp,
"action": action,
"reward": reward,
"done": done,
"info": info,
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
"Error": f"Error in example {domain}/{example_id}: {e}",
"step": step_idx + 1,
}))
f.write("\n")

if done:
logger.info("The episode is done.")
break

result = env.evaluate()
logger.info("Result: %.2f", result)
scores.append(result)
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))

except Exception as new_e:
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(json.dumps({
"Error": f"Error in example {domain}/{example_id}: {e} and {new_e}",
"step": "before start recording",
}))
f.write("\n")
continue
env.close()
logger.info(f"Average score: {sum(scores) / len(scores)}")


-def get_unfinished(test_file_list, result_dir):
-    finished = []
-    for domain in os.listdir(result_dir):
-        for example_id in os.listdir(os.path.join(result_dir, domain)):
-            finished.append(f"{domain}/{example_id}")
-    return [x for x in test_file_list if x not in finished]
+def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
+    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
+
+    if not os.path.exists(target_dir):
+        return total_file_json
+
+    finished = {}
+    for domain in os.listdir(target_dir):
+        domain_path = os.path.join(target_dir, domain)
+        if os.path.isdir(domain_path):
+            finished[domain] = os.listdir(domain_path)
+
+    if not finished:
+        return total_file_json
+
+    for domain, examples in finished.items():
+        if domain in total_file_json:
+            total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples]
+
+    return total_file_json


if __name__ == '__main__':
@@ -209,10 +255,19 @@ if __name__ == '__main__':
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # test_file_list = get_unfinished(args.test, args.result_dir)
    # logger.info(f"Total {len(test_file_list)} tasks left")

    with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)

    test_file_list = get_unfinished(
        args.action_space,
        args.model,
        args.observation_type,
        args.result_dir,
        test_all_meta
    )
    left_info = ""
    for domain in test_file_list:
        left_info += f"{domain}: {len(test_file_list[domain])}\n"
    logger.info(f"Left tasks:\n{left_info}")

    test(args, test_all_meta)
