Merge branch 'main' into zdy

This commit is contained in:
David Chang
2024-03-20 22:27:48 +08:00
15 changed files with 33 additions and 23 deletions

3
.gitignore vendored
View File

@@ -2,6 +2,9 @@
*.pth
*.pt
# Credential files
evaluation_examples/settings/googledrive/credentials.json
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

View File

@@ -23,8 +23,11 @@ Please refer to [guidance](https://docs.google.com/document/d/1KBdeZwmZs2Vi_Wsnn
2. Install the environment package, download the examples and the virtual machine image.
For x86_64 Linux or Windows, you can install the environment package and download the examples and the virtual machine image by running the following commands:
```bash
pip install desktop-env
gdown xxxx
git clone https://github.com/xlang-ai/DesktopEnv
cd DesktopEnv
pip install -r requirements.txt
gdown https://drive.google.com/drive/folders/1HX5gcf7UeyR-2UmiA15Q9U-Wr6E6Gio8 -O Ubuntu --folder
vmrun -T ws start "Ubuntu/Ubuntu.vmx" nogui
vmrun -T ws snapshot "Ubuntu/Ubuntu.vmx" "init_state"
```
@@ -89,4 +92,4 @@ If you find this environment useful, please consider citing our work:
journal={arXiv preprint arXiv:xxxx.xxxx},
year={2024}
}
```
```

View File

@@ -58,7 +58,8 @@ class DesktopEnv(gym.Env):
tmp_dir: str = "tmp",
cache_dir: str = "cache",
screen_size: Tuple[int] = (1920, 1080),
headless: bool = False
headless: bool = False,
require_a11y_tree: bool = True,
):
"""
Args:
@@ -77,6 +78,7 @@ class DesktopEnv(gym.Env):
self.cache_dir_base: str = cache_dir
self.vm_screen_size = screen_size # todo: add the logic to get the screen size from the VM
self.headless = headless
self.require_a11y_tree = require_a11y_tree
os.makedirs(self.tmp_dir_base, exist_ok=True)
@@ -248,7 +250,7 @@ class DesktopEnv(gym.Env):
observation = {
"screenshot": self._get_obs(),
"accessibility_tree": self.controller.get_accessibility_tree(),
"accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None,
}
return observation
@@ -284,7 +286,7 @@ class DesktopEnv(gym.Env):
observation = {
"screenshot": self._get_obs(),
"accessibility_tree": self.controller.get_accessibility_tree(),
"accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None,
# "terminal": self.controller.get_terminal_output(),
"instruction": self.instruction
}

View File

@@ -77,6 +77,7 @@ from .general import (
literal_match
)
from .gimp import (
check_structure_sim_resized,
check_brightness_decrease_and_structure_sim,
check_contrast_increase_and_structure_sim,
check_saturation_increase_and_structure_sim,

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive-data-export.usercontent.google.com/download/aht97d3cgh8crp6c8nof1vt3tipaiqt5/tvf25tcbo8jda5qvbhlr5et7mb3l00qr/1709005500000/767b5ea4-1bdf-4b49-9fa0-c17b53e21f8a/108888117743638485671/ADt3v-PaaonAsKLjIRGjHf-MSjw2YlZrA_AiqizGDB9kBc9aOX8OpnU4AjTlh83sB2TPylr28DyOIJhAt4Wpnvm3DK8bVGBOM7JyLSyFtO_hXXbDtrF2DyWDuYy-9PqaxJuwgPfpXVnTuwOwYbZh5kebA99822_ymo383VWrpSaga6MjZXZFtGdl5r87fxwi5G7KgL_bQFo3QUWadawJzldqrwe6KRIIo0Zru0oIVazeM7LtjFV4WWLozAJ7ZJ3lS6qCKJltKN0wpg6Sdw1rS1VzDq_tYo0n2uR4zDll5cMMA8fW5AhU44PNxnWmGmivzJszfXA4Fn7I?j=767b5ea4-1bdf-4b49-9fa0-c17b53e21f8a&user=6816948370&i=0&authuser=0",
"url": "https://drive.usercontent.google.com/download?id=1k1d2UbXvp05gDdV669gNDnbdEv9SsAtN&export=download&authuser=0&confirm=t&uuid=c3d51b38-e061-4198-80cd-3cd251de8dae&at=APZUnTXaiHViMYwtweYPykye7N5u:1710938272734",
"path": "/home/user/Downloads/HW-8-main-20240207T164539Z-001.zip"
}
]
@@ -62,4 +62,4 @@
"dest": "settings.json"
}
}
}
}

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive-data-export.usercontent.google.com/download/aht97d3cgh8crp6c8nof1vt3tipaiqt5/94gda7383revq68jl0c4fu852bb4a375/1709010000000/09ed1496-6945-4b34-b938-8e5f67e64d8f/108888117743638485671/ADt3v-NkzdbsoW3-0iDvDvlUAcCN3jRyAtBZH4ork--gAhv8JfYhMxiHDU7hr0GB-M8y8BSEArd4Z6becXlgNGuv7k50hOVsShmmQ22KgJkGimg6urK1fzkCG0VS_5cfdxRsjKQByRZmmvX675Zw5NQoRMgcJpTqcjIBr01BsSljkvtPU1wu_iVz_w1i2lk6TlTsNNIa3MRCK1zG4Fd7qySq5Tg6TzWhga1uewXlYGfQVwxyBlVX7rPuJBud2CB7UfZFQbd-2DftnZRA3zSYpDmfwc9NutAdmmuvGr6Fj9395yItzi5Vt6sUWHZfSykXy8DdHtsONn32?j=09ed1496-6945-4b34-b938-8e5f67e64d8f&user=6816948370&i=0&authuser=0",
"url": "https://drive.usercontent.google.com/download?id=1ITuXkSbTF0BcbTQ3v4A1qnSzbTPrP5ax&export=download&authuser=0&confirm=t&uuid=c6c45cbf-63bc-4cb0-b76c-5a663c0ed820&at=APZUnTVrE-pn_e6HGTp_Eg4ziQhi:1710938673095",
"path": "/home/user/Downloads/hw_python_oop-master.zip"
}
]
@@ -63,4 +63,4 @@
"dest": "settings.json"
}
}
}
}

View File

@@ -1,4 +1,4 @@
{
"email": "xlang2024anonym@gmail.com",
"password": "q]wN~0iD>H:6"
}
"password": "Evt5LLj!VJ6Y!C$B"
}

View File

@@ -1 +0,0 @@
{"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "token_expiry": "2024-03-13T10:09:01Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}

View File

@@ -286,7 +286,6 @@
"788b3701-3ec9-4b67-b679-418bfa726c22",
"48c46dc7-fe04-4505-ade7-723cba1aa6f6",
"42d25c08-fb87-4927-8b65-93631280a26f",
"bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108",
"e8172110-ec08-421b-a6f5-842e6451911f",
"42f4d1c7-4521-4161-b646-0a8934e36081",
"3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",

View File

@@ -70,7 +70,6 @@
"c2751594-0cd5-4088-be1b-b5f2f9ec97c4",
"48c46dc7-fe04-4505-ade7-723cba1aa6f6",
"42d25c08-fb87-4927-8b65-93631280a26f",
"bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108",
"3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",
"d68204bf-11c1-4b13-b48b-d303c73d4bf6",
"91190194-f406-4cd6-b3f9-c43fac942b22",

View File

@@ -350,7 +350,7 @@ class PromptAgent:
# {{{1
if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
base64_image = encode_image(obs["screenshot"])
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) if self.observation_type == "screenshot_a11y_tree" else None
logger.debug("LINEAR AT: %s", linearized_accessibility_tree)
if self.observation_type == "screenshot_a11y_tree":

View File

@@ -1,3 +0,0 @@
wget https://github.com/UX-Decoder/Semantic-SAM/releases/download/checkpoint/swinl_only_sam_many2many.pth
wget https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth

View File

@@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act
SYS_PROMPT_IN_SOM_OUT_TAG = """
You are an agent which follow my instruction and perform desktop computer tasks as instructed.
You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and test information.
For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and text information.
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
You can replace x, y in the code with the tag of the element you want to operate with. such as:

11
run.py
View File

@@ -95,6 +95,10 @@ def config() -> argparse.Namespace:
parser.add_argument("--max_tokens", type=int, default=1500)
parser.add_argument("--stop_token", type=str, default=None)
# example config
parser.add_argument("--domain", type=str, default="all")
parser.add_argument("--test_all_meta_path", type=str, default="evaluation_examples/test_all.json")
# logging related
parser.add_argument("--result_dir", type=str, default="./results")
args = parser.parse_args()
@@ -141,10 +145,10 @@ def test(
env = DesktopEnv(
path_to_vm=args.path_to_vm,
snapshot_name="Snapshot 35",
action_space=agent.action_space,
screen_size=(args.screen_width, args.screen_height),
headless=args.headless,
require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
)
for domain in tqdm(test_all_meta, desc="Domain"):
@@ -265,9 +269,12 @@ if __name__ == '__main__':
os.environ["TOKENIZERS_PARALLELISM"] = "false"
args = config()
with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f:
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
test_all_meta = json.load(f)
if args.domain != "all":
test_all_meta = {args.domain: test_all_meta[args.domain]}
test_file_list = get_unfinished(
args.action_space,
args.model,

View File

@@ -1,3 +1,3 @@
{
"time_limit": "600"
"time_limit": "1800"
}