diff --git a/.gitignore b/.gitignore index a3543d8..f8bd92f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ *.pth *.pt +# Credential files +evaluation_examples/settings/googledrive/credentials.json + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index 6262044..44d8517 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,11 @@ Please refer to [guidance](https://docs.google.com/document/d/1KBdeZwmZs2Vi_Wsnn 2. Install the environment package, download the examples and the virtual machine image. For x86_64 Linux or Windows, you can install the environment package and download the examples and the virtual machine image by running the following commands: ```bash -pip install desktop-env -gdown xxxx +git clone https://github.com/xlang-ai/DesktopEnv +cd DesktopEnv +pip install -r requirements.txt +gdown https://drive.google.com/drive/folders/1HX5gcf7UeyR-2UmiA15Q9U- +Wr6E6Gio8 -O Ubuntu --folder vmrun -T ws start "Ubuntu/Ubuntu.vmx" nogui vmrun -T ws snapshot "Ubuntu/Ubuntu.vmx" "init_state" ``` @@ -89,4 +92,4 @@ If you find this environment useful, please consider citing our work: journal={arXiv preprint arXiv:xxxx.xxxx}, year={2024} } -``` \ No newline at end of file +``` diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index 7dd70b6..b443a4a 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -58,7 +58,8 @@ class DesktopEnv(gym.Env): tmp_dir: str = "tmp", cache_dir: str = "cache", screen_size: Tuple[int] = (1920, 1080), - headless: bool = False + headless: bool = False, + require_a11y_tree: bool = True, ): """ Args: @@ -77,6 +78,7 @@ class DesktopEnv(gym.Env): self.cache_dir_base: str = cache_dir self.vm_screen_size = screen_size # todo: add the logic to get the screen size from the VM self.headless = headless + self.require_a11y_tree = require_a11y_tree os.makedirs(self.tmp_dir_base, exist_ok=True) @@ -248,7 +250,7 @@ class DesktopEnv(gym.Env): observation = { "screenshot": self._get_obs(), - "accessibility_tree": self.controller.get_accessibility_tree(), + "accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None, } return observation @@ -284,7 +286,7 @@ class DesktopEnv(gym.Env): observation = { "screenshot": self._get_obs(), - "accessibility_tree": self.controller.get_accessibility_tree(), + "accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None, # "terminal": self.controller.get_terminal_output(), "instruction": self.instruction } diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index 61bb025..341e138 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -77,6 +77,7 @@ from .general import ( literal_match ) from .gimp import ( + check_structure_sim_resized, check_brightness_decrease_and_structure_sim, check_contrast_increase_and_structure_sim, check_saturation_increase_and_structure_sim, diff --git a/evaluation_examples/examples/vs_code/70745df8-f2f5-42bd-8074-fbc10334fcc5.json b/evaluation_examples/examples/vs_code/70745df8-f2f5-42bd-8074-fbc10334fcc5.json index 4d1c86b..3c41b6c 100644 --- a/evaluation_examples/examples/vs_code/70745df8-f2f5-42bd-8074-fbc10334fcc5.json +++ b/evaluation_examples/examples/vs_code/70745df8-f2f5-42bd-8074-fbc10334fcc5.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive-data-export.usercontent.google.com/download/aht97d3cgh8crp6c8nof1vt3tipaiqt5/tvf25tcbo8jda5qvbhlr5et7mb3l00qr/1709005500000/767b5ea4-1bdf-4b49-9fa0-c17b53e21f8a/108888117743638485671/ADt3v-PaaonAsKLjIRGjHf-MSjw2YlZrA_AiqizGDB9kBc9aOX8OpnU4AjTlh83sB2TPylr28DyOIJhAt4Wpnvm3DK8bVGBOM7JyLSyFtO_hXXbDtrF2DyWDuYy-9PqaxJuwgPfpXVnTuwOwYbZh5kebA99822_ymo383VWrpSaga6MjZXZFtGdl5r87fxwi5G7KgL_bQFo3QUWadawJzldqrwe6KRIIo0Zru0oIVazeM7LtjFV4WWLozAJ7ZJ3lS6qCKJltKN0wpg6Sdw1rS1VzDq_tYo0n2uR4zDll5cMMA8fW5AhU44PNxnWmGmivzJszfXA4Fn7I?j=767b5ea4-1bdf-4b49-9fa0-c17b53e21f8a&user=6816948370&i=0&authuser=0", + "url": "https://drive.usercontent.google.com/download?id=1k1d2UbXvp05gDdV669gNDnbdEv9SsAtN&export=download&authuser=0&confirm=t&uuid=c3d51b38-e061-4198-80cd-3cd251de8dae&at=APZUnTXaiHViMYwtweYPykye7N5u:1710938272734", "path": "/home/user/Downloads/HW-8-main-20240207T164539Z-001.zip" } ] @@ -62,4 +62,4 @@ "dest": "settings.json" } } -} \ No newline at end of file +} diff --git a/evaluation_examples/examples/vs_code/c6bf789c-ba3a-4209-971d-b63abf0ab733.json b/evaluation_examples/examples/vs_code/c6bf789c-ba3a-4209-971d-b63abf0ab733.json index e011be9..ce94baf 100644 --- a/evaluation_examples/examples/vs_code/c6bf789c-ba3a-4209-971d-b63abf0ab733.json +++ b/evaluation_examples/examples/vs_code/c6bf789c-ba3a-4209-971d-b63abf0ab733.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive-data-export.usercontent.google.com/download/aht97d3cgh8crp6c8nof1vt3tipaiqt5/94gda7383revq68jl0c4fu852bb4a375/1709010000000/09ed1496-6945-4b34-b938-8e5f67e64d8f/108888117743638485671/ADt3v-NkzdbsoW3-0iDvDvlUAcCN3jRyAtBZH4ork--gAhv8JfYhMxiHDU7hr0GB-M8y8BSEArd4Z6becXlgNGuv7k50hOVsShmmQ22KgJkGimg6urK1fzkCG0VS_5cfdxRsjKQByRZmmvX675Zw5NQoRMgcJpTqcjIBr01BsSljkvtPU1wu_iVz_w1i2lk6TlTsNNIa3MRCK1zG4Fd7qySq5Tg6TzWhga1uewXlYGfQVwxyBlVX7rPuJBud2CB7UfZFQbd-2DftnZRA3zSYpDmfwc9NutAdmmuvGr6Fj9395yItzi5Vt6sUWHZfSykXy8DdHtsONn32?j=09ed1496-6945-4b34-b938-8e5f67e64d8f&user=6816948370&i=0&authuser=0", + "url": "https://drive.usercontent.google.com/download?id=1ITuXkSbTF0BcbTQ3v4A1qnSzbTPrP5ax&export=download&authuser=0&confirm=t&uuid=c6c45cbf-63bc-4cb0-b76c-5a663c0ed820&at=APZUnTVrE-pn_e6HGTp_Eg4ziQhi:1710938673095", "path": "/home/user/Downloads/hw_python_oop-master.zip" } ] @@ -63,4 +63,4 @@ "dest": "settings.json" } } -} \ No newline at end of file +} diff --git a/evaluation_examples/settings/google/settings.json b/evaluation_examples/settings/google/settings.json index ae70605..cc20c51 100644 --- a/evaluation_examples/settings/google/settings.json +++ b/evaluation_examples/settings/google/settings.json @@ -1,4 +1,4 @@ { "email": "xlang2024anonym@gmail.com", - "password": "q]wN~0iD>H:6" -} \ No newline at end of file + "password": "Evt5LLj!VJ6Y!C$B" +} diff --git a/evaluation_examples/settings/googledrive/credentials.json b/evaluation_examples/settings/googledrive/credentials.json deleted file mode 100644 index 81d22c2..0000000 --- a/evaluation_examples/settings/googledrive/credentials.json +++ /dev/null @@ -1 +0,0 @@ -{"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "token_expiry": "2024-03-13T10:09:01Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"} \ No newline at end of file diff --git a/evaluation_examples/test_all.json b/evaluation_examples/test_all.json index 7153d86..e530435 100644 --- a/evaluation_examples/test_all.json +++ b/evaluation_examples/test_all.json @@ -286,7 +286,6 @@ "788b3701-3ec9-4b67-b679-418bfa726c22", "48c46dc7-fe04-4505-ade7-723cba1aa6f6", "42d25c08-fb87-4927-8b65-93631280a26f", - "bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108", "e8172110-ec08-421b-a6f5-842e6451911f", "42f4d1c7-4521-4161-b646-0a8934e36081", "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", diff --git a/evaluation_examples/test_small.json b/evaluation_examples/test_small.json index 4c1feb7..aec99fc 100644 --- a/evaluation_examples/test_small.json +++ b/evaluation_examples/test_small.json @@ -70,7 +70,6 @@ "c2751594-0cd5-4088-be1b-b5f2f9ec97c4", "48c46dc7-fe04-4505-ade7-723cba1aa6f6", "42d25c08-fb87-4927-8b65-93631280a26f", - "bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108", "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", "d68204bf-11c1-4b13-b48b-d303c73d4bf6", "91190194-f406-4cd6-b3f9-c43fac942b22", diff --git a/mm_agents/agent.py b/mm_agents/agent.py index f2d4b5c..e9f1147 100644 --- a/mm_agents/agent.py +++ b/mm_agents/agent.py @@ -350,7 +350,7 @@ class PromptAgent: # {{{1 if self.observation_type in ["screenshot", "screenshot_a11y_tree"]: base64_image = encode_image(obs["screenshot"]) - linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) + linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) if self.observation_type == "screenshot_a11y_tree" else None logger.debug("LINEAR AT: %s", linearized_accessibility_tree) if self.observation_type == "screenshot_a11y_tree": diff --git a/mm_agents/download_ckpt.sh b/mm_agents/download_ckpt.sh deleted file mode 100644 index 146fcea..0000000 --- a/mm_agents/download_ckpt.sh +++ /dev/null @@ -1,3 +0,0 @@ -wget https://github.com/UX-Decoder/Semantic-SAM/releases/download/checkpoint/swinl_only_sam_many2many.pth -wget https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt -wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth \ No newline at end of file diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py index c609a66..3a916d9 100644 --- a/mm_agents/prompts.py +++ b/mm_agents/prompts.py @@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act SYS_PROMPT_IN_SOM_OUT_TAG = """ You are an agent which follow my instruction and perform desktop computer tasks as instructed. You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. -For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and test information. +For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and text information. You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You can replace x, y in the code with the tag of the element you want to operate with. such as: diff --git a/run.py b/run.py index 705e108..e6f67f9 100644 --- a/run.py +++ b/run.py @@ -95,6 +95,10 @@ def config() -> argparse.Namespace: parser.add_argument("--max_tokens", type=int, default=1500) parser.add_argument("--stop_token", type=str, default=None) + # example config + parser.add_argument("--domain", type=str, default="all") + parser.add_argument("--test_all_meta_path", type=str, default="evaluation_examples/test_all.json") + # logging related parser.add_argument("--result_dir", type=str, default="./results") args = parser.parse_args() @@ -141,10 +145,10 @@ def test( env = DesktopEnv( path_to_vm=args.path_to_vm, - snapshot_name="Snapshot 35", action_space=agent.action_space, screen_size=(args.screen_width, args.screen_height), headless=args.headless, + require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], ) for domain in tqdm(test_all_meta, desc="Domain"): @@ -265,9 +269,12 @@ if __name__ == '__main__': os.environ["TOKENIZERS_PARALLELISM"] = "false" args = config() - with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f: + with open(args.test_all_meta_path, "r", encoding="utf-8") as f: test_all_meta = json.load(f) + if args.domain != "all": + test_all_meta = {args.domain: test_all_meta[args.domain]} + test_file_list = get_unfinished( args.action_space, args.model, diff --git a/settings.json b/settings.json index 7ee7a21..75ddfa3 100644 --- a/settings.json +++ b/settings.json @@ -1,3 +1,3 @@ { - "time_limit": "600" + "time_limit": "1800" } \ No newline at end of file