Merge branch 'main' into zdy

This commit is contained in:
David Chang
2024-03-20 22:27:48 +08:00
15 changed files with 33 additions and 23 deletions

3
.gitignore vendored
View File

@@ -2,6 +2,9 @@
*.pth
*.pt
# Credential files
evaluation_examples/settings/googledrive/credentials.json
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

View File

@@ -23,8 +23,11 @@ Please refer to [guidance](https://docs.google.com/document/d/1KBdeZwmZs2Vi_Wsnn
2. Install the environment package, download the examples and the virtual machine image.
For x86_64 Linux or Windows, you can install the environment package and download the examples and the virtual machine image by running the following commands:
```bash
pip install desktop-env
gdown xxxx
git clone https://github.com/xlang-ai/DesktopEnv
cd DesktopEnv
pip install -r requirements.txt
gdown https://drive.google.com/drive/folders/1HX5gcf7UeyR-2UmiA15Q9U-Wr6E6Gio8 -O Ubuntu --folder
vmrun -T ws start "Ubuntu/Ubuntu.vmx" nogui
vmrun -T ws snapshot "Ubuntu/Ubuntu.vmx" "init_state"
```
@@ -89,4 +92,4 @@ If you find this environment useful, please consider citing our work:
journal={arXiv preprint arXiv:xxxx.xxxx},
year={2024}
}
```
```

View File

@@ -58,7 +58,8 @@ class DesktopEnv(gym.Env):
tmp_dir: str = "tmp",
cache_dir: str = "cache",
screen_size: Tuple[int] = (1920, 1080),
headless: bool = False
headless: bool = False,
require_a11y_tree: bool = True,
):
"""
Args:
@@ -77,6 +78,7 @@ class DesktopEnv(gym.Env):
self.cache_dir_base: str = cache_dir
self.vm_screen_size = screen_size # todo: add the logic to get the screen size from the VM
self.headless = headless
self.require_a11y_tree = require_a11y_tree
os.makedirs(self.tmp_dir_base, exist_ok=True)
@@ -248,7 +250,7 @@ class DesktopEnv(gym.Env):
observation = {
"screenshot": self._get_obs(),
"accessibility_tree": self.controller.get_accessibility_tree(),
"accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None,
}
return observation
@@ -284,7 +286,7 @@ class DesktopEnv(gym.Env):
observation = {
"screenshot": self._get_obs(),
"accessibility_tree": self.controller.get_accessibility_tree(),
"accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None,
# "terminal": self.controller.get_terminal_output(),
"instruction": self.instruction
}

View File

@@ -77,6 +77,7 @@ from .general import (
literal_match
)
from .gimp import (
check_structure_sim_resized,
check_brightness_decrease_and_structure_sim,
check_contrast_increase_and_structure_sim,
check_saturation_increase_and_structure_sim,

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive-data-export.usercontent.google.com/download/aht97d3cgh8crp6c8nof1vt3tipaiqt5/tvf25tcbo8jda5qvbhlr5et7mb3l00qr/1709005500000/767b5ea4-1bdf-4b49-9fa0-c17b53e21f8a/108888117743638485671/ADt3v-PaaonAsKLjIRGjHf-MSjw2YlZrA_AiqizGDB9kBc9aOX8OpnU4AjTlh83sB2TPylr28DyOIJhAt4Wpnvm3DK8bVGBOM7JyLSyFtO_hXXbDtrF2DyWDuYy-9PqaxJuwgPfpXVnTuwOwYbZh5kebA99822_ymo383VWrpSaga6MjZXZFtGdl5r87fxwi5G7KgL_bQFo3QUWadawJzldqrwe6KRIIo0Zru0oIVazeM7LtjFV4WWLozAJ7ZJ3lS6qCKJltKN0wpg6Sdw1rS1VzDq_tYo0n2uR4zDll5cMMA8fW5AhU44PNxnWmGmivzJszfXA4Fn7I?j=767b5ea4-1bdf-4b49-9fa0-c17b53e21f8a&user=6816948370&i=0&authuser=0",
"url": "https://drive.usercontent.google.com/download?id=1k1d2UbXvp05gDdV669gNDnbdEv9SsAtN&export=download&authuser=0&confirm=t&uuid=c3d51b38-e061-4198-80cd-3cd251de8dae&at=APZUnTXaiHViMYwtweYPykye7N5u:1710938272734",
"path": "/home/user/Downloads/HW-8-main-20240207T164539Z-001.zip"
}
]
@@ -62,4 +62,4 @@
"dest": "settings.json"
}
}
}
}

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive-data-export.usercontent.google.com/download/aht97d3cgh8crp6c8nof1vt3tipaiqt5/94gda7383revq68jl0c4fu852bb4a375/1709010000000/09ed1496-6945-4b34-b938-8e5f67e64d8f/108888117743638485671/ADt3v-NkzdbsoW3-0iDvDvlUAcCN3jRyAtBZH4ork--gAhv8JfYhMxiHDU7hr0GB-M8y8BSEArd4Z6becXlgNGuv7k50hOVsShmmQ22KgJkGimg6urK1fzkCG0VS_5cfdxRsjKQByRZmmvX675Zw5NQoRMgcJpTqcjIBr01BsSljkvtPU1wu_iVz_w1i2lk6TlTsNNIa3MRCK1zG4Fd7qySq5Tg6TzWhga1uewXlYGfQVwxyBlVX7rPuJBud2CB7UfZFQbd-2DftnZRA3zSYpDmfwc9NutAdmmuvGr6Fj9395yItzi5Vt6sUWHZfSykXy8DdHtsONn32?j=09ed1496-6945-4b34-b938-8e5f67e64d8f&user=6816948370&i=0&authuser=0",
"url": "https://drive.usercontent.google.com/download?id=1ITuXkSbTF0BcbTQ3v4A1qnSzbTPrP5ax&export=download&authuser=0&confirm=t&uuid=c6c45cbf-63bc-4cb0-b76c-5a663c0ed820&at=APZUnTVrE-pn_e6HGTp_Eg4ziQhi:1710938673095",
"path": "/home/user/Downloads/hw_python_oop-master.zip"
}
]
@@ -63,4 +63,4 @@
"dest": "settings.json"
}
}
}
}

View File

@@ -1,4 +1,4 @@
{
"email": "xlang2024anonym@gmail.com",
"password": "q]wN~0iD>H:6"
}
"password": "Evt5LLj!VJ6Y!C$B"
}

View File

@@ -1 +0,0 @@
{"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "token_expiry": "2024-03-13T10:09:01Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}

View File

@@ -286,7 +286,6 @@
"788b3701-3ec9-4b67-b679-418bfa726c22",
"48c46dc7-fe04-4505-ade7-723cba1aa6f6",
"42d25c08-fb87-4927-8b65-93631280a26f",
"bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108",
"e8172110-ec08-421b-a6f5-842e6451911f",
"42f4d1c7-4521-4161-b646-0a8934e36081",
"3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",

View File

@@ -70,7 +70,6 @@
"c2751594-0cd5-4088-be1b-b5f2f9ec97c4",
"48c46dc7-fe04-4505-ade7-723cba1aa6f6",
"42d25c08-fb87-4927-8b65-93631280a26f",
"bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108",
"3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",
"d68204bf-11c1-4b13-b48b-d303c73d4bf6",
"91190194-f406-4cd6-b3f9-c43fac942b22",

View File

@@ -350,7 +350,7 @@ class PromptAgent:
# {{{1
if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
base64_image = encode_image(obs["screenshot"])
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) if self.observation_type == "screenshot_a11y_tree" else None
logger.debug("LINEAR AT: %s", linearized_accessibility_tree)
if self.observation_type == "screenshot_a11y_tree":

View File

@@ -1,3 +0,0 @@
wget https://github.com/UX-Decoder/Semantic-SAM/releases/download/checkpoint/swinl_only_sam_many2many.pth
wget https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth

View File

@@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act
SYS_PROMPT_IN_SOM_OUT_TAG = """
You are an agent which follow my instruction and perform desktop computer tasks as instructed.
You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and test information.
For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and text information.
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
You can replace x, y in the code with the tag of the element you want to operate with. such as:

11
run.py
View File

@@ -95,6 +95,10 @@ def config() -> argparse.Namespace:
parser.add_argument("--max_tokens", type=int, default=1500)
parser.add_argument("--stop_token", type=str, default=None)
# example config
parser.add_argument("--domain", type=str, default="all")
parser.add_argument("--test_all_meta_path", type=str, default="evaluation_examples/test_all.json")
# logging related
parser.add_argument("--result_dir", type=str, default="./results")
args = parser.parse_args()
@@ -141,10 +145,10 @@ def test(
env = DesktopEnv(
path_to_vm=args.path_to_vm,
snapshot_name="Snapshot 35",
action_space=agent.action_space,
screen_size=(args.screen_width, args.screen_height),
headless=args.headless,
require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
)
for domain in tqdm(test_all_meta, desc="Domain"):
@@ -265,9 +269,12 @@ if __name__ == '__main__':
os.environ["TOKENIZERS_PARALLELISM"] = "false"
args = config()
with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f:
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
test_all_meta = json.load(f)
if args.domain != "all":
test_all_meta = {args.domain: test_all_meta[args.domain]}
test_file_list = get_unfinished(
args.action_space,
args.model,

View File

@@ -1,3 +1,3 @@
{
"time_limit": "600"
"time_limit": "1800"
}