def _fix_pyautogui_less_than_bug(command: str) -> str:
    """
    Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.

    This works around the known PyAutoGUI issue where typing '<' produces '>' instead.
    References:
    - https://github.com/asweigart/pyautogui/issues/198
    - https://github.com/xlang-ai/OSWorld/issues/257

    Two call shapes are rewritten:
    - ``pyautogui.press('<')`` becomes ``pyautogui.hotkey("shift", ",")``.
    - ``pyautogui.typewrite('a<b')`` / ``pyautogui.write('a<b')`` is split around
      every '<' into typing calls joined by the hotkey call, separated by '; '.
      Extra arguments after the text (e.g. ``interval=0.1``) are repeated on
      every generated typing call.

    Only calls whose first argument is a single plain quoted literal are
    rewritten. Anything else (concatenations, f-strings, triple-quoted strings,
    variables) is returned untouched rather than risk producing broken code.

    NOTE(review): presumably relies on a keyboard layout where shift+',' yields
    '<' -- confirm for non-US layouts.

    Args:
        command (str): The original pyautogui command

    Returns:
        str: The fixed command with '<' characters handled properly
    """
    # Function-scope import keeps this helper self-contained.
    import re

    # Fast path: nothing to rewrite when the command has no '<' at all.
    if '<' not in command:
        return command

    hotkey_call = 'pyautogui.hotkey("shift", ",")'

    # press('<') / press("<") -> hotkey call.
    command = re.sub(
        r'pyautogui\.press\(["\']<["\']\)',
        lambda _match: hotkey_call,
        command,
    )

    # group 1: function name, group 2: quote char, group 3: literal content,
    # group 4: optional trailing arguments (", interval=0.1").
    # The content part is escape-aware and stops at the first unescaped quote,
    # so the match can never run past the end of the string literal.
    typing_pattern = (
        r'pyautogui\.(typewrite|write)\('
        r'(["\'])((?:\\.|(?!\2)[^\\\n])*)\2'
        r'(\s*,[^()]*)?\s*\)'
    )

    def split_typing_call(match):
        func_name, quote_char, content, extra_args = match.group(1, 2, 3, 4)
        if '<' not in content:
            return match.group(0)
        extra_args = extra_args or ''

        calls = []
        for index, piece in enumerate(content.split('<')):
            if index:
                # Every split point stands for one '<' in the original text.
                calls.append(hotkey_call)
            if not piece:
                continue
            # An odd run of trailing backslashes means the text contained
            # "\<" (a literal backslash followed by '<'). Double the last
            # backslash so the generated literal is not left unterminated.
            trailing_backslashes = len(piece) - len(piece.rstrip('\\'))
            if trailing_backslashes % 2:
                piece += '\\'
            calls.append(
                f"pyautogui.{func_name}({quote_char}{piece}{quote_char}{extra_args})"
            )
        return '; '.join(calls)

    return re.sub(typing_pattern, split_typing_call, command)
@@ -341,9 +424,13 @@ class DesktopEnv(gym.Env): else: # the set of all possible python commands insides `pyautogui` if type(action) == str: - self.controller.execute_python_command(action) + # Fix PyAutoGUI '<' character bug before execution + fixed_command = _fix_pyautogui_less_than_bug(action) + self.controller.execute_python_command(fixed_command) elif type(action) == dict: - self.controller.execute_python_command(action['command']) + # Fix PyAutoGUI '<' character bug before execution + fixed_command = _fix_pyautogui_less_than_bug(action['command']) + self.controller.execute_python_command(fixed_command) time.sleep(pause) observation = self._get_obs() diff --git a/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json b/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json index 060aaac..a66980b 100644 --- a/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json +++ b/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json @@ -1,7 +1,7 @@ { "id": "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", "snapshot": "gimp", - "instruction": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f/kingbird.jpeg", + "instruction": "Download the image from \"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f/kingbird.jpeg\", and then use GIMP to compress it to under 600KB as \"compressed.jpeg\" on the Desktop. 
Resize if needed.", "source": "", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json b/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json index 349848d..7ec1346 100644 --- a/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json +++ b/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json @@ -15,12 +15,6 @@ ] } }, - { - "type": "open", - "parameters": { - "path": "/home/user/Desktop/rsc-ebook-collection-2023.xlsx" - } - }, { "type": "launch", "parameters": { @@ -41,9 +35,9 @@ } }, { - "type": "activate_window", + "type": "open", "parameters": { - "window_name": "Google Chrome" + "path": "/home/user/Desktop/rsc-ebook-collection-2023.xlsx" } } ], diff --git a/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json b/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json index b74ed3d..063397a 100644 --- a/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json +++ b/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json @@ -28,7 +28,7 @@ { "type": "launch", "parameters": { - "command": "vlc", + "command": "VLC_VERBOSE=-1 vlc --no-audio --no-video-title-show /home/user/Desktop/planet.mp4", "shell": true } } diff --git a/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json b/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json index 654b704..f4fc314 100644 --- a/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json +++ b/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json @@ -1,7 +1,7 @@ { "id": "bc2b57f3-686d-4ec9-87ce-edf850b7e442", "snapshot": "libreoffice_calc", - "instruction": "The requirements of my data analysis assignment are listed in \"reminder.docx\" on the desktop. 
Help me modify my assignment \"asm.xlsx\" saved on the desktop accordingly.", + "instruction": "The requirements of my data analysis assignment are listed in \"reminder.docx\" on the desktop. Help me modify my opened assignment accordingly.", "source": "authors", "config": [ { diff --git a/evaluation_examples/settings/proxy/dataimpulse.json b/evaluation_examples/settings/proxy/dataimpulse.json index 4cd99ac..2f08efe 100644 --- a/evaluation_examples/settings/proxy/dataimpulse.json +++ b/evaluation_examples/settings/proxy/dataimpulse.json @@ -2,8 +2,8 @@ { "host": "gw.dataimpulse.com", "port": 823, - "username": "your_username", - "password": "your_password", + "username": "your_username", + "password": "your_password", "protocol": "http", "provider": "dataimpulse", "type": "residential", diff --git a/evaluation_examples/test.json b/evaluation_examples/test.json new file mode 100644 index 0000000..92a9f36 --- /dev/null +++ b/evaluation_examples/test.json @@ -0,0 +1,12 @@ +{ + "multi_apps": [ + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "22a4636f-8179-4357-8e87-d1743ece1f81", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "78aed49a-a710-4321-a793-b611a7c5b56b", + "0c825995-5b70-4526-b663-113f4c999dd2", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_all_nogdrive.json b/evaluation_examples/test_all_nogdrive.json new file mode 100644 index 0000000..1d06660 --- /dev/null +++ b/evaluation_examples/test_all_nogdrive.json @@ -0,0 +1,383 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "06fe7178-4491-4589-810f-2e2bc9502122", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", + "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", + 
"2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", + "480bcfea-d68f-4aaa-a0a9-2589ef319381", + "af630914-714e-4a24-a7bb-f9af687d3b91", + "3720f614-37fd-4d04-8a6b-76f54f8c222d", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "12086550-11c0-466b-b367-1d9e75b3910e", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9", + "ae78f875-5b98-4907-bbb5-9c737fc68c03", + "3299584d-8f11-4457-bf4c-ce98f7600250", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "9656a811-9b5b-4ddf-99c7-5117bcef0626", + "fc6d8143-9452-4171-9459-7f515143419a", + "a96b564e-dbe9-42c3-9ccf-b4498073438a", + "1704f00f-79e6-43a7-961b-cedd3724d5fd", + "f3b19d1e-2d48-44e9-b4e1-defcae1a0197", + "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a", + "47543840-672a-467d-80df-8f7c3b9788c9", + "c1fa57f3-c3db-4596-8f09-020701085416", + "da46d875-6b82-4681-9284-653b0c7ae241", + "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc", + "f79439ad-3ee8-4f99-a518-0eb60e5652b0", + "b7895e80-f4d1-4648-bee0-4eb45a6f1fa8", + "9f3f70fc-5afc-4958-a7b7-3bb4fcb01805", + "7f52cab9-535c-4835-ac8c-391ee64dc930", + "82279c77-8fc6-46f6-9622-3ba96f61b477", + "2888b4e6-5b47-4b57-8bf5-c73827890774", + "b4f95342-463e-4179-8c3f-193cd7241fb2", + "f5d96daf-83a8-4c86-9686-bada31fc66ab", + "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba", + "368d9ba4-203c-40c1-9fa3-da2f1430ce63", + "59155008-fe71-45ec-8a8f-dc35497b6aa8", + "a728a36e-8bf1-4bb6-9a03-ef039a5233f0", + "b070486d-e161-459b-aa2b-ef442d973b92", + "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217", + "9f935cce-0a9f-435f-8007-817732bfc0a5", + "f0b971a1-6831-4b9b-a50e-22a6e47f45ba", + "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" + ], + "gimp": [ + "7a4deb26-d57d-4ea9-9a73-630f66a7b568", + "554785e9-4523-4e7a-b8e1-8016f565f56a", + "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", + "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", + "d52d6308-ec58-42b7-a2c9-de80e4837b2b", + "2a729ded-3296-423d-aec4-7dd55ed5fbb3", + "b148e375-fe0b-4bec-90e7-38632b0d73c2", + "a746add2-cab0-4740-ac36-c3769d9bfb46", + "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", + 
"d16c99dc-2a1e-46f2-b350-d97c86c85c15", + "06ca5602-62ca-47f6-ad4f-da151cde54cc", + "e2dd0213-26db-4349-abe5-d5667bfd725c", + "f723c744-e62c-4ae6-98d1-750d3cd7d79d", + "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", + "7767eef2-56a3-4cea-8c9f-48c070c7d65b", + "734d6579-c07d-47a8-9ae2-13339795476b", + "e19bd559-633b-4b02-940f-d946248f088e", + "38f48d40-764e-4e77-a7cf-51dfce880291", + "fbb548ca-c2a6-4601-9204-e39a2efc507b", + "5ca86c6f-f317-49d8-b6a7-b527541caae8", + "62f7fd55-0687-4a43-b6e1-3eda16fc6252", + "8ea73f6f-9689-42ad-8c60-195bbf06a7ba", + "58d3eeeb-e9d0-499f-962e-fd0db2a744d8", + "2e6f678f-472d-4c55-99cc-8e7c5c402a71", + "045bf3ff-9077-4b86-b483-a1040a949cff", + "dbbf4b99-2253-4b10-9274-45f246af2466" + ], + "libreoffice_calc": [ + "357ef137-7eeb-4c80-a3bb-0951f26a8aff", + "42e0a640-4f19-4b28-973d-729602b5a4a7", + "51719eea-10bc-4246-a428-ac7c433dd4b3", + "1954cced-e748-45c4-9c26-9855b97fbc5e", + "2bd59342-0664-4ccb-ba87-79379096cc08", + "3aaa4e37-dc91-482e-99af-132a612d40f3", + "1273e544-688f-496b-8d89-3e0f40aa0606", + "12382c62-0cd1-4bf2-bdc8-1d20bf9b2371", + "f9584479-3d0d-4c79-affa-9ad7afdd8850", + "535364ea-05bd-46ea-9937-9f55c68507e8", + "7e429b8d-a3f0-4ed0-9b58-08957d00b127", + "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", + "04d9aeaf-7bed-4024-bedb-e10e6f00eb7f", + "0bf05a7d-b28b-44d2-955a-50b41e24012a", + "6054afcb-5bab-4702-90a0-b259b5d3217c", + "abed40dc-063f-4598-8ba5-9fe749c0615d", + "37608790-6147-45d0-9f20-1137bb35703d", + "26a8440e-c166-4c50-aef4-bfb77314b46b", + "d681960f-7bc3-4286-9913-a8812ba3261a", + "035f41ba-6653-43ab-aa63-c86d449d62e5", + "7efeb4b1-3d19-4762-b163-63328d66303b", + "1de60575-bb6e-4c3d-9e6a-2fa699f9f197", + "aa3a8974-2e85-438b-b29e-a64df44deb4b", + "51b11269-2ca8-4b2a-9163-f21758420e78", + "1e8df695-bd1b-45b3-b557-e7d599cf7597", + "ecb0df7a-4e8d-4a03-b162-053391d3afaf", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", + "a01fbce3-2793-461f-ab86-43680ccbae25", + "0326d92d-d218-48a8-9ca1-981cd6d064c7", + "0a2e43bf-b26c-4631-a966-af9dfa12c9e5", 
+ "4188d3a4-077d-46b7-9c86-23e1a036f6c1", + "347ef137-7eeb-4c80-a3bb-0951f26a8aff", + "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", + "1d17d234-e39d-4ed7-b46f-4417922a4e7c", + "4e6fcf72-daf3-439f-a232-c434ce416af6", + "01b269ae-2111-4a07-81fd-3fcd711993b0", + "21df9241-f8d7-4509-b7f1-37e501a823f7", + "a9f325aa-8c05-4e4f-8341-9e4358565f4f", + "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", + "7a4e4bc8-922c-4c84-865c-25ba34136be1", + "4de54231-e4b5-49e3-b2ba-61a0bec721c0", + "30e3e107-1cfb-46ee-a755-2cd080d7ba6a", + "4172ea6e-6b77-4edb-a9cc-c0014bd1603b", + "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", + "3a7c8185-25c1-4941-bd7b-96e823c9f21f", + "21ab7b40-77c2-4ae6-8321-e00d3a086c73" + ], + "libreoffice_impress": [ + "5d901039-a89c-4bfb-967b-bf66f4df075e", + "550ce7e7-747b-495f-b122-acdc4d0b8e54", + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "af23762e-2bfd-4a1d-aada-20fa8de9ce07", + "c59742c0-4323-4b9d-8a02-723c251deaa0", + "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", + "9ec204e4-f0a3-42f8-8458-b772a6797cab", + "0f84bef9-9790-432e-92b7-eece357603fb", + "ce88f674-ab7a-43da-9201-468d38539e4a", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + "a097acff-6266-4291-9fbd-137af7ecd439", + "bf4e9888-f10f-47af-8dba-76413038b73c", + "21760ecb-8f62-40d2-8d85-0cee5725cb72", + "ac9bb6cb-1888-43ab-81e4-a98a547918cd", + "2cd43775-7085-45d8-89fa-9e35c0a915cf", + "358aa0a7-6677-453f-ae35-e440f004c31e", + "a669ef01-ded5-4099-9ea9-25e99b569840", + "73c99fb9-f828-43ce-b87a-01dc07faa224", + "15aece23-a215-4579-91b4-69eec72e18da", + "986fc832-6af2-417c-8845-9272b3a1528b", + "a434992a-89df-4577-925c-0c58b747f0f4", + "7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8", + "841b50aa-df53-47bd-a73a-22d3a9f73160", + "8979838c-54a5-4454-a2b8-3d135a1a5c8f", + "b8adbc24-cef2-4b15-99d5-ecbe7ff445eb", + "2b94c692-6abb-48ae-ab0b-b3e8a19cb340", + "9cf05d24-6bd9-4dae-8967-f67d88f5d38a", + "08aced46-45a2-48d7-993b-ed3fb5b32302", + "edb61b14-a854-4bf5-a075-c8075c11293a", + 
"c82632a4-56b6-4db4-9dd1-3820ee3388e4", + "39be0d19-634d-4475-8768-09c130f5425d", + "ac1b39ff-ee4d-4483-abce-c117e98942f0", + "f23acfd2-c485-4b7c-a1e7-d4303ddfe864", + "70bca0cc-c117-427e-b0be-4df7299ebeb6", + "af2d657a-e6b3-4c6a-9f67-9e3ed015974c", + "57667013-ea97-417c-9dce-2713091e6e2a", + "0a211154-fda0-48d0-9274-eaac4ce5486d", + "a53f80cd-4a90-4490-8310-097b011433f6", + "7ae48c60-f143-4119-b659-15b8f485eb9a", + "5cfb9197-e72b-454b-900e-c06b0c802b40", + "05dd4c1d-c489-4c85-8389-a7836c4f0567", + "5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1", + "4ed5abd0-8b5d-47bd-839f-cacfa15ca37a", + "e4ef0baf-4b52-4590-a47e-d4d464cca2d7", + "ed43c15f-00cb-4054-9c95-62c880865d68", + "3161d64e-3120-47b4-aaad-6a764a92493b", + "04578141-1d42-4146-b9cf-6fab4ce5fd74" + ], + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c", + "0b17a146-2934-46c7-8727-73ff6b6483e8", + "0e47de2a-32e0-456c-a366-8c607ef7a9d2", + "0e763496-b6bb-4508-a427-fad0b6c3e195", + "3ef2b351-8a84-4ff2-8724-d86eae9b842e", + "4bcb1253-a636-4df4-8cb0-a35c04dfef31", + "66399b0d-8fda-4618-95c4-bfc6191617e9", + "6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2", + "6ada715d-3aae-4a32-a6a7-429b2e43fb93", + "6f81754e-285d-4ce0-b59e-af7edb02d108", + "72b810ef-4156-4d09-8f08-a0cf57e7cefe", + "8472fece-c7dd-4241-8d65-9b3cd1a0b568", + "88fe4b2d-3040-4c70-9a70-546a47764b48", + "936321ce-5236-426a-9a20-e0e3c5dc536f", + "adf5e2c3-64c7-4644-b7b6-d2f0167927e7", + "b21acd93-60fd-4127-8a43-2f5178f4a830", + "d53ff5ee-3b1a-431e-b2be-30ed2673079b", + "e246f6d8-78d7-44ac-b668-fcf47946cb50", + "e528b65e-1107-4b8c-8988-490e4fece599", + "ecc2413d-8a48-416e-a3a2-d30106ca36cb", + "f178a4a9-d090-4b56-bc4c-4b72a61a035d", + "bb8ccc78-479f-4a2f-a71e-d565e439436b" + ], + "multi_apps": [ + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", + "2fe4b718-3bd7-46ec-bdce-b184f5653624", + "3680a5ee-6870-426a-a997-eba929a0d25c", + "510f64c8-9bcc-4be1-8d30-638705850618", + 
"51f5801c-18b3-4f25-b0c3-02f85507a078", + "58565672-7bfe-48ab-b828-db349231de6b", + "937087b6-f668-4ba6-9110-60682ee33441", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "d9b7c649-c975-4f53-88f5-940b29c47247", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "ee9a3c83-f437-4879-8918-be5efbb9fac7", + "f7dfbef3-7697-431c-883a-db8583a4e4f9", + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", + "6d72aad6-187a-4392-a4c4-ed87269c51cf", + "f918266a-b3e0-4914-865d-4faa564f1aef", + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "bc2b57f3-686d-4ec9-87ce-edf850b7e442", + "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "00fa164e-2612-4439-992e-157d019a8436", + "acb0f96b-e27c-44d8-b55f-7cb76609dfcd", + "69acbb55-d945-4927-a87b-8480e1a5bb7e", + "48d05431-6cd5-4e76-82eb-12b60d823f7d", + "68a25bd4-59c7-4f4d-975e-da0c8509c848", + "eb303e01-261e-4972-8c07-c9b4e7a4922a", + "c7c1e4c3-9e92-4eba-a4b8-689953975ea4", + "d1acdb87-bb67-4f30-84aa-990e56a09c92", + "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", + "8e116af7-7db7-4e35-a68b-b0939c066c78", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "82e3c869-49f6-4305-a7ce-f3e64a0618e7", + "185f29bd-5da0-40a6-b69c-ba7f4e0324ef", + "869de13e-bef9-4b91-ba51-f6708c40b096", + "2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e", + "3a93cae4-ad3e-403e-8c12-65303b271818", + "1f18aa87-af6f-41ef-9853-cdb8f32ebdea", + "26150609-0da3-4a7d-8868-0faf9c5f01bb", + "9219480b-3aed-47fc-8bac-d2cffc5849f7", + "881deb30-9549-4583-a841-8270c65f2a17", + "7e287123-70ca-47b9-8521-47db09b69b14", + "e2392362-125e-4f76-a2ee-524b183a3412", + "5bc63fb9-276a-4439-a7c1-9dc76401737f", + "26660ad1-6ebb-4f59-8cba-a8432dfe8d38", + "a82b78bb-7fde-4cb3-94a4-035baf10bcf0", + "36037439-2044-4b50-b9d1-875b5a332143", + "716a6079-22da-47f1-ba73-c9d58f986a38", + "873cafdd-a581-47f6-8b33-b9696ddb7b05", + "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", + "6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a", + "da922383-bfa4-4cd3-bbad-6bebab3d7742", + "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", + 
"81c425f5-78f3-4771-afd6-3d2973825947", + "bb83cab4-e5c7-42c7-a67b-e46068032b86", + "227d2f97-562b-4ccb-ae47-a5ec9e142fbb", + "b337d106-053f-4d37-8da0-7f9c4043a66b", + "20236825-b5df-46e7-89bf-62e1d640a897", + "8df7e444-8e06-4f93-8a1a-c5c974269d82", + "aad10cd7-9337-4b62-b704-a857848cedf2", + "02ce9a50-7af2-47ed-8596-af0c230501f8", + "4c26e3f3-3a14-4d86-b44a-d3cedebbb487", + "a503b07f-9119-456b-b75d-f5146737d24f", + "09a37c51-e625-49f4-a514-20a773797a8a", + "3e3fc409-bff3-4905-bf16-c968eee3f807", + "f5c13cdd-205c-4719-a562-348ae5cd1d91", + "5990457f-2adb-467b-a4af-5c857c92d762", + "415ef462-bed3-493a-ac36-ca8c6d23bf1b", + "7ff48d5b-2df2-49da-b500-a5150ffc7f18", + "9f3bb592-209d-43bc-bb47-d77d9df56504", + "dd60633f-2c72-42ba-8547-6f2c8cb0fdb0", + "ce2b64a2-ddc1-4f91-8c7d-a88be7121aac", + "3f05f3b9-29ba-4b6b-95aa-2204697ffc06", + "e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56", + "f8369178-fafe-40c2-adc4-b9b08a125456", + "778efd0a-153f-4842-9214-f05fc176b877", + "47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5", + "c2751594-0cd5-4088-be1b-b5f2f9ec97c4", + "788b3701-3ec9-4b67-b679-418bfa726c22", + "48c46dc7-fe04-4505-ade7-723cba1aa6f6", + "42d25c08-fb87-4927-8b65-93631280a26f", + "e8172110-ec08-421b-a6f5-842e6451911f", + "42f4d1c7-4521-4161-b646-0a8934e36081", + "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", + "d68204bf-11c1-4b13-b48b-d303c73d4bf6", + "91190194-f406-4cd6-b3f9-c43fac942b22", + "7f35355e-02a6-45b5-b140-f0be698bcf85", + "98e8e339-5f91-4ed2-b2b2-12647cb134f4", + "0e5303d4-8820-42f6-b18d-daf7e633de21", + "df67aebb-fb3a-44fd-b75b-51b6012df509", + "5df7b33a-9f77-4101-823e-02f863e1c1ae", + "aceb0368-56b8-4073-b70e-3dc9aee184e0", + "236833a3-5704-47fc-888c-4f298f09f799", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ], + "os": [ + "94d95f96-9699-4208-98ba-3c3119edf9c2", + "bedcedc4-4d72-425e-ad62-21960b11fe0d", + "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", + "a462a795-fdc7-4b23-b689-e8b6df786b78", + "f9be0997-4b7c-45c5-b05c-4612b44a6118", + "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", + 
"5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", + "e0df059f-28a6-4169-924f-b9623e7184cc", + "b6781586-6346-41cd-935a-a6b1487918fc", + "b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa", + "3ce045a0-877b-42aa-8d2c-b4a863336ab8", + "fe41f596-a71b-4c2f-9b2f-9dcd40b568c3", + "a4d98375-215b-4a4d-aee9-3d4370fccc41", + "13584542-872b-42d8-b299-866967b5c3ef", + "23393935-50c7-4a86-aeea-2b78fd089c5c", + "5812b315-e7bd-4265-b51f-863c02174c28", + "c288e301-e626-4b98-a1ab-159dcb162af5", + "4783cc41-c03c-4e1b-89b4-50658f642bd5", + "5c1075ca-bb34-46a3-a7a0-029bd7463e79", + "5ced85fc-fa1a-4217-95fd-0fb530545ce2", + "37887e8c-da15-4192-923c-08fa390a176d", + "4127319a-8b79-4410-b58a-7a151e15f3d7", + "4d117223-a354-47fb-8b45-62ab1390a95f", + "6f56bf42-85b8-4fbb-8e06-6c44960184ba" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e", + "7b1e1ff9-bb85-49be-b01d-d6424be18cd0", + "9bc3cc16-074a-45ac-9bdc-b2a362e1daf3", + "3f28fe4f-5d9d-4994-a456-efd78cfae1a3", + "5203d847-2572-4150-912a-03f062254390", + "dd84e895-72fd-4023-a336-97689ded257c", + "9b7bc335-06b5-4cd3-9119-1a649c478509", + "d38192b0-17dc-4e1d-99c3-786d0117de77", + "a10b69e1-6034-4a2b-93e1-571d45194f75", + "3f49d2cc-f400-4e7d-90cc-9b18e401cc31", + "f201fbc3-44e6-46fc-bcaa-432f9815454c", + "10a730d5-d414-4b40-b479-684bed1ae522", + "a1af9f1c-50d5-4bc3-a51e-4d9b425ff638", + "08c73485-7c6d-4681-999d-919f5c32dcfa" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + "8f080098-ddb1-424c-b438-4e96e5e4786e", + "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + "fba2c100-79e8-42df-ae74-b592418d54f4", + "efcf0d81-0835-4880-b2fd-d866e8bc2294", + "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", + "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", + "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", + "9195653c-f4aa-453d-aa95-787f6ccfaae9", + "d06f0d4d-2cd5-4ede-8de9-598629438c6e", + "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", + "5ac2891a-eacd-4954-b339-98abba077adb", + 
"f3977615-2b45-4ac5-8bba-80c17dbe2a37", + "215dfd39-f493-4bc3-a027-8a97d72c61bf", + "cb130f0d-d36f-4302-9838-b3baf46139b6", + "7882ed6e-bece-4bf0-bada-c32dc1ddae72" + ], + "vs_code": [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "eabc805a-bfcf-4460-b250-ac92135819f6", + "982d12a5-beab-424f-8d38-d2a48429e511", + "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "9439a27b-18ae-42d8-9778-5f68f891805e", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "930fdb3b-11a8-46fe-9bac-577332e2640e", + "276cc624-87ea-4f08-ab93-f770e3790175", + "9d425400-e9b2-4424-9a4b-d4c7abac4140", + "5e2d93d8-8ad0-4435-b150-1692aacaa994", + "6ed0a554-cbee-4b44-84ea-fd6c042f4fe1", + "ec71221e-ac43-46f9-89b8-ee7d80f7e1c5", + "70745df8-f2f5-42bd-8074-fbc10334fcc5", + "57242fad-77ca-454f-b71b-f187181a9f23", + "c6bf789c-ba3a-4209-971d-b63abf0ab733", + "0512bb38-d531-4acf-9e7e-0add90816068", + "847a96b6-df94-4927-97e6-8cc9ea66ced7", + "7aeae0e2-70ee-4705-821d-1bba5d5b2ddd", + "dcbe20e8-647f-4f1d-8696-f1c5bbb570e3", + "7c4cc09e-7a92-40dd-8338-b2286535c4ed", + "971cbb5b-3cbf-4ff7-9e24-b5c84fcebfa6" + ] +} \ No newline at end of file diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index e615308..0afe61e 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -33,7 +33,7 @@ class_ns_windows = "https://accessibility.windows.example.org/ns/class" import ast from typing import Dict, Any, Optional, Union -OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{CLIENT_PASSWORD}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is 
\"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ +OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{CLIENT_PASSWORD}\", feel free to use it when you need sudo rights.\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. 
Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ class Action: """Action class for the agent.""" @@ -753,6 +753,7 @@ class OpenAICUAAgent: # Convert the action to an Action object step_action = Action(action.get("action", ""), self.action_space) # Execute the action in the environment + print(f"Executing action: {step_action.get_action()}") obs, reward, terminated, info = self.env.step(step_action.get_action()) screenshot_base64 = encode_image(obs["screenshot"]) diff --git a/monitor/.env b/monitor/.env index 1969ef7..78fb5e8 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,13 +2,13 @@ # Do not write any secret keys or sensitive information here. # Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_all.json +TASK_CONFIG_PATH=../evaluation_examples/test.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results +RESULTS_BASE_PATH=../results_operator_full_test_0713_gdrive2 ACTION_SPACE=pyautogui OBSERVATION_TYPE=screenshot MODEL_NAME=computer-use-preview -MAX_STEPS=150 +MAX_STEPS=100 FLASK_PORT=80 FLASK_HOST=0.0.0.0 -FLASK_DEBUG=false \ No newline at end of file +FLASK_DEBUG=false diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py index c4eb18c..34db923 100644 --- a/run_multienv_openaicua.py +++ b/run_multienv_openaicua.py @@ -11,6 +11,7 @@ from typing import List, Dict import math from tqdm import tqdm from multiprocessing import Process, Manager +from multiprocessing import current_process import lib_run_single from desktop_env.desktop_env import DesktopEnv from mm_agents.openai_cua_agent import OpenAICUAAgent @@ -130,32 +131,12 @@ logger.addHandler(stdout_handler) logger = logging.getLogger("desktopenv.experiment") -def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]: - """Distribute tasks evenly across environments.""" - # Flatten 
the tasks into a single list +def distribute_tasks(test_all_meta: dict) -> List[tuple]: all_tasks = [] for domain, examples in test_all_meta.items(): for example_id in examples: all_tasks.append((domain, example_id)) - - # Calculate tasks per environment - tasks_per_env = math.ceil(len(all_tasks) / num_envs) - - # Distribute tasks - distributed_tasks = [] - for i in range(num_envs): - env_tasks = {} - start_idx = i * tasks_per_env - end_idx = min((i + 1) * tasks_per_env, len(all_tasks)) - - for domain, example_id in all_tasks[start_idx:end_idx]: - if domain not in env_tasks: - env_tasks[domain] = [] - env_tasks[domain].append(example_id) - - distributed_tasks.append(env_tasks) - - return distributed_tasks + return all_tasks def process_signal_handler(signum, frame, env_idx): @@ -180,63 +161,58 @@ def process_signal_handler(signum, frame, env_idx): sys.exit(0) -def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, shared_scores: list): - """Run tasks for a single environment.""" - # Each process has its own list of active environments +def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: list): active_environments = [] env = None - - # Setup signal handlers for this process too - signal.signal(signal.SIGINT, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) - signal.signal(signal.SIGTERM, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) - - from desktop_env.providers.aws.manager import IMAGE_ID_MAP - REGION = args.region - screen_size = (args.screen_width, args.screen_height) - ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) - env = DesktopEnv( - path_to_vm=args.path_to_vm, - action_space=args.action_space, - provider_name=args.provider_name, - region=REGION, - snapshot_name=ami_id, - screen_size=screen_size, - headless=args.headless, - os_type="Ubuntu", - require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], - 
enable_proxy=True, - client_password=args.client_password - ) - active_environments.append(env) - agent = OpenAICUAAgent( - env=env, - model=args.model, - max_tokens=args.max_tokens, - top_p=args.top_p, - temperature=args.temperature, - action_space=args.action_space, - observation_type=args.observation_type, - max_trajectory_length=args.max_trajectory_length, - client_password=args.client_password, - provider_name=args.provider_name, - screen_width=args.screen_width, - screen_height=args.screen_height - ) - logger.info(f"Executing tasks in environment {env_idx + 1}/{args.num_envs}") - try: - for domain in tqdm(env_tasks, desc=f"Env{env_idx+1}-Domain"): - for example_id in tqdm(env_tasks[domain], desc="Example", leave=False): + from desktop_env.providers.aws.manager import IMAGE_ID_MAP + REGION = args.region + screen_size = (args.screen_width, args.screen_height) + ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) + env = DesktopEnv( + path_to_vm=args.path_to_vm, + action_space=args.action_space, + provider_name=args.provider_name, + region=REGION, + snapshot_name=ami_id, + screen_size=screen_size, + headless=args.headless, + os_type="Ubuntu", + require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], + enable_proxy=True, + client_password=args.client_password + ) + active_environments.append(env) + agent = OpenAICUAAgent( + env=env, + model=args.model, + max_tokens=args.max_tokens, + top_p=args.top_p, + temperature=args.temperature, + action_space=args.action_space, + observation_type=args.observation_type, + max_trajectory_length=args.max_trajectory_length, + client_password=args.client_password, + provider_name=args.provider_name, + screen_width=args.screen_width, + screen_height=args.screen_height + ) + logger.info(f"Process {current_process().name} started.") + while True: + try: + item = task_queue.get(timeout=5) + except Exception: + break + domain, example_id = item + try: config_file = 
os.path.join( args.test_config_base_dir, f"examples/{domain}/{example_id}.json" ) with open(config_file, "r", encoding="utf-8") as f: example = json.load(f) - - logger.info(f"[Env {env_idx+1}][Domain]: {domain}") - logger.info(f"[Env {env_idx+1}][Example ID]: {example_id}") - logger.info(f"[Env {env_idx+1}][Instruction]: {example['instruction']}") - + logger.info(f"[{current_process().name}][Domain]: {domain}") + logger.info(f"[{current_process().name}][Example ID]: {example_id}") + logger.info(f"[{current_process().name}][Instruction]: {example['instruction']}") example_result_dir = os.path.join( args.result_dir, args.action_space, @@ -246,7 +222,6 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share example_id, ) os.makedirs(example_result_dir, exist_ok=True) - try: lib_run_single.run_single_example_openaicua( agent, @@ -260,7 +235,7 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share ) except Exception as e: import traceback - logger.error(f"Exception in Env{env_idx+1} {domain}/{example_id}: {e}") + logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}") logger.error(traceback.format_exc()) try: env.controller.end_recording( @@ -268,7 +243,6 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share ) except Exception as rec_e: logger.error(f"Failed to end recording: {rec_e}") - with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: f.write( json.dumps( @@ -276,14 +250,22 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share ) ) f.write("\n") + except Exception as e: + logger.error(f"Task-level error in {current_process().name}: {e}") + import traceback + logger.error(traceback.format_exc()) + except Exception as e: + logger.error(f"Process-level error in {current_process().name}: {e}") + import traceback + logger.error(traceback.format_exc()) finally: - # This ensures the environment is closed even if 
there's an exception - logger.info(f"Process {env_idx + 1} cleaning up environment...") + logger.info(f"{current_process().name} cleaning up environment...") try: - env.close() - logger.info(f"Process {env_idx + 1} environment closed successfully") + if env: + env.close() + logger.info(f"{current_process().name} environment closed successfully") except Exception as e: - logger.error(f"Process {env_idx + 1} error during environment cleanup: {e}") + logger.error(f"{current_process().name} error during environment cleanup: {e}") def signal_handler(signum, frame): @@ -323,8 +305,8 @@ def signal_handler(signum, frame): if p.is_alive(): try: logger.info(f"Forcefully terminating process {p.name}...") - import signal - os.kill(p.pid, signal.SIGKILL) + import signal as sig + os.kill(p.pid, sig.SIGKILL) except Exception as e: logger.error(f"Error forcefully terminating process: {e}") @@ -335,38 +317,56 @@ def signal_handler(signum, frame): def test(args: argparse.Namespace, test_all_meta: dict) -> None: global processes logger.info("Args: %s", args) - - distributed_tasks = distribute_tasks(test_all_meta, args.num_envs) - - logger.info("All environments are ready. 
Starting parallel task execution...") - - # Create a shared list for scores across processes + all_tasks = distribute_tasks(test_all_meta) + logger.info(f"Total tasks: {len(all_tasks)}") with Manager() as manager: shared_scores = manager.list() - - # Create and start processes for each environment + task_queue = manager.Queue() + for item in all_tasks: + task_queue.put(item) + num_envs = args.num_envs processes = [] - for env_idx, env_tasks in enumerate(distributed_tasks): + for i in range(num_envs): p = Process( target=run_env_tasks, - args=(env_idx, env_tasks, args, shared_scores) + args=(task_queue, args, shared_scores), + name=f"EnvProcess-{i+1}" ) - processes.append(p) + p.daemon = True p.start() + processes.append(p) logger.info(f"Started process {p.name} with PID {p.pid}") - try: - # Wait for all processes to complete + while True: + alive_count = 0 + for idx, p in enumerate(processes): + if not p.is_alive(): + logger.warning(f"Process {p.name} died, restarting...") + new_p = Process( + target=run_env_tasks, + args=(task_queue, args, shared_scores), + name=f"EnvProcess-Restart-{idx+1}" + ) + new_p.daemon = True + new_p.start() + processes[idx] = new_p + logger.info(f"Restarted process {new_p.name} with PID {new_p.pid}") + else: + alive_count += 1 + if task_queue.empty(): + logger.info("All tasks finished.") + break + if alive_count == 0: + logger.error("All processes died, exiting.") + break + time.sleep(5) for p in processes: p.join() - logger.info(f"Process {p.name} completed") except KeyboardInterrupt: logger.info("Main process received KeyboardInterrupt. 
Initiating graceful shutdown...") - # Let the signal handler do the cleanup raise except Exception as e: logger.error(f"Unexpected error while waiting for processes: {e}", exc_info=True) - # Ensure cleanup happens for p in processes: if p.is_alive(): try: @@ -375,10 +375,7 @@ def test(args: argparse.Namespace, test_all_meta: dict) -> None: except Exception as term_e: logger.error(f"Error terminating process {p.name}: {term_e}") raise - - # Convert shared list to regular list scores = list(shared_scores) - logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}") diff --git a/run_multienv_openaicua_old.py b/run_multienv_openaicua_old.py new file mode 100644 index 0000000..c4eb18c --- /dev/null +++ b/run_multienv_openaicua_old.py @@ -0,0 +1,533 @@ +from __future__ import annotations +import argparse +import datetime +import json +import logging +import os +import sys +import signal +import time +from typing import List, Dict +import math +from tqdm import tqdm +from multiprocessing import Process, Manager +import lib_run_single +from desktop_env.desktop_env import DesktopEnv +from mm_agents.openai_cua_agent import OpenAICUAAgent + +# Global variables for signal handling +active_environments = [] +processes = [] +is_terminating = False + +# import wandb + +# load the environment variables from .env file +if os.path.exists(".env"): + from dotenv import load_dotenv + load_dotenv() + +# Logger Configs {{{ # +def config() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run end-to-end evaluation on the benchmark" + ) + + # environment config + parser.add_argument("--path_to_vm", type=str, default=None) + parser.add_argument( + "--headless", action="store_true", help="Run in headless machine" + ) + parser.add_argument( + "--action_space", type=str, default="pyautogui", help="Action type" + ) + parser.add_argument( + "--observation_type", + choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"], + default="screenshot", 
+ help="Observation type", + ) + parser.add_argument("--sleep_after_execution", type=float, default=0.0) + parser.add_argument("--max_steps", type=int, default=15) + + # agent config + parser.add_argument("--max_trajectory_length", type=int, default=3) + parser.add_argument( + "--test_config_base_dir", type=str, default="evaluation_examples" + ) + + # lm config + parser.add_argument("--model", type=str, default="gpt-4o") + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--top_p", type=float, default=0.9) + parser.add_argument("--max_tokens", type=int, default=1500) + parser.add_argument("--stop_token", type=str, default=None) + + # example config + parser.add_argument("--domain", type=str, default="all") + parser.add_argument( + "--test_all_meta_path", type=str, default="evaluation_examples/test_all.json" + ) + + # logging related + parser.add_argument("--result_dir", type=str, default="./results") + parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel") + parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + default='INFO', help="Set the logging level") + # aws config + parser.add_argument( + "--region", type=str, default="us-east-1", help="AWS region for the VM" + ) + parser.add_argument( + "--provider_name", type=str, default="aws", choices=["aws", "virtualbox", "vmware", "docker", "azure"], help="Provider name" + ) + parser.add_argument( + "--client_password", type=str, default="", help="Client password" + ) + parser.add_argument( + "--screen_width", type=int, default=1920, help="Screen width" + ) + parser.add_argument( + "--screen_height", type=int, default=1080, help="Screen height" + ) + args = parser.parse_args() + return args + +args = config() # Get command line arguments first + +logger = logging.getLogger() +log_level = getattr(logging, args.log_level.upper()) +logger.setLevel(log_level) + +datetime_str: str = 
datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler( + os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" +) +debug_handler = logging.FileHandler( + os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" +) +stdout_handler = logging.StreamHandler(sys.stdout) + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(log_level) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" +) +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + + +def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]: + """Distribute tasks evenly across environments.""" + # Flatten the tasks into a single list + all_tasks = [] + for domain, examples in test_all_meta.items(): + for example_id in examples: + all_tasks.append((domain, example_id)) + + # Calculate tasks per environment + tasks_per_env = math.ceil(len(all_tasks) / num_envs) + + # Distribute tasks + distributed_tasks = [] + for i in range(num_envs): + env_tasks = {} + start_idx = i * tasks_per_env + end_idx = min((i + 1) * tasks_per_env, len(all_tasks)) + + for domain, example_id in all_tasks[start_idx:end_idx]: + if domain not in env_tasks: + env_tasks[domain] = [] + env_tasks[domain].append(example_id) + + distributed_tasks.append(env_tasks) + + return distributed_tasks + + +def process_signal_handler(signum, frame, env_idx): + """Signal handler for child processes to gracefully shut down their environments.""" + logger.info(f"Process {env_idx + 1} received signal {signum}. 
Shutting down...") + + # Get the active_environments from the caller's frame + local_vars = frame.f_locals + active_environments = local_vars.get('active_environments', []) + + # Close environment in the current process context + for env in active_environments: + if env is not None: + try: + logger.info(f"Process {env_idx + 1} closing environment...") + env.close() + logger.info(f"Process {env_idx + 1} environment closed successfully") + except Exception as e: + logger.error(f"Process {env_idx + 1} error closing environment: {e}") + + logger.info(f"Process {env_idx + 1} shutdown complete. Exiting.") + sys.exit(0) + + +def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, shared_scores: list): + """Run tasks for a single environment.""" + # Each process has its own list of active environments + active_environments = [] + env = None + + # Setup signal handlers for this process too + signal.signal(signal.SIGINT, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) + signal.signal(signal.SIGTERM, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) + + from desktop_env.providers.aws.manager import IMAGE_ID_MAP + REGION = args.region + screen_size = (args.screen_width, args.screen_height) + ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) + env = DesktopEnv( + path_to_vm=args.path_to_vm, + action_space=args.action_space, + provider_name=args.provider_name, + region=REGION, + snapshot_name=ami_id, + screen_size=screen_size, + headless=args.headless, + os_type="Ubuntu", + require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], + enable_proxy=True, + client_password=args.client_password + ) + active_environments.append(env) + agent = OpenAICUAAgent( + env=env, + model=args.model, + max_tokens=args.max_tokens, + top_p=args.top_p, + temperature=args.temperature, + action_space=args.action_space, + observation_type=args.observation_type, + 
max_trajectory_length=args.max_trajectory_length, + client_password=args.client_password, + provider_name=args.provider_name, + screen_width=args.screen_width, + screen_height=args.screen_height + ) + logger.info(f"Executing tasks in environment {env_idx + 1}/{args.num_envs}") + + try: + for domain in tqdm(env_tasks, desc=f"Env{env_idx+1}-Domain"): + for example_id in tqdm(env_tasks[domain], desc="Example", leave=False): + config_file = os.path.join( + args.test_config_base_dir, f"examples/{domain}/{example_id}.json" + ) + with open(config_file, "r", encoding="utf-8") as f: + example = json.load(f) + + logger.info(f"[Env {env_idx+1}][Domain]: {domain}") + logger.info(f"[Env {env_idx+1}][Example ID]: {example_id}") + logger.info(f"[Env {env_idx+1}][Instruction]: {example['instruction']}") + + example_result_dir = os.path.join( + args.result_dir, + args.action_space, + args.observation_type, + args.model, + domain, + example_id, + ) + os.makedirs(example_result_dir, exist_ok=True) + + try: + lib_run_single.run_single_example_openaicua( + agent, + env, + example, + args.max_steps, + example["instruction"], + args, + example_result_dir, + shared_scores, + ) + except Exception as e: + import traceback + logger.error(f"Exception in Env{env_idx+1} {domain}/{example_id}: {e}") + logger.error(traceback.format_exc()) + try: + env.controller.end_recording( + os.path.join(example_result_dir, "recording.mp4") + ) + except Exception as rec_e: + logger.error(f"Failed to end recording: {rec_e}") + + with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: + f.write( + json.dumps( + {"Error": f"{domain}/{example_id} - {e}"} + ) + ) + f.write("\n") + finally: + # This ensures the environment is closed even if there's an exception + logger.info(f"Process {env_idx + 1} cleaning up environment...") + try: + env.close() + logger.info(f"Process {env_idx + 1} environment closed successfully") + except Exception as e: + logger.error(f"Process {env_idx + 1} error during 
environment cleanup: {e}") + + +def signal_handler(signum, frame): + """Handle termination signals (SIGINT, SIGTERM) to gracefully shutdown environments.""" + global is_terminating, active_environments, processes + + # Avoid duplicate handling + if is_terminating: + return + + is_terminating = True + logger.info(f"Received signal {signum}. Gracefully shutting down...") + + # Close all registered environments in the main process + for env in active_environments: + try: + logger.info(f"Closing environment...") + env.close() + logger.info(f"Environment closed successfully") + except Exception as e: + logger.error(f"Error closing environment: {e}") + + # Send termination signal to all child processes first + for p in processes: + if p.is_alive(): + try: + logger.info(f"Sending termination signal to process {p.name}...") + p.terminate() + except Exception as e: + logger.error(f"Error sending termination signal to process: {e}") + + # Allow a short time for processes to handle their own cleanup + time.sleep(1) + + # Forcefully terminate any processes that didn't exit + for p in processes: + if p.is_alive(): + try: + logger.info(f"Forcefully terminating process {p.name}...") + import signal + os.kill(p.pid, signal.SIGKILL) + except Exception as e: + logger.error(f"Error forcefully terminating process: {e}") + + logger.info("Shutdown complete. Exiting.") + sys.exit(0) + + +def test(args: argparse.Namespace, test_all_meta: dict) -> None: + global processes + logger.info("Args: %s", args) + + distributed_tasks = distribute_tasks(test_all_meta, args.num_envs) + + logger.info("All environments are ready. 
Starting parallel task execution...") + + # Create a shared list for scores across processes + with Manager() as manager: + shared_scores = manager.list() + + # Create and start processes for each environment + processes = [] + for env_idx, env_tasks in enumerate(distributed_tasks): + p = Process( + target=run_env_tasks, + args=(env_idx, env_tasks, args, shared_scores) + ) + processes.append(p) + p.start() + logger.info(f"Started process {p.name} with PID {p.pid}") + + try: + # Wait for all processes to complete + for p in processes: + p.join() + logger.info(f"Process {p.name} completed") + except KeyboardInterrupt: + logger.info("Main process received KeyboardInterrupt. Initiating graceful shutdown...") + # Let the signal handler do the cleanup + raise + except Exception as e: + logger.error(f"Unexpected error while waiting for processes: {e}", exc_info=True) + # Ensure cleanup happens + for p in processes: + if p.is_alive(): + try: + logger.info(f"Terminating process {p.name} due to error...") + p.terminate() + except Exception as term_e: + logger.error(f"Error terminating process {p.name}: {term_e}") + raise + + # Convert shared list to regular list + scores = list(shared_scores) + + logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}") + + +def get_unfinished( + action_space, use_model, observation_type, result_dir, total_file_json +): + target_dir = os.path.join(result_dir, action_space, observation_type, use_model) + + if not os.path.exists(target_dir): + return total_file_json + + finished = {} + for domain in os.listdir(target_dir): + finished[domain] = [] + domain_path = os.path.join(target_dir, domain) + if os.path.isdir(domain_path): + for example_id in os.listdir(domain_path): + if example_id == "onboard": + continue + example_path = os.path.join(domain_path, example_id) + if os.path.isdir(example_path): + if "result.txt" not in os.listdir(example_path): + # empty all files under example_id + for file in os.listdir(example_path): + 
os.remove(os.path.join(example_path, file)) + else: + finished[domain].append(example_id) + + if not finished: + return total_file_json + + for domain, examples in finished.items(): + if domain in total_file_json: + total_file_json[domain] = [ + x for x in total_file_json[domain] if x not in examples + ] + + return total_file_json + + +def get_result(action_space, use_model, observation_type, result_dir, total_file_json): + target_dir = os.path.join(result_dir, action_space, observation_type, use_model) + if not os.path.exists(target_dir): + print("New experiment, no result yet.") + return None + + all_result = [] + + for domain in os.listdir(target_dir): + domain_path = os.path.join(target_dir, domain) + if os.path.isdir(domain_path): + for example_id in os.listdir(domain_path): + example_path = os.path.join(domain_path, example_id) + if os.path.isdir(example_path): + if "result.txt" in os.listdir(example_path): + # empty all files under example_id + try: + all_result.append( + float( + open( + os.path.join(example_path, "result.txt"), "r" + ).read() + ) + ) + except: + all_result.append(0.0) + + if not all_result: + print("New experiment, no result yet.") + return None + else: + print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%") + return all_result + + +if __name__ == "__main__": + ####### The complete version of the list of examples ####### + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + # Register signal handlers for graceful termination + signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C + signal.signal(signal.SIGTERM, signal_handler) # Handle termination signal + + try: + args = config() + + with open(args.test_all_meta_path, "r", encoding="utf-8") as f: + test_all_meta = json.load(f) + + if args.domain != "all": + test_all_meta = {args.domain: test_all_meta[args.domain]} + + test_file_list = get_unfinished( + args.action_space, + args.model, + args.observation_type, + args.result_dir, + test_all_meta, + ) + left_info 
= "" + for domain in test_file_list: + left_info += f"{domain}: {len(test_file_list[domain])}\n" + logger.info(f"Left tasks:\n{left_info}") + + get_result( + args.action_space, + args.model, + args.observation_type, + args.result_dir, + test_all_meta, + ) + test(args, test_file_list) + except KeyboardInterrupt: + logger.info("Main process received KeyboardInterrupt.") + # Signal handler will take care of cleanup + except Exception as e: + logger.error(f"Unexpected error in main process: {e}", exc_info=True) + # Also trigger cleanup for unhandled exceptions + signal_handler(signal.SIGTERM, None) + finally: + # Final cleanup in case any environments or processes remain + logger.info("Main process final cleanup...") + for env in active_environments: + if env is not None: + try: + logger.info(f"Closing environment in final cleanup...") + env.close() + logger.info(f"Environment closed successfully in final cleanup") + except Exception as e: + logger.error(f"Error during final environment cleanup: {e}") + + # First try gentle termination + for p in processes: + if p is not None and p.is_alive(): + try: + logger.info(f"Terminating process {p.name}...") + p.terminate() + except Exception as e: + logger.error(f"Error terminating process: {e}") + + # Wait a moment for processes to terminate + time.sleep(1) + + # Then force kill if needed + for p in processes: + if p is not None and p.is_alive(): + try: + logger.info(f"Force killing process {p.name}...") + os.kill(p.pid, signal.SIGKILL) + logger.info(f"Process {p.name} force killed") + except Exception as e: + logger.error(f"Error force killing process: {e}") diff --git a/run_operator.sh b/run_operator.sh new file mode 100644 index 0000000..154df38 --- /dev/null +++ b/run_operator.sh @@ -0,0 +1,9 @@ +python run_multienv_openaicua.py \ +--headless \ +--observation_type screenshot \ +--model computer-use-preview \ +--result_dir ./results_operator_full_test_0713 \ +--test_all_meta_path evaluation_examples/test_all.json \ 
+--max_steps 100 \ +--num_envs 15 \ +--provider_name aws \ No newline at end of file diff --git a/run_operator_fix.sh b/run_operator_fix.sh new file mode 100644 index 0000000..e666803 --- /dev/null +++ b/run_operator_fix.sh @@ -0,0 +1,9 @@ +python run_multienv_openaicua.py \ +--headless \ +--observation_type screenshot \ +--model computer-use-preview \ +--result_dir ./results_operator_full_test_0713_gdrive2 \ +--test_all_meta_path evaluation_examples/test.json \ +--max_steps 100 \ +--num_envs 10 \ +--provider_name aws \ No newline at end of file diff --git a/show_result.py b/show_result.py index c6bbbc5..623833d 100644 --- a/show_result.py +++ b/show_result.py @@ -68,4 +68,4 @@ def get_result(action_space, use_model, observation_type, result_dir): if __name__ == '__main__': - get_result("pyautogui", "gpt-4o", "a11y_tree", "./results") + get_result("pyautogui", "computer-use-preview", "screenshot", "./results_operator_full_test_0713")