diff --git a/desktop_env/evaluators/README.md b/desktop_env/evaluators/README.md index 266e6e7..d7a746c 100644 --- a/desktop_env/evaluators/README.md +++ b/desktop_env/evaluators/README.md @@ -1,5 +1,12 @@ # Setup Instructions +## Overall +Disable the system crash reporter (apport) by opening its configuration file: +``` +sudo vim /etc/default/apport +``` +and then setting `enabled` to `0`. + ## LibreOffice For LibreOffice, please enter into the app first, and then enable the no pop-up when 'ctrl + s'. @@ -209,3 +216,5 @@ pip install opencv-python-headless Pillow imagehash - Ensure VLC is running and the correct port (default is 8080) is being used. - If the port is in use by another application, you may change the port number in VLC's settings. +## GIMP +Click "Keep" in the image-loading pop-up. \ No newline at end of file diff --git a/experiment_a11y_tree.py b/experiment_a11y_tree.py index 40836da..59f28ae 100644 --- a/experiment_a11y_tree.py +++ b/experiment_a11y_tree.py @@ -1,9 +1,9 @@ -import ctypes import datetime import json import logging import os import sys + import func_timeout from desktop_env.envs.desktop_env import DesktopEnv @@ -46,7 +46,6 @@ logger = logging.getLogger("desktopenv.experiment") PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" - def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") env = DesktopEnv( @@ -123,9 +122,8 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr logger.info("Environment closed.") -def main(example_class, example_id): +def main(example_class, example_id, gpt4_model="gpt-4-0125-preview"): action_space = "pyautogui" - gpt4_model = "gpt-4-0125-preview" gemini_model = "gemini-pro-vision" logger.info("Running example %s/%s", example_class, example_id) @@ -134,7 +132,7 @@ def main(example_class, example_id): with
open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_v1" + example["snapshot"] = "exp_v5" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], max_tokens=1000, @@ -154,25 +152,50 @@ def main(example_class, example_id): if __name__ == '__main__': + os_list = [ + "94d95f96-9699-4208-98ba-3c3119edf9c2", + "bedcedc4-4d72-425e-ad62-21960b11fe0d", + "43c2d64c-bab5-4dcb-a30c-b888321c319a", + "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", + "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", + "f9be0997-4b7c-45c5-b05c-4612b44a6118", + "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", + "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", + "e0df059f-28a6-4169-924f-b9623e7184cc", + "ddc75b62-7311-4af8-bfb3-859558542b36", + "b6781586-6346-41cd-935a-a6b1487918fc", + "3ce045a0-877b-42aa-8d2c-b4a863336ab8", + "a4d98375-215b-4a4d-aee9-3d4370fccc41", + "13584542-872b-42d8-b299-866967b5c3ef", + "23393935-50c7-4a86-aeea-2b78fd089c5c" + ] + + # for example_id in os_list: + # try: + # main("os", example_id, gpt4_model="gpt-3.5-turbo-16k") + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + vlc_list = [ - # "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", - # "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", - # "8f080098-ddb1-424c-b438-4e96e5e4786e", - # "bba3381f-b5eb-4439-bd9e-80c22218d5a7", - # "fba2c100-79e8-42df-ae74-b592418d54f4", - # "efcf0d81-0835-4880-b2fd-d866e8bc2294", - # "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", - # "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", - # "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", - # "9195653c-f4aa-453d-aa95-787f6ccfaae9", - # "d06f0d4d-2cd5-4ede-8de9-598629438c6e", - # "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", + "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + "8f080098-ddb1-424c-b438-4e96e5e4786e", + "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + 
"fba2c100-79e8-42df-ae74-b592418d54f4", + "efcf0d81-0835-4880-b2fd-d866e8bc2294", + "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", + "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", + "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", + "9195653c-f4aa-453d-aa95-787f6ccfaae9", + "d06f0d4d-2cd5-4ede-8de9-598629438c6e", + "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", "f3977615-2b45-4ac5-8bba-80c17dbe2a37", "215dfd39-f493-4bc3-a027-8a97d72c61bf" ] chrome_list = [ - # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", "06fe7178-4491-4589-810f-2e2bc9502122", "e1e75309-3ddb-4d09-92ec-de869c928143", @@ -215,5 +238,116 @@ if __name__ == '__main__': "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", ] - for example_id in calc_list: - main("libreoffice_calc", example_id) + # for example_id in calc_list: + # main("libreoffice_calc", example_id) + + impress_list = [ + # "5d901039-a89c-4bfb-967b-bf66f4df075e", + # "550ce7e7-747b-495f-b122-acdc4d0b8e54", + # "455d3c66-7dc6-4537-a39a-36d3e9119df7", + # "af23762e-2bfd-4a1d-aada-20fa8de9ce07", + # "c59742c0-4323-4b9d-8a02-723c251deaa0", + # "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", + # "9ec204e4-f0a3-42f8-8458-b772a6797cab", + # "0f84bef9-9790-432e-92b7-eece357603fb", + # "ce88f674-ab7a-43da-9201-468d38539e4a", + # "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + # "a097acff-6266-4291-9fbd-137af7ecd439", + # "bf4e9888-f10f-47af-8dba-76413038b73c", + "21760ecb-8f62-40d2-8d85-0cee5725cb72" + ] + # for example_id in impress_list: + # main("libreoffice_impress", example_id) + + thunderbird_list = [ + # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + # "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "12086550-11c0-466b-b367-1d9e75b3910e", + "06fe7178-4491-4589-810f-2e2bc9502122", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "d088f539-cab4-4f9a-ac92-9999fc3a656e", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + 
"480bcfea-d68f-4aaa-a0a9-2589ef319381", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "94760984-3ff5-41ee-8347-cf1af709fea0", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "c9e7eaf2-b1a1-4efc-a982-721972fa9f02" + ] + # for example_id in thunderbird_list: + # main("thunderbird", example_id) + + gimp_list = [ + "7a4deb26-d57d-4ea9-9a73-630f66a7b568", + "554785e9-4523-4e7a-b8e1-8016f565f56a", + "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", + "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", + "d52d6308-ec58-42b7-a2c9-de80e4837b2b", + "2a729ded-3296-423d-aec4-7dd55ed5fbb3", + "b148e375-fe0b-4bec-90e7-38632b0d73c2", + "a746add2-cab0-4740-ac36-c3769d9bfb46", + "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", + "d16c99dc-2a1e-46f2-b350-d97c86c85c15", + "06ca5602-62ca-47f6-ad4f-da151cde54cc", + "e2dd0213-26db-4349-abe5-d5667bfd725c", + "f723c744-e62c-4ae6-98d1-750d3cd7d79d", + "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", + "7767eef2-56a3-4cea-8c9f-48c070c7d65b", + "734d6579-c07d-47a8-9ae2-13339795476b" + ] + + # for example_id in gimp_list: + # try: + # main("gimp", example_id) + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + + vs_code_list = [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "eabc805a-bfcf-4460-b250-ac92135819f6", + "982d12a5-beab-424f-8d38-d2a48429e511", + "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "9439a27b-18ae-42d8-9778-5f68f891805e", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "930fdb3b-11a8-46fe-9bac-577332e2640e", + "276cc624-87ea-4f08-ab93-f770e3790175", + "9d425400-e9b2-4424-9a4b-d4c7abac4140" + ] + + # for example_id in vs_code_list: + # try: + # main("vs_code", example_id) + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + + multiple_list = [ + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + 
"b52b40a5-ad70-4c53-b5b0-5650a8387052", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "51f5801c-18b3-4f25-b0c3-02f85507a078", + "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", + "510f64c8-9bcc-4be1-8d30-638705850618", + "937087b6-f668-4ba6-9110-60682ee33441", + "ee9a3c83-f437-4879-8918-be5efbb9fac7", + "3680a5ee-6870-426a-a997-eba929a0d25c", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "58565672-7bfe-48ab-b828-db349231de6b", + "2fe4b718-3bd7-46ec-bdce-b184f5653624" + ] + + for example_id in multiple_list: + try: + main("multi_apps", example_id) + except Exception as e: + logger.error("An error occurred while running the example: %s", e) + continue + diff --git a/experiment_screenshot.py b/experiment_screenshot.py index f490d69..b6ec2f5 100644 --- a/experiment_screenshot.py +++ b/experiment_screenshot.py @@ -134,7 +134,7 @@ def main(example_class, example_id): with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_v1" + example["snapshot"] = "exp_v5" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, @@ -168,17 +168,17 @@ if __name__ == '__main__': "af630914-714e-4a24-a7bb-f9af687d3b91" ] calc_list = [ - "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", - "0bf05a7d-b28b-44d2-955a-50b41e24012a", - "7a4e4bc8-922c-4c84-865c-25ba34136be1", - "2bd59342-0664-4ccb-ba87-79379096cc08", - "ecb0df7a-4e8d-4a03-b162-053391d3afaf", - "7efeb4b1-3d19-4762-b163-63328d66303b", - "4e6fcf72-daf3-439f-a232-c434ce416af6", - "6054afcb-5bab-4702-90a0-b259b5d3217c", - "abed40dc-063f-4598-8ba5-9fe749c0615d", - "01b269ae-2111-4a07-81fd-3fcd711993b0", - "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", + # "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + # "0bf05a7d-b28b-44d2-955a-50b41e24012a", + # "7a4e4bc8-922c-4c84-865c-25ba34136be1", + # "2bd59342-0664-4ccb-ba87-79379096cc08", + # 
"ecb0df7a-4e8d-4a03-b162-053391d3afaf", + # "7efeb4b1-3d19-4762-b163-63328d66303b", + # "4e6fcf72-daf3-439f-a232-c434ce416af6", + # "6054afcb-5bab-4702-90a0-b259b5d3217c", + # "abed40dc-063f-4598-8ba5-9fe749c0615d", + # "01b269ae-2111-4a07-81fd-3fcd711993b0", + # "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", "4188d3a4-077d-46b7-9c86-23e1a036f6c1", "51b11269-2ca8-4b2a-9163-f21758420e78", @@ -197,5 +197,97 @@ if __name__ == '__main__': "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", ] - for example_id in calc_list: - main("libreoffice_calc", example_id) + # for example_id in calc_list: + # main("libreoffice_calc", example_id) + + impress_list = [ + # "5d901039-a89c-4bfb-967b-bf66f4df075e", + # "550ce7e7-747b-495f-b122-acdc4d0b8e54", + # "455d3c66-7dc6-4537-a39a-36d3e9119df7", + # "af23762e-2bfd-4a1d-aada-20fa8de9ce07", + # "c59742c0-4323-4b9d-8a02-723c251deaa0", + # "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", + # "9ec204e4-f0a3-42f8-8458-b772a6797cab", + # "0f84bef9-9790-432e-92b7-eece357603fb", + # "ce88f674-ab7a-43da-9201-468d38539e4a", + # "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + # "a097acff-6266-4291-9fbd-137af7ecd439", + # "bf4e9888-f10f-47af-8dba-76413038b73c", + "21760ecb-8f62-40d2-8d85-0cee5725cb72" + ] + # for example_id in impress_list: + # main("libreoffice_impress", example_id) + + # gimp_list = [ + # "7a4deb26-d57d-4ea9-9a73-630f66a7b568", + # "554785e9-4523-4e7a-b8e1-8016f565f56a", + # "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", + # "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", + # "d52d6308-ec58-42b7-a2c9-de80e4837b2b", + # "2a729ded-3296-423d-aec4-7dd55ed5fbb3", + # "b148e375-fe0b-4bec-90e7-38632b0d73c2", + # "a746add2-cab0-4740-ac36-c3769d9bfb46", + # "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", + # "d16c99dc-2a1e-46f2-b350-d97c86c85c15", + # "06ca5602-62ca-47f6-ad4f-da151cde54cc", + # "e2dd0213-26db-4349-abe5-d5667bfd725c", + # "f723c744-e62c-4ae6-98d1-750d3cd7d79d", + # "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", + # 
"7767eef2-56a3-4cea-8c9f-48c070c7d65b", + # "734d6579-c07d-47a8-9ae2-13339795476b" + # ] + # + # for example_id in gimp_list: + # try: + # main("gimp", example_id) + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + # + + vs_code_list = [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "eabc805a-bfcf-4460-b250-ac92135819f6", + "982d12a5-beab-424f-8d38-d2a48429e511", + "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "9439a27b-18ae-42d8-9778-5f68f891805e", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "930fdb3b-11a8-46fe-9bac-577332e2640e", + "276cc624-87ea-4f08-ab93-f770e3790175", + "9d425400-e9b2-4424-9a4b-d4c7abac4140" + ] + + # for example_id in vs_code_list: + # try: + # main("vs_code", example_id) + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + + multiple_list = [ + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "51f5801c-18b3-4f25-b0c3-02f85507a078", + "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", + "510f64c8-9bcc-4be1-8d30-638705850618", + "937087b6-f668-4ba6-9110-60682ee33441", + "ee9a3c83-f437-4879-8918-be5efbb9fac7", + "3680a5ee-6870-426a-a997-eba929a0d25c", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "58565672-7bfe-48ab-b828-db349231de6b", + "2fe4b718-3bd7-46ec-bdce-b184f5653624" + ] + + for example_id in multiple_list: + try: + main("multi_apps", example_id) + except Exception as e: + logger.error("An error occurred while running the example: %s", e) + continue + diff --git a/experiment_screenshot_a11y_tree.py b/experiment_screenshot_a11y_tree.py index 042bbfb..c03e0e4 100644 --- a/experiment_screenshot_a11y_tree.py +++ b/experiment_screenshot_a11y_tree.py @@ -1,9 
+1,9 @@ -import ctypes import datetime import json import logging import os import sys + import func_timeout from desktop_env.envs.desktop_env import DesktopEnv @@ -124,12 +124,11 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr logger.info("Environment closed.") -def main(example_class, example_id): +def main(example_class, example_id, gpt4_model="gpt-4-vision-preview"): action_space = "pyautogui" # example_class = "libreoffice_calc" # example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" # example_id = "01b269ae-2111-4a07-81fd-3fcd711993b0" - gpt4_model = "gpt-4-vision-preview" gemini_model = "gemini-pro-vision" logger.info("Running example %s/%s", example_class, example_id) @@ -138,7 +137,7 @@ def main(example_class, example_id): with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_v1" + example["snapshot"] = "exp_v5" # example["snapshot"] = "exp_setup4" # example["snapshot"] = "Snapshot 30" @@ -160,7 +159,133 @@ def main(example_class, example_id): if __name__ == '__main__': - xx_list = [ + os_list = [ + "94d95f96-9699-4208-98ba-3c3119edf9c2", + "bedcedc4-4d72-425e-ad62-21960b11fe0d", + "43c2d64c-bab5-4dcb-a30c-b888321c319a", + "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", + "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", + "f9be0997-4b7c-45c5-b05c-4612b44a6118", + "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", + "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", + "e0df059f-28a6-4169-924f-b9623e7184cc", + "ddc75b62-7311-4af8-bfb3-859558542b36", + "b6781586-6346-41cd-935a-a6b1487918fc", + "3ce045a0-877b-42aa-8d2c-b4a863336ab8", + "a4d98375-215b-4a4d-aee9-3d4370fccc41", + "13584542-872b-42d8-b299-866967b5c3ef", + "23393935-50c7-4a86-aeea-2b78fd089c5c" ] - for example_id in xx_list: - main("xx", example_id) + + # for example_id in os_list: + # try: + # main("os", example_id) + # except Exception as e: + # logger.error("An error occurred while running the 
example: %s", e) + # continue + + calc_list = [ + # "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + # "0bf05a7d-b28b-44d2-955a-50b41e24012a", + # "7a4e4bc8-922c-4c84-865c-25ba34136be1", + # "2bd59342-0664-4ccb-ba87-79379096cc08", + # "ecb0df7a-4e8d-4a03-b162-053391d3afaf", + # "7efeb4b1-3d19-4762-b163-63328d66303b", + # "4e6fcf72-daf3-439f-a232-c434ce416af6", + # "6054afcb-5bab-4702-90a0-b259b5d3217c", + # "abed40dc-063f-4598-8ba5-9fe749c0615d", + # "01b269ae-2111-4a07-81fd-3fcd711993b0", + # "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", + # "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", + # "4188d3a4-077d-46b7-9c86-23e1a036f6c1", + # "51b11269-2ca8-4b2a-9163-f21758420e78", + # "7e429b8d-a3f0-4ed0-9b58-08957d00b127", + # "347ef137-7eeb-4c80-a3bb-0951f26a8aff", + # "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", + # "3aaa4e37-dc91-482e-99af-132a612d40f3", + # "37608790-6147-45d0-9f20-1137bb35703d", + # "f9584479-3d0d-4c79-affa-9ad7afdd8850", + "d681960f-7bc3-4286-9913-a8812ba3261a", + "21df9241-f8d7-4509-b7f1-37e501a823f7", + "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", + "357ef137-7eeb-4c80-a3bb-0951f26a8aff", + "aa3a8974-2e85-438b-b29e-a64df44deb4b", + "a01fbce3-2793-461f-ab86-43680ccbae25", + "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", + ] + + # for example_id in calc_list: + # try: + # main("libreoffice_calc", example_id) + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + + impress_list = [ + "5d901039-a89c-4bfb-967b-bf66f4df075e", + "550ce7e7-747b-495f-b122-acdc4d0b8e54", + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "af23762e-2bfd-4a1d-aada-20fa8de9ce07", + "c59742c0-4323-4b9d-8a02-723c251deaa0", + "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", + "9ec204e4-f0a3-42f8-8458-b772a6797cab", + "0f84bef9-9790-432e-92b7-eece357603fb", + "ce88f674-ab7a-43da-9201-468d38539e4a", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + "a097acff-6266-4291-9fbd-137af7ecd439", + "bf4e9888-f10f-47af-8dba-76413038b73c", + "21760ecb-8f62-40d2-8d85-0cee5725cb72" + ] + + # 
for example_id in impress_list: + # try: + # main("libreoffice_impress", example_id) + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + + vs_code_list = [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "eabc805a-bfcf-4460-b250-ac92135819f6", + "982d12a5-beab-424f-8d38-d2a48429e511", + "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "9439a27b-18ae-42d8-9778-5f68f891805e", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "930fdb3b-11a8-46fe-9bac-577332e2640e", + "276cc624-87ea-4f08-ab93-f770e3790175", + "9d425400-e9b2-4424-9a4b-d4c7abac4140" + ] + + # for example_id in vs_code_list: + # try: + # main("vs_code", example_id) + # except Exception as e: + # logger.error("An error occurred while running the example: %s", e) + # continue + + multiple_list = [ + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "51f5801c-18b3-4f25-b0c3-02f85507a078", + "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", + "510f64c8-9bcc-4be1-8d30-638705850618", + "937087b6-f668-4ba6-9110-60682ee33441", + "ee9a3c83-f437-4879-8918-be5efbb9fac7", + "3680a5ee-6870-426a-a997-eba929a0d25c", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "58565672-7bfe-48ab-b828-db349231de6b", + "2fe4b718-3bd7-46ec-bdce-b184f5653624" + ] + + for example_id in multiple_list: + try: + main("multi_apps", example_id) + except Exception as e: + logger.error("An error occurred while running the example: %s", e) + continue diff --git a/experiment_screenshot_seeact.py b/experiment_screenshot_seeact.py index 541d549..6c3a472 100644 --- a/experiment_screenshot_seeact.py +++ b/experiment_screenshot_seeact.py @@ -129,7 +129,7 @@ def main(example_class, example_id): with 
open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_v1" + example["snapshot"] = "exp_v5" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], diff --git a/experiment_screenshot_som.py b/experiment_screenshot_som.py index 2ecdafe..863f115 100644 --- a/experiment_screenshot_som.py +++ b/experiment_screenshot_som.py @@ -129,7 +129,7 @@ def main(example_class, example_id): with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_v1" + example["snapshot"] = "exp_v5" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'],