From 909aa868f3b9560e424366b6265ea9e679914e29 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Sat, 27 Jan 2024 19:47:47 +0800 Subject: [PATCH] Improve on agent codes; add auto-running experiments code; Fix some examples --- .../5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json | 4 +- .../f3977615-2b45-4ac5-8bba-80c17dbe2a37.json | 2 +- experiment_a11y_tree.py | 96 ++++++++++++++++--- experiment_screenshot.py | 70 ++++++++++---- experiment_screenshot_a11y_tree.py | 54 +++++++++-- experiment_screenshot_seeact.py | 54 +++++++++-- experiment_screenshot_som.py | 54 +++++++++-- mm_agents/gpt_4v_agent.py | 5 + 8 files changed, 283 insertions(+), 56 deletions(-) diff --git a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json index d51c886..f010182 100644 --- a/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json +++ b/evaluation_examples/examples/os/5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57.json @@ -11,7 +11,7 @@ "files": [ { "url": "https://drive.usercontent.google.com/download?id=1XaTnC_lLbR_tGTz8tcN2Tp6cNrMlNW3R&export=download&authuser=0&confirm=t&uuid=89e69a23-43cf-4316-833a-fb9d3e281460&at=APZUnTWn5zZTH4GlClO6lV1i4WwP:1706184669922", - "path": "poster_party_night.webp" + "path": "/home/user/Desktop/poster_party_night.webp" } ] } @@ -19,7 +19,7 @@ { "type": "execute", "parameters": { - "command": "mv ~/poster_party_night.webp ~/.local/share/Trash/files/", + "command": "gio trash /home/user/Desktop/poster_party_night.webp", "shell": true } }, diff --git a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json index 3be1e5d..8f49282 100644 --- a/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json +++ b/evaluation_examples/examples/vlc/f3977615-2b45-4ac5-8bba-80c17dbe2a37.json @@ -1,7 +1,7 @@ { "id": "f3977615-2b45-4ac5-8bba-80c17dbe2a37", "snapshot": "chrome", - "instruction": "I want to watch two or more videos in same time on VLC. I tried to run multiple instances of VLC. It worked but can't play videos on those new instances. When I play video it plays on first instance instead of new instance.\nIs there any way to solve this problem? Take the three videos on my desktop for example, do that for me.", + "instruction": "I want to watch two or more videos in same time on VLC. I tried to run multiple instances of VLC. It worked but can't play videos on those new instances. When I play video it plays on first instance instead of new instance.\nIs there any way to solve this problem?", "source": "https://www.reddit.com/r/Fedora/comments/rhljzd/how_to_run_multiple_instances_of_vlc_media_player/", "config": [ { diff --git a/experiment_a11y_tree.py b/experiment_a11y_tree.py index 728d0de..86e6a72 100644 --- a/experiment_a11y_tree.py +++ b/experiment_a11y_tree.py @@ -3,7 +3,8 @@ import json import logging import os import sys - +import threading +import time from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent @@ -61,8 +62,6 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr env.controller.start_recording() while not done and step_num < max_steps: - with open("accessibility_tree.xml", "w", encoding="utf-8") as f: - f.write(observation["accessibility_tree"]) actions = agent.predict(observation) step_num += 1 for action in actions: @@ -98,34 +97,63 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr logger.info("The episode is done.") break - if recording: - # send a request to the server to stop recording - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + def stop_recording(): + try: + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + except Exception as e: + print(f"An error occurred while stopping the recording: {e}") + + # Run the `record` function in a separate thread + recording_thread = threading.Thread(target=stop_recording()) + recording_thread.start() + + # Start a timer for your timeout length (in this case, 60 seconds) + timeout = 60 # seconds + start_time = time.time() + + # The main thread will wait for the set timeout period or until the recording is done + while recording_thread.is_alive(): + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + print("Timeout reached. Stopping recording.") + break + time.sleep(0.1) # Sleep for a short time to prevent this loop from using too much CPU + + # kill the recording thread if it is still alive + if recording_thread.is_alive(): + recording_thread.kill() + + # Wait for the recording thread to finish before exiting + recording_thread.join() result = env.evaluate() logger.info("Result: %.2f", result) + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "result": result + })) + f.write("\n") + # env.close() logger.info("Environment closed.") -if __name__ == "__main__": +def main(example_class, example_id): action_space = "pyautogui" - example_class = "chrome" - example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" - gpt4_model = "gpt-4-vision-preview" + gpt4_model = "gpt-4-0125-preview" gemini_model = "gemini-pro-vision" logger.info("Running example %s/%s", example_class, example_id) logger.info("Using model %s", gpt4_model) # logger.info("Using model %s", gemini_model) - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_setup4" + example["snapshot"] = "exp_chrome" api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], + agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], max_tokens=1000, action_space=action_space, exp="a11y_tree") # api_key = os.environ.get("GENAI_API_KEY") @@ -139,3 +167,45 @@ if __name__ == "__main__": os.makedirs(example_trajectory_dir, exist_ok=True) run_one_example(example, agent, 15, example_trajectory_dir) + + +if __name__ == '__main__': + vlc_list = [ + # "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + # "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + # "8f080098-ddb1-424c-b438-4e96e5e4786e", + # "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + # "fba2c100-79e8-42df-ae74-b592418d54f4", + # "efcf0d81-0835-4880-b2fd-d866e8bc2294", + # "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", + # "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", + # "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", + # "9195653c-f4aa-453d-aa95-787f6ccfaae9", + # "d06f0d4d-2cd5-4ede-8de9-598629438c6e", + # "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", + "f3977615-2b45-4ac5-8bba-80c17dbe2a37", + "215dfd39-f493-4bc3-a027-8a97d72c61bf" + ] + for example_id in vlc_list: + recording_thread = threading.Thread(target=main, args=("vlc", example_id)) + recording_thread.start() + + # Start a timer for your timeout length (in this case, 60 seconds) + timeout = 600 # seconds + start_time = time.time() + + # The main thread will wait for the set timeout period or until the recording is done + while recording_thread.is_alive(): + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + print("Timeout reached. Kill this example.") + break + time.sleep(0.1) # Sleep for a short time to prevent this loop from using too much CPU + + # kill the recording thread if it is still alive + if recording_thread.is_alive(): + recording_thread.kill() + + # Wait for the recording thread to finish before exiting + recording_thread.join() + diff --git a/experiment_screenshot.py b/experiment_screenshot.py index 6d82730..943a8ec 100644 --- a/experiment_screenshot.py +++ b/experiment_screenshot.py @@ -3,10 +3,12 @@ import json import logging import os import sys +import threading +import time from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent -from mm_agents.gemini_pro_agent import GeminiPro_Agent +# from mm_agents.gemini_pro_agent import GeminiPro_Agent # Logger Configs {{{ # logger = logging.getLogger() @@ -98,21 +100,50 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr logger.info("The episode is done.") break - if recording: - # send a request to the server to stop recording - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + def stop_recording(): + try: + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + except Exception as e: + print(f"An error occurred while stopping the recording: {e}") + + # Run the `record` function in a separate thread + recording_thread = threading.Thread(target=stop_recording()) + recording_thread.start() + + # Start a timer for your timeout length (in this case, 60 seconds) + timeout = 60 # seconds + start_time = time.time() + + # The main thread will wait for the set timeout period or until the recording is done + while recording_thread.is_alive(): + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + print("Timeout reached. Stopping recording.") + break + time.sleep(0.1) # Sleep for a short time to prevent this loop from using too much CPU + + # kill the recording thread if it is still alive + if recording_thread.is_alive(): + recording_thread.kill() + + # Wait for the recording thread to finish before exiting + recording_thread.join() result = env.evaluate() logger.info("Result: %.2f", result) + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "result": result + })) + f.write("\n") + # env.close() logger.info("Environment closed.") -if __name__ == "__main__": +def main(example_class, example_id): action_space = "pyautogui" - example_class = "thunderbird" - example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" gpt4_model = "gpt-4-vision-preview" gemini_model = "gemini-pro-vision" @@ -120,21 +151,28 @@ if __name__ == "__main__": logger.info("Using model %s", gpt4_model) # logger.info("Using model %s", gemini_model) - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_setup2" + example["snapshot"] = "exp_chrome" - # api_key = os.environ.get("OPENAI_API_KEY") - # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot") - - api_key = os.environ.get("GENAI_API_KEY") - agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot") + api_key = os.environ.get("OPENAI_API_KEY") + agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot") + # + # api_key = os.environ.get("GENAI_API_KEY") + # agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot") root_trajectory_dir = "exp_trajectory" - example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gpt4_model, example_id) - # example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gemini_model, example_id) + example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gemini_model, example_id) os.makedirs(example_trajectory_dir, exist_ok=True) run_one_example(example, agent, 15, example_trajectory_dir) + + +if __name__ == '__main__': + xx_list = [ + ] + for example_id in xx_list: + main("xx", example_id) diff --git a/experiment_screenshot_a11y_tree.py b/experiment_screenshot_a11y_tree.py index 60c81b6..d32bc7e 100644 --- a/experiment_screenshot_a11y_tree.py +++ b/experiment_screenshot_a11y_tree.py @@ -3,6 +3,8 @@ import json import logging import os import sys +import threading +import time from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent @@ -96,21 +98,50 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr logger.info("The episode is done.") break - if recording: - # send a request to the server to stop recording - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + def stop_recording(): + try: + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + except Exception as e: + print(f"An error occurred while stopping the recording: {e}") + + # Run the `record` function in a separate thread + recording_thread = threading.Thread(target=stop_recording()) + recording_thread.start() + + # Start a timer for your timeout length (in this case, 60 seconds) + timeout = 60 # seconds + start_time = time.time() + + # The main thread will wait for the set timeout period or until the recording is done + while recording_thread.is_alive(): + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + print("Timeout reached. Stopping recording.") + break + time.sleep(0.1) # Sleep for a short time to prevent this loop from using too much CPU + + # kill the recording thread if it is still alive + if recording_thread.is_alive(): + recording_thread.kill() + + # Wait for the recording thread to finish before exiting + recording_thread.join() result = env.evaluate() logger.info("Result: %.2f", result) + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "result": result + })) + f.write("\n") + # env.close() logger.info("Environment closed.") -if __name__ == "__main__": +def main(example_class, example_id): action_space = "pyautogui" - example_class = "chrome" - example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" gpt4_model = "gpt-4-vision-preview" gemini_model = "gemini-pro-vision" @@ -118,9 +149,9 @@ if __name__ == "__main__": logger.info("Using model %s", gpt4_model) # logger.info("Using model %s", gemini_model) - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_setup4" + example["snapshot"] = "exp_chrome" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], @@ -137,3 +168,10 @@ if __name__ == "__main__": os.makedirs(example_trajectory_dir, exist_ok=True) run_one_example(example, agent, 15, example_trajectory_dir) + + +if __name__ == '__main__': + xx_list = [ + ] + for example_id in xx_list: + main("xx", example_id) diff --git a/experiment_screenshot_seeact.py b/experiment_screenshot_seeact.py index b718693..3f72375 100644 --- a/experiment_screenshot_seeact.py +++ b/experiment_screenshot_seeact.py @@ -3,6 +3,8 @@ import json import logging import os import sys +import threading +import time from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent @@ -96,27 +98,56 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr logger.info("The episode is done.") break - if recording: - # send a request to the server to stop recording - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + def stop_recording(): + try: + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + except Exception as e: + print(f"An error occurred while stopping the recording: {e}") + + # Run the `record` function in a separate thread + recording_thread = threading.Thread(target=stop_recording()) + recording_thread.start() + + # Start a timer for your timeout length (in this case, 60 seconds) + timeout = 60 # seconds + start_time = time.time() + + # The main thread will wait for the set timeout period or until the recording is done + while recording_thread.is_alive(): + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + print("Timeout reached. Stopping recording.") + break + time.sleep(0.1) # Sleep for a short time to prevent this loop from using too much CPU + + # kill the recording thread if it is still alive + if recording_thread.is_alive(): + recording_thread.kill() + + # Wait for the recording thread to finish before exiting + recording_thread.join() result = env.evaluate() logger.info("Result: %.2f", result) + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "result": result + })) + f.write("\n") + # env.close() logger.info("Environment closed.") -if __name__ == "__main__": +def main(example_class, example_id): action_space = "pyautogui" - example_class = "chrome" - example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" gpt4_model = "gpt-4-vision-preview" gemini_model = "gemini-pro-vision" - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_setup4" + example["snapshot"] = "exp_chrome" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], @@ -133,3 +164,10 @@ if __name__ == "__main__": os.makedirs(example_trajectory_dir, exist_ok=True) run_one_example(example, agent, 15, example_trajectory_dir) + + +if __name__ == '__main__': + xx_list = [ + ] + for example_id in xx_list: + main("xx", example_id) diff --git a/experiment_screenshot_som.py b/experiment_screenshot_som.py index 2a64bb3..abd77d9 100644 --- a/experiment_screenshot_som.py +++ b/experiment_screenshot_som.py @@ -3,6 +3,8 @@ import json import logging import os import sys +import threading +import time from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent @@ -96,27 +98,56 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr logger.info("The episode is done.") break - if recording: - # send a request to the server to stop recording - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + def stop_recording(): + try: + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + except Exception as e: + print(f"An error occurred while stopping the recording: {e}") + + # Run the `record` function in a separate thread + recording_thread = threading.Thread(target=stop_recording()) + recording_thread.start() + + # Start a timer for your timeout length (in this case, 60 seconds) + timeout = 60 # seconds + start_time = time.time() + + # The main thread will wait for the set timeout period or until the recording is done + while recording_thread.is_alive(): + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + print("Timeout reached. Stopping recording.") + break + time.sleep(0.1) # Sleep for a short time to prevent this loop from using too much CPU + + # kill the recording thread if it is still alive + if recording_thread.is_alive(): + recording_thread.kill() + + # Wait for the recording thread to finish before exiting + recording_thread.join() result = env.evaluate() logger.info("Result: %.2f", result) + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "result": result + })) + f.write("\n") + # env.close() logger.info("Environment closed.") -if __name__ == "__main__": +def main(example_class, example_id): action_space = "pyautogui" - example_class = "chrome" - example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" gpt4_model = "gpt-4-vision-preview" gemini_model = "gemini-pro-vision" - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_setup4" + example["snapshot"] = "exp_chrome" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], @@ -133,3 +164,10 @@ if __name__ == "__main__": os.makedirs(example_trajectory_dir, exist_ok=True) run_one_example(example, agent, 15, example_trajectory_dir) + + +if __name__ == '__main__': + xx_list = [ + ] + for example_id in xx_list: + main("xx", example_id) diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index cc08b79..4fdb946 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -63,6 +63,8 @@ def tag_screenshot(screenshot, accessibility_tree): def parse_actions_from_string(input_string): + if input_string.strip() in ['WAIT', 'DONE', 'FAIL']: + return [input_string.strip()] # Search for a JSON string within the input string actions = [] matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL) @@ -95,6 +97,9 @@ def parse_actions_from_string(input_string): def parse_code_from_string(input_string): + if input_string.strip() in ['WAIT', 'DONE', 'FAIL']: + return [input_string.strip()] + # This regular expression will match both ```code``` and ```python code``` # and capture the `code` part. It uses a non-greedy match for the content inside. pattern = r"```(?:\w+\s+)?(.*?)```"