diff --git a/README.md b/README.md
index b7d56df..5fd8aa6 100644
--- a/README.md
+++ b/README.md
@@ -23,4 +23,11 @@ todo
 - [x] Error handling during file passing and file opening, etc.
 - [x] Add accessibility tree from the OS into the observation space
 - [ ] Add pre-process and post-process action support for benchmarking setup and evaluation
-- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient
\ No newline at end of file
+- [ ] Multiprocess support, which can make reinforcement learning more efficient
+
+## Roadmap of benchmark, tools, and resources (Proposed)
+- [ ] Improve the annotation tool based on DuckTrack, making it more robust and aligned with the accessibility tree
+- [ ] Annotate the steps taken to complete each task
+- [ ] Build a website for the project
+- [ ] Crawl all the resources we explored on the internet and make them easy to access
+- [ ] Set up ways for the community to contribute new examples
diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py
index 1c37d4a..a4b2abb 100644
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -83,6 +83,8 @@ class PythonController:
         """
         Executes an action on the server computer.
         """
+        if action in ['WAIT', 'FAIL', 'DONE']:
+            return
         action_type = action["action_type"]
         parameters = action["parameters"] if "parameters" in action else {}

diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index e2ef08b..786ed72 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -204,7 +204,10 @@ class DesktopEnv(gym.Env):
         time.sleep(5)
         logger.info("Environment setup complete.")

-        observation = {"screenshot": self._get_obs()}
+        observation = {
+            "screenshot": self._get_obs(),
+            "accessibility_tree": self.controller.get_accessibility_tree(),
+        }
         return observation

     def step(self, action, pause=0.5):
@@ -231,8 +234,11 @@ class DesktopEnv(gym.Env):
             # the set of all possible actions defined in the action representation
             self.controller.execute_action(action)
         elif self.action_space == "pyautogui":
-            # the set of all possible python commands insides `pyautogui`
-            self.controller.execute_python_command(action)
+            if action in ['WAIT', 'FAIL', 'DONE']:
+                self.controller.execute_action(action)
+            else:
+                # the set of all possible python commands inside `pyautogui`
+                self.controller.execute_python_command(action)

         observation = {
             "screenshot": self._get_obs(),
diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py
index 1b77016..d58d08b 100644
--- a/desktop_env/evaluators/getters/chrome.py
+++ b/desktop_env/evaluators/getters/chrome.py
@@ -159,9 +159,19 @@ def get_open_tabs_info(env, config: Dict[str, str]):
         tabs_info = []
         for context in browser.contexts:
             for page in context.pages:
-                title = page.title()
-                url = page.url
-                tabs_info.append({'title': title, 'url': url})
+                try:
+                    # Wait for the page to finish loading; this prevents the "execution context was destroyed" issue
+                    page.wait_for_load_state('load')  # Wait for the 'load' event to complete
+                    title = page.title()
+                    url = page.url
+                    tabs_info.append({'title': title, 'url': url})
+                except TimeoutError:
+                    # If page loading times out, catch the exception and store the current information in the list
+                    tabs_info.append({'title': 'Load timeout', 'url': page.url})
+                except Exception as e:
+                    # Catch other potential exceptions that might occur while reading the page title
+                    print(f'Error: {e}')
+                    tabs_info.append({'title': 'Error encountered', 'url': page.url})

         browser.close()
         return tabs_info
diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py
index 200b56f..59ebab2 100644
--- a/desktop_env/evaluators/metrics/__init__.py
+++ b/desktop_env/evaluators/metrics/__init__.py
@@ -14,4 +14,4 @@ from .gimp import increase_saturation, decrease_brightness, check_file_exists, c
 from .general import check_csv, check_accessibility_tree, check_list, run_sqlite3, check_json
 from .thunderbird import check_thunderbird_prefs, check_thunderbird_filter
 from .vscode import compare_text_file, compare_config, compare_answer, is_extension_installed
-from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines
+from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines, check_for_audio, check_formula_shape, check_file_exists
diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py
index b0433c3..6246861 100644
--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -4,7 +4,7 @@ import functools
 import operator
 import re
 from numbers import Number
-from typing import Callable, Any
+from typing import Callable, Any, Union
 from typing import Dict, List, Pattern

 import lxml.etree
diff --git a/desktop_env/evaluators/metrics/impress.py b/desktop_env/evaluators/metrics/impress.py
index 7268aef..719ae56 100644
--- a/desktop_env/evaluators/metrics/impress.py
+++ b/desktop_env/evaluators/metrics/impress.py
@@ -71,6 +71,29 @@ def check_file_exists(directory, filename):
     file_path = os.path.join(directory, filename)
     return 1 if os.path.isfile(file_path) else 0

+def has_audio_on_page(slide):
+    for shape in slide.shapes:
+        if shape.shape_type == 13:  # the evaluator treats this shape type as the inserted audio object
+            return True
+    return False
+
+def check_for_audio(prs):
+    prs = Presentation(prs)
+    for slide in prs.slides:
+        if has_audio_on_page(slide):
+            return 1
+    return 0
+
+def check_formula_shape(prs):
+    prs = Presentation(prs)
+    slide = prs.slides[13]  # the task targets page 14, i.e. slide index 13
+
+    for shape in slide.shapes:
+        # an auto shape (shape_type == 1) holding a text frame, e.g. the added formula textbox
+        if shape.has_text_frame and shape.shape_type == 1:
+            return 1
+    return 0
+
 if __name__ == "__main__":
     path1 = "../../任务数据/LibreOffice Impress/Change_Color_Slide_Number_gold_textbox.pptx"
     presentation = Presentation(path1)
diff --git a/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json b/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json
new file mode 100644
index 0000000..e5ee5d6
--- /dev/null
+++ b/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json
@@ -0,0 +1,37 @@
+{
+    "id": "39478d4a-1049-456f-aa77-407811393add",
+    "snapshot": "libreoffice_impress",
+    "instruction": "Could you help me add a hat symbol to \"Y\" in the text on page 14?
I need to represent vector sign for my statistics class.", + "source": "https://www.reddit.com/r/libreoffice/comments/jul3o8/putting_cap_or_hat_or_carat_symbol_in_libre/", + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1WT1-L0iiIlF2kuIK77IDxTfBaQ0X0BbX&export=download&authuser=0&confirm=t&uuid=0b69767e-1f3e-49ce-88a7-1036ef25bcaf&at=APZUnTXZ_sqEZUrHNx1edWep017b:1705337750065", + "path": "Desktop/Ch5.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/Ch5.pptx" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "libreoffice_impress" + ], + "evaluator": { + "func": "check_formula_shape", + "result": { + "type": "vm_file", + "path": "Desktop/Ch5.pptx", + "dest": "Ch5.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json index 8d8255e..c825ce7 100644 --- a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json +++ b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json @@ -1,12 +1,42 @@ { "id": "9ec204e4-f0a3-42f8-8458-b772a6797cab", "snapshot": "libreoffice_impress", - "instruction": "Could you help me copy and paste the first 3 slides?", + "instruction": "Make a duplicate of the last two slides for me, please.", "source": "https://www.tiktok.com/@lil.d1rt_/video/7247574148887629083", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1ad5vUXasdN2MypNap-pBUmgPg5FaxmDA&export=download&authuser=0&confirm=t&uuid=9dc069bb-edd3-4ae9-b356-4c6543778584&at=APZUnTXbgLHjv1MhMFy1IfZL3fQI:1705338128337", + "path": "Desktop/MLA_Workshop_061X_Works_Cited.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/MLA_Workshop_061X_Works_Cited.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "compare_pptx_files", + "expected": { + "type": "cloud_file", + "path": "https://drive.usercontent.google.com/download?id=1otbzscpOZ0tCXMvsMC0MmNWUC7Pv71of&export=download&authuser=0&confirm=t&uuid=faa0b0c1-6b14-4bce-a1fd-ccf824ee1e60&at=APZUnTXw6TlBOlrPPZ2OhfGnNPf0:1705338135842", + "dest": "MLA_Workshop_061X_Works_Cited_Gold.docx" + }, + "result": { + "type": "vm_file", + "path": "Desktop/MLA_Workshop_061X_Works_Cited.pptx", + "dest": "MLA_Workshop_061X_Works_Cited.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json index 93d0a94..db71d72 100644 --- a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json +++ b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json @@ -1,12 +1,42 @@ { "id": "af23762e-2bfd-4a1d-aada-20fa8de9ce07", "snapshot": "libreoffice_impress", - "instruction": "Please make a summary slide of the whole presentation for me", + "instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. I need to summarize contents on one slide. 
Could you make a summary slide for me?", "source": "https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1zmtomIzSgSjnYZbhgtH4n90L5mV9bS7L&export=download&authuser=0&confirm=t&uuid=1966ab03-1e17-447a-aeb4-71a753eca196&at=APZUnTVoWHV6z8LJi5VHADuQE6VG:1705319233167", + "path": "Desktop/Forests.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/Forests.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "compare_pptx_files", + "expected": { + "type": "cloud_file", + "path": "https://drive.usercontent.google.com/download?id=1nRwmFgYdskv3EiriZZFoT8TzM9CsG5B0&export=download&authuser=0&confirm=t&uuid=f2f919df-2867-4bc3-8bb9-dabd51108ebb&at=APZUnTWzw9LJWWXvH0cvdaWL-Ij-:1705319339474", + "dest": "Forests_Gold.docx" + }, + "result": { + "type": "vm_file", + "path": "Desktop/Forests.pptx", + "dest": "Forests.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json b/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json index 5321e76..9eb877c 100644 --- a/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json +++ b/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json @@ -1,12 +1,48 @@ { "id": "c59742c0-4323-4b9d-8a02-723c251deaa0", "snapshot": "libreoffice_impress", - "instruction": "Could you help me add video into the presentation file?", + "instruction": "I am making PPT about the history of baseball. I want to add an introduction audio named \"Baseball.mp3\" on the Desktop into my PPT, but I do not know how. 
Could you help me add audio into my presentation file?", "source": "https://www.reddit.com/r/libreoffice/comments/17lcdrp/audio_not_supported_in_libreoffice_impress/", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1Oy5Zga6PnvpIwJ1OHMdFf3mSbm_YClHh&export=download&authuser=0&confirm=t&uuid=da1db839-da27-4bb5-a4fc-0358342f493b&at=APZUnTVsLyGsj8qI1rPyGAUTCX4F:1705324246149", + "path": "Desktop/Mady_and_Mia_Baseball.pptx" + } + ] + } + }, + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1a-DaT1LUuvh55GsjpJkhpjcDTh8CPzkl&export=download&authuser=0&confirm=t&uuid=2bfb7700-5222-47ad-9aee-e5c22b50dbfe&at=APZUnTW7RQR5HI9giEl-cirnSX3q:1705320977391", + "path": "Desktop/Baseball.mp3" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/Mady_and_Mia_Baseball.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "check_for_audio", + "result": { + "type": "vm_file", + "path": "Desktop/Mady_and_Mia_Baseball.pptx", + "dest": "Mady_and_Mia_Baseball.pptx" + } + } +} \ No newline at end of file diff --git a/experiment.py b/experiment.py index 48bb1dc..8e7f8b5 100644 --- a/experiment.py +++ b/experiment.py @@ -6,6 +6,7 @@ import sys from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent +from mm_agents.gemini_pro_agent import GeminiPro_Agent # Logger Configs {{{ # logger = logging.getLogger() @@ -44,7 +45,7 @@ logger = logging.getLogger("desktopenv.experiment") PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" -def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_trajectory", recording=True): +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") env = DesktopEnv( path_to_vm=PATH_TO_VM, @@ -53,7 +54,6 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra ) # reset the environment to certain snapshot observation = env.reset() - observation['instruction'] = example['instruction'] done = False step_num = 0 @@ -63,17 +63,14 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra while not done and step_num < max_steps: actions = agent.predict(observation) + step_num += 1 for action in actions: - step_num += 1 - # Capture the timestamp before executing the action action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) observation, reward, done, info = env.step(action) - observation['instruction'] = example['instruction'] - # Logging - logger.info("Step %d: %s", step_num, action) logger.info("Reward: %.2f", reward) logger.info("Done: %s", done) logger.info("Info: %s", info) @@ -114,19 +111,22 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra if __name__ == "__main__": action_space = "pyautogui" - example_class = "vlc" - example_id = "8f080098-ddb1-424c-b438-4e96e5e4786e" + example_class = "thunderbird" + example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) - example["snapshot"] = "exp_setup" + example["snapshot"] = "exp_setup2" 
- api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, action_space=action_space) + # api_key = os.environ.get("OPENAI_API_KEY") + # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + + api_key = os.environ.get("GENAI_API_KEY") + agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) root_trajectory_dir = "exp_trajectory" example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) os.makedirs(example_trajectory_dir, exist_ok=True) - run_one_example(example, agent, 2, example_trajectory_dir) + run_one_example(example, agent, 10, example_trajectory_dir) diff --git a/experiment_pure_text.py b/experiment_pure_text.py new file mode 100644 index 0000000..cfcbd46 --- /dev/null +++ b/experiment_pure_text.py @@ -0,0 +1,135 @@ +import datetime +import json +import logging +import os +import sys + +from desktop_env.envs.desktop_env import DesktopEnv +from mm_agents.gpt_4_agent import GPT4_Agent +from mm_agents.gemini_pro_agent import GeminiPro_Agent + +# Logger Configs {{{ # +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") +stdout_handler = logging.StreamHandler(sys.stdout) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(logging.INFO) +sdebug_handler.setLevel(logging.DEBUG) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) +sdebug_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) +sdebug_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +logger.addHandler(sdebug_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + +PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" + + +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): + trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") + env = DesktopEnv( + path_to_vm=PATH_TO_VM, + action_space=agent.action_space, + task_config=example + ) + # reset the environment to certain snapshot + observation = env.reset() + done = False + step_num = 0 + + if recording: + # send a request to the server to start recording + env.controller.start_recording() + + while not done and step_num < max_steps: + actions = agent.predict(observation) + step_num += 1 + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) + + observation, reward, done, info = env.step(action) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) + + # Save screenshot 
and trajectory information + with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "step_num": step_num, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_num}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break + + if recording: + # send a request to the server to stop recording + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + + result = env.evaluate() + logger.info("Result: %.2f", result) + + # env.close() + logger.info("Environment closed.") + + +if __name__ == "__main__": + action_space = "pyautogui" + example_class = "chrome" + example_id = "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263" + gpt4_model = "gpt-4-1106-preview" + gemini_model = "gemini-pro-vision" + + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + example = json.load(f) + example["snapshot"] = "exp_setup4" + + api_key = os.environ.get("OPENAI_API_KEY") + agent = GPT4_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space) + + # api_key = os.environ.get("GENAI_API_KEY") + # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space) + + root_trajectory_dir = "exp_trajectory" + + example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gemini_model, example_id) + + os.makedirs(example_trajectory_dir, exist_ok=True) + + run_one_example(example, agent, 15, example_trajectory_dir) diff --git a/mm_agents/accessibility_tree_wrap/__init__.py b/mm_agents/accessibility_tree_wrap/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py new file mode 100644 index 0000000..d6f83eb --- /dev/null +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -0,0 +1,102 @@ +import xml.etree.ElementTree as ET + +from PIL import Image, ImageDraw, ImageFont + + +def find_leaf_nodes(xlm_file_str): + if not xlm_file_str: + return [] + + root = ET.fromstring(xlm_file_str) + + # Recursive function to traverse the XML tree and collect leaf nodes + def collect_leaf_nodes(node, leaf_nodes): + # If the node has no children, it is a leaf node, add it to the list + if not list(node): + leaf_nodes.append(node) + # If the node has children, recurse on each child + for child in node: + collect_leaf_nodes(child, leaf_nodes) + + # List to hold all leaf nodes + leaf_nodes = [] + collect_leaf_nodes(root, leaf_nodes) + return leaf_nodes + + +def filter_nodes(nodes): + filtered_nodes = [] + + for node in nodes: + if not node.get('{uri:deskat:state.at-spi.gnome.org}visible', None) == 'true': + # Not visible + continue + # Check if the node is a 'panel' + if node.tag == 'panel': + # Check if the 'panel' represents an interactive element + # or if it has certain attributes that are of interest. + # Add your conditions here... 
+ if node.get('{uri:deskat:state.at-spi.gnome.org}focusable', 'false') == 'true': + filtered_nodes.append(node) + elif node.tag == 'text': + continue + else: + coords = tuple(map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord').strip('()').split(', '))) + if coords[0] < 0 or coords[1] < 0: + continue + size = tuple(map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size').strip('()').split(', '))) + if size[0] <= 0 or size[1] <= 0: + continue + # Node is not a 'panel', add to the list. + filtered_nodes.append(node) + + return filtered_nodes + + +def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): + # Load the screenshot image + image = Image.open(image_file_path) + draw = ImageDraw.Draw(image) + + # Optional: Load a font. If you don't specify a font, a default one will be used. + try: + # Adjust the path to the font file you have or use a default one + font = ImageFont.truetype("arial.ttf", 20) + except IOError: + # Fallback to a basic font if the specified font can't be loaded + font = ImageFont.load_default() + + # Loop over all the visible nodes and draw their bounding boxes + for index, _node in enumerate(nodes): + coords_str = _node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord') + size_str = _node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + + if coords_str and size_str: + try: + # Parse the coordinates and size from the strings + coords = tuple(map(int, coords_str.strip('()').split(', '))) + size = tuple(map(int, size_str.strip('()').split(', '))) + + # Check for negative sizes + if size[0] <= 0 or size[1] <= 0: + raise ValueError(f"Size must be positive, got: {size}") + + # Calculate the bottom-right corner of the bounding box + bottom_right = (coords[0] + size[0], coords[1] + size[1]) + + # Check that bottom_right > coords (x1 >= x0, y1 >= y0) + if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]: + raise ValueError(f"Invalid coordinates or size, coords: {coords}, size: {size}") + + # Draw rectangle on image + draw.rectangle([coords, bottom_right], outline="red", width=2) + + # Draw index number at the bottom left of the bounding box + text_position = (coords[0], bottom_right[1]) # Adjust Y to be above the bottom right + draw.text(text_position, str(index), font=font, fill="purple") + + except ValueError as e: + pass + + # Save the result + image.save(output_image_file_path) diff --git a/mm_agents/accessibility_tree_wrap/relevant_retrieve.py b/mm_agents/accessibility_tree_wrap/relevant_retrieve.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/fuyu_test.py b/mm_agents/fuyu_test.py deleted file mode 100644 index ea77186..0000000 --- a/mm_agents/fuyu_test.py +++ /dev/null @@ -1,20 +0,0 @@ -from transformers import FuyuProcessor, FuyuForCausalLM -from PIL import Image - -image = Image.open("stackoverflow.png").convert("RGB") - -# load model and processor -model_id = "adept/fuyu-8b" -processor = FuyuProcessor.from_pretrained(model_id) -model = FuyuForCausalLM.from_pretrained(model_id, device_map="cuda:0") - -# prepare inputs for the model -text_prompt = "Description:\n" - -inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0") - -# autoregressively generate text -generation_output = model.generate(**inputs, max_new_tokens=100) -generation_text = processor.batch_decode(generation_output[:, -100:], skip_special_tokens=True) - -print(generation_text) diff --git a/mm_agents/gemini_agent.py b/mm_agents/gemini_agent.py deleted file mode 
100644 index 37e22f2..0000000 --- a/mm_agents/gemini_agent.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import Dict - -import PIL.Image -import google.generativeai as genai - -from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string -from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION -from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE - - -class GeminiPro_Agent: - def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"): - genai.configure(api_key) - self.model = genai.GenerativeModel(model) - self.max_tokens = max_tokens - self.action_space = action_space - - self.trajectory = [ - { - "role": "system", - "parts": [ - { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] - ] - } - ] - - def predict(self, obs: Dict): - """ - Predict the next action(s) based on the current observation. - """ - img = PIL.Image.open(obs["screenshot"]) - self.trajectory.append({ - "role": "user", - "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format( - obs["instruction"]), img] - }) - - traj_to_show = [] - for i in range(len(self.trajectory)): - traj_to_show.append(self.trajectory[i]["parts"][0]) - if len(self.trajectory[i]["parts"]) > 1: - traj_to_show.append("screenshot_obs") - - print("Trajectory:", traj_to_show) - - response = self.model.generate_content(self.trajectory, max_tokens=self.max_tokens) - - try: - # fixme: change to fit the new response format from gemini pro - actions = self.parse_actions(response.json()['choices'][0]['message']['content']) - except: - # todo: add error handling - print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) - actions = None - - return actions - - def parse_actions(self, response: str): - # response example - """ - ```json - { - "action_type": "CLICK", - "click_type": "RIGHT" - } - ``` - """ - - # parse from the response - if self.action_space == "computer_13": - actions = parse_actions_from_string(response) - elif self.action_space == "pyautogui": - actions = parse_code_from_string(response) - - # add action into the trajectory - self.trajectory.append({ - "role": "assistant", - "parts": [response] - }) - - return actions diff --git a/mm_agents/gemini_pro_agent.py b/mm_agents/gemini_pro_agent.py new file mode 100644 index 0000000..26f9c0e --- /dev/null +++ b/mm_agents/gemini_pro_agent.py @@ -0,0 +1,134 @@ +import time +from typing import Dict, List + +import google.generativeai as genai + +from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes +from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION +from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE +from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string + + +class GeminiPro_Agent: + def __init__(self, api_key, instruction, model='gemini-pro', max_tokens=300, temperature=0.0, + action_space="computer_13"): + genai.configure(api_key=api_key) + self.instruction = instruction + self.model = genai.GenerativeModel(model) + self.max_tokens = max_tokens + self.temperature = temperature + self.action_space = action_space + + self.trajectory = [ + { + "role": "system", + "parts": [ + { + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) + ] + } + ] + + def predict(self, obs: 
Dict) -> List: + """ + Predict the next action(s) based on the current observation. + Only support single-round conversation, only fill-in the last desktop screenshot. + """ + accessibility_tree = obs["accessibility_tree"] + + leaf_nodes = find_leaf_nodes(accessibility_tree) + filtered_nodes = filter_nodes(leaf_nodes) + + linearized_accessibility_tree = "tag\ttext\tposition\tsize\n" + # Linearize the accessibility tree nodes into a table format + + for node in filtered_nodes: + linearized_accessibility_tree += node.tag + "\t" + linearized_accessibility_tree += node.attrib.get('name') + "\t" + linearized_accessibility_tree += node.attrib.get( + '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" + linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" + + self.trajectory.append({ + "role": "user", + "parts": [ + "Given the XML format of accessibility tree (convert and formatted into table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree)] + }) + + # todo: Remove this step once the Gemini supports multi-round conversation + all_message_str = "" + for i in range(len(self.trajectory)): + if i == 0: + all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n" + elif i % 2 == 1: + all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n" + else: + all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n" + + all_message_str += all_message_template.format(self.trajectory[i]["parts"][0]) + + print("All message: >>>>>>>>>>>>>>>> ") + print( + all_message_str + ) + + message_for_gemini = { + "role": "user", + "parts": [all_message_str] + } + + traj_to_show = [] + for i in range(len(self.trajectory)): + traj_to_show.append(self.trajectory[i]["parts"][0]) + if len(self.trajectory[i]["parts"]) > 1: + traj_to_show.append("screenshot_obs") + + print("Trajectory:", traj_to_show) + + while True: + try: + response = self.model.generate_content( + message_for_gemini, + generation_config={ + "max_output_tokens": self.max_tokens, + "temperature": self.temperature + } + ) + break + except: + print("Failed to generate response, retrying...") + time.sleep(5) + pass + + try: + response_text = response.text + except: + return [] + + try: + actions = self.parse_actions(response_text) + except: + print("Failed to parse action from response:", response_text) + actions = [] + + return actions + + def parse_actions(self, response: str): + # parse from the response + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) + + # add action into the trajectory + self.trajectory.append({ + "role": "assistant", + "parts": [response] + }) + + return actions diff --git a/mm_agents/gemini_pro_vision_agent.py b/mm_agents/gemini_pro_vision_agent.py new file mode 100644 index 0000000..2d5d365 --- /dev/null +++ b/mm_agents/gemini_pro_vision_agent.py @@ -0,0 +1,113 @@ +import time +from typing import Dict, List + +import PIL.Image +import google.generativeai as genai + +from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string +from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION +from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE + + +class GeminiProV_Agent: + def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, 
temperature=0.0, + action_space="computer_13"): + genai.configure(api_key=api_key) + self.instruction = instruction + self.model = genai.GenerativeModel(model) + self.max_tokens = max_tokens + self.temperature = temperature + self.action_space = action_space + + self.trajectory = [ + { + "role": "system", + "parts": [ + { + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) + ] + } + ] + + def predict(self, obs: Dict) -> List: + """ + Predict the next action(s) based on the current observation. + Only support single-round conversation, only fill-in the last desktop screenshot. + """ + img = PIL.Image.open(obs["screenshot"]) + self.trajectory.append({ + "role": "user", + "parts": ["What's the next step that you will do to help with the task?", img] + }) + + # todo: Remove this step once the Gemini supports multi-round conversation + all_message_str = "" + for i in range(len(self.trajectory)): + if i == 0: + all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n" + elif i % 2 == 1: + all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n" + else: + all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n" + + all_message_str += all_message_template.format(self.trajectory[i]["parts"][0]) + + message_for_gemini = { + "role": "user", + "parts": [all_message_str, img] + } + + traj_to_show = [] + for i in range(len(self.trajectory)): + traj_to_show.append(self.trajectory[i]["parts"][0]) + if len(self.trajectory[i]["parts"]) > 1: + traj_to_show.append("screenshot_obs") + + print("Trajectory:", traj_to_show) + + while True: + try: + response = self.model.generate_content( + message_for_gemini, + generation_config={ + "max_output_tokens": self.max_tokens, + "temperature": self.temperature + } + ) + break + except: + print("Failed to generate response, retrying...") + time.sleep(5) + pass + + try: + response_text = response.text + except: + return [] + + try: + actions = self.parse_actions(response_text) + except: + print("Failed to parse action from response:", response_text) + actions = [] + + return actions + + def parse_actions(self, response: str): + # parse from the response + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) + + # add action into the trajectory + self.trajectory.append({ + "role": "assistant", + "parts": [response] + }) + + return actions diff --git a/mm_agents/gpt_4_agent.py b/mm_agents/gpt_4_agent.py new file mode 100644 index 0000000..aa19185 --- /dev/null +++ b/mm_agents/gpt_4_agent.py @@ -0,0 +1,195 @@ +import base64 +import json +import re +import time +from typing import Dict, List + +import requests + +from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes +from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION +from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE + + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + +def parse_actions_from_string(input_string): + # Search for a JSON string within the input string + actions = [] + matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL) + if matches: + # Assuming there's only one match, parse the 
JSON string into a dictionary + try: + for match in matches: + action_dict = json.loads(match) + actions.append(action_dict) + return actions + except json.JSONDecodeError as e: + return f"Failed to parse JSON: {e}" + else: + matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL) + if matches: + # Assuming there's only one match, parse the JSON string into a dictionary + try: + for match in matches: + action_dict = json.loads(match) + actions.append(action_dict) + return actions + except json.JSONDecodeError as e: + return f"Failed to parse JSON: {e}" + else: + try: + action_dict = json.loads(input_string) + return [action_dict] + except json.JSONDecodeError as e: + raise ValueError("Invalid response format: " + input_string) + + +def parse_code_from_string(input_string): + # This regular expression will match both ```code``` and ```python code``` + # and capture the `code` part. It uses a non-greedy match for the content inside. + pattern = r"```(?:\w+\s+)?(.*?)```" + # Find all non-overlapping matches in the string + matches = re.findall(pattern, input_string, re.DOTALL) + + # The regex above captures the content inside the triple backticks. + # The `re.DOTALL` flag allows the dot `.` to match newline characters as well, + # so the code inside backticks can span multiple lines. + + # matches now contains all the captured code snippets + + codes = [] + + for match in matches: + match = match.strip() + commands = ['WAIT', 'DONE', 'FAIL'] # fixme: updates this part when we have more commands + + if match in commands: + codes.append(match.strip()) + elif match.split('\n')[-1] in commands: + if len(match.split('\n')) > 1: + codes.append("\n".join(match.split('\n')[:-1])) + codes.append(match.split('\n')[-1]) + else: + codes.append(match) + + return codes + + +class GPT4_Agent: + def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"): + self.instruction = instruction + self.model = model + self.max_tokens = max_tokens + self.action_space = action_space + + self.headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + + self.trajectory = [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": { + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) + }, + ] + } + ] + + def predict(self, obs: Dict) -> List: + """ + Predict the next action(s) based on the current observation. 
+ """ + accessibility_tree = obs["accessibility_tree"] + + leaf_nodes = find_leaf_nodes(accessibility_tree) + filtered_nodes = filter_nodes(leaf_nodes) + + linearized_accessibility_tree = "tag\ttext\tposition\tsize\n" + # Linearize the accessibility tree nodes into a table format + + for node in filtered_nodes: + linearized_accessibility_tree += node.tag + "\t" + linearized_accessibility_tree += node.attrib.get('name') + "\t" + linearized_accessibility_tree += node.attrib.get( + '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" + linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" + + self.trajectory.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree) + } + ] + }) + + # print( + # "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + # linearized_accessibility_tree) + # ) + + traj_to_show = [] + for i in range(len(self.trajectory)): + traj_to_show.append(self.trajectory[i]["content"][0]["text"]) + if len(self.trajectory[i]["content"]) > 1: + traj_to_show.append("screenshot_obs") + + payload = { + "model": self.model, + "messages": self.trajectory, + "max_tokens": self.max_tokens + } + + while True: + try: + response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, + json=payload) + break + except: + print("Failed to generate response, retrying...") + time.sleep(5) + pass + + try: + actions = self.parse_actions(response.json()['choices'][0]['message']['content']) + except: + print("Failed to parse action from response:", response.json()) + actions = None + + return actions + + def parse_actions(self, response: str): + # parse from the response + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) + + # add action into the trajectory + self.trajectory.append({ + "role": "assistant", + "content": [ + { + "type": "text", + "text": response + }, + ] + }) + + return actions diff --git a/mm_agents/gpt_4_prompt_action.py b/mm_agents/gpt_4_prompt_action.py new file mode 100644 index 0000000..3019074 --- /dev/null +++ b/mm_agents/gpt_4_prompt_action.py @@ -0,0 +1,244 @@ +SYS_PROMPT = """ +You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. +For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. 
+ +HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: +ACTION_SPACE = [ + { + "action_type": "MOVE_TO", + "note": "move the cursor to the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "CLICK", + "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + }, + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + }, + "num_clicks": { + "type": int, + "range": [1, 2, 3], + "optional": True, + }, + } + }, + { + "action_type": "MOUSE_DOWN", + "note": "press the left button if the button not specified, otherwise press the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "MOUSE_UP", + "note": "release the left button if the button not specified, otherwise release the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "RIGHT_CLICK", + "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DOUBLE_CLICK", + "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DRAG_TO", + "note": "drag the cursor to the specified position with the left button pressed", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "SCROLL", + "note": "scroll the mouse wheel up or down", + "parameters": { + "dx": { + "type": int, + "range": None, + "optional": False, + }, + "dy": { + "type": int, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "TYPING", + "note": "type the specified text", + "parameters": { + "text": { + "type": str, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "PRESS", + "note": "press the specified key and release it", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_DOWN", + "note": "press the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_UP", + "note": "release the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "HOTKEY", + "note": "press the specified key combination", + "parameters": { + "keys": { + "type": list, + 
"range": [KEYBOARD_KEYS], + "optional": False, + } + } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", + } +] +Firstly you need to predict the class of your action, then you need to predict the parameters of your action: +- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) +for example, format as: +``` +{ + "action_type": "MOUSE_MOVE", + "x": 1319.11, + "y": 65.06 +} +``` +- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: +for example, format as: +``` +{ + "action_type": "CLICK", + "click_type": "LEFT" +} +``` +- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard +for example, format as: +``` +{ + "action_type": "KEY", + "key": "ctrl+c" +} +``` +- For TYPE, you need to specify the text you want to type +for example, format as: +``` +{ + "action_type": "TYPE", + "text": "hello world" +} +``` + +REMEMBER: +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +You MUST wrap the dict with backticks (\`). +You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. +You CAN predict multiple actions at one step, but you should only return one action for each step. +""" \ No newline at end of file diff --git a/mm_agents/gpt_4_prompt_code.py b/mm_agents/gpt_4_prompt_code.py new file mode 100644 index 0000000..25e4083 --- /dev/null +++ b/mm_agents/gpt_4_prompt_code.py @@ -0,0 +1,18 @@ +SYS_PROMPT = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. + +You are required to use `pyautogui` to perform the action. +Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
+""" \ No newline at end of file diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index 203b40c..d594b76 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -1,7 +1,8 @@ import base64 import json import re -from typing import Dict +import time +from typing import Dict, List import requests @@ -63,7 +64,8 @@ def parse_code_from_string(input_string): class GPT4v_Agent: - def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): + def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): + self.instruction = instruction self.model = model self.max_tokens = max_tokens self.action_space = action_space @@ -80,15 +82,15 @@ class GPT4v_Agent: { "type": "text", "text": { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) }, ] } ] - def predict(self, obs: Dict): + def predict(self, obs: Dict) -> List: """ Predict the next action(s) based on the current observation. """ @@ -98,8 +100,7 @@ class GPT4v_Agent: "content": [ { "type": "text", - "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format( - obs["instruction"]) + "text": "What's the next step that you will do to help with the task?" }, { "type": "image_url", @@ -123,33 +124,32 @@ class GPT4v_Agent: "messages": self.trajectory, "max_tokens": self.max_tokens } - response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) + while True: + try: + response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, + json=payload) + break + except: + print("Failed to generate response, retrying...") + time.sleep(5) + pass try: actions = self.parse_actions(response.json()['choices'][0]['message']['content']) except: - # todo: add error handling - print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) + print("Failed to parse action from response:", response.json()) actions = None return actions def parse_actions(self, response: str): - # response example - """ - ```json - { - "action_type": "CLICK", - "click_type": "RIGHT" - } - ``` - """ - # parse from the response if self.action_space == "computer_13": actions = parse_actions_from_string(response) elif self.action_space == "pyautogui": actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) # add action into the trajectory self.trajectory.append({ diff --git a/mm_agents/gpt_4v_prompt_action.py b/mm_agents/gpt_4v_prompt_action.py index 650b136..4323df6 100644 --- a/mm_agents/gpt_4v_prompt_action.py +++ b/mm_agents/gpt_4v_prompt_action.py @@ -237,7 +237,7 @@ for example, format as: ``` REMEMBER: -For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. You MUST wrap the dict with backticks (\`). You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. You CAN predict multiple actions at one step, but you should only return one action for each step. 
diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py
index 17e8c9d..8f256da 100644
--- a/mm_agents/gpt_4v_prompt_code.py
+++ b/mm_agents/gpt_4v_prompt_code.py
@@ -1,11 +1,18 @@
 SYS_PROMPT = """
-You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
-For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
+You are an agent which follow my instruction and perform desktop computer tasks as instructed.
+You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
+For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image.

 You are required to use `pyautogui` to perform the action.
 Return one line or multiple lines of python code to perform the action each time, be time efficient.
+You ONLY need to return the code inside a code block, like this:
+```python
+# your code here
+```
+Specially, it is also allowed to return the following special code:
+When you think you have to wait for some time, return ```WAIT```;
+When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
+When you think the task is done, return ```DONE```.

-When you think you have to wait for some time, return `WAIT`.
-When you think the task can not be done, return `FAIL`.
-When you think the task is done, return `DONE`.
+First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
 """
\ No newline at end of file
diff --git a/mm_agents/gui_som/READAME.md b/mm_agents/gui_som/READAME.md
new file mode 100644
index 0000000..05b15ba
--- /dev/null
+++ b/mm_agents/gui_som/READAME.md
@@ -0,0 +1 @@
+Deprecated since we found we can use `accelerator` to do the same thing. But it can potentially be used in the future when only screen access is available.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index a13f733..f2c4cc3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,3 +30,5 @@ ImageHash
 scikit-image
 librosa
 pymupdf
+chardet
+playwright
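
The following sketch is illustrative and not part of the patch: it shows how the `WAIT`/`FAIL`/`DONE` protocol introduced in the prompt files is consumed by `parse_code_from_string` (added in `mm_agents/gpt_4_agent.py`) and then routed by `DesktopEnv.step`. The model replies below are made up for demonstration.

```python
from mm_agents.gpt_4_agent import parse_code_from_string

FENCE = "`" * 3  # built programmatically only to avoid nesting triple backticks in this block

# A made-up reply that follows the code-style SYS_PROMPT: a short reflection, then a code block.
code_reply = (
    "The Impress window is focused, so I can type the title now.\n"
    f"{FENCE}python\nimport pyautogui\npyautogui.typewrite('History of Baseball', interval=0.05)\n{FENCE}"
)
print(parse_code_from_string(code_reply))
# ["import pyautogui\npyautogui.typewrite('History of Baseball', interval=0.05)"]

# A made-up reply that uses one of the special codes instead of code.
wait_reply = f"The file manager is still loading, so I should wait.\n{FENCE}WAIT{FENCE}"
print(parse_code_from_string(wait_reply))
# ['WAIT']

# In experiment.py each parsed item is passed to env.step(action). With the "pyautogui"
# action space, DesktopEnv now sends 'WAIT'/'FAIL'/'DONE' to controller.execute_action,
# which simply returns, while every other string goes to controller.execute_python_command.
```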
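
Also illustrative rather than part of the patch: a minimal sketch of the accessibility-tree linearization that the new text-only agents (`gpt_4_agent.py`, `gemini_pro_agent.py`) perform before prompting, built on the helpers added in `mm_agents/accessibility_tree_wrap/heuristic_retrieve.py`. The XML fragment is invented; real trees arrive in `observation["accessibility_tree"]`.

```python
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes

# Invented AT-SPI-style fragment that declares the attribute namespaces the helpers look up.
atree = (
    '<desktop-frame xmlns:st="uri:deskat:state.at-spi.gnome.org" '
    'xmlns:cp="uri:deskat:component.at-spi.gnome.org">'
    '<push-button name="OK" st:visible="true" st:focusable="true" '
    'cp:screencoord="(100, 200)" cp:size="(80, 30)"/>'
    '</desktop-frame>'
)

# Keep only visible, positioned leaf nodes, then build the same tab-separated table
# the agents place into the prompt.
nodes = filter_nodes(find_leaf_nodes(atree))
linearized = "tag\ttext\tposition\tsize\n"
for node in nodes:
    linearized += "\t".join([
        node.tag,
        node.attrib.get("name", ""),
        node.attrib.get("{uri:deskat:component.at-spi.gnome.org}screencoord", ""),
        node.attrib.get("{uri:deskat:component.at-spi.gnome.org}size", ""),
    ]) + "\n"
print(linearized)
# tag   text  position    size
# push-button   OK    (100, 200)  (80, 30)
```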