From 48a86d36cf84556933452dced412dc4b08261d65 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Tue, 16 Jan 2024 12:15:21 +0800
Subject: [PATCH] Minor updates

---
 README.md                        |   9 +-
 experiment_pure_text.py          | 131 +++++++++++++++++
 mm_agents/gpt_4_prompt_action.py | 244 +++++++++++++++++++++++++++++++
 mm_agents/gpt_4_prompt_code.py   |  18 +++
 4 files changed, 401 insertions(+), 1 deletion(-)
 create mode 100644 experiment_pure_text.py
 create mode 100644 mm_agents/gpt_4_prompt_action.py
 create mode 100644 mm_agents/gpt_4_prompt_code.py

diff --git a/README.md b/README.md
index b7d56df..5fd8aa6 100644
--- a/README.md
+++ b/README.md
@@ -23,4 +23,11 @@ todo
 - [x] Error handling during file passing and file opening, etc.
 - [x] Add accessibility tree from the OS into the observation space
 - [ ] Add pre-process and post-process action support for benchmarking setup and evaluation
-- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient
\ No newline at end of file
+- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient
+
+## Road map of benchmark, tools and resources (Proposed)
+- [ ] Improve the annotation tool based on DuckTrack and make it more robust by aligning it with the accessibility tree
+- [ ] Annotate the steps of doing the task
+- [ ] Build a website for the project
+- [ ] Crawl all the resources we explored on the internet and make them easy to access
+- [ ] Set up ways for community to contribute new examples
diff --git a/experiment_pure_text.py b/experiment_pure_text.py
new file mode 100644
index 0000000..011c7bf
--- /dev/null
+++ b/experiment_pure_text.py
@@ -0,0 +1,131 @@
+import datetime
+import json
+import logging
+import os
+import sys
+
+from desktop_env.envs.desktop_env import DesktopEnv
+from mm_agents.gpt_4_agent import GPT4_Agent
+from mm_agents.gemini_pro_agent import GeminiPro_Agent
+
+#  Logger Configs {{{ #
+logger = logging.getLogger()  # root logger: the handlers below see records from every module
+logger.setLevel(logging.DEBUG)
+
+datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")  # shared suffix of this run's log files
+os.makedirs("logs", exist_ok=True)  # FileHandler opens its file immediately and fails if logs/ is missing
+file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
+debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
+stdout_handler = logging.StreamHandler(sys.stdout)
+sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
+
+file_handler.setLevel(logging.INFO)  # normal-*.log: INFO and above, all loggers
+debug_handler.setLevel(logging.DEBUG)  # debug-*.log: everything, all loggers
+stdout_handler.setLevel(logging.INFO)  # console: INFO and above, "desktopenv" loggers only (filter below)
+sdebug_handler.setLevel(logging.DEBUG)  # sdebug-*.log: everything, "desktopenv" loggers only (filter below)
+
+formatter = logging.Formatter(  # NOTE: the ANSI colour escapes are also written to the log files
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
+file_handler.setFormatter(formatter)
+debug_handler.setFormatter(formatter)
+stdout_handler.setFormatter(formatter)
+sdebug_handler.setFormatter(formatter)
+
+stdout_handler.addFilter(logging.Filter("desktopenv"))  # pass only the "desktopenv" logger and its children
+sdebug_handler.addFilter(logging.Filter("desktopenv"))
+
+logger.addHandler(file_handler)
+logger.addHandler(debug_handler)
+logger.addHandler(stdout_handler)
+logger.addHandler(sdebug_handler)
+#  }}} Logger Configs #
+
+logger = logging.getLogger("desktopenv.experiment")  # rebinds `logger` from the root logger to this module's logger
+
+PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"  # machine-specific path; adjust locally
+
+
+def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):  # run one task episode and log its trajectory
+    trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
+    env = DesktopEnv(
+        path_to_vm=PATH_TO_VM,
+        action_space=agent.action_space,
+        task_config=example
+    )
+    # Reset the environment for this task and take the initial observation
+    observation = env.reset()
+    done = False
+    step_num = 0
+
+    if recording:
+        # Ask the server to start recording (stopped again after the loop)
+        env.controller.start_recording()
+
+    while not done and step_num < max_steps:
+        actions = agent.predict(observation)
+        step_num += 1  # one step per predict() call; every action it returned shares this number
+        for action in actions:
+            # Capture the timestamp before executing the action (second resolution, reused in the file name)
+            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+            logger.info("Step %d: %s", step_num, action)
+
+            observation, reward, done, info = env.step(action)
+
+            logger.info("Reward: %.2f", reward)
+            logger.info("Done: %s", done)
+            logger.info("Info: %s", info)
+
+            # Copy the screenshot file referenced by the observation into the trajectory directory
+            with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
+                with open(observation['screenshot'], "rb") as __f:
+                    screenshot = __f.read()
+                _f.write(screenshot)
+
+            with open(trajectory_recording_path, "a") as f:  # append mode: one JSON object per line
+                f.write(json.dumps({
+                    "step_num": step_num,
+                    "action_timestamp": action_timestamp,
+                    "action": action,
+                    "reward": reward,
+                    "done": done,
+                    "info": info,
+                    "screenshot_file": f"step_{step_num}_{action_timestamp}.png"
+                }))
+                f.write("\n")
+
+            if done:
+                logger.info("The episode is done.")
+                break
+
+    if recording:
+        # Ask the server to stop recording, passing the destination path for the video
+        env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
+
+    result = env.evaluate()
+    logger.info("Result: %.2f", result)
+
+    # NOTE(review): env.close() is disabled here, so the log message below is not accurate
+    logger.info("Environment closed.")
+
+
+if __name__ == "__main__":
+    action_space = "pyautogui"
+    example_class = "chrome"
+    example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"
+
+    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
+        example = json.load(f)
+    example["snapshot"] = "exp_setup2"  # override the snapshot name in the loaded task config
+
+    # Alternative agent: api_key = os.environ.get("OPENAI_API_KEY")
+    # agent = GPT4_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
+
+    api_key = os.environ.get("GENAI_API_KEY")  # None when the variable is unset
+    agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
+
+    root_trajectory_dir = "exp_trajectory"
+
+    example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
+    os.makedirs(example_trajectory_dir, exist_ok=True)
+
+    run_one_example(example, agent, 10, example_trajectory_dir)
diff --git a/mm_agents/gpt_4_prompt_action.py b/mm_agents/gpt_4_prompt_action.py
new file mode 100644
index 0000000..3019074
--- /dev/null
+++ b/mm_agents/gpt_4_prompt_action.py
@@ -0,0 +1,244 @@
+SYS_PROMPT = """
+You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
+For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.
+
+HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
+ACTION_SPACE = [
+    {
+        "action_type": "MOVE_TO",
+        "note": "move the cursor to the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": False,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "CLICK",
+        "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            },
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            },
+            "num_clicks": {
+                "type": int,
+                "range": [1, 2, 3],
+                "optional": True,
+            },
+        }
+    },
+    {
+        "action_type": "MOUSE_DOWN",
+        "note": "press the left button if the button not specified, otherwise press the specified button",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "MOUSE_UP",
+        "note": "release the left button if the button not specified, otherwise release the specified button",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "RIGHT_CLICK",
+        "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "DOUBLE_CLICK",
+        "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "DRAG_TO",
+        "note": "drag the cursor to the specified position with the left button pressed",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": False,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "SCROLL",
+        "note": "scroll the mouse wheel up or down",
+        "parameters": {
+            "dx": {
+                "type": int,
+                "range": None,
+                "optional": False,
+            },
+            "dy": {
+                "type": int,
+                "range": None,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "TYPING",
+        "note": "type the specified text",
+        "parameters": {
+            "text": {
+                "type": str,
+                "range": None,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "PRESS",
+        "note": "press the specified key and release it",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "KEY_DOWN",
+        "note": "press the specified key",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "KEY_UP",
+        "note": "release the specified key",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "HOTKEY",
+        "note": "press the specified key combination",
+        "parameters": {
+            "keys": {
+                "type": list,
+                "range": [KEYBOARD_KEYS],
+                "optional": False,
+            }
+        }
+    },
+    ############################################################################################################
+    {
+        "action_type": "WAIT",
+        "note": "wait until the next action",
+    },
+    {
+        "action_type": "FAIL",
+        "note": "decide the task can not be performed",
+    },
+    {
+        "action_type": "DONE",
+        "note": "decide the task is done",
+    }
+]
+Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
+- For MOVE_TO, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
+for example, format as:
+```
+{
+  "action_type": "MOVE_TO",
+  "x": 1319.11,
+  "y": 65.06
+}
+```
+- For [CLICK, MOUSE_DOWN, MOUSE_UP], you can specify the button as well, select from [left, right, middle], which means you click the left button, right button or middle button of your mouse; the left button is used if it is not specified:
+for example, format as:
+```
+{
+  "action_type": "CLICK",
+  "button": "left"
+}
+```
+- For [PRESS, KEY_DOWN, KEY_UP], you need to choose a key from the keyboard; for a key combination, use HOTKEY with a list of keys
+for example, format as:
+```
+{
+  "action_type": "HOTKEY",
+  "keys": ["ctrl", "c"]
+}
+```
+- For TYPING, you need to specify the text you want to type
+for example, format as:
+```
+{
+  "action_type": "TYPING",
+  "text": "hello world"
+}
+```
+
+REMEMBER:
+For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+You MUST wrap the dict with backticks (`).
+You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
+You MAY think about multiple candidate actions at one step, but you MUST return only one action for each step.
+"""
\ No newline at end of file
diff --git a/mm_agents/gpt_4_prompt_code.py b/mm_agents/gpt_4_prompt_code.py
new file mode 100644
index 0000000..b057da6
--- /dev/null
+++ b/mm_agents/gpt_4_prompt_code.py
@@ -0,0 +1,18 @@
+SYS_PROMPT = """
+You are an agent that follows my instructions and performs desktop computer tasks as instructed.
+You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
+For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.
+
+You are required to use `pyautogui` to perform the action. 
+Return one line or multiple lines of python code to perform the action each time, be time efficient.
+You ONLY need to return the code inside a code block, like this:
+```python
+# your code here
+```
+Specially, it is also allowed to return the following special code:
+When you think you have to wait for some time, return ```WAIT```;
+When you think the task can not be done, return ```FAIL```;
+When you think the task is done, return ```DONE```.
+
+First give a reflection on the current observation and the previous things we did, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+"""
\ No newline at end of file