diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py index 2e2c804..39d1196 100644 --- a/desktop_env/controllers/python.py +++ b/desktop_env/controllers/python.py @@ -39,3 +39,10 @@ class PythonController: return response.json() except requests.exceptions.RequestException as e: print("An error occurred while trying to execute the command:", e) + + def execute_action(self, action: str): + """ + Executes an action on the server computer. + """ + + raise NotImplementedError \ No newline at end of file diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index 5485fe0..3fbb92f 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -35,7 +35,6 @@ class DesktopEnv(gym.Env): print("Initializing...") self._start_emulator() self.controller = PythonController(http_server=self.host) - # todo: define the action space and the observation space as gym did def _start_emulator(self): @@ -87,6 +86,7 @@ class DesktopEnv(gym.Env): return observation def step(self, action): + # todo: support both our own custom-designed action space and executable pyautogui code as actions # Our action space is the set of all possible python commands insides `pyautogui` self.controller.execute_python_command(action) observation = self._get_obs() diff --git a/mm_agents/gpt_4v_prompt.py b/mm_agents/gpt_4v_prompt.py deleted file mode 100644 index 11705e3..0000000 --- a/mm_agents/gpt_4v_prompt.py +++ /dev/null @@ -1,54 +0,0 @@ -SYS_PROMPT = """ -You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. -For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. 
-Here is the description of the action space: - -Firstly you need to predict the class of your action, select from one below: -- **MOUSE_MOVE**: move the mouse to a specific position -- **CLICK**: click on the screen -- **MOUSE_DOWN**: press the mouse button -- **MOUSE_UP**: release the mouse button -- **KEY**: press a key on the keyboard -- **KEY_DOWN**: press a key on the keyboard -- **KEY_UP**: release a key on the keyboard -- **TYPE**: type a string on the keyboard - -Then you need to predict the parameters of your action: -- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) -for example, format as: -``` -{ - "action_type": "MOUSE_MOVE", - "x": 1319.11, - "y": 65.06 -} -``` -- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: -for example, format as: -``` -{ - "action_type": "CLICK", - "click_type": "LEFT" -} -``` -- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard -for example, format as: -``` -{ - "action_type": "KEY", - "key": "ctrl+c" -} -``` -- For TYPE, you need to specify the text you want to type -for example, format as: -``` -{ - "action_type": "TYPE", - "text": "hello world" -} -``` - -For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`). -You can predict multiple actions at one step, but you should only return one action for each step. -You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. 
-""" \ No newline at end of file diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py new file mode 100644 index 0000000..f04602c --- /dev/null +++ b/mm_agents/gpt_4v_prompt_code.py @@ -0,0 +1,8 @@ +SYS_PROMPT = """ +You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. +For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. + +You are required to use `pyautogui` to perform the action. +Return one line or multiple lines of python code to perform the action each time, be time efficient. +Return `None` if you cannot perform the action. +""" \ No newline at end of file