Feat/claude cua support (#253)

* feat: add claude support
* feat: add script for end-to-end evaluation with logging and task distribution
* feat&fix: add tool result handling and update model default in evaluation script
* chore: remove run_test_env.py script
* feat&fix: implement action parsing for tool calls and update default action space
* fix: update text formatting in action parsing and replace logger import
* feat&fix: implement action parsing for tool calls and add screen size handling
* feat: add setup instructions for Anthropic API integration
* feat: add notice about image size limitations for Anthropic API
* Delete test_env/logger.py
* Delete test_env/utils.py
2025-07-13 21:10:49 +08:00
parent 38a30734a6
commit 349f2fd9fe
13 changed files with 1975 additions and 4 deletions
--- a/desktop_env/desktop_env.py
+++ b/desktop_env/desktop_env.py
@@ -19,6 +19,8 @@ Metric = Callable[[Any, Any], float]
 Getter = Callable[[gym.Env, Dict[str, Any]], Any]

 MAX_RETRIES = 5 # Maximum retries for environment setup
+            
+

 class DesktopEnv(gym.Env):
    """
@@ -115,7 +117,7 @@ class DesktopEnv(gym.Env):

            # mode: human or machine
            self.instruction = None
-            assert action_space in ["computer_13", "pyautogui"]
+            assert action_space in ["computer_13", "pyautogui", "claude_computer_use"]
            self.action_space = action_space  # todo: refactor it to the ActType

            # episodic stuffs, like counters, will be updated or reset
@@ -318,7 +320,7 @@ class DesktopEnv(gym.Env):
        reward = 0  # todo: Define reward calculation for each example
        done = False  # todo: Define episode termination condition for each example
        info = {}
-
+        logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
        # handle the special actions
        if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action['action_type'] in ['WAIT', 'FAIL', 'DONE']):
            if action == 'WAIT':
@@ -333,12 +335,15 @@ class DesktopEnv(gym.Env):
        if self.action_space == "computer_13":
            # the set of all possible actions defined in the action representation
            self.controller.execute_action(action)
-        elif self.action_space == "pyautogui":
+        elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
            if action in ['WAIT', 'FAIL', 'DONE']:
                self.controller.execute_action(action)
            else:
                # the set of all possible python commands insides `pyautogui`
-                self.controller.execute_python_command(action)
+                if type(action) == str:
+                    self.controller.execute_python_command(action)
+                elif type(action) == dict:
+                    self.controller.execute_python_command(action['command'])

        time.sleep(pause)
        observation = self._get_obs()