Feat/claude cua support (#253)

* feat: add claude support

* feat: add script for end-to-end evaluation with logging and task distribution

* feat&fix: add tool result handling and update model default in evaluation script

* chore: remove run_test_env.py script

* feat&fix: implement action parsing for tool calls and update default action space

* fix: update text formatting in action parsing and replace logger import

* feat&fix: implement action parsing for tool calls and add screen size handling

* feat: add setup instructions for Anthropic API integration

* feat: add notice about image size limitations for Anthropic API

* Delete test_env/logger.py

* Delete test_env/utils.py
This commit is contained in:
Zilong Zhou
2025-07-13 21:10:49 +08:00
committed by GitHub
parent 38a30734a6
commit 349f2fd9fe
13 changed files with 1975 additions and 4 deletions

View File

@@ -19,6 +19,8 @@ Metric = Callable[[Any, Any], float]
Getter = Callable[[gym.Env, Dict[str, Any]], Any]
MAX_RETRIES = 5 # Maximum retries for environment setup
class DesktopEnv(gym.Env):
"""
@@ -115,7 +117,7 @@ class DesktopEnv(gym.Env):
# mode: human or machine
self.instruction = None
assert action_space in ["computer_13", "pyautogui"]
assert action_space in ["computer_13", "pyautogui", "claude_computer_use"]
self.action_space = action_space # todo: refactor it to the ActType
# episodic stuffs, like counters, will be updated or reset
@@ -318,7 +320,7 @@ class DesktopEnv(gym.Env):
reward = 0 # todo: Define reward calculation for each example
done = False # todo: Define episode termination condition for each example
info = {}
logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
# handle the special actions
if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action['action_type'] in ['WAIT', 'FAIL', 'DONE']):
if action == 'WAIT':
@@ -333,12 +335,15 @@ class DesktopEnv(gym.Env):
if self.action_space == "computer_13":
# the set of all possible actions defined in the action representation
self.controller.execute_action(action)
elif self.action_space == "pyautogui":
elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
if action in ['WAIT', 'FAIL', 'DONE']:
self.controller.execute_action(action)
else:
# the set of all possible python commands inside `pyautogui`
self.controller.execute_python_command(action)
if type(action) == str:
self.controller.execute_python_command(action)
elif type(action) == dict:
self.controller.execute_python_command(action['command'])
time.sleep(pause)
observation = self._get_obs()