update clean code (#213)

2025-06-10 22:18:03 +08:00
parent 4ce05b89ae
commit 362499330e
13 changed files with 0 additions and 2309 deletions
--- a/aws/README.md
+++ b/aws/README.md
@@ -1,54 +0,0 @@
-# AWS CLI v2
-
-This bundle contains a built executable of the AWS CLI v2.
-
-## Installation
-
-To install the AWS CLI v2, run the `install` script:
-```
-$ sudo ./install 
-You can now run: /usr/local/bin/aws --version
-```
-This will install the AWS CLI v2 at `/usr/local/bin/aws`.  Assuming
-`/usr/local/bin` is on your `PATH`, you can now run:
-```
-$ aws --version
-```
-
-
-### Installing without sudo
-
-If you don't have ``sudo`` permissions or want to install the AWS
-CLI v2 only for the current user, run the `install` script with the `-b`
-and `-i` options:
-```
-$ ./install -i ~/.local/aws-cli -b ~/.local/bin
-``` 
-This will install the AWS CLI v2 in `~/.local/aws-cli` and create
-symlinks for `aws` and `aws_completer` in `~/.local/bin`. For more
-information about these options, run the `install` script with `-h`:
-```
-$ ./install -h
-```
-
-### Updating
-
-If you run the `install` script and there is a previously installed version
-of the AWS CLI v2, the script will error out. To update to the version included
-in this bundle, run the `install` script with `--update`:
-```
-$ sudo ./install --update
-```
-
-
-### Removing the installation
-
-To remove the AWS CLI v2, delete the its installation and symlinks:
-```
-$ sudo rm -rf /usr/local/aws-cli
-$ sudo rm /usr/local/bin/aws
-$ sudo rm /usr/local/bin/aws_completer
-```
-Note if you installed the AWS CLI v2 using the `-b` or `-i` options, you will
-need to remove the installation and the symlinks in the directories you
-specified.
--- a/aws/THIRD_PARTY_LICENSES
+++ b/aws/THIRD_PARTY_LICENSES
--- a/aws/install
+++ b/aws/install
@@ -1,155 +0,0 @@
-#!/bin/sh
-# Copyright 2012-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-#     http://aws.amazon.com/apache2.0/
-#
-# or in the "license" file accompanying this file. This file is
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-
-usage() {
-  cat 1>&2 <<EOF
-Installs the AWS CLI v2
-
-USAGE:
-    install [FLAGS] [OPTIONS]
-
-FLAGS:
-    -u, --update              Updates the AWS CLI v2 if a different version
-                              is previously installed. By default, this script
-                              will not update the AWS CLI if a previous
-                              installation is detected.
-
-    -h, --help                Prints help information
-
-OPTIONS:
-    -i, --install-dir <path>  The directory to install the AWS CLI v2. By
-                              default, this directory is: /usr/local/aws-cli
-
-    -b, --bin-dir <path>      The directory to store symlinks to executables
-                              for the AWS CLI v2. By default, the directory
-                              used is: /usr/local/bin
-EOF
-}
-
-parse_commandline() {
-  while test $# -gt 0
-  do
-    key="$1"
-	case "$key" in
-	  -i|--install-dir)
-	    PARSED_INSTALL_DIR="$2"
-		shift
-	   ;;
-	  -b|--bin-dir)
-	    PARSED_BIN_DIR="$2"
-		shift
-	   ;;
-	  -u|--update)
-	    PARSED_UPGRADE="yes"
-	  ;;
-	  -h|--help)
-	    usage
-        exit 0
-	  ;;
-	  *)
-	   die "Got an unexpected argument: $1"
-	  ;;
-    esac
-	shift
-  done
-}
-
-set_global_vars() {
-  ROOT_INSTALL_DIR=${PARSED_INSTALL_DIR:-/usr/local/aws-cli}
-  BIN_DIR=${PARSED_BIN_DIR:-/usr/local/bin}
-  UPGRADE=${PARSED_UPGRADE:-no}
-
-  EXE_NAME="aws"
-  COMPLETER_EXE_NAME="aws_completer"
-  INSTALLER_DIR="$( cd "$( dirname "$0" )" >/dev/null 2>&1 && pwd )"
-  INSTALLER_DIST_DIR="$INSTALLER_DIR/dist"
-  INSTALLER_EXE="$INSTALLER_DIST_DIR/$EXE_NAME"
-  AWS_EXE_VERSION=$($INSTALLER_EXE --version | cut -d ' ' -f 1 | cut -d '/' -f 2)
-
-  INSTALL_DIR="$ROOT_INSTALL_DIR/v2/$AWS_EXE_VERSION"
-  INSTALL_DIR="$INSTALL_DIR"
-  INSTALL_DIST_DIR="$INSTALL_DIR/dist"
-  INSTALL_BIN_DIR="$INSTALL_DIR/bin"
-  INSTALL_AWS_EXE="$INSTALL_BIN_DIR/$EXE_NAME"
-  INSTALL_AWS_COMPLETER_EXE="$INSTALL_BIN_DIR/$COMPLETER_EXE_NAME"
-
-  CURRENT_INSTALL_DIR="$ROOT_INSTALL_DIR/v2/current"
-  CURRENT_AWS_EXE="$CURRENT_INSTALL_DIR/bin/$EXE_NAME"
-  CURRENT_AWS_COMPLETER_EXE="$CURRENT_INSTALL_DIR/bin/$COMPLETER_EXE_NAME"
-
-  BIN_AWS_EXE="$BIN_DIR/$EXE_NAME"
-  BIN_AWS_COMPLETER_EXE="$BIN_DIR/$COMPLETER_EXE_NAME"
-}
-
-create_install_dir() {
-  mkdir -p "$INSTALL_DIR" || exit 1
-  {
-    setup_install_dist &&
-    setup_install_bin &&
-    create_current_symlink
-  } || {
-    rm -rf "$INSTALL_DIR"
-    exit 1
-  }
-}
-
-check_preexisting_install() {
-  if [ -L "$CURRENT_INSTALL_DIR" ] && [ "$UPGRADE" = "no" ]
-  then
-    die "Found preexisting AWS CLI installation: $CURRENT_INSTALL_DIR. Please rerun install script with --update flag."
-  fi
-  if [ -d "$INSTALL_DIR" ]
-  then
-    echo "Found same AWS CLI version: $INSTALL_DIR. Skipping install."
-    exit 0
-  fi
-}
-
-setup_install_dist() {
-  cp -r "$INSTALLER_DIST_DIR" "$INSTALL_DIST_DIR"
-}
-
-setup_install_bin() {
-  mkdir -p "$INSTALL_BIN_DIR"
-  ln -s "../dist/$EXE_NAME" "$INSTALL_AWS_EXE"
-  ln -s "../dist/$COMPLETER_EXE_NAME" "$INSTALL_AWS_COMPLETER_EXE"
-}
-
-create_current_symlink() {
-  ln -snf "$INSTALL_DIR" "$CURRENT_INSTALL_DIR"
-}
-
-create_bin_symlinks() {
-  mkdir -p "$BIN_DIR"
-  ln -sf "$CURRENT_AWS_EXE" "$BIN_AWS_EXE"
-  ln -sf "$CURRENT_AWS_COMPLETER_EXE" "$BIN_AWS_COMPLETER_EXE"
-}
-
-die() {
-	err_msg="$1"
-	echo "$err_msg" >&2
-	exit 1
-}
-
-main() {
-  parse_commandline "$@"
-  set_global_vars
-  check_preexisting_install
-  create_install_dir
-  create_bin_symlinks
-  echo "You can now run: $BIN_AWS_EXE --version"
-  exit 0
-}
-
-main "$@" || exit 1
--- a/fake_run_single.py
+++ b/fake_run_single.py
@@ -1,65 +0,0 @@
-import datetime
-import json
-import logging
-import os
-import time
-from wrapt_timeout_decorator import *
-
-logger = logging.getLogger("desktopenv.experiment")
-
-
-def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
-    runtime_logger = setup_logger(example, example_result_dir)
-    agent.reset(runtime_logger)
-    env.reset(task_config=example)
-    # time.sleep(60) # Wait for the environment to be ready
-    obs = env._get_obs() # Get the initial observation
-    done = False
-    step_idx = 0
-    env.controller.start_recording()
-    while not done and step_idx < max_steps:
-        response, actions = agent.predict(
-            instruction,
-            obs
-        )
-
-        for action in actions:
-            # Capture the timestamp before executing the action
-            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-            logger.info("Step %d: %s", step_idx + 1, action)
-            obs, reward, done, info = env.step(action, args.sleep_after_execution)
-
-            logger.info("Reward: %.2f", reward)
-            logger.info("Done: %s", done)
-            # Save screenshot and trajectory information
-            with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
-                      "wb") as _f:
-                _f.write(obs['screenshot'])
-            with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
-                f.write(json.dumps({
-                    "step_num": step_idx + 1,
-                    "action_timestamp": action_timestamp,
-                    "action": action,
-                    "reward": reward,
-                    "done": done,
-                    "info": info,
-                    "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
-                }))
-                f.write("\n")
-            if done:
-                logger.info("The episode is done.")
-                break
-        step_idx += 1
-    result = env.evaluate()
-    logger.info("Result: %.2f", result)
-    scores.append(result)
-    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
-        f.write(f"{result}\n")
-    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
-
-
-def setup_logger(example, example_result_dir):
-    runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}")
-    runtime_logger.setLevel(logging.DEBUG)
-    runtime_logger.addHandler(logging.FileHandler(os.path.join(example_result_dir, "runtime.log")))
-    return runtime_logger
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -1,9 +0,0 @@
-python run_multienv_openaicua.py \
--headless \
--observation_type screenshot \
--model computer-use-preview \
--result_dir ./results_all \
--test_all_meta_path evaluation_examples/test_all.json \
--region us-east-1 \
--max_steps 150 \
--num_envs 1
--- a/run_test_env.py
+++ b/run_test_env.py
@@ -1,376 +0,0 @@
-"""Script to run end-to-end evaluation on the benchmark.
-Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
-"""
-
-import argparse
-import datetime
-import json
-import logging
-import os
-import sys
-from typing import List, Dict
-import math
-from tqdm import tqdm
-from multiprocessing import Process, Manager
-import fake_run_single
-from test_env import DesktopEnv
-from mm_agents.agent import PromptAgent
-
-# import wandb
-
-
-#  Logger Configs {{{ #
-logger = logging.getLogger()
-logger.setLevel(logging.DEBUG)
-
-datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-
-file_handler = logging.FileHandler(
-    os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
-)
-debug_handler = logging.FileHandler(
-    os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
-)
-stdout_handler = logging.StreamHandler(sys.stdout)
-sdebug_handler = logging.FileHandler(
-    os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8"
-)
-
-file_handler.setLevel(logging.INFO)
-debug_handler.setLevel(logging.DEBUG)
-stdout_handler.setLevel(logging.INFO)
-sdebug_handler.setLevel(logging.DEBUG)
-
-formatter = logging.Formatter(
-    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
-)
-file_handler.setFormatter(formatter)
-debug_handler.setFormatter(formatter)
-stdout_handler.setFormatter(formatter)
-sdebug_handler.setFormatter(formatter)
-
-stdout_handler.addFilter(logging.Filter("desktopenv"))
-sdebug_handler.addFilter(logging.Filter("desktopenv"))
-
-logger.addHandler(file_handler)
-logger.addHandler(debug_handler)
-logger.addHandler(stdout_handler)
-logger.addHandler(sdebug_handler)
-#  }}} Logger Configs #
-
-logger = logging.getLogger("desktopenv.experiment")
-
-
-def config() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Run end-to-end evaluation on the benchmark"
-    )
-
-    # environment config
-    parser.add_argument("--path_to_vm", type=str, default=None)
-    parser.add_argument(
-        "--headless", action="store_true", help="Run in headless machine"
-    )
-    parser.add_argument(
-        "--action_space", type=str, default="pyautogui", help="Action type"
-    )
-    parser.add_argument(
-        "--observation_type",
-        choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
-        default="a11y_tree",
-        help="Observation type",
-    )
-    parser.add_argument("--screen_width", type=int, default=1920)
-    parser.add_argument("--screen_height", type=int, default=1080)
-    parser.add_argument("--sleep_after_execution", type=float, default=0.0)
-    parser.add_argument("--max_steps", type=int, default=15)
-
-    # agent config
-    parser.add_argument("--max_trajectory_length", type=int, default=3)
-    parser.add_argument(
-        "--test_config_base_dir", type=str, default="evaluation_examples"
-    )
-
-    # lm config
-    parser.add_argument("--model", type=str, default="gpt-4o")
-    parser.add_argument("--temperature", type=float, default=1.0)
-    parser.add_argument("--top_p", type=float, default=0.9)
-    parser.add_argument("--max_tokens", type=int, default=1500)
-    parser.add_argument("--stop_token", type=str, default=None)
-
-    # example config
-    parser.add_argument("--domain", type=str, default="all")
-    parser.add_argument(
-        "--test_all_meta_path", type=str, default="evaluation_examples/test_all.json"
-    )
-
-    # logging related
-    parser.add_argument("--result_dir", type=str, default="./results")
-    parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel")
-
-    # aws config
-    parser.add_argument(
-        "--region", type=str, default="us-east-1", help="AWS region for the VM"
-    )
-    
-    args = parser.parse_args()
-    return args
-
-
-def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]:
-    """Distribute tasks evenly across environments."""
-    # Flatten the tasks into a single list
-    all_tasks = []
-    for domain, examples in test_all_meta.items():
-        for example_id in examples:
-            all_tasks.append((domain, example_id))
-    
-    # Calculate tasks per environment
-    tasks_per_env = math.ceil(len(all_tasks) / num_envs)
-    
-    # Distribute tasks
-    distributed_tasks = []
-    for i in range(num_envs):
-        env_tasks = {}
-        start_idx = i * tasks_per_env
-        end_idx = min((i + 1) * tasks_per_env, len(all_tasks))
-        
-        for domain, example_id in all_tasks[start_idx:end_idx]:
-            if domain not in env_tasks:
-                env_tasks[domain] = []
-            env_tasks[domain].append(example_id)
-        
-        distributed_tasks.append(env_tasks)
-    
-    return distributed_tasks
-
-
-
-def run_env_tasks(env_idx: int, env: DesktopEnv, agent: PromptAgent, env_tasks: dict, args: argparse.Namespace, shared_scores: list):
-    """Run tasks for a single environment."""
-    logger.info(f"Executing tasks in environment {env_idx + 1}/{args.num_envs}")
-    
-    for domain in tqdm(env_tasks, desc=f"Env{env_idx+1}-Domain"):
-        for example_id in tqdm(env_tasks[domain], desc="Example", leave=False):
-            config_file = os.path.join(
-                args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
-            )
-            with open(config_file, "r", encoding="utf-8") as f:
-                example = json.load(f)
-
-            logger.info(f"[Env {env_idx+1}][Domain]: {domain}")
-            logger.info(f"[Env {env_idx+1}][Example ID]: {example_id}")
-            logger.info(f"[Env {env_idx+1}][Instruction]: {example['instruction']}")
-            
-            example_result_dir = os.path.join(
-                args.result_dir,
-                args.action_space,
-                args.observation_type,
-                args.model,
-                domain,
-                example_id,
-            )
-            os.makedirs(example_result_dir, exist_ok=True)
-
-            # try:
-            fake_run_single.run_single_example(
-                agent,
-                env,
-                example,
-                args.max_steps,
-                example["instruction"],
-                args,
-                example_result_dir,
-                shared_scores,
-            )
-            # except Exception as e:
-            #     logger.error(f"Exception in Env{env_idx+1} {domain}/{example_id}: {e}")
-            #     env.controller.end_recording(
-            #         os.path.join(example_result_dir, "recording.mp4")
-            #     )
-            #     with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
-            #         f.write(
-            #             json.dumps(
-            #                 {"Error": f"Time limit exceeded in {domain}/{example_id}"}
-            #             )
-            #         )
-            #         f.write("\n")
-    
-    env.close()
-
-
-def test(args: argparse.Namespace, test_all_meta: dict) -> None:
-    logger.info("Args: %s", args)
-    
-    distributed_tasks = distribute_tasks(test_all_meta, args.num_envs)
-    
-    # First, set up all environments
-    logger.info("Setting up all environments...")
-    envs = []
-    agents = []
-    
-    for env_idx in range(args.num_envs):
-        logger.info(f"Setting up environment {env_idx + 1}/{args.num_envs}")
-        
-        agent = PromptAgent(
-            model=args.model,
-            max_tokens=args.max_tokens,
-            top_p=args.top_p,
-            temperature=args.temperature,
-            action_space=args.action_space,
-            observation_type=args.observation_type,
-            max_trajectory_length=args.max_trajectory_length,
-        )
-        agents.append(agent)
-
-        from desktop_env.providers.aws.manager import IMAGE_ID_MAP
-        REGION = "us-east-1"
-        env = DesktopEnv(
-            path_to_vm=args.path_to_vm,
-            action_space=agent.action_space,
-
-            provider_name="aws",
-            region=REGION,
-            snapshot_name=IMAGE_ID_MAP[REGION],
-
-            screen_size=(args.screen_width, args.screen_height),
-            headless=args.headless,
-            os_type="Ubuntu",
-            require_a11y_tree=args.observation_type
-            in ["a11y_tree", "screenshot_a11y_tree", "som"],
-        )
-        envs.append(env)
-    
-    logger.info("All environments are ready. Starting parallel task execution...")
-    
-    # Create a shared list for scores across processes
-    with Manager() as manager:
-        shared_scores = manager.list()
-        
-        # Create and start processes for each environment
-        processes = []
-        for env_idx, (env, agent, env_tasks) in enumerate(zip(envs, agents, distributed_tasks)):
-            p = Process(
-                target=run_env_tasks,
-                args=(env_idx, env, agent, env_tasks, args, shared_scores)
-            )
-            processes.append(p)
-            p.start()
-        
-        # Wait for all processes to complete
-        for p in processes:
-            p.join()
-        
-        # Convert shared list to regular list
-        scores = list(shared_scores)
-    
-    logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}")
-
-
-def get_unfinished(
-    action_space, use_model, observation_type, result_dir, total_file_json
-):
-    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
-
-    if not os.path.exists(target_dir):
-        return total_file_json
-
-    finished = {}
-    for domain in os.listdir(target_dir):
-        finished[domain] = []
-        domain_path = os.path.join(target_dir, domain)
-        if os.path.isdir(domain_path):
-            for example_id in os.listdir(domain_path):
-                if example_id == "onboard":
-                    continue
-                example_path = os.path.join(domain_path, example_id)
-                if os.path.isdir(example_path):
-                    if "result.txt" not in os.listdir(example_path):
-                        # empty all files under example_id
-                        for file in os.listdir(example_path):
-                            os.remove(os.path.join(example_path, file))
-                    else:
-                        finished[domain].append(example_id)
-
-    if not finished:
-        return total_file_json
-
-    for domain, examples in finished.items():
-        if domain in total_file_json:
-            total_file_json[domain] = [
-                x for x in total_file_json[domain] if x not in examples
-            ]
-
-    return total_file_json
-
-
-def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
-    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
-    if not os.path.exists(target_dir):
-        print("New experiment, no result yet.")
-        return None
-
-    all_result = []
-
-    for domain in os.listdir(target_dir):
-        domain_path = os.path.join(target_dir, domain)
-        if os.path.isdir(domain_path):
-            for example_id in os.listdir(domain_path):
-                example_path = os.path.join(domain_path, example_id)
-                if os.path.isdir(example_path):
-                    if "result.txt" in os.listdir(example_path):
-                        # empty all files under example_id
-                        try:
-                            all_result.append(
-                                float(
-                                    open(
-                                        os.path.join(example_path, "result.txt"), "r"
-                                    ).read()
-                                )
-                            )
-                        except:
-                            all_result.append(0.0)
-
-    if not all_result:
-        print("New experiment, no result yet.")
-        return None
-    else:
-        print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
-        return all_result
-
-
-if __name__ == "__main__":
-    ####### The complete version of the list of examples #######
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-    args = config()
-
-    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
-        test_all_meta = json.load(f)
-
-    if args.domain != "all":
-        test_all_meta = {args.domain: test_all_meta[args.domain]}
-
-    test_file_list = get_unfinished(
-        args.action_space,
-        args.model,
-        args.observation_type,
-        args.result_dir,
-        test_all_meta,
-    )
-    left_info = ""
-    for domain in test_file_list:
-        left_info += f"{domain}: {len(test_file_list[domain])}\n"
-    logger.info(f"Left tasks:\n{left_info}")
-
-    get_result(
-        args.action_space,
-        args.model,
-        args.observation_type,
-        args.result_dir,
-        test_all_meta,
-    )
-    test(args, test_file_list)
-
-
-# path_to_vm can be a list["xxx","xxx"]
--- a/test_env/init.py
+++ b/test_env/init.py
@@ -1,2 +0,0 @@
-from .fake_python_controller import PythonController
-from .fake_env import DesktopEnv
--- a/test_env/fake_env.py
+++ b/test_env/fake_env.py
@@ -1,128 +0,0 @@
-from typing import Callable, Any, Optional, Tuple
-import os
-from test_env import PythonController
-
-
-class DesktopEnv:
-    def __init__(
-        self,
-        action_space: str = "computer_13",
-        screen_size: Tuple[int] = (1920, 1080),
-        *args: Any,
-        **kwargs: Any,
-        ):
-        self.obs_options = {}  
-        self._step_no = 0
-        self.action_history = []
-        self.action_space = action_space
-        self.resolution = screen_size
-        self.controller = PythonController()
-
-        
-        # Load test screenshots and accessibility trees
-        test_obs_dir = os.path.join(os.path.dirname(__file__), "test_observations")
-
-        self.screenshots = [
-            self._load_image(os.path.join(test_obs_dir, "screenshot0.jpg")),
-            self._load_image(os.path.join(test_obs_dir, "screenshot1.jpg")),
-        ]
-        self.accessibility_trees = [
-            self._load_accessibility_tree(os.path.join(test_obs_dir, "a11y_tree0.txt")),
-            self._load_accessibility_tree(os.path.join(test_obs_dir, "a11y_tree1.txt")),
-        ]
-    
-    def _get_screenshot(self):
-        if self._step_no == 0:
-            return self.screenshots[0]
-        return self.screenshots[1]
-    
-    def _get_accessibility_tree(self):
-        if self._step_no == 0:
-            return self.accessibility_trees[0]
-        return self.accessibility_trees[1]
-
-    def set_obs_options(self, obs_options):
-        print(f"Setting obs options to {obs_options}")
-        self.obs_options = obs_options
-    
-    def _load_image(self, image_path):
-        try:
-            with open(image_path, "rb") as image_file:
-                # Read the image file in binary mode
-                image_data = image_file.read()
-                # Encode the binary data as Base64
-                return image_data
-        except FileNotFoundError:
-            print(f"Error: File not found at {image_path}")
-        except Exception as e:
-            print(f"An error occurred: {e}")
-    
-    def _load_accessibility_tree(self, tree_path):
-        try:
-            with open(tree_path, "r") as tree_file:
-                # Read the accessibility tree file
-                tree_data = tree_file.read()
-                return tree_data
-        except FileNotFoundError:
-            print(f"Error: File not found at {tree_path}")
-        except Exception as e:
-            print(f"An error occurred: {e}")
-
-    def _get_obs(self):
-        obs = {}
-        obs["screenshot"] = self._get_screenshot()
-        obs["accessibility_tree"] = self._get_accessibility_tree()
-        obs["terminal"] = ""
-        obs["instruction"] = "Open Chrome browser"
-        
-        return obs
-    
-    def _start_video_recording(self):
-        pass
-    
-    def _stop_video_recording(self):
-        pass
-    
-    def step(self, action, *args, **kargs) -> Tuple:
-        self._step_no += 1
-        self.action_history.append(action)
-
-        info = {}
-        terminated = False  # todo: Define episode termination condition for each example
-        
-        if action == 'FAIL' or action == 'DONE':
-            terminated = True
-            
-        else:       
-            if self.action_space == "claude_computer_use":
-                tool_result = {
-                    "role": "user",
-                    "content": [
-                        {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_01A09q90qw90lq917835lq9",
-                        "content": [
-                                {
-                                    "type": "image",
-                                    "source": {
-                                        "type": "base64",
-                                        "media_type": "image/jpeg",
-                                        "data": self.screenshots[1],
-                                    }
-                                }
-                            ]
-                        }
-                    ]
-                }
-                info.update({"tool_result": tool_result})
-        
-        return (self._get_obs(), 0, terminated, info)
-    
-    def close(self):
-        self._step_no = 0
-        self.action_history = []
-        self.obs_options = {}
-        self.controller = None
-    
-    def reset(self, *args: Any, **kwargs: Any) -> dict:
-        return self._get_obs()
--- a/test_env/fake_python_controller.py
+++ b/test_env/fake_python_controller.py
@@ -1,50 +0,0 @@
-from typing import Any, Dict, Optional
-
-
-class PythonController:
-    def __init__(self):
-        pass
-
-    def get_screenshot(self) -> Optional[bytes]:
-        pass
-
-    def get_accessibility_tree(self) -> Optional[str]:
-        pass
-
-    def get_terminal_output(self) -> Optional[str]:
-        pass
-
-    def get_file(self, file_path: str) -> Optional[bytes]:
-        pass
-
-    def execute_python_command(self, command: str) -> None:
-        pass
-
-    def execute_action(self, action: Dict[str, Any]):
-        pass
-
-    # Record video
-    def start_recording(self):
-        pass
-
-    def end_recording(self, dest: str):
-        pass
-
-    # Additional info
-    def get_vm_platform(self):
-        pass
-
-    def get_vm_screen_size(self):
-        pass
-
-    def get_vm_window_size(self, app_class_name: str):
-        pass
-
-    def get_vm_wallpaper(self):
-        pass
-
-    def get_vm_desktop_path(self) -> Optional[str]:
-        pass
-
-    def get_vm_directory_tree(self, path) -> Optional[Dict[str, Any]]:
-        pass
--- a/test_env/test_observations/a11y_tree0.txt
+++ b/test_env/test_observations/a11y_tree0.txt
--- a/test_env/test_observations/a11y_tree1.txt
+++ b/test_env/test_observations/a11y_tree1.txt
--- a/test_env/test_observations/screenshot0.jpg
+++ b/test_env/test_observations/screenshot0.jpg
--- a/test_env/test_observations/screenshot1.jpg
+++ b/test_env/test_observations/screenshot1.jpg