Fix the auto-download bug: the default VMware path is now None, which triggers the auto-download behavior (#58)

This commit is contained in:
Xiaochuan Li
2024-07-23 19:28:40 -05:00
committed by GitHub
parent 284a5cf2db
commit c7e3004456

run.py (117 changed lines)

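Most of this diff is mechanical reformatting (Black-style line wrapping, double quotes, trailing commas); the functional change is the --path_to_vm default, which replaces a hard-coded Windows .vmx path with None. Below is a minimal hedged sketch of the fallback this enables, assuming the environment treats a missing path as "provision a VM for me". The names resolve_vm_path and VM_IMAGE_URL and the cache layout are illustrative assumptions, not OSWorld's actual API; the real download logic lives in the DesktopEnv/provider code, which this diff does not touch.

# Hedged sketch (not the repo's actual code): shows how a None path can trigger a download.
import os
import urllib.request
from typing import Optional

VM_IMAGE_URL = "https://example.com/ubuntu-vm.zip"  # placeholder URL, an assumption


def resolve_vm_path(path_to_vm: Optional[str], cache_dir: str = "vm_cache") -> str:
    """Return a usable .vmx path; fetch a default image when none was supplied."""
    if path_to_vm is not None:
        return path_to_vm  # an explicit --path_to_vm keeps the old behavior
    os.makedirs(cache_dir, exist_ok=True)
    archive = os.path.join(cache_dir, "ubuntu-vm.zip")
    if not os.path.exists(archive):
        urllib.request.urlretrieve(VM_IMAGE_URL, archive)  # the auto-download step
    # unpacking the archive and locating the .vmx inside it is elided here
    return os.path.join(cache_dir, "Ubuntu", "Ubuntu.vmx")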
@@ -1,13 +1,13 @@
 """Script to run end-to-end evaluation on the benchmark.
 Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
 """
 import argparse
 import datetime
 import json
 import logging
 import os
 import sys
-# import wandb
 from tqdm import tqdm
@@ -15,16 +15,25 @@ import lib_run_single
 from desktop_env.desktop_env import DesktopEnv
 from mm_agents.agent import PromptAgent
+# import wandb
 # Logger Configs {{{ #
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
 datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
-debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
+file_handler = logging.FileHandler(
+    os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+debug_handler = logging.FileHandler(
+    os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
+)
 stdout_handler = logging.StreamHandler(sys.stdout)
-sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
+sdebug_handler = logging.FileHandler(
+    os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8"
+)
 file_handler.setLevel(logging.INFO)
 debug_handler.setLevel(logging.DEBUG)
@@ -32,7 +41,8 @@ stdout_handler.setLevel(logging.INFO)
 sdebug_handler.setLevel(logging.DEBUG)
 formatter = logging.Formatter(
-    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
+)
 file_handler.setFormatter(formatter)
 debug_handler.setFormatter(formatter)
 stdout_handler.setFormatter(formatter)
@@ -45,30 +55,27 @@ logger.addHandler(file_handler)
 logger.addHandler(debug_handler)
 logger.addHandler(stdout_handler)
 logger.addHandler(sdebug_handler)
 # }}} Logger Configs #
 logger = logging.getLogger("desktopenv.experiment")
 def config() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Run end-to-end evaluation on the benchmark"
     )
     # environment config
-    parser.add_argument("--path_to_vm", type=str,
-                        default=r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx")
+    parser.add_argument("--path_to_vm", type=str, default=None)
     parser.add_argument(
         "--headless", action="store_true", help="Run in headless machine"
     )
-    parser.add_argument("--action_space", type=str, default="pyautogui", help="Action type")
+    parser.add_argument(
+        "--action_space", type=str, default="pyautogui", help="Action type"
+    )
     parser.add_argument(
         "--observation_type",
-        choices=[
-            "screenshot",
-            "a11y_tree",
-            "screenshot_a11y_tree",
-            "som"
-        ],
+        choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
         default="a11y_tree",
         help="Observation type",
     )
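A self-contained check of the new default; the tiny parser below only mirrors the --path_to_vm flag, not the full config() parser:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--path_to_vm", type=str, default=None)

args = parser.parse_args([])                        # flag omitted
assert args.path_to_vm is None                      # previously a hard-coded Windows .vmx path

args = parser.parse_args(["--path_to_vm", "/vms/Ubuntu/Ubuntu.vmx"])
assert args.path_to_vm == "/vms/Ubuntu/Ubuntu.vmx"  # an explicit path is still honored

In practice this means run.py can be launched without --path_to_vm, for example python run.py --headless --observation_type a11y_tree --model gpt-4-0125-preview, and the environment is expected to obtain a VM on its own (per the commit message), while passing an explicit .vmx path keeps the old behavior.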
@@ -79,7 +86,9 @@ def config() -> argparse.Namespace:
     # agent config
     parser.add_argument("--max_trajectory_length", type=int, default=3)
-    parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples")
+    parser.add_argument(
+        "--test_config_base_dir", type=str, default="evaluation_examples"
+    )
     # lm config
     parser.add_argument("--model", type=str, default="gpt-4-0125-preview")
@@ -90,7 +99,9 @@ def config() -> argparse.Namespace:
     # example config
     parser.add_argument("--domain", type=str, default="all")
-    parser.add_argument("--test_all_meta_path", type=str, default="evaluation_examples/test_all.json")
+    parser.add_argument(
+        "--test_all_meta_path", type=str, default="evaluation_examples/test_all.json"
+    )
     # logging related
     parser.add_argument("--result_dir", type=str, default="./results")
@@ -99,18 +110,14 @@ def config() -> argparse.Namespace:
     return args
-def test(
-        args: argparse.Namespace,
-        test_all_meta: dict
-) -> None:
+def test(args: argparse.Namespace, test_all_meta: dict) -> None:
     scores = []
     max_steps = args.max_steps
     # log args
     logger.info("Args: %s", args)
     # set wandb project
-    cfg_args = \
-        {
+    cfg_args = {
         "path_to_vm": args.path_to_vm,
         "headless": args.headless,
         "action_space": args.action_space,
@@ -125,7 +132,7 @@ def test(
         "top_p": args.top_p,
         "max_tokens": args.max_tokens,
         "stop_token": args.stop_token,
-        "result_dir": args.result_dir
+        "result_dir": args.result_dir,
     }
     agent = PromptAgent(
@@ -143,12 +150,15 @@ def test(
         action_space=agent.action_space,
         screen_size=(args.screen_width, args.screen_height),
         headless=args.headless,
-        require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
+        require_a11y_tree=args.observation_type
+        in ["a11y_tree", "screenshot_a11y_tree", "som"],
     )
     for domain in tqdm(test_all_meta, desc="Domain"):
         for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False):
-            config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json")
+            config_file = os.path.join(
+                args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
+            )
             with open(config_file, "r", encoding="utf-8") as f:
                 example = json.load(f)
@@ -160,7 +170,9 @@ def test(
             logger.info(f"[Instruction]: {instruction}")
             # wandb each example config settings
             cfg_args["instruction"] = instruction
-            cfg_args["start_time"] = datetime.datetime.now().strftime("%Y:%m:%d-%H:%M:%S")
+            cfg_args["start_time"] = datetime.datetime.now().strftime(
+                "%Y:%m:%d-%H:%M:%S"
+            )
             # run.config.update(cfg_args)
             example_result_dir = os.path.join(
@@ -169,27 +181,41 @@ def test(
                 args.observation_type,
                 args.model,
                 domain,
-                example_id
+                example_id,
             )
             os.makedirs(example_result_dir, exist_ok=True)
             # example start running
             try:
-                lib_run_single.run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir,
-                                                  scores)
+                lib_run_single.run_single_example(
+                    agent,
+                    env,
+                    example,
+                    max_steps,
+                    instruction,
+                    args,
+                    example_result_dir,
+                    scores,
+                )
             except Exception as e:
                 logger.error(f"Exception in {domain}/{example_id}: {e}")
-                env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+                env.controller.end_recording(
+                    os.path.join(example_result_dir, "recording.mp4")
+                )
                 with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
-                    f.write(json.dumps({
-                        "Error": f"Time limit exceeded in {domain}/{example_id}"
-                    }))
+                    f.write(
+                        json.dumps(
+                            {"Error": f"Time limit exceeded in {domain}/{example_id}"}
+                        )
+                    )
                     f.write("\n")
     env.close()
     logger.info(f"Average score: {sum(scores) / len(scores)}")
-def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
+def get_unfinished(
+    action_space, use_model, observation_type, result_dir, total_file_json
+):
     target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
     if not os.path.exists(target_dir):
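When an example raises, the except branch above stops the recording and appends an error record to that example's traj.jsonl (note the message text is always "Time limit exceeded ...", whatever the actual exception was). A small sketch of scanning such a file afterwards; the helper name and path handling are assumptions, not part of the repo:

import json


def has_error(traj_path: str) -> bool:
    """Return True if any line of a traj.jsonl file carries an "Error" key."""
    with open(traj_path, "r", encoding="utf-8") as f:
        return any("Error" in json.loads(line) for line in f if line.strip())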
@@ -217,7 +243,9 @@ def get_unfinished(action_space, use_model, observation_type, result_dir, total_
     for domain, examples in finished.items():
         if domain in total_file_json:
-            total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples]
+            total_file_json[domain] = [
+                x for x in total_file_json[domain] if x not in examples
+            ]
     return total_file_json
@@ -239,7 +267,13 @@ def get_result(action_space, use_model, observation_type, result_dir, total_file
                 if "result.txt" in os.listdir(example_path):
                     # empty all files under example_id
                     try:
-                        all_result.append(float(open(os.path.join(example_path, "result.txt"), "r").read()))
+                        all_result.append(
+                            float(
+                                open(
+                                    os.path.join(example_path, "result.txt"), "r"
+                                ).read()
+                            )
+                        )
                     except:
                         all_result.append(0.0)
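get_result treats each example's result.txt as a single float score and counts anything unparsable as 0.0. A minimal illustration of that convention, written as a standalone helper (hypothetical, not from the repo):

from pathlib import Path


def read_score(example_dir: str) -> float:
    """Parse result.txt as a float; fall back to 0.0 on a missing or malformed file."""
    try:
        return float(Path(example_dir, "result.txt").read_text().strip())
    except (OSError, ValueError):
        return 0.0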
@@ -251,7 +285,7 @@ def get_result(action_space, use_model, observation_type, result_dir, total_file
     return all_result
-if __name__ == '__main__':
+if __name__ == "__main__":
     ####### The complete version of the list of examples #######
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     args = config()
@@ -267,17 +301,18 @@ if __name__ == '__main__':
         args.model,
         args.observation_type,
         args.result_dir,
-        test_all_meta
+        test_all_meta,
     )
     left_info = ""
     for domain in test_file_list:
         left_info += f"{domain}: {len(test_file_list[domain])}\n"
     logger.info(f"Left tasks:\n{left_info}")
-    get_result(args.action_space,
+    get_result(
+        args.action_space,
         args.model,
         args.observation_type,
         args.result_dir,
-        test_all_meta
+        test_all_meta,
     )
     test(args, test_file_list)