Merge remote-tracking branch 'origin/main'

# Conflicts:
#	mm_agents/agent.py
#	run.py
Timothyxxx
2024-03-15 21:10:32 +08:00
11 changed files with 215 additions and 85 deletions

.vscode/launch.json vendored Normal file

@@ -0,0 +1,19 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File with Arguments",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "args": [
                "--path_to_vm", "/Users/lxc/Virtual Machines.localized/DesktopEnv-Ubuntu 64-bit Arm.vmwarevm/DesktopEnv-Ubuntu 64-bit Arm.vmx",
                "--example_time_limit", "60"
            ]
        }
    ]
}
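
The "args" array is handed to whichever Python file is open in the editor; for run.py they map onto argparse flags. A minimal sketch of the relevant parser entries follows (the --path_to_vm declaration and help strings are assumptions; --example_time_limit with its default of 600 appears in the run.py diff further down):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--path_to_vm", type=str,
                    help="path to the .vmx file of the VM under test")
parser.add_argument("--example_time_limit", type=int, default=600,
                    help="wall-clock seconds allowed per example")

# Simulate the launch.json invocation:
args = parser.parse_args(["--path_to_vm", "/path/to/DesktopEnv.vmx",
                          "--example_time_limit", "60"])
print(args.example_time_limit)  # 60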

demo.py Normal file

@@ -0,0 +1,16 @@
import signal
import time

def handler(signo, frame):
    raise RuntimeError("Timeout")

signal.signal(signal.SIGALRM, handler)

while True:
    try:
        signal.alarm(5)  # seconds
        time.sleep(10)
        print("Working...")
    except Exception as e:
        print(e)
        continue
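
demo.py is a scratch test of the SIGALRM pattern that run.py adopts below: the alarm fires mid-sleep, the handler turns it into an exception, and the loop survives. Note that SIGALRM is Unix-only. The same idea packaged as a reusable context manager could look like this (a sketch, not part of the commit; names are illustrative):

import signal
from contextlib import contextmanager

@contextmanager
def time_limit(seconds: int):
    def handler(signo, frame):
        raise RuntimeError("Timeout")
    old = signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)        # arm the timer
    try:
        yield
    finally:
        signal.alarm(0)          # disarm so a late alarm cannot fire afterwards
        signal.signal(signal.SIGALRM, old)  # restore the previous handler

# Usage:
# with time_limit(5):
#     time.sleep(10)   # raises RuntimeError("Timeout") after ~5 s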

desktop_env/envs/desktop_env.py

@@ -174,7 +174,7 @@ class DesktopEnv(gym.Env):
             if isinstance(self.evaluator["func"], list) \
             else getattr(metrics, self.evaluator["func"])
         self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics
-        if "result" in self.evaluator:
+        if "result" in self.evaluator and len(self.evaluator["result"])>0:
             self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in
                                           self.evaluator["result"]] \
                 if isinstance(self.evaluator["result"], list) \
@@ -184,7 +184,7 @@ class DesktopEnv(gym.Env):
             if isinstance(self.metric, list) \
                 else None
-        if "expected" in self.evaluator:
+        if "expected" in self.evaluator and len(self.evaluator["expected"])>0:
             self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in
                                             self.evaluator["expected"]] \
                 if isinstance(self.evaluator["expected"], list) \
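These two hunks tighten the guards from mere key membership to non-emptiness: several example configs (see the JSON hunks below) used to ship empty "expected" and "result" objects, which pass an `in` test but give the getters nothing to work with. A minimal illustration of the distinction:

# A key that is present but maps to an empty dict should behave like an
# absent key (illustrative config fragment).
evaluator = {"func": "infeasible", "result": {}}

print("result" in evaluator)                                    # True
print("result" in evaluator and len(evaluator["result"]) > 0)   # False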


@@ -0,0 +1,16 @@
[Unit]
Description=OSBench Server
StartLimitIntervalSec=60
StartLimitBurst=4
After=network.target auditd.service
[Service]
ExecStart=/usr/bin/python3 /home/user/main.py
User=user
WorkingDirectory=/home/user
Restart=on-failure
RestartSec=1
Environment="DISPLAY=:1"
[Install]
WantedBy=graphical.target


@@ -0,0 +1,16 @@
[Unit]
Description=OSBench Server
StartLimitIntervalSec=60
StartLimitBurst=4
After=network.target auditd.service
[Service]
ExecStart=/usr/bin/python3 /home/user/main.py
User=user
WorkingDirectory=/home/user
Restart=on-failure
RestartSec=1
Environment="DISPLAY=%i"
[Install]
WantedBy=graphical.target
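
The two unit files above differ only in their Environment= line: the first pins the X display to ":1", while the second uses "%i", the systemd instance specifier. "%i" is only meaningful in a template unit, a file whose name ends in "@.service" (the filenames are not shown in this view), where it expands to the instance string the unit is started with. A single template can therefore serve several displays, e.g. starting the instance ":2" renders DISPLAY=:2.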


@@ -10,10 +10,6 @@
         "libreoffice_calc"
     ],
     "evaluator": {
-        "func": "infeasible",
-        "expected": {
-        },
-        "result": {
-        }
+        "func": "infeasible"
     }
 }


@@ -10,10 +10,6 @@
         "libreoffice_calc"
     ],
     "evaluator": {
-        "func": "infeasible",
-        "expected": {
-        },
-        "result": {
-        }
+        "func": "infeasible"
     }
 }


@@ -0,0 +1,19 @@
import pandas as pd

file_path = "/Users/lxc/Downloads/Speedtest.csv"
# Find the value in the second data cell of the second row of the csv
# with open(file_path, "r") as f:
#     for i, line in enumerate(f):
#         if i == 1:
#             data = line.split(",")[1]
#             break
# print(data)
with open(file_path, "r") as f:
    reader = pd.read_csv(f, sep=',')  # first row is the header, so 'TEST_DATE' is a column label
    # for column in reader.columns:
    #     if column.startswith("TEST_DATE"):
    #         data_col = column
    #         break
    for data in reader['TEST_DATE']:
        print(data)
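
The selection reader['TEST_DATE'] only works when pandas treats the first CSV row as the header (the default); with header=None the columns are labeled 0, 1, ... and the lookup raises KeyError. A small illustration on synthetic data (not the real Speedtest.csv):

import io
import pandas as pd

csv = io.StringIO("TEST_DATE,DOWNLOAD\n2024-03-01,100\n2024-03-02,120\n")

named = pd.read_csv(csv, sep=',')          # first row becomes the header
print(named['TEST_DATE'].tolist())         # ['2024-03-01', '2024-03-02']

csv.seek(0)
numbered = pd.read_csv(csv, sep=',', header=None)  # columns are 0, 1, ...
print(numbered[0].tolist())                # ['TEST_DATE', '2024-03-01', '2024-03-02']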


@@ -55,12 +55,12 @@ def judge_node(node: ET, platform="ubuntu") -> bool:
                 or platform=="windows"\
                 and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
             )\
         and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
             or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
             or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
             or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
             )\
         and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)

     coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
     sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
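
The two coordinate lines parse strings of the form "(x, y)" with eval, which would execute arbitrary expressions if the accessibility tree ever carried hostile text. ast.literal_eval accepts the same tuple syntax but nothing else; a drop-in sketch (not part of the commit, names illustrative):

import ast
from typing import Tuple

def parse_pair(text: str, default: str = "(-1, -1)") -> Tuple[int, int]:
    # literal_eval only accepts Python literals, never function calls
    try:
        value = ast.literal_eval(text or default)
    except (ValueError, SyntaxError):
        value = ast.literal_eval(default)
    return (int(value[0]), int(value[1]))

print(parse_pair("(1024, 768)"))        # (1024, 768)
print(parse_pair("__import__('os')"))   # falls back to (-1, -1)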

mm_agents/agent.py

@@ -5,11 +5,13 @@ import os
 import re
 import time
 import uuid
+import openai
 import xml.etree.ElementTree as ET
 from http import HTTPStatus
 from io import BytesIO
 from typing import Dict, List
+from google.api_core.exceptions import InvalidArgument

+import backoff
 import dashscope
 import google.generativeai as genai
 import requests
@@ -22,6 +24,8 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S
     SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
     SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT

+# todo: cross-check with visualwebarena
+
 logger = logging.getLogger("desktopenv.agent")
@@ -506,18 +510,25 @@ class PromptAgent:
         try:
             actions = self.parse_actions(response, masks)
             self.thoughts.append(response)
-        except Exception as e:
+        except ValueError as e:
             print("Failed to parse action from response", e)
             actions = None
             self.thoughts.append("")

         return actions

-    # @backoff.on_exception(
-    #     backoff.expo,
-    #     (Exception),
-    #     max_tries=5
-    # )
+    @backoff.on_exception(
+        backoff.expo,
+        # add more model-specific exceptions here as needed, but do not add the
+        # generic "Exception": it must propagate to the outer handler so that
+        # each example stays within its time limit
+        (openai.RateLimitError,
+         openai.BadRequestError,
+         openai.InternalServerError,
+         InvalidArgument),
+        max_tries=5
+    )
     def call_llm(self, payload):
         if self.model.startswith("gpt"):
@@ -525,7 +536,7 @@ class PromptAgent:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
             }
-            logger.info("Generating content with GPT model: %s", self.model)
+            # logger.info("Generating content with GPT model: %s", self.model)
             response = requests.post(
                 "https://api.openai.com/v1/chat/completions",
                 headers=headers,
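
The rewritten decorator retries only the listed exception types; anything else, in particular the RuntimeError raised by the SIGALRM handler in run.py, escapes on the first throw. A self-contained sketch of that behavior, using a stand-in exception class rather than the real openai errors:

import backoff

class TransientError(Exception):
    """Stand-in for openai.RateLimitError etc. (illustrative)."""

calls = {"n": 0}

@backoff.on_exception(backoff.expo, TransientError, max_tries=3)
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise TransientError("retry me")
    return "ok"

print(flaky())  # retried twice, then "ok"

@backoff.on_exception(backoff.expo, TransientError, max_tries=3)
def timed_out():
    raise RuntimeError("Time limit exceeded!")  # not in the tuple: no retry

try:
    timed_out()
except RuntimeError as e:
    print("propagated immediately:", e)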

run.py

@@ -7,6 +7,7 @@ import json
 import logging
 import os
 import sys
+import signal

 from desktop_env.envs.desktop_env import DesktopEnv
 from mm_agents.agent import PromptAgent
@@ -45,6 +46,10 @@ logger.addHandler(sdebug_handler)

 logger = logging.getLogger("desktopenv.experiment")

+# make sure each example won't exceed the time limit
+def handler(signo, frame):
+    raise RuntimeError("Time limit exceeded!")
+signal.signal(signal.SIGALRM, handler)

 def config() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
@@ -77,6 +82,7 @@ def config() -> argparse.Namespace:
     # agent config
     parser.add_argument("--max_trajectory_length", type=int, default=3)
     parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples")
+    parser.add_argument("--example_time_limit", type=int, default=600)

     # lm config
     parser.add_argument("--model", type=str, default="gpt-4-vision-preview")
@@ -98,6 +104,7 @@ def test(
 ) -> None:
     scores = []
     max_steps = args.max_steps
+    time_limit = args.example_time_limit

     # log args
     logger.info("Args: %s", args)
@@ -119,6 +126,7 @@ def test(
     for domain in test_all_meta:
         for example_id in test_all_meta[domain]:
+            # example setting
             config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json")
             with open(config_file, "r", encoding="utf-8") as f:
                 example = json.load(f)
@@ -140,82 +148,115 @@ def test(
             )
             os.makedirs(example_result_dir, exist_ok=True)

-            agent.reset()
-            obs = env.reset(task_config=example)
-            done = False
-            step_idx = 0
-            env.controller.start_recording()
-
-            # todo: update max running time for each example, @xiaochuan
-            while not done and step_idx < max_steps:
-                actions = agent.predict(
-                    instruction,
-                    obs
-                )
-
-                for action in actions:
-                    step_idx += 1
-                    # Capture the timestamp before executing the action
-                    action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-                    logger.info("Step %d: %s", step_idx + 1, action)
-
-                    obs, reward, done, info = env.step(action, args.sleep_after_execution)
-
-                    logger.info("Reward: %.2f", reward)
-                    logger.info("Done: %s", done)
-                    logger.info("Info: %s", info)
-
-                    # Save screenshot and trajectory information
-                    with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
-                              "wb") as _f:
-                        with open(obs['screenshot'], "rb") as __f:
-                            screenshot = __f.read()
-                        _f.write(screenshot)
-
-                    with open(os.path.join(example_result_dir, "traj.json"), "a") as f:
-                        f.write(json.dumps({
-                            "step_num": step_idx + 1,
-                            "action_timestamp": action_timestamp,
-                            "action": action,
-                            "reward": reward,
-                            "done": done,
-                            "info": info,
-                            "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
-                        }))
-                        f.write("\n")
-
-                    if done:
-                        logger.info("The episode is done.")
-                        break
-
-            try:
-                result = env.evaluate()
-            except Exception as e:
-                logger.error(f"Error in evaluating the example {example_id}: {e}")
-                result = 0.0
-
-            logger.info("Result: %.2f", result)
-            env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
-            scores.append(result)
+            # example start running
+            try:
+                signal.alarm(time_limit)
+                agent.reset()
+                obs = env.reset(task_config=example)
+                done = False
+                step_idx = 0
+                env.controller.start_recording()
+
+                while not done and step_idx < max_steps:
+                    actions = agent.predict(
+                        instruction,
+                        obs
+                    )
+
+                    for action in actions:
+                        # Capture the timestamp before executing the action
+                        action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+                        logger.info("Step %d: %s", step_idx + 1, action)
+
+                        obs, reward, done, info = env.step(action, args.sleep_after_execution)
+
+                        logger.info("Reward: %.2f", reward)
+                        logger.info("Done: %s", done)
+                        logger.info("Info: %s", info)
+
+                        # Save screenshot and trajectory information
+                        with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
+                                  "wb") as _f:
+                            with open(obs['screenshot'], "rb") as __f:
+                                screenshot = __f.read()
+                            _f.write(screenshot)
+
+                        with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                            f.write(json.dumps({
+                                "step_num": step_idx + 1,
+                                "action_timestamp": action_timestamp,
+                                "action": action,
+                                "reward": reward,
+                                "done": done,
+                                "info": info,
+                                "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
+                            }))
+                            f.write("\n")
+
+                        if done:
+                            logger.info("The episode is done.")
+                            break
+                    step_idx += 1
+
+                result = env.evaluate()
+                logger.info("Result: %.2f", result)
+                scores.append(result)
+                env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+            except RuntimeError as e:
+                logger.error(f"Error in example {domain}/{example_id}: {e}")
+                # save info of this example and then continue
+                try:
+                    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                        f.write(json.dumps({
+                            "Error": f"Error in example {domain}/{example_id}: {e}",
+                            "step": step_idx + 1,
+                        }))
+                        f.write("\n")
+                except Exception as new_e:
+                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                        f.write(json.dumps({
+                            "Error": f"Error in example {domain}/{example_id}: {e} and {new_e}",
+                            "step": "before start recording",
+                        }))
+                        f.write("\n")
+                continue

             with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
                 f.write(f"{result}\n")

     env.close()
     logger.info(f"Average score: {sum(scores) / len(scores)}")

-def get_unfinished(test, result_dir):
-    # todo @xiaochuan
-    pass
+def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
+    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
+
+    if not os.path.exists(target_dir):
+        return total_file_json
+
+    finished = {}
+    for domain in os.listdir(target_dir):
+        domain_path = os.path.join(target_dir, domain)
+        if os.path.isdir(domain_path):
+            finished[domain] = os.listdir(domain_path)
+
+    if not finished:
+        return total_file_json
+
+    for domain, examples in finished.items():
+        if domain in total_file_json:
+            total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples]
+
+    return total_file_json

 if __name__ == '__main__':
     ####### The complete version of the list of examples #######
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     args = config()

-    # test_file_list = get_unfinished(args.test, args.result_dir)
-    # logger.info(f"Total {len(test_file_list)} tasks left")
     with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f:
         test_all_meta = json.load(f)

+    test_file_list = get_unfinished(args.action_space, args.model, args.observation_type, args.result_dir, test_all_meta)
+    left_info = ""
+    for domain in test_file_list:
+        left_info += f"{domain}: {len(test_file_list[domain])}\n"
+    logger.info(f"Left tasks:\n{left_info}")

     test(args, test_all_meta)
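
get_unfinished makes runs resumable by listing the result tree, which this commit lays out as result_dir/action_space/observation_type/model/domain/example_id. A usage sketch against a throwaway directory (paths, IDs, and the "pyautogui"/"screenshot" values are made up; assumes the get_unfinished defined above):

import os

# Fake one finished example: results/pyautogui/screenshot/gpt-4-vision-preview/os/a1
os.makedirs("results/pyautogui/screenshot/gpt-4-vision-preview/os/a1", exist_ok=True)

total = {"os": ["a1", "b2"], "office": ["c3"]}
left = get_unfinished("pyautogui", "gpt-4-vision-preview", "screenshot", "results", total)
print(left)  # {'os': ['b2'], 'office': ['c3']}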