Merge remote-tracking branch 'origin/main'
# Conflicts:
#	mm_agents/agent.py
#	run.py
.vscode/launch.json  (vendored, Normal file, 19 lines)
@@ -0,0 +1,19 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File with Arguments",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "args": [
+                "--path_to_vm", "/Users/lxc/Virtual Machines.localized/DesktopEnv-Ubuntu 64-bit Arm.vmwarevm/DesktopEnv-Ubuntu 64-bit Arm.vmx",
+                "--example_time_limit", "60"
+            ]
+        }
+    ]
+}
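For cross-reference: the two arguments baked into this debug configuration, --path_to_vm and --example_time_limit, line up with run.py's argument parser; the run.py hunks below are where --example_time_limit is added to it.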
demo.py  (Normal file, 16 lines)
@@ -0,0 +1,16 @@
+import signal
+import time
+
+def handler(signo, frame):
+    raise RuntimeError("Timeout")
+
+signal.signal(signal.SIGALRM, handler)
+
+while True:
+    try:
+        signal.alarm(5)  # seconds
+        time.sleep(10)
+        print("Working...")
+    except Exception as e:
+        print(e)
+        continue
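As written, this loop can never print "Working...": the 5-second alarm always fires before the 10-second sleep completes, so every iteration raises in the handler and prints "Timeout". That is exactly the timeout behavior this scratch script exists to check before the same SIGALRM pattern is wired into run.py below.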
@@ -174,7 +174,7 @@ class DesktopEnv(gym.Env):
             if isinstance(self.evaluator["func"], list) \
             else getattr(metrics, self.evaluator["func"])
         self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics
-        if "result" in self.evaluator:
+        if "result" in self.evaluator and len(self.evaluator["result"])>0:
             self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in
                                           self.evaluator["result"]] \
                 if isinstance(self.evaluator["result"], list) \
@@ -184,7 +184,7 @@ class DesktopEnv(gym.Env):
                 if isinstance(self.metric, list) \
                 else None

-        if "expected" in self.evaluator:
+        if "expected" in self.evaluator and len(self.evaluator["expected"])>0:
             self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in
                                             self.evaluator["expected"]] \
                 if isinstance(self.evaluator["expected"], list) \
desktop_env/server/osbench_server.service  (Normal file, 16 lines)
@@ -0,0 +1,16 @@
+[Unit]
+Description=OSBench Server
+StartLimitIntervalSec=60
+StartLimitBurst=4
+After=network.target auditd.service
+
+[Service]
+ExecStart=/usr/bin/python3 /home/user/main.py
+User=user
+WorkingDirectory=/home/user
+Restart=on-failure
+RestartSec=1
+Environment="DISPLAY=:1"
+
+[Install]
+WantedBy=graphical.target
desktop_env/server/osbench_server@.service  (Normal file, 16 lines)
@@ -0,0 +1,16 @@
+[Unit]
+Description=OSBench Server
+StartLimitIntervalSec=60
+StartLimitBurst=4
+After=network.target auditd.service
+
+[Service]
+ExecStart=/usr/bin/python3 /home/user/main.py
+User=user
+WorkingDirectory=/home/user
+Restart=on-failure
+RestartSec=1
+Environment="DISPLAY=%i"
+
+[Install]
+WantedBy=graphical.target
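A deployment note (an assumption about intended use, not stated in the commit): the @-template variant exists so the unit can be instantiated per display, e.g. systemctl enable osbench_server@:1.service, which makes %i expand to :1 in Environment="DISPLAY=%i"; the non-template unit pins DISPLAY=:1 instead.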
@@ -10,10 +10,6 @@
         "libreoffice_calc"
     ],
     "evaluator": {
-        "func": "infeasible",
-        "expected": {
-        },
-        "result": {
-        }
+        "func": "infeasible"
     }
 }
@@ -10,10 +10,6 @@
         "libreoffice_calc"
     ],
     "evaluator": {
-        "func": "infeasible",
-        "expected": {
-        },
-        "result": {
-        }
+        "func": "infeasible"
     }
 }
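These hunks drop the empty "expected"/"result" placeholders from infeasible example configs, and the len(...)>0 guards added to DesktopEnv above are what makes that safe: an empty dict is now treated the same as an absent key. A toy illustration (hypothetical dicts, not repo data):

    evaluator = {"func": "infeasible", "expected": {}, "result": {}}
    print("result" in evaluator)                                   # True: the old check would build getters on nothing
    print("result" in evaluator and len(evaluator["result"]) > 0)  # False: the new check skips empty placeholders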
evaluation_examples/examples/multi_apps/demo.py  (Normal file, 19 lines)
@@ -0,0 +1,19 @@
+import pandas as pd
+
+file_path = "/Users/lxc/Downloads/Speedtest.csv"
+# Find the value in the second data cell of the second row of the CSV
+# with open(file_path, "r") as f:
+#     for i, line in enumerate(f):
+#         if i == 1:
+#             data = line.split(",")[1]
+#             break
+# print(data)
+
+with open(file_path, "r") as f:
+    reader = pd.read_csv(f, sep=',')  # header row supplies 'TEST_DATE'; header=None would break the lookup below
+    # for column in reader.columns:
+    #     if column.startswith("TEST_DATE"):
+    #         data_col = column
+    #         break
+    for data in reader['TEST_DATE']:
+        print(data)
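The commented-out loop above targets the goal in the original comment, the second cell of the second row; with pandas that positional lookup is a one-liner (a sketch assuming the same file layout):

    import pandas as pd

    cell = pd.read_csv("/Users/lxc/Downloads/Speedtest.csv", header=None).iloc[1, 1]
    print(cell)  # row index 1, column index 1: the second field of the second physical line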
@@ -55,12 +55,12 @@ def judge_node(node: ET, platform="ubuntu") -> bool:
              or platform=="windows"\
                 and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
               )\
-          and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
-             or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
-             or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
-             or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
-              )\
-          and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)
+          and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
+             or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
+             or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
+             or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
+              )\
+          and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)

     coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
     sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
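The condition is dense, so here is a rough restatement of what the accessibility-node filter checks (an illustrative paraphrase, not the repo's code; state_ns as in the original):

    def keeps_node(node, state_ns, platform="ubuntu"):
        # A node is kept if it is visible (or we are on windows), is
        # interactive in some way, and carries a name or text content.
        def flag(name):
            return node.get("{{{:}}}{:}".format(state_ns, name), "false") == "true"

        visible_enough = platform == "windows" or flag("visible")
        interactive = flag("enabled") or flag("editable") or flag("expandable") or flag("checkable")
        has_text = node.get("name", "") != "" or (node.text is not None and len(node.text) > 0)
        return visible_enough and interactive and has_text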
@@ -5,11 +5,13 @@ import os
 import re
 import time
 import uuid
+import openai
 import xml.etree.ElementTree as ET
 from http import HTTPStatus
 from io import BytesIO
 from typing import Dict, List

+from google.api_core.exceptions import InvalidArgument
 import backoff
 import dashscope
 import google.generativeai as genai
 import requests
@@ -22,6 +24,8 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S
     SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
     SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT

+# todo: cross-check with visualwebarena
+
 logger = logging.getLogger("desktopenv.agent")

@@ -506,18 +510,25 @@ class PromptAgent:
         try:
             actions = self.parse_actions(response, masks)
             self.thoughts.append(response)
-        except Exception as e:
+        except ValueError as e:
             print("Failed to parse action from response", e)
             actions = None
             self.thoughts.append("")

         return actions

+    # @backoff.on_exception(
+    #     backoff.expo,
+    #     (Exception),
+    #     max_tries=5
+    # )
+    @backoff.on_exception(
+        backoff.expo,
+        # here you should add more model exceptions as you want,
+        # but you are forbidden to add "Exception", that is, a common type of exception
+        # because we want to catch this kind of Exception in the outside to ensure each example won't exceed the time limit
+        (openai.RateLimitError,
+         openai.BadRequestError,
+         openai.InternalServerError,
+         InvalidArgument),
+        max_tries=5
+    )
     def call_llm(self, payload):

         if self.model.startswith("gpt"):
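The constraint spelled out in those decorator comments matters because of run.py below: the per-example time limit is enforced by a SIGALRM handler that raises RuntimeError, and a backoff on bare Exception would retry, and thereby swallow, that timeout. A minimal sketch of the division of labor (TransientAPIError is a hypothetical stand-in for the listed API errors):

    import backoff

    class TransientAPIError(Exception):
        """Stand-in for openai.RateLimitError and friends (hypothetical)."""

    @backoff.on_exception(backoff.expo, TransientAPIError, max_tries=5)
    def call_llm():
        ...
        # Retried only on TransientAPIError; the RuntimeError raised by the
        # SIGALRM handler propagates straight to run.py's per-example handler.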
@@ -525,7 +536,7 @@ class PromptAgent:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
             }
-            logger.info("Generating content with GPT model: %s", self.model)
+            # logger.info("Generating content with GPT model: %s", self.model)
             response = requests.post(
                 "https://api.openai.com/v1/chat/completions",
                 headers=headers,
run.py  (155 lines changed)
@@ -7,6 +7,7 @@ import json
 import logging
 import os
 import sys
+import signal

 from desktop_env.envs.desktop_env import DesktopEnv
 from mm_agents.agent import PromptAgent
@@ -45,6 +46,10 @@ logger.addHandler(sdebug_handler)

 logger = logging.getLogger("desktopenv.experiment")

+# make sure each example won't exceed the time limit
+def handler(signo, frame):
+    raise RuntimeError("Time limit exceeded!")
+signal.signal(signal.SIGALRM, handler)

 def config() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
@@ -77,6 +82,7 @@ def config() -> argparse.Namespace:
     # agent config
     parser.add_argument("--max_trajectory_length", type=int, default=3)
     parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples")
+    parser.add_argument("--example_time_limit", type=int, default=600)

     # lm config
     parser.add_argument("--model", type=str, default="gpt-4-vision-preview")
@@ -98,6 +104,7 @@ def test(
 ) -> None:
     scores = []
     max_steps = args.max_steps
+    time_limit = args.example_time_limit

     # log args
     logger.info("Args: %s", args)
@@ -119,6 +126,7 @@ def test(

     for domain in test_all_meta:
         for example_id in test_all_meta[domain]:
+            # example setting
             config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json")
             with open(config_file, "r", encoding="utf-8") as f:
                 example = json.load(f)
@@ -140,82 +148,115 @@ def test(
         )
         os.makedirs(example_result_dir, exist_ok=True)

-        agent.reset()
-        obs = env.reset(task_config=example)
-        done = False
-        step_idx = 0
-        env.controller.start_recording()
-
-        # todo: update max running time for each example, @xiaochuan
-        while not done and step_idx < max_steps:
-            actions = agent.predict(
-                instruction,
-                obs
-            )
-            for action in actions:
-                # Capture the timestamp before executing the action
-                action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-                logger.info("Step %d: %s", step_idx + 1, action)
-
-                obs, reward, done, info = env.step(action, args.sleep_after_execution)
-
-                logger.info("Reward: %.2f", reward)
-                logger.info("Done: %s", done)
-                logger.info("Info: %s", info)
-
-                # Save screenshot and trajectory information
-                with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
-                          "wb") as _f:
-                    with open(obs['screenshot'], "rb") as __f:
-                        screenshot = __f.read()
-                    _f.write(screenshot)
-
-                with open(os.path.join(example_result_dir, "traj.json"), "a") as f:
-                    f.write(json.dumps({
-                        "step_num": step_idx + 1,
-                        "action_timestamp": action_timestamp,
-                        "action": action,
-                        "reward": reward,
-                        "done": done,
-                        "info": info,
-                        "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
-                    }))
-                    f.write("\n")
-
-                if done:
-                    logger.info("The episode is done.")
-                    break
-            step_idx += 1
-
-        result = env.evaluate()
-        logger.info("Result: %.2f", result)
-        scores.append(result)
-        env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+        # example start running
+        try:
+            signal.alarm(time_limit)
+            agent.reset()
+            obs = env.reset(task_config=example)
+            done = False
+            step_idx = 0
+            env.controller.start_recording()
+            while not done and step_idx < max_steps:
+                actions = agent.predict(
+                    instruction,
+                    obs
+                )
+                for action in actions:
+                    # Capture the timestamp before executing the action
+                    action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+                    logger.info("Step %d: %s", step_idx + 1, action)
+
+                    obs, reward, done, info = env.step(action, args.sleep_after_execution)
+
+                    logger.info("Reward: %.2f", reward)
+                    logger.info("Done: %s", done)
+                    logger.info("Info: %s", info)
+
+                    # Save screenshot and trajectory information
+                    with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
+                              "wb") as _f:
+                        with open(obs['screenshot'], "rb") as __f:
+                            screenshot = __f.read()
+                        _f.write(screenshot)
+
+                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                        f.write(json.dumps({
+                            "step_num": step_idx + 1,
+                            "action_timestamp": action_timestamp,
+                            "action": action,
+                            "reward": reward,
+                            "done": done,
+                            "info": info,
+                            "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
+                        }))
+                        f.write("\n")
+
+                    if done:
+                        logger.info("The episode is done.")
+                        break
+                step_idx += 1
+
+            try:
+                result = env.evaluate()
+            except Exception as e:
+                logger.error(f"Error in evaluating the example {example_id}: {e}")
+                result = 0.0
+            logger.info("Result: %.2f", result)
+            env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+            scores.append(result)
+            with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
+                f.write(f"{result}\n")
+        except RuntimeError as e:
+            logger.error(f"Error in example {domain}/{example_id}: {e}")
+            # save info of this example and then continue
+            try:
+                env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+                with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                    f.write(json.dumps({
+                        "Error": f"Error in example {domain}/{example_id}: {e}",
+                        "step": step_idx + 1,
+                    }))
+                    f.write("\n")
+            except Exception as new_e:
+                with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                    f.write(json.dumps({
+                        "Error": f"Error in example {domain}/{example_id}: {e} and {new_e}",
+                        "step": "before start recording",
+                    }))
+                    f.write("\n")
+            continue

     env.close()
     logger.info(f"Average score: {sum(scores) / len(scores)}")

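One caveat in the success path above: signal.alarm(0) is never called, so an alarm armed for a fast example stays pending while evaluation and recording finish; re-arming at the top of the next iteration papers over this in practice. A common hardening of the same pattern (a sketch, not this commit's code) disarms in a finally:

    import signal
    from contextlib import contextmanager

    @contextmanager
    def example_deadline(seconds):
        # Assumes a SIGALRM handler that raises is installed, as run.py does.
        signal.alarm(seconds)   # arm the timer
        try:
            yield
        finally:
            signal.alarm(0)     # disarm so a stale alarm cannot hit the next example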
-def get_unfinished(test, result_dir):
-    # todo @xiaochuan
-    pass
+def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
+    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
+
+    if not os.path.exists(target_dir):
+        return total_file_json
+
+    finished = {}
+    for domain in os.listdir(target_dir):
+        domain_path = os.path.join(target_dir, domain)
+        if os.path.isdir(domain_path):
+            finished[domain] = os.listdir(domain_path)
+
+    if not finished:
+        return total_file_json
+
+    for domain, examples in finished.items():
+        if domain in total_file_json:
+            total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples]
+
+    return total_file_json

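To make the resume logic concrete: total_file_json maps each domain to its example ids (as loaded from test_all.json below), and finished ids are inferred from the result directories. A toy walkthrough with made-up ids:

    total = {"libreoffice_calc": ["a1", "b2"], "multi_apps": ["c3"]}
    # With results/<action_space>/<observation_type>/<model>/libreoffice_calc/a1/
    # already on disk, get_unfinished(...) returns:
    # {"libreoffice_calc": ["b2"], "multi_apps": ["c3"]}

Note that get_unfinished filters total_file_json in place, which is why the __main__ block below can still pass test_all_meta to test() afterwards and skip finished examples.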
 if __name__ == '__main__':
     ####### The complete version of the list of examples #######
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     args = config()

     # test_file_list = get_unfinished(args.test, args.result_dir)
     # logger.info(f"Total {len(test_file_list)} tasks left")

     with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f:
         test_all_meta = json.load(f)

+    test_file_list = get_unfinished(args.action_space, args.model, args.observation_type, args.result_dir, test_all_meta)
+    left_info = ""
+    for domain in test_file_list:
+        left_info += f"{domain}: {len(test_file_list[domain])}\n"
+    logger.info(f"Left tasks:\n{left_info}")

     test(args, test_all_meta)