ver Feb 1st

human evaluation and SoM experiments on Thunderbird
This commit is contained in:
David Chang
2024-02-01 11:38:46 +08:00
parent a1e02c6d57
commit 5d436a6b66
5 changed files with 34 additions and 13 deletions

View File

@@ -1,7 +1,7 @@
{
"id": "06fe7178-4491-4589-810f-2e2bc9502122",
"snapshot": "thunderbird",
"instruction": "Could you help me back up all the email files in my profile to ~/emails.bak? Please save them separately in eml format.",
"instruction": "Could you help me back up all the email files in my inbox to ~/emails.bak? Please save them separately in eml format.",
"source": "https://www.quora.com/How-do-I-backup-email-files-in-Mozilla-Thunderbird",
"config": [
{

View File

@@ -1,7 +1,7 @@
{
"id": "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
"snapshot": "thunderbird",
"instruction": "Create two local folders for me: COMPANY and UNIVERSITY.",
"instruction": "Create two local folders in Thunderbird for me: COMPANY and UNIVERSITY.",
"source": "https://support.mozilla.org/bm/questions/1027435",
"config": [
{

View File

@@ -46,7 +46,7 @@ logger = logging.getLogger("desktopenv.experiment")
#PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
PATH_TO_VM = "../../../../大文件/镜像/Ubuntu-1218/Ubuntu/Ubuntu.vmx"
PATH_TO_VM = "/mnt/data1/david/os-images/Ubuntu-1218/Ubuntu.vmx"
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
@@ -198,6 +198,22 @@ if __name__ == '__main__':
, "aa3a8974-2e85-438b-b29e-a64df44deb4b"
, "a01fbce3-2793-461f-ab86-43680ccbae25"
, "4f07fbe9-70de-4927-a4d5-bb28bc12c52c"
# 42, ^ calc, v thunderbird
, "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"
, "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3"
, "12086550-11c0-466b-b367-1d9e75b3910e"
, "06fe7178-4491-4589-810f-2e2bc9502122"
, "6766f2b8-8a72-417f-a9e5-56fcaa735837"
, "e1e75309-3ddb-4d09-92ec-de869c928143"
, "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5"
, "35253b65-1c19-4304-8aa4-6884b8218fc0"
, "d088f539-cab4-4f9a-ac92-9999fc3a656e"
, "2ad9387a-65d8-4e33-ad5b-7580065a27ca"
, "480bcfea-d68f-4aaa-a0a9-2589ef319381"
, "030eeff7-b492-4218-b312-701ec99ee0cc"
, "94760984-3ff5-41ee-8347-cf1af709fea0"
, "99146c54-4f37-4ab8-9327-5f3291665e1e"
, "c9e7eaf2-b1a1-4efc-a982-721972fa9f02"
]
for example_id in xx_list[18:]:
main("libreoffice_calc", example_id)
for example_id in xx_list[42:]:
main("thunderbird", example_id)

View File

@@ -47,7 +47,7 @@ def human_agent():
Runs the Gym environment with human input.
"""
with open("evaluation_examples/examples/thunderbird/030eeff7-b492-4218-b312-701ec99ee0cc.json", "r") as f:
with open("evaluation_examples/examples/thunderbird/c9e7eaf2-b1a1-4efc-a982-721972fa9f02.json", "r") as f:
example = json.load(f)
example["snapshot"] = "Snapshot 30"

View File

@@ -460,14 +460,18 @@ class GPT4v_Agent:
with open("messages.json", "w") as f:
f.write(json.dumps(messages, indent=4))
response = self.call_llm({
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens
})
try:
response = self.call_llm({
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens
})
except:
response = ""
logger.debug("RESPONSE: %s", response)
# {{{
if self.exp == "seeact":
messages.append({
"role": "assistant",
@@ -503,7 +507,7 @@ class GPT4v_Agent:
except Exception as e:
print("Failed to parse action from response", e)
actions = None
self.thoughts.append("")
self.thoughts.append("") # }}}
return actions
@@ -516,7 +520,8 @@ class GPT4v_Agent:
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload
json=payload,
timeout=20
)
if response.status_code != 200: