diff --git a/evaluation_examples/README.md b/evaluation_examples/README.md new file mode 100644 index 0000000..a5bf2db --- /dev/null +++ b/evaluation_examples/README.md @@ -0,0 +1,24 @@ +# Evaluation examples + +Here we put the data examples to benchmark the ability of agents when interacting with a GUI. +The examples are stored in `./examples` where each data item is formatted as: + +``` +{ + "id": "uid", # unique id + "snapshot": "snapshot_id", # the snapshot id of the environment, with some data already there and apps already opened, or just desktop + "instruction": "natural_language_instruction", # the natural language instruction of the task, what we want the agent to do + "source": "website_url", # where this example comes from: a forum, a website, or a paper + "config": {xxx}, # the scripts to set up the download and open-file actions, as the initial state of a task + "trajectory": "trajectory_directory", # the trajectory directory, which contains the action sequence file, the screenshots and the recording video + "related_apps": ["app1", "app2", ...], # the related apps, which are opened during the task + "evaluator": "evaluation_dir", # the directory of the evaluator, which contains the evaluation script for this example +… +} +``` + +The `./trajectories` directory contains the annotated trajectories for each data item in `./examples` for finishing the task. + +For now, it is under construction, and only tested on Windows 10. Please: +- Modify the path accordingly to run the evaluation; +- Let us know if any part is overfit to our environment. 
diff --git a/evaluation_examples/examples/0bf05a7d-b28b-44d2-955a-50b41e24012a.json b/evaluation_examples/examples/0bf05a7d-b28b-44d2-955a-50b41e24012a.json new file mode 100644 index 0000000..6f92ac5 --- /dev/null +++ b/evaluation_examples/examples/0bf05a7d-b28b-44d2-955a-50b41e24012a.json @@ -0,0 +1,22 @@ +{ + "id": "0bf05a7d-b28b-44d2-955a-50b41e24012a", + "snapshot": "libreoffice_calc", + "instruction": "I would like to pad all the numbers in the 'Old ID' column with zeros in front, to fill them up to seven digits in the 'New 7 Digit ID' column.", + "source": "https://www.youtube.com/shorts/FPAQaDTS8VY", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\Customers_New_7digit_Id.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\Customers_New_7digit_Id.xlsx" + ] + }, + "trajectory": "trajectories/0bf05a7d-b28b-44d2-955a-50b41e24012a", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/2bd59342-0664-4ccb-ba87-79379096cc08.json b/evaluation_examples/examples/2bd59342-0664-4ccb-ba87-79379096cc08.json new file mode 100644 index 0000000..98d773a --- /dev/null +++ b/evaluation_examples/examples/2bd59342-0664-4ccb-ba87-79379096cc08.json @@ -0,0 +1,22 @@ +{ + "id": "2bd59342-0664-4ccb-ba87-79379096cc08", + "snapshot": "libreoffice_calc", + "instruction": "Make sparkline chart line by line", + "source": "https://www.youtube.com/shorts/L3Z-F1QTQFY", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\OrderId_Month_Chart.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\OrderId_Month_Chart.xlsx" + ] + }, + "trajectory": "trajectories/2bd59342-0664-4ccb-ba87-79379096cc08", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/37608790-6147-45d0-9f20-1137bb35703d.json b/evaluation_examples/examples/37608790-6147-45d0-9f20-1137bb35703d.json new file mode 100644 index 
0000000..b1f9455 --- /dev/null +++ b/evaluation_examples/examples/37608790-6147-45d0-9f20-1137bb35703d.json @@ -0,0 +1,22 @@ +{ + "id": "37608790-6147-45d0-9f20-1137bb35703d", + "snapshot": "libreoffice_calc", + "instruction": "Help me fill the columns of First Name, Last Name and Rank", + "source": "https://www.youtube.com/shorts/uzPo_CPCHH8", + "config": { + "download": [ + [ + "https://drive.usercontent.google.com/download?id=1wDqap5cBfxnlqTNrZG61k_wDWTujl6AU&export=download&authuser=0&confirm=t&uuid=fd183b89-76b7-4dc5-880e-1045ed769562&at=APZUnTWp9RMafMg0xohhBWazN3YD:1701785710674", + "C:\\Users\\tianbaox\\Desktop\\Employee_Roles_and_Ranks.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\Employee_Roles_and_Ranks.xlsx" + ] + }, + "trajectory": "trajectories/37608790-6147-45d0-9f20-1137bb35703d", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/7a4e4bc8-922c-4c84-865c-25ba34136be1.json b/evaluation_examples/examples/7a4e4bc8-922c-4c84-865c-25ba34136be1.json new file mode 100644 index 0000000..dfe1da0 --- /dev/null +++ b/evaluation_examples/examples/7a4e4bc8-922c-4c84-865c-25ba34136be1.json @@ -0,0 +1,22 @@ +{ + "id": "7a4e4bc8-922c-4c84-865c-25ba34136be1", + "snapshot": "libreoffice_calc", + "instruction": "Reorder the columns to be \"Data\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"", + "source": "https://www.youtube.com/shorts/bvUhr1AHs44", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\Name_Order_Id_move_column.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\Name_Order_Id_move_column.xlsx" + ] + }, + "trajectory": "trajectories/7a4e4bc8-922c-4c84-865c-25ba34136be1", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/7b802dad-6e0f-4204-9815-d4e3f57627d8.json b/evaluation_examples/examples/7b802dad-6e0f-4204-9815-d4e3f57627d8.json new file mode 100644 index 
0000000..d26f5ce --- /dev/null +++ b/evaluation_examples/examples/7b802dad-6e0f-4204-9815-d4e3f57627d8.json @@ -0,0 +1,22 @@ +{ + "id": "7b802dad-6e0f-4204-9815-d4e3f57627d8", + "snapshot": "libreoffice_calc", + "instruction": "I would like to sort this table based on cell color, placing all the rows marked with pink at the beginning, while keeping their order among themselves unchanged.", + "source": "https://www.youtube.com/shorts/Of-lzeP1usE", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\Customer_Sort_by_cell_color.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\Customer_Sort_by_cell_color.xlsx" + ] + }, + "trajectory": "trajectories/7b802dad-6e0f-4204-9815-d4e3f57627d8", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/7efeb4b1-3d19-4762-b163-63328d66303b.json b/evaluation_examples/examples/7efeb4b1-3d19-4762-b163-63328d66303b.json new file mode 100644 index 0000000..dcb0eb6 --- /dev/null +++ b/evaluation_examples/examples/7efeb4b1-3d19-4762-b163-63328d66303b.json @@ -0,0 +1,22 @@ +{ + "id": "7efeb4b1-3d19-4762-b163-63328d66303b", + "snapshot": "libreoffice_calc", + "instruction": "Fill in the Serial Numbers in \"Serial #\" column", + "source": "https://www.youtube.com/shorts/4jzXfZNhfmk", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\Order_Sales_Serial#.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\Order_Sales_Serial#.xlsx" + ] + }, + "trajectory": "trajectories/", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/a9f325aa-8c05-4e4f-8341-9e4358565f4f.json b/evaluation_examples/examples/a9f325aa-8c05-4e4f-8341-9e4358565f4f.json new file mode 100644 index 0000000..057ed2f --- /dev/null +++ b/evaluation_examples/examples/a9f325aa-8c05-4e4f-8341-9e4358565f4f.json @@ -0,0 +1,22 @@ +{ + "id": "a9f325aa-8c05-4e4f-8341-9e4358565f4f", + 
"snapshot": "libreoffice_calc", + "instruction": "Clean the messy movie titles and put them in the cleaned column", + "source": "https://www.youtube.com/shorts/A0gmEBRKXWs", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\" + ] + }, + "trajectory": "trajectories/a9f325aa-8c05-4e4f-8341-9e4358565f4f", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/d681960f-7bc3-4286-9913-a8812ba3261a.json b/evaluation_examples/examples/d681960f-7bc3-4286-9913-a8812ba3261a.json new file mode 100644 index 0000000..52316e6 --- /dev/null +++ b/evaluation_examples/examples/d681960f-7bc3-4286-9913-a8812ba3261a.json @@ -0,0 +1,22 @@ +{ + "id": "d681960f-7bc3-4286-9913-a8812ba3261a", + "snapshot": "libreoffice_calc", + "instruction": "According to the green table shown above, calculate and give each student a grade", + "source": "https://www.youtube.com/shorts/d7U1S_IsTVM", + "config": { + "download": [ + [ + "https://drive.usercontent.google.com/download?id=1wodZjx1KjThUsrtF6ZJaCTy1fQX4E9vA&export=download&authuser=0&confirm=t&uuid=d07ca312-1abc-40f2-81cd-d06e27119854&at=APZUnTWwjnxsHQYapSvpLR8NmlfV:1701785087048", + "C:\\Users\\tianbaox\\Desktop\\Student_Grades_and_Remarks.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\Student_Grades_and_Remarks.xlsx" + ] + }, + "trajectory": "trajectories/d681960f-7bc3-4286-9913-a8812ba3261a", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/eb03d19a-b88d-4de4-8a64-ca0ac66f426b.json b/evaluation_examples/examples/eb03d19a-b88d-4de4-8a64-ca0ac66f426b.json new file mode 100644 index 0000000..02e0da2 --- /dev/null +++ b/evaluation_examples/examples/eb03d19a-b88d-4de4-8a64-ca0ac66f426b.json @@ -0,0 +1,22 @@ +{ + "id": "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + "snapshot": "libreoffice_calc", + "instruction": "Traverse 
the table and paste it below", + "source": "https://www.youtube.com/shorts/t9JLUaT55UQ", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\" + ] + }, + "trajectory": "trajectories/eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/ecb0df7a-4e8d-4a03-b162-053391d3afaf.json b/evaluation_examples/examples/ecb0df7a-4e8d-4a03-b162-053391d3afaf.json new file mode 100644 index 0000000..6c5142a --- /dev/null +++ b/evaluation_examples/examples/ecb0df7a-4e8d-4a03-b162-053391d3afaf.json @@ -0,0 +1,22 @@ +{ + "id": "ecb0df7a-4e8d-4a03-b162-053391d3afaf", + "snapshot": "libreoffice_calc", + "instruction": "Enable each cell in the column\"Pass/Fail/Held\" is a drop down list", + "source": "https://www.youtube.com/shorts/tXOovKn0H68", + "config": { + "download": [ + [ + "", + "C:\\Users\\tianbaox\\Desktop\\" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\" + ] + }, + "trajectory": "trajectories/ecb0df7a-4e8d-4a03-b162-053391d3afaf", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/f9584479-3d0d-4c79-affa-9ad7afdd8850.json b/evaluation_examples/examples/f9584479-3d0d-4c79-affa-9ad7afdd8850.json new file mode 100644 index 0000000..1bb0ff6 --- /dev/null +++ b/evaluation_examples/examples/f9584479-3d0d-4c79-affa-9ad7afdd8850.json @@ -0,0 +1,22 @@ +{ + "id": "f9584479-3d0d-4c79-affa-9ad7afdd8850", + "snapshot": "libreoffice_calc", + "instruction": "Fill the missing row and column which show the total value", + "source": "https://youtube.com/shorts/feldd-Pn48c?si=9xJiem2uAHm6Jshb", + "config": { + "download": [ + [ + 
"https://drive.usercontent.google.com/download?id=1rwhniaClEkF8XFzdfaNUA6GmAiy4syMZ&export=download&authuser=0&confirm=t&uuid=6fdd5b04-85f4-45e1-ad74-368f8f2a82ab&at=APZUnTUP-JxPxLfNls6jXWghblQ5:1701766091851", + "C:\\Users\\tianbaox\\Desktop\\Quarterly_Product_Sales_by_Zone.xlsx" + ] + ], + "open": [ + "C:\\Users\\tianbaox\\Desktop\\Quarterly_Product_Sales_by_Zone.xlsx" + ] + }, + "trajectory": "trajectories/f9584479-3d0d-4c79-affa-9ad7afdd8850", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/evaluation_examples/examples/template.json b/evaluation_examples/examples/template.json new file mode 100644 index 0000000..f8efe3b --- /dev/null +++ b/evaluation_examples/examples/template.json @@ -0,0 +1,13 @@ +{ + "id": "", + "snapshot": "libreoffice_calc", + "instruction": "", + "source": "", + "config": { + }, + "trajectory": "trajectories/", + "related_apps": [ + "libreoffice calc" + ], + "evaluator": "evaluation_dir" +} diff --git a/main.py b/main.py index fd21ac0..a52ad90 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,4 @@ +import json from desktop_env.envs.desktop_env import DesktopEnv @@ -5,17 +6,16 @@ def human_agent(): """ Runs the Gym environment with human input. 
""" + + with open("evaluation_examples/examples/37608790-6147-45d0-9f20-1137bb35703d.json", "r") as f: + example = json.load(f) + env = DesktopEnv( # path_to_vm=r"""C:\Users\tianbaox\Downloads\Windows 10 x64\Windows 10 x64.vmx""", path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""", # path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx", snapshot_path="base_setup", - config={ - "download": [( - "https://drive.usercontent.google.com/download?id=1rwhniaClEkF8XFzdfaNUA6GmAiy4syMZ&export=download&authuser=0&confirm=t&uuid=6fdd5b04-85f4-45e1-ad74-368f8f2a82ab&at=APZUnTUP-JxPxLfNls6jXWghblQ5:1701766091851", - r"C:\Users\tianbaox\Desktop\Quarterly_Product_Sales_by_Zone.xlsx")], - "open": [r"C:\Users\tianbaox\Desktop\Quarterly_Product_Sales_by_Zone.xlsx"], - } + config=example["config"], ) # reset the environment to certain snapshot