diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task01.json b/evaluation_examples/examples/chrome_windows/chrome_win_task01.json new file mode 100644 index 0000000..3b0c5fa --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task01.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task01", + "snapshot": "snapshot", + "instruction": "Can you make a new folder for me on the bookmarks bar in Chrome? Let's call it 'Favorites.'", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 在书签栏空白处右键,选择'Add folder'(或'添加文件夹')。\n3. 在弹出的对话框中将文件夹名称设为'Favorites',点击保存。\n4. 
确认书签栏上出现了名为'Favorites'的文件夹。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task02.json b/evaluation_examples/examples/chrome_windows/chrome_win_task02.json new file mode 100644 index 0000000..50a9a56 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task02.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task02", + "snapshot": "snapshot", + "instruction": "Can you save this webpage I'm looking at to the bookmarks bar so I can come back to it later?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://jalammar.github.io/illustrated-transformer/" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到目标页面。\n2. 按 Ctrl+D 打开书签保存对话框,或点击地址栏右侧的星形图标。\n3. 在'Folder'下拉菜单中选择'Bookmarks bar'(书签栏)。\n4. 点击'Done'保存。\n5. 确认书签栏上出现了该页面的书签。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task03.json b/evaluation_examples/examples/chrome_windows/chrome_win_task03.json new file mode 100644 index 0000000..3e03dfa --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task03.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task03", + "snapshot": "snapshot", + "instruction": "My grandmother has been using Chrome and told me the font size is way too small. 
Could you set the default font size to the largest?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 在左侧找到'Appearance'(外观),点击进入。\n4. 找到'Font size'(字体大小),将其设置为'Very large'(最大)。\n5. 确认字体大小选项显示为'Very large'。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task04.json b/evaluation_examples/examples/chrome_windows/chrome_win_task04.json new file mode 100644 index 0000000..c8ff886 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task04.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task04", + "snapshot": "snapshot", + "instruction": "Can you make Bing the main search engine when I look stuff up in Chrome?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 在左侧点击'Search engine'(搜索引擎)。\n4. 
在'Search engine used in the address bar'下拉菜单中选择'Bing'。\n5. 确认搜索引擎已改为Bing。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task05.json b/evaluation_examples/examples/chrome_windows/chrome_win_task05.json new file mode 100644 index 0000000..5d47154 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task05.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task05", + "snapshot": "snapshot", + "instruction": "I want Chrome to enable the 'Do Not Track' feature to enhance my online privacy. Can you turn that on?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 在左侧点击'Privacy and security'(隐私和安全)。\n4. 找到'Send a Do Not Track request with your browsing traffic',将其开关打开(变为蓝色)。\n5. 
确认该开关处于开启状态。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task06.json b/evaluation_examples/examples/chrome_windows/chrome_win_task06.json new file mode 100644 index 0000000..2946c74 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task06.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task06", + "snapshot": "snapshot", + "instruction": "Please help me set Chrome to delete my browsing data automatically every time I close the browser.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 点击'Privacy and security' → 'Cookies and other site data'。\n4. 找到'Clear cookies and site data when you close all windows',将其开关打开。\n5. 确认该开关处于开启状态。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task07.json b/evaluation_examples/examples/chrome_windows/chrome_win_task07.json new file mode 100644 index 0000000..1532ab6 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task07.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task07", + "snapshot": "snapshot", + "instruction": "I want to update my Chrome profile name to Thomas. 
Could you help me change the username in Chrome profiles?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角的个人头像图标(Profile图标)。\n3. 在弹出菜单中点击铅笔编辑图标或'Manage profiles'。\n4. 在名称输入框中将当前名称修改为'Thomas'。\n5. 点击完成/保存,确认界面上显示用户名为'Thomas'。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task08.json b/evaluation_examples/examples/chrome_windows/chrome_win_task08.json new file mode 100644 index 0000000..b1d955a --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task08.json @@ -0,0 +1,68 @@ +{ + "id": "chrome_win_task08", + "snapshot": "snapshot", + "instruction": "Can you make my computer bring back the last tab I shut down in Chrome?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.lonelyplanet.com" + ], + "shell": false + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "--new-tab", + "https://www.airbnb.com" + ], + "shell": false + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "--new-tab", + "https://www.tripadvisor.com" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": 
"trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并加载了多个标签页。\n2. 关闭最右侧的标签页(点击该标签页上的X,或按Ctrl+W)。\n3. 恢复刚刚关闭的标签页:按 Ctrl+Shift+T,或右键标签栏选择'Reopen closed tab'。\n4. 确认被关闭的标签页重新出现在标签栏中。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task09.json b/evaluation_examples/examples/chrome_windows/chrome_win_task09.json new file mode 100644 index 0000000..a3954c2 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task09.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task09", + "snapshot": "snapshot", + "instruction": "Computer, please navigate to the area in Chrome settings where my passwords are stored.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 在地址栏输入 chrome://password-manager/passwords 并按回车,或通过 Settings → Autofill and passwords → Google Password Manager 找到密码管理页面。\n3. 
确认页面显示的是Chrome密码管理器界面。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task10.json b/evaluation_examples/examples/chrome_windows/chrome_win_task10.json new file mode 100644 index 0000000..4daa6a0 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task10.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task10", + "snapshot": "snapshot", + "instruction": "Please help me find the score record for the 2019 Super Bowl on the NFL website.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.nfl.com/" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到NFL官网。\n2. 在页面导航中找到Scores或Schedule入口。\n3. 找到2019赛季的Super Bowl比赛(Super Bowl LIII,2019年2月3日)。\n4. 
确认页面上显示了该场比赛的比分信息。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task11.json b/evaluation_examples/examples/chrome_windows/chrome_win_task11.json new file mode 100644 index 0000000..e8cc13c --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task11.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task11", + "snapshot": "snapshot", + "instruction": "Open the baggage fee calculator on the United Airlines website.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.united.com/en/us" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到United Airlines官网。\n2. 在页面导航中找到Travel info或Baggage部分。\n3. 找到并打开行李费计算器(Baggage Calculator)页面。\n4. 
确认页面显示的是行李费计算器界面。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task12.json b/evaluation_examples/examples/chrome_windows/chrome_win_task12.json new file mode 100644 index 0000000..1e09037 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task12.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task12", + "snapshot": "snapshot", + "instruction": "Show me the side effects of Tamiflu on the Drugs.com website.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.drugs.com/tamiflu.html" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到Drugs.com的Tamiflu页面。\n2. 在页面上找到'Side Effects'(副作用)部分并点击查看。\n3. 
确认页面上显示了Tamiflu的副作用信息列表。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/ovito/remote_file_access_task1.json b/evaluation_examples/examples/ovito/remote_file_access_task1.json deleted file mode 100644 index a910634..0000000 --- a/evaluation_examples/examples/ovito/remote_file_access_task1.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "id": "remote_file_access_task1", - "snapshot": "ovito", - "instruction": "在 OVITO 中,通过 File → Load Remote File 打开远程 SSH 文件 sftp://user@hostname/path/file", - "source": "custom", - "config": [ - { - "type": "launch", - "parameters": { - "command": [ - "C:\\OVITO Basic\\ovito.exe" - ] - } - }, - { - "type": "sleep", - "parameters": { - "seconds": 5 - } - } - ], - "trajectory": "trajectories/", - "related_apps": [ - "ovito" - ], - "evaluator": { - "postconfig": [ - { - "type": "sleep", - "parameters": { - "seconds": 3 - } - } - ], - "func": "vllm_eval" - }, - "proxy": false, - "fixed_ip": false, - "possibility_of_env_change": "low", - "metadata": { - "input_files": [], - "steps": "1. 启动 OVITO 软件并等待进入主界面。\n2. 单击主界面顶部菜单栏中的 \"File\" 菜单将其展开。\n3. 在展开的下拉菜单中,单击选择 \"Load Remote File\" 菜单项,此时会弹出 \"Load Remote File\" 对话框。\n4. 在 \"Load Remote File\" 对话框中,单击 \"Remote URL:\" 标签下方的组合输入框将光标定位至此。\n5. 在该输入框中,清空原有内容,并输入远程文件地址:sftp://user@hostname/path/file。\n6. 单击 \"File type:\" 标签下方的下拉菜单控件将其展开。\n7. 在展开的下拉列表中,单击选择 \"\" 选项。\n8. 在 \"SSH connection method:\" 区域,单击 \"Integrated client (default)\" 左侧的单选按钮以选中该选项。\n9. 单击对话框右下角蓝色的 \"Open\" 按钮以建立连接并加载文件。", - "steps_original": "1. 打开 OVITO 软件。\n2. 点击菜单 File → Load Remote File。\n3. 在弹出的对话框中填写 Remote URL 字段,例如:sftp://user@hostname/path/file。\n4. 在 File type 下选择 Auto-detect file format。\n5. 在 SSH connection method 下选择 Integrated client (default)。\n6. 
点击 Open 完成连接并加载文件。" - } -} \ No newline at end of file diff --git a/evaluation_examples/examples/ovito/rendering_task1.json b/evaluation_examples/examples/ovito/rendering_task1.json deleted file mode 100644 index 8534506..0000000 --- a/evaluation_examples/examples/ovito/rendering_task1.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "id": "rendering_task1", - "snapshot": "ovito", - "instruction": "在 OVITO 中,通过 Render Settings 面板渲染主动观察窗口为分辨率 1024x768 的图像,背景为透明色。", - "source": "custom", - "config": [ - { - "type": "launch", - "parameters": { - "command": [ - "C:\\OVITO Basic\\ovito.exe" - ] - } - }, - { - "type": "sleep", - "parameters": { - "seconds": 5 - } - } - ], - "trajectory": "trajectories/", - "related_apps": [ - "ovito" - ], - "evaluator": { - "postconfig": [ - { - "type": "sleep", - "parameters": { - "seconds": 3 - } - } - ], - "func": "vllm_eval" - }, - "proxy": false, - "fixed_ip": false, - "possibility_of_env_change": "low", - "metadata": { - "input_files": [], - "steps": "1. 双击桌面或系统菜单中的 OVITO 软件快捷方式打开软件。\n2. 在软件主界面的观察窗口区域,单击鼠标左键以激活目标视口(确保视口边缘出现黄色边框)。\n3. 在界面右侧命令面板顶部的工具栏中,单击带有照相机形状的 \"Render\" 图标选项卡,打开 \"Render settings\" 面板。\n4. 在 \"Render settings\" 面板的 \"Rendering range\" 区域,单击选中 \"Single frame\" 单选按钮。\n5. 在 \"Output image size\" 区域,将光标定位到 \"Width:\" 对应的数值输入框中,清空原有内容,输入 \"1024\"。\n6. 在 \"Output image size\" 区域,将光标定位到 \"Height:\" 对应的数值输入框中,清空原有内容,输入 \"768\"。\n7. 在 \"Background\" 区域,单击选中 \"Transparent\" 单选按钮。\n8. 在 \"Render settings\" 面板的底部,单击带有照相机图标的 \"Render active viewport\" 按钮开始渲染。", - "steps_original": "1. 打开 OVITO 软件。\n2. 确保观察窗口激活(黄色边框)。\n3. 点击右侧命令面板上的 Render 图标。\n4. 在弹出的 Render Settings 面板中,选择 'Single frame'。\n5. 设置输出图像大小为 Width: 1024 和 Height: 768。\n6. 选择背景为 'Transparent'。\n7. 
点击 'Render active viewport' 按钮完成渲染。" - } -} \ No newline at end of file diff --git a/evaluation_examples/test_final.json b/evaluation_examples/test_final.json index a46ac11..368ab58 100644 --- a/evaluation_examples/test_final.json +++ b/evaluation_examples/test_final.json @@ -46,5 +46,28 @@ "naming-a-molecule_task1", "using-qtaim-and-wfn_task2", "viewing-electrostatic-potential_task1" + ], + "ovito": [ + "animation_task3", + "aspherical_particles_task1", + "clone_pipeline_task1", + "customize_init_state_task1", + "data_model_task1", + "export_task1", + "marker_particles_task2", + "miscellaneous_task1", + "python_extensions_task1", + "transparent_particles_task1", + "viewports_task1", + "viewports_task2", + "viewports_task3", + "viewports_task4", + "viewports_task5", + "viewports_task6", + "viewports_task7", + "viewports_task8", + "viewports_task9", + "viewports_task10", + "viewports_task11" ] -} +} \ No newline at end of file diff --git a/lib_run_single.py b/lib_run_single.py index 78051b8..277cc04 100644 --- a/lib_run_single.py +++ b/lib_run_single.py @@ -67,7 +67,8 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl "reward": reward, "done": done, "info": info, - "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png" + "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png", + "a11y_tree": obs.get("accessibility_tree") if isinstance(obs, dict) else None })) f.write("\n") if done: diff --git a/mm_agents/agent.py b/mm_agents/agent.py index 1c27125..b57cbce 100644 --- a/mm_agents/agent.py +++ b/mm_agents/agent.py @@ -722,6 +722,9 @@ class PromptAgent: "Content-Type": "application/json", "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}" } + # Newer GPT models (gpt-5.x, o-series) require max_completion_tokens instead of max_tokens + if "max_tokens" in payload: + payload["max_completion_tokens"] = payload.pop("max_tokens") logger.info("Generating content with GPT model: %s", self.model) response = 
requests.post( api_url, diff --git a/run_proxmox.sh b/run_proxmox.sh index 53fd558..96b9f17 100755 --- a/run_proxmox.sh +++ b/run_proxmox.sh @@ -12,7 +12,8 @@ export PROXMOX_VM_IP="10.10.17.10" # ---------- LLM API 配置 ---------- # OpenAI 兼容代理(同时用于 Agent 模型和 Eval 模型) -export OPENAI_API_KEY="sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17" # ⚠️ 请替换为你的实际 API Key +# Read the key from the caller's environment so no secret is committed; abort if it is unset. +export OPENAI_API_KEY="${OPENAI_API_KEY:?OPENAI_API_KEY is not set; export it before running this script}" export OPENAI_BASE_URL="https://vip.apiyi.com/v1" # ---------- 评测参数(按需修改) ---------- @@ -30,9 +31,9 @@ OBSERVATION_TYPE="screenshot" # 观测类型 ACTION_SPACE="pyautogui" # 动作空间 SCREEN_WIDTH=1920 # 屏幕宽度 SCREEN_HEIGHT=1080 # 屏幕高度 -RESULT_DIR="/mnt/d/work/result" # 结果输出目录 -TEST_META="/mnt/d/work/sci-gui-agent-benchmark/evaluation_examples/test_final.json" # 评测任务列表 -DOMAIN="origin" # 评测领域 +RESULT_DIR="/Volumes/Castor/课题/results_baseline_50steps" # results output directory +TEST_META="evaluation_examples/test_final.json" # evaluation task list +DOMAIN="ovito" # evaluation domain SNAPSHOT_NAME="snapshot" # 快照名称(需提前创建) INJECT_STEPS=false # 是否注入教程步骤到 Agent prompt(baseline 不注入) diff --git a/run_proxmox_chrome.sh b/run_proxmox_chrome.sh new file mode 100644 index 0000000..407e45b --- /dev/null +++ b/run_proxmox_chrome.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# ============================================================================= +# Evaluation script for the Chrome-on-Windows general-software control group +# Purpose: run Chrome tasks on the Windows VM as a general-software control group for the scientific-software tasks +# ============================================================================= + +# ---------- Proxmox configuration ---------- +export PROXMOX_SSH_HOST="root@10.10.17.3" +export PROXMOX_VM_IP="10.10.17.10" + +# ---------- LLM API configuration ---------- +# Read the key from the caller's environment so no secret is committed; abort if it is unset. +export OPENAI_API_KEY="${OPENAI_API_KEY:?OPENAI_API_KEY is not set; export it before running this script}" +export OPENAI_BASE_URL="https://vip.apiyi.com/v1" + +# ---------- Evaluation parameters ---------- +PROVIDER="proxmox" +VM_ID="102" +MODEL="gpt-5.4"
+EVAL_MODEL="gemini-3.1-pro-preview" +MAX_STEPS=35 +SLEEP_AFTER_EXEC=3 +TEMPERATURE=0.5 +TOP_P=0.9 +MAX_TOKENS=16384 +MAX_TRAJECTORY_LENGTH=3 +ACTION_SPACE="pyautogui" +SCREEN_WIDTH=1920 +SCREEN_HEIGHT=1080 +RESULT_DIR="/Volumes/Castor/课题/results_baseline_50steps" +TEST_META="evaluation_examples/test_chrome.json" +DOMAIN="chrome_windows" +SNAPSHOT_NAME="snapshot" +INJECT_STEPS=false + +# ---------- Two observation modes; switch as needed ---------- +# screenshot only: +#OBSERVATION_TYPE="screenshot" +# screenshot + a11y tree (use the line below for the second round): +OBSERVATION_TYPE="screenshot_a11y_tree" + +# ---------- Pre-flight checks ---------- +echo "=== 预检查 ===" + +echo -n "SSH 到 Proxmox... " +if ssh -o BatchMode=yes -o ConnectTimeout=5 ${PROXMOX_SSH_HOST} "echo ok" 2>/dev/null | grep -q "ok"; then + echo "✅ 连接成功" +else + echo "❌ SSH 连接失败" + exit 1 +fi + +echo -n "VM ${VM_ID} 状态... " +VM_STATUS=$(ssh -o BatchMode=yes ${PROXMOX_SSH_HOST} "qm status ${VM_ID}" 2>/dev/null) +echo "${VM_STATUS}" + +echo -n "Flask Server (${PROXMOX_VM_IP}:5000)... " +if curl -s --connect-timeout 5 "http://${PROXMOX_VM_IP}:5000/screenshot" -o /dev/null -w "%{http_code}" | grep -q "200"; then + echo "✅ 可访问" +else + echo "⚠️ 不可访问(评测启动时会自动启动 VM)" +fi +
+echo -n "快照 '${SNAPSHOT_NAME}'... " +SNAPSHOTS=$(ssh -o BatchMode=yes ${PROXMOX_SSH_HOST} "qm listsnapshot ${VM_ID}" 2>/dev/null) +if echo "${SNAPSHOTS}" | grep -q "${SNAPSHOT_NAME}"; then + echo "✅ 已存在" +else + echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'" +fi + +echo "" +echo "=== 开始评测 ===" +echo "Provider: ${PROVIDER}" +echo "VM ID: ${VM_ID}" +echo "VM IP: ${PROXMOX_VM_IP}" +echo "Model: ${MODEL}" +echo "Eval: ${EVAL_MODEL}" +echo "Observation: ${OBSERVATION_TYPE}" +echo "Domain: ${DOMAIN}" +echo "Results: ${RESULT_DIR}" +echo "" + +# ---------- Run the evaluation ---------- +if [ "${INJECT_STEPS}" = true ]; then + INJECT_STEPS_FLAG="--inject_steps" +else + INJECT_STEPS_FLAG="--no_inject_steps" +fi + +python run.py \ + --provider_name "${PROVIDER}" \ + --path_to_vm "${VM_ID}" \ + --observation_type "${OBSERVATION_TYPE}" \ + --action_space "${ACTION_SPACE}" \ + --model "${MODEL}" \ + --eval_model "${EVAL_MODEL}" \ + --temperature "${TEMPERATURE}" \ + --top_p "${TOP_P}" \ + --max_tokens "${MAX_TOKENS}" \ + --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \ + --screen_width "${SCREEN_WIDTH}" \ + --screen_height "${SCREEN_HEIGHT}" \ + --sleep_after_execution "${SLEEP_AFTER_EXEC}" \ + --max_steps "${MAX_STEPS}" \ + --result_dir "${RESULT_DIR}" \ + --test_all_meta_path "${TEST_META}" \ + --domain "${DOMAIN}" \ + ${INJECT_STEPS_FLAG}