From 970d430dcff84e274d4bc88f99b0ff74bd14a970 Mon Sep 17 00:00:00 2001 From: lizhanyuan <949777411@qq.com> Date: Thu, 19 Mar 2026 14:49:39 +0800 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20=E6=9C=AC=E5=9C=B0=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=20agent.py=20/=20run=5Fproxmox=20/=20chrome=20tasks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../chrome_windows/chrome_win_task01.json | 45 ++++++++ .../chrome_windows/chrome_win_task02.json | 46 ++++++++ .../chrome_windows/chrome_win_task03.json | 45 ++++++++ .../chrome_windows/chrome_win_task04.json | 45 ++++++++ .../chrome_windows/chrome_win_task05.json | 45 ++++++++ .../chrome_windows/chrome_win_task06.json | 45 ++++++++ .../chrome_windows/chrome_win_task07.json | 45 ++++++++ .../chrome_windows/chrome_win_task08.json | 68 +++++++++++ .../chrome_windows/chrome_win_task09.json | 45 ++++++++ .../chrome_windows/chrome_win_task10.json | 46 ++++++++ .../chrome_windows/chrome_win_task11.json | 46 ++++++++ .../chrome_windows/chrome_win_task12.json | 46 ++++++++ evaluation_examples/test_final.json | 25 ++++ mm_agents/agent.py | 3 + run_proxmox.sh | 8 +- run_proxmox_chrome.sh | 108 ++++++++++++++++++ 16 files changed, 707 insertions(+), 4 deletions(-) create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task01.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task02.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task03.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task04.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task05.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task06.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task07.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task08.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task09.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task10.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task11.json create mode 100644 evaluation_examples/examples/chrome_windows/chrome_win_task12.json create mode 100644 run_proxmox_chrome.sh diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task01.json b/evaluation_examples/examples/chrome_windows/chrome_win_task01.json new file mode 100644 index 0000000..3b0c5fa --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task01.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task01", + "snapshot": "snapshot", + "instruction": "Can you make a new folder for me on the bookmarks bar in Chrome? Let's call it 'Favorites.'", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 在书签栏空白处右键,选择'Add folder'(或'添加文件夹')。\n3. 在弹出的对话框中将文件夹名称设为'Favorites',点击保存。\n4. 确认书签栏上出现了名为'Favorites'的文件夹。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task02.json b/evaluation_examples/examples/chrome_windows/chrome_win_task02.json new file mode 100644 index 0000000..50a9a56 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task02.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task02", + "snapshot": "snapshot", + "instruction": "Can you save this webpage I'm looking at to the bookmarks bar so I can come back to it later?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://jalammar.github.io/illustrated-transformer/" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到目标页面。\n2. 按 Ctrl+D 打开书签保存对话框,或点击地址栏右侧的星形图标。\n3. 在'Folder'下拉菜单中选择'Bookmarks bar'(书签栏)。\n4. 点击'Done'保存。\n5. 确认书签栏上出现了该页面的书签。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task03.json b/evaluation_examples/examples/chrome_windows/chrome_win_task03.json new file mode 100644 index 0000000..3e03dfa --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task03.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task03", + "snapshot": "snapshot", + "instruction": "My grandmother has been using Chrome and told me the font size is way too small. Could you set the default font size to the largest?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 在左侧找到'Appearance'(外观),点击进入。\n4. 找到'Font size'(字体大小),将其设置为'Very large'(最大)。\n5. 确认字体大小选项显示为'Very large'。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task04.json b/evaluation_examples/examples/chrome_windows/chrome_win_task04.json new file mode 100644 index 0000000..c8ff886 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task04.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task04", + "snapshot": "snapshot", + "instruction": "Can you make Bing the main search engine when I look stuff up in Chrome?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 在左侧点击'Search engine'(搜索引擎)。\n4. 在'Search engine used in the address bar'下拉菜单中选择'Bing'。\n5. 确认搜索引擎已改为Bing。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task05.json b/evaluation_examples/examples/chrome_windows/chrome_win_task05.json new file mode 100644 index 0000000..5d47154 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task05.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task05", + "snapshot": "snapshot", + "instruction": "I want Chrome to enable the 'Do Not Track' feature to enhance my online privacy. Can you turn that on?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 在左侧点击'Privacy and security'(隐私和安全)。\n4. 找到'Send a Do Not Track request with your browsing traffic',将其开关打开(变为蓝色)。\n5. 确认该开关处于开启状态。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task06.json b/evaluation_examples/examples/chrome_windows/chrome_win_task06.json new file mode 100644 index 0000000..2946c74 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task06.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task06", + "snapshot": "snapshot", + "instruction": "Please help me set Chrome to delete my browsing data automatically every time I close the browser.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角三点菜单 → Settings(设置)。\n3. 点击'Privacy and security' → 'Cookies and other site data'。\n4. 找到'Clear cookies and site data when you close all windows',将其开关打开。\n5. 确认该开关处于开启状态。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task07.json b/evaluation_examples/examples/chrome_windows/chrome_win_task07.json new file mode 100644 index 0000000..1532ab6 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task07.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task07", + "snapshot": "snapshot", + "instruction": "I want to update my Chrome profile name to Thomas. Could you help me change the username in Chrome profiles?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 点击右上角的个人头像图标(Profile图标)。\n3. 在弹出菜单中点击铅笔编辑图标或'Manage profiles'。\n4. 在名称输入框中将当前名称修改为'Thomas'。\n5. 点击完成/保存,确认界面上显示用户名为'Thomas'。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task08.json b/evaluation_examples/examples/chrome_windows/chrome_win_task08.json new file mode 100644 index 0000000..b1d955a --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task08.json @@ -0,0 +1,68 @@ +{ + "id": "chrome_win_task08", + "snapshot": "snapshot", + "instruction": "Can you make my computer bring back the last tab I shut down in Chrome?", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.lonelyplanet.com" + ], + "shell": false + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "--new-tab", + "https://www.airbnb.com" + ], + "shell": false + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "--new-tab", + "https://www.tripadvisor.com" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并加载了多个标签页。\n2. 关闭最右侧的标签页(点击该标签页上的X,或按Ctrl+W)。\n3. 恢复刚刚关闭的标签页:按 Ctrl+Shift+T,或右键标签栏选择'Reopen closed tab'。\n4. 确认被关闭的标签页重新出现在标签栏中。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task09.json b/evaluation_examples/examples/chrome_windows/chrome_win_task09.json new file mode 100644 index 0000000..a3954c2 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task09.json @@ -0,0 +1,45 @@ +{ + "id": "chrome_win_task09", + "snapshot": "snapshot", + "instruction": "Computer, please navigate to the area in Chrome settings where my passwords are stored.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开。\n2. 在地址栏输入 chrome://password-manager/passwords 并按回车,或通过 Settings → Autofill and passwords → Google Password Manager 找到密码管理页面。\n3. 确认页面显示的是Chrome密码管理器界面。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task10.json b/evaluation_examples/examples/chrome_windows/chrome_win_task10.json new file mode 100644 index 0000000..4daa6a0 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task10.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task10", + "snapshot": "snapshot", + "instruction": "Please help me find the score record for the 2019 Super Bowl on the NFL website.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.nfl.com/" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到NFL官网。\n2. 在页面导航中找到Scores或Schedule入口。\n3. 找到2019赛季的Super Bowl比赛(Super Bowl LIII,2019年2月3日)。\n4. 确认页面上显示了该场比赛的比分信息。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task11.json b/evaluation_examples/examples/chrome_windows/chrome_win_task11.json new file mode 100644 index 0000000..e8cc13c --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task11.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task11", + "snapshot": "snapshot", + "instruction": "Open the baggage fee calculator on the United Airlines website.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.united.com/en/us" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到United Airlines官网。\n2. 在页面导航中找到Travel info或Baggage部分。\n3. 找到并打开行李费计算器(Baggage Calculator)页面。\n4. 确认页面显示的是行李费计算器界面。" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/chrome_windows/chrome_win_task12.json b/evaluation_examples/examples/chrome_windows/chrome_win_task12.json new file mode 100644 index 0000000..1e09037 --- /dev/null +++ b/evaluation_examples/examples/chrome_windows/chrome_win_task12.json @@ -0,0 +1,46 @@ +{ + "id": "chrome_win_task12", + "snapshot": "snapshot", + "instruction": "Show me the side effects of Tamiflu on the Drugs.com website.", + "source": "OSWorld-adapted", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "https://www.drugs.com/tamiflu.html" + ], + "shell": false + } + }, + { + "type": "sleep", + "parameters": { + "seconds": 5 + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "postconfig": [ + { + "type": "sleep", + "parameters": { + "seconds": 3 + } + } + ], + "func": "vllm_eval" + }, + "proxy": false, + "fixed_ip": true, + "possibility_of_env_change": "low", + "metadata": { + "input_files": [], + "steps": "1. Chrome已自动打开并导航到Drugs.com的Tamiflu页面。\n2. 在页面上找到'Side Effects'(副作用)部分并点击查看。\n3. 确认页面上显示了Tamiflu的副作用信息列表。" + } +} \ No newline at end of file diff --git a/evaluation_examples/test_final.json b/evaluation_examples/test_final.json index 7a1aa67..a97c1d9 100644 --- a/evaluation_examples/test_final.json +++ b/evaluation_examples/test_final.json @@ -23,5 +23,30 @@ "VESTA_Manual_task9", "VESTA_Manual_task10", "VESTA_Manual_task11" + ], + "ovito": [ + "animation_task3", + "aspherical_particles_task1", + "clone_pipeline_task1", + "customize_init_state_task1", + "data_model_task1", + "export_task1", + "marker_particles_task2", + "miscellaneous_task1", + "python_extensions_task1", + "remote_file_access_task1", + "rendering_task1", + "transparent_particles_task1", + "viewports_task1", + "viewports_task2", + "viewports_task3", + "viewports_task4", + "viewports_task5", + "viewports_task6", + "viewports_task7", + "viewports_task8", + "viewports_task9", + "viewports_task10", + "viewports_task11" ] } diff --git a/mm_agents/agent.py b/mm_agents/agent.py index 1c27125..b57cbce 100644 --- a/mm_agents/agent.py +++ b/mm_agents/agent.py @@ -722,6 +722,9 @@ class PromptAgent: "Content-Type": "application/json", "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}" } + # Newer GPT models (gpt-5.x, o-series) require max_completion_tokens instead of max_tokens + if "max_tokens" in payload: + payload["max_completion_tokens"] = payload.pop("max_tokens") logger.info("Generating content with GPT model: %s", self.model) response = requests.post( api_url, diff --git a/run_proxmox.sh b/run_proxmox.sh index 2d9bc74..2d3c9eb 100755 --- a/run_proxmox.sh +++ b/run_proxmox.sh @@ -12,7 +12,7 @@ export PROXMOX_VM_IP="10.10.17.10" # ---------- LLM API 配置 ---------- # OpenAI 兼容代理(同时用于 Agent 模型和 Eval 模型) -export OPENAI_API_KEY="sk-5zk3CL73E2DsNyMn5a6dA357B6214eEd9240A674Ec0555Be" +export OPENAI_API_KEY="sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17" export OPENAI_BASE_URL="https://vip.apiyi.com/v1" # ---------- 评测参数(按需修改) ---------- @@ -26,13 +26,13 @@ TEMPERATURE=0.5 # 生成温度(越低越稳定 TOP_P=0.9 # nucleus sampling MAX_TOKENS=16384 # 模型最大输出 token 数 MAX_TRAJECTORY_LENGTH=3 # 历史轨迹保留长度 -OBSERVATION_TYPE="screenshot_a11y_tree" # 观测类型 +OBSERVATION_TYPE="screenshot" # 观测类型 ACTION_SPACE="pyautogui" # 动作空间 SCREEN_WIDTH=1920 # 屏幕宽度 SCREEN_HEIGHT=1080 # 屏幕高度 -RESULT_DIR="/Volumes/Castor/课题/results" # 结果输出目录 +RESULT_DIR="/Volumes/Castor/课题/results_baseline_50steps" # 结果输出目录 TEST_META="evaluation_examples/test_final.json" # 评测任务列表 -DOMAIN="vesta" # 评测领域 +DOMAIN="ovito" # 评测领域 SNAPSHOT_NAME="snapshot" # 快照名称(需提前创建) INJECT_STEPS=false # 是否注入教程步骤到 Agent prompt(baseline 不注入) diff --git a/run_proxmox_chrome.sh b/run_proxmox_chrome.sh new file mode 100644 index 0000000..407e45b --- /dev/null +++ b/run_proxmox_chrome.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# ============================================================================= +# Chrome Windows 通用软件对照组评测脚本 +# 用途:在 Windows VM 上测试 Chrome 任务,作为科学软件的通用软件对照组 +# ============================================================================= + +# ---------- Proxmox 配置 ---------- +export PROXMOX_SSH_HOST="root@10.10.17.3" +export PROXMOX_VM_IP="10.10.17.10" + +# ---------- LLM API 配置 ---------- +export OPENAI_API_KEY="sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17" +export OPENAI_BASE_URL="https://vip.apiyi.com/v1" + +# ---------- 评测参数 ---------- +PROVIDER="proxmox" +VM_ID="102" +MODEL="gpt-5.4" +EVAL_MODEL="gemini-3.1-pro-preview" +MAX_STEPS=35 +SLEEP_AFTER_EXEC=3 +TEMPERATURE=0.5 +TOP_P=0.9 +MAX_TOKENS=16384 +MAX_TRAJECTORY_LENGTH=3 +ACTION_SPACE="pyautogui" +SCREEN_WIDTH=1920 +SCREEN_HEIGHT=1080 +RESULT_DIR="/Volumes/Castor/课题/results_baseline_50steps" +TEST_META="evaluation_examples/test_chrome.json" +DOMAIN="chrome_windows" +SNAPSHOT_NAME="snapshot" +INJECT_STEPS=false + +# ---------- 两种观测模式,按需切换 ---------- +# screenshot only: +#OBSERVATION_TYPE="screenshot" +# screenshot + a11y tree(第二轮时改为下面这行): +OBSERVATION_TYPE="screenshot_a11y_tree" + +# ---------- 预检查 ---------- +echo "=== 预检查 ===" + +echo -n "SSH 到 Proxmox... " +if ssh -o BatchMode=yes -o ConnectTimeout=5 ${PROXMOX_SSH_HOST} "echo ok" 2>/dev/null | grep -q "ok"; then + echo "✅ 连接成功" +else + echo "❌ SSH 连接失败" + exit 1 +fi + +echo -n "VM ${VM_ID} 状态... " +VM_STATUS=$(ssh -o BatchMode=yes ${PROXMOX_SSH_HOST} "qm status ${VM_ID}" 2>/dev/null) +echo "${VM_STATUS}" + +echo -n "Flask Server (${PROXMOX_VM_IP}:5000)... " +if curl -s --connect-timeout 5 "http://${PROXMOX_VM_IP}:5000/screenshot" -o /dev/null -w "%{http_code}" | grep -q "200"; then + echo "✅ 可访问" +else + echo "⚠️ 不可访问(评测启动时会自动启动 VM)" +fi + +echo -n "快照 '${SNAPSHOT_NAME}'... " +SNAPSHOTS=$(ssh -o BatchMode=yes ${PROXMOX_SSH_HOST} "qm listsnapshot ${VM_ID}" 2>/dev/null) +if echo "${SNAPSHOTS}" | grep -q "${SNAPSHOT_NAME}"; then + echo "✅ 已存在" +else + echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'" +fi + +echo "" +echo "=== 开始评测 ===" +echo "Provider: ${PROVIDER}" +echo "VM ID: ${VM_ID}" +echo "VM IP: ${PROXMOX_VM_IP}" +echo "Model: ${MODEL}" +echo "Eval: ${EVAL_MODEL}" +echo "Observation: ${OBSERVATION_TYPE}" +echo "Domain: ${DOMAIN}" +echo "Results: ${RESULT_DIR}" +echo "" + +# ---------- 运行评测 ---------- +if [ "${INJECT_STEPS}" = true ]; then + INJECT_STEPS_FLAG="--inject_steps" +else + INJECT_STEPS_FLAG="--no_inject_steps" +fi + +python run.py \ + --provider_name "${PROVIDER}" \ + --path_to_vm "${VM_ID}" \ + --observation_type "${OBSERVATION_TYPE}" \ + --action_space "${ACTION_SPACE}" \ + --model "${MODEL}" \ + --eval_model "${EVAL_MODEL}" \ + --temperature "${TEMPERATURE}" \ + --top_p "${TOP_P}" \ + --max_tokens "${MAX_TOKENS}" \ + --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \ + --screen_width "${SCREEN_WIDTH}" \ + --screen_height "${SCREEN_HEIGHT}" \ + --sleep_after_execution "${SLEEP_AFTER_EXEC}" \ + --max_steps "${MAX_STEPS}" \ + --result_dir "${RESULT_DIR}" \ + --test_all_meta_path "${TEST_META}" \ + --domain "${DOMAIN}" \ + ${INJECT_STEPS_FLAG} From c9912ad54cee60b63698978cd386f5387bf44162 Mon Sep 17 00:00:00 2001 From: lizhanyuan <949777411@qq.com> Date: Thu, 19 Mar 2026 14:59:20 +0800 Subject: [PATCH 2/3] =?UTF-8?q?data:=20=E5=88=A0=E9=99=A4=20ovito=20remote?= =?UTF-8?q?=5Ffile=5Faccess/rendering=20=E4=BB=BB=E5=8A=A1=EF=BC=8C?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=20test=5Ffinal.json?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ovito/remote_file_access_task1.json | 45 ------------------- .../examples/ovito/rendering_task1.json | 45 ------------------- evaluation_examples/test_final.json | 4 +- 3 files changed, 1 insertion(+), 93 deletions(-) delete mode 100644 evaluation_examples/examples/ovito/remote_file_access_task1.json delete mode 100644 evaluation_examples/examples/ovito/rendering_task1.json diff --git a/evaluation_examples/examples/ovito/remote_file_access_task1.json b/evaluation_examples/examples/ovito/remote_file_access_task1.json deleted file mode 100644 index a910634..0000000 --- a/evaluation_examples/examples/ovito/remote_file_access_task1.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "id": "remote_file_access_task1", - "snapshot": "ovito", - "instruction": "在 OVITO 中,通过 File → Load Remote File 打开远程 SSH 文件 sftp://user@hostname/path/file", - "source": "custom", - "config": [ - { - "type": "launch", - "parameters": { - "command": [ - "C:\\OVITO Basic\\ovito.exe" - ] - } - }, - { - "type": "sleep", - "parameters": { - "seconds": 5 - } - } - ], - "trajectory": "trajectories/", - "related_apps": [ - "ovito" - ], - "evaluator": { - "postconfig": [ - { - "type": "sleep", - "parameters": { - "seconds": 3 - } - } - ], - "func": "vllm_eval" - }, - "proxy": false, - "fixed_ip": false, - "possibility_of_env_change": "low", - "metadata": { - "input_files": [], - "steps": "1. 启动 OVITO 软件并等待进入主界面。\n2. 单击主界面顶部菜单栏中的 \"File\" 菜单将其展开。\n3. 在展开的下拉菜单中,单击选择 \"Load Remote File\" 菜单项,此时会弹出 \"Load Remote File\" 对话框。\n4. 在 \"Load Remote File\" 对话框中,单击 \"Remote URL:\" 标签下方的组合输入框将光标定位至此。\n5. 在该输入框中,清空原有内容,并输入远程文件地址:sftp://user@hostname/path/file。\n6. 单击 \"File type:\" 标签下方的下拉菜单控件将其展开。\n7. 在展开的下拉列表中,单击选择 \"\" 选项。\n8. 在 \"SSH connection method:\" 区域,单击 \"Integrated client (default)\" 左侧的单选按钮以选中该选项。\n9. 单击对话框右下角蓝色的 \"Open\" 按钮以建立连接并加载文件。", - "steps_original": "1. 打开 OVITO 软件。\n2. 点击菜单 File → Load Remote File。\n3. 在弹出的对话框中填写 Remote URL 字段,例如:sftp://user@hostname/path/file。\n4. 在 File type 下选择 Auto-detect file format。\n5. 在 SSH connection method 下选择 Integrated client (default)。\n6. 点击 Open 完成连接并加载文件。" - } -} \ No newline at end of file diff --git a/evaluation_examples/examples/ovito/rendering_task1.json b/evaluation_examples/examples/ovito/rendering_task1.json deleted file mode 100644 index 8534506..0000000 --- a/evaluation_examples/examples/ovito/rendering_task1.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "id": "rendering_task1", - "snapshot": "ovito", - "instruction": "在 OVITO 中,通过 Render Settings 面板渲染主动观察窗口为分辨率 1024x768 的图像,背景为透明色。", - "source": "custom", - "config": [ - { - "type": "launch", - "parameters": { - "command": [ - "C:\\OVITO Basic\\ovito.exe" - ] - } - }, - { - "type": "sleep", - "parameters": { - "seconds": 5 - } - } - ], - "trajectory": "trajectories/", - "related_apps": [ - "ovito" - ], - "evaluator": { - "postconfig": [ - { - "type": "sleep", - "parameters": { - "seconds": 3 - } - } - ], - "func": "vllm_eval" - }, - "proxy": false, - "fixed_ip": false, - "possibility_of_env_change": "low", - "metadata": { - "input_files": [], - "steps": "1. 双击桌面或系统菜单中的 OVITO 软件快捷方式打开软件。\n2. 在软件主界面的观察窗口区域,单击鼠标左键以激活目标视口(确保视口边缘出现黄色边框)。\n3. 在界面右侧命令面板顶部的工具栏中,单击带有照相机形状的 \"Render\" 图标选项卡,打开 \"Render settings\" 面板。\n4. 在 \"Render settings\" 面板的 \"Rendering range\" 区域,单击选中 \"Single frame\" 单选按钮。\n5. 在 \"Output image size\" 区域,将光标定位到 \"Width:\" 对应的数值输入框中,清空原有内容,输入 \"1024\"。\n6. 在 \"Output image size\" 区域,将光标定位到 \"Height:\" 对应的数值输入框中,清空原有内容,输入 \"768\"。\n7. 在 \"Background\" 区域,单击选中 \"Transparent\" 单选按钮。\n8. 在 \"Render settings\" 面板的底部,单击带有照相机图标的 \"Render active viewport\" 按钮开始渲染。", - "steps_original": "1. 打开 OVITO 软件。\n2. 确保观察窗口激活(黄色边框)。\n3. 点击右侧命令面板上的 Render 图标。\n4. 在弹出的 Render Settings 面板中,选择 'Single frame'。\n5. 设置输出图像大小为 Width: 1024 和 Height: 768。\n6. 选择背景为 'Transparent'。\n7. 点击 'Render active viewport' 按钮完成渲染。" - } -} \ No newline at end of file diff --git a/evaluation_examples/test_final.json b/evaluation_examples/test_final.json index a97c1d9..1c86295 100644 --- a/evaluation_examples/test_final.json +++ b/evaluation_examples/test_final.json @@ -34,8 +34,6 @@ "marker_particles_task2", "miscellaneous_task1", "python_extensions_task1", - "remote_file_access_task1", - "rendering_task1", "transparent_particles_task1", "viewports_task1", "viewports_task2", @@ -49,4 +47,4 @@ "viewports_task10", "viewports_task11" ] -} +} \ No newline at end of file From b1ed0a478511115622edf0e5c1590c93f4cdd855 Mon Sep 17 00:00:00 2001 From: lizhanyuan <949777411@qq.com> Date: Wed, 25 Mar 2026 23:27:21 +0800 Subject: [PATCH 3/3] add a11y_tree recording to trajectory output --- lib_run_single.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib_run_single.py b/lib_run_single.py index 78051b8..277cc04 100644 --- a/lib_run_single.py +++ b/lib_run_single.py @@ -67,7 +67,8 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl "reward": reward, "done": done, "info": info, - "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png" + "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png", + "a11y_tree": obs.get("accessibility_tree") if isinstance(obs, dict) else None })) f.write("\n") if done: