From acd38ca54a82f1ba355feab5658a3d9dcae2e00d Mon Sep 17 00:00:00 2001 From: lizhanyuan <949777411@qq.com> Date: Tue, 7 Apr 2026 22:27:59 +0800 Subject: [PATCH] feat(uv): add uv direct eval script, task data, healthcheck fix - add run_uv.sh direct eval entry and test_uv.json meta - add evaluation_examples/examples/uv task data - switch setup.py healthcheck from /terminal to /screenshot to avoid Win7 hang --- desktop_env/controllers/setup.py | 4 +- evaluation_examples/examples/uv/uv_test.json | 33 +++++++ evaluation_examples/test_uv.json | 5 ++ run_uv.sh | 92 ++++++++++++++++++++ 4 files changed, 132 insertions(+), 2 deletions(-) create mode 100644 evaluation_examples/examples/uv/uv_test.json create mode 100644 evaluation_examples/test_uv.json create mode 100755 run_uv.sh diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index f7637c1..322803e 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -73,7 +73,7 @@ class SetupController: retry = 0 while retry < MAX_RETRIES: try: - _ = requests.get(self.http_server + "/terminal") + _ = requests.get(self.http_server + "/screenshot", timeout=10) break except: time.sleep(5) @@ -518,7 +518,7 @@ class SetupController: retry = 0 while retry < MAX_RETRIES: try: - _ = requests.get(self.http_server + "/terminal") + _ = requests.get(self.http_server + "/screenshot", timeout=10) break except: time.sleep(5) diff --git a/evaluation_examples/examples/uv/uv_test.json b/evaluation_examples/examples/uv/uv_test.json new file mode 100644 index 0000000..0ec206c --- /dev/null +++ b/evaluation_examples/examples/uv/uv_test.json @@ -0,0 +1,33 @@ +{ + "id": "uv_test", + "snapshot": "uv", + "instruction": 
"从第四步开始,假设你的目标就是测量样品的荧光图谱。下面是透露给你的辅助信息,把辅助信息当成你自己的思维和决策并完全遵循,而不是把它当成任务,**绝对不允许在输出的「观察」「判断」中说明是任务要求,而是把辅助信息当成你自己的经验,用根据我的经验使用某某配置而不是根据任务要求使用某某配置**因为下面的步骤是完全正确和精准的,你只需要执行。执行要求:严格按坐标执行,每步执行完等待界面响应再执行下一步,禁止连续操作。修改任何数值输入框时必须:先double_click双击全选(pyautogui.click(x,y);pyautogui.click(x,y)),再pyautogui.press(delete),最后用pyautogui.typewrite输入新值。1.等待主界面加载并确认正常;2.点击sample(1885,201)打开保存位置选择,双击选中sample name文本框(798,295),并清空和重命名为test 3.查看当前的输出路径是否为C:\\Users\\admin\\Desktop\\test_lzy,是的话则点击按钮OK(758,746) 返回到主界面 4.点击右侧的Method按钮(1896,147)进行参数配置 5. 点击Instrument按钮(463,54)进入Instrument面板,并检查当前的配置参数是不是为\n\n```\n {\n \"data_mode\": \"Abs\",\n \"start_wavelength\": {\"value\": 850, \"unit\": \"nm\"},\n \"end_wavelength\": {\"value\": 750, \"unit\": \"nm\"},\n \"scan_speed\": {\"value\": 300, \"unit\": \"nm/min\"},\n \"high_resolution\": \"Off\",\n \"baseline\": \"User 1\",\n \"delay\": {\"value\": 0, \"unit\": \"s\"},\n \"cycle_time\": {\"value\": 0, \"unit\": \"min\"},\n \"auto_zero_before_each_run\": False,\n \"lamp_change_mode\": \"Auto\",\n \"lamp_change_wavelength\": {\"value\": 325, \"unit\": \"nm\"},\n \"wi_lamp\": \"On\",\n \"d2_lamp\": \"On\",\n \"slit_width\": {\"value\": 2, \"unit\": \"nm\"},\n \"pmt_mode\": \"Auto\",\n \"pmt_voltage\": {\"value\": 100, \"unit\": \"V\"},\n \"sampling_interval\": \"Auto\",\n \"replicates\": 1,\n \"uv_scan_speed_change\": {\n \"enabled\": False,\n \"speed_change_wavelength\": {\"value\": 340, \"unit\": \"nm\"},\n \"scan_speed\": {\"value\": 120, \"unit\": \"nm/min\"}\n },\n \"path_correct\": True,\n \"path_length\": {\"value\": 10, \"unit\": \"mm\"}\n }\n```\n\n6. 点击Report按钮(711,57)进入Report面板 7. 再点击确定按钮(1150,1015)回到主界面 8.点击确定(641,717);9. 等待Ready 10. 点击右侧的Baseline按钮(1896,248) 11. 选择baseline为User1(默认),点击OK按钮(1165,496),等待扫描完成变成ready。12.点击主界面右侧的measure按钮(1888,291)进行测量,等待扫描完成,图中会出现保存的pdf 路径,需要确定路径为桌面的test_lzy,确定后的话,单击文件名输入框(726,774),命名为test,如果默认有后缀,则点击空白处让命名为test 13. 
点击保存按钮(1687,867)",
+  "source": "custom",
+  "config": [],
+  "trajectory": "trajectories/",
+  "related_apps": [
+    "uv"
+  ],
+  "evaluator": {
+    "postconfig": [
+      {
+        "type": "sleep",
+        "parameters": {
+          "seconds": 5
+        }
+      }
+    ],
+    "func": "vllm_eval",
+    "expected": {
+      "description": "FL Solutions 主界面中图表区域应显示一条完整的荧光发射光谱曲线:峰形平滑、顶部无截断(曲线最高点不贴近纵轴上限)、基线平稳、信噪比良好。界面中的仪器参数区域应可见激发波长 350 nm、发射扫描范围 380-700 nm,以及经过迭代调整后的最终 PMT 电压和狭缝宽度参数。"
+    }
+  },
+  "proxy": false,
+  "fixed_ip": true,
+  "possibility_of_env_change": "medium",
+  "metadata": {
+    "input_files": [],
+    "steps": "",
+    "difficulty": "hard"
+  }
+}
diff --git a/evaluation_examples/test_uv.json b/evaluation_examples/test_uv.json
new file mode 100644
index 0000000..f798e87
--- /dev/null
+++ b/evaluation_examples/test_uv.json
@@ -0,0 +1,5 @@
+{
+  "uv": [
+    "uv_test"
+  ]
+}
diff --git a/run_uv.sh b/run_uv.sh
new file mode 100755
index 0000000..eb59335
--- /dev/null
+++ b/run_uv.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+# =============================================================================
+# uv evaluation script
+# provider: direct -- talks to the Flask service directly; no VM/SSH is involved
+# =============================================================================
+
+# ---------- Win7 direct-connect IP (override by exporting DIRECT_VM_IP) ----------
+export DIRECT_VM_IP="${DIRECT_VM_IP:-172.20.103.218}" # Flask address of the physical uv machine
+
+# ---------- LLM API configuration (the key must come from the environment; never commit it) ----------
+export OPENAI_API_KEY="${OPENAI_API_KEY:?export OPENAI_API_KEY before running this script}"
+export OPENAI_BASE_URL="${OPENAI_BASE_URL:-https://vip.apiyi.com/v1}"
+
+# ---------- Evaluation parameters (aligned with run_proxmox.sh) ----------
+MODEL="gpt-4o" # claude-sonnet-4-6
+EVAL_MODEL="gemini-3.1-pro-preview"
+MAX_STEPS=50
+SLEEP_AFTER_EXEC=3
+TEMPERATURE=0
+TOP_P=0.9
+MAX_TOKENS=16384
+MAX_TRAJECTORY_LENGTH=5
+OBSERVATION_TYPE="screenshot"
+ACTION_SPACE="pyautogui"
+SCREEN_WIDTH=1920
+SCREEN_HEIGHT=1080
+RESULT_DIR="${RESULT_DIR:-/Users/lizhanyuan/Downloads/results/uv}" # default kept; override via env
+TEST_META="evaluation_examples/test_uv.json"
+DOMAIN="uv"
+INJECT_STEPS=False
+
+# ---------- Pre-flight checks (run from the script's own directory so relative paths resolve) ----------
+cd "$(dirname "$0")" || exit 1
+
+echo "=== uv 评测预检查 ==="
+echo ""
+# Probe /screenshot on the Flask server; --max-time bounds a response that connects but then stalls.
+echo -n "Flask Server (${DIRECT_VM_IP}:5000)... "
+HTTP_CODE=$(curl -s --connect-timeout 5 --max-time 15 "http://${DIRECT_VM_IP}:5000/screenshot" \
+  -o /dev/null -w "%{http_code}" 2>/dev/null)
+if [ "$HTTP_CODE" = "200" ]; then
+  echo "OK"
+else
+  echo "FAIL (HTTP ${HTTP_CODE})"
+  echo "[ERROR] Win7 Flask Server 不可达,请先在 Win7 运行: python D:\python_server\main.py"
+  exit 1
+fi
+
+mkdir -p "${RESULT_DIR}" logs
+
+echo ""
+echo "=== 开始评测 ==="
+echo " Provider: direct (无 VM 管理,直连 Flask)"
+echo " Win7 IP: ${DIRECT_VM_IP}"
+echo " Model: ${MODEL}"
+echo " Eval: ${EVAL_MODEL}"
+echo " Task: ${DOMAIN} (${TEST_META})"
+echo " Obs Type: ${OBSERVATION_TYPE} (screenshot only, Win7 a11y unstable)"
+echo " Max Steps: ${MAX_STEPS}"
+echo " Max Tokens: ${MAX_TOKENS}"
+echo " Results: ${RESULT_DIR}"
+echo ""
+
+# Accept true/True/TRUE/1: INJECT_STEPS is written Python-style ("False"), so a lowercase-only test would ignore "True".
+case "${INJECT_STEPS}" in
+  true|True|TRUE|1) INJECT_FLAG="--inject_steps" ;;
+  *) INJECT_FLAG="--no_inject_steps" ;;
+esac
+# Propagate run.py's exit status so a failed run is not reported as completed below.
+python3 run.py \
+  --provider_name "direct" \
+  --path_to_vm "ignored" \
+  --observation_type "${OBSERVATION_TYPE}" \
+  --action_space "${ACTION_SPACE}" \
+  --model "${MODEL}" \
+  --eval_model "${EVAL_MODEL}" \
+  --temperature "${TEMPERATURE}" \
+  --top_p "${TOP_P}" \
+  --max_tokens "${MAX_TOKENS}" \
+  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \
+  --screen_width "${SCREEN_WIDTH}" \
+  --screen_height "${SCREEN_HEIGHT}" \
+  --sleep_after_execution "${SLEEP_AFTER_EXEC}" \
+  --max_steps "${MAX_STEPS}" \
+  --result_dir "${RESULT_DIR}" \
+  --test_all_meta_path "${TEST_META}" \
+  --domain "${DOMAIN}" \
+  ${INJECT_FLAG} || exit $?
+
+echo ""
+echo "=== 评测完成 ==="
+echo "结果保存在: ${RESULT_DIR}"