feat(uv): add uv direct eval script, task data, healthcheck fix
- add run_uv.sh direct eval entry and test_uv.json meta
- add evaluation_examples/examples/uv task data
- switch setup.py healthcheck from /terminal to /screenshot to avoid Win7 hang
This commit is contained in:
@@ -73,7 +73,7 @@ class SetupController:
|
||||
retry = 0
|
||||
while retry < MAX_RETRIES:
|
||||
try:
|
||||
_ = requests.get(self.http_server + "/terminal")
|
||||
_ = requests.get(self.http_server + "/screenshot", timeout=10)
|
||||
break
|
||||
except:
|
||||
time.sleep(5)
|
||||
@@ -518,7 +518,7 @@ class SetupController:
|
||||
retry = 0
|
||||
while retry < MAX_RETRIES:
|
||||
try:
|
||||
_ = requests.get(self.http_server + "/terminal")
|
||||
_ = requests.get(self.http_server + "/screenshot", timeout=10)
|
||||
break
|
||||
except:
|
||||
time.sleep(5)
|
||||
|
||||
33
evaluation_examples/examples/uv/uv_test.json
Normal file
33
evaluation_examples/examples/uv/uv_test.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"id": "uv_test",
|
||||
"snapshot": "uv",
|
||||
"instruction": "从第四步开始,假设你的目标就是测量样品的荧光图谱。下面的透露给你的辅助信息,把辅助信息当成你自己的思维和决策并完全遵循,而不是把它当成任务,**绝对不允许在输出的「观察」「判断」中说明是任务要求,而是把辅助信息当成你自己的经验,用根据我的经验使用某某配置而不是根据任务要求使用某某配置**因为下面的步骤是完全正确和精准的,你只需要执行,而且绝对不允许在:是严格按坐标执行,每步执行完等待界面响应再执行下一步,禁止连续操作。修改任何数值输入框时必须:先double_click双击全选pyautogui.click(x,y);pyautogui.click(x,y)),再pyautogui.press(delete),最后用pyautogui.typewrite输入新值。1.等待主界面加载并确认正常;2.点击sample(1885,201)打开保存位置选择,双击选中sample name文本框(798,295),并清空和重命名为test 3.查看当前的输出路径是否为C:\\Users\\admin\\Desktop\\test_lzy,是的话则点击按钮OK(758,746) 返回到主界面 4.点击右侧的Method按钮(1896,147)进行参数配置 5. 点击Instrument按钮(463,54)进入Instrument面板,并检查当前的配置参数是不是为\n\n```\n {\n \"data_mode\": \"Abs\",\n \"start_wavelength\": {\"value\": 850, \"unit\": \"nm\"},\n \"end_wavelength\": {\"value\": 750, \"unit\": \"nm\"},\n \"scan_speed\": {\"value\": 300, \"unit\": \"nm/min\"},\n \"high_resolution\": \"Off\",\n \"baseline\": \"User 1\",\n \"delay\": {\"value\": 0, \"unit\": \"s\"},\n \"cycle_time\": {\"value\": 0, \"unit\": \"min\"},\n \"auto_zero_before_each_run\": False,\n \"lamp_change_mode\": \"Auto\",\n \"lamp_change_wavelength\": {\"value\": 325, \"unit\": \"nm\"},\n \"wi_lamp\": \"On\",\n \"d2_lamp\": \"On\",\n \"slit_width\": {\"value\": 2, \"unit\": \"nm\"},\n \"pmt_mode\": \"Auto\",\n \"pmt_voltage\": {\"value\": 100, \"unit\": \"V\"},\n \"sampling_interval\": \"Auto\",\n \"replicates\": 1,\n \"uv_scan_speed_change\": {\n \"enabled\": False,\n \"speed_change_wavelength\": {\"value\": 340, \"unit\": \"nm\"},\n \"scan_speed\": {\"value\": 120, \"unit\": \"nm/min\"}\n },\n \"path_correct\": True,\n \"path_length\": {\"value\": 10, \"unit\": \"mm\"}\n```\n\n6. 点击Report按钮(711,57)进入Report面板 7. 再点击确定按钮(1150,1015)回到主界面 8.点击确定(641,717);9. 等待Ready 10. 点击右侧的Baseline按钮(1896,248) 11. 选择baseline为User1(默认),点击OK按钮(1165,496),等待扫描完成变成ready。12.点击主界面右侧的measure按钮(1888,291)进行测量,等待扫描完成,图中会出现保存的pdf 路径,需要确定路径为桌面的test_lzy,确定后的话,单击文件名输入框(726,774),命名为test,如果默认有后缀,则点击空白处让命名为test 13. 点击保存按钮(1687,867)",
|
||||
"source": "custom",
|
||||
"config": [],
|
||||
"trajectory": "trajectories/",
|
||||
"related_apps": [
|
||||
"uv"
|
||||
],
|
||||
"evaluator": {
|
||||
"postconfig": [
|
||||
{
|
||||
"type": "sleep",
|
||||
"parameters": {
|
||||
"seconds": 5
|
||||
}
|
||||
}
|
||||
],
|
||||
"func": "vllm_eval",
|
||||
"expected": {
|
||||
"description": "UV 软件主界面中图表区域应显示一条完整的吸收光谱(Abs)扫描曲线,扫描范围为 850-750 nm;样品名为 test,测量完成后结果已以文件名 test 保存到桌面的 test_lzy 文件夹中。"
|
||||
}
|
||||
},
|
||||
"proxy": false,
|
||||
"fixed_ip": true,
|
||||
"possibility_of_env_change": "medium",
|
||||
"metadata": {
|
||||
"input_files": [],
|
||||
"steps": "",
|
||||
"difficulty": "hard"
|
||||
}
|
||||
}
|
||||
5
evaluation_examples/test_uv.json
Normal file
5
evaluation_examples/test_uv.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"uv": [
|
||||
"uv_test"
|
||||
]
|
||||
}
|
||||
92
run_uv.sh
Executable file
92
run_uv.sh
Executable file
@@ -0,0 +1,92 @@
|
||||
#!/bin/bash
# =============================================================================
# uv evaluation script
# provider: direct -- talks to the Flask service directly, no VM/SSH required
#
# Required environment:
#   OPENAI_API_KEY   API key for the LLM endpoint (must NOT be committed)
# Optional overrides (defaults are the values this script used to hard-code):
#   DIRECT_VM_IP     address of the Win7 machine running the Flask server
#   OPENAI_BASE_URL  base URL of the OpenAI-compatible endpoint
#   RESULT_DIR       directory that receives the evaluation results
# =============================================================================

# Fail on unset variables; command failures are checked explicitly below so
# that the pre-check can still print its own diagnostic.
set -u

# ---------- Win7 direct-connect IP ----------
export DIRECT_VM_IP="${DIRECT_VM_IP:-172.20.103.218}" # Flask address of the physical uv machine

# ---------- LLM API configuration ----------
# The key is taken from the caller's environment instead of being stored in
# the repository.
if [ -z "${OPENAI_API_KEY:-}" ]; then
  echo "[ERROR] 未设置 OPENAI_API_KEY,请先 export OPENAI_API_KEY 后再运行本脚本" >&2
  exit 1
fi
export OPENAI_API_KEY
export OPENAI_BASE_URL="${OPENAI_BASE_URL:-https://vip.apiyi.com/v1}"

# ---------- Evaluation parameters (aligned with run_proxmox.sh) ----------
MODEL="gpt-4o" # claude-sonnet-4-6
EVAL_MODEL="gemini-3.1-pro-preview"
MAX_STEPS=50
SLEEP_AFTER_EXEC=3
TEMPERATURE=0
TOP_P=0.9
MAX_TOKENS=16384
MAX_TRAJECTORY_LENGTH=5
OBSERVATION_TYPE="screenshot"
ACTION_SPACE="pyautogui"
SCREEN_WIDTH=1920
SCREEN_HEIGHT=1080
RESULT_DIR="${RESULT_DIR:-/Users/lizhanyuan/Downloads/results/uv}"
TEST_META="evaluation_examples/test_uv.json"
DOMAIN="uv"
INJECT_STEPS=False

# ---------- Pre-flight check ----------
# Run from the script's own directory so the relative paths (run.py,
# evaluation_examples/, logs/) resolve regardless of the caller's cwd.
cd "$(dirname "$0")" || {
  echo "[ERROR] 无法进入脚本所在目录: $(dirname "$0")" >&2
  exit 1
}

echo "=== uv 评测预检查 ==="
echo ""

echo -n "Flask Server (${DIRECT_VM_IP}:5000)... "
# --max-time bounds the whole request so a stalled server cannot hang the
# script; on a connection failure curl reports the code as 000.
HTTP_CODE=$(curl -s --connect-timeout 5 --max-time 15 \
  "http://${DIRECT_VM_IP}:5000/screenshot" \
  -o /dev/null -w "%{http_code}" 2>/dev/null)
if [ "${HTTP_CODE}" = "200" ]; then
  echo "OK"
else
  echo "FAIL (HTTP ${HTTP_CODE})"
  echo "[ERROR] Win7 Flask Server 不可达,请先在 Win7 运行: python D:\python_server\main.py"
  exit 1
fi

mkdir -p "${RESULT_DIR}" logs || {
  echo "[ERROR] 无法创建目录: ${RESULT_DIR} 或 logs" >&2
  exit 1
}

echo ""
echo "=== 开始评测 ==="
echo " Provider: direct (无 VM 管理,直连 Flask)"
echo " Win7 IP: ${DIRECT_VM_IP}"
echo " Model: ${MODEL}"
echo " Eval: ${EVAL_MODEL}"
echo " Task: ${DOMAIN} (${TEST_META})"
echo " Obs Type: ${OBSERVATION_TYPE} (screenshot only, Win7 a11y unstable)"
echo " Max Steps: ${MAX_STEPS}"
echo " Max Tokens: ${MAX_TOKENS}"
echo " Results: ${RESULT_DIR}"
echo ""

# Accept the common spellings of "true"; anything else disables injection.
case "${INJECT_STEPS}" in
  true | True | TRUE | 1) INJECT_FLAG="--inject_steps" ;;
  *) INJECT_FLAG="--no_inject_steps" ;;
esac

python3 run.py \
  --provider_name "direct" \
  --path_to_vm "ignored" \
  --observation_type "${OBSERVATION_TYPE}" \
  --action_space "${ACTION_SPACE}" \
  --model "${MODEL}" \
  --eval_model "${EVAL_MODEL}" \
  --temperature "${TEMPERATURE}" \
  --top_p "${TOP_P}" \
  --max_tokens "${MAX_TOKENS}" \
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \
  --screen_width "${SCREEN_WIDTH}" \
  --screen_height "${SCREEN_HEIGHT}" \
  --sleep_after_execution "${SLEEP_AFTER_EXEC}" \
  --max_steps "${MAX_STEPS}" \
  --result_dir "${RESULT_DIR}" \
  --test_all_meta_path "${TEST_META}" \
  --domain "${DOMAIN}" \
  "${INJECT_FLAG}"
RUN_STATUS=$?

echo ""
# Propagate a failed run instead of always reporting success.
if [ "${RUN_STATUS}" -ne 0 ]; then
  echo "[ERROR] 评测失败 (run.py 退出码 ${RUN_STATUS})" >&2
  exit "${RUN_STATUS}"
fi

echo "=== 评测完成 ==="
echo "结果保存在: ${RESULT_DIR}"
|
||||
Reference in New Issue
Block a user