feat(uv): add uv direct eval script, task data, healthcheck fix
- add run_uv.sh direct eval entry and test_uv.json meta
- add evaluation_examples/examples/uv task data
- switch setup.py healthcheck from /terminal to /screenshot to avoid Win7 hang
This commit is contained in:
92
run_uv.sh
Executable file
92
run_uv.sh
Executable file
@@ -0,0 +1,92 @@
|
||||
#!/bin/bash
# =============================================================================
# uv evaluation script
# provider: direct -- talks to the Flask service directly; no VM/SSH involved.
#
# Environment variables read by this section:
#   OPENAI_API_KEY   API key for the LLM endpoint (must be supplied by caller)
#   OPENAI_BASE_URL  optional override of the LLM endpoint base URL
#   DIRECT_VM_IP     optional override of the Win7 Flask host address
#   RESULT_DIR       optional override of the results directory
# =============================================================================

# ---------- Win7 direct-connect IP ----------
# Address of the Flask server on the physical uv machine.
export DIRECT_VM_IP="${DIRECT_VM_IP:-172.20.103.218}"

# ---------- LLM API configuration ----------
# SECURITY: the key is taken from the caller's environment and is never stored
# in this file. A key that was previously hard-coded here has been committed to
# version control and should be treated as compromised (revoke and rotate it).
if [[ -n "${OPENAI_API_KEY:-}" ]]; then
  export OPENAI_API_KEY
else
  echo "[WARN] OPENAI_API_KEY is not set; export it before running this script." >&2
fi
export OPENAI_BASE_URL="${OPENAI_BASE_URL:-https://vip.apiyi.com/v1}"

# ---------- Evaluation parameters (kept aligned with run_proxmox.sh) ----------
MODEL="gpt-4o" # alternative: claude-sonnet-4-6
EVAL_MODEL="gemini-3.1-pro-preview"
MAX_STEPS=50
SLEEP_AFTER_EXEC=3
TEMPERATURE=0
TOP_P=0.9
MAX_TOKENS=16384
MAX_TRAJECTORY_LENGTH=5
OBSERVATION_TYPE="screenshot"
ACTION_SPACE="pyautogui"
SCREEN_WIDTH=1920
SCREEN_HEIGHT=1080
# Defaults to the original author's local path; set RESULT_DIR to relocate.
RESULT_DIR="${RESULT_DIR:-/Users/lizhanyuan/Downloads/results/uv}"
TEST_META="evaluation_examples/test_uv.json"
DOMAIN="uv"
# Selects --inject_steps vs --no_inject_steps when run.py is launched below.
INJECT_STEPS=False
|
||||
|
||||
# ---------- Pre-flight checks ----------
# Work from the script's own directory so the relative paths used below
# (run.py, logs/, the test meta file) resolve regardless of the caller's cwd.
cd "$(dirname "$0")" || {
  echo "[ERROR] cannot cd to script directory: $(dirname "$0")" >&2
  exit 1
}

echo "=== ${DOMAIN:-uv} 评测预检查 ==="
echo ""

# Refuse to start without LLM credentials: failing here is cheaper than
# failing after the evaluation has already begun driving the Win7 machine.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "[ERROR] OPENAI_API_KEY is empty; export it before running this script." >&2
  exit 1
fi

printf 'Flask Server (%s:5000)... ' "${DIRECT_VM_IP}"
# --connect-timeout bounds only the TCP connect; --max-time additionally bounds
# the whole request so a server that accepts the connection but never answers
# cannot hang this script. curl reports 000 when no HTTP response was received.
HTTP_CODE=$(curl -s --connect-timeout 5 --max-time 20 \
  -o /dev/null -w "%{http_code}" \
  "http://${DIRECT_VM_IP}:5000/screenshot" 2>/dev/null)
if [[ "${HTTP_CODE}" == "200" ]]; then
  echo "OK"
else
  echo "FAIL (HTTP ${HTTP_CODE:-n/a})"
  echo "[ERROR] Win7 Flask Server 不可达,请先在 Win7 运行: python D:\python_server\main.py" >&2
  exit 1
fi

# Output locations: the results directory plus a local logs/ directory.
mkdir -p -- "${RESULT_DIR}" logs || {
  echo "[ERROR] cannot create output directories: ${RESULT_DIR}, logs" >&2
  exit 1
}
|
||||
|
||||
# Print a summary of the run configuration before launching.
# The "Task" line reports the meta file and domain that actually drive the
# run instead of a fixed task name.
cat <<EOF

=== 开始评测 ===
 Provider: direct (无 VM 管理,直连 Flask)
 Win7 IP: ${DIRECT_VM_IP}
 Model: ${MODEL}
 Eval: ${EVAL_MODEL}
 Task: ${TEST_META} (domain: ${DOMAIN})
 Obs Type: ${OBSERVATION_TYPE} (screenshot only, Win7 a11y unstable)
 Max Steps: ${MAX_STEPS}
 Max Tokens: ${MAX_TOKENS}
 Results: ${RESULT_DIR}

EOF
|
||||
|
||||
# ---------- Launch the evaluation ----------
# Map INJECT_STEPS onto the run.py flag. Any capitalization of "true" enables
# injection (the default in this script is spelled Python-style, "False", so
# "True" must be honoured too); every other value disables it.
case "${INJECT_STEPS}" in
  true | True | TRUE) INJECT_FLAG="--inject_steps" ;;
  *) INJECT_FLAG="--no_inject_steps" ;;
esac

# Arguments are collected in an array so each value reaches run.py as exactly
# one word, even if it contains spaces.
RUN_ARGS=(
  --provider_name "direct"
  --path_to_vm "ignored"
  --observation_type "${OBSERVATION_TYPE}"
  --action_space "${ACTION_SPACE}"
  --model "${MODEL}"
  --eval_model "${EVAL_MODEL}"
  --temperature "${TEMPERATURE}"
  --top_p "${TOP_P}"
  --max_tokens "${MAX_TOKENS}"
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}"
  --screen_width "${SCREEN_WIDTH}"
  --screen_height "${SCREEN_HEIGHT}"
  --sleep_after_execution "${SLEEP_AFTER_EXEC}"
  --max_steps "${MAX_STEPS}"
  --result_dir "${RESULT_DIR}"
  --test_all_meta_path "${TEST_META}"
  --domain "${DOMAIN}"
  "${INJECT_FLAG}"
)

python3 run.py "${RUN_ARGS[@]}"
RUN_STATUS=$?

echo ""
# Propagate run.py's exit status instead of always reporting success.
if [[ "${RUN_STATUS}" -ne 0 ]]; then
  echo "[ERROR] run.py exited with status ${RUN_STATUS}; results in ${RESULT_DIR} may be incomplete." >&2
  exit "${RUN_STATUS}"
fi

echo "=== 评测完成 ==="
echo "结果保存在: ${RESULT_DIR}"
|
||||
Reference in New Issue
Block a user