Files
sci-gui-agent-benchmark/run_uv.sh
lizhanyuan acd38ca54a feat(uv): add uv direct eval script, task data, healthcheck fix
- add run_uv.sh direct eval entry and test_uv.json meta
- add evaluation_examples/examples/uv task data
- switch setup.py healthcheck from /terminal to /screenshot to avoid Win7 hang
2026-04-07 22:27:59 +08:00

93 lines
2.7 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# =============================================================================
# uv evaluation script
# provider: direct -- talks to the Flask server directly; no VM/SSH required
#
# Settings written as "${NAME:-default}" keep the default shown unless NAME is
# already set in the caller's environment, for example:
#   RESULT_DIR=/tmp/uv_results MODEL=claude-sonnet-4-6 ./run_uv.sh
# =============================================================================
# ---------- Win7 direct-connect IP ----------
# Flask address of the physical uv machine.
export DIRECT_VM_IP="${DIRECT_VM_IP:-172.20.103.218}"
# ---------- LLM API configuration ----------
# SECURITY(review): this API key is committed in plain text. Rotate it and load
# it from a secret store or an untracked file instead of keeping it here.
# These two values are intentionally NOT taken from the environment: a caller's
# own OPENAI_API_KEY would otherwise be sent to the non-default base URL below.
export OPENAI_API_KEY="sk-5qmpSdZDGKUS8idrEf4c62Fc2f6746D8B68eC48124718329"
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"
# ---------- Evaluation parameters (aligned with run_proxmox.sh) ----------
# Each value below is forwarded to run.py as the command-line option of the
# matching name (see the python3 invocation further down).
MODEL="${MODEL:-gpt-4o}" # e.g. claude-sonnet-4-6
EVAL_MODEL="${EVAL_MODEL:-gemini-3.1-pro-preview}"
MAX_STEPS="${MAX_STEPS:-50}"
SLEEP_AFTER_EXEC=3
TEMPERATURE=0
TOP_P=0.9
MAX_TOKENS=16384
MAX_TRAJECTORY_LENGTH=5
OBSERVATION_TYPE="screenshot"
ACTION_SPACE="pyautogui"
SCREEN_WIDTH=1920
SCREEN_HEIGHT=1080
# The default is one developer's local path; override RESULT_DIR on other hosts.
RESULT_DIR="${RESULT_DIR:-/Users/lizhanyuan/Downloads/results/uv}"
# Relative path; the script changes into its own directory before using it.
TEST_META="evaluation_examples/test_uv.json"
DOMAIN="uv"
# Selects between --inject_steps and --no_inject_steps for run.py.
INJECT_STEPS=False
# ---------- Pre-flight check ----------
# Run from the script's own directory so that run.py, logs/ and the relative
# test-meta path resolve no matter where the script was invoked from. Abort if
# that fails, otherwise everything below would run in the wrong directory.
cd "$(dirname "$0")" || {
  echo "[ERROR] cannot change to script directory: $(dirname "$0")" >&2
  exit 1
}
# NOTE(review): the banner names "FL Solutions F-4600" although this is the uv
# script -- possibly left over from a copied script; confirm the intended text.
echo "=== FL Solutions F-4600 评测预检查 ==="
echo ""
echo -n "Flask Server (${DIRECT_VM_IP}:5000)... "
# Probe /screenshot and keep only the HTTP status code. --connect-timeout
# bounds the TCP connect; --max-time bounds the whole request so that a server
# which accepts the connection but never answers cannot hang the script.
# curl reports 000 as the status code when no HTTP response was received.
HTTP_CODE=$(curl -s --connect-timeout 5 --max-time 30 \
  -o /dev/null -w "%{http_code}" \
  "http://${DIRECT_VM_IP}:5000/screenshot" 2>/dev/null)
if [ "$HTTP_CODE" = "200" ]; then
  echo "OK"
else
  echo "FAIL (HTTP ${HTTP_CODE})"
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  echo "[ERROR] Win7 Flask Server 不可达,请先在 Win7 运行: python D:\python_server\main.py" >&2
  exit 1
fi
# Create the results directory and the local logs/ directory before the run;
# stop early if either cannot be created instead of failing later.
mkdir -p "${RESULT_DIR}" logs || {
  echo "[ERROR] cannot create directory: ${RESULT_DIR} or logs" >&2
  exit 1
}
# Print a summary of the settings used for this run.
echo ""
echo "=== 开始评测 ==="
echo " Provider: direct (无 VM 管理,直连 Flask)"
echo " Win7 IP: ${DIRECT_VM_IP}"
echo " Model: ${MODEL}"
echo " Eval: ${EVAL_MODEL}"
# Report the domain and test-meta file that are actually passed to run.py,
# rather than a fixed task name that does not depend on the configuration.
echo " Domain: ${DOMAIN}"
echo " Test Meta: ${TEST_META}"
echo " Obs Type: ${OBSERVATION_TYPE} (screenshot only, Win7 a11y unstable)"
echo " Max Steps: ${MAX_STEPS}"
echo " Max Tokens: ${MAX_TOKENS}"
echo " Results: ${RESULT_DIR}"
echo ""
#######################################
# Map an INJECT_STEPS setting to the matching run.py option.
# Accepts the Python-style spelling used in this script's configuration
# (True/False) as well as lowercase/uppercase "true" and "1"; any other
# value, including an empty one, selects --no_inject_steps.
# Arguments: $1 - the INJECT_STEPS value
# Outputs:   writes --inject_steps or --no_inject_steps to stdout
#######################################
inject_flag_for() {
  case "$1" in
    true | True | TRUE | 1) printf '%s' "--inject_steps" ;;
    *) printf '%s' "--no_inject_steps" ;;
  esac
}
INJECT_FLAG="$(inject_flag_for "${INJECT_STEPS:-}")"
# Launch the evaluation with the settings configured above. The provider is
# "direct", and --path_to_vm is given the placeholder value "ignored".
# INJECT_FLAG is passed as a single quoted word and omitted if it is empty.
python3 run.py \
  --provider_name "direct" \
  --path_to_vm "ignored" \
  --observation_type "${OBSERVATION_TYPE}" \
  --action_space "${ACTION_SPACE}" \
  --model "${MODEL}" \
  --eval_model "${EVAL_MODEL}" \
  --temperature "${TEMPERATURE}" \
  --top_p "${TOP_P}" \
  --max_tokens "${MAX_TOKENS}" \
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \
  --screen_width "${SCREEN_WIDTH}" \
  --screen_height "${SCREEN_HEIGHT}" \
  --sleep_after_execution "${SLEEP_AFTER_EXEC}" \
  --max_steps "${MAX_STEPS}" \
  --result_dir "${RESULT_DIR}" \
  --test_all_meta_path "${TEST_META}" \
  --domain "${DOMAIN}" \
  ${INJECT_FLAG:+"${INJECT_FLAG}"}
# Capture the status immediately: the echo commands below would overwrite $?.
RUN_STATUS=$?
echo ""
# Do not announce completion (or exit 0) when run.py failed; pass its exit
# status on so that callers and CI can detect the failure.
if [ "${RUN_STATUS}" -ne 0 ]; then
  echo "[ERROR] run.py exited with status ${RUN_STATUS}" >&2
  exit "${RUN_STATUS}"
fi
echo "=== 评测完成 ==="
echo "结果保存在: ${RESULT_DIR}"