#!/bin/bash
# =============================================================================
# uv evaluation script
# provider: direct -- talks to the Flask service directly; no VM/SSH is needed.
#
# Required environment:
#   OPENAI_API_KEY  API key for the LLM endpoint set in OPENAI_BASE_URL below.
#                   It is deliberately NOT stored in this file.
#
# Exit status:
#   0         pre-check passed and run.py succeeded
#   1         missing API key, unreachable Flask server, or setup failure
#   otherwise the exit status of run.py
# =============================================================================

# Fail on use of an unset variable. `set -e` is intentionally not used: the
# curl probe below is expected to fail when the server is down and that case
# is reported explicitly.
set -u

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# ---------- Win7 direct-connect IP ----------
export DIRECT_VM_IP="172.20.103.218"  # Flask address of the physical uv machine

# ---------- LLM API configuration ----------
# SECURITY: an API key used to be hard-coded here. Secrets must come from the
# environment; the key that was previously committed should be rotated.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  die "[ERROR] 未设置 OPENAI_API_KEY,请先执行: export OPENAI_API_KEY=<your key>"
fi
export OPENAI_API_KEY
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"

# ---------- Evaluation parameters (aligned with run_proxmox.sh) ----------
MODEL="gpt-4o"  # alternative: claude-sonnet-4-6
EVAL_MODEL="gemini-3.1-pro-preview"
MAX_STEPS=50
SLEEP_AFTER_EXEC=3
TEMPERATURE=0
TOP_P=0.9
MAX_TOKENS=16384
MAX_TRAJECTORY_LENGTH=5
OBSERVATION_TYPE="screenshot"
ACTION_SPACE="pyautogui"
SCREEN_WIDTH=1920
SCREEN_HEIGHT=1080
RESULT_DIR="/Users/lizhanyuan/Downloads/results/uv"
TEST_META="evaluation_examples/test_uv.json"
DOMAIN="uv"
INJECT_STEPS=False

# ---------- Pre-flight check ----------
# Run from the script's own directory so run.py, logs/ and TEST_META resolve.
cd "$(dirname "$0")" || die "[ERROR] cannot cd to script directory: $(dirname "$0")"

# NOTE(review): the banner below and the "Task:" line further down mention
# "FL Solutions F-4600" / "flsol_task4_measure" although this script targets
# the uv domain -- this looks copied from another script; confirm and update.
echo "=== FL Solutions F-4600 评测预检查 ==="
echo ""
printf '%s' "Flask Server (${DIRECT_VM_IP}:5000)... "

# Probe the screenshot endpoint; only the HTTP status code is kept. When the
# connection fails curl exits non-zero and reports the code as 000.
HTTP_CODE=$(curl -s --connect-timeout 5 "http://${DIRECT_VM_IP}:5000/screenshot" \
  -o /dev/null -w "%{http_code}" 2>/dev/null)

if [[ "${HTTP_CODE}" == "200" ]]; then
  echo "OK"
else
  echo "FAIL (HTTP ${HTTP_CODE})"
  die "[ERROR] Win7 Flask Server 不可达,请先在 Win7 运行: python D:\python_server\main.py"
fi

mkdir -p "${RESULT_DIR}" logs || die "[ERROR] cannot create ${RESULT_DIR} or logs"

echo ""
echo "=== 开始评测 ==="
echo " Provider: direct (无 VM 管理,直连 Flask)"
echo " Win7 IP: ${DIRECT_VM_IP}"
echo " Model: ${MODEL}"
echo " Eval: ${EVAL_MODEL}"
echo " Task: flsol_task4_measure"
echo " Obs Type: ${OBSERVATION_TYPE} (screenshot only, Win7 a11y unstable)"
echo " Max Steps: ${MAX_STEPS}"
echo " Max Tokens: ${MAX_TOKENS}"
echo " Results: ${RESULT_DIR}"
echo ""

# INJECT_STEPS is written Python-style (True/False) above, so accept any
# common capitalisation of "true" instead of only the lowercase form.
# A case statement is used because macOS /bin/bash (3.2) lacks ${var,,}.
case "${INJECT_STEPS}" in
  true | True | TRUE)
    INJECT_FLAG="--inject_steps"
    ;;
  *)
    INJECT_FLAG="--no_inject_steps"
    ;;
esac

python3 run.py \
  --provider_name "direct" \
  --path_to_vm "ignored" \
  --observation_type "${OBSERVATION_TYPE}" \
  --action_space "${ACTION_SPACE}" \
  --model "${MODEL}" \
  --eval_model "${EVAL_MODEL}" \
  --temperature "${TEMPERATURE}" \
  --top_p "${TOP_P}" \
  --max_tokens "${MAX_TOKENS}" \
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \
  --screen_width "${SCREEN_WIDTH}" \
  --screen_height "${SCREEN_HEIGHT}" \
  --sleep_after_execution "${SLEEP_AFTER_EXEC}" \
  --max_steps "${MAX_STEPS}" \
  --result_dir "${RESULT_DIR}" \
  --test_all_meta_path "${TEST_META}" \
  --domain "${DOMAIN}" \
  "${INJECT_FLAG}"
run_status=$?

# Propagate a failed run instead of reporting success unconditionally.
if ((run_status != 0)); then
  printf '%s\n' "[ERROR] run.py 运行失败 (exit code ${run_status})" >&2
  exit "${run_status}"
fi

echo ""
echo "=== 评测完成 ==="
echo "结果保存在: ${RESULT_DIR}"