#!/bin/bash
# =============================================================================
# Jade-BenchMark-MVP one-click evaluation script (Proxmox remote VM edition)
#
# Flow:
#   1. Pre-flight checks: SSH to the Proxmox host, VM status, the HTTP
#      endpoint on port 5000 of the VM, and the rollback snapshot.
#   2. Print a summary of the run configuration.
#   3. Launch `python run.py` with the parameters configured below.
#
# `run.py` and TEST_META are relative paths, so they are resolved against the
# current working directory of the caller.
# =============================================================================

# Abort on use of an unset variable. `set -e` is deliberately NOT enabled:
# several pre-flight probes are advisory and must not terminate the run.
set -u

# ---------- Proxmox configuration ----------
# SSH address of the Proxmox host (format: user@host)
export PROXMOX_SSH_HOST="root@10.10.17.3"
# Intranet IP of the VM (the IP your Mac can reach over the internal network)
export PROXMOX_VM_IP="10.10.17.10"

# ---------- LLM API configuration ----------
# OpenAI-compatible proxy (used for both the agent model and the eval model)
# NOTE(review): a live credential is hard-coded here. It is kept only so the
# script keeps working unchanged; rotate this key and load it from the
# environment or an untracked secrets file instead of committing it.
export OPENAI_API_KEY="sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17"
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"

# ---------- Evaluation parameters (edit as needed) ----------
PROVIDER="proxmox"
VM_ID="102"                                        # VM ID on the Proxmox host
MODEL="gpt-5.2-chat-latest"                        # agent model
EVAL_MODEL="gemini-3.1-pro-preview"                # evaluation model
MAX_STEPS=50                                       # max steps per task (public eval guide recommends 50)
SLEEP_AFTER_EXEC=3                                 # seconds to wait after each executed step
TEMPERATURE=0.5                                    # sampling temperature (lower = more stable/reproducible)
TOP_P=0.9                                          # nucleus sampling
MAX_TOKENS=16384                                   # max output tokens of the model
MAX_TRAJECTORY_LENGTH=3                            # number of history trajectory entries kept
OBSERVATION_TYPE="screenshot_a11y_tree"            # observation type
ACTION_SPACE="pyautogui"                           # action space
SCREEN_WIDTH=1920                                  # screen width
SCREEN_HEIGHT=1080                                 # screen height
RESULT_DIR="/Volumes/Castor/课题/results"          # output directory for results
TEST_META="evaluation_examples/test_curated.json"  # list of evaluation tasks
DOMAIN="jade"                                      # evaluation domain
SNAPSHOT_NAME="snapshot"                           # snapshot name (must be created beforehand)
INJECT_STEPS=false                                 # inject tutorial steps into the agent prompt (baseline: no)

#######################################
# Verify that key-based SSH to the Proxmox host works; exit 1 otherwise.
# Globals:   PROXMOX_SSH_HOST (read)
# Outputs:   progress/result messages on stdout
#######################################
check_ssh() {
  local reply
  printf '%s' "SSH 到 Proxmox... "
  # BatchMode makes ssh fail instead of prompting for a password.
  # stderr is discarded on purpose: the failure branch prints its own hints.
  reply=$(ssh -o BatchMode=yes -o ConnectTimeout=5 "${PROXMOX_SSH_HOST}" \
    "echo ok" 2>/dev/null)
  if [[ "${reply}" == *ok* ]]; then
    echo "✅ 连接成功"
  else
    echo "❌ SSH 连接失败,请确认:"
    echo " 1. 已执行 ssh-copy-id ${PROXMOX_SSH_HOST}"
    echo " 2. Proxmox 主机可达"
    exit 1
  fi
}

#######################################
# Print the output of `qm status` for the VM (informational only; an empty
# line is printed when the remote command fails).
# Globals:   PROXMOX_SSH_HOST, VM_ID (read); VM_STATUS (written)
#######################################
report_vm_status() {
  printf '%s' "VM ${VM_ID} 状态... "
  VM_STATUS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" \
    "qm status ${VM_ID}" 2>/dev/null)
  echo "${VM_STATUS}"
}

#######################################
# Probe http://<vm-ip>:5000/screenshot and report whether it answers 200.
# A failure only produces a warning; the script continues.
# Globals:   PROXMOX_VM_IP (read)
#######################################
check_flask() {
  local http_code
  printf '%s' "Flask Server (${PROXMOX_VM_IP}:5000)... "
  # -o /dev/null drops the body; -w prints only the HTTP status code.
  http_code=$(curl -s --connect-timeout 5 -o /dev/null -w "%{http_code}" \
    "http://${PROXMOX_VM_IP}:5000/screenshot")
  if [[ "${http_code}" == "200" ]]; then
    echo "✅ 可访问"
  else
    echo "⚠️ 不可访问(VM 可能未启动或 Flask 未运行,评测启动时会自动启动 VM)"
  fi
}

#######################################
# Succeed when a snapshot listing read from stdin contains a snapshot whose
# name is exactly $1 (no substring or regex matching).
# Arguments: $1 - snapshot name
# Returns:   0 if listed, 1 otherwise
#######################################
snapshot_listed() {
  local wanted=$1
  # Assumes each `qm listsnapshot` row looks like "`-> NAME DATE TIME DESC"
  # (name in column 2 after the tree marker) — TODO confirm on the target
  # Proxmox version. A row without the marker (name in column 1) is accepted
  # as well.
  awk -v name="${wanted}" '
    $1 == name || ($1 == "`->" && $2 == name) { found = 1 }
    END { exit !found }
  '
}

#######################################
# Check that the rollback snapshot exists; if not, offer to create it.
# Exits 1 when the user asks for creation and `qm snapshot` fails.
# Globals:   PROXMOX_SSH_HOST, VM_ID, SNAPSHOT_NAME (read); SNAPSHOTS (written)
#######################################
ensure_snapshot() {
  local answer=""
  printf '%s' "快照 '${SNAPSHOT_NAME}'... "
  SNAPSHOTS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" \
    "qm listsnapshot ${VM_ID}" 2>/dev/null)
  if snapshot_listed "${SNAPSHOT_NAME}" <<<"${SNAPSHOTS}"; then
    echo "✅ 已存在"
    return 0
  fi

  echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'。"
  echo " 批量评测需要快照来回滚环境。可以现在创建:"
  echo " ssh ${PROXMOX_SSH_HOST} \"qm snapshot ${VM_ID} ${SNAPSHOT_NAME}\""
  # A failed read (e.g. stdin is not a terminal) is treated as "no".
  read -r -n 1 -p " 是否现在创建快照?(y/N) " answer || true
  echo
  if [[ "${answer}" =~ ^[Yy]$ ]]; then
    echo " 正在创建快照..."
    # Only report success when the remote command actually succeeded.
    if ssh "${PROXMOX_SSH_HOST}" "qm snapshot ${VM_ID} ${SNAPSHOT_NAME}"; then
      echo " ✅ 快照已创建"
    else
      echo " ❌ 快照创建失败" >&2
      exit 1
    fi
  fi
}

#######################################
# Print the effective run configuration.
#######################################
print_summary() {
  echo ""
  echo "=== 开始评测 ==="
  echo "Provider: ${PROVIDER}"
  echo "VM ID: ${VM_ID}"
  echo "VM IP: ${PROXMOX_VM_IP}"
  echo "Model: ${MODEL}"
  echo "Eval: ${EVAL_MODEL}"
  echo "Domain: ${DOMAIN}"
  echo "Results: ${RESULT_DIR}"
  echo "Inject: ${INJECT_STEPS}"
  echo ""
}

#######################################
# Launch run.py with the configured parameters.
# Returns:   the exit status of `python run.py`
#######################################
run_evaluation() {
  # Translate the boolean setting into the matching command-line switch.
  local inject_flag="--no_inject_steps"
  if [[ "${INJECT_STEPS}" == true ]]; then
    inject_flag="--inject_steps"
  fi

  python run.py \
    --provider_name "${PROVIDER}" \
    --path_to_vm "${VM_ID}" \
    --observation_type "${OBSERVATION_TYPE}" \
    --action_space "${ACTION_SPACE}" \
    --model "${MODEL}" \
    --eval_model "${EVAL_MODEL}" \
    --temperature "${TEMPERATURE}" \
    --top_p "${TOP_P}" \
    --max_tokens "${MAX_TOKENS}" \
    --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \
    --screen_width "${SCREEN_WIDTH}" \
    --screen_height "${SCREEN_HEIGHT}" \
    --sleep_after_execution "${SLEEP_AFTER_EXEC}" \
    --max_steps "${MAX_STEPS}" \
    --result_dir "${RESULT_DIR}" \
    --test_all_meta_path "${TEST_META}" \
    --domain "${DOMAIN}" \
    "${inject_flag}"
}

#######################################
# Entry point: pre-flight checks, summary, then the evaluation run.
#######################################
main() {
  echo "=== 预检查 ==="
  check_ssh
  report_vm_status
  check_flask
  ensure_snapshot
  print_summary
  run_evaluation
}

main "$@"