- 新增 desktop_env/providers/proxmox/ (manager + provider)
- desktop_env.py: 添加 proxmox 到 provider 名称列表
- providers/__init__.py: 工厂函数注册 proxmox provider
- run.py: 新增 --inject_steps/--no_inject_steps 参数
- run_proxmox.sh: Proxmox 运行脚本
123 lines
4.8 KiB
Bash
Executable File
123 lines
4.8 KiB
Bash
Executable File
#!/bin/bash
# =============================================================================
# Jade-BenchMark-MVP one-click evaluation script (Proxmox remote VM edition)
#
# Flow: run pre-flight checks against the Proxmox host and the VM, print a
# summary of the settings, then launch run.py (resolved relative to the
# current working directory) with those settings.
#
# Required environment:
#   OPENAI_API_KEY  API key for the endpoint configured in OPENAI_BASE_URL.
#                   It is read from the environment so that the secret is
#                   never stored in the repository.
#
# Exit status: 1 when a mandatory pre-flight check fails, otherwise the exit
# status of run.py.
# =============================================================================

# Abort on unhandled errors, unset variables and failures inside pipelines.
# Probes that are allowed to fail are guarded explicitly below.
set -euo pipefail

# ---------- Proxmox settings ----------
# SSH address of the Proxmox host (format: user@host).
export PROXMOX_SSH_HOST="root@10.10.17.3"

# LAN IP of the VM (must be reachable from the machine running this script).
export PROXMOX_VM_IP="10.10.17.10"

# ---------- LLM API settings ----------
# OpenAI-compatible proxy, used for both the agent model and the eval model.
# NOTE(review): an API key used to be hard-coded at this point; treat that key
# as leaked and revoke it.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "❌ 未设置 OPENAI_API_KEY,请先执行: export OPENAI_API_KEY=<your key>" >&2
  exit 1
fi
export OPENAI_API_KEY
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"

# ---------- Evaluation parameters (edit as needed) ----------
PROVIDER="proxmox"
VM_ID="102"                                        # VM ID on the Proxmox host
MODEL="gpt-5.2-chat-latest"                        # agent model
EVAL_MODEL="gemini-3.1-pro-preview"                # evaluation (judge) model
MAX_STEPS=50                                       # max steps per task (the public evaluation guide recommends 50)
SLEEP_AFTER_EXEC=3                                 # seconds to wait after each executed step
TEMPERATURE=0.5                                    # sampling temperature (lower is more stable / reproducible)
TOP_P=0.9                                          # nucleus sampling
MAX_TOKENS=16384                                   # maximum number of output tokens
MAX_TRAJECTORY_LENGTH=3                            # number of history steps kept in the trajectory
OBSERVATION_TYPE="screenshot_a11y_tree"            # observation type
ACTION_SPACE="pyautogui"                           # action space
SCREEN_WIDTH=1920                                  # screen width
SCREEN_HEIGHT=1080                                 # screen height
RESULT_DIR="/Volumes/Castor/课题/results"            # output directory for results
TEST_META="evaluation_examples/test_curated.json"  # list of evaluation tasks
DOMAIN="jade"                                      # evaluation domain
SNAPSHOT_NAME="snapshot"                           # snapshot name (must be created beforehand)
INJECT_STEPS=false                                 # inject tutorial steps into the agent prompt (baseline: no)

# ---------- Pre-flight checks ----------
echo "=== 预检查 ==="

# SSH connectivity. BatchMode makes ssh fail instead of prompting for a
# password, so this only succeeds with key-based authentication.
printf '%s' "SSH 到 Proxmox... "
if ssh -o BatchMode=yes -o ConnectTimeout=5 "${PROXMOX_SSH_HOST}" "echo ok" 2>/dev/null \
  | grep -q "ok"; then
  echo "✅ 连接成功"
else
  echo "❌ SSH 连接失败,请确认:"
  echo " 1. 已执行 ssh-copy-id ${PROXMOX_SSH_HOST}"
  echo " 2. Proxmox 主机可达"
  exit 1
fi

# VM status. Informational only: a failing query must not abort the run.
printf 'VM %s 状态... ' "${VM_ID}"
VM_STATUS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" "qm status ${VM_ID}" 2>/dev/null) \
  || VM_STATUS=""
echo "${VM_STATUS:-unknown}"

# Flask server inside the VM. Only a warning when it is not reachable.
# curl prints the HTTP status code ("000" when the connection fails) and may
# exit non-zero, hence the guard.
printf 'Flask Server (%s:5000)... ' "${PROXMOX_VM_IP}"
http_code=$(curl -s --connect-timeout 5 -o /dev/null -w "%{http_code}" \
  "http://${PROXMOX_VM_IP}:5000/screenshot") || true
if [[ "${http_code}" == "200" ]]; then
  echo "✅ 可访问"
else
  echo "⚠️ 不可访问(VM 可能未启动或 Flask 未运行,评测启动时会自动启动 VM)"
fi

# Snapshot presence. The name has to equal a whole whitespace-separated field
# of the listing, so that e.g. "snapshot-old" does not satisfy a check for
# "snapshot".
printf "快照 '%s'... " "${SNAPSHOT_NAME}"
SNAPSHOTS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" "qm listsnapshot ${VM_ID}" 2>/dev/null) \
  || SNAPSHOTS=""
if awk -v name="${SNAPSHOT_NAME}" '
    { for (i = 1; i <= NF; i++) if ($i == name) found = 1 }
    END { exit !found }
  ' <<<"${SNAPSHOTS}"; then
  echo "✅ 已存在"
else
  echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'。"
  echo " 批量评测需要快照来回滚环境。可以现在创建:"
  echo " ssh ${PROXMOX_SSH_HOST} \"qm snapshot ${VM_ID} ${SNAPSHOT_NAME}\""
  # read fails on EOF / non-interactive stdin; treat that as "no".
  REPLY=""
  read -r -n 1 -p " 是否现在创建快照?(y/N) " || REPLY=""
  echo
  if [[ "${REPLY}" =~ ^[Yy]$ ]]; then
    echo " 正在创建快照..."
    # Report success only when the remote command actually succeeded.
    if ssh "${PROXMOX_SSH_HOST}" "qm snapshot ${VM_ID} ${SNAPSHOT_NAME}"; then
      echo " ✅ 快照已创建"
    else
      echo " ❌ 快照创建失败" >&2
      exit 1
    fi
  fi
fi

echo ""
echo "=== 开始评测 ==="
echo "Provider: ${PROVIDER}"
echo "VM ID: ${VM_ID}"
echo "VM IP: ${PROXMOX_VM_IP}"
echo "Model: ${MODEL}"
echo "Eval: ${EVAL_MODEL}"
echo "Domain: ${DOMAIN}"
echo "Results: ${RESULT_DIR}"
echo "Inject: ${INJECT_STEPS}"
echo ""

# ---------- Run the evaluation ----------
# Translate the boolean setting into the matching run.py switch.
if [[ "${INJECT_STEPS}" == "true" ]]; then
  INJECT_STEPS_FLAG="--inject_steps"
else
  INJECT_STEPS_FLAG="--no_inject_steps"
fi

# The command line is built as an array so that values containing spaces or
# non-ASCII characters (e.g. RESULT_DIR) are passed as single arguments.
run_args=(
  --provider_name "${PROVIDER}"
  --path_to_vm "${VM_ID}"
  --observation_type "${OBSERVATION_TYPE}"
  --action_space "${ACTION_SPACE}"
  --model "${MODEL}"
  --eval_model "${EVAL_MODEL}"
  --temperature "${TEMPERATURE}"
  --top_p "${TOP_P}"
  --max_tokens "${MAX_TOKENS}"
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}"
  --screen_width "${SCREEN_WIDTH}"
  --screen_height "${SCREEN_HEIGHT}"
  --sleep_after_execution "${SLEEP_AFTER_EXEC}"
  --max_steps "${MAX_STEPS}"
  --result_dir "${RESULT_DIR}"
  --test_all_meta_path "${TEST_META}"
  --domain "${DOMAIN}"
  "${INJECT_STEPS_FLAG}"
)

python run.py "${run_args[@]}"