Files
sci-gui-agent-benchmark/run_proxmox.sh
lizhanyuan 4bde685bbd feat: 新增 Proxmox provider 支持及 inject_steps 参数
- 新增 desktop_env/providers/proxmox/ (manager + provider)
- desktop_env.py: 添加 proxmox 到 provider 名称列表
- providers/__init__.py: 工厂函数注册 proxmox provider
- run.py: 新增 --inject_steps/--no_inject_steps 参数
- run_proxmox.sh: Proxmox 运行脚本
2026-03-04 16:39:08 +08:00

123 lines
4.8 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# =============================================================================
# Jade-BenchMark-MVP 一键评测脚本(Proxmox 远程虚拟机版)
# =============================================================================
# ---------- Proxmox settings ----------
# SSH target of the Proxmox host, written as user@host.
PROXMOX_SSH_HOST='root@10.10.17.3'
# LAN address of the VM; it must be reachable over the internal network from
# the machine that runs this script.
PROXMOX_VM_IP='10.10.17.10'
# Both values are exported so child processes started by this script see them.
export PROXMOX_SSH_HOST PROXMOX_VM_IP
# ---------- LLM API settings ----------
# OpenAI-compatible proxy; the same settings are used for both the agent model
# and the eval model.
#
# The API key is a secret, so it is taken from the caller's environment instead
# of being stored in this script. Any key that was ever committed in this file
# should be treated as leaked and revoked.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "❌ OPENAI_API_KEY 未设置,请先执行: export OPENAI_API_KEY=<your key>" >&2
  exit 1
fi
export OPENAI_API_KEY
# The endpoint is not secret: the previous value stays as the default and can be
# overridden from the environment.
export OPENAI_BASE_URL="${OPENAI_BASE_URL:-https://vip.apiyi.com/v1}"
# ---------- Evaluation parameters (edit as needed) ----------

# Target environment
PROVIDER='proxmox'
VM_ID='102'                  # VM ID on the Proxmox host
SNAPSHOT_NAME='snapshot'     # snapshot name (must be created beforehand)
SCREEN_WIDTH=1920            # screen width
SCREEN_HEIGHT=1080           # screen height

# Models and sampling
MODEL='gpt-5.2-chat-latest'          # agent model
EVAL_MODEL='gemini-3.1-pro-preview'  # evaluation model
TEMPERATURE=0.5              # generation temperature (lower is more stable and reproducible)
TOP_P=0.9                    # nucleus sampling
MAX_TOKENS=16384             # maximum number of output tokens per model call

# Agent loop
MAX_STEPS=50                 # max steps per task (the public evaluation guide recommends 50)
SLEEP_AFTER_EXEC=3           # seconds to wait after each executed step
MAX_TRAJECTORY_LENGTH=3      # number of history trajectory entries kept
OBSERVATION_TYPE='screenshot_a11y_tree'  # observation type
ACTION_SPACE='pyautogui'     # action space
INJECT_STEPS=false           # inject tutorial steps into the agent prompt (the baseline does not)

# Task selection and output
DOMAIN='jade'                # evaluation domain
TEST_META='evaluation_examples/test_curated.json'  # list of evaluation tasks
RESULT_DIR='/Volumes/Castor/课题/results'           # output directory for results
# ---------- Pre-flight checks ----------
echo "=== 预检查 ==="

# SSH connectivity is a hard requirement: abort when the host cannot be reached.
# BatchMode=yes makes ssh fail instead of prompting for a password, so a missing
# key shows up here as a failed check.
echo -n "SSH 到 Proxmox... "
if ssh -o BatchMode=yes -o ConnectTimeout=5 "${PROXMOX_SSH_HOST}" "echo ok" 2>/dev/null | grep -q "ok"; then
  echo "✅ 连接成功"
else
  echo "❌ SSH 连接失败,请确认:"
  echo " 1. 已执行 ssh-copy-id ${PROXMOX_SSH_HOST}"
  echo " 2. Proxmox 主机可达"
  exit 1
fi

# VM status is informational only; print a visible fallback when the query
# returns nothing instead of leaving the line blank.
echo -n "VM ${VM_ID} 状态... "
VM_STATUS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" "qm status ${VM_ID}" 2>/dev/null)
echo "${VM_STATUS:-⚠️ 无法获取 VM 状态}"

# Flask server inside the VM: warn only, the script continues either way.
echo -n "Flask Server (${PROXMOX_VM_IP}:5000)... "
if curl -s --connect-timeout 5 "http://${PROXMOX_VM_IP}:5000/screenshot" -o /dev/null -w "%{http_code}" | grep -q "200"; then
  echo "✅ 可访问"
else
  echo "⚠️ 不可访问VM 可能未启动或 Flask 未运行,评测启动时会自动启动 VM"
fi

# Snapshot check. The name is compared exactly against the name column rather
# than grepped as a substring, so "snapshot" no longer matches "snapshot2" or a
# description that merely contains the word.
# NOTE(review): assumes `qm listsnapshot` prints one snapshot per line in the
# form "`-> NAME ..." (name is the second whitespace-separated field) — confirm
# against the installed Proxmox version.
echo -n "快照 '${SNAPSHOT_NAME}'... "
SNAPSHOTS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" "qm listsnapshot ${VM_ID}" 2>/dev/null)
snapshot_found=false
while read -r _ listed_name _; do
  if [[ "${listed_name}" == "${SNAPSHOT_NAME}" ]]; then
    snapshot_found=true
    break
  fi
done <<<"${SNAPSHOTS}"

if [[ "${snapshot_found}" == true ]]; then
  echo "✅ 已存在"
else
  echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'。"
  echo " 批量评测需要快照来回滚环境。可以现在创建:"
  echo " ssh ${PROXMOX_SSH_HOST} \"qm snapshot ${VM_ID} ${SNAPSHOT_NAME}\""
  read -p " 是否现在创建快照?(y/N) " -n 1 -r
  echo
  if [[ ${REPLY} =~ ^[Yy]$ ]]; then
    echo " 正在创建快照..."
    # Report success only when qm actually succeeded; abort otherwise, because
    # the user explicitly asked for the snapshot.
    if ssh "${PROXMOX_SSH_HOST}" "qm snapshot ${VM_ID} ${SNAPSHOT_NAME}"; then
      echo " ✅ 快照已创建"
    else
      echo " ❌ 快照创建失败" >&2
      exit 1
    fi
  fi
fi
# Print a summary of the run configuration, framed by blank lines, before the
# evaluation starts. Each argument becomes one output line.
printf '%s\n' \
  "" \
  "=== 开始评测 ===" \
  "Provider: ${PROVIDER}" \
  "VM ID: ${VM_ID}" \
  "VM IP: ${PROXMOX_VM_IP}" \
  "Model: ${MODEL}" \
  "Eval: ${EVAL_MODEL}" \
  "Domain: ${DOMAIN}" \
  "Results: ${RESULT_DIR}" \
  "Inject: ${INJECT_STEPS}" \
  ""
# ---------- Run the evaluation ----------
# Map the INJECT_STEPS toggle to the matching run.py switch: only the exact
# value "true" enables injection, anything else disables it.
case "${INJECT_STEPS}" in
  true) INJECT_STEPS_FLAG="--inject_steps" ;;
  *)    INJECT_STEPS_FLAG="--no_inject_steps" ;;
esac

# Collect the run.py arguments in an array, one option per line.
run_args=(
  --provider_name "${PROVIDER}"
  --path_to_vm "${VM_ID}"
  --observation_type "${OBSERVATION_TYPE}"
  --action_space "${ACTION_SPACE}"
  --model "${MODEL}"
  --eval_model "${EVAL_MODEL}"
  --temperature "${TEMPERATURE}"
  --top_p "${TOP_P}"
  --max_tokens "${MAX_TOKENS}"
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}"
  --screen_width "${SCREEN_WIDTH}"
  --screen_height "${SCREEN_HEIGHT}"
  --sleep_after_execution "${SLEEP_AFTER_EXEC}"
  --max_steps "${MAX_STEPS}"
  --result_dir "${RESULT_DIR}"
  --test_all_meta_path "${TEST_META}"
  --domain "${DOMAIN}"
  "${INJECT_STEPS_FLAG}"
)

python run.py "${run_args[@]}"