Files
sci-gui-agent-benchmark/run_proxmox_chrome.sh

109 lines
3.3 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# =============================================================================
# Chrome Windows 通用软件对照组评测脚本
# 用途:在 Windows VM 上测试 Chrome 任务,作为科学软件的通用软件对照组
# =============================================================================
# ---------- Proxmox connection settings ----------
# SSH login (user@host) of the Proxmox node; the pre-checks in this script run
# their `qm` commands through it.
PROXMOX_SSH_HOST="root@10.10.17.3"
# IP address of the guest VM; the pre-check probes port 5000 on it.
PROXMOX_VM_IP="10.10.17.10"
# Exported so that child processes inherit both values
# (presumably read by run.py's proxmox provider — verify against that code).
export PROXMOX_SSH_HOST PROXMOX_VM_IP
# ---------- LLM API configuration ----------
# The API key is a credential, so it must be supplied by the caller's
# environment (e.g. `export OPENAI_API_KEY=...` in the shell, or a secrets
# manager) instead of being committed in this file.
# NOTE(review): the key that used to be hardcoded here is in version-control
# history and must be treated as leaked — revoke and rotate it.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  # Fail fast: without a key every model call would be rejected anyway.
  echo "❌ 未设置 OPENAI_API_KEY,请先在环境中 export OPENAI_API_KEY 后再运行本脚本" >&2
  exit 1
fi
# Re-export so the value reaches child processes even if the caller only set
# it as a plain (non-exported) shell variable before sourcing/running this.
export OPENAI_API_KEY
# Base URL of the OpenAI-compatible endpoint; not a secret, so it stays here.
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"
# ---------- Evaluation parameters ----------
# Target environment: provider backend, VM id, and the snapshot name that the
# pre-check looks for on that VM.
PROVIDER="proxmox"
VM_ID="102"
SNAPSHOT_NAME="snapshot"

# Models: passed to run.py as --model and --eval_model respectively.
MODEL="gpt-5.4"
EVAL_MODEL="gemini-3.1-pro-preview"

# Generation settings, forwarded as --temperature / --top_p / --max_tokens.
TEMPERATURE="0.5"
TOP_P="0.9"
MAX_TOKENS="16384"

# Step, pacing and trajectory settings plus the action space, all forwarded
# to run.py unchanged.
MAX_STEPS="35"
SLEEP_AFTER_EXEC="3"
MAX_TRAJECTORY_LENGTH="3"
ACTION_SPACE="pyautogui"
# Only the literal string "true" selects --inject_steps further down.
INJECT_STEPS="false"

# Screen size forwarded as --screen_width / --screen_height.
SCREEN_WIDTH="1920"
SCREEN_HEIGHT="1080"

# Task list, domain filter and output location.
TEST_META="evaluation_examples/test_chrome.json"
DOMAIN="chrome_windows"
# NOTE(review): the directory name says "50steps" while MAX_STEPS is 35 —
# confirm this is intended before comparing result sets.
RESULT_DIR="/Volumes/Castor/课题/results_baseline_50steps"

# ---------- Observation mode: two variants, keep exactly one active ----------
# Screenshot only:
#OBSERVATION_TYPE="screenshot"
# Screenshot + accessibility tree (switch to this line for the second round):
OBSERVATION_TYPE="screenshot_a11y_tree"
# ---------- Pre-flight checks ----------
# Verifies, in order: SSH access to the Proxmox node (fatal on failure), the
# VM's reported status, the HTTP server on the VM's port 5000, and the
# presence of the configured snapshot (the last three only warn).

#######################################
# Runs a command line on the Proxmox node over SSH.
# BatchMode makes ssh fail instead of prompting for a password, and
# ConnectTimeout bounds the connection attempt of every check at 5 seconds.
# stderr is discarded on purpose: each caller prints its own status line.
# Globals:   PROXMOX_SSH_HOST (read)
# Arguments: the remote command, passed to ssh unchanged
# Outputs:   the remote command's stdout
# Returns:   ssh's exit status
#######################################
pve_ssh() {
  ssh -o BatchMode=yes -o ConnectTimeout=5 "${PROXMOX_SSH_HOST}" "$@" 2>/dev/null
}

echo "=== 预检查 ==="

# 1) Node reachability. Every later check goes through the same SSH path, so
#    a failure here aborts the script.
echo -n "SSH 到 Proxmox... "
if pve_ssh "echo ok" | grep -q "ok"; then
  echo "✅ 连接成功"
else
  echo "❌ SSH 连接失败"
  exit 1
fi

# 2) VM status, shown as reported by `qm status`. A failed query is reported
#    explicitly instead of printing an empty line.
echo -n "VM ${VM_ID} 状态... "
if VM_STATUS=$(pve_ssh "qm status ${VM_ID}"); then
  echo "${VM_STATUS}"
else
  echo "⚠️ 无法获取 VM 状态"
fi

# 3) HTTP server inside the VM. --max-time caps the whole request so that a
#    server which accepts the connection but never answers cannot hang the
#    pre-check; the status code is compared exactly rather than by substring.
echo -n "Flask Server (${PROXMOX_VM_IP}:5000)... "
HTTP_CODE=$(curl -s --connect-timeout 5 --max-time 30 -o /dev/null \
  -w "%{http_code}" "http://${PROXMOX_VM_IP}:5000/screenshot")
if [[ "${HTTP_CODE}" == "200" ]]; then
  echo "✅ 可访问"
else
  echo "⚠️ 不可访问(评测启动时会自动启动 VM)"
fi

# 4) Snapshot presence. The name is compared for equality against a whole
#    field, so "snapshot" no longer matches "snapshot_old" or a description
#    that merely contains the word.
#    Assumes each `qm listsnapshot` line looks like "`-> <name> <date> ...";
#    the first field is also accepted in case the tree marker is absent —
#    TODO confirm against the installed Proxmox version.
echo -n "快照 '${SNAPSHOT_NAME}'... "
SNAPSHOTS=$(pve_ssh "qm listsnapshot ${VM_ID}")
if awk -v name="${SNAPSHOT_NAME}" \
  '$1 == name || $2 == name { found = 1 } END { exit !found }' \
  <<<"${SNAPSHOTS}"; then
  echo "✅ 已存在"
else
  echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'"
fi
# Print a summary of the run configuration, framed by one blank line above
# and one below. The second printf reuses its format for each label/value
# pair, producing one "Label: value" line per pair.
printf '\n%s\n' "=== 开始评测 ==="
printf '%s %s\n' \
  "Provider:" "${PROVIDER}" \
  "VM ID:" "${VM_ID}" \
  "VM IP:" "${PROXMOX_VM_IP}" \
  "Model:" "${MODEL}" \
  "Eval:" "${EVAL_MODEL}" \
  "Observation:" "${OBSERVATION_TYPE}" \
  "Domain:" "${DOMAIN}" \
  "Results:" "${RESULT_DIR}"
printf '\n'
# ---------- Launch the evaluation ----------
# Translate the INJECT_STEPS switch into the matching run.py flag: only the
# literal string "true" selects --inject_steps, anything else (including an
# unset variable) selects --no_inject_steps.
case "${INJECT_STEPS}" in
  true) INJECT_STEPS_FLAG="--inject_steps" ;;
  *) INJECT_STEPS_FLAG="--no_inject_steps" ;;
esac

# Collect the command line in an array so that every value reaches run.py as
# exactly one argument; the order matches the option list run.py is given.
run_args=(
  --provider_name "${PROVIDER}"
  --path_to_vm "${VM_ID}"
  --observation_type "${OBSERVATION_TYPE}"
  --action_space "${ACTION_SPACE}"
  --model "${MODEL}"
  --eval_model "${EVAL_MODEL}"
  --temperature "${TEMPERATURE}"
  --top_p "${TOP_P}"
  --max_tokens "${MAX_TOKENS}"
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}"
  --screen_width "${SCREEN_WIDTH}"
  --screen_height "${SCREEN_HEIGHT}"
  --sleep_after_execution "${SLEEP_AFTER_EXEC}"
  --max_steps "${MAX_STEPS}"
  --result_dir "${RESULT_DIR}"
  --test_all_meta_path "${TEST_META}"
  --domain "${DOMAIN}"
  "${INJECT_STEPS_FLAG}"
)

# Last command of the script, so its exit status becomes the script's.
python run.py "${run_args[@]}"