109 lines
3.3 KiB
Bash
109 lines
3.3 KiB
Bash
#!/bin/bash
# =============================================================================
# Chrome-on-Windows evaluation script (general-purpose software control group).
#
# Purpose: run the Chrome tasks on a Windows VM so the results can serve as the
# general-software control group for the scientific-software evaluation.
#
# Required environment:
#   OPENAI_API_KEY   API key accepted by the OPENAI_BASE_URL endpoint set below.
#                    It is a secret and is intentionally NOT stored in this file.
#
# Working directory: `run.py` and TEST_META are relative paths, so start the
# script from the directory that contains run.py.
#
# Exit status: 1 if the API key is missing or the Proxmox host cannot be
# reached over SSH; otherwise the exit status of `python run.py`.
# =============================================================================

# Fail on use of an unset variable. `set -e` is deliberately not enabled:
# several pre-checks below are advisory and must not abort the run.
set -u

# ---------- Proxmox configuration ----------
export PROXMOX_SSH_HOST="root@10.10.17.3"
export PROXMOX_VM_IP="10.10.17.10"

# ---------- LLM API configuration ----------
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"

# Never hard-code credentials in a script: take the key from the caller's
# environment and stop early with a clear message when it is absent.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  printf '%s\n' "❌ 未设置 OPENAI_API_KEY:请先在环境中导出 ${OPENAI_BASE_URL} 对应的 API 密钥" >&2
  exit 1
fi
export OPENAI_API_KEY

# ---------- Evaluation parameters ----------
PROVIDER="proxmox"
VM_ID="102"
MODEL="gpt-5.4"
EVAL_MODEL="gemini-3.1-pro-preview"
MAX_STEPS=35
SLEEP_AFTER_EXEC=3
TEMPERATURE=0.5
TOP_P=0.9
MAX_TOKENS=16384
MAX_TRAJECTORY_LENGTH=3
ACTION_SPACE="pyautogui"
SCREEN_WIDTH=1920
SCREEN_HEIGHT=1080
# NOTE(review): the directory name says "50steps" while MAX_STEPS is 35 —
# confirm which of the two is intended.
RESULT_DIR="/Volumes/Castor/课题/results_baseline_50steps"
TEST_META="evaluation_examples/test_chrome.json"
DOMAIN="chrome_windows"
SNAPSHOT_NAME="snapshot"
INJECT_STEPS=false

# ---------- Observation mode: two variants, switch as needed ----------
# Screenshot only:
#OBSERVATION_TYPE="screenshot"
# Screenshot + accessibility tree (use this line for the second round):
OBSERVATION_TYPE="screenshot_a11y_tree"

# Options shared by every SSH probe: never prompt for a password and give up
# quickly on an unreachable host instead of hanging the pre-check.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=5)

# ---------- Pre-checks ----------
echo "=== 预检查 ==="

# 1) The Proxmox host must be reachable; this is the only fatal pre-check.
printf '%s' "SSH 到 Proxmox... "
ssh_probe=$(ssh "${SSH_OPTS[@]}" "${PROXMOX_SSH_HOST}" "echo ok" 2>/dev/null)
if [[ "${ssh_probe}" == *ok* ]]; then
  echo "✅ 连接成功"
else
  echo "❌ SSH 连接失败"
  exit 1
fi

# 2) Report the VM status (informational only).
printf '%s' "VM ${VM_ID} 状态... "
if VM_STATUS=$(ssh "${SSH_OPTS[@]}" "${PROXMOX_SSH_HOST}" "qm status ${VM_ID}" 2>/dev/null); then
  echo "${VM_STATUS}"
else
  # Make a failed query visible instead of printing an empty line.
  echo "⚠️ 无法获取 VM 状态"
fi

# 3) Probe the in-VM Flask server. A failure is only a warning; the message
#    below states that the VM is started automatically when the run begins.
printf '%s' "Flask Server (${PROXMOX_VM_IP}:5000)... "
# --max-time bounds the whole request so a stalled server cannot hang the check.
http_code=$(curl -s --connect-timeout 5 --max-time 20 \
  -o /dev/null -w '%{http_code}' \
  "http://${PROXMOX_VM_IP}:5000/screenshot")
if [[ "${http_code}" == "200" ]]; then
  echo "✅ 可访问"
else
  echo "⚠️ 不可访问(评测启动时会自动启动 VM)"
fi

# 4) Check that the snapshot exists (warning only).
printf '%s' "快照 '${SNAPSHOT_NAME}'... "
SNAPSHOTS=$(ssh "${SSH_OPTS[@]}" "${PROXMOX_SSH_HOST}" "qm listsnapshot ${VM_ID}" 2>/dev/null)
# -F: treat the name literally; -w: whole-word match, so e.g. "snapshot_old"
# does not satisfy a search for "snapshot".
if grep -Fqw -- "${SNAPSHOT_NAME}" <<<"${SNAPSHOTS}"; then
  echo "✅ 已存在"
else
  echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'"
fi

# ---------- Run summary ----------
echo ""
echo "=== 开始评测 ==="
echo "Provider: ${PROVIDER}"
echo "VM ID: ${VM_ID}"
echo "VM IP: ${PROXMOX_VM_IP}"
echo "Model: ${MODEL}"
echo "Eval: ${EVAL_MODEL}"
echo "Observation: ${OBSERVATION_TYPE}"
echo "Domain: ${DOMAIN}"
echo "Results: ${RESULT_DIR}"
echo ""

# ---------- Run the evaluation ----------
# Map the boolean setting onto the matching command-line switch.
if [[ "${INJECT_STEPS}" == "true" ]]; then
  INJECT_STEPS_FLAG="--inject_steps"
else
  INJECT_STEPS_FLAG="--no_inject_steps"
fi

# Build the argument list as an array so every value is passed as exactly one
# argument, regardless of spaces or non-ASCII characters (e.g. in RESULT_DIR).
run_args=(
  --provider_name "${PROVIDER}"
  --path_to_vm "${VM_ID}"
  --observation_type "${OBSERVATION_TYPE}"
  --action_space "${ACTION_SPACE}"
  --model "${MODEL}"
  --eval_model "${EVAL_MODEL}"
  --temperature "${TEMPERATURE}"
  --top_p "${TOP_P}"
  --max_tokens "${MAX_TOKENS}"
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}"
  --screen_width "${SCREEN_WIDTH}"
  --screen_height "${SCREEN_HEIGHT}"
  --sleep_after_execution "${SLEEP_AFTER_EXEC}"
  --max_steps "${MAX_STEPS}"
  --result_dir "${RESULT_DIR}"
  --test_all_meta_path "${TEST_META}"
  --domain "${DOMAIN}"
  "${INJECT_STEPS_FLAG}"
)

# Last command of the script: its exit status becomes the script's status.
python run.py "${run_args[@]}"