#!/bin/bash
# =============================================================================
# Chrome-on-Windows evaluation script (general-purpose software control group)
#
# Purpose: run the Chrome tasks on a Windows VM, serving as the
# general-purpose software control group for the scientific-software runs.
#
# Required environment:
#   OPENAI_API_KEY   API key for the LLM endpoint. It is intentionally NOT
#                    stored in this file; export it before running.
#
# Flow: pre-flight checks (SSH to Proxmox, VM status, in-VM Flask server,
# snapshot presence) followed by `python run.py` with the parameters below.
# Only an SSH failure aborts the script; the other checks just print a warning.
# The script's exit status is that of `python run.py`.
# =============================================================================

# Fail on typos in variable names. `-e`/`pipefail` are deliberately not
# enabled: the pre-flight checks below are best-effort and must not abort.
set -u

# ---------- Proxmox configuration ----------
export PROXMOX_SSH_HOST="root@10.10.17.3"
export PROXMOX_VM_IP="10.10.17.10"

# ---------- LLM API configuration ----------
# SECURITY: the key used to be hard-coded here. It must come from the caller's
# environment instead; any key that was ever committed should be rotated.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "❌ 未设置 OPENAI_API_KEY,请先在环境中导出该变量" >&2
  exit 1
fi
export OPENAI_API_KEY
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"

# ---------- Evaluation parameters ----------
PROVIDER="proxmox"
VM_ID="102"
MODEL="gpt-5.4"
EVAL_MODEL="gemini-3.1-pro-preview"
MAX_STEPS=35
SLEEP_AFTER_EXEC=3
TEMPERATURE=0.5
TOP_P=0.9
MAX_TOKENS=16384
MAX_TRAJECTORY_LENGTH=3
ACTION_SPACE="pyautogui"
SCREEN_WIDTH=1920
SCREEN_HEIGHT=1080
# NOTE(review): the directory name says "50steps" while MAX_STEPS is 35 —
# confirm which one is intended.
RESULT_DIR="/Volumes/Castor/课题/results_baseline_50steps"
TEST_META="evaluation_examples/test_chrome.json"
DOMAIN="chrome_windows"
SNAPSHOT_NAME="snapshot"
INJECT_STEPS=false

# ---------- Two observation modes; switch as needed ----------
# screenshot only:
#OBSERVATION_TYPE="screenshot"
# screenshot + a11y tree (switch to this line for the second round):
OBSERVATION_TYPE="screenshot_a11y_tree"

# ---------- Pre-flight checks ----------
echo "=== 预检查 ==="

# SSH reachability is the only hard requirement: everything else goes
# through this connection.
printf '%s' "SSH 到 Proxmox... "
if ssh -o BatchMode=yes -o ConnectTimeout=5 "${PROXMOX_SSH_HOST}" "echo ok" 2>/dev/null \
  | grep -q "ok"; then
  echo "✅ 连接成功"
else
  echo "❌ SSH 连接失败"
  exit 1
fi

# Informational only: print whatever `qm status` reports (empty on failure).
printf '%s' "VM ${VM_ID} 状态... "
VM_STATUS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" "qm status ${VM_ID}" 2>/dev/null)
echo "${VM_STATUS}"

# Probe the HTTP server inside the VM; compare the status code exactly
# instead of substring-matching it.
printf '%s' "Flask Server (${PROXMOX_VM_IP}:5000)... "
HTTP_CODE=$(curl -s --connect-timeout 5 -o /dev/null -w "%{http_code}" \
  "http://${PROXMOX_VM_IP}:5000/screenshot")
if [[ "${HTTP_CODE}" == "200" ]]; then
  echo "✅ 可访问"
else
  echo "⚠️ 不可访问(评测启动时会自动启动 VM)"
fi

# Look for the snapshot as a whole, literal word so that e.g. "snapshot"
# does not match "snapshot2" and regex metacharacters in the name are inert.
printf '%s' "快照 '${SNAPSHOT_NAME}'... "
SNAPSHOTS=$(ssh -o BatchMode=yes "${PROXMOX_SSH_HOST}" "qm listsnapshot ${VM_ID}" 2>/dev/null)
if grep -qwF -- "${SNAPSHOT_NAME}" <<<"${SNAPSHOTS}"; then
  echo "✅ 已存在"
else
  echo "⚠️ 未找到快照 '${SNAPSHOT_NAME}'"
fi

echo ""
echo "=== 开始评测 ==="
echo "Provider: ${PROVIDER}"
echo "VM ID: ${VM_ID}"
echo "VM IP: ${PROXMOX_VM_IP}"
echo "Model: ${MODEL}"
echo "Eval: ${EVAL_MODEL}"
echo "Observation: ${OBSERVATION_TYPE}"
echo "Domain: ${DOMAIN}"
echo "Results: ${RESULT_DIR}"
echo ""

# ---------- Run the evaluation ----------
# Translate the boolean setting into the matching run.py flag.
if [[ "${INJECT_STEPS}" == "true" ]]; then
  INJECT_STEPS_FLAG="--inject_steps"
else
  INJECT_STEPS_FLAG="--no_inject_steps"
fi

python run.py \
  --provider_name "${PROVIDER}" \
  --path_to_vm "${VM_ID}" \
  --observation_type "${OBSERVATION_TYPE}" \
  --action_space "${ACTION_SPACE}" \
  --model "${MODEL}" \
  --eval_model "${EVAL_MODEL}" \
  --temperature "${TEMPERATURE}" \
  --top_p "${TOP_P}" \
  --max_tokens "${MAX_TOKENS}" \
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \
  --screen_width "${SCREEN_WIDTH}" \
  --screen_height "${SCREEN_HEIGHT}" \
  --sleep_after_execution "${SLEEP_AFTER_EXEC}" \
  --max_steps "${MAX_STEPS}" \
  --result_dir "${RESULT_DIR}" \
  --test_all_meta_path "${TEST_META}" \
  --domain "${DOMAIN}" \
  "${INJECT_STEPS_FLAG}"