Files
sci-gui-agent-benchmark/run_flsol_win7.sh
lizhanyuan 355bf655cc feat(flsol-demo): add reflection comments to prompt and update API key
- prompts.py: add mandatory 【观察】【判断】【动作】 comment format for demo clarity
- run_flsol_win7.sh: update OPENAI_API_KEY

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 11:17:44 +08:00

93 lines
2.8 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# =============================================================================
# FL Solutions for F-4600 evaluation script (Win7, direct Ethernet connection)
# provider: direct -- talks to the Flask service directly; no VM/SSH involved
#
# Required environment:
#   OPENAI_API_KEY   API key for the LLM endpoint configured below
# =============================================================================
# ---------- Win7 direct-connection IP ----------
export DIRECT_VM_IP="192.168.1.11" # Flask address of the physical Win7 machine
# ---------- LLM API configuration ----------
# The key is read from the caller's environment rather than being committed
# with the script.
# NOTE(review): a key was previously hard-coded on this line; it is still in
# version-control history and should be revoked/rotated.
export OPENAI_API_KEY="${OPENAI_API_KEY:-}"
if [ -z "${OPENAI_API_KEY}" ]; then
  echo "[WARN] OPENAI_API_KEY is not set; export it before running this script" >&2
fi
export OPENAI_BASE_URL="https://vip.apiyi.com/v1"
# ---------- Evaluation parameters (aligned with run_proxmox.sh) ----------
MODEL="gpt-5.4" # claude-sonnet-4-6
EVAL_MODEL="gemini-3.1-pro-preview"
MAX_STEPS=50
SLEEP_AFTER_EXEC=3
TEMPERATURE=0
TOP_P=0.9
# NOTE(review): 36748 is an unusual value (possibly meant to be 32768) -- confirm.
MAX_TOKENS=36748
MAX_TRAJECTORY_LENGTH=5
OBSERVATION_TYPE="screenshot"
ACTION_SPACE="pyautogui"
SCREEN_WIDTH=1280
SCREEN_HEIGHT=1024
RESULT_DIR="/Users/lizhanyuan/Downloads/results7/flsol"
TEST_META="evaluation_examples/test_flsol.json"
DOMAIN="flsol"
INJECT_STEPS=false # set to true to pass --inject_steps to run.py
# ---------- Pre-flight checks ----------
# Work from the script's own directory so that run.py, logs/ and the relative
# test-meta path resolve no matter where the script was invoked from. Abort if
# the directory cannot be entered instead of continuing in the wrong place.
cd "$(dirname "$0")" || {
  echo "[ERROR] cannot cd to script directory: $(dirname "$0")" >&2
  exit 1
}
echo "=== FL Solutions F-4600 评测预检查 ==="
echo ""
# Fail fast when no API key is available rather than after the run has started.
if [ -z "${OPENAI_API_KEY:-}" ]; then
  echo "[ERROR] OPENAI_API_KEY is empty or unset" >&2
  exit 1
fi
echo -n "Flask Server (${DIRECT_VM_IP}:5000)... "
# Probe the screenshot endpoint and keep only the HTTP status code; the body
# is discarded. --max-time bounds the whole request so a server that accepts
# the connection but never answers cannot hang the script. curl prints 000 as
# the status code when no HTTP response was received.
HTTP_CODE=$(curl -s --connect-timeout 5 --max-time 30 \
  "http://${DIRECT_VM_IP}:5000/screenshot" \
  -o /dev/null -w "%{http_code}" 2>/dev/null)
if [ "$HTTP_CODE" = "200" ]; then
  echo "OK"
else
  echo "FAIL (HTTP ${HTTP_CODE})"
  # Single quotes keep the Windows path's backslashes literal.
  printf '%s\n' '[ERROR] Win7 Flask Server 不可达,请先在 Win7 运行: python D:\python_server\main.py' >&2
  exit 1
fi
# Create the output directories up front; stop if they cannot be created.
mkdir -p "${RESULT_DIR}" logs || {
  echo "[ERROR] cannot create output directories: ${RESULT_DIR}, logs" >&2
  exit 1
}
# Print a summary of the run configuration, framed by one blank line before
# and one after. Each printf argument becomes one output line.
printf '%s\n' \
  "" \
  "=== 开始评测 ===" \
  " Provider: direct (无 VM 管理,直连 Flask)" \
  " Win7 IP: ${DIRECT_VM_IP}" \
  " Model: ${MODEL}" \
  " Eval: ${EVAL_MODEL}" \
  " Task: flsol_task4_measure" \
  " Obs Type: ${OBSERVATION_TYPE} (screenshot only, Win7 a11y unstable)" \
  " Max Steps: ${MAX_STEPS}" \
  " Max Tokens: ${MAX_TOKENS}" \
  " Results: ${RESULT_DIR}" \
  ""
# Map a boolean-like setting to the run.py command-line switch.
# Arguments: $1 - setting value; "true" or "yes" in any letter case, or "1",
#                 enables step injection, anything else (including empty)
#                 disables it.
# Outputs:   --inject_steps or --no_inject_steps on stdout
# The match is case-insensitive so that Python-style spellings such as
# True/False behave as written. Bracket patterns are used instead of ${var,,}
# so this also works on Bash 3.2.
inject_flag_for() {
  case "$1" in
    [Tt][Rr][Uu][Ee] | [Yy][Ee][Ss] | 1)
      printf '%s\n' "--inject_steps"
      ;;
    *)
      printf '%s\n' "--no_inject_steps"
      ;;
  esac
}
INJECT_FLAG=$(inject_flag_for "${INJECT_STEPS:-}")
# Launch the evaluation with the configured parameters. "--path_to_vm" gets a
# placeholder value. INJECT_FLAG is passed as a single word when set and
# dropped entirely when empty.
python3 run.py \
  --provider_name "direct" \
  --path_to_vm "ignored" \
  --observation_type "${OBSERVATION_TYPE}" \
  --action_space "${ACTION_SPACE}" \
  --model "${MODEL}" \
  --eval_model "${EVAL_MODEL}" \
  --temperature "${TEMPERATURE}" \
  --top_p "${TOP_P}" \
  --max_tokens "${MAX_TOKENS}" \
  --max_trajectory_length "${MAX_TRAJECTORY_LENGTH}" \
  --screen_width "${SCREEN_WIDTH}" \
  --screen_height "${SCREEN_HEIGHT}" \
  --sleep_after_execution "${SLEEP_AFTER_EXEC}" \
  --max_steps "${MAX_STEPS}" \
  --result_dir "${RESULT_DIR}" \
  --test_all_meta_path "${TEST_META}" \
  --domain "${DOMAIN}" \
  ${INJECT_FLAG:+"${INJECT_FLAG}"}
# Capture run.py's exit status immediately so the following echo cannot
# overwrite it; a failed run must not be reported as a completed one.
RUN_STATUS=$?
echo ""
if [ "${RUN_STATUS}" -ne 0 ]; then
  echo "[ERROR] run.py exited with status ${RUN_STATUS}" >&2
  exit "${RUN_STATUS}"
fi
echo "=== 评测完成 ==="
echo "结果保存在: ${RESULT_DIR}"