Feat/monitor (#254)

* feat: add claude support

* feat: add script for end-to-end evaluation with logging and task distribution

* feat&fix: add tool result handling and update model default in evaluation script

* chore: remove run_test_env.py script

* feat&fix: implement action parsing for tool calls and update default action space

* fix: update text formatting in action parsing and replace logger import

* feat&fix: implement action parsing for tool calls and add screen size handling

* feat: add setup instructions for Anthropic API integration

* feat: add notice about image size limitations for Anthropic API

* Delete test_env/logger.py

* Delete test_env/utils.py

* fix: update logger usage to use global logger and improve error handling

* feat&fix: add configuration management API endpoints and update UI for configuration selection

* feat&fix: update environment configuration, enhance task statistics, and improve UI responsiveness

* feat&fix: add configuration toggle button in UI and improve task loading performance

* feat&fix: add accuracy percentage display to score and style updates for UI
This commit is contained in:
Zilong Zhou
2025-07-14 13:43:41 +08:00
committed by GitHub
parent 0651495d88
commit 74b7c189af
6 changed files with 662 additions and 37 deletions

View File

@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from functools import cache
import os
import json
import time
import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
def initialize_default_config():
    """Pick the startup configuration from the results directory.

    Looks for the first ``<action_space>/<observation_type>/<model_name>``
    directory triple under ``RESULTS_BASE_PATH`` and, when one exists, stores
    it in the module globals ``ACTION_SPACE``, ``OBSERVATION_TYPE``,
    ``MODEL_NAME`` and ``RESULTS_PATH``.

    Directory entries are visited in sorted order: ``os.listdir`` returns
    names in arbitrary order, so without sorting the "first" configuration
    could differ between runs or machines.

    When the base directory is missing, contains no complete triple, or
    cannot be read, the environment-derived values already held in the
    globals are kept and only ``RESULTS_PATH`` is rebuilt from them.
    """
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

    def _subdirs(parent):
        """Yield ``(name, path)`` for each sub-directory of *parent*, sorted by name."""
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            # Scan for the first available configuration
            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
                for obs_type, obs_path in _subdirs(action_space_path):
                    for model_name, model_path in _subdirs(obs_path):
                        # Use the first available configuration as default
                        ACTION_SPACE = action_space
                        OBSERVATION_TYPE = obs_type
                        MODEL_NAME = model_name
                        RESULTS_PATH = model_path
                        print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
                        return
        except OSError as e:
            # Best effort: an unreadable directory must not prevent startup.
            print(f"Error scanning results directory for default config: {e}")

    # Fallback to original environment-based path if no configs found
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
# Initialize default configuration
# Executed at module import: sets ACTION_SPACE / OBSERVATION_TYPE / MODEL_NAME
# and RESULTS_PATH via initialize_default_config().
initialize_default_config()
# NOTE(review): initialize_default_config() already assigns RESULTS_PATH from
# these same globals on every path, so this recomputation looks redundant —
# confirm whether this line is still present in the current file.
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
# TASK_STATUS_CACHE is keyed by results path; each entry is its own dict of
# per-task cached statuses, so make sure the active path has one.
if RESULTS_PATH not in TASK_STATUS_CACHE:
# Initialize cache for this results path
TASK_STATUS_CACHE[RESULTS_PATH] = {}
@cache
def load_task_list():
    """Load the task list stored as JSON at ``TASK_CONFIG_PATH``.

    The parsed result is memoized with ``functools.cache``, so the file is
    read once per process; later edits to the file are not picked up until
    the process restarts.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would misread non-ASCII text on e.g. Windows.
    with open(TASK_CONFIG_PATH, 'r', encoding='utf-8') as f:
        return json.load(f)
@cache
def get_task_info(task_type, task_id):
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
# Check if the status is already cached
current_time = time.time()
last_cache_time = None
if cache_key in TASK_STATUS_CACHE:
cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
last_cache_time = cached_time
# If cached status is "Done", check if it's within the stability period
if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
# Cache the status if it is done or error
if status.startswith("Done") or status == "Error":
current_time = last_cache_time if last_cache_time else current_time
TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
"status": task_status
})
@app.route('/api/config')
def api_config():
    """Return the monitor's path and run settings as JSON.

    Deprecated: prefer ``/api/current-config``.
    """
    return jsonify(
        task_config_path=TASK_CONFIG_PATH,
        results_base_path=RESULTS_BASE_PATH,
        action_space=ACTION_SPACE,
        observation_type=OBSERVATION_TYPE,
        model_name=MODEL_NAME,
        max_steps=MAX_STEPS,
        examples_base_path=EXAMPLES_BASE_PATH,
    )
@app.route('/api/available-configs')
def api_available_configs():
    """List every configuration found under ``RESULTS_BASE_PATH``.

    A configuration is a directory triple
    ``<action_space>/<observation_type>/<model_name>``. Entries are visited
    in sorted order so the response is stable between requests
    (``os.listdir`` itself returns names in arbitrary order).

    Returns:
        A JSON array of objects with keys ``action_space``,
        ``observation_type``, ``model_name`` and ``path``. The array is empty
        when the base directory does not exist. If a directory cannot be
        read part-way through, the entries collected so far are returned.
    """
    configs = []
    if not os.path.exists(RESULTS_BASE_PATH):
        return jsonify(configs)

    def _subdirs(parent):
        """Yield ``(name, path)`` for each sub-directory of *parent*, sorted by name."""
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    try:
        # Walk action spaces -> observation types -> model names
        for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
            for obs_type, obs_path in _subdirs(action_space_path):
                for model_name, model_path in _subdirs(obs_path):
                    configs.append({
                        "action_space": action_space,
                        "observation_type": obs_type,
                        "model_name": model_name,
                        "path": model_path,
                    })
    except OSError as e:
        # Best effort: report the problem and return what was found so far.
        print(f"Error scanning results directory: {e}")

    return jsonify(configs)
@app.route('/api/current-config')
def api_current_config():
    """Return the currently selected configuration and its results path as JSON."""
    current = {
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH,
    }
    return jsonify(current)
@app.route('/api/set-config', methods=['POST'])
def api_set_config():
    """Switch the active configuration.

    Expects a JSON object that may contain ``action_space``,
    ``observation_type`` and ``model_name``; any omitted key keeps its
    current value. On success the module globals are updated,
    ``RESULTS_PATH`` is rebuilt under ``RESULTS_BASE_PATH``, and a status
    cache bucket is created for the new path if none exists yet.

    The request is validated before any global is modified, so a rejected
    request leaves the active configuration untouched.

    Returns:
        The new configuration as JSON, or a JSON ``{"error": ...}`` body
        with HTTP 400 when the payload is missing, is not an object,
        contains non-string values, or would resolve outside
        ``RESULTS_BASE_PATH``.
    """
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

    # silent=True: a missing or malformed JSON body yields None, so the
    # client receives the JSON error below rather than Flask's default page.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({"error": "No data provided"}), 400
    if not isinstance(data, dict):
        return jsonify({"error": "Configuration must be a JSON object"}), 400

    action_space = data.get('action_space', ACTION_SPACE)
    observation_type = data.get('observation_type', OBSERVATION_TYPE)
    model_name = data.get('model_name', MODEL_NAME)
    if not all(isinstance(value, str) for value in (action_space, observation_type, model_name)):
        return jsonify({"error": "Configuration values must be strings"}), 400

    # The values come from the client and are joined into a filesystem path:
    # reject anything (e.g. "..", absolute paths) that escapes the base dir.
    results_path = os.path.join(RESULTS_BASE_PATH, action_space, observation_type, model_name)
    base_abs = os.path.abspath(RESULTS_BASE_PATH)
    try:
        inside_base = os.path.commonpath([base_abs, os.path.abspath(results_path)]) == base_abs
    except ValueError:
        # Raised e.g. for paths on different drives on Windows.
        inside_base = False
    if not inside_base:
        return jsonify({"error": "Configuration path is outside the results directory"}), 400

    # Update global variables
    ACTION_SPACE = action_space
    OBSERVATION_TYPE = observation_type
    MODEL_NAME = model_name
    RESULTS_PATH = results_path

    # Initialize cache for this results path if it has none yet
    TASK_STATUS_CACHE.setdefault(RESULTS_PATH, {})

    return jsonify({
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH
    })
if __name__ == '__main__':
# Check if necessary directories exist
if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
port = 8080
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
app.run(host=host, port=port, debug=debug)
app.run(host=host, port=port, debug=debug, threaded=True)