Feat/monitor (#254)

* feat: add claude support
* feat: add script for end-to-end evaluation with logging and task distribution
* feat&fix: add tool result handling and update model default in evaluation script
* chore: remove run_test_env.py script
* feat&fix: implement action parsing for tool calls and update default action space
* fix: update text formatting in action parsing and replace logger import
* feat&fix: implement action parsing for tool calls and add screen size handling
* feat: add setup instructions for Anthropic API integration
* feat: add notice about image size limitations for Anthropic API
* Delete test_env/logger.py
* Delete test_env/utils.py
* fix: update logger usage to use global logger and improve error handling
* feat&fix: add configuration management API endpoints and update UI for configuration selection
* feat&fix: update environment configuration, enhance task statistics, and improve UI responsiveness
* feat&fix: add configuration toggle button in UI and improve task loading performance
* feat&fix: add accuracy percentage display to score and style updates for UI
2025-07-14 13:43:41 +08:00
parent 0651495d88
commit 74b7c189af
6 changed files with 662 additions and 37 deletions
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -1,14 +1,17 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

+from functools import cache
 import os
 import json
 import time
+import subprocess
 from datetime import datetime
 from pathlib import Path
 from flask import Flask, render_template_string, jsonify, send_file, request, render_template
 from dotenv import load_dotenv

+
 # Load environment variables from .env file
 load_dotenv()

@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
 MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
 MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))

+def initialize_default_config():
+    """Select the active configuration at startup.
+
+    Walks RESULTS_BASE_PATH three directory levels deep
+    (<action_space>/<observation_type>/<model_name>) and adopts the first
+    combination in which all three levels are directories. Names are visited
+    in sorted order so the same default is chosen on every start;
+    os.listdir() on its own returns entries in arbitrary order.
+
+    Side effects: rebinds the module globals ACTION_SPACE, OBSERVATION_TYPE,
+    MODEL_NAME and RESULTS_PATH. When nothing is found, or the scan fails
+    with an OSError, the first three keep their current values and
+    RESULTS_PATH is derived from them.
+    """
+    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+    def _subdirs(parent):
+        """Yield (name, path) for each directory directly under parent, sorted by name."""
+        for name in sorted(os.listdir(parent)):
+            path = os.path.join(parent, name)
+            if os.path.isdir(path):
+                yield name, path
+
+    if os.path.exists(RESULTS_BASE_PATH):
+        try:
+            # Scan for the first available configuration
+            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
+                for obs_type, obs_path in _subdirs(action_space_path):
+                    for model_name, model_path in _subdirs(obs_path):
+                        # Use the first available configuration as default
+                        ACTION_SPACE = action_space
+                        OBSERVATION_TYPE = obs_type
+                        MODEL_NAME = model_name
+                        RESULTS_PATH = model_path
+                        print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+                        return
+        except OSError as e:
+            # Best effort: an unreadable directory must not stop the app from starting
+            print(f"Error scanning results directory for default config: {e}")
+
+    # Fallback to original environment-based path if no configs found
+    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+
+# Choose the active configuration at import time: the first one found under
+# RESULTS_BASE_PATH, otherwise the values already set above
+initialize_default_config()
+
+# NOTE(review): initialize_default_config() leaves RESULTS_PATH equal to this
+# join on both of its exit paths, so this assignment looks redundant
 RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
 
+if RESULTS_PATH not in TASK_STATUS_CACHE:
+    # TASK_STATUS_CACHE is keyed by results path; each entry is that
+    # configuration's own task-status cache and starts out empty
+    TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+# Load and parse the JSON task list stored at TASK_CONFIG_PATH.
+# functools.cache memoizes this zero-argument call, so the file is read once
+# per process: later edits to the file are not picked up until restart, and
+# every caller receives the same parsed object (callers should not mutate it).
+@cache
 def load_task_list():
     with open(TASK_CONFIG_PATH, 'r') as f:
         return json.load(f)

+@cache
 def get_task_info(task_type, task_id):
    task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
    if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
    # Check if the status is already cached
    current_time = time.time()
    last_cache_time = None
-    if cache_key in TASK_STATUS_CACHE:
-        cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+    if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
+        cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
        last_cache_time = cached_time
        # If cached status is "Done", check if it's within the stability period
        if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
    # Cache the status if it is done or error
    if status.startswith("Done") or status == "Error":
        current_time = last_cache_time if last_cache_time else current_time
-        TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
+        TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
    
    return status_dict

@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
        "status": task_status
    })

+@app.route('/api/config')
+def api_config():
+    """Report the module-level configuration values as a JSON object.
+
+    Deprecated: use /api/current-config instead.
+    """
+    # Keyword arguments passed to jsonify() become the keys of the JSON object
+    return jsonify(
+        task_config_path=TASK_CONFIG_PATH,
+        results_base_path=RESULTS_BASE_PATH,
+        action_space=ACTION_SPACE,
+        observation_type=OBSERVATION_TYPE,
+        model_name=MODEL_NAME,
+        max_steps=MAX_STEPS,
+        examples_base_path=EXAMPLES_BASE_PATH,
+    )
+
+@app.route('/api/available-configs')
+def api_available_configs():
+    """List every configuration found under the results directory.
+
+    A configuration is a three-level directory chain
+    RESULTS_BASE_PATH/<action_space>/<observation_type>/<model_name>.
+    Names are visited in sorted order so the response order is stable across
+    requests; os.listdir() on its own returns entries in arbitrary order.
+
+    Returns a JSON array of objects with the keys "action_space",
+    "observation_type", "model_name" and "path". If the scan fails with an
+    OSError, the error is printed and whatever was collected so far is
+    returned.
+    """
+    def _subdirs(parent):
+        """Yield (name, path) for each directory directly under parent, sorted by name."""
+        for name in sorted(os.listdir(parent)):
+            path = os.path.join(parent, name)
+            if os.path.isdir(path):
+                yield name, path
+
+    configs = []
+
+    if os.path.exists(RESULTS_BASE_PATH):
+        try:
+            # Levels, outermost first: action space -> observation type -> model name
+            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
+                for obs_type, obs_path in _subdirs(action_space_path):
+                    for model_name, model_path in _subdirs(obs_path):
+                        configs.append({
+                            "action_space": action_space,
+                            "observation_type": obs_type,
+                            "model_name": model_name,
+                            "path": model_path
+                        })
+        except OSError as e:
+            # Best effort: report the problem and return the partial list
+            print(f"Error scanning results directory: {e}")
+
+    return jsonify(configs)
+
+@app.route('/api/current-config')
+def api_current_config():
+    """Return the currently active configuration and its results path as JSON."""
+    active = {
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "results_path": RESULTS_PATH
+    }
+    return jsonify(active)
+
+@app.route('/api/set-config', methods=['POST'])
+def api_set_config():
+    """Switch the active configuration.
+
+    Expects a JSON object that may contain "action_space",
+    "observation_type" and "model_name"; any key left out keeps its current
+    value. On success the module globals ACTION_SPACE, OBSERVATION_TYPE,
+    MODEL_NAME and RESULTS_PATH are rebound, an empty status cache is created
+    for the new results path if none exists yet, and the new configuration is
+    returned as JSON.
+
+    Responds with HTTP 400 and an "error" message, leaving the current
+    configuration untouched, when the body is empty or not a JSON object,
+    when a value is not a non-empty string, or when the resulting path would
+    lie outside RESULTS_BASE_PATH.
+    """
+    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+    data = request.get_json()
+    if not data:
+        return jsonify({"error": "No data provided"}), 400
+    if not isinstance(data, dict):
+        return jsonify({"error": "Expected a JSON object"}), 400
+
+    # Resolve into locals first: the globals are only committed once every
+    # value has been validated, so a bad request cannot leave them half-updated
+    requested = {
+        "action_space": data.get('action_space', ACTION_SPACE),
+        "observation_type": data.get('observation_type', OBSERVATION_TYPE),
+        "model_name": data.get('model_name', MODEL_NAME),
+    }
+    for field, value in requested.items():
+        if not isinstance(value, str) or not value:
+            return jsonify({"error": f"Invalid value for {field}"}), 400
+
+    results_path = os.path.join(
+        RESULTS_BASE_PATH,
+        requested["action_space"],
+        requested["observation_type"],
+        requested["model_name"],
+    )
+
+    # The values come from the request body: reject ".." segments or absolute
+    # components that would move the results path out of the results directory
+    base_dir = os.path.abspath(RESULTS_BASE_PATH)
+    try:
+        inside_base = os.path.commonpath([base_dir, os.path.abspath(results_path)]) == base_dir
+    except ValueError:
+        # commonpath() raises when the paths cannot be compared (e.g. different drives)
+        inside_base = False
+    if not inside_base:
+        return jsonify({"error": "Configuration path is outside the results directory"}), 400
+
+    # Update global variables
+    ACTION_SPACE = requested["action_space"]
+    OBSERVATION_TYPE = requested["observation_type"]
+    MODEL_NAME = requested["model_name"]
+    RESULTS_PATH = results_path
+
+    # Initialize cache for this results path on first use
+    TASK_STATUS_CACHE.setdefault(RESULTS_PATH, {})
+
+    return jsonify({
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "results_path": RESULTS_PATH
+    })
+
 if __name__ == '__main__':
    # Check if necessary directories exist
    if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
    port = 8080
    debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
    
-    app.run(host=host, port=port, debug=debug)
+    app.run(host=host, port=port, debug=debug, threaded=True)