Feat/monitor (#254)

* feat: add claude support
* feat: add script for end-to-end evaluation with logging and task distribution
* feat&fix: add tool result handling and update model default in evaluation script
* chore: remove run_test_env.py script
* feat&fix: implement action parsing for tool calls and update default action space
* fix: update text formatting in action parsing and replace logger import
* feat&fix: implement action parsing for tool calls and add screen size handling
* feat: add setup instructions for Anthropic API integration
* feat: add notice about image size limitations for Anthropic API
* Delete test_env/logger.py
* Delete test_env/utils.py
* fix: update logger usage to use global logger and improve error handling
* feat&fix: add configuration management API endpoints and update UI for configuration selection
* feat&fix: update environment configuration, enhance task statistics, and improve UI responsiveness
* feat&fix: add configuration toggle button in UI and improve task loading performance
* feat&fix: add accuracy percentage display to score and style updates for UI
2025-07-14 13:43:41 +08:00
parent 0651495d88
commit 74b7c189af
6 changed files with 662 additions and 37 deletions
--- a/mm_agents/anthropic/main.py
+++ b/mm_agents/anthropic/main.py
@@ -369,9 +369,10 @@ class AnthropicAgent:
                )
        except (APIError, APIStatusError, APIResponseValidationError) as e:
-            self.logger.exception(f"Anthropic API error: {str(e)}")
+            logger.exception(f"Anthropic API error: {str(e)}")
            try:
-                self.logger.warning("Retrying with backup API key...")
+                logger.warning("Retrying with backup API key...")
                backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
                if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
                        tools=tools,
                        betas=betas,
                    )
-                self.logger.info("Successfully used backup API key")
+                logger.info("Successfully used backup API key")
            except Exception as backup_e:
-                self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
+                logger.exception(f"Backup API call also failed: {str(backup_e)}")
                return None, None
        except Exception as e:
-            self.logger.exception(f"Error in Anthropic API: {str(e)}")
+            logger.exception(f"Error in Anthropic API: {str(e)}")
            return None, None
        response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
            actions = ["DONE"]
        return reasonings, actions
-    def reset(self, *args, **kwargs):
+    def reset(self, _logger = None, *args, **kwargs):
        """
        Reset the agent's state.
        """
        global logger
        if _logger:
            logger = _logger
        else:
            logger = logging.getLogger("desktopenv.agent")
        self.messages = []
-        self.logger.info(f"{self.class_name} reset.")
+        logger.info(f"{self.class_name} reset.")
--- a/monitor/.env
+++ b/monitor/.env
@@ -4,11 +4,11 @@
 # Monitor configuration
 TASK_CONFIG_PATH=../evaluation_examples/test_all.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_all
+RESULTS_BASE_PATH=../results
 ACTION_SPACE=pyautogui
 OBSERVATION_TYPE=screenshot
 MODEL_NAME=computer-use-preview
 MAX_STEPS=150
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0
-FLASK_DEBUG=true
+FLASK_DEBUG=false
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -1,14 +1,17 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 from functools import cache
 import os
 import json
 import time
 import subprocess
 from datetime import datetime
 from pathlib import Path
 from flask import Flask, render_template_string, jsonify, send_file, request, render_template
 from dotenv import load_dotenv
 # Load environment variables from .env file
 load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
 MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
 MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
 def initialize_default_config():
     """Pick the default configuration from the results directory.

     Walks RESULTS_BASE_PATH three levels deep
     (<action_space>/<observation_type>/<model_name>) and adopts the first
     combination in which all three levels are directories, updating the
     module globals ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME and
     RESULTS_PATH.

     Entries are visited in sorted order: os.listdir() returns names in
     arbitrary order, so without sorting the chosen default could differ
     between restarts or filesystems.

     If the base directory is missing, cannot be read, or holds no complete
     combination, the current configuration globals are kept and
     RESULTS_PATH is built from them.
     """
     global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

     def _subdirs(parent):
         # Yield (name, path) for each direct subdirectory of parent, sorted by name.
         for name in sorted(os.listdir(parent)):
             path = os.path.join(parent, name)
             if os.path.isdir(path):
                 yield name, path

     if os.path.exists(RESULTS_BASE_PATH):
         try:
             for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
                 for obs_type, obs_path in _subdirs(action_space_path):
                     for model_name, model_path in _subdirs(obs_path):
                         # Use the first available configuration as default
                         ACTION_SPACE = action_space
                         OBSERVATION_TYPE = obs_type
                         MODEL_NAME = model_name
                         RESULTS_PATH = model_path
                         print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
                         return
         except OSError as e:
             # Directory scanning is best-effort; fall through to the fallback below.
             print(f"Error scanning results directory for default config: {e}")

     # Fallback to original environment-based path if no configs found
     RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
     print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
 # Initialize default configuration (may overwrite ACTION_SPACE,
 # OBSERVATION_TYPE, MODEL_NAME and RESULTS_PATH with values found on disk)
 initialize_default_config()
 # Rebuild the results path from the configuration globals as they stand now.
 RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
 if RESULTS_PATH not in TASK_STATUS_CACHE:
    # Initialize cache for this results path; TASK_STATUS_CACHE holds one
    # dict per results path.
    TASK_STATUS_CACHE[RESULTS_PATH] = {}
@cache
def load_task_list():
    """Load and return the parsed JSON content of TASK_CONFIG_PATH.

    The file is opened explicitly as UTF-8 so that parsing does not depend
    on the platform's locale encoding.

    The result is memoized with functools.cache: the file is read at most
    once per process and every caller receives the same object, so callers
    should treat the returned structure as read-only.
    """
    with open(TASK_CONFIG_PATH, encoding="utf-8") as f:
        return json.load(f)
@cache
 def get_task_info(task_type, task_id):
    task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
    if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
    # Check if the status is already cached
    current_time = time.time()
    last_cache_time = None
-    if cache_key in TASK_STATUS_CACHE:
+    if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
-        cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+        cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
        last_cache_time = cached_time
        # If cached status is "Done", check if it's within the stability period
        if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
    # Cache the status if it is done or error
    if status.startswith("Done") or status == "Error":
        current_time = last_cache_time if last_cache_time else current_time
-        TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
+        TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
    return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
        "status": task_status
    })
@app.route('/api/config')
def api_config():
    """Return the monitor's configuration globals as JSON.

    Deprecated: use /api/current-config instead.
    """
    return jsonify(
        task_config_path=TASK_CONFIG_PATH,
        results_base_path=RESULTS_BASE_PATH,
        action_space=ACTION_SPACE,
        observation_type=OBSERVATION_TYPE,
        model_name=MODEL_NAME,
        max_steps=MAX_STEPS,
        examples_base_path=EXAMPLES_BASE_PATH,
    )
@app.route('/api/available-configs')
def api_available_configs():
    """List every configuration found under RESULTS_BASE_PATH as JSON.

    A configuration is a directory chain
    <action_space>/<observation_type>/<model_name>; each one is reported as
    an object with the keys "action_space", "observation_type",
    "model_name" and "path".

    Directory entries are sorted at every level because os.listdir()
    returns names in arbitrary order; sorting keeps the response order
    stable between requests.

    Scanning is best-effort: if the filesystem raises OSError part-way
    through, the error is printed and the configurations collected so far
    are returned. A missing base directory yields an empty list.
    """
    def _subdirs(parent):
        # Yield (name, path) for each direct subdirectory of parent, sorted by name.
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    configs = []

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
                for obs_type, obs_path in _subdirs(action_space_path):
                    for model_name, model_path in _subdirs(obs_path):
                        configs.append({
                            "action_space": action_space,
                            "observation_type": obs_type,
                            "model_name": model_name,
                            "path": model_path
                        })
        except OSError as e:
            print(f"Error scanning results directory: {e}")

    return jsonify(configs)
@app.route('/api/current-config')
def api_current_config():
    """Return the currently selected configuration and its results path as JSON."""
    return jsonify(
        action_space=ACTION_SPACE,
        observation_type=OBSERVATION_TYPE,
        model_name=MODEL_NAME,
        max_steps=MAX_STEPS,
        results_path=RESULTS_PATH,
    )
@app.route('/api/set-config', methods=['POST'])
def api_set_config():
    """Switch the monitor to another configuration.

    Expects a JSON object that may contain "action_space",
    "observation_type" and "model_name"; omitted keys keep their current
    value. On success the configuration globals and RESULTS_PATH are
    updated, a status cache for the new results path is created if absent,
    and the resulting configuration is returned as JSON.

    Returns HTTP 400 with {"error": ...} when the body is missing, is not
    a JSON object, or a supplied value is not a plain directory name. The
    values are joined into a filesystem path, so names containing path
    separators, NUL bytes, or equal to "", "." or ".." are rejected to
    keep the path inside RESULTS_BASE_PATH.
    """
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

    def _is_safe_component(value):
        # True when value can only name a direct child directory.
        if not isinstance(value, str) or value in ("", ".", ".."):
            return False
        forbidden = {"/", "\\", "\x00", os.sep}
        if os.altsep:
            forbidden.add(os.altsep)
        return not any(token in value for token in forbidden)

    # silent=True: a malformed or non-JSON body yields None, so the client
    # gets the JSON error below instead of Flask's default error page.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({"error": "No data provided"}), 400
    if not isinstance(data, dict):
        return jsonify({"error": "Expected a JSON object"}), 400

    # Validate only what the client supplied; current values are kept as-is.
    for field in ('action_space', 'observation_type', 'model_name'):
        if field in data and not _is_safe_component(data[field]):
            return jsonify({"error": f"Invalid value for {field}"}), 400

    # Update global variables
    ACTION_SPACE = data.get('action_space', ACTION_SPACE)
    OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
    MODEL_NAME = data.get('model_name', MODEL_NAME)

    # Update results path
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

    if RESULTS_PATH not in TASK_STATUS_CACHE:
        # Initialize cache for this results path
        TASK_STATUS_CACHE[RESULTS_PATH] = {}

    return jsonify({
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH
    })
 if __name__ == '__main__':
    # Check if necessary directories exist
    if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
    port = 8080
    debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
-    app.run(host=host, port=port, debug=debug)
+    app.run(host=host, port=port, debug=debug, threaded=True)
--- a/monitor/static/index.css
+++ b/monitor/static/index.css
@@ -1,5 +1,63 @@
 /* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
 body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
 .layout-container {
    position: relative;
    max-width: 1200px;
    margin: 20px auto;
    padding: 0 20px;
 }
 .main-content {
    background: #fff;
    border-radius: 14px;
    box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    padding: 36px 44px;
 }
 /* Floating Config Sidebar */
 .config-sidebar {
    position: fixed;
    top: 20px;
    left: -280px;
    width: 300px;
    height: calc(100vh - 40px);
    z-index: 1000;
    transition: left 0.3s ease;
 }
 /* Slide the panel in on hover, and keep it in view while a control inside
    it (the configuration <select>) has focus, so keyboard users can reach it
    and it does not slide away when the pointer leaves during a selection. */
 .config-sidebar:hover,
 .config-sidebar:focus-within {
    left: 0;
 }
 .config-toggle-btn {
    position: absolute;
    right: -50px;
    top: 50%;
    transform: translateY(-50%);
    width: 50px;
    height: 50px;
    background: linear-gradient(135deg, #007bff, #0056b3);
    border-radius: 0 25px 25px 0;
    display: flex;
    align-items: center;
    justify-content: center;
    color: white;
    font-size: 1.2em;
    cursor: pointer;
    box-shadow: 2px 0 10px rgba(0,0,0,0.2);
    transition: all 0.3s ease;
 }
 .config-toggle-btn:hover {
    background: linear-gradient(135deg, #0056b3, #004085);
    transform: translateY(-50%) scale(1.05);
 }
 .config-sidebar:hover .config-toggle-btn {
    opacity: 0.8;
 }
 .main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
 h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
 h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    text-shadow: 0 1px 2px rgba(0,0,0,0.05);
 }
 .accuracy-percentage {
    font-size: 0.7em;
    font-weight: 600;
    color: #ffffff;
    margin-left: 8px;
    background: rgba(255, 255, 255, 0.1);
    padding: 4px 8px;
    border-radius: 12px;
    display: inline-block;
    vertical-align: middle;
 }
 .stat-card span {
    font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
 .task-type-stats {
    display: flex;
    gap: 16px;
    flex-wrap: wrap;
    gap: 8px;
    align-items: center;
 }
 .task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    color: #b71c1c;
 }
 /* Task type statistics styles */
 .task-stat.score {
    color: #ffc107;
    background: rgba(255, 193, 7, 0.1);
 }
 .task-stat.steps {
    color: #17a2b8;
    background: rgba(23, 162, 184, 0.1);
 }
 .task-stat.rate {
    color: #28a745;
    background: rgba(40, 167, 69, 0.1);
 }
 .tasks-container {
    padding: 20px;
    transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    background: #a5c7e5;
 }
 /* Configuration Panel Styles */
 .config-panel {
    background: #fff;
    border-radius: 0 14px 14px 0;
    box-shadow: 0 8px 32px rgba(0,0,0,0.15);
    overflow: hidden;
    height: 100%;
    display: flex;
    flex-direction: column;
 }
 .config-header {
    display: flex;
    align-items: center;
    padding: 16px 20px;
    background: linear-gradient(135deg, #6c757d, #495057);
    color: white;
    flex-shrink: 0;
 }
 .config-header i {
    margin-right: 10px;
    font-size: 1.1em;
 }
 .config-header span {
    font-weight: 600;
    font-size: 1.1em;
 }
 .config-content {
    padding: 20px;
    flex: 1;
    overflow-y: auto;
 }
 .config-selector {
    margin-bottom: 20px;
    padding-bottom: 15px;
    border-bottom: 1px solid #dee2e6;
 }
 .selector-item {
    display: flex;
    flex-direction: column;
    gap: 8px;
 }
 .selector-item label {
    font-weight: 600;
    color: #495057;
    font-size: 0.9em;
    text-transform: uppercase;
    letter-spacing: 0.5px;
 }
 .selector-item select {
    padding: 8px 12px;
    border: 2px solid #e9ecef;
    border-radius: 6px;
    background: white;
    font-size: 0.9em;
    color: #495057;
    cursor: pointer;
    transition: all 0.3s ease;
 }
 .selector-item select:focus {
    outline: none;
    border-color: #007bff;
    box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
 }
 .selector-item select:hover {
    border-color: #007bff;
 }
 .config-list {
    display: flex;
    flex-direction: column;
    gap: 15px;
 }
 .config-item {
    display: flex;
    flex-direction: column;
    background: #f8f9fa;
    padding: 12px;
    border-radius: 8px;
    border-left: 4px solid #007bff;
    transition: all 0.3s ease;
 }
 .config-item:hover {
    transform: translateX(3px);
    box-shadow: 0 4px 12px rgba(0,123,255,0.15);
 }
 /* Small uppercase caption shown above each configuration value.
    Each property is declared once, with the value that previously won the
    cascade among the duplicated declarations. */
 .config-label {
    font-weight: 600;
    color: #495057;
    font-size: 0.85em;
    margin-bottom: 6px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
 }
 .config-value {
    color: #007bff;
    font-family: 'Courier New', monospace;
    font-size: 0.9em;
    font-weight: 600;
    word-break: break-word;
 }
 .config-path {
    font-size: 0.8em;
    line-height: 1.3;
 }
 /* Responsive design for sidebar layout */
@media (max-width: 1024px) {
    .config-sidebar {
        left: -250px;
        width: 250px;
    }
    .config-toggle-btn {
        right: -40px;
        width: 40px;
        height: 40px;
        font-size: 1em;
    }
 }
@media (max-width: 768px) {
    .layout-container {
        padding: 0 10px;
    }
    .main-content {
        padding: 20px 25px;
    }
    .config-sidebar {
        left: -220px;
        width: 220px;
        height: calc(100vh - 20px);
        top: 10px;
    }
    .config-toggle-btn {
        right: -35px;
        width: 35px;
        height: 35px;
        font-size: 0.9em;
    }
    .config-content {
        padding: 15px;
    }
    .config-item {
        padding: 10px;
    }
 }
--- a/monitor/static/index.js
+++ b/monitor/static/index.js
@@ -1,5 +1,8 @@
 document.addEventListener('DOMContentLoaded', () => {
-    fetchTasks();
+    fetchAvailableConfigs().then(() => {
        fetchConfig();
        fetchTasks();
    });
    // Bind filter functionality
    document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
    document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
 let allTaskData = null;
 let currentFilter = 'all';
 let availableConfigs = [];
 let currentConfig = null;
 let categoryStats = {};
 function refreshPage() {
    // Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
    fetch('/api/tasks/brief')
        .then(response => response.json())
        .then(data => {
            // Update stored data
            allTaskData = data;
            categoryStats = calculateCategoryStats(data);
            // Only update statistics and task status, do not fully re-render
            updateStatistics(data);
            updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
        .then(response => response.json())
        .then(data => {
            allTaskData = data;
            categoryStats = calculateCategoryStats(data);
            renderTasks(data);
            updateStatistics(data);
        })
@@ -208,13 +215,15 @@ function updateStatistics(data) {
    document.getElementById('completed-tasks').textContent = completedTasks;
    document.getElementById('error-tasks').textContent = errorTasks;
-    // Update score display with formatted score
+    // Update score display with formatted score and accuracy percentage
    const scoreDisplay = document.getElementById('score-display');
    if (completedTasks > 0) {
        const scoreFormatted = totalScore.toFixed(2);
-        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
+        const averageScore = totalScore / completedTasks;
        const accuracyPercentage = (averageScore * 100).toFixed(1);
        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
    } else {
-        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
+        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
    }
    // Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
        // Create header with task type name and statistics
        const typeHeader = document.createElement('div');
        typeHeader.className = 'task-type-header';
        // Get category stats for this task type
        const stats = categoryStats[taskType] || {};
        typeHeader.innerHTML = `
            <span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
            <div class="task-type-stats">
@@ -286,6 +299,9 @@ function renderTasks(data) {
                <span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
                <span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
                <span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
                ${stats.avg_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
                ${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
                ${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
            </div>
        `;
        typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
        container.appendChild(typeSection);
    });
 }
-// add auto-refresh with time interval 10 seconds
+
-setInterval(() => {
+function fetchAvailableConfigs() {
-        refreshPage();
+    return fetch('/api/available-configs')
-}, 10000); // 10 seconds interval
+        .then(response => response.json())
        .then(data => {
            availableConfigs = data;
            populateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching available configs:', error);
            return [];
        });
 }
 /**
  * Rebuild the #config-select dropdown from `availableConfigs`.
  * Each option's value is the config's index in that array; when the array
  * is empty a single placeholder option is shown instead.
  */
 function populateConfigSelect() {
    const dropdown = document.getElementById('config-select');
    dropdown.innerHTML = '';

    if (availableConfigs.length === 0) {
        dropdown.innerHTML = '<option value="">No configurations found in results directory</option>';
        return;
    }

    // Add available configurations
    for (const [position, cfg] of availableConfigs.entries()) {
        const entry = document.createElement('option');
        entry.value = position;
        entry.textContent = `${cfg.action_space} / ${cfg.observation_type} / ${cfg.model_name}`;
        dropdown.appendChild(entry);
    }
 }
 /**
  * Handle a change of the #config-select dropdown: POST the selected entry
  * of `availableConfigs` to /api/set-config, then show the configuration the
  * server reports and reload the task list.
  *
  * Does nothing when the selected value is empty or is not a valid index
  * into `availableConfigs`. Network failures and non-2xx responses are
  * logged and reported through displayConfigError().
  */
 function changeConfiguration() {
    const select = document.getElementById('config-select');

    // The placeholder option has an empty value; Number('') would be 0.
    if (select.value === '') {
        return;
    }

    // select.value is a string; convert once so the range checks are numeric.
    const selectedIndex = Number(select.value);
    if (!Number.isInteger(selectedIndex) || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
        return;
    }

    const selectedConfig = availableConfigs[selectedIndex];

    // Send configuration change request
    fetch('/api/set-config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(selectedConfig)
    })
    .then(response => {
        // An error payload must not be rendered as if it were a configuration.
        if (!response.ok) {
            throw new Error(`Failed to set config: HTTP ${response.status}`);
        }
        return response.json();
    })
    .then(data => {
        currentConfig = data;
        displayConfig(data);
        // Refresh tasks with new configuration
        fetchTasks();
    })
    .catch(error => {
        console.error('Error setting config:', error);
        displayConfigError();
    });
 }
 /**
  * Fetch the active configuration from /api/current-config, store it in
  * `currentConfig`, display it and sync the dropdown selection.
  *
  * @returns {Promise<Object|undefined>} resolves to the configuration, or to
  *   undefined when the request fails (the failure is logged and reported
  *   through displayConfigError()).
  */
 function fetchConfig() {
    return fetch('/api/current-config')
        .then(response => {
            // Treat HTTP errors as failures instead of parsing their body.
            if (!response.ok) {
                throw new Error(`Failed to fetch config: HTTP ${response.status}`);
            }
            return response.json();
        })
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            updateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching config:', error);
            displayConfigError();
        });
 }
 /**
  * Point the #config-select dropdown at the entry of `availableConfigs` that
  * matches `currentConfig` (same action space, observation type and model
  * name). Falls back to the first entry, with a console warning, when no
  * entry matches. Does nothing until both values have been loaded.
  */
 function updateConfigSelect() {
    if (!currentConfig || availableConfigs.length === 0) {
        return;
    }

    const dropdown = document.getElementById('config-select');
    const isCurrent = candidate =>
        candidate.action_space === currentConfig.action_space &&
        candidate.observation_type === currentConfig.observation_type &&
        candidate.model_name === currentConfig.model_name;
    const matchIndex = availableConfigs.findIndex(isCurrent);

    if (matchIndex !== -1) {
        dropdown.value = matchIndex;
        return;
    }

    // availableConfigs is known to be non-empty here, so index 0 exists.
    dropdown.value = 0;
    console.warn('Current config not found in available configs, defaulting to first available config');
 }
 /**
  * Write the given configuration into the sidebar's value fields.
  * Missing or falsy values are shown as 'N/A'.
  *
  * @param {Object} config - object with action_space, observation_type,
  *   model_name and max_steps properties.
  */
 function displayConfig(config) {
    const fieldValues = {
        'action-space': config.action_space,
        'observation-type': config.observation_type,
        'model-name': config.model_name,
        'max-steps': config.max_steps,
    };

    for (const [elementId, value] of Object.entries(fieldValues)) {
        const field = document.getElementById(elementId);
        field.textContent = value || 'N/A';
        // displayConfigError() sets an inline red colour on these fields;
        // clear it so a later successful load is not shown as an error.
        field.style.color = '';
    }
 }
 /**
  * Mark every .config-value element as failed: replace its text with
  * 'Error loading' and colour it red.
  */
 function displayConfigError() {
    for (const field of document.querySelectorAll('.config-value')) {
        field.textContent = 'Error loading';
        field.style.color = '#dc3545';
    }
 }
 /**
  * Compute per-task-type summary statistics.
  *
  * @param {Object} data - maps each task type to an array of tasks; every
  *   task carries a `status` object with `status`, and optionally `result`
  *   and `progress`.
  * @returns {Object} maps each task type to its counts (total, completed,
  *   running, error), total and average score, average steps and completion
  *   rate in percent, each rounded for display.
  *
  * Only results that parse to a number within [0, 1] add to the score, and
  * only completed tasks with positive progress count towards average steps.
  */
 function calculateCategoryStats(data) {
    const DONE_STATES = new Set(['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)']);
    const ACTIVE_STATES = new Set(['Running', 'Preparing', 'Initializing']);
    // Round to the precision implied by factor (100 -> two decimals).
    const roundTo = (value, factor) => Math.round(value * factor) / factor;

    const summary = {};

    for (const [taskType, tasks] of Object.entries(data)) {
        let doneCount = 0;
        let activeCount = 0;
        let failedCount = 0;
        let scoreSum = 0;
        let stepSum = 0;
        let stepSamples = 0;

        for (const task of tasks) {
            const info = task.status;
            const state = info.status;

            if (DONE_STATES.has(state)) {
                doneCount += 1;

                // Calculate score if available
                if (info.result) {
                    const score = parseFloat(info.result);
                    if (!isNaN(score) && score >= 0 && score <= 1) {
                        scoreSum += score;
                    }
                }

                // Calculate steps for completed tasks
                if (info.progress && info.progress > 0) {
                    stepSum += info.progress;
                    stepSamples += 1;
                }
            } else if (ACTIVE_STATES.has(state)) {
                activeCount += 1;
            } else if (state === 'Error') {
                failedCount += 1;
            }
        }

        // Calculate averages, guarding each division against an empty denominator
        const meanScore = doneCount > 0 ? scoreSum / doneCount : 0;
        const meanSteps = stepSamples > 0 ? stepSum / stepSamples : 0;
        const donePercent = tasks.length > 0 ? (doneCount / tasks.length * 100) : 0;

        summary[taskType] = {
            total_tasks: tasks.length,
            completed_tasks: doneCount,
            running_tasks: activeCount,
            error_tasks: failedCount,
            total_score: roundTo(scoreSum, 100),
            avg_score: roundTo(meanScore, 10000),
            avg_steps: roundTo(meanSteps, 10),
            completion_rate: roundTo(donePercent, 10)
        };
    }

    return summary;
 }
--- a/monitor/templates/index.html
+++ b/monitor/templates/index.html
@@ -12,19 +12,62 @@
    <link rel="stylesheet" href="/static/index.css">
 </head>
 <body>
-    <div class="main-container">
+    <div class="layout-container">
-        <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
+        <!-- Floating Config Button and Sidebar -->
-        
+        <div class="config-sidebar" id="config-sidebar">
-        <!-- Score Display Banner -->
+            <div class="config-toggle-btn">
-        <div class="score-banner">
+                <i class="fas fa-cogs"></i>
-            <div class="score-content">
+            </div>
-                <i class="fas fa-star"></i>
+            <div class="config-panel">
-                <span class="score-label">Score:</span>
+                <div class="config-header">
-                <span id="score-display" class="score-value">Loading...</span>
+                    <i class="fas fa-cogs"></i>
                    <span>Configuration</span>
                </div>
                <div class="config-content">
                    <div class="config-selector">
                        <div class="selector-item">
                            <label for="config-select">Select Configuration:</label>
                            <select id="config-select" onchange="changeConfiguration()">
                                <option value="">Loading configurations...</option>
                            </select>
                        </div>
                    </div>
                    <div class="config-list">
                        <div class="config-item">
                            <span class="config-label">Action Space:</span>
                            <span class="config-value" id="action-space">Loading...</span>
                        </div>
                        <div class="config-item">
                            <span class="config-label">Observation:</span>
                            <span class="config-value" id="observation-type">Loading...</span>
                        </div>
                        <div class="config-item">
                            <span class="config-label">Model:</span>
                            <span class="config-value" id="model-name">Loading...</span>
                        </div>
                        <div class="config-item">
                            <span class="config-label">Max Steps:</span>
                            <span class="config-value" id="max-steps">Loading...</span>
                        </div>
                    </div>
                </div>
            </div>
        </div>
-        <div class="dashboard-stats">
+        <!-- Main Content -->
        <div class="main-content">
            <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
            <!-- Score Display Banner -->
            <div class="score-banner">
                <div class="score-content">
                    <i class="fas fa-star"></i>
                    <span class="score-label">Score:</span>
                    <span id="score-display" class="score-value">Loading...</span>
                </div>
            </div>
            <div class="dashboard-stats">
            <div class="stat-card">
                <i class="fas fa-running"></i>
                <span id="active-tasks">Loading...</span>
@@ -46,10 +89,11 @@
                <div class="stat-label">Total Tasks</div>
            </div>
        </div>
-        <div id="task-container">
+            <div id="task-container">
-            <div class="loading-spinner">
+                <div class="loading-spinner">
-                <div class="spinner"></div>
+                    <div class="spinner"></div>
-                <div>Loading task data...</div>
+                    <div>Loading task data...</div>
                </div>
            </div>
        </div>
    </div>