diff --git a/mm_agents/anthropic/main.py b/mm_agents/anthropic/main.py
index 4cffc16..493a7bb 100644
--- a/mm_agents/anthropic/main.py
+++ b/mm_agents/anthropic/main.py
@@ -369,9 +369,10 @@ class AnthropicAgent:
)
except (APIError, APIStatusError, APIResponseValidationError) as e:
- self.logger.exception(f"Anthropic API error: {str(e)}")
+ logger.exception(f"Anthropic API error: {str(e)}")
try:
- self.logger.warning("Retrying with backup API key...")
+ logger.warning("Retrying with backup API key...")
+
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
tools=tools,
betas=betas,
)
- self.logger.info("Successfully used backup API key")
+ logger.info("Successfully used backup API key")
except Exception as backup_e:
- self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
+ logger.exception(f"Backup API call also failed: {str(backup_e)}")
return None, None
except Exception as e:
- self.logger.exception(f"Error in Anthropic API: {str(e)}")
+ logger.exception(f"Error in Anthropic API: {str(e)}")
return None, None
response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
actions = ["DONE"]
return reasonings, actions
- def reset(self, *args, **kwargs):
+ def reset(self, _logger = None, *args, **kwargs):
"""
Reset the agent's state.
"""
+ global logger
+ if _logger:
+ logger = _logger
+ else:
+ logger = logging.getLogger("desktopenv.agent")
self.messages = []
- self.logger.info(f"{self.class_name} reset.")
\ No newline at end of file
+ logger.info(f"{self.class_name} reset.")
+
diff --git a/monitor/.env b/monitor/.env
index 05618af..1969ef7 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -4,11 +4,11 @@
# Monitor configuration
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_all
+RESULTS_BASE_PATH=../results
ACTION_SPACE=pyautogui
OBSERVATION_TYPE=screenshot
MODEL_NAME=computer-use-preview
MAX_STEPS=150
FLASK_PORT=80
FLASK_HOST=0.0.0.0
-FLASK_DEBUG=true
\ No newline at end of file
+FLASK_DEBUG=false
\ No newline at end of file
diff --git a/monitor/main.py b/monitor/main.py
index 1657a78..acdf95a 100644
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
+from functools import cache
import os
import json
import time
+import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv
+
# Load environment variables from .env file
load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
+def initialize_default_config():
+    """Choose the default configuration from the results directory.
+
+    Scans RESULTS_BASE_PATH/<action_space>/<observation_type>/<model_name> and
+    adopts the first combination in which all three levels are directories,
+    storing it in the ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME and
+    RESULTS_PATH globals. Listings are sorted because os.listdir() returns
+    entries in arbitrary order; without sorting, the chosen default could
+    differ between runs. When no combination exists, or the scan fails,
+    RESULTS_PATH is built from the values those globals already hold.
+    """
+    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+    if os.path.exists(RESULTS_BASE_PATH):
+        try:
+            for action_space in sorted(os.listdir(RESULTS_BASE_PATH)):
+                action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
+                if not os.path.isdir(action_space_path):
+                    continue
+                for obs_type in sorted(os.listdir(action_space_path)):
+                    obs_path = os.path.join(action_space_path, obs_type)
+                    if not os.path.isdir(obs_path):
+                        continue
+                    for model_name in sorted(os.listdir(obs_path)):
+                        model_path = os.path.join(obs_path, model_name)
+                        if not os.path.isdir(model_path):
+                            continue
+                        # The first complete combination becomes the default
+                        ACTION_SPACE = action_space
+                        OBSERVATION_TYPE = obs_type
+                        MODEL_NAME = model_name
+                        RESULTS_PATH = model_path
+                        print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+                        return
+        except OSError as e:
+            # Best effort: an unreadable directory falls through to the fallback below
+            print(f"Error scanning results directory for default config: {e}")
+
+    # Fallback to original environment-based path if no configs found
+    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+
+# Initialize default configuration
+initialize_default_config()
+
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+if RESULTS_PATH not in TASK_STATUS_CACHE:
+ # Initialize cache for this results path
+ TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+@cache
def load_task_list():
with open(TASK_CONFIG_PATH, 'r') as f:
return json.load(f)
+@cache
def get_task_info(task_type, task_id):
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
# Check if the status is already cached
current_time = time.time()
last_cache_time = None
- if cache_key in TASK_STATUS_CACHE:
- cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+ if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
+ cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
last_cache_time = cached_time
# If cached status is "Done", check if it's within the stability period
if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
# Cache the status if it is done or error
if status.startswith("Done") or status == "Error":
current_time = last_cache_time if last_cache_time else current_time
- TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
+ TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
"status": task_status
})
+@app.route('/api/config')
+def api_config():
+    """Return the monitor's configuration values as JSON.
+
+    Deprecated: use /api/current-config instead.
+    """
+    return jsonify({
+        "task_config_path": TASK_CONFIG_PATH,
+        "results_base_path": RESULTS_BASE_PATH,
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "examples_base_path": EXAMPLES_BASE_PATH,
+    })
+
+@app.route('/api/available-configs')
+def api_available_configs():
+    """List every configuration found under the results directory.
+
+    A configuration is a RESULTS_BASE_PATH/<action_space>/<observation_type>/
+    <model_name> path whose three levels are all directories. Listings are
+    sorted so the response order is stable across requests (os.listdir()
+    returns entries in arbitrary order). If the scan fails part-way, the
+    error is printed and the configurations collected so far are returned.
+
+    Returns a JSON array of objects with the keys action_space,
+    observation_type, model_name and path.
+    """
+    configs = []
+
+    if os.path.exists(RESULTS_BASE_PATH):
+        try:
+            # Scan action spaces
+            for action_space in sorted(os.listdir(RESULTS_BASE_PATH)):
+                action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
+                if not os.path.isdir(action_space_path):
+                    continue
+                # Scan observation types
+                for obs_type in sorted(os.listdir(action_space_path)):
+                    obs_path = os.path.join(action_space_path, obs_type)
+                    if not os.path.isdir(obs_path):
+                        continue
+                    # Scan model names
+                    for model_name in sorted(os.listdir(obs_path)):
+                        model_path = os.path.join(obs_path, model_name)
+                        if not os.path.isdir(model_path):
+                            continue
+                        configs.append({
+                            "action_space": action_space,
+                            "observation_type": obs_type,
+                            "model_name": model_name,
+                            "path": model_path
+                        })
+        except OSError as e:
+            print(f"Error scanning results directory: {e}")
+
+    return jsonify(configs)
+
+@app.route('/api/current-config')
+def api_current_config():
+    """Return the configuration currently in effect as JSON."""
+    current = {
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "results_path": RESULTS_PATH,
+    }
+    return jsonify(current)
+
+@app.route('/api/set-config', methods=['POST'])
+def api_set_config():
+    """Switch the monitor to another action space / observation type / model.
+
+    Expects a JSON object whose optional keys 'action_space',
+    'observation_type' and 'model_name' override the current values; keys
+    that are absent keep their current value. Every supplied value is joined
+    into RESULTS_PATH, so it must be a single path component: non-string
+    values, empty strings, '.', '..' and anything containing a path separator
+    are rejected, which keeps a request from pointing RESULTS_PATH outside
+    RESULTS_BASE_PATH. No global is modified unless all values are valid.
+
+    Returns the resulting configuration as JSON, or a JSON error object with
+    status 400.
+    """
+    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+    # silent=True: a missing or malformed JSON body yields None and gets the JSON 400 below
+    data = request.get_json(silent=True)
+    if not data or not isinstance(data, dict):
+        return jsonify({"error": "No data provided"}), 400
+
+    # Validate only what the client supplied; current values are left as they are
+    for key in ('action_space', 'observation_type', 'model_name'):
+        if key not in data:
+            continue
+        value = data[key]
+        if (
+            not isinstance(value, str)
+            or value in ("", ".", "..")
+            or "/" in value
+            or "\\" in value
+        ):
+            return jsonify({"error": f"Invalid value for {key}"}), 400
+
+    # Update global variables
+    ACTION_SPACE = data.get('action_space', ACTION_SPACE)
+    OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
+    MODEL_NAME = data.get('model_name', MODEL_NAME)
+
+    # Update results path
+    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+
+    if RESULTS_PATH not in TASK_STATUS_CACHE:
+        # Initialize cache for this results path
+        TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+    return jsonify({
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "results_path": RESULTS_PATH
+    })
+
if __name__ == '__main__':
# Check if necessary directories exist
if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
port = 8080
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
- app.run(host=host, port=port, debug=debug)
\ No newline at end of file
+ app.run(host=host, port=port, debug=debug, threaded=True)
\ No newline at end of file
diff --git a/monitor/static/index.css b/monitor/static/index.css
index 0e20e4a..215bcbf 100644
--- a/monitor/static/index.css
+++ b/monitor/static/index.css
@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
+
+.layout-container {
+ position: relative;
+ max-width: 1200px;
+ margin: 20px auto;
+ padding: 0 20px;
+}
+
+.main-content {
+ background: #fff;
+ border-radius: 14px;
+ box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+ padding: 36px 44px;
+}
+
+/* Floating Config Sidebar */
+.config-sidebar {
+ position: fixed;
+ top: 20px;
+ left: -280px;
+ width: 300px;
+ height: calc(100vh - 40px);
+ z-index: 1000;
+ transition: left 0.3s ease;
+}
+
+.config-sidebar:hover {
+ left: 0;
+}
+
+.config-toggle-btn {
+ position: absolute;
+ right: -50px;
+ top: 50%;
+ transform: translateY(-50%);
+ width: 50px;
+ height: 50px;
+ background: linear-gradient(135deg, #007bff, #0056b3);
+ border-radius: 0 25px 25px 0;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ color: white;
+ font-size: 1.2em;
+ cursor: pointer;
+ box-shadow: 2px 0 10px rgba(0,0,0,0.2);
+ transition: all 0.3s ease;
+}
+
+.config-toggle-btn:hover {
+ background: linear-gradient(135deg, #0056b3, #004085);
+ transform: translateY(-50%) scale(1.05);
+}
+
+.config-sidebar:hover .config-toggle-btn {
+ opacity: 0.8;
+}
+
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
text-shadow: 0 1px 2px rgba(0,0,0,0.05);
}
+.accuracy-percentage {
+ font-size: 0.7em;
+ font-weight: 600;
+ color: #ffffff;
+ margin-left: 8px;
+ background: rgba(255, 255, 255, 0.1);
+ padding: 4px 8px;
+ border-radius: 12px;
+ display: inline-block;
+ vertical-align: middle;
+}
+
.stat-card span {
font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
.task-type-stats {
display: flex;
- gap: 16px;
flex-wrap: wrap;
+ gap: 8px;
+ align-items: center;
}
.task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
color: #b71c1c;
}
+/* Task type statistics styles */
+.task-stat.score {
+ color: #ffc107;
+ background: rgba(255, 193, 7, 0.1);
+}
+
+.task-stat.steps {
+ color: #17a2b8;
+ background: rgba(23, 162, 184, 0.1);
+}
+
+.task-stat.rate {
+ color: #28a745;
+ background: rgba(40, 167, 69, 0.1);
+}
+
.tasks-container {
padding: 20px;
transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
background: #a5c7e5;
}
+/* Configuration Panel Styles */
+.config-panel {
+ background: #fff;
+ border-radius: 0 14px 14px 0;
+ box-shadow: 0 8px 32px rgba(0,0,0,0.15);
+ overflow: hidden;
+ height: 100%;
+ display: flex;
+ flex-direction: column;
+}
+
+.config-header {
+ display: flex;
+ align-items: center;
+ padding: 16px 20px;
+ background: linear-gradient(135deg, #6c757d, #495057);
+ color: white;
+ flex-shrink: 0;
+}
+
+.config-header i {
+ margin-right: 10px;
+ font-size: 1.1em;
+}
+
+.config-header span {
+ font-weight: 600;
+ font-size: 1.1em;
+}
+
+.config-content {
+ padding: 20px;
+ flex: 1;
+ overflow-y: auto;
+}
+
+.config-selector {
+ margin-bottom: 20px;
+ padding-bottom: 15px;
+ border-bottom: 1px solid #dee2e6;
+}
+
+.selector-item {
+ display: flex;
+ flex-direction: column;
+ gap: 8px;
+}
+
+.selector-item label {
+ font-weight: 600;
+ color: #495057;
+ font-size: 0.9em;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+}
+
+.selector-item select {
+ padding: 8px 12px;
+ border: 2px solid #e9ecef;
+ border-radius: 6px;
+ background: white;
+ font-size: 0.9em;
+ color: #495057;
+ cursor: pointer;
+ transition: all 0.3s ease;
+}
+
+.selector-item select:focus {
+ outline: none;
+ border-color: #007bff;
+ box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
+}
+
+.selector-item select:hover {
+ border-color: #007bff;
+}
+
+.config-list {
+ display: flex;
+ flex-direction: column;
+ gap: 15px;
+}
+
+.config-item {
+ display: flex;
+ flex-direction: column;
+ background: #f8f9fa;
+ padding: 12px;
+ border-radius: 8px;
+ border-left: 4px solid #007bff;
+ transition: all 0.3s ease;
+}
+
+.config-item:hover {
+ transform: translateX(3px);
+ box-shadow: 0 4px 12px rgba(0,123,255,0.15);
+}
+
+/* Small, bold, uppercase caption text. */
+.config-label {
+    font-weight: 600;
+    color: #495057;
+    font-size: 0.85em;
+    margin-bottom: 6px;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.config-value {
+ color: #007bff;
+ font-family: 'Courier New', monospace;
+ font-size: 0.9em;
+ font-weight: 600;
+ word-break: break-word;
+}
+
+.config-path {
+ font-size: 0.8em;
+ line-height: 1.3;
+}
+
+/* Responsive design for sidebar layout */
+@media (max-width: 1024px) {
+ .config-sidebar {
+ left: -250px;
+ width: 250px;
+ }
+
+ .config-toggle-btn {
+ right: -40px;
+ width: 40px;
+ height: 40px;
+ font-size: 1em;
+ }
+}
+
+@media (max-width: 768px) {
+ .layout-container {
+ padding: 0 10px;
+ }
+
+ .main-content {
+ padding: 20px 25px;
+ }
+
+ .config-sidebar {
+ left: -220px;
+ width: 220px;
+ height: calc(100vh - 20px);
+ top: 10px;
+ }
+
+ .config-toggle-btn {
+ right: -35px;
+ width: 35px;
+ height: 35px;
+ font-size: 0.9em;
+ }
+
+ .config-content {
+ padding: 15px;
+ }
+
+ .config-item {
+ padding: 10px;
+ }
+}
+
diff --git a/monitor/static/index.js b/monitor/static/index.js
index 4dd34e5..ed2910e 100644
--- a/monitor/static/index.js
+++ b/monitor/static/index.js
@@ -1,5 +1,8 @@
document.addEventListener('DOMContentLoaded', () => {
- fetchTasks();
+ fetchAvailableConfigs().then(() => {
+ fetchConfig();
+ fetchTasks();
+ });
// Bind filter functionality
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
let allTaskData = null;
let currentFilter = 'all';
+let availableConfigs = [];
+let currentConfig = null;
+let categoryStats = {};
function refreshPage() {
// Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
fetch('/api/tasks/brief')
.then(response => response.json())
.then(data => {
- // Update stored data
allTaskData = data;
+ categoryStats = calculateCategoryStats(data);
// Only update statistics and task status, do not fully re-render
updateStatistics(data);
updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
.then(response => response.json())
.then(data => {
allTaskData = data;
+ categoryStats = calculateCategoryStats(data);
renderTasks(data);
updateStatistics(data);
})
@@ -208,13 +215,15 @@ function updateStatistics(data) {
document.getElementById('completed-tasks').textContent = completedTasks;
document.getElementById('error-tasks').textContent = errorTasks;
- // Update score display with formatted score
+ // Update score display with formatted score and accuracy percentage
const scoreDisplay = document.getElementById('score-display');
if (completedTasks > 0) {
const scoreFormatted = totalScore.toFixed(2);
- scoreDisplay.innerHTML = `${scoreFormatted} / ${completedTasks}`;
+ const averageScore = totalScore / completedTasks;
+ const accuracyPercentage = (averageScore * 100).toFixed(1);
+ scoreDisplay.innerHTML = `${scoreFormatted} / ${completedTasks} (${accuracyPercentage}%)`;
} else {
- scoreDisplay.innerHTML = '0.00 / 0';
+ scoreDisplay.innerHTML = '0.00 / 0 (0.0%)';
}
// Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
// Create header with task type name and statistics
const typeHeader = document.createElement('div');
typeHeader.className = 'task-type-header';
+
+ // Get category stats for this task type
+ const stats = categoryStats[taskType] || {};
+
typeHeader.innerHTML = `
${taskType}
@@ -286,6 +299,9 @@ function renderTasks(data) {
${tasks.length} total
${runningCount} active
${completedCount} completed
+ ${stats.avg_score ? ` ${stats.avg_score} avg score` : ''}
+ ${stats.avg_steps ? ` ${stats.avg_steps} avg steps` : ''}
+ ${stats.completion_rate ? ` ${stats.completion_rate}% completed` : ''}
`;
typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
container.appendChild(typeSection);
});
}
-// add auto-refresh with time interval 10 seconds
-setInterval(() => {
- refreshPage();
-}, 10000); // 10 seconds interval
+
+// Load the configurations the server offers, fill the dropdown, and resolve
+// with the list. Any failure is logged and resolves with an empty array.
+function fetchAvailableConfigs() {
+    const onLoaded = (configs) => {
+        availableConfigs = configs;
+        populateConfigSelect();
+        return configs;
+    };
+    const onFailed = (error) => {
+        console.error('Error fetching available configs:', error);
+        return [];
+    };
+
+    return fetch('/api/available-configs')
+        .then((response) => response.json())
+        .then(onLoaded)
+        .catch(onFailed);
+}
+
+// Rebuild the 'config-select' dropdown from the availableConfigs array.
+function populateConfigSelect() {
+ const select = document.getElementById('config-select');
+ select.innerHTML = '';
+
+ if (availableConfigs.length === 0) {
+ // NOTE(review): this repeats the reset above, so an empty list simply leaves
+ // the dropdown empty; confirm whether a placeholder option was intended here.
+ select.innerHTML = '';
+ return;
+ }
+
+ // Add available configurations
+ // Each option's value is the configuration's index in availableConfigs.
+ availableConfigs.forEach((config, index) => {
+ const option = document.createElement('option');
+ option.value = index;
+ option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
+ select.appendChild(option);
+ });
+}
+
+// Apply the configuration chosen in the 'config-select' dropdown: POST it to
+// /api/set-config, show the values the server reports back, and reload tasks.
+// A rejected request is shown as an error instead of being treated as a config.
+function changeConfiguration() {
+    const select = document.getElementById('config-select');
+    // select.value is always a string; convert it once so the bounds check
+    // does not rely on implicit string/number coercion ('' would become 0).
+    const selectedIndex = select.value === '' ? NaN : Number(select.value);
+
+    if (!Number.isInteger(selectedIndex) || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
+        return;
+    }
+
+    const selectedConfig = availableConfigs[selectedIndex];
+
+    // Send configuration change request
+    fetch('/api/set-config', {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json',
+        },
+        body: JSON.stringify(selectedConfig)
+    })
+    .then(response => {
+        // A non-2xx reply carries an error payload, not a configuration;
+        // route it to the catch handler below.
+        if (!response.ok) {
+            throw new Error(`Server responded with status ${response.status}`);
+        }
+        return response.json();
+    })
+    .then(data => {
+        currentConfig = data;
+        displayConfig(data);
+        // Refresh tasks with new configuration
+        fetchTasks();
+    })
+    .catch(error => {
+        console.error('Error setting config:', error);
+        displayConfigError();
+    });
+}
+
+// Fetch the server's current configuration, render it, and sync the dropdown.
+// Resolves with the configuration, or with undefined after a logged failure.
+function fetchConfig() {
+    const applyConfig = (config) => {
+        currentConfig = config;
+        displayConfig(config);
+        updateConfigSelect();
+        return config;
+    };
+    const reportFailure = (error) => {
+        console.error('Error fetching config:', error);
+        displayConfigError();
+    };
+
+    return fetch('/api/current-config')
+        .then((response) => response.json())
+        .then(applyConfig)
+        .catch(reportFailure);
+}
+
+// Select the dropdown entry that matches currentConfig; when nothing matches,
+// fall back to the first entry and log a warning.
+function updateConfigSelect() {
+    if (!currentConfig || availableConfigs.length === 0) return;
+
+    const select = document.getElementById('config-select');
+    const isCurrent = (candidate) =>
+        candidate.action_space === currentConfig.action_space &&
+        candidate.observation_type === currentConfig.observation_type &&
+        candidate.model_name === currentConfig.model_name;
+    const matchIndex = availableConfigs.findIndex(isCurrent);
+
+    if (matchIndex === -1) {
+        // The guard at the top guarantees at least one entry exists here.
+        select.value = 0;
+        console.warn('Current config not found in available configs, defaulting to first available config');
+        return;
+    }
+
+    select.value = matchIndex;
+}
+
+// Write each configuration value into its page element, keyed by element id;
+// missing or falsy values are shown as 'N/A'.
+function displayConfig(config) {
+    const valuesById = {
+        'action-space': config.action_space,
+        'observation-type': config.observation_type,
+        'model-name': config.model_name,
+        'max-steps': config.max_steps,
+    };
+
+    Object.entries(valuesById).forEach(([elementId, value]) => {
+        document.getElementById(elementId).textContent = value || 'N/A';
+    });
+}
+
+// Mark every .config-value element as failed: red 'Error loading' text.
+function displayConfigError() {
+    for (const valueElement of document.querySelectorAll('.config-value')) {
+        valueElement.textContent = 'Error loading';
+        valueElement.style.color = '#dc3545';
+    }
+}
+
+// Summarise each task type in `data` (an object mapping task type to an array
+// of tasks): status counts, total and average score, average steps, and
+// completion rate. Returns an object keyed by task type.
+function calculateCategoryStats(data) {
+    const DONE_STATUSES = ['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'];
+    const ACTIVE_STATUSES = ['Running', 'Preparing', 'Initializing'];
+    // Round to a fixed number of decimals: factor 10 -> 1 decimal, 100 -> 2, ...
+    const roundTo = (value, factor) => Math.round(value * factor) / factor;
+
+    const stats = {};
+
+    for (const [taskType, tasks] of Object.entries(data)) {
+        const totalTasks = tasks.length;
+        let completedTasks = 0;
+        let runningTasks = 0;
+        let errorTasks = 0;
+        let totalScore = 0;
+        let totalSteps = 0;
+        let completedWithSteps = 0;
+
+        for (const task of tasks) {
+            const status = task.status.status;
+
+            if (DONE_STATUSES.includes(status)) {
+                completedTasks += 1;
+
+                // Only results that parse to a number within [0, 1] add to the score
+                if (task.status.result) {
+                    try {
+                        const score = parseFloat(task.status.result);
+                        if (!isNaN(score) && score >= 0 && score <= 1) {
+                            totalScore += score;
+                        }
+                    } catch (e) {
+                        // Ignore parsing errors
+                    }
+                }
+
+                // Steps are averaged over completed tasks that report progress
+                const steps = task.status.progress;
+                if (steps && steps > 0) {
+                    totalSteps += steps;
+                    completedWithSteps += 1;
+                }
+            } else if (ACTIVE_STATUSES.includes(status)) {
+                runningTasks += 1;
+            } else if (status === 'Error') {
+                errorTasks += 1;
+            }
+        }
+
+        // Averages default to 0 when there is nothing to divide by
+        const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
+        const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
+        const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;
+
+        stats[taskType] = {
+            total_tasks: totalTasks,
+            completed_tasks: completedTasks,
+            running_tasks: runningTasks,
+            error_tasks: errorTasks,
+            total_score: roundTo(totalScore, 100),
+            avg_score: roundTo(avgScore, 10000),
+            avg_steps: roundTo(avgSteps, 10),
+            completion_rate: roundTo(completionRate, 10)
+        };
+    }
+
+    return stats;
+}
diff --git a/monitor/templates/index.html b/monitor/templates/index.html
index 0c34f3c..ef91ab9 100644
--- a/monitor/templates/index.html
+++ b/monitor/templates/index.html
@@ -12,19 +12,62 @@
-
-
OSWorld Monitor System Online
-
-
-
-
-
-
Score:
-
Loading...
+
+
+
-
+
+
+
OSWorld Monitor System Online
+
+
+
+
+
+ Score:
+ Loading...
+
+
+
+
Loading...
@@ -46,10 +89,11 @@
Total Tasks
-
-
-
-
Loading task data...
+
+
+
+
Loading task data...
+