Feat/monitor (#254)

* feat: add claude support

* feat: add script for end-to-end evaluation with logging and task distribution

* feat&fix: add tool result handling and update model default in evaluation script

* chore: remove run_test_env.py script

* feat&fix: implement action parsing for tool calls and update default action space

* fix: update text formatting in action parsing and replace logger import

* feat&fix: implement action parsing for tool calls and add screen size handling

* feat: add setup instructions for Anthropic API integration

* feat: add notice about image size limitations for Anthropic API

* Delete test_env/logger.py

* Delete test_env/utils.py

* fix: update logger usage to use global logger and improve error handling

* feat&fix: add configuration management API endpoints and update UI for configuration selection

* feat&fix: update environment configuration, enhance task statistics, and improve UI responsiveness

* feat&fix: add configuration toggle button in UI and improve task loading performance

* feat&fix: add accuracy percentage display to score and style updates for UI
This commit is contained in:
Zilong Zhou
2025-07-14 13:43:41 +08:00
committed by GitHub
parent 0651495d88
commit 74b7c189af
6 changed files with 662 additions and 37 deletions

View File

@@ -369,9 +369,10 @@ class AnthropicAgent:
) )
except (APIError, APIStatusError, APIResponseValidationError) as e: except (APIError, APIStatusError, APIResponseValidationError) as e:
self.logger.exception(f"Anthropic API error: {str(e)}") logger.exception(f"Anthropic API error: {str(e)}")
try: try:
self.logger.warning("Retrying with backup API key...") logger.warning("Retrying with backup API key...")
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4) backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514": if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
tools=tools, tools=tools,
betas=betas, betas=betas,
) )
self.logger.info("Successfully used backup API key") logger.info("Successfully used backup API key")
except Exception as backup_e: except Exception as backup_e:
self.logger.exception(f"Backup API call also failed: {str(backup_e)}") logger.exception(f"Backup API call also failed: {str(backup_e)}")
return None, None return None, None
except Exception as e: except Exception as e:
self.logger.exception(f"Error in Anthropic API: {str(e)}") logger.exception(f"Error in Anthropic API: {str(e)}")
return None, None return None, None
response_params = _response_to_params(response) response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
actions = ["DONE"] actions = ["DONE"]
return reasonings, actions return reasonings, actions
def reset(self, *args, **kwargs): def reset(self, _logger = None, *args, **kwargs):
""" """
Reset the agent's state. Reset the agent's state.
""" """
global logger
if _logger:
logger = _logger
else:
logger = logging.getLogger("desktopenv.agent")
self.messages = [] self.messages = []
self.logger.info(f"{self.class_name} reset.") logger.info(f"{self.class_name} reset.")

View File

@@ -4,11 +4,11 @@
# Monitor configuration # Monitor configuration
TASK_CONFIG_PATH=../evaluation_examples/test_all.json TASK_CONFIG_PATH=../evaluation_examples/test_all.json
EXAMPLES_BASE_PATH=../evaluation_examples/examples EXAMPLES_BASE_PATH=../evaluation_examples/examples
RESULTS_BASE_PATH=../results_all RESULTS_BASE_PATH=../results
ACTION_SPACE=pyautogui ACTION_SPACE=pyautogui
OBSERVATION_TYPE=screenshot OBSERVATION_TYPE=screenshot
MODEL_NAME=computer-use-preview MODEL_NAME=computer-use-preview
MAX_STEPS=150 MAX_STEPS=150
FLASK_PORT=80 FLASK_PORT=80
FLASK_HOST=0.0.0.0 FLASK_HOST=0.0.0.0
FLASK_DEBUG=true FLASK_DEBUG=false

View File

@@ -1,14 +1,17 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from functools import cache
import os import os
import json import json
import time import time
import subprocess
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv from dotenv import load_dotenv
# Load environment variables from .env file # Load environment variables from .env file
load_dotenv() load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview") MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150")) MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
def initialize_default_config():
    """Pick the default configuration from the results directory.

    Walks ``RESULTS_BASE_PATH`` three levels deep
    (``<action_space>/<observation_type>/<model_name>``) and adopts the first
    combination in which all three levels are directories. Entries are
    visited in sorted order so the chosen default is the same on every
    start-up, independent of the arbitrary order ``os.listdir`` returns.

    If the base path does not exist, cannot be scanned, or holds no complete
    combination, the current ``ACTION_SPACE``, ``OBSERVATION_TYPE`` and
    ``MODEL_NAME`` values are kept and ``RESULTS_PATH`` is built from them.

    Side effects: rebinds the module globals ``ACTION_SPACE``,
    ``OBSERVATION_TYPE``, ``MODEL_NAME`` and ``RESULTS_PATH`` and prints the
    configuration that was selected.
    """
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

    def _subdirs(parent):
        # Yield (name, path) for each sub-directory, sorted so that
        # "first available" is deterministic.
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
                for obs_type, obs_path in _subdirs(action_space_path):
                    for model_name, model_path in _subdirs(obs_path):
                        # Use the first available configuration as default
                        ACTION_SPACE = action_space
                        OBSERVATION_TYPE = obs_type
                        MODEL_NAME = model_name
                        RESULTS_PATH = model_path
                        print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
                        return
        except OSError as e:
            # Best effort: an unreadable directory must not stop start-up.
            print(f"Error scanning results directory for default config: {e}")

    # Fallback to original environment-based path if no configs found
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
# Initialize default configuration
initialize_default_config()
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME) RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
if RESULTS_PATH not in TASK_STATUS_CACHE:
# Initialize cache for this results path
TASK_STATUS_CACHE[RESULTS_PATH] = {}
@cache
def load_task_list(): def load_task_list():
with open(TASK_CONFIG_PATH, 'r') as f: with open(TASK_CONFIG_PATH, 'r') as f:
return json.load(f) return json.load(f)
@cache
def get_task_info(task_type, task_id): def get_task_info(task_type, task_id):
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json") task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
if os.path.exists(task_file): if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
# Check if the status is already cached # Check if the status is already cached
current_time = time.time() current_time = time.time()
last_cache_time = None last_cache_time = None
if cache_key in TASK_STATUS_CACHE: if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
cached_status, cached_time = TASK_STATUS_CACHE[cache_key] cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
last_cache_time = cached_time last_cache_time = cached_time
# If cached status is "Done", check if it's within the stability period # If cached status is "Done", check if it's within the stability period
if cached_status["status"].startswith("Done"): if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
# Cache the status if it is done or error # Cache the status if it is done or error
if status.startswith("Done") or status == "Error": if status.startswith("Done") or status == "Error":
current_time = last_cache_time if last_cache_time else current_time current_time = last_cache_time if last_cache_time else current_time
TASK_STATUS_CACHE[cache_key] = (status_dict, current_time) TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
return status_dict return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
"status": task_status "status": task_status
}) })
@app.route('/api/config')
def api_config():
    """Return the monitor's path and run settings as JSON.

    Deprecated: use /api/current-config instead.
    """
    return jsonify(
        task_config_path=TASK_CONFIG_PATH,
        results_base_path=RESULTS_BASE_PATH,
        action_space=ACTION_SPACE,
        observation_type=OBSERVATION_TYPE,
        model_name=MODEL_NAME,
        max_steps=MAX_STEPS,
        examples_base_path=EXAMPLES_BASE_PATH,
    )
@app.route('/api/available-configs')
def api_available_configs():
    """List every configuration found under the results directory.

    Scans ``RESULTS_BASE_PATH`` for
    ``<action_space>/<observation_type>/<model_name>`` directory triples and
    returns them as a JSON array of objects with the keys ``action_space``,
    ``observation_type``, ``model_name`` and ``path``.

    Directory listings are sorted so the array order is stable between
    requests; ``os.listdir`` alone gives no ordering guarantee. If the scan
    fails part-way the error is printed and the entries collected so far are
    returned.
    """
    configs = []

    def _subdirs(parent):
        # Yield (name, path) for each sub-directory of *parent*, sorted by name.
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
                for obs_type, obs_path in _subdirs(action_space_path):
                    for model_name, model_path in _subdirs(obs_path):
                        configs.append({
                            "action_space": action_space,
                            "observation_type": obs_type,
                            "model_name": model_name,
                            "path": model_path,
                        })
        except OSError as e:
            print(f"Error scanning results directory: {e}")

    return jsonify(configs)
@app.route('/api/current-config')
def api_current_config():
    """Return the currently selected configuration and its results path as JSON."""
    current = {
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH,
    }
    return jsonify(current)
def _is_safe_config_component(value):
    """Return True if *value* is a non-empty string naming a single directory level."""
    if not isinstance(value, str) or value in ("", ".", ".."):
        return False
    separators = [sep for sep in (os.sep, os.altsep) if sep]
    return not any(sep in value for sep in separators)


@app.route('/api/set-config', methods=['POST'])
def api_set_config():
    """Switch the monitor to another configuration.

    Expects a JSON object that may contain ``action_space``,
    ``observation_type`` and ``model_name``; keys that are absent keep their
    current value. Each value is joined into a filesystem path below
    ``RESULTS_BASE_PATH``, so it must be a plain directory name: path
    separators, ``.`` and ``..`` are rejected to keep the results path inside
    the base directory.

    Returns the resulting configuration as JSON, or a JSON error with status
    400 when the body is missing, is not an object, or holds an invalid value.
    Nothing is changed when the request is rejected.

    Side effects: rebinds the module globals ``ACTION_SPACE``,
    ``OBSERVATION_TYPE``, ``MODEL_NAME`` and ``RESULTS_PATH`` and makes sure
    ``TASK_STATUS_CACHE`` has an entry for the new results path.
    """
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

    # silent=True: a malformed body yields None, which is reported below as
    # a JSON error that the front end can parse.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({"error": "No data provided"}), 400
    if not isinstance(data, dict):
        return jsonify({"error": "Configuration must be a JSON object"}), 400

    requested = {
        "action_space": data.get('action_space', ACTION_SPACE),
        "observation_type": data.get('observation_type', OBSERVATION_TYPE),
        "model_name": data.get('model_name', MODEL_NAME),
    }
    # Validate everything before touching the globals so that a rejected
    # request leaves the current configuration intact.
    for key, value in requested.items():
        if not _is_safe_config_component(value):
            return jsonify({"error": f"Invalid value for '{key}'"}), 400

    # Update global variables
    ACTION_SPACE = requested["action_space"]
    OBSERVATION_TYPE = requested["observation_type"]
    MODEL_NAME = requested["model_name"]

    # Update results path
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

    # Initialize cache for this results path on first use
    TASK_STATUS_CACHE.setdefault(RESULTS_PATH, {})

    return jsonify({
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH,
    })
if __name__ == '__main__': if __name__ == '__main__':
# Check if necessary directories exist # Check if necessary directories exist
if not os.path.exists(TASK_CONFIG_PATH): if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
port = 8080 port = 8080
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true" debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
app.run(host=host, port=port, debug=debug) app.run(host=host, port=port, debug=debug, threaded=True)

View File

@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */ /* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); } body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
.layout-container {
position: relative;
max-width: 1200px;
margin: 20px auto;
padding: 0 20px;
}
.main-content {
background: #fff;
border-radius: 14px;
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
padding: 36px 44px;
}
/* Floating Config Sidebar */
.config-sidebar {
position: fixed;
top: 20px;
left: -280px;
width: 300px;
height: calc(100vh - 40px);
z-index: 1000;
transition: left 0.3s ease;
}
.config-sidebar:hover {
left: 0;
}
.config-toggle-btn {
position: absolute;
right: -50px;
top: 50%;
transform: translateY(-50%);
width: 50px;
height: 50px;
background: linear-gradient(135deg, #007bff, #0056b3);
border-radius: 0 25px 25px 0;
display: flex;
align-items: center;
justify-content: center;
color: white;
font-size: 1.2em;
cursor: pointer;
box-shadow: 2px 0 10px rgba(0,0,0,0.2);
transition: all 0.3s ease;
}
.config-toggle-btn:hover {
background: linear-gradient(135deg, #0056b3, #004085);
transform: translateY(-50%) scale(1.05);
}
.config-sidebar:hover .config-toggle-btn {
opacity: 0.8;
}
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; } .main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; } h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; } h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
text-shadow: 0 1px 2px rgba(0,0,0,0.05); text-shadow: 0 1px 2px rgba(0,0,0,0.05);
} }
.accuracy-percentage {
font-size: 0.7em;
font-weight: 600;
color: #ffffff;
margin-left: 8px;
background: rgba(255, 255, 255, 0.1);
padding: 4px 8px;
border-radius: 12px;
display: inline-block;
vertical-align: middle;
}
.stat-card span { .stat-card span {
font-size: 2em; font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
.task-type-stats { .task-type-stats {
display: flex; display: flex;
gap: 16px;
flex-wrap: wrap; flex-wrap: wrap;
gap: 8px;
align-items: center;
} }
.task-stat { .task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
color: #b71c1c; color: #b71c1c;
} }
/* Task type statistics styles */
.task-stat.score {
color: #ffc107;
background: rgba(255, 193, 7, 0.1);
}
.task-stat.steps {
color: #17a2b8;
background: rgba(23, 162, 184, 0.1);
}
.task-stat.rate {
color: #28a745;
background: rgba(40, 167, 69, 0.1);
}
.tasks-container { .tasks-container {
padding: 20px; padding: 20px;
transition: all 0.4s cubic-bezier(.4,0,.2,1); transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
background: #a5c7e5; background: #a5c7e5;
} }
/* Configuration Panel Styles */
.config-panel {
background: #fff;
border-radius: 0 14px 14px 0;
box-shadow: 0 8px 32px rgba(0,0,0,0.15);
overflow: hidden;
height: 100%;
display: flex;
flex-direction: column;
}
.config-header {
display: flex;
align-items: center;
padding: 16px 20px;
background: linear-gradient(135deg, #6c757d, #495057);
color: white;
flex-shrink: 0;
}
.config-header i {
margin-right: 10px;
font-size: 1.1em;
}
.config-header span {
font-weight: 600;
font-size: 1.1em;
}
.config-content {
padding: 20px;
flex: 1;
overflow-y: auto;
}
.config-selector {
margin-bottom: 20px;
padding-bottom: 15px;
border-bottom: 1px solid #dee2e6;
}
.selector-item {
display: flex;
flex-direction: column;
gap: 8px;
}
.selector-item label {
font-weight: 600;
color: #495057;
font-size: 0.9em;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.selector-item select {
padding: 8px 12px;
border: 2px solid #e9ecef;
border-radius: 6px;
background: white;
font-size: 0.9em;
color: #495057;
cursor: pointer;
transition: all 0.3s ease;
}
.selector-item select:focus {
outline: none;
border-color: #007bff;
box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
}
.selector-item select:hover {
border-color: #007bff;
}
.config-list {
display: flex;
flex-direction: column;
gap: 15px;
}
.config-item {
display: flex;
flex-direction: column;
background: #f8f9fa;
padding: 12px;
border-radius: 8px;
border-left: 4px solid #007bff;
transition: all 0.3s ease;
}
.config-item:hover {
transform: translateX(3px);
box-shadow: 0 4px 12px rgba(0,123,255,0.15);
}
/* Small uppercase caption shown above each configuration value.
   Duplicate declarations were collapsed to the values that took effect
   (the later one won in each case), so rendering is unchanged. */
.config-label {
    font-weight: 600;
    color: #495057;
    font-size: 0.85em;
    margin-bottom: 6px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}
.config-value {
color: #007bff;
font-family: 'Courier New', monospace;
font-size: 0.9em;
font-weight: 600;
word-break: break-word;
}
.config-path {
font-size: 0.8em;
line-height: 1.3;
}
/* Responsive design for sidebar layout */
@media (max-width: 1024px) {
.config-sidebar {
left: -250px;
width: 250px;
}
.config-toggle-btn {
right: -40px;
width: 40px;
height: 40px;
font-size: 1em;
}
}
@media (max-width: 768px) {
.layout-container {
padding: 0 10px;
}
.main-content {
padding: 20px 25px;
}
.config-sidebar {
left: -220px;
width: 220px;
height: calc(100vh - 20px);
top: 10px;
}
.config-toggle-btn {
right: -35px;
width: 35px;
height: 35px;
font-size: 0.9em;
}
.config-content {
padding: 15px;
}
.config-item {
padding: 10px;
}
}

View File

@@ -1,5 +1,8 @@
document.addEventListener('DOMContentLoaded', () => { document.addEventListener('DOMContentLoaded', () => {
fetchTasks(); fetchAvailableConfigs().then(() => {
fetchConfig();
fetchTasks();
});
// Bind filter functionality // Bind filter functionality
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all')); document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active')); document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
let allTaskData = null; let allTaskData = null;
let currentFilter = 'all'; let currentFilter = 'all';
let availableConfigs = [];
let currentConfig = null;
let categoryStats = {};
function refreshPage() { function refreshPage() {
// Save expanded state before refresh // Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
fetch('/api/tasks/brief') fetch('/api/tasks/brief')
.then(response => response.json()) .then(response => response.json())
.then(data => { .then(data => {
// Update stored data
allTaskData = data; allTaskData = data;
categoryStats = calculateCategoryStats(data);
// Only update statistics and task status, do not fully re-render // Only update statistics and task status, do not fully re-render
updateStatistics(data); updateStatistics(data);
updateTaskStatus(data); updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
.then(response => response.json()) .then(response => response.json())
.then(data => { .then(data => {
allTaskData = data; allTaskData = data;
categoryStats = calculateCategoryStats(data);
renderTasks(data); renderTasks(data);
updateStatistics(data); updateStatistics(data);
}) })
@@ -208,13 +215,15 @@ function updateStatistics(data) {
document.getElementById('completed-tasks').textContent = completedTasks; document.getElementById('completed-tasks').textContent = completedTasks;
document.getElementById('error-tasks').textContent = errorTasks; document.getElementById('error-tasks').textContent = errorTasks;
// Update score display with formatted score // Update score display with formatted score and accuracy percentage
const scoreDisplay = document.getElementById('score-display'); const scoreDisplay = document.getElementById('score-display');
if (completedTasks > 0) { if (completedTasks > 0) {
const scoreFormatted = totalScore.toFixed(2); const scoreFormatted = totalScore.toFixed(2);
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`; const averageScore = totalScore / completedTasks;
const accuracyPercentage = (averageScore * 100).toFixed(1);
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
} else { } else {
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>'; scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
} }
// Highlight the currently selected statistics card // Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
// Create header with task type name and statistics // Create header with task type name and statistics
const typeHeader = document.createElement('div'); const typeHeader = document.createElement('div');
typeHeader.className = 'task-type-header'; typeHeader.className = 'task-type-header';
// Get category stats for this task type
const stats = categoryStats[taskType] || {};
typeHeader.innerHTML = ` typeHeader.innerHTML = `
<span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span> <span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
<div class="task-type-stats"> <div class="task-type-stats">
@@ -286,6 +299,9 @@ function renderTasks(data) {
<span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span> <span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
<span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span> <span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
<span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span> <span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
${stats.avg_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
</div> </div>
`; `;
typeSection.appendChild(typeHeader); typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
container.appendChild(typeSection); container.appendChild(typeSection);
}); });
} }
// add auto-refresh with time interval 10 seconds
setInterval(() => { function fetchAvailableConfigs() {
refreshPage(); return fetch('/api/available-configs')
}, 10000); // 10 seconds interval .then(response => response.json())
.then(data => {
availableConfigs = data;
populateConfigSelect();
return data;
})
.catch(error => {
console.error('Error fetching available configs:', error);
return [];
});
}
/**
 * Rebuild the options of the #config-select dropdown from `availableConfigs`.
 * Each option's value is the index of the configuration in that array; when
 * the array is empty a single placeholder option is shown instead.
 */
function populateConfigSelect() {
    const dropdown = document.getElementById('config-select');
    dropdown.innerHTML = '';

    if (availableConfigs.length === 0) {
        dropdown.innerHTML = '<option value="">No configurations found in results directory</option>';
        return;
    }

    // One option per available configuration, labelled "space / observation / model".
    const options = availableConfigs.map((cfg, position) => {
        const entry = document.createElement('option');
        entry.value = position;
        entry.textContent = `${cfg.action_space} / ${cfg.observation_type} / ${cfg.model_name}`;
        return entry;
    });
    options.forEach(entry => dropdown.appendChild(entry));
}
/**
 * Apply the configuration currently selected in the #config-select dropdown.
 *
 * Posts the selected entry of `availableConfigs` to /api/set-config. On
 * success the returned configuration becomes `currentConfig`, is displayed,
 * and the task list is reloaded. A network failure or a non-2xx response is
 * logged and shown through displayConfigError(); in that case `currentConfig`
 * is left untouched, so an error payload is never treated as a configuration.
 */
function changeConfiguration() {
    const select = document.getElementById('config-select');

    // The placeholder option has an empty value: nothing to apply.
    if (select.value === '') {
        return;
    }

    // <select>.value is a string; convert once so the bounds check is numeric.
    const selectedIndex = Number(select.value);
    if (!Number.isInteger(selectedIndex) || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
        return;
    }

    const selectedConfig = availableConfigs[selectedIndex];

    // Send configuration change request
    fetch('/api/set-config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(selectedConfig)
    })
        .then(response => {
            // fetch() only rejects on network errors; HTTP errors must be checked explicitly.
            if (!response.ok) {
                throw new Error(`Server rejected configuration (HTTP ${response.status})`);
            }
            return response.json();
        })
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            // Refresh tasks with new configuration
            fetchTasks();
        })
        .catch(error => {
            console.error('Error setting config:', error);
            displayConfigError();
        });
}
/**
 * Load the active configuration from /api/current-config, store it in
 * `currentConfig`, display it and sync the dropdown selection.
 *
 * @returns {Promise<object|undefined>} the configuration, or undefined when
 *     the request failed (the failure is logged and shown in the panel).
 */
async function fetchConfig() {
    try {
        const response = await fetch('/api/current-config');
        const config = await response.json();
        currentConfig = config;
        displayConfig(config);
        updateConfigSelect();
        return config;
    } catch (error) {
        console.error('Error fetching config:', error);
        displayConfigError();
    }
}
/**
 * Point the #config-select dropdown at the entry matching `currentConfig`.
 * Does nothing until both `currentConfig` and `availableConfigs` are loaded.
 * When no entry matches, the first entry is selected and a warning is logged.
 */
function updateConfigSelect() {
    if (!currentConfig || availableConfigs.length === 0) {
        return;
    }

    const dropdown = document.getElementById('config-select');
    const matchesCurrent = candidate =>
        candidate.action_space === currentConfig.action_space &&
        candidate.observation_type === currentConfig.observation_type &&
        candidate.model_name === currentConfig.model_name;

    const position = availableConfigs.findIndex(matchesCurrent);
    if (position !== -1) {
        dropdown.value = position;
        return;
    }

    // No match: the guard above guarantees at least one entry exists,
    // so fall back to the first one.
    dropdown.value = 0;
    console.warn('Current config not found in available configs, defaulting to first available config');
}
/**
 * Write the fields of a configuration object into the sidebar panel.
 * Any missing or falsy field is shown as 'N/A'.
 *
 * @param {object} config - object with action_space, observation_type,
 *     model_name and max_steps properties.
 */
function displayConfig(config) {
    const fieldByElementId = [
        ['action-space', config.action_space],
        ['observation-type', config.observation_type],
        ['model-name', config.model_name],
        ['max-steps', config.max_steps],
    ];
    for (const [elementId, value] of fieldByElementId) {
        document.getElementById(elementId).textContent = value || 'N/A';
    }
}
/**
 * Replace every .config-value element's text with a red 'Error loading' message.
 */
function displayConfigError() {
    for (const valueElement of document.querySelectorAll('.config-value')) {
        valueElement.textContent = 'Error loading';
        valueElement.style.color = '#dc3545';
    }
}
/**
 * Summarise task results per task type.
 *
 * @param {Object<string, Array>} data - maps a task type to its task list;
 *     each task exposes `status.status` and optionally `status.result` and
 *     `status.progress`.
 * @returns {Object<string, object>} per task type: total_tasks,
 *     completed_tasks, running_tasks, error_tasks, total_score (2 decimals),
 *     avg_score (4 decimals, over completed tasks), avg_steps (1 decimal,
 *     over completed tasks with positive progress) and completion_rate
 *     (percentage, 1 decimal).
 */
function calculateCategoryStats(data) {
    const DONE_STATES = new Set(['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)']);
    const ACTIVE_STATES = new Set(['Running', 'Preparing', 'Initializing']);
    const roundTo = (value, factor) => Math.round(value * factor) / factor;

    const summary = {};

    for (const [taskType, tasks] of Object.entries(data)) {
        const taskCount = tasks.length;
        let doneCount = 0;
        let activeCount = 0;
        let failedCount = 0;
        let scoreSum = 0;
        let stepSum = 0;
        let doneWithSteps = 0;

        for (const task of tasks) {
            const state = task.status.status;

            if (DONE_STATES.has(state)) {
                doneCount += 1;

                // Only results that parse to a number in [0, 1] count towards the score.
                if (task.status.result) {
                    try {
                        const parsed = parseFloat(task.status.result);
                        if (!isNaN(parsed) && parsed >= 0 && parsed <= 1) {
                            scoreSum += parsed;
                        }
                    } catch (e) {
                        // Ignore parsing errors
                    }
                }

                // Steps are averaged only over completed tasks that report progress.
                if (task.status.progress && task.status.progress > 0) {
                    stepSum += task.status.progress;
                    doneWithSteps += 1;
                }
            } else if (ACTIVE_STATES.has(state)) {
                activeCount += 1;
            } else if (state === 'Error') {
                failedCount += 1;
            }
        }

        const meanScore = doneCount > 0 ? scoreSum / doneCount : 0;
        const meanSteps = doneWithSteps > 0 ? stepSum / doneWithSteps : 0;
        const donePercent = taskCount > 0 ? (doneCount / taskCount * 100) : 0;

        summary[taskType] = {
            total_tasks: taskCount,
            completed_tasks: doneCount,
            running_tasks: activeCount,
            error_tasks: failedCount,
            total_score: roundTo(scoreSum, 100),
            avg_score: roundTo(meanScore, 10000),
            avg_steps: roundTo(meanSteps, 10),
            completion_rate: roundTo(donePercent, 10)
        };
    }

    return summary;
}

View File

@@ -12,19 +12,62 @@
<link rel="stylesheet" href="/static/index.css"> <link rel="stylesheet" href="/static/index.css">
</head> </head>
<body> <body>
<div class="main-container"> <div class="layout-container">
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1> <!-- Floating Config Button and Sidebar -->
<div class="config-sidebar" id="config-sidebar">
<!-- Score Display Banner --> <div class="config-toggle-btn">
<div class="score-banner"> <i class="fas fa-cogs"></i>
<div class="score-content"> </div>
<i class="fas fa-star"></i> <div class="config-panel">
<span class="score-label">Score:</span> <div class="config-header">
<span id="score-display" class="score-value">Loading...</span> <i class="fas fa-cogs"></i>
<span>Configuration</span>
</div>
<div class="config-content">
<div class="config-selector">
<div class="selector-item">
<label for="config-select">Select Configuration:</label>
<select id="config-select" onchange="changeConfiguration()">
<option value="">Loading configurations...</option>
</select>
</div>
</div>
<div class="config-list">
<div class="config-item">
<span class="config-label">Action Space:</span>
<span class="config-value" id="action-space">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Observation:</span>
<span class="config-value" id="observation-type">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Model:</span>
<span class="config-value" id="model-name">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Max Steps:</span>
<span class="config-value" id="max-steps">Loading...</span>
</div>
</div>
</div>
</div> </div>
</div> </div>
<div class="dashboard-stats"> <!-- Main Content -->
<div class="main-content">
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
<!-- Score Display Banner -->
<div class="score-banner">
<div class="score-content">
<i class="fas fa-star"></i>
<span class="score-label">Score:</span>
<span id="score-display" class="score-value">Loading...</span>
</div>
</div>
<div class="dashboard-stats">
<div class="stat-card"> <div class="stat-card">
<i class="fas fa-running"></i> <i class="fas fa-running"></i>
<span id="active-tasks">Loading...</span> <span id="active-tasks">Loading...</span>
@@ -46,10 +89,11 @@
<div class="stat-label">Total Tasks</div> <div class="stat-label">Total Tasks</div>
</div> </div>
</div> </div>
<div id="task-container"> <div id="task-container">
<div class="loading-spinner"> <div class="loading-spinner">
<div class="spinner"></div> <div class="spinner"></div>
<div>Loading task data...</div> <div>Loading task data...</div>
</div>
</div> </div>
</div> </div>
</div> </div>