Feat/monitor (#254)
* feat: add claude support * feat: add script for end-to-end evaluation with logging and task distribution * feat&fix: add tool result handling and update model default in evaluation script * chore: remove run_test_env.py script * feat&fix: implement action parsing for tool calls and update default action space * fix: update text formatting in action parsing and replace logger import * feat&fix: implement action parsing for tool calls and add screen size handling * feat: add setup instructions for Anthropic API integration * feat: add notice about image size limitations for Anthropic API * Delete test_env/logger.py * Delete test_env/utils.py * fix: update logger usage to use global logger and improve error handling * feat&fix: add configuration management API endpoints and update UI for configuration selection * feat&fix: update environment configuration, enhance task statistics, and improve UI responsiveness * feat&fix: add configuration toggle button in UI and improve task loading performance * feat&fix: add accuracy percentage display to score and style updates for UI
This commit is contained in:
@@ -1,5 +1,63 @@
|
||||
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
|
||||
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
|
||||
|
||||
.layout-container {
|
||||
position: relative;
|
||||
max-width: 1200px;
|
||||
margin: 20px auto;
|
||||
padding: 0 20px;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
background: #fff;
|
||||
border-radius: 14px;
|
||||
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
|
||||
padding: 36px 44px;
|
||||
}
|
||||
|
||||
/* Floating Config Sidebar */
|
||||
.config-sidebar {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: -280px;
|
||||
width: 300px;
|
||||
height: calc(100vh - 40px);
|
||||
z-index: 1000;
|
||||
transition: left 0.3s ease;
|
||||
}
|
||||
|
||||
/* Slide the sidebar into view on pointer hover, and also while an element
   inside it holds focus, so keyboard users do not tab into off-screen controls. */
.config-sidebar:hover,
.config-sidebar:focus-within {
    left: 0;
}
|
||||
|
||||
.config-toggle-btn {
|
||||
position: absolute;
|
||||
right: -50px;
|
||||
top: 50%;
|
||||
transform: translateY(-50%);
|
||||
width: 50px;
|
||||
height: 50px;
|
||||
background: linear-gradient(135deg, #007bff, #0056b3);
|
||||
border-radius: 0 25px 25px 0;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
color: white;
|
||||
font-size: 1.2em;
|
||||
cursor: pointer;
|
||||
box-shadow: 2px 0 10px rgba(0,0,0,0.2);
|
||||
transition: all 0.3s ease;
|
||||
}
|
||||
|
||||
.config-toggle-btn:hover {
|
||||
background: linear-gradient(135deg, #0056b3, #004085);
|
||||
transform: translateY(-50%) scale(1.05);
|
||||
}
|
||||
|
||||
.config-sidebar:hover .config-toggle-btn {
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
|
||||
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
|
||||
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
|
||||
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
||||
text-shadow: 0 1px 2px rgba(0,0,0,0.05);
|
||||
}
|
||||
|
||||
.accuracy-percentage {
|
||||
font-size: 0.7em;
|
||||
font-weight: 600;
|
||||
color: #ffffff;
|
||||
margin-left: 8px;
|
||||
background: rgba(255, 255, 255, 0.1);
|
||||
padding: 4px 8px;
|
||||
border-radius: 12px;
|
||||
display: inline-block;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
|
||||
.stat-card span {
|
||||
font-size: 2em;
|
||||
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
||||
|
||||
.task-type-stats {
|
||||
display: flex;
|
||||
gap: 16px;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.task-stat {
|
||||
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
||||
color: #b71c1c;
|
||||
}
|
||||
|
||||
/* Task type statistics styles */
|
||||
.task-stat.score {
|
||||
color: #ffc107;
|
||||
background: rgba(255, 193, 7, 0.1);
|
||||
}
|
||||
|
||||
.task-stat.steps {
|
||||
color: #17a2b8;
|
||||
background: rgba(23, 162, 184, 0.1);
|
||||
}
|
||||
|
||||
.task-stat.rate {
|
||||
color: #28a745;
|
||||
background: rgba(40, 167, 69, 0.1);
|
||||
}
|
||||
|
||||
.tasks-container {
|
||||
padding: 20px;
|
||||
transition: all 0.4s cubic-bezier(.4,0,.2,1);
|
||||
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
||||
background: #a5c7e5;
|
||||
}
|
||||
|
||||
/* Configuration Panel Styles */
|
||||
.config-panel {
|
||||
background: #fff;
|
||||
border-radius: 0 14px 14px 0;
|
||||
box-shadow: 0 8px 32px rgba(0,0,0,0.15);
|
||||
overflow: hidden;
|
||||
height: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.config-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
padding: 16px 20px;
|
||||
background: linear-gradient(135deg, #6c757d, #495057);
|
||||
color: white;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.config-header i {
|
||||
margin-right: 10px;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
|
||||
.config-header span {
|
||||
font-weight: 600;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
|
||||
.config-content {
|
||||
padding: 20px;
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.config-selector {
|
||||
margin-bottom: 20px;
|
||||
padding-bottom: 15px;
|
||||
border-bottom: 1px solid #dee2e6;
|
||||
}
|
||||
|
||||
.selector-item {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.selector-item label {
|
||||
font-weight: 600;
|
||||
color: #495057;
|
||||
font-size: 0.9em;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.selector-item select {
|
||||
padding: 8px 12px;
|
||||
border: 2px solid #e9ecef;
|
||||
border-radius: 6px;
|
||||
background: white;
|
||||
font-size: 0.9em;
|
||||
color: #495057;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
}
|
||||
|
||||
.selector-item select:focus {
|
||||
outline: none;
|
||||
border-color: #007bff;
|
||||
box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
|
||||
}
|
||||
|
||||
.selector-item select:hover {
|
||||
border-color: #007bff;
|
||||
}
|
||||
|
||||
.config-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
}
|
||||
|
||||
.config-item {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
background: #f8f9fa;
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
border-left: 4px solid #007bff;
|
||||
transition: all 0.3s ease;
|
||||
}
|
||||
|
||||
.config-item:hover {
|
||||
transform: translateX(3px);
|
||||
box-shadow: 0 4px 12px rgba(0,123,255,0.15);
|
||||
}
|
||||
|
||||
/* Label text of a configuration entry. Each property is declared once, using
   the value that previously won the cascade (the later duplicate). */
.config-label {
    font-weight: 600;
    color: #495057;
    font-size: 0.85em;
    margin-bottom: 6px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}
|
||||
|
||||
.config-value {
|
||||
color: #007bff;
|
||||
font-family: 'Courier New', monospace;
|
||||
font-size: 0.9em;
|
||||
font-weight: 600;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.config-path {
|
||||
font-size: 0.8em;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
/* Responsive design for sidebar layout */
|
||||
@media (max-width: 1024px) {
|
||||
.config-sidebar {
|
||||
left: -250px;
|
||||
width: 250px;
|
||||
}
|
||||
|
||||
.config-toggle-btn {
|
||||
right: -40px;
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
font-size: 1em;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.layout-container {
|
||||
padding: 0 10px;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
padding: 20px 25px;
|
||||
}
|
||||
|
||||
.config-sidebar {
|
||||
left: -220px;
|
||||
width: 220px;
|
||||
height: calc(100vh - 20px);
|
||||
top: 10px;
|
||||
}
|
||||
|
||||
.config-toggle-btn {
|
||||
right: -35px;
|
||||
width: 35px;
|
||||
height: 35px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.config-content {
|
||||
padding: 15px;
|
||||
}
|
||||
|
||||
.config-item {
|
||||
padding: 10px;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
fetchTasks();
|
||||
fetchAvailableConfigs().then(() => {
|
||||
fetchConfig();
|
||||
fetchTasks();
|
||||
});
|
||||
// Bind filter functionality
|
||||
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
|
||||
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
|
||||
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
|
||||
|
||||
let allTaskData = null;
|
||||
let currentFilter = 'all';
|
||||
let availableConfigs = [];
|
||||
let currentConfig = null;
|
||||
let categoryStats = {};
|
||||
|
||||
function refreshPage() {
|
||||
// Save expanded state before refresh
|
||||
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
|
||||
fetch('/api/tasks/brief')
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
// Update stored data
|
||||
allTaskData = data;
|
||||
categoryStats = calculateCategoryStats(data);
|
||||
// Only update statistics and task status, do not fully re-render
|
||||
updateStatistics(data);
|
||||
updateTaskStatus(data);
|
||||
@@ -148,6 +154,7 @@ function fetchTasks() {
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
allTaskData = data;
|
||||
categoryStats = calculateCategoryStats(data);
|
||||
renderTasks(data);
|
||||
updateStatistics(data);
|
||||
})
|
||||
@@ -208,13 +215,15 @@ function updateStatistics(data) {
|
||||
document.getElementById('completed-tasks').textContent = completedTasks;
|
||||
document.getElementById('error-tasks').textContent = errorTasks;
|
||||
|
||||
// Update score display with formatted score
|
||||
// Update score display with formatted score and accuracy percentage
|
||||
const scoreDisplay = document.getElementById('score-display');
|
||||
if (completedTasks > 0) {
|
||||
const scoreFormatted = totalScore.toFixed(2);
|
||||
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
|
||||
const averageScore = totalScore / completedTasks;
|
||||
const accuracyPercentage = (averageScore * 100).toFixed(1);
|
||||
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
|
||||
} else {
|
||||
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
|
||||
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
|
||||
}
|
||||
|
||||
// Highlight the currently selected statistics card
|
||||
@@ -279,6 +288,10 @@ function renderTasks(data) {
|
||||
// Create header with task type name and statistics
|
||||
const typeHeader = document.createElement('div');
|
||||
typeHeader.className = 'task-type-header';
|
||||
|
||||
// Get category stats for this task type
|
||||
const stats = categoryStats[taskType] || {};
|
||||
|
||||
typeHeader.innerHTML = `
|
||||
<span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
|
||||
<div class="task-type-stats">
|
||||
@@ -286,6 +299,9 @@ function renderTasks(data) {
|
||||
<span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
|
||||
<span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
|
||||
<span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
|
||||
${stats.avg_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
|
||||
${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
|
||||
${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
|
||||
</div>
|
||||
`;
|
||||
typeSection.appendChild(typeHeader);
|
||||
@@ -453,7 +469,181 @@ function renderTasks(data) {
|
||||
container.appendChild(typeSection);
|
||||
});
|
||||
}
|
||||
// Auto-refresh the page every 10 seconds
|
||||
setInterval(() => {
|
||||
refreshPage();
|
||||
}, 10000); // 10 seconds interval
|
||||
|
||||
/**
 * Load the selectable configurations from `/api/available-configs`, store
 * them in `availableConfigs`, and rebuild the configuration dropdown.
 *
 * @returns {Promise<Array>} Resolves with the fetched list, or with an empty
 *   array when the request fails, the server answers with a non-2xx status,
 *   or the payload is not an array. Failures are logged, never rethrown, and
 *   leave `availableConfigs` untouched.
 */
function fetchAvailableConfigs() {
    return fetch('/api/available-configs')
        .then(response => {
            // fetch() only rejects on network failure; HTTP errors must be checked by hand.
            if (!response.ok) {
                throw new Error(`Unexpected HTTP status ${response.status}`);
            }
            return response.json();
        })
        .then(data => {
            // An error object from the server must not replace the config list.
            if (!Array.isArray(data)) {
                throw new TypeError('Expected an array of configurations');
            }
            availableConfigs = data;
            populateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching available configs:', error);
            return [];
        });
}
|
||||
|
||||
/**
 * Rebuild the options of the `#config-select` dropdown from `availableConfigs`.
 *
 * Each option's value is the index of its configuration in `availableConfigs`
 * and its label is "action_space / observation_type / model_name". When the
 * list is empty a single placeholder option with an empty value is shown.
 * Does nothing (apart from a console warning) if the dropdown is not in the DOM.
 */
function populateConfigSelect() {
    const select = document.getElementById('config-select');
    if (!select) {
        // Without this guard a missing element surfaces as an opaque TypeError.
        console.warn('populateConfigSelect: #config-select element not found');
        return;
    }

    select.innerHTML = '';

    if (availableConfigs.length === 0) {
        select.innerHTML = '<option value="">No configurations found in results directory</option>';
        return;
    }

    // Add available configurations
    availableConfigs.forEach((config, index) => {
        const option = document.createElement('option');
        option.value = index;
        option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
        select.appendChild(option);
    });
}
|
||||
|
||||
/**
 * Apply the configuration currently chosen in the `#config-select` dropdown.
 *
 * The dropdown value is read as an index into `availableConfigs`; the matching
 * entry is POSTed as JSON to `/api/set-config`. On success the response body
 * becomes `currentConfig`, is rendered with `displayConfig`, and the task list
 * is reloaded with `fetchTasks`. On failure the error is logged and
 * `displayConfigError` is called.
 *
 * @returns {Promise<void>} Settles once the request has been handled; resolves
 *   immediately without sending anything when the selection is not a valid index.
 */
function changeConfiguration() {
    const select = document.getElementById('config-select');

    // <select>.value is a string; '' is the placeholder option. Parse strictly so
    // values such as 'abc' or '1.5' cannot slip past the range check below.
    const rawValue = String(select.value).trim();
    const selectedIndex = rawValue === '' ? NaN : Number(rawValue);

    if (!Number.isInteger(selectedIndex) || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
        return Promise.resolve();
    }

    const selectedConfig = availableConfigs[selectedIndex];

    // Send configuration change request
    return fetch('/api/set-config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(selectedConfig)
    })
        .then(response => {
            // fetch() resolves on HTTP errors too; do not treat an error payload as a config.
            if (!response.ok) {
                throw new Error(`Unexpected HTTP status ${response.status}`);
            }
            return response.json();
        })
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            // Refresh tasks with new configuration
            fetchTasks();
        })
        .catch(error => {
            console.error('Error setting config:', error);
            displayConfigError();
        });
}
|
||||
|
||||
/**
 * Load the active configuration from `/api/current-config`, store it in
 * `currentConfig`, render it with `displayConfig`, and sync the dropdown via
 * `updateConfigSelect`.
 *
 * @returns {Promise<Object|undefined>} Resolves with the configuration, or with
 *   `undefined` after logging and calling `displayConfigError` when the request
 *   fails or the server answers with a non-2xx status.
 */
function fetchConfig() {
    return fetch('/api/current-config')
        .then(response => {
            // fetch() resolves on HTTP errors too; do not store an error payload as the config.
            if (!response.ok) {
                throw new Error(`Unexpected HTTP status ${response.status}`);
            }
            return response.json();
        })
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            updateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching config:', error);
            displayConfigError();
        });
}
|
||||
|
||||
/**
 * Point the `#config-select` dropdown at the entry of `availableConfigs` that
 * matches `currentConfig` on action_space, observation_type and model_name.
 *
 * Does nothing when there is no current config or no available configs. When
 * no entry matches, the first entry is selected and a warning is logged.
 */
function updateConfigSelect() {
    if (!currentConfig || availableConfigs.length === 0) return;

    const select = document.getElementById('config-select');
    const currentConfigIndex = availableConfigs.findIndex(config =>
        config.action_space === currentConfig.action_space &&
        config.observation_type === currentConfig.observation_type &&
        config.model_name === currentConfig.model_name
    );

    if (currentConfigIndex !== -1) {
        select.value = currentConfigIndex;
        return;
    }

    // Current config not found in available configs: fall back to the first one.
    // The list is known to be non-empty here because of the early return above.
    select.value = 0;
    console.warn('Current config not found in available configs, defaulting to first available config');
}
|
||||
|
||||
/**
 * Write the fields of a configuration object into the config panel.
 *
 * @param {Object} config - Object whose action_space, observation_type,
 *   model_name and max_steps are shown in the elements with ids
 *   `action-space`, `observation-type`, `model-name` and `max-steps`.
 *   Missing or empty fields are shown as 'N/A'; a numeric 0 is shown as 0.
 */
function displayConfig(config) {
    const fields = [
        ['action-space', config.action_space],
        ['observation-type', config.observation_type],
        ['model-name', config.model_name],
        ['max-steps', config.max_steps],
    ];

    fields.forEach(([elementId, value]) => {
        const element = document.getElementById(elementId);
        // Keep the falsy -> 'N/A' fallback, except that 0 is a real value.
        const isMissing = !value && value !== 0;
        element.textContent = isMissing ? 'N/A' : value;
        // displayConfigError() sets an inline red colour on `.config-value`
        // elements (presumably these same elements; confirm against the HTML).
        // Clear it so a successful load after an error is not left red.
        element.style.color = '';
    });
}
|
||||
|
||||
/**
 * Mark every `.config-value` element as failed: replace its text with
 * 'Error loading' and colour it red through an inline style.
 */
function displayConfigError() {
    for (const valueElement of document.querySelectorAll('.config-value')) {
        valueElement.textContent = 'Error loading';
        valueElement.style.color = '#dc3545';
    }
}
|
||||
|
||||
/**
 * Compute per-category summary statistics from the task data.
 *
 * @param {Object<string, Array>} data - Maps a task type to its list of
 *   tasks. Each task is read through `task.status.status`,
 *   `task.status.result` and `task.status.progress`. Tasks without a
 *   `status` object are counted in the total only.
 * @returns {Object<string, Object>} Per task type: total_tasks,
 *   completed_tasks, running_tasks, error_tasks, total_score (2 decimals),
 *   avg_score (4 decimals, averaged over all completed tasks), avg_steps
 *   (1 decimal, averaged over completed tasks with positive progress) and
 *   completion_rate (percentage, 1 decimal).
 */
function calculateCategoryStats(data) {
    // Built once instead of once per task.
    const completedStatuses = ['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'];
    const runningStatuses = ['Running', 'Preparing', 'Initializing'];
    const roundTo = (value, factor) => Math.round(value * factor) / factor;

    const stats = {};

    Object.entries(data || {}).forEach(([taskType, tasks]) => {
        const taskList = Array.isArray(tasks) ? tasks : [];
        const totalTasks = taskList.length;
        let completedTasks = 0;
        let runningTasks = 0;
        let errorTasks = 0;
        let totalScore = 0;
        let totalSteps = 0;
        let completedWithSteps = 0;

        taskList.forEach(task => {
            // Tolerate tasks that carry no status object at all.
            const taskStatus = (task && task.status) || {};
            const status = taskStatus.status;

            if (completedStatuses.includes(status)) {
                completedTasks++;

                // Only results that parse to a score within [0, 1] are summed.
                // parseFloat never throws, so no try/catch is needed.
                if (taskStatus.result) {
                    const score = parseFloat(taskStatus.result);
                    if (!Number.isNaN(score) && score >= 0 && score <= 1) {
                        totalScore += score;
                    }
                }

                // Convert explicitly so a numeric string is added, not concatenated.
                const steps = Number(taskStatus.progress);
                if (Number.isFinite(steps) && steps > 0) {
                    totalSteps += steps;
                    completedWithSteps++;
                }
            } else if (runningStatuses.includes(status)) {
                runningTasks++;
            } else if (status === 'Error') {
                errorTasks++;
            }
        });

        // Calculate averages, guarding every division against a zero denominator.
        const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
        const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
        const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;

        stats[taskType] = {
            total_tasks: totalTasks,
            completed_tasks: completedTasks,
            running_tasks: runningTasks,
            error_tasks: errorTasks,
            total_score: roundTo(totalScore, 100),
            avg_score: roundTo(avgScore, 10000),
            avg_steps: roundTo(avgSteps, 10),
            completion_rate: roundTo(completionRate, 10)
        };
    });

    return stats;
}
|
||||
|
||||
Reference in New Issue
Block a user