Files
sci-gui-agent-benchmark/monitor/static/index.js
Zilong Zhou 74b7c189af Feat/monitor (#254)
* feat: add claude support

* feat: add script for end-to-end evaluation with logging and task distribution

* feat&fix: add tool result handling and update model default in evaluation script

* chore: remove run_test_env.py script

* feat&fix: implement action parsing for tool calls and update default action space

* fix: update text formatting in action parsing and replace logger import

* feat&fix: implement action parsing for tool calls and add screen size handling

* feat: add setup instructions for Anthropic API integration

* feat: add notice about image size limitations for Anthropic API

* Delete test_env/logger.py

* Delete test_env/utils.py

* fix: update logger usage to use global logger and improve error handling

* feat&fix: add configuration management API endpoints and update UI for configuration selection

* feat&fix: update environment configuration, enhance task statistics, and improve UI responsiveness

* feat&fix: add configuration toggle button in UI and improve task loading performance

* feat&fix: add accuracy percentage display to score and style updates for UI
2025-07-14 13:43:41 +08:00

650 lines
28 KiB
JavaScript

// Page bootstrap: once the DOM is parsed, load the configuration list first and,
// when that request has settled, load the current configuration and the task list.
// The stat cards double as filter buttons, so their click handlers are wired here too.
document.addEventListener('DOMContentLoaded', () => {
  const loadConfigAndTasks = () => {
    fetchConfig();
    fetchTasks();
  };
  fetchAvailableConfigs().then(loadConfigAndTasks);

  // Counter element id -> filter applied when its parent card is clicked.
  const filterByCounterId = [
    ['total-tasks', 'all'],
    ['active-tasks', 'active'],
    ['completed-tasks', 'completed'],
    ['error-tasks', 'error'],
  ];
  for (const [counterId, filter] of filterByCounterId) {
    const statCard = document.getElementById(counterId).parentElement;
    statCard.addEventListener('click', () => setTaskFilter(filter));
  }
});
// Module-level UI state shared by the functions in this file.
// Latest task payload fetched from /api/tasks/brief; null until a fetch has succeeded.
let allTaskData = null;
// Filter currently applied to the task list: 'all', 'active', 'completed' or 'error'.
let currentFilter = 'all';
// Configurations returned by /api/available-configs; backs the #config-select options.
let availableConfigs = [];
// Configuration object most recently returned by the server (current or newly set).
let currentConfig = null;
// Per-task-type aggregates produced by calculateCategoryStats(), keyed by task type.
let categoryStats = {};
/**
 * Lightweight page refresh.
 *
 * Remembers which task-type sections are currently expanded (stored as a JSON
 * array under the sessionStorage key 'expandedTaskTypes') and then asks
 * fetchTasksForRefresh() to pull fresh data and patch the page in place.
 */
function refreshPage() {
  // A section is "expanded" when it does not carry the 'collapsed' class.
  const openTypeNames = Array.from(document.querySelectorAll('.task-type'))
    .filter((section) => !section.classList.contains('collapsed'))
    .map((section) => section.querySelector('.task-type-name').textContent.trim());

  sessionStorage.setItem('expandedTaskTypes', JSON.stringify(openTypeNames));

  // Brief data only: the refresh path updates existing cards instead of re-rendering.
  fetchTasksForRefresh();
}
/**
 * Fetches the brief task list and patches the page in place (statistics and
 * per-card status) without re-rendering the task list.
 *
 * Failures (network error, non-2xx response, malformed payload) are logged and
 * leave the previously loaded `allTaskData` / `categoryStats` untouched.
 *
 * @returns {Promise<void>} Settles once the refresh has been applied or the
 *   failure has been logged; it never rejects.
 */
function fetchTasksForRefresh() {
  return fetch('/api/tasks/brief')
    .then((response) => {
      // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
      if (!response.ok) {
        throw new Error(`Task refresh request failed with HTTP ${response.status}`);
      }
      return response.json();
    })
    .then((data) => {
      // Derive the stats first: a malformed payload throws here, before any
      // shared state has been replaced.
      const freshStats = calculateCategoryStats(data);
      allTaskData = data;
      categoryStats = freshStats;
      // Only update statistics and task status, do not fully re-render
      updateStatistics(data);
      updateTaskStatus(data);
    })
    .catch((error) => console.error('Error refreshing tasks:', error));
}
/**
 * Patches the task cards that are already on the page with fresh status data,
 * without re-rendering the whole list.
 *
 * For every task in `data` whose card exists (matched on the card's
 * `data-task-type` / `data-task-id` attributes) this updates the status badge,
 * progress text/bar/percentage, last-update timestamp and result. Elements that
 * were not rendered initially (e.g. the progress bar of a task that had not
 * started yet) are created on demand. All server-provided text is inserted as
 * text nodes, never parsed as HTML.
 *
 * @param {Object<string, Array<Object>>} data - Tasks keyed by task type. Each
 *   task is read as `{ id, status: { status, progress, max_steps, last_update, result } }`.
 */
function updateTaskStatus(data) {
  const STATUS_CLASSES = [
    'status-not-started',
    'status-preparing',
    'status-running',
    'status-completed',
    'status-error',
    'status-unknown',
  ];

  // Maps a server status string to the badge class and Font Awesome icon shown for it.
  const describeStatus = (status) => {
    switch (status) {
      case 'Not Started':
        return { statusClass: 'status-not-started', statusIcon: 'fa-hourglass-start' };
      case 'Preparing':
      case 'Initializing':
        return { statusClass: 'status-preparing', statusIcon: 'fa-spinner fa-pulse' };
      case 'Running':
        return { statusClass: 'status-running', statusIcon: 'fa-running' };
      case 'Done':
      case 'Done (Message Exit)':
      case 'Done (Max Steps)':
      case 'Done (Thought Exit)':
        return { statusClass: 'status-completed', statusIcon: 'fa-check-circle' };
      case 'Error':
        return { statusClass: 'status-error', statusIcon: 'fa-exclamation-circle' };
      default:
        return { statusClass: 'status-unknown', statusIcon: 'fa-question-circle' };
    }
  };

  // Replaces an element's content with `<i class="…"></i> text`, built from DOM
  // nodes so that `text` can never be interpreted as markup.
  const setIconText = (element, iconClass, text) => {
    const icon = document.createElement('i');
    icon.className = iconClass;
    element.textContent = '';
    element.appendChild(icon);
    element.appendChild(document.createTextNode(` ${text}`));
  };

  // Unambiguous lookup key for a (type, id) pair; attributes are always strings.
  const keyOf = (taskType, taskId) => JSON.stringify([String(taskType), String(taskId)]);

  // Add pulse animation to score banner when refreshing
  const scoreBanner = document.querySelector('.score-banner');
  if (scoreBanner) {
    scoreBanner.classList.add('refreshing');
    setTimeout(() => {
      scoreBanner.classList.remove('refreshing');
    }, 1000);
  }

  // Index the rendered cards once. Besides being cheaper than one selector query
  // per task, this avoids interpolating ids into a selector string, which throws
  // as soon as an id contains a quote or backslash.
  const cardsByKey = new Map();
  document.querySelectorAll('.task-card').forEach((card) => {
    const key = keyOf(card.getAttribute('data-task-type'), card.getAttribute('data-task-id'));
    if (!cardsByKey.has(key)) {
      cardsByKey.set(key, card);
    }
  });

  Object.entries(data).forEach(([taskType, tasks]) => {
    tasks.forEach((task) => {
      const taskCard = cardsByKey.get(keyOf(taskType, task.id));
      if (!taskCard) return;

      const status = task.status.status;
      const progress = task.status.progress;
      const maxSteps = task.status.max_steps;
      const lastUpdate = task.status.last_update;
      const result = task.status.result;

      // Status badge: swap the status class and rebuild the icon + label.
      const statusElement = taskCard.querySelector('.task-status');
      if (statusElement) {
        const { statusClass, statusIcon } = describeStatus(status);
        statusElement.classList.remove(...STATUS_CLASSES);
        statusElement.classList.add(statusClass);
        setIconText(statusElement, `fas ${statusIcon}`, status);
      }

      const details = taskCard.querySelector('.task-details');

      // Progress text, bar and percentage.
      if (details && progress > 0) {
        const stepLimit = Number(maxSteps);
        // Guard against a missing/zero step limit, which would yield NaN or Infinity.
        const percentage = stepLimit > 0 ? Math.min((progress / stepLimit) * 100, 100) : 0;

        let progressBar = details.querySelector('.progress-bar');
        let progressText;
        if (progressBar) {
          // renderTasks() places the progress text directly before the bar.
          progressText = progressBar.previousElementSibling;
        } else {
          // The card was rendered before the task made progress: build the
          // progress elements now and put them at the top of the details block,
          // which is where renderTasks() places them.
          const anchor = details.firstChild;
          progressText = document.createElement('div');
          progressBar = document.createElement('div');
          progressBar.className = 'progress-bar';
          const newFill = document.createElement('div');
          newFill.className = 'progress-fill';
          progressBar.appendChild(newFill);
          const newPercentage = document.createElement('div');
          newPercentage.className = 'progress-percentage';
          [progressText, progressBar, newPercentage].forEach((node) => {
            details.insertBefore(node, anchor);
          });
        }

        if (progressText) {
          setIconText(progressText, 'fas fa-chart-line', `Progress: ${progress}/${maxSteps} step(s)`);
        }
        const progressFill = details.querySelector('.progress-fill');
        if (progressFill) {
          progressFill.style.width = `${percentage}%`;
        }
        const progressPercentage = details.querySelector('.progress-percentage');
        if (progressPercentage) {
          progressPercentage.textContent = `${Math.round(percentage)}%`;
        }
      }

      // Last update time (created on demand, kept in front of the result line).
      if (details && lastUpdate) {
        let timestamp = details.querySelector('.timestamp');
        if (!timestamp) {
          timestamp = document.createElement('div');
          timestamp.className = 'timestamp';
          details.insertBefore(timestamp, details.querySelector('.task-result'));
        }
        setIconText(timestamp, 'far fa-clock', `Last Update: ${lastUpdate}`);
      }

      // Result info
      if (details && result) {
        let resultDiv = details.querySelector('.task-result');
        if (!resultDiv) {
          resultDiv = document.createElement('div');
          resultDiv.className = 'task-result';
          details.appendChild(resultDiv);
        }
        const label = document.createElement('strong');
        setIconText(label, 'fas fa-flag-checkered', 'Result:');
        resultDiv.textContent = '';
        resultDiv.appendChild(label);
        resultDiv.appendChild(document.createTextNode(` ${result}`));
      }

      // renderTasks() only makes cards clickable once the task has started. A card
      // rendered as "Not Started" therefore needs its handler attached here when
      // the task begins; the inline pointer cursor marks cards already wired up.
      if (status !== 'Not Started' && taskCard.style.cursor !== 'pointer') {
        taskCard.style.cursor = 'pointer';
        taskCard.addEventListener('click', () => {
          window.location.href = `/task/${taskType}/${task.id}`;
        });
      }
    });
  });
}
/**
 * Fetches the brief task list, stores it in the module state and fully
 * re-renders the task list and the statistics.
 *
 * Failures (network error, non-2xx response, malformed payload) are logged and
 * leave the previously loaded `allTaskData` / `categoryStats` untouched.
 *
 * @returns {Promise<void>} Settles once rendering is done or the failure has
 *   been logged; it never rejects.
 */
function fetchTasks() {
  return fetch('/api/tasks/brief')
    .then((response) => {
      // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
      if (!response.ok) {
        throw new Error(`Task list request failed with HTTP ${response.status}`);
      }
      return response.json();
    })
    .then((data) => {
      // Derive the stats first: a malformed payload throws here, before any
      // shared state has been replaced.
      const freshStats = calculateCategoryStats(data);
      allTaskData = data;
      // renderTasks() reads categoryStats, so it has to be in place before rendering.
      categoryStats = freshStats;
      renderTasks(data);
      updateStatistics(data);
    })
    .catch((error) => console.error('Error fetching tasks:', error));
}
/**
 * Switches the task-list filter and re-renders the list.
 *
 * The filter is always recorded in `currentFilter`. Rendering and the
 * stat-card highlight only happen once task data has been loaded.
 *
 * @param {string} filter - 'all', 'active', 'completed' or 'error'.
 */
function setTaskFilter(filter) {
  currentFilter = filter;
  if (!allTaskData) {
    return;
  }

  renderTasks(allTaskData);

  // Move the 'selected' marker to the stat card that belongs to the filter.
  for (const card of document.querySelectorAll('.stat-card')) {
    card.classList.remove('selected');
  }
  const counterIdByFilter = new Map([
    ['all', 'total-tasks'],
    ['active', 'active-tasks'],
    ['completed', 'completed-tasks'],
    ['error', 'error-tasks'],
  ]);
  const counterId = counterIdByFilter.get(filter);
  if (counterId !== undefined) {
    document.getElementById(counterId).parentElement.classList.add('selected');
  }
}
/**
 * Recomputes the global counters (total / active / completed / error), the
 * score banner and the highlighted stat card from a task payload.
 *
 * A completed task contributes to the score when its `result` parses to a
 * number in [0, 1]; the accuracy percentage is the total score divided by the
 * number of completed tasks.
 *
 * @param {Object<string, Array<Object>>} data - Tasks keyed by task type; each
 *   task is read as `{ status: { status, result } }`.
 */
function updateStatistics(data) {
  const ACTIVE_STATUSES = ['Running', 'Preparing', 'Initializing'];
  const DONE_STATUSES = ['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'];

  let totalTasks = 0;
  let activeTasks = 0;
  let completedTasks = 0;
  let errorTasks = 0;
  let totalScore = 0;

  Object.values(data).forEach((tasks) => {
    totalTasks += tasks.length;
    tasks.forEach((task) => {
      const status = task.status.status;
      if (ACTIVE_STATUSES.includes(status)) {
        activeTasks += 1;
      } else if (DONE_STATUSES.includes(status)) {
        completedTasks += 1;
        if (task.status.result) {
          // parseFloat never throws for JSON values; an unparsable result yields
          // NaN, which fails both range comparisons and is therefore skipped.
          const score = parseFloat(task.status.result);
          if (score >= 0 && score <= 1) {
            totalScore += score;
          }
        }
      } else if (status === 'Error') {
        errorTasks += 1;
      }
    });
  });

  document.getElementById('total-tasks').textContent = totalTasks;
  document.getElementById('active-tasks').textContent = activeTasks;
  document.getElementById('completed-tasks').textContent = completedTasks;
  document.getElementById('error-tasks').textContent = errorTasks;

  // Score banner: "<total score> / <completed tasks> (<accuracy>%)".
  // Only locally computed numbers are interpolated into this markup.
  const scoreDisplay = document.getElementById('score-display');
  if (completedTasks > 0) {
    const scoreFormatted = totalScore.toFixed(2);
    const accuracyPercentage = ((totalScore / completedTasks) * 100).toFixed(1);
    scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
  } else {
    scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
  }

  // Highlight the stat card that belongs to the active filter.
  document.querySelectorAll('.stat-card').forEach((card) => card.classList.remove('selected'));
  const counterIdByFilter = new Map([
    ['all', 'total-tasks'],
    ['active', 'active-tasks'],
    ['completed', 'completed-tasks'],
    ['error', 'error-tasks'],
  ]);
  const selectedCounterId = counterIdByFilter.get(currentFilter);
  if (selectedCounterId !== undefined) {
    document.getElementById(selectedCounterId).parentElement.classList.add('selected');
  }
}
/**
 * Renders the full task list into #task-container, honouring `currentFilter`.
 *
 * One collapsible section is produced per task type, with a header showing
 * per-type counts plus the aggregates from `categoryStats`, followed by one card
 * per task. Sections start collapsed unless their name is listed under the
 * sessionStorage key 'expandedTaskTypes'. All server-provided text (task type,
 * id, status, instruction, timestamps, result) is HTML-escaped before it is
 * placed into markup.
 *
 * @param {Object<string, Array<Object>>} data - Tasks keyed by task type. Each
 *   task is read as `{ id, instruction, status: { status, progress, max_steps,
 *   last_update, result } }`.
 */
function renderTasks(data) {
  const ACTIVE_STATUSES = ['Running', 'Preparing', 'Initializing'];
  const DONE_STATUSES = ['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'];
  const isActive = (task) => ACTIVE_STATUSES.includes(task.status.status);
  const isDone = (task) => DONE_STATUSES.includes(task.status.status);
  const isError = (task) => task.status.status === 'Error';

  // Escapes a value for safe interpolation into an innerHTML template, so that
  // text such as "Press <Enter>" is shown literally instead of parsed as a tag.
  const escapeHtml = (value) => String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  // Maps a server status string to the badge class and Font Awesome icon shown for it.
  const describeStatus = (status) => {
    switch (status) {
      case 'Not Started':
        return { statusClass: 'status-not-started', statusIcon: 'fa-hourglass-start' };
      case 'Preparing':
      case 'Initializing':
        return { statusClass: 'status-preparing', statusIcon: 'fa-spinner fa-pulse' };
      case 'Running':
        return { statusClass: 'status-running', statusIcon: 'fa-running' };
      case 'Done':
      case 'Done (Message Exit)':
      case 'Done (Max Steps)':
      case 'Done (Thought Exit)':
        return { statusClass: 'status-completed', statusIcon: 'fa-check-circle' };
      case 'Error':
        return { statusClass: 'status-error', statusIcon: 'fa-exclamation-circle' };
      default:
        return { statusClass: 'status-unknown', statusIcon: 'fa-question-circle' };
    }
  };

  // Reads the remembered list of expanded sections; unreadable data counts as "none".
  const readExpandedTaskTypes = () => {
    try {
      const stored = JSON.parse(sessionStorage.getItem('expandedTaskTypes') || '[]');
      return Array.isArray(stored) ? stored : [];
    } catch (error) {
      console.warn('Ignoring unreadable expandedTaskTypes in sessionStorage:', error);
      return [];
    }
  };

  // Persists the names of all sections that are currently expanded.
  const saveExpandedTaskTypes = () => {
    const expanded = [];
    document.querySelectorAll('.task-type').forEach((section) => {
      if (!section.classList.contains('collapsed')) {
        expanded.push(section.querySelector('.task-type-name').textContent.trim());
      }
    });
    sessionStorage.setItem('expandedTaskTypes', JSON.stringify(expanded));
  };

  // Builds the card for a single task.
  const createTaskCard = (taskType, task) => {
    const status = task.status.status;
    const progress = task.status.progress;
    const maxSteps = task.status.max_steps;
    const lastUpdate = task.status.last_update;
    const result = task.status.result;

    const taskCard = document.createElement('div');
    taskCard.className = 'task-card';
    // Data attributes let the refresh path find this card again.
    taskCard.setAttribute('data-task-id', task.id);
    taskCard.setAttribute('data-task-type', taskType);

    const taskHeader = document.createElement('div');
    taskHeader.className = 'task-header';
    const taskTitle = document.createElement('div');
    taskTitle.className = 'task-title';
    taskTitle.innerHTML = `<i class="fas fa-tasks"></i> Task ID: ${escapeHtml(task.id)}`;
    taskHeader.appendChild(taskTitle);

    const { statusClass, statusIcon } = describeStatus(status);
    const taskStatus = document.createElement('div');
    taskStatus.className = 'task-status';
    taskStatus.classList.add(statusClass);
    taskStatus.innerHTML = `<i class="fas ${statusIcon}"></i> ${escapeHtml(status)}`;
    taskHeader.appendChild(taskStatus);
    taskCard.appendChild(taskHeader);

    const taskInstruction = document.createElement('div');
    taskInstruction.className = 'task-instruction';
    taskInstruction.innerHTML = `<strong><i class="fas fa-info-circle"></i> Instruction:</strong> ${escapeHtml(task.instruction)}`;
    taskCard.appendChild(taskInstruction);

    const taskProgress = document.createElement('div');
    taskProgress.className = 'task-details';
    if (progress > 0) {
      const progressText = document.createElement('div');
      progressText.innerHTML = `<i class="fas fa-chart-line"></i> Progress: ${escapeHtml(progress)}/${escapeHtml(maxSteps)} step(s)`;
      taskProgress.appendChild(progressText);

      // Guard against a missing/zero step limit, which would yield NaN or Infinity.
      const stepLimit = Number(maxSteps);
      const percentage = stepLimit > 0 ? Math.min((progress / stepLimit) * 100, 100) : 0;

      const progressBar = document.createElement('div');
      progressBar.className = 'progress-bar';
      const progressFill = document.createElement('div');
      progressFill.className = 'progress-fill';
      progressFill.style.width = `${percentage}%`;
      progressBar.appendChild(progressFill);
      taskProgress.appendChild(progressBar);

      const progressPercentage = document.createElement('div');
      progressPercentage.className = 'progress-percentage';
      progressPercentage.textContent = `${Math.round(percentage)}%`;
      taskProgress.appendChild(progressPercentage);
    }
    if (lastUpdate) {
      const timestamp = document.createElement('div');
      timestamp.className = 'timestamp';
      timestamp.innerHTML = `<i class="far fa-clock"></i> Last Update: ${escapeHtml(lastUpdate)}`;
      taskProgress.appendChild(timestamp);
    }
    if (result) {
      const resultDiv = document.createElement('div');
      resultDiv.className = 'task-result';
      resultDiv.innerHTML = `<strong><i class="fas fa-flag-checkered"></i> Result:</strong> ${escapeHtml(result)}`;
      taskProgress.appendChild(resultDiv);
    }
    taskCard.appendChild(taskProgress);

    // Only tasks that have started have a detail page worth navigating to.
    if (status !== 'Not Started') {
      taskCard.style.cursor = 'pointer';
      taskCard.addEventListener('click', () => {
        window.location.href = `/task/${taskType}/${task.id}`;
      });
    }
    return taskCard;
  };

  // Builds the collapsible section (header + cards) for one task type.
  const createTypeSection = (taskType, tasks, expandedTaskTypes) => {
    const runningCount = tasks.filter(isActive).length;
    const completedCount = tasks.filter(isDone).length;
    const errorCount = tasks.filter(isError).length;

    const typeSection = document.createElement('div');
    typeSection.className = 'task-type';

    const typeHeader = document.createElement('div');
    typeHeader.className = 'task-type-header';
    const stats = categoryStats[taskType] || {};
    // An average score of 0 is real data (every completed task failed), so the
    // score is shown whenever there are completed tasks rather than when truthy.
    const hasScore = stats.completed_tasks > 0 && stats.avg_score !== undefined;
    typeHeader.innerHTML = `
      <span class="task-type-name"><i class="fas fa-layer-group"></i> ${escapeHtml(taskType)}</span>
      <div class="task-type-stats">
        ${errorCount > 0 ? `<span class="task-stat error"><i class="fas fa-exclamation-circle"></i> ${errorCount} error</span>` : ''}
        <span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
        <span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
        <span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
        ${hasScore ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
        ${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
        ${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
      </div>
    `;
    typeSection.appendChild(typeHeader);

    const tasksContainer = document.createElement('div');
    tasksContainer.className = 'tasks-container';
    // Sections start collapsed; a remembered expanded state is restored below.
    typeSection.classList.add('collapsed');
    tasksContainer.setAttribute('aria-hidden', 'true');

    if (tasks.length === 0) {
      const noTasks = document.createElement('div');
      noTasks.className = 'no-tasks';
      noTasks.innerHTML = '<i class="fas fa-info-circle"></i> No Tasks Available';
      tasksContainer.appendChild(noTasks);
    } else {
      // Add scrolling for large task lists
      if (tasks.length > 10) {
        tasksContainer.style.maxHeight = '600px';
        tasksContainer.style.overflowY = 'auto';
      }
      tasks.forEach((task) => {
        tasksContainer.appendChild(createTaskCard(taskType, task));
      });
    }
    typeSection.appendChild(tasksContainer);

    // Toggle collapse when clicking on the header
    typeHeader.addEventListener('click', (event) => {
      // Prevent toggling when clicking task cards
      if (event.target.closest('.task-card')) return;
      typeSection.classList.toggle('collapsed');
      const isCollapsed = typeSection.classList.contains('collapsed');
      tasksContainer.setAttribute('aria-hidden', String(isCollapsed));
      saveExpandedTaskTypes();
    });

    // Restore the expanded state remembered from before the last render/refresh.
    if (expandedTaskTypes.includes(taskType)) {
      typeSection.classList.remove('collapsed');
      tasksContainer.setAttribute('aria-hidden', 'false');
    }
    return typeSection;
  };

  const container = document.getElementById('task-container');
  container.innerHTML = '';

  // Apply the active filter; task types left without tasks are dropped.
  let filteredData = {};
  if (currentFilter === 'all') {
    filteredData = data;
  } else {
    const predicateByFilter = new Map([
      ['active', isActive],
      ['completed', isDone],
      ['error', isError],
    ]);
    const keep = predicateByFilter.get(currentFilter);
    Object.entries(data).forEach(([taskType, tasks]) => {
      const filteredTasks = keep ? tasks.filter((task) => keep(task)) : [];
      if (filteredTasks.length > 0) {
        filteredData[taskType] = filteredTasks;
      }
    });
  }

  if (Object.keys(filteredData).length === 0) {
    container.innerHTML = '<div class="no-tasks"><i class="fas fa-info-circle"></i> No tasks at the moment</div>';
    return;
  }

  // Read the remembered state once instead of re-parsing it for every task type.
  const expandedTaskTypes = readExpandedTaskTypes();
  Object.entries(filteredData).forEach(([taskType, tasks]) => {
    container.appendChild(createTypeSection(taskType, tasks, expandedTaskTypes));
  });
}
/**
 * Loads the list of selectable configurations from /api/available-configs,
 * stores it in `availableConfigs` and rebuilds the configuration dropdown.
 *
 * The payload is only accepted when the request succeeded and the body is an
 * array; otherwise the failure is logged and `availableConfigs` keeps its
 * previous value, so later code never sees a non-array there.
 *
 * @returns {Promise<Array<Object>>} The configurations, or an empty array on failure.
 */
function fetchAvailableConfigs() {
  return fetch('/api/available-configs')
    .then((response) => {
      // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
      if (!response.ok) {
        throw new Error(`Available-configs request failed with HTTP ${response.status}`);
      }
      return response.json();
    })
    .then((data) => {
      if (!Array.isArray(data)) {
        throw new TypeError('Expected /api/available-configs to return an array');
      }
      availableConfigs = data;
      populateConfigSelect();
      return data;
    })
    .catch((error) => {
      console.error('Error fetching available configs:', error);
      return [];
    });
}
/**
 * Rebuilds the options of #config-select from `availableConfigs`.
 *
 * With no configurations a single placeholder option is shown. Otherwise each
 * configuration becomes one option whose value is its index in
 * `availableConfigs` and whose label is
 * "<action_space> / <observation_type> / <model_name>".
 */
function populateConfigSelect() {
  const dropdown = document.getElementById('config-select');
  dropdown.innerHTML = '';

  if (availableConfigs.length === 0) {
    dropdown.innerHTML = '<option value="">No configurations found in results directory</option>';
    return;
  }

  // Label text goes through textContent, so config values are never parsed as HTML.
  const toOption = (config, position) => {
    const option = document.createElement('option');
    option.value = position;
    const actionSpace = config.action_space;
    const observationType = config.observation_type;
    const modelName = config.model_name;
    option.textContent = `${actionSpace} / ${observationType} / ${modelName}`;
    return option;
  };

  availableConfigs.forEach((config, position) => {
    dropdown.appendChild(toOption(config, position));
  });
}
/**
 * Applies the configuration currently chosen in #config-select.
 *
 * The option value is the index of the configuration in `availableConfigs`.
 * A missing, non-numeric or out-of-range value is ignored. Otherwise the
 * configuration is POSTed to /api/set-config; on success the returned
 * configuration becomes `currentConfig`, is displayed, and the task list is
 * reloaded. On failure the error is logged and the error state is displayed.
 *
 * @returns {Promise<void>} Settles when the change has been applied, rejected
 *   by validation, or the failure has been reported; it never rejects.
 */
function changeConfiguration() {
  const select = document.getElementById('config-select');
  // select.value is always a string; accept plain non-negative integers only,
  // so that e.g. 'abc' can never be used as an index and post an empty body.
  const rawValue = String(select.value);
  const selectedIndex = /^\d+$/.test(rawValue) ? Number(rawValue) : -1;
  if (selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
    return Promise.resolve();
  }
  const selectedConfig = availableConfigs[selectedIndex];

  // Send configuration change request
  return fetch('/api/set-config', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(selectedConfig),
  })
    .then((response) => {
      // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
      if (!response.ok) {
        throw new Error(`Set-config request failed with HTTP ${response.status}`);
      }
      return response.json();
    })
    .then((data) => {
      currentConfig = data;
      displayConfig(data);
      // Refresh tasks with new configuration
      fetchTasks();
    })
    .catch((error) => {
      console.error('Error setting config:', error);
      displayConfigError();
    });
}
/**
 * Loads the active configuration from /api/current-config, stores it in
 * `currentConfig`, displays it and syncs the configuration dropdown.
 *
 * On any failure (network error, non-2xx response, unreadable body) the error
 * is logged, the error state is displayed and `currentConfig` is left untouched.
 *
 * @returns {Promise<Object|undefined>} The configuration, or undefined on failure.
 */
function fetchConfig() {
  return fetch('/api/current-config')
    .then((response) => {
      // fetch() only rejects on network failure; HTTP errors must be checked
      // explicitly, otherwise an error body would be stored as the configuration.
      if (!response.ok) {
        throw new Error(`Current-config request failed with HTTP ${response.status}`);
      }
      return response.json();
    })
    .then((data) => {
      currentConfig = data;
      displayConfig(data);
      updateConfigSelect();
      return data;
    })
    .catch((error) => {
      console.error('Error fetching config:', error);
      displayConfigError();
    });
}
/**
 * Points #config-select at the option that corresponds to `currentConfig`.
 *
 * Two configurations are considered the same when action_space,
 * observation_type and model_name are all strictly equal. When the current
 * configuration is not among `availableConfigs`, the first option is selected
 * and a warning is logged. Nothing happens while there is no current
 * configuration or no available configurations.
 */
function updateConfigSelect() {
  if (!currentConfig) {
    return;
  }
  if (availableConfigs.length === 0) {
    return;
  }

  const dropdown = document.getElementById('config-select');
  const sameAsCurrent = (candidate) => {
    if (candidate.action_space !== currentConfig.action_space) return false;
    if (candidate.observation_type !== currentConfig.observation_type) return false;
    return candidate.model_name === currentConfig.model_name;
  };
  const matchIndex = availableConfigs.findIndex(sameAsCurrent);

  if (matchIndex !== -1) {
    dropdown.value = matchIndex;
    return;
  }

  // No match: the list is known to be non-empty here, so fall back to its first entry.
  dropdown.value = 0;
  console.warn('Current config not found in available configs, defaulting to first available config');
}
/**
 * Shows a configuration in the page header fields (#action-space,
 * #observation-type, #model-name, #max-steps).
 *
 * Missing values (undefined, null or an empty string) are shown as 'N/A'; a
 * missing `config` shows 'N/A' everywhere. The inline text color is cleared on
 * every field that is written.
 *
 * @param {Object|null|undefined} config - Configuration with the keys
 *   action_space, observation_type, model_name and max_steps.
 */
function displayConfig(config) {
  const fields = [
    ['action-space', 'action_space'],
    ['observation-type', 'observation_type'],
    ['model-name', 'model_name'],
    ['max-steps', 'max_steps'],
  ];
  const source = config || {};

  fields.forEach(([elementId, key]) => {
    const element = document.getElementById(elementId);
    if (!element) return;
    const value = source[key];
    const isMissing = value === undefined || value === null || value === '';
    element.textContent = isMissing ? 'N/A' : value;
    // displayConfigError() paints `.config-value` elements red via an inline
    // style. Presumably these fields are such elements (confirm against the
    // template), so clear the color or a successful reload would stay red.
    element.style.color = '';
  });
}
/**
 * Puts every `.config-value` element into its error state: the text becomes
 * 'Error loading' and the inline text color is set to red (#dc3545).
 */
function displayConfigError() {
  for (const valueElement of document.querySelectorAll('.config-value')) {
    valueElement.textContent = 'Error loading';
    valueElement.style.color = '#dc3545';
  }
}
/**
 * Computes per-task-type aggregates from a task payload.
 *
 * For each task type the result contains:
 *   - total_tasks, completed_tasks, running_tasks, error_tasks: counts by status
 *   - total_score: sum of the scores of completed tasks (2 decimals)
 *   - avg_score: total score / completed tasks (4 decimals, 0 when none completed)
 *   - avg_steps: mean `progress` of completed tasks that report progress > 0 (1 decimal)
 *   - completion_rate: completed / total, in percent (1 decimal)
 *
 * A completed task contributes a score only when its `result` parses to a
 * number in [0, 1]; completed tasks without such a result still count in the
 * avg_score denominator.
 *
 * @param {Object<string, Array<Object>>} data - Tasks keyed by task type; each
 *   task is read as `{ status: { status, result, progress } }`.
 * @returns {Object<string, Object>} Aggregates keyed by task type.
 */
function calculateCategoryStats(data) {
  const DONE_STATUSES = ['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'];
  const ACTIVE_STATUSES = ['Running', 'Preparing', 'Initializing'];
  // Rounds to the number of decimals implied by `factor` (100 -> 2 decimals).
  const roundTo = (value, factor) => Math.round(value * factor) / factor;

  const stats = {};
  Object.entries(data).forEach(([taskType, tasks]) => {
    const totalTasks = tasks.length;
    let completedTasks = 0;
    let runningTasks = 0;
    let errorTasks = 0;
    let totalScore = 0;
    let totalSteps = 0;
    let completedWithSteps = 0;

    tasks.forEach((task) => {
      const status = task.status.status;
      if (DONE_STATUSES.includes(status)) {
        completedTasks += 1;
        if (task.status.result) {
          // parseFloat never throws for JSON values; an unparsable result yields
          // NaN, which fails both range comparisons and is therefore skipped.
          const score = parseFloat(task.status.result);
          if (score >= 0 && score <= 1) {
            totalScore += score;
          }
        }
        // Only completed tasks that actually report steps enter the step average.
        if (task.status.progress && task.status.progress > 0) {
          totalSteps += task.status.progress;
          completedWithSteps += 1;
        }
      } else if (ACTIVE_STATUSES.includes(status)) {
        runningTasks += 1;
      } else if (status === 'Error') {
        errorTasks += 1;
      }
    });

    const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
    const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
    const completionRate = totalTasks > 0 ? (completedTasks / totalTasks) * 100 : 0;

    stats[taskType] = {
      total_tasks: totalTasks,
      completed_tasks: completedTasks,
      running_tasks: runningTasks,
      error_tasks: errorTasks,
      total_score: roundTo(totalScore, 100),
      avg_score: roundTo(avgScore, 10000),
      avg_steps: roundTo(avgSteps, 10),
      completion_rate: roundTo(completionRate, 10),
    };
  });
  return stats;
}