diff --git a/evaluation_examples/test_small.json b/evaluation_examples/test_small.json index dbf95d3..3e0f127 100644 --- a/evaluation_examples/test_small.json +++ b/evaluation_examples/test_small.json @@ -29,33 +29,6 @@ "46407397-a7d5-4c6b-92c6-dbe038b1457b", "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", "510f64c8-9bcc-4be1-8d30-638705850618", - "897e3b53-5d4d-444b-85cb-2cdc8a97d903", - "c867c42d-a52d-4a24-8ae3-f75d256b5618", - "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", - "b5062e3e-641c-4e3a-907b-ac864d2e7652", - "48d05431-6cd5-4e76-82eb-12b60d823f7d", - "eb303e01-261e-4972-8c07-c9b4e7a4922a", - "d1acdb87-bb67-4f30-84aa-990e56a09c92", - "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", - "8e116af7-7db7-4e35-a68b-b0939c066c78", - "716a6079-22da-47f1-ba73-c9d58f986a38", - "2373b66a-092d-44cb-bfd7-82e86e7a3b4d" - ], - "os": [ - "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", - "5812b315-e7bd-4265-b51f-863c02174c28" - ], - "thunderbird": [ - "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", - "15c3b339-88f7-4a86-ab16-e71c58dcb01e" - ], - "vlc": [ - "59f21cfb-0120-4326-b255-a5b827b38967", - "8f080098-ddb1-424c-b438-4e96e5e4786e" - ], - "vs_code": [ - "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", - "53ad5833-3455-407b-bbc6-45b4c79ab8fb", - "276cc624-87ea-4f08-ab93-f770e3790175" + "897e3b53-5d4d-444b-85cb-2cdc8a97d903" ] } \ No newline at end of file diff --git a/monitor/Dockerfile b/monitor/Dockerfile index 4692b7f..04311e0 100644 --- a/monitor/Dockerfile +++ b/monitor/Dockerfile @@ -1,14 +1,11 @@ FROM python:3.9-slim -WORKDIR /app +WORKDIR /app/monitor # Install dependencies COPY monitor/requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt -# Copy application code -COPY monitor/ ./ - # Expose port (will be overridden by environment variable) ARG FLASK_PORT=8080 EXPOSE ${FLASK_PORT} diff --git a/monitor/README.md b/monitor/README.md index b746eb6..e2d640a 100644 --- a/monitor/README.md +++ b/monitor/README.md @@ -28,11 +28,11 @@ The monitor can be configured by editing the `.env` file in the monitor director For example: ```bash # .env -TASK_CONFIG_PATH=evaluation_examples/test_small.json -EXAMPLES_BASE_PATH=evaluation_examples/examples -RESULTS_BASE_PATH=results_operator_aws/pyautogui/screenshot/computer-use-preview +TASK_CONFIG_PATH=../evaluation_examples/test_small.json +EXAMPLES_BASE_PATH=../evaluation_examples/examples +RESULTS_BASE_PATH=../results_operator_aws/pyautogui/screenshot/computer-use-preview MAX_STEPS=50 -FLASK_PORT=8080 +FLASK_PORT=80 FLASK_HOST=0.0.0.0 FLASK_DEBUG=true ``` diff --git a/monitor/docker-compose.yml b/monitor/docker-compose.yml index f5463fa..f7fb056 100644 --- a/monitor/docker-compose.yml +++ b/monitor/docker-compose.yml @@ -8,6 +8,7 @@ services: ports: - "${FLASK_PORT:-8080}:8080" volumes: + - .:/app/monitor - ../evaluation_examples:/app/evaluation_examples - ../results_operator_aws:/app/results_operator_aws env_file: diff --git a/monitor/main.py b/monitor/main.py index 1e1bfec..1d6bd96 100644 --- a/monitor/main.py +++ b/monitor/main.py @@ -15,9 +15,9 @@ load_dotenv() app = Flask(__name__) # Load configuration from environment variables -TASK_CONFIG_PATH = os.getenv("TASK_CONFIG_PATH", "evaluation_examples/test_small.json") -EXAMPLES_BASE_PATH = os.getenv("EXAMPLES_BASE_PATH", "evaluation_examples/examples") -RESULTS_BASE_PATH = os.getenv("RESULTS_BASE_PATH", "results_operator_aws/pyautogui/screenshot/computer-use-preview") +TASK_CONFIG_PATH = os.getenv("TASK_CONFIG_PATH", "../evaluation_examples/test_small.json") +EXAMPLES_BASE_PATH = os.getenv("EXAMPLES_BASE_PATH", "../evaluation_examples/examples") +RESULTS_BASE_PATH = os.getenv("RESULTS_BASE_PATH", "../results_operator_aws/pyautogui/screenshot/computer-use-preview") MAX_STEPS = int(os.getenv("MAX_STEPS", "50")) def load_task_list(): @@ -71,11 +71,59 @@ def get_task_status(task_type, task_id): last_step = steps[-1] - # check if the task is done + # Check the log file for agent responses and exit conditions + log_data = { + "agent_responses": [], + "exit_condition": None, + "last_message": None + } + + if os.path.exists(log_file): + try: + with open(log_file, 'r') as f: + log_content = f.readlines() + last_response = None + + for i, line in enumerate(log_content): + # Extract agent responses for each step + if "Responses: [" in line: + response_text = line.split("Responses: [")[1].strip() + if response_text.endswith("]"): + response_text = response_text[:-1] # Remove closing bracket + + # Clean up the response text - remove quotes + if response_text.startswith("'") and response_text.endswith("'"): + response_text = response_text[1:-1] # Remove surrounding quotes + elif response_text == '"]': # Empty response + response_text = "" + + # Handle list of responses + if response_text and "', '" in response_text: + responses = [r.strip("'") for r in response_text.split("', '")] + log_data["agent_responses"].append(responses[0]) # Use first response + last_response = responses[0] # Keep track of the last response + elif response_text: + log_data["agent_responses"].append(response_text) + last_response = response_text # Keep track of the last response + + # Check for exit conditions near the end of the log + if "The state of the agent is not correct" in line or "Exit condition met" in line: + log_data["exit_condition"] = line.strip() + # If this is a message exit, save the last response as the last message + if "message_exit: True" in line and last_response: + log_data["last_message"] = last_response + except Exception as e: + log_data["error"] = f"Error parsing log file: {str(e)}" + + # check if the task is done based on both trajectory and log if last_step.get("done", False): status = "Done" elif last_step.get("Error", False): status = "Error" + elif log_data.get("exit_condition") and "message_exit: True" in log_data.get("exit_condition", ""): + status = "Done (Message Exit)" + elif len(steps) >= MAX_STEPS: + status = "Done (Max Steps)" else: status = "Running" @@ -86,7 +134,7 @@ def get_task_status(task_type, task_id): last_update = "None" result_content = "Task not completed" - if status == "Done": + if status.startswith("Done"): if os.path.exists(result_file): with open(result_file, 'r') as f: result_content = f.read().strip() @@ -99,6 +147,7 @@ def get_task_status(task_type, task_id): "max_steps": MAX_STEPS, "last_update": last_update, "steps": steps, + "log_data": log_data, "result": result_content } diff --git a/monitor/static/favicon.ico b/monitor/static/favicon.ico new file mode 100644 index 0000000..e69de29 diff --git a/monitor/static/favicon.png b/monitor/static/favicon.png new file mode 100644 index 0000000..7fe40ab Binary files /dev/null and b/monitor/static/favicon.png differ diff --git a/monitor/static/index.css b/monitor/static/index.css index 87a3a28..1bdb589 100644 --- a/monitor/static/index.css +++ b/monitor/static/index.css @@ -56,6 +56,17 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; } margin-bottom: 10px; display: block; } + +/* Specific colors for different stat cards */ +.stat-card:nth-child(4) i { color: #007bff; } /* Total - Blue */ +.stat-card:nth-child(4):hover { background: linear-gradient(135deg, #f0f7ff, #e6f0fb); } +.stat-card:nth-child(1) i { color: #17a2b8; } /* Active - Cyan */ +.stat-card:nth-child(1):hover { background: linear-gradient(135deg, #e3fafd, #d1f2f6); } +.stat-card:nth-child(2) i { color: #28a745; } /* Completed - Green */ +.stat-card:nth-child(2):hover { background: linear-gradient(135deg, #e6f9ea, #d4f7db); } +.stat-card:nth-child(3) i { color: #dc3545; } /* Error - Red */ +.stat-card:nth-child(3):hover { background: linear-gradient(135deg, #feeaec, #fcd8db); } + .stat-card span { font-size: 2em; font-weight: 600; @@ -162,11 +173,12 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; } padding: 20px; transition: all 0.4s cubic-bezier(.4,0,.2,1); opacity: 1; - max-height: 2000px; + max-height: none; + overflow-y: auto; } .task-type.collapsed .tasks-container { - max-height: 0; + max-height: 0 !important; opacity: 0; padding: 0; overflow: hidden; @@ -187,6 +199,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; } position: relative; z-index: 2; } +.task-card:last-child { + margin-bottom: 5px; +} .task-card:hover { box-shadow: 0 10px 30px rgba(0,123,255,0.12); transform: translateY(-3px); } .task-header { display: flex; justify-content: space-between; margin-bottom: 14px; align-items: center; } .task-title { font-size: 1.2em; font-weight: 600; color: #1a237e; } @@ -196,6 +211,8 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; } .status-running { background: linear-gradient(135deg, #e3f2fd, #bbdefb); color: #0d47a1; } .status-completed { background: linear-gradient(135deg, #e8f5e9, #c8e6c9); color: #1b5e20; } .status-error { background: linear-gradient(135deg, #ffebee, #ffcdd2); color: #b71c1c; } +.status-unknown { background: linear-gradient(135deg, #e0e0e0, #bdbdbd); color: #424242; } +.status-done-max-steps { background: linear-gradient(135deg, #e8f5e9, #c8e6c9); color: #1b5e20; } .task-details { margin-top: 16px; } .progress-bar { height: 12px; background-color: #eef2f7; border-radius: 6px; margin-top: 10px; overflow: hidden; box-shadow: inset 0 1px 3px rgba(0,0,0,0.1); } .progress-fill { height: 100%; background: linear-gradient(90deg, #007bff, #00c6ff); width: 0%; transition: width 0.6s ease; } @@ -302,3 +319,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; } color: #0078d7; } +/* Custom scrollbar for tasks container */ +.tasks-container::-webkit-scrollbar { + width: 8px; +} + +.tasks-container::-webkit-scrollbar-track { + background: #f1f5f9; + border-radius: 4px; +} + +.tasks-container::-webkit-scrollbar-thumb { + background: #c0d6e8; + border-radius: 4px; +} + +.tasks-container::-webkit-scrollbar-thumb:hover { + background: #a5c7e5; +} + diff --git a/monitor/static/index.js b/monitor/static/index.js index 6769bb3..a8ef7b5 100644 --- a/monitor/static/index.js +++ b/monitor/static/index.js @@ -5,6 +5,7 @@ document.addEventListener('DOMContentLoaded', () => { document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all')); document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active')); document.getElementById('completed-tasks').parentElement.addEventListener('click', () => setTaskFilter('completed')); + document.getElementById('error-tasks').parentElement.addEventListener('click', () => setTaskFilter('error')); }); let allTaskData = null; @@ -49,6 +50,8 @@ function setTaskFilter(filter) { document.getElementById('active-tasks').parentElement.classList.add('selected'); } else if (filter === 'completed') { document.getElementById('completed-tasks').parentElement.classList.add('selected'); + } else if (filter === 'error') { + document.getElementById('error-tasks').parentElement.classList.add('selected'); } } @@ -57,14 +60,17 @@ function updateStatistics(data) { let totalTasks = 0; let activeTasks = 0; let completedTasks = 0; + let errorTasks = 0; Object.entries(data).forEach(([taskType, tasks]) => { totalTasks += tasks.length; tasks.forEach(task => { if (task.status.status === 'Running' || task.status.status === 'Preparing' || task.status.status === 'Initializing') { activeTasks++; - } else if (task.status.status === 'Done') { + } else if (task.status.status === 'Done' || task.status.status === 'Done (Message Exit)' || task.status.status === 'Done (Max Steps)') { completedTasks++; + } else if (task.status.status === 'Error') { + errorTasks++; } }); }); @@ -72,6 +78,19 @@ function updateStatistics(data) { document.getElementById('total-tasks').textContent = totalTasks; document.getElementById('active-tasks').textContent = activeTasks; document.getElementById('completed-tasks').textContent = completedTasks; + document.getElementById('error-tasks').textContent = errorTasks; + + // 高亮显示当前选中的统计卡片 + document.querySelectorAll('.stat-card').forEach(card => card.classList.remove('selected')); + if (currentFilter === 'all') { + document.getElementById('total-tasks').parentElement.classList.add('selected'); + } else if (currentFilter === 'active') { + document.getElementById('active-tasks').parentElement.classList.add('selected'); + } else if (currentFilter === 'completed') { + document.getElementById('completed-tasks').parentElement.classList.add('selected'); + } else if (currentFilter === 'error') { + document.getElementById('error-tasks').parentElement.classList.add('selected'); + } } function renderTasks(data) { @@ -86,7 +105,9 @@ function renderTasks(data) { if (currentFilter === 'active') { filteredTasks = tasks.filter(task => ['Running', 'Preparing', 'Initializing'].includes(task.status.status)); } else if (currentFilter === 'completed') { - filteredTasks = tasks.filter(task => task.status.status === 'Done'); + filteredTasks = tasks.filter(task => task.status.status === 'Done' || task.status.status === 'Done (Message Exit)' || task.status.status === 'Done (Max Steps)'); + } else if (currentFilter === 'error') { + filteredTasks = tasks.filter(task => task.status.status === 'Error'); } if (filteredTasks.length > 0) { filteredData[taskType] = filteredTasks; @@ -107,7 +128,7 @@ function renderTasks(data) { tasks.forEach(task => { if (task.status.status === 'Running' || task.status.status === 'Preparing' || task.status.status === 'Initializing') { runningCount++; - } else if (task.status.status === 'Done') { + } else if (task.status.status === 'Done' || task.status.status === 'Done (Message Exit)' || task.status.status === 'Done (Max Steps)') { completedCount++; } else if (task.status.status === 'Error') { errorCount++; @@ -146,6 +167,12 @@ function renderTasks(data) { noTasks.innerHTML = ' No Tasks Available'; tasksContainer.appendChild(noTasks); } else { + // Add scrolling for large task lists + if (tasks.length > 10) { + tasksContainer.style.maxHeight = '600px'; + tasksContainer.style.overflowY = 'auto'; + } + tasks.forEach(task => { const taskCard = document.createElement('div'); taskCard.className = 'task-card'; @@ -178,6 +205,8 @@ function renderTasks(data) { statusIcon = 'fa-running'; break; case 'Done': + case 'Done (Message Exit)': + case 'Done (Max Steps)': statusClass = 'status-completed'; statusIcon = 'fa-check-circle'; break; @@ -185,6 +214,10 @@ function renderTasks(data) { statusClass = 'status-error'; statusIcon = 'fa-exclamation-circle'; break; + default: + statusClass = 'status-unknown'; + statusIcon = 'fa-question-circle'; + break; } taskStatus.classList.add(statusClass); @@ -202,7 +235,7 @@ function renderTasks(data) { if (task.status.progress > 0) { const progressText = document.createElement('div'); - progressText.innerHTML = ` Progress: ${task.status.progress} step(s)`; + progressText.innerHTML = ` Progress: ${task.status.progress}/${task.status.max_steps} step(s)`; taskProgress.appendChild(progressText); const progressBar = document.createElement('div'); diff --git a/monitor/static/task_detail.css b/monitor/static/task_detail.css index c399504..06da1cb 100644 --- a/monitor/static/task_detail.css +++ b/monitor/static/task_detail.css @@ -49,6 +49,11 @@ h2 { color: #0056b3; margin-top: 36px; font-size: 1.6em; font-weight: 600; } .step-card { border: none; background: #fafdff; + box-shadow: 0 4px 15px rgba(0,0,0,0.08); + margin-bottom: 25px; + border-radius: 10px; + overflow: hidden; + transition: all 0.3s; padding: 22px 26px; margin-bottom: 24px; border-radius: 10px; @@ -57,19 +62,29 @@ h2 { color: #0056b3; margin-top: 36px; font-size: 1.6em; font-weight: 600; } position: relative; overflow: hidden; } +.step-intent { + padding: 10px 20px; + background: #f0f7ff; + border-left: 4px solid #4285f4; + margin: 10px 20px; + font-size: 0.95em; + line-height: 1.5; + color: #333; +} +.exit-condition { + background: #fff8e1; + padding: 8px 12px; + border-radius: 6px; + font-family: 'Courier New', monospace; + font-size: 0.9em; + border-left: 3px solid #ffa000; +} + .step-card:hover { box-shadow: 0 10px 30px rgba(0,123,255,0.1); transform: translateY(-3px); } -.step-card:before { - content: ''; - position: absolute; - left: 0; - top: 0; - height: 100%; - width: 4px; - background: linear-gradient(to bottom, #007bff, #00c6ff); -} + .step-header { display: flex; justify-content: space-between; margin-bottom: 12px; align-items: center; } .step-title { font-weight: 600; color: #1a237e; font-size: 1.1em; } .step-time { color: #6c757d; font-size: 0.92em; } @@ -90,10 +105,7 @@ pre { box-shadow: 0 5px 15px rgba(0,0,0,0.08); transition: all 0.3s; } -.step-image:hover { - transform: scale(1.01); - box-shadow: 0 8px 25px rgba(0,0,0,0.12); -} + .no-steps { color: #8492a6; font-style: italic; @@ -154,5 +166,142 @@ pre { .status-not-started { background: linear-gradient(135deg, #f0f0f0, #e6e6e6); color: #555; } .status-preparing, .status-initializing { background: linear-gradient(135deg, #fff7e0, #ffe8a3); color: #8a6d00; } .status-running { background: linear-gradient(135deg, #e3f2fd, #bbdefb); color: #0d47a1; } -.status-done { background: linear-gradient(135deg, #e8f5e9, #c8e6c9); color: #1b5e20; } +.status-done, .status-done-message-exit, .status-done-max-steps { background: linear-gradient(135deg, #e8f5e9, #c8e6c9); color: #1b5e20; } .status-error { background: linear-gradient(135deg, #ffebee, #ffcdd2); color: #b71c1c; } + +.step-intent { + padding: 10px 20px; + background: #f0f7ff; + border-left: 4px solid #4285f4; + margin: 10px 0; + font-size: 0.95em; + line-height: 1.5; + color: #333; +} + +.exit-condition { + background: #fff8e1; + padding: 8px 12px; + border-radius: 6px; + font-family: 'Courier New', monospace; + font-size: 0.9em; + border-left: 3px solid #ffa000; + position: relative; +} + +.exit-message { + background: #e8f5e9; + padding: 12px 16px; + border-radius: 6px; + font-family: 'Segoe UI', Arial, sans-serif; + font-size: 1em; + border-left: 3px solid #4caf50; + position: relative; + line-height: 1.5; + color: #1b5e20; + margin-top: 4px; + box-shadow: 0 2px 5px rgba(0,0,0,0.05); +} + +.exit-condition-help { + margin-top: 8px; + font-family: 'Segoe UI', Arial, sans-serif; + font-size: 0.85em; + color: #666; + background: #f5f5f5; + padding: 6px 10px; + border-radius: 4px; + border-left: 2px solid #9e9e9e; +} + +/* 工具提示样式 */ +.tooltip { + position: relative; + display: inline-block; + margin-left: 8px; + cursor: help; +} + +.tooltip .tooltip-text { + visibility: hidden; + min-width: 200px; + max-width: 500px; + width: max-content; + background-color: #333; + color: #fff; + text-align: left; + border-radius: 6px; + padding: 10px 12px; + position: absolute; + z-index: 10; + bottom: 125%; + left: 50%; + transform: translateX(-50%); + opacity: 0; + transition: opacity 0.3s; + font-weight: normal; + font-size: 0.85em; + white-space: normal; + word-wrap: break-word; + line-height: 1.4; + box-shadow: 0 2px 10px rgba(0,0,0,0.2); +} + +.tooltip .tooltip-text::after { + content: ""; + position: absolute; + top: 100%; + left: 50%; + margin-left: -5px; + border-width: 5px; + border-style: solid; + border-color: #333 transparent transparent transparent; +} + +.tooltip:hover .tooltip-text { + visibility: visible; + opacity: 1; +} + +/* 移动设备上的工具提示调整 */ +@media (max-width: 768px) { + .tooltip .tooltip-text { + width: auto; + max-width: 250px; + left: auto; + right: 0; + transform: none; + } + + .tooltip .tooltip-text::after { + left: auto; + right: 10px; + } +} + +/* 进度条样式 */ +.progress-bar { + height: 12px; + background-color: #eef2f7; + border-radius: 6px; + margin: 10px 0; + overflow: hidden; + box-shadow: inset 0 1px 3px rgba(0,0,0,0.1); + width: 100%; + max-width: 300px; +} + +.progress-fill { + height: 100%; + background: linear-gradient(90deg, #007bff, #00c6ff); + width: 0%; + transition: width 0.6s ease; +} + +.progress-percentage { + text-align: right; + font-size: 0.85em; + color: #6c757d; + margin-top: 4px; + font-weight: normal; +} diff --git a/monitor/templates/index.html b/monitor/templates/index.html index ceffbea..4388f64 100644 --- a/monitor/templates/index.html +++ b/monitor/templates/index.html @@ -4,6 +4,9 @@ OSWorld Monitor + + + @@ -12,11 +15,6 @@

OSWorld Monitor System Online

-
- - Loading... -
Total Tasks
-
Loading... @@ -27,6 +25,16 @@ Loading...
Completed
+
+ + Loading... +
Error
+
+
+ + Loading... +
Total Tasks
+
diff --git a/monitor/templates/task_detail.html b/monitor/templates/task_detail.html index fd5fb8f..8fcd26c 100644 --- a/monitor/templates/task_detail.html +++ b/monitor/templates/task_detail.html @@ -4,6 +4,9 @@ Task Detail: {{ task_id }} + + + @@ -21,11 +24,41 @@
Instruction
{{ task_info.instruction }}
Status
-
{{ task_status.status }}
+
+ {{ task_status.status }} + {% if task_status.status == 'Error' %} + + + Error occurred during task execution + + {% elif task_status.status == 'Done (Message Exit)' %} + + + Task completed with a message exit condition + + {% elif task_status.status == 'Done (Max Steps)' %} + + + Maximum steps reached, task completed + + {% endif %} +
Current Step
{{ task_status.progress }}
Last Update
{{ task_status.last_update or 'None' }}
+ {% if task_status.log_data and task_status.log_data.exit_condition %} +
Exit Condition
+
+ {{ task_status.log_data.exit_condition }} +
+ {% endif %} + {% if task_status.status == 'Done (Message Exit)' and task_status.log_data and task_status.log_data.last_message %} +
Exit Message
+
+ {{ task_status.log_data.last_message }} +
+ {% endif %}
Result
{{ task_status.result }}
@@ -40,7 +73,15 @@
Step {{ step.step_num }}
{{ step.action_timestamp }}
-
{{ step.action.action }}
+ {% if task_status.log_data and task_status.log_data.agent_responses and loop.index0 < task_status.log_data.agent_responses|length %} +
+ Agent Intent: {{ task_status.log_data.agent_responses[loop.index0] }} +
+ {% endif %} +
{% if step.action and step.action.action %}{{ step.action.action }}
+                            {% elif step.Error %}Error: {{ step.Error }}
+                            {% else %}{{ step|tojson }}
+                            {% endif %}
{% if step.screenshot_file %}