diff --git a/desktop_env/evaluators/getters/file.py b/desktop_env/evaluators/getters/file.py
index f4ab03a..f329d10 100644
--- a/desktop_env/evaluators/getters/file.py
+++ b/desktop_env/evaluators/getters/file.py
@@ -1,10 +1,13 @@
import os
+import logging
from typing import Dict, List, Set
from typing import Optional, Any, Union
from datetime import datetime
import requests
import pandas as pd
+logger = logging.getLogger("desktopenv.getter.file")
+
def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
"""
@@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
for i, (p, d) in enumerate(zip(paths, dests)):
_path = os.path.join(env.cache_dir, d)
- file = env.controller.get_file(p)
- if file is None:
+
+ try:
+ # Try to get file from VM
+ file = env.controller.get_file(p)
+ if file is None:
+ logger.warning(f"Failed to get file from VM: {p}")
+ if i in gives:
+ cache_paths.append(None)
+ continue
+
+ if i in gives:
+ cache_paths.append(_path)
+
+ # Write file with robust error handling
+ try:
+ # Ensure cache directory exists
+ os.makedirs(env.cache_dir, exist_ok=True)
+
+ with open(_path, "wb") as f:
+ f.write(file)
+ logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")
+
+ except IOError as e:
+ logger.error(f"IO error writing file {_path}: {e}")
+ if i in gives:
+ cache_paths[-1] = None # Replace the path we just added with None
+ except Exception as e:
+ logger.error(f"Unexpected error writing file {_path}: {e}")
+ if i in gives:
+ cache_paths[-1] = None
+
+ except Exception as e:
+ logger.error(f"Error processing file {p}: {e}")
if i in gives:
cache_paths.append(None)
- continue
-
- if i in gives:
- cache_paths.append(_path)
- with open(_path, "wb") as f:
- f.write(file)
+
return cache_paths[0] if len(cache_paths)==1 else cache_paths
diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py
index d0f6195..27356a2 100644
--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
"""
if result is None:
+ logger.warning("Result file path is None, returning 0.0")
+ return 0.
+
+ # Check if file exists
+ if not os.path.exists(result):
+ logger.warning(f"Result file does not exist: {result}, returning 0.0")
+ return 0.
+
+ try:
+ with open(result, 'r', encoding='utf-8') as f:
+ if is_yaml:
+ try:
+ # Use SafeLoader instead of Loader for better security and error handling
+ result_data: Dict[str, Any] = yaml.safe_load(f)
+ if result_data is None:
+ logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
+ return 0.
+ except yaml.YAMLError as e:
+ logger.error(f"YAML parsing error in file {result}: {e}")
+                    logger.error("File content might be corrupted or have invalid YAML syntax")
+ return 0.
+ except Exception as e:
+ logger.error(f"Unexpected error parsing YAML file {result}: {e}")
+ return 0.
+ else:
+ try:
+ result_data: Dict[str, Any] = json.load(f)
+ except json.JSONDecodeError as e:
+ logger.error(f"JSON parsing error in file {result}: {e}")
+ return 0.
+ except Exception as e:
+ logger.error(f"Unexpected error parsing JSON file {result}: {e}")
+ return 0.
+ except IOError as e:
+ logger.error(f"IO error reading file {result}: {e}")
+ return 0.
+ except Exception as e:
+ logger.error(f"Unexpected error reading file {result}: {e}")
return 0.
- with open(result) as f:
- if is_yaml:
- result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
- else:
- result: Dict[str, Any] = json.load(f)
expect_rules = rules.get("expect", {})
unexpect_rules = rules.get("unexpect", {})
metric = True
for r in expect_rules:
- value = result
- for k in r["key"]:
- try:
- value = value[k]
- except KeyError:
- return 0.
- metric = metric and _match_value_to_rule(value, r)
+ value = result_data
+ try:
+ for k in r["key"]:
+ try:
+ value = value[k]
+ except KeyError:
+ logger.debug(f"Key '{k}' not found in result data, returning 0.0")
+ return 0.
+ except TypeError:
+ logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
+ return 0.
+ metric = metric and _match_value_to_rule(value, r)
+ except Exception as e:
+ logger.error(f"Error processing expect rule {r}: {e}")
+ return 0.
+
for r in unexpect_rules:
- value = result
- for k in r["key"]:
- try:
- value = value[k]
- except KeyError:
- value = None
- break
- metric = metric and not _match_value_to_rule(value, r)
+ value = result_data
+ try:
+ for k in r["key"]:
+ try:
+ value = value[k]
+ except KeyError:
+ value = None
+ break
+ except TypeError:
+ value = None
+ break
+ metric = metric and not _match_value_to_rule(value, r)
+ except Exception as e:
+ logger.error(f"Error processing unexpect rule {r}: {e}")
+ return 0.
+
return float(metric)
diff --git a/desktop_env/evaluators/metrics/slides.py b/desktop_env/evaluators/metrics/slides.py
index bee4f1a..81c4af9 100644
--- a/desktop_env/evaluators/metrics/slides.py
+++ b/desktop_env/evaluators/metrics/slides.py
@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]
+ if not original_slide_images:
+ return 0.
+
the_image = original_slide_images[0]
the_modified_image = None
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
table2 = shape2.table
if enable_debug:
debug_logger.debug(f" Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
+ debug_logger.debug(f" Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")
+
+ # Check if tables have the same dimensions
+ if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
+ debug_logger.debug(f" Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
+ debug_logger.debug(f" Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
+ return 0
+
for row_idx in range(len(table1.rows)):
for col_idx in range(len(table1.columns)):
cell1 = table1.cell(row_idx, col_idx)
cell2 = table2.cell(row_idx, col_idx)
+ # Check if cells have the same number of paragraphs
+ if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
+ debug_logger.debug(f" Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
+ debug_logger.debug(f" Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
+ return 0
+
for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
+ # Check if paragraphs have the same number of runs
+ if len(para1.runs) != len(para2.runs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
+ debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
+ debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
+ return 0
+
for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
# Check font color
if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if shape1.text.strip() != shape2.text.strip() and examine_text:
return 0
+            # check if the number of paragraphs is the same
+ if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
+ debug_logger.debug(f" Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
+ debug_logger.debug(f" Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
+ return 0
+
# check if the paragraphs are the same
para_idx = 0
for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if para1.level != para2.level and examine_indent:
return 0
+            # check if the number of runs is the same
+ if len(para1.runs) != len(para2.runs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
+ debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
+ debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
+ return 0
+
for run1, run2 in zip(para1.runs, para2.runs):
# check if the font properties are the same
@@ -634,6 +679,12 @@ def compare_pptx_files(file1_path, file2_path, **options):
debug_logger.debug(f" MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'")
return 0
+ # Check if text shapes have the same number of paragraphs
+ if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}")
+ return 0
+
# Compare alignment of each paragraph
for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)):
from pptx.enum.text import PP_ALIGN
diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py
index db51850..5a0f79d 100644
--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str]
# function _parse_sheet_idx {{{ #
if isinstance(sheet_idx, int):
try:
- index: str = result_sheet_names[sheet_idx]
- except:
+ if not result_sheet_names or sheet_idx >= len(result_sheet_names):
+ logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}")
+ index = ""
+ else:
+ index: str = result_sheet_names[sheet_idx]
+ logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}")
+ except Exception as e:
+ logger.error(f"Error resolving sheet index {sheet_idx}: {e}")
index = ""
book: BOOK = result
elif sheet_idx.startswith("RI"):
@@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float:
"""
if result is None:
+ logger.error("Result file path is None")
+ return 0.
+
+ # Check if result file exists
+ if not os.path.exists(result):
+ logger.error(f"Result file not found: {result}")
return 0.
try:
+ logger.info(f"Loading result file: {result}")
xlworkbookr: Workbook = openpyxl.load_workbook(filename=result)
pdworkbookr = pd.ExcelFile(result)
- except:
+ logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}")
+ except Exception as e:
+ logger.error(f"Failed to load result file {result}: {e}")
return 0.
worksheetr_names: List[str] = pdworkbookr.sheet_names
@@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float:
# props: dict like {attribute: {"method": str, "ref": anything}}
# supported attributes: value & those supported by utils._read_cell_style
- sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
- if sheet is None:
- return 0.
- # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
- cell: Cell = sheet[r["coordinate"]]
- metric: bool = True
- for prpt, rule in r["props"].items():
- if prpt == "value":
- val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"])
- else:
- val = _read_cell_style(prpt, cell)
+ try:
+ sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
+ if sheet is None:
+ logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}")
+ return 0.
+ # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
+ cell: Cell = sheet[r["coordinate"]]
+ metric: bool = True
+ for prpt, rule in r["props"].items():
+ if prpt == "value":
+ try:
+ parsed_result = parse_idx(r["sheet_idx"], result, expected)
+ logger.debug(f"parse_idx result: {parsed_result}")
+ val = read_cell_value(*parsed_result, r["coordinate"])
+ logger.debug(f"Cell {r['coordinate']} value: {val}")
+ except Exception as e:
+ logger.error(f"Failed to read cell value at {r['coordinate']}: {e}")
+ val = None
+ else:
+ try:
+ val = _read_cell_style(prpt, cell)
+ except Exception as e:
+ logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}")
+ val = None
- metric = metric and _match_value_to_rule(val, rule)
+ metric = metric and _match_value_to_rule(val, rule)
+ except Exception as e:
+ logger.error(f"Error in check_cell processing: {e}")
+ return 0.
logger.debug("Assertion: %s[%s] :%s - %s"
, r["sheet_idx"], r["coordinate"]
diff --git a/desktop_env/evaluators/metrics/utils.py b/desktop_env/evaluators/metrics/utils.py
index e512a26..1136655 100644
--- a/desktop_env/evaluators/metrics/utils.py
+++ b/desktop_env/evaluators/metrics/utils.py
@@ -4,6 +4,7 @@ import functools
import itertools
import logging
import operator
+import os
import re
import zipfile
#import pandas as pd
@@ -33,10 +34,11 @@ V = TypeVar("Value")
logger = logging.getLogger("desktopenv.metrics.utils")
-_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
- , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
- , ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
- ]
+_xlsx_namespaces = [
+ ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
+ ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
+ ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
+]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
@@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx
def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
# read_cell_value {{{ #
+ logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")
+
+ # Check if file exists
+ if not os.path.exists(xlsx_file):
+ logger.error(f"Excel file not found: {xlsx_file}")
+ return None
+
try:
with zipfile.ZipFile(xlsx_file, "r") as z_f:
try:
@@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
, namespaces=_xlsx_ns_mapping
)(sheet)
if len(cells) == 0:
+ logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
return None
cell: _Element = cells[0]
- except zipfile.BadZipFile:
+ except zipfile.BadZipFile as e:
+ logger.error(f"Bad zip file {xlsx_file}: {e}")
+ return None
+ except KeyError as e:
+ logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
+ return None
+ except Exception as e:
+ logger.error(f"Error reading {xlsx_file}: {e}")
return None
cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
diff --git a/mm_agents/anthropic/main.py b/mm_agents/anthropic/main.py
index 4cffc16..493a7bb 100644
--- a/mm_agents/anthropic/main.py
+++ b/mm_agents/anthropic/main.py
@@ -369,9 +369,10 @@ class AnthropicAgent:
)
except (APIError, APIStatusError, APIResponseValidationError) as e:
- self.logger.exception(f"Anthropic API error: {str(e)}")
+ logger.exception(f"Anthropic API error: {str(e)}")
try:
- self.logger.warning("Retrying with backup API key...")
+ logger.warning("Retrying with backup API key...")
+
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
tools=tools,
betas=betas,
)
- self.logger.info("Successfully used backup API key")
+ logger.info("Successfully used backup API key")
except Exception as backup_e:
- self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
+ logger.exception(f"Backup API call also failed: {str(backup_e)}")
return None, None
except Exception as e:
- self.logger.exception(f"Error in Anthropic API: {str(e)}")
+ logger.exception(f"Error in Anthropic API: {str(e)}")
return None, None
response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
actions = ["DONE"]
return reasonings, actions
- def reset(self, *args, **kwargs):
+    def reset(self, _logger=None, *args, **kwargs):
"""
Reset the agent's state.
"""
+ global logger
+ if _logger:
+ logger = _logger
+ else:
+ logger = logging.getLogger("desktopenv.agent")
self.messages = []
- self.logger.info(f"{self.class_name} reset.")
\ No newline at end of file
+ logger.info(f"{self.class_name} reset.")
+
diff --git a/monitor/.env b/monitor/.env
index 26de7b2..2d71a24 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -11,4 +11,4 @@ MODEL_NAME=computer-use-preview
MAX_STEPS=100
FLASK_PORT=80
FLASK_HOST=0.0.0.0
-FLASK_DEBUG=true
\ No newline at end of file
+FLASK_DEBUG=false
\ No newline at end of file
diff --git a/monitor/main.py b/monitor/main.py
index 1657a78..acdf95a 100644
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
+from functools import cache
import os
import json
import time
+import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv
+
# Load environment variables from .env file
load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
+def initialize_default_config():
+ """Initialize default configuration from the first available config in results directory"""
+ global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+ if os.path.exists(RESULTS_BASE_PATH):
+ try:
+ # Scan for the first available configuration
+ for action_space in os.listdir(RESULTS_BASE_PATH):
+ action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
+ if os.path.isdir(action_space_path):
+ for obs_type in os.listdir(action_space_path):
+ obs_path = os.path.join(action_space_path, obs_type)
+ if os.path.isdir(obs_path):
+ for model_name in os.listdir(obs_path):
+ model_path = os.path.join(obs_path, model_name)
+ if os.path.isdir(model_path):
+ # Use the first available configuration as default
+ ACTION_SPACE = action_space
+ OBSERVATION_TYPE = obs_type
+ MODEL_NAME = model_name
+ RESULTS_PATH = model_path
+ print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+ return
+ except Exception as e:
+ print(f"Error scanning results directory for default config: {e}")
+
+ # Fallback to original environment-based path if no configs found
+ RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+ print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+
+# Initialize default configuration
+initialize_default_config()
+
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+if RESULTS_PATH not in TASK_STATUS_CACHE:
+ # Initialize cache for this results path
+ TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+@cache
def load_task_list():
with open(TASK_CONFIG_PATH, 'r') as f:
return json.load(f)
+@cache
def get_task_info(task_type, task_id):
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
# Check if the status is already cached
current_time = time.time()
last_cache_time = None
- if cache_key in TASK_STATUS_CACHE:
- cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+ if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
+ cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
last_cache_time = cached_time
# If cached status is "Done", check if it's within the stability period
if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
# Cache the status if it is done or error
if status.startswith("Done") or status == "Error":
current_time = last_cache_time if last_cache_time else current_time
- TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
+ TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
"status": task_status
})
+@app.route('/api/config')
+def api_config():
+ """Get configuration information from environment variables - deprecated, use /api/current-config instead"""
+ config_info = {
+ "task_config_path": TASK_CONFIG_PATH,
+ "results_base_path": RESULTS_BASE_PATH,
+ "action_space": ACTION_SPACE,
+ "observation_type": OBSERVATION_TYPE,
+ "model_name": MODEL_NAME,
+ "max_steps": MAX_STEPS,
+ "examples_base_path": EXAMPLES_BASE_PATH
+ }
+ return jsonify(config_info)
+
+@app.route('/api/available-configs')
+def api_available_configs():
+ """Get all available configuration combinations by scanning the results directory"""
+ configs = []
+
+ if os.path.exists(RESULTS_BASE_PATH):
+ try:
+ # Scan action spaces
+ for action_space in os.listdir(RESULTS_BASE_PATH):
+ action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
+ if os.path.isdir(action_space_path):
+ # Scan observation types
+ for obs_type in os.listdir(action_space_path):
+ obs_path = os.path.join(action_space_path, obs_type)
+ if os.path.isdir(obs_path):
+ # Scan model names
+ for model_name in os.listdir(obs_path):
+ model_path = os.path.join(obs_path, model_name)
+ if os.path.isdir(model_path):
+ configs.append({
+ "action_space": action_space,
+ "observation_type": obs_type,
+ "model_name": model_name,
+ "path": model_path
+ })
+ except Exception as e:
+ print(f"Error scanning results directory: {e}")
+
+ return jsonify(configs)
+
+@app.route('/api/current-config')
+def api_current_config():
+ """Get current configuration"""
+ return jsonify({
+ "action_space": ACTION_SPACE,
+ "observation_type": OBSERVATION_TYPE,
+ "model_name": MODEL_NAME,
+ "max_steps": MAX_STEPS,
+ "results_path": RESULTS_PATH
+ })
+
+@app.route('/api/set-config', methods=['POST'])
+def api_set_config():
+ """Set current configuration"""
+ global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+ data = request.get_json()
+ if not data:
+ return jsonify({"error": "No data provided"}), 400
+
+ # Update global variables
+ ACTION_SPACE = data.get('action_space', ACTION_SPACE)
+ OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
+ MODEL_NAME = data.get('model_name', MODEL_NAME)
+
+ # Update results path
+ RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+
+ if RESULTS_PATH not in TASK_STATUS_CACHE:
+ # Initialize cache for this results path
+ TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+ return jsonify({
+ "action_space": ACTION_SPACE,
+ "observation_type": OBSERVATION_TYPE,
+ "model_name": MODEL_NAME,
+ "max_steps": MAX_STEPS,
+ "results_path": RESULTS_PATH
+ })
+
if __name__ == '__main__':
# Check if necessary directories exist
if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
port = 8080
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
- app.run(host=host, port=port, debug=debug)
\ No newline at end of file
+ app.run(host=host, port=port, debug=debug, threaded=True)
\ No newline at end of file
diff --git a/monitor/static/index.css b/monitor/static/index.css
index 0e20e4a..215bcbf 100644
--- a/monitor/static/index.css
+++ b/monitor/static/index.css
@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
+
+.layout-container {
+ position: relative;
+ max-width: 1200px;
+ margin: 20px auto;
+ padding: 0 20px;
+}
+
+.main-content {
+ background: #fff;
+ border-radius: 14px;
+ box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+ padding: 36px 44px;
+}
+
+/* Floating Config Sidebar */
+.config-sidebar {
+ position: fixed;
+ top: 20px;
+ left: -280px;
+ width: 300px;
+ height: calc(100vh - 40px);
+ z-index: 1000;
+ transition: left 0.3s ease;
+}
+
+.config-sidebar:hover {
+ left: 0;
+}
+
+.config-toggle-btn {
+ position: absolute;
+ right: -50px;
+ top: 50%;
+ transform: translateY(-50%);
+ width: 50px;
+ height: 50px;
+ background: linear-gradient(135deg, #007bff, #0056b3);
+ border-radius: 0 25px 25px 0;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ color: white;
+ font-size: 1.2em;
+ cursor: pointer;
+ box-shadow: 2px 0 10px rgba(0,0,0,0.2);
+ transition: all 0.3s ease;
+}
+
+.config-toggle-btn:hover {
+ background: linear-gradient(135deg, #0056b3, #004085);
+ transform: translateY(-50%) scale(1.05);
+}
+
+.config-sidebar:hover .config-toggle-btn {
+ opacity: 0.8;
+}
+
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
text-shadow: 0 1px 2px rgba(0,0,0,0.05);
}
+.accuracy-percentage {
+ font-size: 0.7em;
+ font-weight: 600;
+ color: #ffffff;
+ margin-left: 8px;
+ background: rgba(255, 255, 255, 0.1);
+ padding: 4px 8px;
+ border-radius: 12px;
+ display: inline-block;
+ vertical-align: middle;
+}
+
.stat-card span {
font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
.task-type-stats {
display: flex;
- gap: 16px;
flex-wrap: wrap;
+ gap: 8px;
+ align-items: center;
}
.task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
color: #b71c1c;
}
+/* Task type statistics styles */
+.task-stat.score {
+ color: #ffc107;
+ background: rgba(255, 193, 7, 0.1);
+}
+
+.task-stat.steps {
+ color: #17a2b8;
+ background: rgba(23, 162, 184, 0.1);
+}
+
+.task-stat.rate {
+ color: #28a745;
+ background: rgba(40, 167, 69, 0.1);
+}
+
.tasks-container {
padding: 20px;
transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,170 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
background: #a5c7e5;
}
+/* Configuration Panel Styles */
+.config-panel {
+ background: #fff;
+ border-radius: 0 14px 14px 0;
+ box-shadow: 0 8px 32px rgba(0,0,0,0.15);
+ overflow: hidden;
+ height: 100%;
+ display: flex;
+ flex-direction: column;
+}
+
+.config-header {
+ display: flex;
+ align-items: center;
+ padding: 16px 20px;
+ background: linear-gradient(135deg, #6c757d, #495057);
+ color: white;
+ flex-shrink: 0;
+}
+
+.config-header i {
+ margin-right: 10px;
+ font-size: 1.1em;
+}
+
+.config-header span {
+ font-weight: 600;
+ font-size: 1.1em;
+}
+
+.config-content {
+ padding: 20px;
+ flex: 1;
+ overflow-y: auto;
+}
+
+.config-selector {
+ margin-bottom: 20px;
+ padding-bottom: 15px;
+ border-bottom: 1px solid #dee2e6;
+}
+
+.selector-item {
+ display: flex;
+ flex-direction: column;
+ gap: 8px;
+}
+
+.selector-item label {
+ font-weight: 600;
+ color: #495057;
+ font-size: 0.9em;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+}
+
+.selector-item select {
+ padding: 8px 12px;
+ border: 2px solid #e9ecef;
+ border-radius: 6px;
+ background: white;
+ font-size: 0.9em;
+ color: #495057;
+ cursor: pointer;
+ transition: all 0.3s ease;
+}
+
+.selector-item select:focus {
+ outline: none;
+ border-color: #007bff;
+ box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
+}
+
+.selector-item select:hover {
+ border-color: #007bff;
+}
+
+.config-list {
+ display: flex;
+ flex-direction: column;
+ gap: 15px;
+}
+
+.config-item {
+ display: flex;
+ flex-direction: column;
+ background: #f8f9fa;
+ padding: 12px;
+ border-radius: 8px;
+ border-left: 4px solid #007bff;
+ transition: all 0.3s ease;
+}
+
+.config-item:hover {
+ transform: translateX(3px);
+ box-shadow: 0 4px 12px rgba(0,123,255,0.15);
+}
+
+.config-label {
+    font-weight: 600;
+    color: #495057;
+    font-size: 0.85em;
+    margin-bottom: 6px;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.config-value {
+ color: #007bff;
+ font-family: 'Courier New', monospace;
+ font-size: 0.9em;
+ font-weight: 600;
+ word-break: break-word;
+}
+
+.config-path {
+ font-size: 0.8em;
+ line-height: 1.3;
+}
+
+/* Responsive design for sidebar layout */
+@media (max-width: 1024px) {
+ .config-sidebar {
+ left: -250px;
+ width: 250px;
+ }
+
+ .config-toggle-btn {
+ right: -40px;
+ width: 40px;
+ height: 40px;
+ font-size: 1em;
+ }
+}
+
+@media (max-width: 768px) {
+ .layout-container {
+ padding: 0 10px;
+ }
+
+ .main-content {
+ padding: 20px 25px;
+ }
+
+ .config-sidebar {
+ left: -220px;
+ width: 220px;
+ height: calc(100vh - 20px);
+ top: 10px;
+ }
+
+ .config-toggle-btn {
+ right: -35px;
+ width: 35px;
+ height: 35px;
+ font-size: 0.9em;
+ }
+
+ .config-content {
+ padding: 15px;
+ }
+
+ .config-item {
+ padding: 10px;
+ }
+}
+
diff --git a/monitor/static/index.js b/monitor/static/index.js
index 4dd34e5..ed2910e 100644
--- a/monitor/static/index.js
+++ b/monitor/static/index.js
@@ -1,5 +1,8 @@
document.addEventListener('DOMContentLoaded', () => {
- fetchTasks();
+ fetchAvailableConfigs().then(() => {
+ fetchConfig();
+ fetchTasks();
+ });
// Bind filter functionality
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
let allTaskData = null;
let currentFilter = 'all';
+let availableConfigs = [];
+let currentConfig = null;
+let categoryStats = {};
function refreshPage() {
// Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
fetch('/api/tasks/brief')
.then(response => response.json())
.then(data => {
- // Update stored data
allTaskData = data;
+ categoryStats = calculateCategoryStats(data);
// Only update statistics and task status, do not fully re-render
updateStatistics(data);
updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
.then(response => response.json())
.then(data => {
allTaskData = data;
+ categoryStats = calculateCategoryStats(data);
renderTasks(data);
updateStatistics(data);
})
@@ -208,13 +215,15 @@ function updateStatistics(data) {
document.getElementById('completed-tasks').textContent = completedTasks;
document.getElementById('error-tasks').textContent = errorTasks;
- // Update score display with formatted score
+ // Update score display with formatted score and accuracy percentage
const scoreDisplay = document.getElementById('score-display');
if (completedTasks > 0) {
const scoreFormatted = totalScore.toFixed(2);
- scoreDisplay.innerHTML = `${scoreFormatted} / ${completedTasks}`;
+ const averageScore = totalScore / completedTasks;
+ const accuracyPercentage = (averageScore * 100).toFixed(1);
+ scoreDisplay.innerHTML = `${scoreFormatted} / ${completedTasks} (${accuracyPercentage}%)`;
} else {
- scoreDisplay.innerHTML = '0.00 / 0';
+ scoreDisplay.innerHTML = '0.00 / 0 (0.0%)';
}
// Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
// Create header with task type name and statistics
const typeHeader = document.createElement('div');
typeHeader.className = 'task-type-header';
+
+ // Get category stats for this task type
+ const stats = categoryStats[taskType] || {};
+
typeHeader.innerHTML = `
${taskType}
@@ -286,6 +299,9 @@ function renderTasks(data) {
${tasks.length} total
${runningCount} active
${completedCount} completed
+ ${stats.avg_score ? ` ${stats.avg_score} avg score` : ''}
+ ${stats.avg_steps ? ` ${stats.avg_steps} avg steps` : ''}
+ ${stats.completion_rate ? ` ${stats.completion_rate}% completed` : ''}
`;
typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
container.appendChild(typeSection);
});
}
-// add auto-refresh with time interval 10 seconds
-setInterval(() => {
- refreshPage();
-}, 10000); // 10 seconds interval
+
+function fetchAvailableConfigs() {
+ return fetch('/api/available-configs')
+ .then(response => response.json())
+ .then(data => {
+ availableConfigs = data;
+ populateConfigSelect();
+ return data;
+ })
+ .catch(error => {
+ console.error('Error fetching available configs:', error);
+ return [];
+ });
+}
+
+function populateConfigSelect() {
+ const select = document.getElementById('config-select');
+ select.innerHTML = '';
+
+ if (availableConfigs.length === 0) {
+ select.innerHTML = '';
+ return;
+ }
+
+ // Add available configurations
+ availableConfigs.forEach((config, index) => {
+ const option = document.createElement('option');
+ option.value = index;
+ option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
+ select.appendChild(option);
+ });
+}
+
+function changeConfiguration() {
+ const select = document.getElementById('config-select');
+ const selectedIndex = select.value;
+
+ if (selectedIndex === '' || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
+ return;
+ }
+
+ const selectedConfig = availableConfigs[selectedIndex];
+
+ // Send configuration change request
+ fetch('/api/set-config', {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(selectedConfig)
+ })
+ .then(response => response.json())
+ .then(data => {
+ currentConfig = data;
+ displayConfig(data);
+ // Refresh tasks with new configuration
+ fetchTasks();
+ })
+ .catch(error => {
+ console.error('Error setting config:', error);
+ displayConfigError();
+ });
+}
+
+function fetchConfig() {
+ return fetch('/api/current-config')
+ .then(response => response.json())
+ .then(data => {
+ currentConfig = data;
+ displayConfig(data);
+ updateConfigSelect();
+ return data;
+ })
+ .catch(error => {
+ console.error('Error fetching config:', error);
+ displayConfigError();
+ });
+}
+
+function updateConfigSelect() {
+ if (!currentConfig || availableConfigs.length === 0) return;
+
+ const select = document.getElementById('config-select');
+ const currentConfigIndex = availableConfigs.findIndex(config =>
+ config.action_space === currentConfig.action_space &&
+ config.observation_type === currentConfig.observation_type &&
+ config.model_name === currentConfig.model_name
+ );
+
+ if (currentConfigIndex !== -1) {
+ select.value = currentConfigIndex;
+ } else {
+ // Current config not found in available configs, select the first one if available
+ if (availableConfigs.length > 0) {
+ select.value = 0;
+ console.warn('Current config not found in available configs, defaulting to first available config');
+ }
+ }
+}
+
+function displayConfig(config) {
+ document.getElementById('action-space').textContent = config.action_space || 'N/A';
+ document.getElementById('observation-type').textContent = config.observation_type || 'N/A';
+ document.getElementById('model-name').textContent = config.model_name || 'N/A';
+ document.getElementById('max-steps').textContent = config.max_steps || 'N/A';
+}
+
+function displayConfigError() {
+ const configValues = document.querySelectorAll('.config-value');
+ configValues.forEach(element => {
+ element.textContent = 'Error loading';
+ element.style.color = '#dc3545';
+ });
+}
+
+function calculateCategoryStats(data) {
+ const stats = {};
+
+ Object.entries(data).forEach(([taskType, tasks]) => {
+ let totalTasks = tasks.length;
+ let completedTasks = 0;
+ let runningTasks = 0;
+ let errorTasks = 0;
+ let totalScore = 0;
+ let totalSteps = 0;
+ let completedWithSteps = 0;
+
+ tasks.forEach(task => {
+ const status = task.status.status;
+
+ if (['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'].includes(status)) {
+ completedTasks++;
+
+ // Calculate score if available
+ if (task.status.result) {
+ try {
+ const score = parseFloat(task.status.result);
+ if (!isNaN(score) && score >= 0 && score <= 1) {
+ totalScore += score;
+ }
+ } catch (e) {
+ // Ignore parsing errors
+ }
+ }
+
+ // Calculate steps for completed tasks
+ if (task.status.progress && task.status.progress > 0) {
+ totalSteps += task.status.progress;
+ completedWithSteps++;
+ }
+
+ } else if (['Running', 'Preparing', 'Initializing'].includes(status)) {
+ runningTasks++;
+
+ } else if (status === 'Error') {
+ errorTasks++;
+ }
+ });
+
+ // Calculate averages
+ const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
+ const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
+ const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;
+
+ stats[taskType] = {
+ total_tasks: totalTasks,
+ completed_tasks: completedTasks,
+ running_tasks: runningTasks,
+ error_tasks: errorTasks,
+ total_score: Math.round(totalScore * 100) / 100,
+ avg_score: Math.round(avgScore * 10000) / 10000,
+ avg_steps: Math.round(avgSteps * 10) / 10,
+ completion_rate: Math.round(completionRate * 10) / 10
+ };
+ });
+
+ return stats;
+}
diff --git a/monitor/templates/index.html b/monitor/templates/index.html
index 0c34f3c..ef91ab9 100644
--- a/monitor/templates/index.html
+++ b/monitor/templates/index.html
@@ -12,19 +12,62 @@
-
-
OSWorld Monitor System Online
-
-
-
-
-
-
Score:
-
Loading...
+
+
+
-
+
+
+
OSWorld Monitor System Online
+
+
+
+
+
+ Score:
+ Loading...
+
+
+
+
Loading...
@@ -46,10 +89,11 @@
Total Tasks
-
-
-
-
Loading task data...
+
+
+
+
Loading task data...
+