Merge remote-tracking branch 'upstream/main' into fix_chrome

2025-07-14 07:14:19 +00:00
parent 5d90faa548 74b7c189af
commit 90c4e894a4
11 changed files with 887 additions and 84 deletions
--- a/desktop_env/evaluators/getters/file.py
+++ b/desktop_env/evaluators/getters/file.py
@@ -1,10 +1,13 @@
 import os
+import logging
 from typing import Dict, List, Set
 from typing import Optional, Any, Union
 from datetime import datetime
 import requests
 import pandas as pd

+logger = logging.getLogger("desktopenv.getter.file")
+

 def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
    """
@@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option

    for i, (p, d) in enumerate(zip(paths, dests)):
        _path = os.path.join(env.cache_dir, d)
-        file = env.controller.get_file(p)
-        if file is None:
+        
+        try:
+            # Try to get file from VM
+            file = env.controller.get_file(p)
+            if file is None:
+                logger.warning(f"Failed to get file from VM: {p}")
+                if i in gives:
+                    cache_paths.append(None)
+                continue
+
+            if i in gives:
+                cache_paths.append(_path)
+                
+            # Write file with robust error handling
+            try:
+                # Ensure cache directory exists
+                os.makedirs(env.cache_dir, exist_ok=True)
+                
+                with open(_path, "wb") as f:
+                    f.write(file)
+                logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")
+                
+            except IOError as e:
+                logger.error(f"IO error writing file {_path}: {e}")
+                if i in gives:
+                    cache_paths[-1] = None  # Replace the path we just added with None
+            except Exception as e:
+                logger.error(f"Unexpected error writing file {_path}: {e}")
+                if i in gives:
+                    cache_paths[-1] = None
+                    
+        except Exception as e:
+            logger.error(f"Error processing file {p}: {e}")
            if i in gives:
                cache_paths.append(None)
-            continue
-
-        if i in gives:
-            cache_paths.append(_path)
-        with open(_path, "wb") as f:
-            f.write(file)
+                
    return cache_paths[0] if len(cache_paths)==1 else cache_paths


--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
    """

    if result is None:
+        logger.warning("Result file path is None, returning 0.0")
+        return 0.
+        
+    # Check if file exists
+    if not os.path.exists(result):
+        logger.warning(f"Result file does not exist: {result}, returning 0.0")
+        return 0.
+    
+    try:
+        with open(result, 'r', encoding='utf-8') as f:
+            if is_yaml:
+                try:
+                    # Use SafeLoader instead of Loader for better security and error handling
+                    result_data: Dict[str, Any] = yaml.safe_load(f)
+                    if result_data is None:
+                        logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
+                        return 0.
+                except yaml.YAMLError as e:
+                    logger.error(f"YAML parsing error in file {result}: {e}")
+                    logger.error(f"File content might be corrupted or have invalid YAML syntax")
+                    return 0.
+                except Exception as e:
+                    logger.error(f"Unexpected error parsing YAML file {result}: {e}")
+                    return 0.
+            else:
+                try:
+                    result_data: Dict[str, Any] = json.load(f)
+                except json.JSONDecodeError as e:
+                    logger.error(f"JSON parsing error in file {result}: {e}")
+                    return 0.
+                except Exception as e:
+                    logger.error(f"Unexpected error parsing JSON file {result}: {e}")
+                    return 0.
+    except IOError as e:
+        logger.error(f"IO error reading file {result}: {e}")
+        return 0.
+    except Exception as e:
+        logger.error(f"Unexpected error reading file {result}: {e}")
        return 0.
-    with open(result) as f:
-        if is_yaml:
-            result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
-        else:
-            result: Dict[str, Any] = json.load(f)

    expect_rules = rules.get("expect", {})
    unexpect_rules = rules.get("unexpect", {})

    metric = True
    for r in expect_rules:
-        value = result
-        for k in r["key"]:
-            try:
-                value = value[k]
-            except KeyError:
-                return 0.
-        metric = metric and _match_value_to_rule(value, r)
+        value = result_data
+        try:
+            for k in r["key"]:
+                try:
+                    value = value[k]
+                except KeyError:
+                    logger.debug(f"Key '{k}' not found in result data, returning 0.0")
+                    return 0.
+                except TypeError:
+                    logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
+                    return 0.
+            metric = metric and _match_value_to_rule(value, r)
+        except Exception as e:
+            logger.error(f"Error processing expect rule {r}: {e}")
+            return 0.
+            
    for r in unexpect_rules:
-        value = result
-        for k in r["key"]:
-            try:
-                value = value[k]
-            except KeyError:
-                value = None
-                break
-        metric = metric and not _match_value_to_rule(value, r)
+        value = result_data
+        try:
+            for k in r["key"]:
+                try:
+                    value = value[k]
+                except KeyError:
+                    value = None
+                    break
+                except TypeError:
+                    value = None
+                    break
+            metric = metric and not _match_value_to_rule(value, r)
+        except Exception as e:
+            logger.error(f"Error processing unexpect rule {r}: {e}")
+            return 0.
+            
    return float(metric)


--- a/desktop_env/evaluators/metrics/slides.py
+++ b/desktop_env/evaluators/metrics/slides.py
@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
    original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
    modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]

+    if not original_slide_images:
+        return 0.
+    
    the_image = original_slide_images[0]

    the_modified_image = None
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
                table2 = shape2.table
                if enable_debug:
                    debug_logger.debug(f"  Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
+                    debug_logger.debug(f"  Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")
+                
+                # Check if tables have the same dimensions
+                if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
+                    if enable_debug:
+                        debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
+                        debug_logger.debug(f"      Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
+                        debug_logger.debug(f"      Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
+                    return 0
+                
                for row_idx in range(len(table1.rows)):
                    for col_idx in range(len(table1.columns)):
                        cell1 = table1.cell(row_idx, col_idx)
                        cell2 = table2.cell(row_idx, col_idx)

+                        # Check if cells have the same number of paragraphs
+                        if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
+                            if enable_debug:
+                                debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
+                                debug_logger.debug(f"      Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
+                                debug_logger.debug(f"      Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
+                            return 0
+
                        for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
+                            # Check if paragraphs have the same number of runs
+                            if len(para1.runs) != len(para2.runs):
+                                if enable_debug:
+                                    debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
+                                    debug_logger.debug(f"      Para1 runs: {len(para1.runs)}")
+                                    debug_logger.debug(f"      Para2 runs: {len(para2.runs)}")
+                                return 0
+
                            for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
                                # Check font color
                                if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
                if shape1.text.strip() != shape2.text.strip() and examine_text:
                    return 0

+                # check if the number of paragraphs are the same
+                if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
+                    if enable_debug:
+                        debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
+                        debug_logger.debug(f"      Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
+                        debug_logger.debug(f"      Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
+                    return 0
+
                    # check if the paragraphs are the same
                para_idx = 0
                for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
                    if para1.level != para2.level and examine_indent:
                        return 0

+                    # check if the number of runs are the same
+                    if len(para1.runs) != len(para2.runs):
+                        if enable_debug:
+                            debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
+                            debug_logger.debug(f"      Para1 runs: {len(para1.runs)}")
+                            debug_logger.debug(f"      Para2 runs: {len(para2.runs)}")
+                        return 0
+
                    for run1, run2 in zip(para1.runs, para2.runs):

                        # check if the font properties are the same                        
@@ -634,6 +679,12 @@ def compare_pptx_files(file1_path, file2_path, **options):
                        debug_logger.debug(f"    MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'")
                    return 0
                
+                # Check if text shapes have the same number of paragraphs
+                if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs):
+                    if enable_debug:
+                        debug_logger.debug(f"    MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}")
+                    return 0
+                
                # Compare alignment of each paragraph
                for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)):
                    from pptx.enum.text import PP_ALIGN
--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str]
    #  function _parse_sheet_idx {{{ # 
    if isinstance(sheet_idx, int):
        try:
-            index: str = result_sheet_names[sheet_idx]
-        except:
+            if not result_sheet_names or sheet_idx >= len(result_sheet_names):
+                logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}")
+                index = ""
+            else:
+                index: str = result_sheet_names[sheet_idx]
+                logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}")
+        except Exception as e:
+            logger.error(f"Error resolving sheet index {sheet_idx}: {e}")
            index = ""
        book: BOOK = result
    elif sheet_idx.startswith("RI"):
@@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float:
    """

    if result is None:
+        logger.error("Result file path is None")
+        return 0.
+
+    # Check if result file exists
+    if not os.path.exists(result):
+        logger.error(f"Result file not found: {result}")
        return 0.

    try:
+        logger.info(f"Loading result file: {result}")
        xlworkbookr: Workbook = openpyxl.load_workbook(filename=result)
        pdworkbookr = pd.ExcelFile(result)
-    except:
+        logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}")
+    except Exception as e:
+        logger.error(f"Failed to load result file {result}: {e}")
        return 0.
    worksheetr_names: List[str] = pdworkbookr.sheet_names

@@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float:
            # props: dict like {attribute: {"method": str, "ref": anything}}
            #   supported attributes: value & those supported by utils._read_cell_style

-            sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
-            if sheet is None:
-                return 0.
-            # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
-            cell: Cell = sheet[r["coordinate"]]
-            metric: bool = True
-            for prpt, rule in r["props"].items():
-                if prpt == "value":
-                    val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"])
-                else:
-                    val = _read_cell_style(prpt, cell)
+            try:
+                sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
+                if sheet is None:
+                    logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}")
+                    return 0.
+                # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
+                cell: Cell = sheet[r["coordinate"]]
+                metric: bool = True
+                for prpt, rule in r["props"].items():
+                    if prpt == "value":
+                        try:
+                            parsed_result = parse_idx(r["sheet_idx"], result, expected)
+                            logger.debug(f"parse_idx result: {parsed_result}")
+                            val = read_cell_value(*parsed_result, r["coordinate"])
+                            logger.debug(f"Cell {r['coordinate']} value: {val}")
+                        except Exception as e:
+                            logger.error(f"Failed to read cell value at {r['coordinate']}: {e}")
+                            val = None
+                    else:
+                        try:
+                            val = _read_cell_style(prpt, cell)
+                        except Exception as e:
+                            logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}")
+                            val = None

-                metric = metric and _match_value_to_rule(val, rule)
+                    metric = metric and _match_value_to_rule(val, rule)
+            except Exception as e:
+                logger.error(f"Error in check_cell processing: {e}")
+                return 0.

            logger.debug("Assertion: %s[%s] :%s - %s"
                         , r["sheet_idx"], r["coordinate"]
--- a/desktop_env/evaluators/metrics/utils.py
+++ b/desktop_env/evaluators/metrics/utils.py
@@ -4,6 +4,7 @@ import functools
 import itertools
 import logging
 import operator
+import os
 import re
 import zipfile
 #import pandas as pd
@@ -33,10 +34,11 @@ V = TypeVar("Value")

 logger = logging.getLogger("desktopenv.metrics.utils")

-_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
-    , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
-    , ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
-                    ]
+_xlsx_namespaces = [
+    ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
+    ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
+    ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
+]
 _xlsx_ns_mapping = dict(_xlsx_namespaces)
 _xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
 _xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
@@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx

 def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
    #  read_cell_value {{{ # 
+    logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")
+    
+    # Check if file exists
+    if not os.path.exists(xlsx_file):
+        logger.error(f"Excel file not found: {xlsx_file}")
+        return None
+    
    try:
        with zipfile.ZipFile(xlsx_file, "r") as z_f:
            try:
@@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
                                               , namespaces=_xlsx_ns_mapping
                                               )(sheet)
                if len(cells) == 0:
+                    logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
                    return None
                cell: _Element = cells[0]
-    except zipfile.BadZipFile:
+    except zipfile.BadZipFile as e:
+        logger.error(f"Bad zip file {xlsx_file}: {e}")
+        return None
+    except KeyError as e:
+        logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Error reading {xlsx_file}: {e}")
        return None

    cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
--- a/mm_agents/anthropic/main.py
+++ b/mm_agents/anthropic/main.py
@@ -369,9 +369,10 @@ class AnthropicAgent:
                )

        except (APIError, APIStatusError, APIResponseValidationError) as e:
-            self.logger.exception(f"Anthropic API error: {str(e)}")
+            logger.exception(f"Anthropic API error: {str(e)}")
            try:
-                self.logger.warning("Retrying with backup API key...")
+                logger.warning("Retrying with backup API key...")
+
                backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
                
                if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
                        tools=tools,
                        betas=betas,
                    )
-                self.logger.info("Successfully used backup API key")
+                logger.info("Successfully used backup API key")
            except Exception as backup_e:
-                self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
+                logger.exception(f"Backup API call also failed: {str(backup_e)}")
                return None, None

        except Exception as e:
-            self.logger.exception(f"Error in Anthropic API: {str(e)}")
+            logger.exception(f"Error in Anthropic API: {str(e)}")
            return None, None

        response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
            actions = ["DONE"]
        return reasonings, actions
    
-    def reset(self, *args, **kwargs):
+    def reset(self, _logger = None, *args, **kwargs):
         """
         Reset the agent's state.
+
+        Rebinds the module-level ``logger`` and clears ``self.messages``.
+
+        Args:
+            _logger: Logger to install as the module-level ``logger``. When
+                omitted or falsy, ``logging.getLogger("desktopenv.agent")``
+                is used instead.
+            *args, **kwargs: Accepted but not used by this method.
         """
+        # ``logger`` is module-wide, so rebinding it here also changes the logger
+        # seen by any other code in this module that reads that global.
+        global logger
+        if _logger:
+            logger = _logger
+        else:
+            logger = logging.getLogger("desktopenv.agent")
         self.messages = []
-        self.logger.info(f"{self.class_name} reset.")
+        logger.info(f"{self.class_name} reset.")
+
--- a/monitor/.env
+++ b/monitor/.env
@@ -11,4 +11,4 @@ MODEL_NAME=computer-use-preview
 MAX_STEPS=100
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0
-FLASK_DEBUG=true
+FLASK_DEBUG=false
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -1,14 +1,17 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

+from functools import cache
 import os
 import json
 import time
+import subprocess
 from datetime import datetime
 from pathlib import Path
 from flask import Flask, render_template_string, jsonify, send_file, request, render_template
 from dotenv import load_dotenv

+
 # Load environment variables from .env file
 load_dotenv()

@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
 MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
 MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))

+def initialize_default_config():
+    """Pick the default configuration from the results directory.
+
+    Walks RESULTS_BASE_PATH as <action_space>/<observation_type>/<model_name>
+    and adopts the first combination whose three levels are all directories,
+    updating the ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME and RESULTS_PATH
+    globals. Entries are visited in sorted order because os.listdir returns
+    names in arbitrary order, which would otherwise let the chosen default
+    differ between runs and machines.
+
+    If the base path does not exist, holds no complete combination, or the
+    scan raises, RESULTS_PATH is built from the current values of the three
+    globals instead.
+    """
+    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+    if os.path.exists(RESULTS_BASE_PATH):
+        try:
+            # Scan for the first available configuration (deterministic order)
+            for action_space in sorted(os.listdir(RESULTS_BASE_PATH)):
+                action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
+                if not os.path.isdir(action_space_path):
+                    continue
+                for obs_type in sorted(os.listdir(action_space_path)):
+                    obs_path = os.path.join(action_space_path, obs_type)
+                    if not os.path.isdir(obs_path):
+                        continue
+                    for model_name in sorted(os.listdir(obs_path)):
+                        model_path = os.path.join(obs_path, model_name)
+                        if not os.path.isdir(model_path):
+                            continue
+                        # Use the first available configuration as default
+                        ACTION_SPACE = action_space
+                        OBSERVATION_TYPE = obs_type
+                        MODEL_NAME = model_name
+                        RESULTS_PATH = model_path
+                        print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+                        return
+        except Exception as e:
+            # Best effort: a failed scan must not prevent the module from loading.
+            print(f"Error scanning results directory for default config: {e}")
+
+    # Fallback to original environment-based path if no configs found
+    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+
+# Initialize default configuration
+initialize_default_config()
+
+# Rebuilt from the globals, which initialize_default_config() may have just
+# changed; the joined path equals the RESULTS_PATH that call assigned.
 RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

+# TASK_STATUS_CACHE holds one sub-dict per results path.
+# NOTE(review): assumes TASK_STATUS_CACHE is defined earlier in the module
+# (not visible in this hunk) - confirm.
+if RESULTS_PATH not in TASK_STATUS_CACHE:
+    # Initialize cache for this results path
+    TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+@cache
 def load_task_list():
+    """Return the parsed JSON content of TASK_CONFIG_PATH.
+
+    Wrapped in functools.cache: the file is read on the first call only and
+    every later call returns that same object, so callers should not mutate
+    the result, and edits to the file are not seen unless the cache is cleared.
+    """
     with open(TASK_CONFIG_PATH, 'r') as f:
         return json.load(f)

+@cache
 def get_task_info(task_type, task_id):
    task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
    if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
    # Check if the status is already cached
    current_time = time.time()
    last_cache_time = None
-    if cache_key in TASK_STATUS_CACHE:
-        cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+    if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
+        cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
        last_cache_time = cached_time
        # If cached status is "Done", check if it's within the stability period
        if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
    # Cache the status if it is done or error
    if status.startswith("Done") or status == "Error":
        current_time = last_cache_time if last_cache_time else current_time
-        TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
+        TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
    
    return status_dict

@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
        "status": task_status
    })

+@app.route('/api/config')
+def api_config():
+    """Report the module-level configuration values as JSON.
+
+    Deprecated: use /api/current-config instead.
+    """
+    return jsonify({
+        "task_config_path": TASK_CONFIG_PATH,
+        "results_base_path": RESULTS_BASE_PATH,
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "examples_base_path": EXAMPLES_BASE_PATH,
+    })
+
+@app.route('/api/available-configs')
+def api_available_configs():
+    """List every <action_space>/<observation_type>/<model_name> directory under RESULTS_BASE_PATH.
+
+    Returns a JSON array of objects with the three names plus the directory
+    path. A scan error is printed and whatever was collected so far is returned.
+    """
+    found = []
+
+    if not os.path.exists(RESULTS_BASE_PATH):
+        return jsonify(found)
+
+    try:
+        # Level 1: action spaces
+        for space_name in os.listdir(RESULTS_BASE_PATH):
+            space_dir = os.path.join(RESULTS_BASE_PATH, space_name)
+            if not os.path.isdir(space_dir):
+                continue
+            # Level 2: observation types
+            for obs_name in os.listdir(space_dir):
+                obs_dir = os.path.join(space_dir, obs_name)
+                if not os.path.isdir(obs_dir):
+                    continue
+                # Level 3: model names
+                for model in os.listdir(obs_dir):
+                    model_dir = os.path.join(obs_dir, model)
+                    if not os.path.isdir(model_dir):
+                        continue
+                    found.append({
+                        "action_space": space_name,
+                        "observation_type": obs_name,
+                        "model_name": model,
+                        "path": model_dir
+                    })
+    except Exception as e:
+        print(f"Error scanning results directory: {e}")
+
+    return jsonify(found)
+
+@app.route('/api/current-config')
+def api_current_config():
+    """Return the currently selected configuration and its results path as JSON."""
+    current = {
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "results_path": RESULTS_PATH,
+    }
+    return jsonify(current)
+
+@app.route('/api/set-config', methods=['POST'])
+def api_set_config():
+    """Switch the active configuration from a JSON request body.
+
+    Accepts optional string fields ``action_space``, ``observation_type`` and
+    ``model_name``; a missing field keeps its current value. On success the
+    ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME and RESULTS_PATH globals are
+    updated, a TASK_STATUS_CACHE entry is created for the new results path if
+    it has none, and the resulting configuration is returned as JSON.
+
+    Returns HTTP 400 with an ``error`` message when the body is empty, is not
+    a JSON object, carries a non-string field, or resolves to a path outside
+    RESULTS_BASE_PATH.
+    """
+    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+    data = request.get_json()
+    if not data:
+        return jsonify({"error": "No data provided"}), 400
+    if not isinstance(data, dict):
+        return jsonify({"error": "Request body must be a JSON object"}), 400
+
+    # Validate into locals first: the globals are only touched once the whole
+    # request is known to be good, so a bad request cannot leave them half-updated.
+    action_space = data.get('action_space', ACTION_SPACE)
+    observation_type = data.get('observation_type', OBSERVATION_TYPE)
+    model_name = data.get('model_name', MODEL_NAME)
+    if not all(isinstance(value, str) for value in (action_space, observation_type, model_name)):
+        return jsonify({"error": "Configuration values must be strings"}), 400
+
+    # The values come from the client and are used as path components, so
+    # refuse absolute parts or ".." segments that would escape RESULTS_BASE_PATH.
+    results_path = os.path.join(RESULTS_BASE_PATH, action_space, observation_type, model_name)
+    base_abs = os.path.abspath(RESULTS_BASE_PATH)
+    try:
+        inside_base = os.path.commonpath([base_abs, os.path.abspath(results_path)]) == base_abs
+    except ValueError:
+        # commonpath rejects e.g. paths on different drives; treat as outside.
+        inside_base = False
+    if not inside_base:
+        return jsonify({"error": "Configuration path is outside the results directory"}), 400
+
+    # Update global variables
+    ACTION_SPACE = action_space
+    OBSERVATION_TYPE = observation_type
+    MODEL_NAME = model_name
+    RESULTS_PATH = results_path
+
+    if RESULTS_PATH not in TASK_STATUS_CACHE:
+        # Initialize cache for this results path
+        TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+    return jsonify({
+        "action_space": ACTION_SPACE,
+        "observation_type": OBSERVATION_TYPE,
+        "model_name": MODEL_NAME,
+        "max_steps": MAX_STEPS,
+        "results_path": RESULTS_PATH
+    })
+
 if __name__ == '__main__':
    # Check if necessary directories exist
    if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
    port = 8080
    debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
    
-    app.run(host=host, port=port, debug=debug)
+    app.run(host=host, port=port, debug=debug, threaded=True)
--- a/monitor/static/index.css
+++ b/monitor/static/index.css
@@ -1,5 +1,63 @@
 /* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
 body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
+
+/* Horizontally centered wrapper, at most 1200px wide, with 20px side padding. */
+.layout-container {
+    position: relative;
+    max-width: 1200px;
+    margin: 20px auto;
+    padding: 0 20px;
+}
+
+/* White card with rounded corners and a soft drop shadow. */
+.main-content {
+    background: #fff;
+    border-radius: 14px;
+    box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+    padding: 36px 44px;
+}
+
+/* Floating Config Sidebar: fixed to the viewport and parked at left: -280px,
+   so only 20px of its 300px width is on screen until it is hovered. */
+.config-sidebar {
+    position: fixed;
+    top: 20px;
+    left: -280px;
+    width: 300px;
+    height: calc(100vh - 40px);
+    z-index: 1000;
+    transition: left 0.3s ease;
+}
+
+/* Hovering the sidebar slides it fully into view; the move is animated by the
+   0.3s transition on `left` declared on .config-sidebar. */
+.config-sidebar:hover {
+    left: 0;
+}
+
+/* 50x50 tab with rounded right-hand corners, placed 50px past the right edge of
+   its positioned ancestor (presumably .config-sidebar - confirm in the markup)
+   and vertically centered via top: 50% + translateY(-50%). */
+.config-toggle-btn {
+    position: absolute;
+    right: -50px;
+    top: 50%;
+    transform: translateY(-50%);
+    width: 50px;
+    height: 50px;
+    background: linear-gradient(135deg, #007bff, #0056b3);
+    border-radius: 0 25px 25px 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    color: white;
+    font-size: 1.2em;
+    cursor: pointer;
+    box-shadow: 2px 0 10px rgba(0,0,0,0.2);
+    transition: all 0.3s ease;
+}
+
+/* translateY(-50%) is repeated because the hover transform replaces the base
+   one; without it the tab would lose its vertical centering while scaling. */
+.config-toggle-btn:hover {
+    background: linear-gradient(135deg, #0056b3, #004085);
+    transform: translateY(-50%) scale(1.05);
+}
+
+/* Dim the tab slightly while the sidebar itself is hovered. */
+.config-sidebar:hover .config-toggle-btn {
+    opacity: 0.8;
+}
+
 .main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
 h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
 h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    text-shadow: 0 1px 2px rgba(0,0,0,0.05);
 }

+.accuracy-percentage {
+    font-size: 0.7em;
+    font-weight: 600;
+    color: #ffffff;
+    margin-left: 8px;
+    background: rgba(255, 255, 255, 0.1);
+    padding: 4px 8px;
+    border-radius: 12px;
+    display: inline-block;
+    vertical-align: middle;
+}
+

 .stat-card span {
    font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }

 .task-type-stats {
    display: flex;
-    gap: 16px;
    flex-wrap: wrap;
+    gap: 8px;
+    align-items: center;
 }

 .task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    color: #b71c1c;
 }

+/* Task type statistics styles */
+.task-stat.score {
+    color: #ffc107;
+    background: rgba(255, 193, 7, 0.1);
+}
+
+.task-stat.steps {
+    color: #17a2b8;
+    background: rgba(23, 162, 184, 0.1);
+}
+
+.task-stat.rate {
+    color: #28a745;
+    background: rgba(40, 167, 69, 0.1);
+}
+
 .tasks-container {
    padding: 20px;
    transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    background: #a5c7e5;
 }

+/* Configuration Panel Styles */
+.config-panel {
+    background: #fff;
+    border-radius: 0 14px 14px 0;
+    box-shadow: 0 8px 32px rgba(0,0,0,0.15);
+    overflow: hidden;
+    height: 100%;
+    display: flex;
+    flex-direction: column;
+}
+
+.config-header {
+    display: flex;
+    align-items: center;
+    padding: 16px 20px;
+    background: linear-gradient(135deg, #6c757d, #495057);
+    color: white;
+    flex-shrink: 0;
+}
+
+.config-header i {
+    margin-right: 10px;
+    font-size: 1.1em;
+}
+
+.config-header span {
+    font-weight: 600;
+    font-size: 1.1em;
+}
+
+.config-content {
+    padding: 20px;
+    flex: 1;
+    overflow-y: auto;
+}
+
+.config-selector {
+    margin-bottom: 20px;
+    padding-bottom: 15px;
+    border-bottom: 1px solid #dee2e6;
+}
+
+.selector-item {
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+}
+
+.selector-item label {
+    font-weight: 600;
+    color: #495057;
+    font-size: 0.9em;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.selector-item select {
+    padding: 8px 12px;
+    border: 2px solid #e9ecef;
+    border-radius: 6px;
+    background: white;
+    font-size: 0.9em;
+    color: #495057;
+    cursor: pointer;
+    transition: all 0.3s ease;
+}
+
+.selector-item select:focus {
+    outline: none;
+    border-color: #007bff;
+    box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
+}
+
+.selector-item select:hover {
+    border-color: #007bff;
+}
+
+.config-list {
+    display: flex;
+    flex-direction: column;
+    gap: 15px;
+}
+
+.config-item {
+    display: flex;
+    flex-direction: column;
+    background: #f8f9fa;
+    padding: 12px;
+    border-radius: 8px;
+    border-left: 4px solid #007bff;
+    transition: all 0.3s ease;
+}
+
+.config-item:hover {
+    transform: translateX(3px);
+    box-shadow: 0 4px 12px rgba(0,123,255,0.15);
+}
+
+/* Caption shown above each value in the configuration sidebar
+   (.config-item stacks its label and value vertically).
+   Declare each property exactly once here: a repeated property is
+   silently overridden by the later declaration in the same rule. */
+.config-label {
+    font-weight: 600;
+    color: #495057;
+    font-size: 0.85em;
+    margin-bottom: 6px;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.config-value {
+    color: #007bff;
+    font-family: 'Courier New', monospace;
+    font-size: 0.9em;
+    font-weight: 600;
+    word-break: break-word;
+}
+
+.config-path {
+    font-size: 0.8em;
+    line-height: 1.3;
+}
+
+/* Responsive design for sidebar layout */
+@media (max-width: 1024px) {
+    .config-sidebar {
+        left: -250px;
+        width: 250px;
+    }
+    
+    .config-toggle-btn {
+        right: -40px;
+        width: 40px;
+        height: 40px;
+        font-size: 1em;
+    }
+}
+
+@media (max-width: 768px) {
+    .layout-container {
+        padding: 0 10px;
+    }
+    
+    .main-content {
+        padding: 20px 25px;
+    }
+    
+    .config-sidebar {
+        left: -220px;
+        width: 220px;
+        height: calc(100vh - 20px);
+        top: 10px;
+    }
+    
+    .config-toggle-btn {
+        right: -35px;
+        width: 35px;
+        height: 35px;
+        font-size: 0.9em;
+    }
+    
+    .config-content {
+        padding: 15px;
+    }
+    
+    .config-item {
+        padding: 10px;
+    }
+}
+
--- a/monitor/static/index.js
+++ b/monitor/static/index.js
@@ -1,5 +1,8 @@
 document.addEventListener('DOMContentLoaded', () => {
-    fetchTasks();
+    fetchAvailableConfigs().then(() => {
+        fetchConfig();
+        fetchTasks();
+    });
    // Bind filter functionality
    document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
    document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {

 let allTaskData = null;
 let currentFilter = 'all';
+let availableConfigs = [];
+let currentConfig = null;
+let categoryStats = {};

 function refreshPage() {
    // Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
    fetch('/api/tasks/brief')
        .then(response => response.json())
        .then(data => {
-            // Update stored data
            allTaskData = data;
+            categoryStats = calculateCategoryStats(data);
            // Only update statistics and task status, do not fully re-render
            updateStatistics(data);
            updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
        .then(response => response.json())
        .then(data => {
            allTaskData = data;
+            categoryStats = calculateCategoryStats(data);
            renderTasks(data);
            updateStatistics(data);
        })
@@ -208,13 +215,15 @@ function updateStatistics(data) {
    document.getElementById('completed-tasks').textContent = completedTasks;
    document.getElementById('error-tasks').textContent = errorTasks;
    
-    // Update score display with formatted score
+    // Update score display with formatted score and accuracy percentage
    const scoreDisplay = document.getElementById('score-display');
    if (completedTasks > 0) {
        const scoreFormatted = totalScore.toFixed(2);
-        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
+        const averageScore = totalScore / completedTasks;
+        const accuracyPercentage = (averageScore * 100).toFixed(1);
+        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
    } else {
-        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
+        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
    }
    
    // Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
        // Create header with task type name and statistics
        const typeHeader = document.createElement('div');
        typeHeader.className = 'task-type-header';
+        
+        // Get category stats for this task type
+        const stats = categoryStats[taskType] || {};
+        
        typeHeader.innerHTML = `
            <span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
            <div class="task-type-stats">
@@ -286,6 +299,9 @@ function renderTasks(data) {
                <span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
                <span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
                <span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
+                ${stats.avg_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
+                ${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
+                ${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
            </div>
        `;
        typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
        container.appendChild(typeSection);
    });
 }
-// add auto-refresh with time interval 10 seconds
-setInterval(() => {
-        refreshPage();
-}, 10000); // 10 seconds interval
+// Load the selectable configurations into availableConfigs and rebuild the dropdown.
+// Never rejects: HTTP errors, invalid JSON and non-array payloads all degrade to an empty list.
+function fetchAvailableConfigs() {
+    return fetch('/api/available-configs')
+        .then(response => response.ok ? response.json() : Promise.reject(new Error(`HTTP ${response.status}`)))
+        .catch(error => { console.error('Error fetching available configs:', error); return []; })
+        .then(data => {
+            availableConfigs = Array.isArray(data) ? data : [];
+            populateConfigSelect(); // also runs after a failure, replacing the "Loading configurations..." placeholder
+            return availableConfigs;
+        })
+        // A DOM error while rebuilding the dropdown must not reject and block the caller's follow-up requests.
+        .catch(error => { console.error('Error rendering config selector:', error); return availableConfigs; });
+}
+// Rebuild the #config-select dropdown from the module-level availableConfigs list.
+function populateConfigSelect() {
+    const dropdown = document.getElementById('config-select');
+    dropdown.innerHTML = '';
+    if (availableConfigs.length === 0) {
+        // Nothing to choose from: show a single placeholder entry with an empty value.
+        dropdown.innerHTML = '<option value="">No configurations found in results directory</option>';
+        return;
+    }
+
+    // Each option's value is the configuration's index in availableConfigs.
+    for (const [position, cfg] of availableConfigs.entries()) {
+        const entry = document.createElement('option');
+        entry.value = position;
+        entry.textContent = `${cfg.action_space} / ${cfg.observation_type} / ${cfg.model_name}`;
+        dropdown.appendChild(entry);
+    }
+}
+// Change handler for #config-select: POST the chosen configuration, then display it and reload the tasks.
+function changeConfiguration() {
+    const select = document.getElementById('config-select');
+    // Option values are indices into availableConfigs; '' is the placeholder entry (Number('') would be 0).
+    const selectedIndex = select.value === '' ? NaN : Number(select.value);
+
+    if (!Number.isInteger(selectedIndex) || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
+        return;
+    }
+
+    const selectedConfig = availableConfigs[selectedIndex];
+
+    // Send configuration change request
+    fetch('/api/set-config', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(selectedConfig)
+    })
+    // Treat HTTP errors as failures so an error payload is never stored or rendered as the configuration.
+    .then(response => response.ok ? response.json() : Promise.reject(new Error(`HTTP ${response.status}`)))
+    .then(data => {
+        currentConfig = data;
+        displayConfig(data);
+        // Refresh tasks with new configuration
+        fetchTasks();
+    })
+    .catch(error => {
+        console.error('Error setting config:', error);
+        displayConfigError();
+    });
+}
+// Load the active configuration, render it and sync the dropdown. On failure the sidebar shows the error state and the promise resolves to undefined.
+function fetchConfig() {
+    return fetch('/api/current-config')
+        .then(response => response.ok ? response.json() : Promise.reject(new Error(`HTTP ${response.status}`))) // HTTP errors go to .catch
+        .then(data => {
+            currentConfig = data;
+            displayConfig(data);
+            updateConfigSelect();
+            return data;
+        })
+        .catch(error => {
+            console.error('Error fetching config:', error);
+            displayConfigError();
+        });
+}
+// Select the dropdown entry that matches currentConfig, falling back to the first entry when none matches.
+// Does nothing until both currentConfig and a non-empty availableConfigs list are present.
+function updateConfigSelect() {
+    if (!currentConfig || availableConfigs.length === 0) return;
+
+    const dropdown = document.getElementById('config-select');
+    // Configurations are matched on action space, observation type and model name together.
+    const identityKeys = ['action_space', 'observation_type', 'model_name'];
+    const matchIndex = availableConfigs.findIndex(candidate =>
+        identityKeys.every(key => candidate[key] === currentConfig[key])
+    );
+
+    if (matchIndex !== -1) {
+        dropdown.value = matchIndex;
+        return;
+    }
+
+    // No match. The guard at the top already ensured the list is non-empty, so index 0 exists.
+    dropdown.value = 0;
+    console.warn('Current config not found in available configs, defaulting to first available config');
+}
+// Render a configuration into the sidebar fields; missing or falsy values are shown as 'N/A'.
+function displayConfig(config) {
+    document.querySelectorAll('.config-value').forEach(element => { element.style.color = ''; }); // clear the inline red set by displayConfigError()
+    document.getElementById('action-space').textContent = config.action_space || 'N/A';
+    document.getElementById('observation-type').textContent = config.observation_type || 'N/A';
+    document.getElementById('model-name').textContent = config.model_name || 'N/A';
+    document.getElementById('max-steps').textContent = config.max_steps || 'N/A';
+}
+function displayConfigError() {
+    // Flag every sidebar value as failed: placeholder text, coloured red via an inline style.
+    for (const valueNode of document.querySelectorAll('.config-value')) {
+        valueNode.textContent = 'Error loading';
+        valueNode.style.color = '#dc3545';
+    }
+}
+// Aggregate per-category statistics from a { taskType: [task, ...] } payload.
+// Returns { taskType: { total_tasks, completed_tasks, running_tasks, error_tasks, total_score, avg_score, avg_steps, completion_rate } }.
+function calculateCategoryStats(data) {
+    const doneStatuses = ['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'];
+    const activeStatuses = ['Running', 'Preparing', 'Initializing'];
+    const stats = {};
+
+    // A null/undefined payload yields an empty result instead of throwing.
+    Object.entries(data || {}).forEach(([taskType, tasks]) => {
+        const totalTasks = tasks.length;
+        let completedTasks = 0;
+        let runningTasks = 0;
+        let errorTasks = 0;
+        let totalScore = 0;
+        let totalSteps = 0;
+        let completedWithSteps = 0;
+
+        tasks.forEach(task => {
+            const taskStatus = (task && task.status) || {}; // tolerate tasks without a status object
+            const status = taskStatus.status;
+
+            if (doneStatuses.includes(status)) {
+                completedTasks++;
+
+                // Only results that parse to a number in [0, 1] count towards the score.
+                // parseFloat never throws; unparsable input becomes NaN and fails the range test.
+                const score = parseFloat(taskStatus.result);
+                if (score >= 0 && score <= 1) {
+                    totalScore += score;
+                }
+
+                // Coerce first so a progress value delivered as a string is added, not concatenated.
+                const steps = Number(taskStatus.progress);
+                if (steps > 0) {
+                    totalSteps += steps;
+                    completedWithSteps++;
+                }
+            } else if (activeStatuses.includes(status)) {
+                runningTasks++;
+            } else if (status === 'Error') {
+                errorTasks++;
+            }
+        });
+
+        // The score is averaged over all completed tasks; steps only over completed tasks that report progress.
+        const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
+        const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
+        const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;
+
+        // Rounded to 2 decimals (total score), 4 (average score) and 1 (average steps, completion rate).
+        stats[taskType] = {
+            total_tasks: totalTasks,
+            completed_tasks: completedTasks,
+            running_tasks: runningTasks,
+            error_tasks: errorTasks,
+            total_score: Math.round(totalScore * 100) / 100,
+            avg_score: Math.round(avgScore * 10000) / 10000,
+            avg_steps: Math.round(avgSteps * 10) / 10,
+            completion_rate: Math.round(completionRate * 10) / 10
+        };
+    });
+
+    return stats;
+}
--- a/monitor/templates/index.html
+++ b/monitor/templates/index.html
@@ -12,19 +12,62 @@
    <link rel="stylesheet" href="/static/index.css">
 </head>
 <body>
-    <div class="main-container">
-        <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
-        
-        <!-- Score Display Banner -->
-        <div class="score-banner">
-            <div class="score-content">
-                <i class="fas fa-star"></i>
-                <span class="score-label">Score:</span>
-                <span id="score-display" class="score-value">Loading...</span>
+    <div class="layout-container">
+        <!-- Floating Config Button and Sidebar -->
+        <div class="config-sidebar" id="config-sidebar">
+            <div class="config-toggle-btn">
+                <i class="fas fa-cogs"></i>
+            </div>
+            <div class="config-panel">
+                <div class="config-header">
+                    <i class="fas fa-cogs"></i>
+                    <span>Configuration</span>
+                </div>
+                <div class="config-content">
+                    <div class="config-selector">
+                        <div class="selector-item">
+                            <label for="config-select">Select Configuration:</label>
+                            <select id="config-select" onchange="changeConfiguration()">
+                                <option value="">Loading configurations...</option>
+                            </select>
+                        </div>
+                    </div>
+                    <div class="config-list">
+                        <div class="config-item">
+                            <span class="config-label">Action Space:</span>
+                            <span class="config-value" id="action-space">Loading...</span>
+                        </div>
+                        <div class="config-item">
+                            <span class="config-label">Observation:</span>
+                            <span class="config-value" id="observation-type">Loading...</span>
+                        </div>
+                        <div class="config-item">
+                            <span class="config-label">Model:</span>
+                            <span class="config-value" id="model-name">Loading...</span>
+                        </div>
+                        <div class="config-item">
+                            <span class="config-label">Max Steps:</span>
+                            <span class="config-value" id="max-steps">Loading...</span>
+                        </div>
+                    </div>
+                </div>
            </div>
        </div>
        
-        <div class="dashboard-stats">
+        <!-- Main Content -->
+        <div class="main-content">
+            <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
+            
+            <!-- Score Display Banner -->
+            <div class="score-banner">
+                <div class="score-content">
+                    <i class="fas fa-star"></i>
+                    <span class="score-label">Score:</span>
+                    <span id="score-display" class="score-value">Loading...</span>
+                </div>
+            </div>
+            
+            <div class="dashboard-stats">
            <div class="stat-card">
                <i class="fas fa-running"></i>
                <span id="active-tasks">Loading...</span>
@@ -46,10 +89,11 @@
                <div class="stat-label">Total Tasks</div>
            </div>
        </div>
-        <div id="task-container">
-            <div class="loading-spinner">
-                <div class="spinner"></div>
-                <div>Loading task data...</div>
+            <div id="task-container">
+                <div class="loading-spinner">
+                    <div class="spinner"></div>
+                    <div>Loading task data...</div>
+                </div>
            </div>
        </div>
    </div>