Merge branch 'fix_chrome'

This commit is contained in:
yuanmengqi
2025-07-15 02:13:58 +00:00
parent 7c807d4f3e
commit 756ef96850
21 changed files with 922 additions and 103 deletions

View File

@@ -1,10 +1,13 @@
import os
import logging
from typing import Dict, List, Set
from typing import Optional, Any, Union
from datetime import datetime
import requests
import pandas as pd
logger = logging.getLogger("desktopenv.getter.file")
def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
"""
@@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
for i, (p, d) in enumerate(zip(paths, dests)):
_path = os.path.join(env.cache_dir, d)
file = env.controller.get_file(p)
if file is None:
try:
# Try to get file from VM
file = env.controller.get_file(p)
if file is None:
logger.warning(f"Failed to get file from VM: {p}")
if i in gives:
cache_paths.append(None)
continue
if i in gives:
cache_paths.append(_path)
# Write file with robust error handling
try:
# Ensure cache directory exists
os.makedirs(env.cache_dir, exist_ok=True)
with open(_path, "wb") as f:
f.write(file)
logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")
except IOError as e:
logger.error(f"IO error writing file {_path}: {e}")
if i in gives:
cache_paths[-1] = None # Replace the path we just added with None
except Exception as e:
logger.error(f"Unexpected error writing file {_path}: {e}")
if i in gives:
cache_paths[-1] = None
except Exception as e:
logger.error(f"Error processing file {p}: {e}")
if i in gives:
cache_paths.append(None)
continue
if i in gives:
cache_paths.append(_path)
with open(_path, "wb") as f:
f.write(file)
return cache_paths[0] if len(cache_paths)==1 else cache_paths
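
The pattern this hunk introduces (fetch from the VM, tolerate a missing file, create the cache directory, degrade to None on write failures) can be summarized in a standalone sketch; a hypothetical controller object with a get_file(path) method returning bytes or None is assumed, so this is an illustration rather than the repository's exact helper.

import logging
import os
from typing import Optional

logger = logging.getLogger("desktopenv.getter.file")

def fetch_to_cache(controller, vm_path: str, cache_dir: str, dest: str) -> Optional[str]:
    """Copy one VM file into the local cache; return the local path or None on any failure."""
    local_path = os.path.join(cache_dir, dest)
    try:
        data = controller.get_file(vm_path)       # may raise, or return None when the file is absent
        if data is None:
            logger.warning("Failed to get file from VM: %s", vm_path)
            return None
        os.makedirs(cache_dir, exist_ok=True)     # the cache directory may not exist yet
        with open(local_path, "wb") as f:
            f.write(data)
        logger.info("Saved %s (%d bytes)", local_path, len(data))
        return local_path
    except OSError as e:                          # IOError is an alias of OSError in Python 3
        logger.error("IO error writing %s: %s", local_path, e)
        return None
    except Exception as e:
        logger.error("Unexpected error for %s: %s", vm_path, e)
        return None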

View File

@@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
"""
if result is None:
logger.warning("Result file path is None, returning 0.0")
return 0.
# Check if file exists
if not os.path.exists(result):
logger.warning(f"Result file does not exist: {result}, returning 0.0")
return 0.
try:
with open(result, 'r', encoding='utf-8') as f:
if is_yaml:
try:
# Use SafeLoader instead of Loader for better security and error handling
result_data: Dict[str, Any] = yaml.safe_load(f)
if result_data is None:
logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
return 0.
except yaml.YAMLError as e:
logger.error(f"YAML parsing error in file {result}: {e}")
logger.error(f"File content might be corrupted or have invalid YAML syntax")
return 0.
except Exception as e:
logger.error(f"Unexpected error parsing YAML file {result}: {e}")
return 0.
else:
try:
result_data: Dict[str, Any] = json.load(f)
except json.JSONDecodeError as e:
logger.error(f"JSON parsing error in file {result}: {e}")
return 0.
except Exception as e:
logger.error(f"Unexpected error parsing JSON file {result}: {e}")
return 0.
except IOError as e:
logger.error(f"IO error reading file {result}: {e}")
return 0.
except Exception as e:
logger.error(f"Unexpected error reading file {result}: {e}")
return 0.
with open(result) as f:
if is_yaml:
result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
else:
result: Dict[str, Any] = json.load(f)
expect_rules = rules.get("expect", {})
unexpect_rules = rules.get("unexpect", {})
metric = True
for r in expect_rules:
value = result
for k in r["key"]:
try:
value = value[k]
except KeyError:
return 0.
metric = metric and _match_value_to_rule(value, r)
value = result_data
try:
for k in r["key"]:
try:
value = value[k]
except KeyError:
logger.debug(f"Key '{k}' not found in result data, returning 0.0")
return 0.
except TypeError:
logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
return 0.
metric = metric and _match_value_to_rule(value, r)
except Exception as e:
logger.error(f"Error processing expect rule {r}: {e}")
return 0.
for r in unexpect_rules:
value = result
for k in r["key"]:
try:
value = value[k]
except KeyError:
value = None
break
metric = metric and not _match_value_to_rule(value, r)
value = result_data
try:
for k in r["key"]:
try:
value = value[k]
except KeyError:
value = None
break
except TypeError:
value = None
break
metric = metric and not _match_value_to_rule(value, r)
except Exception as e:
logger.error(f"Error processing unexpect rule {r}: {e}")
return 0.
return float(metric)
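
Two ideas recur in this hunk: load the result file with yaml.safe_load/json.load and treat any parse failure as a score of 0, and walk nested keys while guarding both KeyError (missing key) and TypeError (intermediate value is not a mapping). A condensed sketch of both helpers, with illustrative names rather than the repository's own:

import json
import logging
from typing import Any, Dict, List, Optional

import yaml

logger = logging.getLogger("desktopenv.metrics")

def load_result(path: str, is_yaml: bool) -> Optional[Dict[str, Any]]:
    """Load a JSON or YAML result file; return None instead of raising on any error."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f) if is_yaml else json.load(f)
    except (OSError, yaml.YAMLError, json.JSONDecodeError) as e:
        logger.error("Failed to load %s: %s", path, e)
        return None

def lookup(data: Any, keys: List[Any]) -> Any:
    """Follow a key path into nested data; return None if a key is missing or a level is not indexable."""
    value = data
    for k in keys:
        try:
            value = value[k]
        except (KeyError, TypeError, IndexError):
            return None
    return value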

View File

@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]
if not original_slide_images:
return 0.
the_image = original_slide_images[0]
the_modified_image = None
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
table2 = shape2.table
if enable_debug:
debug_logger.debug(f" Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
debug_logger.debug(f" Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")
# Check if tables have the same dimensions
if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
debug_logger.debug(f" Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
debug_logger.debug(f" Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
return 0
for row_idx in range(len(table1.rows)):
for col_idx in range(len(table1.columns)):
cell1 = table1.cell(row_idx, col_idx)
cell2 = table2.cell(row_idx, col_idx)
# Check if cells have the same number of paragraphs
if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
debug_logger.debug(f" Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
debug_logger.debug(f" Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
return 0
for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
# Check if paragraphs have the same number of runs
if len(para1.runs) != len(para2.runs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
return 0
for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
# Check font color
if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if shape1.text.strip() != shape2.text.strip() and examine_text:
return 0
# check if the number of paragraphs are the same
if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
debug_logger.debug(f" Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
debug_logger.debug(f" Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
return 0
# check if the paragraphs are the same
para_idx = 0
for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if para1.level != para2.level and examine_indent:
return 0
# check if the number of runs are the same
if len(para1.runs) != len(para2.runs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
return 0
for run1, run2 in zip(para1.runs, para2.runs):
# check if the font properties are the same
@@ -634,6 +679,12 @@ def compare_pptx_files(file1_path, file2_path, **options):
debug_logger.debug(f" MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'")
return 0
# Check if text shapes have the same number of paragraphs
if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}")
return 0
# Compare alignment of each paragraph
for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)):
from pptx.enum.text import PP_ALIGN
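
The recurring pattern in these hunks is to compare collection lengths before iterating with zip(), because zip() silently truncates to the shorter sequence and would otherwise hide extra paragraphs or runs. A condensed sketch of that check for python-pptx text frames (the helper name is illustrative):

def same_paragraph_structure(frame1, frame2, debug_logger=None):
    """Return False if two text frames differ in paragraph count or in run count per paragraph."""
    paras1, paras2 = frame1.paragraphs, frame2.paragraphs
    if len(paras1) != len(paras2):
        if debug_logger:
            debug_logger.debug("Different number of paragraphs: %d vs %d", len(paras1), len(paras2))
        return False
    for p1, p2 in zip(paras1, paras2):
        if len(p1.runs) != len(p2.runs):
            if debug_logger:
                debug_logger.debug("Different number of runs: %d vs %d", len(p1.runs), len(p2.runs))
            return False
    return True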

View File

@@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str]
# function _parse_sheet_idx {{{ #
if isinstance(sheet_idx, int):
try:
index: str = result_sheet_names[sheet_idx]
except:
if not result_sheet_names or sheet_idx >= len(result_sheet_names):
logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}")
index = ""
else:
index: str = result_sheet_names[sheet_idx]
logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}")
except Exception as e:
logger.error(f"Error resolving sheet index {sheet_idx}: {e}")
index = ""
book: BOOK = result
elif sheet_idx.startswith("RI"):
@@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float:
"""
if result is None:
logger.error("Result file path is None")
return 0.
# Check if result file exists
if not os.path.exists(result):
logger.error(f"Result file not found: {result}")
return 0.
try:
logger.info(f"Loading result file: {result}")
xlworkbookr: Workbook = openpyxl.load_workbook(filename=result)
pdworkbookr = pd.ExcelFile(result)
except:
logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}")
except Exception as e:
logger.error(f"Failed to load result file {result}: {e}")
return 0.
worksheetr_names: List[str] = pdworkbookr.sheet_names
@@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float:
# props: dict like {attribute: {"method": str, "ref": anything}}
# supported attributes: value & those supported by utils._read_cell_style
sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
if sheet is None:
return 0.
# data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
cell: Cell = sheet[r["coordinate"]]
metric: bool = True
for prpt, rule in r["props"].items():
if prpt == "value":
val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"])
else:
val = _read_cell_style(prpt, cell)
try:
sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
if sheet is None:
logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}")
return 0.
# data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
cell: Cell = sheet[r["coordinate"]]
metric: bool = True
for prpt, rule in r["props"].items():
if prpt == "value":
try:
parsed_result = parse_idx(r["sheet_idx"], result, expected)
logger.debug(f"parse_idx result: {parsed_result}")
val = read_cell_value(*parsed_result, r["coordinate"])
logger.debug(f"Cell {r['coordinate']} value: {val}")
except Exception as e:
logger.error(f"Failed to read cell value at {r['coordinate']}: {e}")
val = None
else:
try:
val = _read_cell_style(prpt, cell)
except Exception as e:
logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}")
val = None
metric = metric and _match_value_to_rule(val, rule)
metric = metric and _match_value_to_rule(val, rule)
except Exception as e:
logger.error(f"Error in check_cell processing: {e}")
return 0.
logger.debug("Assertion: %s[%s] :%s - %s"
, r["sheet_idx"], r["coordinate"]
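
The check_cell changes above wrap every per-property read so that a failure to read a value or style degrades to None before rule matching instead of aborting the whole comparison. A generic sketch of that loop, with the reader and matcher functions passed in rather than taken from the repository's utilities:

import logging

logger = logging.getLogger("desktopenv.metrics.table")

def check_cell_props(cell, props, read_value, read_style, match_rule):
    """Return False if any property fails its rule; read errors are treated as a None value."""
    ok = True
    for prop, rule in props.items():
        try:
            val = read_value(cell) if prop == "value" else read_style(prop, cell)
        except Exception as e:
            logger.error("Failed to read %s: %s", prop, e)
            val = None
        ok = ok and match_rule(val, rule)
    return ok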

View File

@@ -4,6 +4,7 @@ import functools
import itertools
import logging
import operator
import os
import re
import zipfile
#import pandas as pd
@@ -33,10 +34,11 @@ V = TypeVar("Value")
logger = logging.getLogger("desktopenv.metrics.utils")
_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
, ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
, ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_namespaces = [
("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
@@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx
def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
# read_cell_value {{{ #
logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")
# Check if file exists
if not os.path.exists(xlsx_file):
logger.error(f"Excel file not found: {xlsx_file}")
return None
try:
with zipfile.ZipFile(xlsx_file, "r") as z_f:
try:
@@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
, namespaces=_xlsx_ns_mapping
)(sheet)
if len(cells) == 0:
logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
return None
cell: _Element = cells[0]
except zipfile.BadZipFile:
except zipfile.BadZipFile as e:
logger.error(f"Bad zip file {xlsx_file}: {e}")
return None
except KeyError as e:
logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
return None
except Exception as e:
logger.error(f"Error reading {xlsx_file}: {e}")
return None
cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
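
read_cell_value now guards the three failure modes of opening an xlsx as a zip archive: a missing file, a corrupt container (BadZipFile), and a missing worksheet member (KeyError). A minimal sketch of that access pattern; the member path shown is the conventional xlsx layout, not a value verified against this repository:

import logging
import os
import zipfile
from typing import Optional

logger = logging.getLogger("desktopenv.metrics.utils")

def read_sheet_xml(xlsx_file: str, sheet_member: str) -> Optional[bytes]:
    """Return the raw XML of one worksheet, or None on any structural problem."""
    if not os.path.exists(xlsx_file):
        logger.error("Excel file not found: %s", xlsx_file)
        return None
    try:
        with zipfile.ZipFile(xlsx_file, "r") as z:
            return z.read(sheet_member)           # e.g. "xl/worksheets/sheet1.xml"
    except zipfile.BadZipFile as e:
        logger.error("Bad zip file %s: %s", xlsx_file, e)
    except KeyError as e:
        logger.error("Member %s not found in %s: %s", sheet_member, xlsx_file, e)
    except Exception as e:
        logger.error("Error reading %s: %s", xlsx_file, e)
    return None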

View File

@@ -1,7 +1,7 @@
{
"id": "47543840-672a-467d-80df-8f7c3b9788c9",
"snapshot": "chrome",
"instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
"instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
"source": "test_task_1",
"config": [
{

View File

@@ -57,5 +57,5 @@
}
}
},
"proxy": true
"proxy": false
}

View File

@@ -56,5 +56,5 @@
}
}
},
"proxy": true
"proxy": false
}

View File

@@ -1,7 +1,7 @@
{
"id": "b4f95342-463e-4179-8c3f-193cd7241fb2",
"snapshot": "chrome",
"instruction": "List as many of the next available dates for Diamond Campground as possible.",
"instruction": "Find the Next Available dates for Diamond.",
"source": "test_task_1",
"config": [
{

View File

@@ -66,10 +66,10 @@
"goto_prefix": "https://www.",
"category": "xpath",
"xpathObject": {
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to",
"/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to",
"/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank"
}
}
@@ -101,10 +101,10 @@
},
"timezone": "America/New_York",
"expected": {
"from": "{DoW}, {Month} {Day0D}",
"to": "{DoW}, {Month} {Day0D}",
"from": "Check In{DoW}, {Month} {Day0D}",
"to": "Check Out{DoW}, {Month} {Day0D}",
"city": "New York City Hotels",
"adult": "2 guests",
"adult": "Rooms/Guests1 Room, 2 Guests",
"rank": "Price (low to high)"
}
}
@@ -112,5 +112,5 @@
]
},
"proxy": true,
"possibility_of_env_change": "medium"
"possibility_of_env_change": "high"
}

View File

@@ -1,7 +1,7 @@
{
"id": "fc6d8143-9452-4171-9459-7f515143419a",
"snapshot": "chrome",
"instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.",
"instruction": "Find flights from New YorkKennedy Airport to Chicago O'Hare Airport for tomorrow.",
"source": "test_task_0",
"config": [
{

View File

@@ -2,8 +2,8 @@
{
"host": "gw.dataimpulse.com",
"port": 823,
"username": "your_username",
"password": "your_password",
"username": "e750e5abb74376d28361",
"password": "e5ec245537e1e76a",
"protocol": "http",
"provider": "dataimpulse",
"type": "residential",

View File

@@ -369,9 +369,10 @@ class AnthropicAgent:
)
except (APIError, APIStatusError, APIResponseValidationError) as e:
self.logger.exception(f"Anthropic API error: {str(e)}")
logger.exception(f"Anthropic API error: {str(e)}")
try:
self.logger.warning("Retrying with backup API key...")
logger.warning("Retrying with backup API key...")
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
tools=tools,
betas=betas,
)
self.logger.info("Successfully used backup API key")
logger.info("Successfully used backup API key")
except Exception as backup_e:
self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
logger.exception(f"Backup API call also failed: {str(backup_e)}")
return None, None
except Exception as e:
self.logger.exception(f"Error in Anthropic API: {str(e)}")
logger.exception(f"Error in Anthropic API: {str(e)}")
return None, None
response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
actions = ["DONE"]
return reasonings, actions
def reset(self, *args, **kwargs):
def reset(self, _logger = None, *args, **kwargs):
"""
Reset the agent's state.
"""
global logger
if _logger:
logger = _logger
else:
logger = logging.getLogger("desktopenv.agent")
self.messages = []
self.logger.info(f"{self.class_name} reset.")
logger.info(f"{self.class_name} reset.")

View File

@@ -671,8 +671,14 @@ class OpenAICUAAgent:
action_exit = False
thought_exit = False
message_exit = False
infeasible_message = False
infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"]
for item in response.output:
parsed_item = self._handle_item(item)
if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list):
actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""})
infeasible_message = True
break
if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui":
actions.append(parsed_item)
else:
@@ -693,7 +699,7 @@ class OpenAICUAAgent:
# state_correct = True
# if action_exit and not message_exit:
# state_correct = True
if action_exit:
if action_exit and not infeasible_message:
state_correct = True
if not state_correct:
logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
@@ -747,6 +753,7 @@ class OpenAICUAAgent:
# Convert the action to an Action object
step_action = Action(action.get("action", ""), self.action_space)
# Execute the action in the environment
print(f"Executing action: {step_action.get_action()}")
obs, reward, terminated, info = self.env.step(step_action.get_action())
screenshot_base64 = encode_image(obs["screenshot"])
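
The first hunk in this file adds an infeasibility short-circuit: if a message item mentions an infeasible-style keyword, a FAIL action is emitted and normal action-exit handling is suppressed. A condensed sketch of that logic, with the response items simplified to (type, parsed) pairs:

INFEASIBLE_WORDS = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"]

def collect_actions(items):
    """Turn parsed response items into actions, short-circuiting on an infeasible message."""
    actions = []
    infeasible_message = False
    for kind, parsed in items:
        if kind == "message" and any(w in str(parsed).lower() for w in INFEASIBLE_WORDS):
            actions.append({"action_space": "pyautogui", "action": "FAIL",
                            "pending_checks": [], "call_id": ""})
            infeasible_message = True
            break
        if isinstance(parsed, dict) and parsed.get("action_space") == "pyautogui":
            actions.append(parsed)
    return actions, infeasible_message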

View File

@@ -4,11 +4,11 @@
# Monitor configuration
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
EXAMPLES_BASE_PATH=../evaluation_examples/examples
RESULTS_BASE_PATH=../results_all
RESULTS_BASE_PATH=../results_operator_full_test_0713
ACTION_SPACE=pyautogui
OBSERVATION_TYPE=screenshot
MODEL_NAME=computer-use-preview
MAX_STEPS=150
MAX_STEPS=100
FLASK_PORT=80
FLASK_HOST=0.0.0.0
FLASK_DEBUG=true
FLASK_DEBUG=false

View File

@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from functools import cache
import os
import json
import time
import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
def initialize_default_config():
"""Initialize default configuration from the first available config in results directory"""
global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
if os.path.exists(RESULTS_BASE_PATH):
try:
# Scan for the first available configuration
for action_space in os.listdir(RESULTS_BASE_PATH):
action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
if os.path.isdir(action_space_path):
for obs_type in os.listdir(action_space_path):
obs_path = os.path.join(action_space_path, obs_type)
if os.path.isdir(obs_path):
for model_name in os.listdir(obs_path):
model_path = os.path.join(obs_path, model_name)
if os.path.isdir(model_path):
# Use the first available configuration as default
ACTION_SPACE = action_space
OBSERVATION_TYPE = obs_type
MODEL_NAME = model_name
RESULTS_PATH = model_path
print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
return
except Exception as e:
print(f"Error scanning results directory for default config: {e}")
# Fallback to original environment-based path if no configs found
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
# Initialize default configuration
initialize_default_config()
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
if RESULTS_PATH not in TASK_STATUS_CACHE:
# Initialize cache for this results path
TASK_STATUS_CACHE[RESULTS_PATH] = {}
@cache
def load_task_list():
with open(TASK_CONFIG_PATH, 'r') as f:
return json.load(f)
@cache
def get_task_info(task_type, task_id):
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
# Check if the status is already cached
current_time = time.time()
last_cache_time = None
if cache_key in TASK_STATUS_CACHE:
cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
last_cache_time = cached_time
# If cached status is "Done", check if it's within the stability period
if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
# Cache the status if it is done or error
if status.startswith("Done") or status == "Error":
current_time = last_cache_time if last_cache_time else current_time
TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
"status": task_status
})
@app.route('/api/config')
def api_config():
"""Get configuration information from environment variables - deprecated, use /api/current-config instead"""
config_info = {
"task_config_path": TASK_CONFIG_PATH,
"results_base_path": RESULTS_BASE_PATH,
"action_space": ACTION_SPACE,
"observation_type": OBSERVATION_TYPE,
"model_name": MODEL_NAME,
"max_steps": MAX_STEPS,
"examples_base_path": EXAMPLES_BASE_PATH
}
return jsonify(config_info)
@app.route('/api/available-configs')
def api_available_configs():
"""Get all available configuration combinations by scanning the results directory"""
configs = []
if os.path.exists(RESULTS_BASE_PATH):
try:
# Scan action spaces
for action_space in os.listdir(RESULTS_BASE_PATH):
action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
if os.path.isdir(action_space_path):
# Scan observation types
for obs_type in os.listdir(action_space_path):
obs_path = os.path.join(action_space_path, obs_type)
if os.path.isdir(obs_path):
# Scan model names
for model_name in os.listdir(obs_path):
model_path = os.path.join(obs_path, model_name)
if os.path.isdir(model_path):
configs.append({
"action_space": action_space,
"observation_type": obs_type,
"model_name": model_name,
"path": model_path
})
except Exception as e:
print(f"Error scanning results directory: {e}")
return jsonify(configs)
@app.route('/api/current-config')
def api_current_config():
"""Get current configuration"""
return jsonify({
"action_space": ACTION_SPACE,
"observation_type": OBSERVATION_TYPE,
"model_name": MODEL_NAME,
"max_steps": MAX_STEPS,
"results_path": RESULTS_PATH
})
@app.route('/api/set-config', methods=['POST'])
def api_set_config():
"""Set current configuration"""
global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
data = request.get_json()
if not data:
return jsonify({"error": "No data provided"}), 400
# Update global variables
ACTION_SPACE = data.get('action_space', ACTION_SPACE)
OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
MODEL_NAME = data.get('model_name', MODEL_NAME)
# Update results path
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
if RESULTS_PATH not in TASK_STATUS_CACHE:
# Initialize cache for this results path
TASK_STATUS_CACHE[RESULTS_PATH] = {}
return jsonify({
"action_space": ACTION_SPACE,
"observation_type": OBSERVATION_TYPE,
"model_name": MODEL_NAME,
"max_steps": MAX_STEPS,
"results_path": RESULTS_PATH
})
if __name__ == '__main__':
# Check if necessary directories exist
if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
port = 8080
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
app.run(host=host, port=port, debug=debug)
app.run(host=host, port=port, debug=debug, threaded=True)
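
The monitor changes bucket the task-status cache under the active results path, so switching configurations through /api/set-config does not surface stale entries from another run. A minimal sketch of that two-level cache; the names mirror the globals in this file but the helpers themselves are illustrative:

import time

TASK_STATUS_CACHE = {}   # {results_path: {cache_key: (status_dict, cached_at)}}

def get_cached_status(results_path, cache_key, max_age=None):
    """Return a cached status for the given results path, or None if absent or expired."""
    entry = TASK_STATUS_CACHE.setdefault(results_path, {}).get(cache_key)
    if entry is None:
        return None
    status_dict, cached_at = entry
    if max_age is not None and time.time() - cached_at > max_age:
        return None
    return status_dict

def put_cached_status(results_path, cache_key, status_dict):
    """Store a status under the cache bucket for this results path."""
    TASK_STATUS_CACHE.setdefault(results_path, {})[cache_key] = (status_dict, time.time())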

View File

@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
.layout-container {
position: relative;
max-width: 1200px;
margin: 20px auto;
padding: 0 20px;
}
.main-content {
background: #fff;
border-radius: 14px;
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
padding: 36px 44px;
}
/* Floating Config Sidebar */
.config-sidebar {
position: fixed;
top: 20px;
left: -280px;
width: 300px;
height: calc(100vh - 40px);
z-index: 1000;
transition: left 0.3s ease;
}
.config-sidebar:hover {
left: 0;
}
.config-toggle-btn {
position: absolute;
right: -50px;
top: 50%;
transform: translateY(-50%);
width: 50px;
height: 50px;
background: linear-gradient(135deg, #007bff, #0056b3);
border-radius: 0 25px 25px 0;
display: flex;
align-items: center;
justify-content: center;
color: white;
font-size: 1.2em;
cursor: pointer;
box-shadow: 2px 0 10px rgba(0,0,0,0.2);
transition: all 0.3s ease;
}
.config-toggle-btn:hover {
background: linear-gradient(135deg, #0056b3, #004085);
transform: translateY(-50%) scale(1.05);
}
.config-sidebar:hover .config-toggle-btn {
opacity: 0.8;
}
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
text-shadow: 0 1px 2px rgba(0,0,0,0.05);
}
.accuracy-percentage {
font-size: 0.7em;
font-weight: 600;
color: #ffffff;
margin-left: 8px;
background: rgba(255, 255, 255, 0.1);
padding: 4px 8px;
border-radius: 12px;
display: inline-block;
vertical-align: middle;
}
.stat-card span {
font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
.task-type-stats {
display: flex;
gap: 16px;
flex-wrap: wrap;
gap: 8px;
align-items: center;
}
.task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
color: #b71c1c;
}
/* Task type statistics styles */
.task-stat.score {
color: #ffc107;
background: rgba(255, 193, 7, 0.1);
}
.task-stat.steps {
color: #17a2b8;
background: rgba(23, 162, 184, 0.1);
}
.task-stat.rate {
color: #28a745;
background: rgba(40, 167, 69, 0.1);
}
.tasks-container {
padding: 20px;
transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
background: #a5c7e5;
}
/* Configuration Panel Styles */
.config-panel {
background: #fff;
border-radius: 0 14px 14px 0;
box-shadow: 0 8px 32px rgba(0,0,0,0.15);
overflow: hidden;
height: 100%;
display: flex;
flex-direction: column;
}
.config-header {
display: flex;
align-items: center;
padding: 16px 20px;
background: linear-gradient(135deg, #6c757d, #495057);
color: white;
flex-shrink: 0;
}
.config-header i {
margin-right: 10px;
font-size: 1.1em;
}
.config-header span {
font-weight: 600;
font-size: 1.1em;
}
.config-content {
padding: 20px;
flex: 1;
overflow-y: auto;
}
.config-selector {
margin-bottom: 20px;
padding-bottom: 15px;
border-bottom: 1px solid #dee2e6;
}
.selector-item {
display: flex;
flex-direction: column;
gap: 8px;
}
.selector-item label {
font-weight: 600;
color: #495057;
font-size: 0.9em;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.selector-item select {
padding: 8px 12px;
border: 2px solid #e9ecef;
border-radius: 6px;
background: white;
font-size: 0.9em;
color: #495057;
cursor: pointer;
transition: all 0.3s ease;
}
.selector-item select:focus {
outline: none;
border-color: #007bff;
box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
}
.selector-item select:hover {
border-color: #007bff;
}
.config-list {
display: flex;
flex-direction: column;
gap: 15px;
}
.config-item {
display: flex;
flex-direction: column;
background: #f8f9fa;
padding: 12px;
border-radius: 8px;
border-left: 4px solid #007bff;
transition: all 0.3s ease;
}
.config-item:hover {
transform: translateX(3px);
box-shadow: 0 4px 12px rgba(0,123,255,0.15);
}
.config-label {
font-weight: 600;
color: #495057;
margin-bottom: 5px;
font-size: 0.9em;
text-transform: uppercase;
color: #495057;
font-size: 0.85em;
margin-bottom: 6px;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.config-value {
color: #007bff;
font-family: 'Courier New', monospace;
font-size: 0.9em;
font-weight: 600;
word-break: break-word;
}
.config-path {
font-size: 0.8em;
line-height: 1.3;
}
/* Responsive design for sidebar layout */
@media (max-width: 1024px) {
.config-sidebar {
left: -250px;
width: 250px;
}
.config-toggle-btn {
right: -40px;
width: 40px;
height: 40px;
font-size: 1em;
}
}
@media (max-width: 768px) {
.layout-container {
padding: 0 10px;
}
.main-content {
padding: 20px 25px;
}
.config-sidebar {
left: -220px;
width: 220px;
height: calc(100vh - 20px);
top: 10px;
}
.config-toggle-btn {
right: -35px;
width: 35px;
height: 35px;
font-size: 0.9em;
}
.config-content {
padding: 15px;
}
.config-item {
padding: 10px;
}
}

View File

@@ -1,5 +1,8 @@
document.addEventListener('DOMContentLoaded', () => {
fetchTasks();
fetchAvailableConfigs().then(() => {
fetchConfig();
fetchTasks();
});
// Bind filter functionality
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
let allTaskData = null;
let currentFilter = 'all';
let availableConfigs = [];
let currentConfig = null;
let categoryStats = {};
function refreshPage() {
// Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
fetch('/api/tasks/brief')
.then(response => response.json())
.then(data => {
// Update stored data
allTaskData = data;
categoryStats = calculateCategoryStats(data);
// Only update statistics and task status, do not fully re-render
updateStatistics(data);
updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
.then(response => response.json())
.then(data => {
allTaskData = data;
categoryStats = calculateCategoryStats(data);
renderTasks(data);
updateStatistics(data);
})
@@ -208,13 +215,15 @@ function updateStatistics(data) {
document.getElementById('completed-tasks').textContent = completedTasks;
document.getElementById('error-tasks').textContent = errorTasks;
// Update score display with formatted score
// Update score display with formatted score and accuracy percentage
const scoreDisplay = document.getElementById('score-display');
if (completedTasks > 0) {
const scoreFormatted = totalScore.toFixed(2);
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
const averageScore = totalScore / completedTasks;
const accuracyPercentage = (averageScore * 100).toFixed(1);
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
} else {
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
}
// Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
// Create header with task type name and statistics
const typeHeader = document.createElement('div');
typeHeader.className = 'task-type-header';
// Get category stats for this task type
const stats = categoryStats[taskType] || {};
typeHeader.innerHTML = `
<span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
<div class="task-type-stats">
@@ -286,6 +299,9 @@ function renderTasks(data) {
<span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
<span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
<span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
${stats.avg_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
</div>
`;
typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
container.appendChild(typeSection);
});
}
// add auto-refresh with time interval 10 seconds
setInterval(() => {
refreshPage();
}, 10000); // 10 seconds interval
function fetchAvailableConfigs() {
return fetch('/api/available-configs')
.then(response => response.json())
.then(data => {
availableConfigs = data;
populateConfigSelect();
return data;
})
.catch(error => {
console.error('Error fetching available configs:', error);
return [];
});
}
function populateConfigSelect() {
const select = document.getElementById('config-select');
select.innerHTML = '';
if (availableConfigs.length === 0) {
select.innerHTML = '<option value="">No configurations found in results directory</option>';
return;
}
// Add available configurations
availableConfigs.forEach((config, index) => {
const option = document.createElement('option');
option.value = index;
option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
select.appendChild(option);
});
}
function changeConfiguration() {
const select = document.getElementById('config-select');
const selectedIndex = select.value;
if (selectedIndex === '' || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
return;
}
const selectedConfig = availableConfigs[selectedIndex];
// Send configuration change request
fetch('/api/set-config', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(selectedConfig)
})
.then(response => response.json())
.then(data => {
currentConfig = data;
displayConfig(data);
// Refresh tasks with new configuration
fetchTasks();
})
.catch(error => {
console.error('Error setting config:', error);
displayConfigError();
});
}
function fetchConfig() {
return fetch('/api/current-config')
.then(response => response.json())
.then(data => {
currentConfig = data;
displayConfig(data);
updateConfigSelect();
return data;
})
.catch(error => {
console.error('Error fetching config:', error);
displayConfigError();
});
}
function updateConfigSelect() {
if (!currentConfig || availableConfigs.length === 0) return;
const select = document.getElementById('config-select');
const currentConfigIndex = availableConfigs.findIndex(config =>
config.action_space === currentConfig.action_space &&
config.observation_type === currentConfig.observation_type &&
config.model_name === currentConfig.model_name
);
if (currentConfigIndex !== -1) {
select.value = currentConfigIndex;
} else {
// Current config not found in available configs, select the first one if available
if (availableConfigs.length > 0) {
select.value = 0;
console.warn('Current config not found in available configs, defaulting to first available config');
}
}
}
function displayConfig(config) {
document.getElementById('action-space').textContent = config.action_space || 'N/A';
document.getElementById('observation-type').textContent = config.observation_type || 'N/A';
document.getElementById('model-name').textContent = config.model_name || 'N/A';
document.getElementById('max-steps').textContent = config.max_steps || 'N/A';
}
function displayConfigError() {
const configValues = document.querySelectorAll('.config-value');
configValues.forEach(element => {
element.textContent = 'Error loading';
element.style.color = '#dc3545';
});
}
function calculateCategoryStats(data) {
const stats = {};
Object.entries(data).forEach(([taskType, tasks]) => {
let totalTasks = tasks.length;
let completedTasks = 0;
let runningTasks = 0;
let errorTasks = 0;
let totalScore = 0;
let totalSteps = 0;
let completedWithSteps = 0;
tasks.forEach(task => {
const status = task.status.status;
if (['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'].includes(status)) {
completedTasks++;
// Calculate score if available
if (task.status.result) {
try {
const score = parseFloat(task.status.result);
if (!isNaN(score) && score >= 0 && score <= 1) {
totalScore += score;
}
} catch (e) {
// Ignore parsing errors
}
}
// Calculate steps for completed tasks
if (task.status.progress && task.status.progress > 0) {
totalSteps += task.status.progress;
completedWithSteps++;
}
} else if (['Running', 'Preparing', 'Initializing'].includes(status)) {
runningTasks++;
} else if (status === 'Error') {
errorTasks++;
}
});
// Calculate averages
const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;
stats[taskType] = {
total_tasks: totalTasks,
completed_tasks: completedTasks,
running_tasks: runningTasks,
error_tasks: errorTasks,
total_score: Math.round(totalScore * 100) / 100,
avg_score: Math.round(avgScore * 10000) / 10000,
avg_steps: Math.round(avgSteps * 10) / 10,
completion_rate: Math.round(completionRate * 10) / 10
};
});
return stats;
}

View File

@@ -12,19 +12,62 @@
<link rel="stylesheet" href="/static/index.css">
</head>
<body>
<div class="main-container">
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
<!-- Score Display Banner -->
<div class="score-banner">
<div class="score-content">
<i class="fas fa-star"></i>
<span class="score-label">Score:</span>
<span id="score-display" class="score-value">Loading...</span>
<div class="layout-container">
<!-- Floating Config Button and Sidebar -->
<div class="config-sidebar" id="config-sidebar">
<div class="config-toggle-btn">
<i class="fas fa-cogs"></i>
</div>
<div class="config-panel">
<div class="config-header">
<i class="fas fa-cogs"></i>
<span>Configuration</span>
</div>
<div class="config-content">
<div class="config-selector">
<div class="selector-item">
<label for="config-select">Select Configuration:</label>
<select id="config-select" onchange="changeConfiguration()">
<option value="">Loading configurations...</option>
</select>
</div>
</div>
<div class="config-list">
<div class="config-item">
<span class="config-label">Action Space:</span>
<span class="config-value" id="action-space">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Observation:</span>
<span class="config-value" id="observation-type">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Model:</span>
<span class="config-value" id="model-name">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Max Steps:</span>
<span class="config-value" id="max-steps">Loading...</span>
</div>
</div>
</div>
</div>
</div>
<div class="dashboard-stats">
<!-- Main Content -->
<div class="main-content">
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
<!-- Score Display Banner -->
<div class="score-banner">
<div class="score-content">
<i class="fas fa-star"></i>
<span class="score-label">Score:</span>
<span id="score-display" class="score-value">Loading...</span>
</div>
</div>
<div class="dashboard-stats">
<div class="stat-card">
<i class="fas fa-running"></i>
<span id="active-tasks">Loading...</span>
@@ -46,10 +89,11 @@
<div class="stat-label">Total Tasks</div>
</div>
</div>
<div id="task-container">
<div class="loading-spinner">
<div class="spinner"></div>
<div>Loading task data...</div>
<div id="task-container">
<div class="loading-spinner">
<div class="spinner"></div>
<div>Loading task data...</div>
</div>
</div>
</div>
</div>

run_operator.sh Normal file (9 additions)
View File

@@ -0,0 +1,9 @@
python run_multienv_openaicua.py \
--headless \
--observation_type screenshot \
--model computer-use-preview \
--result_dir ./results_operator_full_test_0713 \
--test_all_meta_path evaluation_examples/test_all.json \
--max_steps 100 \
--num_envs 15 \
--provider_name aws

View File

@@ -68,4 +68,4 @@ def get_result(action_space, use_model, observation_type, result_dir):
if __name__ == '__main__':
get_result("pyautogui", "gpt-4o", "a11y_tree", "./results")
get_result("pyautogui", "computer-use-preview", "screenshot", "./results_operator_full_test_0713")