diff --git a/desktop_env/evaluators/getters/file.py b/desktop_env/evaluators/getters/file.py
index f4ab03a..f329d10 100644
--- a/desktop_env/evaluators/getters/file.py
+++ b/desktop_env/evaluators/getters/file.py
@@ -1,10 +1,13 @@
import os
+import logging
from typing import Dict, List, Set
from typing import Optional, Any, Union
from datetime import datetime
import requests
import pandas as pd
+logger = logging.getLogger("desktopenv.getter.file")
+
def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
"""
@@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
for i, (p, d) in enumerate(zip(paths, dests)):
_path = os.path.join(env.cache_dir, d)
- file = env.controller.get_file(p)
- if file is None:
+
+ try:
+ # Try to get file from VM
+ file = env.controller.get_file(p)
+ if file is None:
+ logger.warning(f"Failed to get file from VM: {p}")
+ if i in gives:
+ cache_paths.append(None)
+ continue
+
+ if i in gives:
+ cache_paths.append(_path)
+
+ # Write file with robust error handling
+ try:
+ # Ensure cache directory exists
+ os.makedirs(env.cache_dir, exist_ok=True)
+
+ with open(_path, "wb") as f:
+ f.write(file)
+ logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")
+
+ except IOError as e:
+ logger.error(f"IO error writing file {_path}: {e}")
+ if i in gives:
+ cache_paths[-1] = None # Replace the path we just added with None
+ except Exception as e:
+ logger.error(f"Unexpected error writing file {_path}: {e}")
+ if i in gives:
+ cache_paths[-1] = None
+
+ except Exception as e:
+ logger.error(f"Error processing file {p}: {e}")
if i in gives:
cache_paths.append(None)
- continue
-
- if i in gives:
- cache_paths.append(_path)
- with open(_path, "wb") as f:
- f.write(file)
+
return cache_paths[0] if len(cache_paths)==1 else cache_paths
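For downstream metrics, the practical effect of the hardened getter is that a cached path may now be None (fetch failed) or absent on disk (write failed) instead of an exception escaping the getter. A minimal sketch of the defensive pattern this implies, using a hypothetical metric function rather than any existing one:

    import os

    def my_metric(result_path):
        # get_vm_file now yields None (or a list containing None) when a VM
        # file cannot be fetched or written to the cache directory.
        if result_path is None or not os.path.exists(result_path):
            return 0.
        with open(result_path, "rb") as f:
            data = f.read()
        return float(len(data) > 0)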
diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py
index d0f6195..27356a2 100644
--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
"""
if result is None:
+ logger.warning("Result file path is None, returning 0.0")
+ return 0.
+
+ # Check if file exists
+ if not os.path.exists(result):
+ logger.warning(f"Result file does not exist: {result}, returning 0.0")
+ return 0.
+
+ try:
+ with open(result, 'r', encoding='utf-8') as f:
+ if is_yaml:
+ try:
+ # Use SafeLoader instead of Loader for better security and error handling
+ result_data: Dict[str, Any] = yaml.safe_load(f)
+ if result_data is None:
+ logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
+ return 0.
+ except yaml.YAMLError as e:
+ logger.error(f"YAML parsing error in file {result}: {e}")
+ logger.error(f"File content might be corrupted or have invalid YAML syntax")
+ return 0.
+ except Exception as e:
+ logger.error(f"Unexpected error parsing YAML file {result}: {e}")
+ return 0.
+ else:
+ try:
+ result_data: Dict[str, Any] = json.load(f)
+ except json.JSONDecodeError as e:
+ logger.error(f"JSON parsing error in file {result}: {e}")
+ return 0.
+ except Exception as e:
+ logger.error(f"Unexpected error parsing JSON file {result}: {e}")
+ return 0.
+ except IOError as e:
+ logger.error(f"IO error reading file {result}: {e}")
+ return 0.
+ except Exception as e:
+ logger.error(f"Unexpected error reading file {result}: {e}")
return 0.
- with open(result) as f:
- if is_yaml:
- result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
- else:
- result: Dict[str, Any] = json.load(f)
expect_rules = rules.get("expect", {})
unexpect_rules = rules.get("unexpect", {})
metric = True
for r in expect_rules:
- value = result
- for k in r["key"]:
- try:
- value = value[k]
- except KeyError:
- return 0.
- metric = metric and _match_value_to_rule(value, r)
+ value = result_data
+ try:
+ for k in r["key"]:
+ try:
+ value = value[k]
+ except KeyError:
+ logger.debug(f"Key '{k}' not found in result data, returning 0.0")
+ return 0.
+ except TypeError:
+ logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
+ return 0.
+ metric = metric and _match_value_to_rule(value, r)
+ except Exception as e:
+ logger.error(f"Error processing expect rule {r}: {e}")
+ return 0.
+
for r in unexpect_rules:
- value = result
- for k in r["key"]:
- try:
- value = value[k]
- except KeyError:
- value = None
- break
- metric = metric and not _match_value_to_rule(value, r)
+ value = result_data
+ try:
+ for k in r["key"]:
+ try:
+ value = value[k]
+ except KeyError:
+ value = None
+ break
+ except TypeError:
+ value = None
+ break
+ metric = metric and not _match_value_to_rule(value, r)
+ except Exception as e:
+ logger.error(f"Error processing unexpect rule {r}: {e}")
+ return 0.
+
return float(metric)
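For reference, a hedged sketch of the rule layout check_json walks after this change; the example keys are invented, and the "method"/"ref" fields are assumed to follow the convention _match_value_to_rule uses elsewhere in the metrics package:

    rules = {
        "expect": [
            {"key": ["editor", "tabSize"], "method": "eq", "ref": 4},
        ],
        "unexpect": [
            {"key": ["editor", "fontFamily"], "method": "eq", "ref": "Comic Sans MS"},
        ],
    }
    # check_json("/path/to/settings.json", rules) now returns 0.0 instead of
    # raising when the file is missing, unreadable, or fails to parse.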
diff --git a/desktop_env/evaluators/metrics/slides.py b/desktop_env/evaluators/metrics/slides.py
index bee4f1a..81c4af9 100644
--- a/desktop_env/evaluators/metrics/slides.py
+++ b/desktop_env/evaluators/metrics/slides.py
@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]
+ if not original_slide_images:
+ return 0.
+
the_image = original_slide_images[0]
the_modified_image = None
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
table2 = shape2.table
if enable_debug:
debug_logger.debug(f" Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
+ debug_logger.debug(f" Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")
+
+ # Check if tables have the same dimensions
+ if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
+ debug_logger.debug(f" Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
+ debug_logger.debug(f" Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
+ return 0
+
for row_idx in range(len(table1.rows)):
for col_idx in range(len(table1.columns)):
cell1 = table1.cell(row_idx, col_idx)
cell2 = table2.cell(row_idx, col_idx)
+ # Check if cells have the same number of paragraphs
+ if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
+ debug_logger.debug(f" Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
+ debug_logger.debug(f" Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
+ return 0
+
for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
+ # Check if paragraphs have the same number of runs
+ if len(para1.runs) != len(para2.runs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
+ debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
+ debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
+ return 0
+
for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
# Check font color
if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if shape1.text.strip() != shape2.text.strip() and examine_text:
return 0
+ # check if the number of paragraphs are the same
+ if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
+ debug_logger.debug(f" Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
+ debug_logger.debug(f" Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
+ return 0
+
# check if the paragraphs are the same
para_idx = 0
for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if para1.level != para2.level and examine_indent:
return 0
+ # check if the number of runs are the same
+ if len(para1.runs) != len(para2.runs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
+ debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
+ debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
+ return 0
+
for run1, run2 in zip(para1.runs, para2.runs):
# check if the font properties are the same
@@ -634,6 +679,12 @@ def compare_pptx_files(file1_path, file2_path, **options):
debug_logger.debug(f" MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'")
return 0
+ # Check if text shapes have the same number of paragraphs
+ if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs):
+ if enable_debug:
+ debug_logger.debug(f" MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}")
+ return 0
+
# Compare alignment of each paragraph
for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)):
from pptx.enum.text import PP_ALIGN
diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py
index db51850..5a0f79d 100644
--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str]
# function _parse_sheet_idx {{{ #
if isinstance(sheet_idx, int):
try:
- index: str = result_sheet_names[sheet_idx]
- except:
+ if not result_sheet_names or sheet_idx >= len(result_sheet_names):
+ logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}")
+ index = ""
+ else:
+ index: str = result_sheet_names[sheet_idx]
+ logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}")
+ except Exception as e:
+ logger.error(f"Error resolving sheet index {sheet_idx}: {e}")
index = ""
book: BOOK = result
elif sheet_idx.startswith("RI"):
@@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float:
"""
if result is None:
+ logger.error("Result file path is None")
+ return 0.
+
+ # Check if result file exists
+ if not os.path.exists(result):
+ logger.error(f"Result file not found: {result}")
return 0.
try:
+ logger.info(f"Loading result file: {result}")
xlworkbookr: Workbook = openpyxl.load_workbook(filename=result)
pdworkbookr = pd.ExcelFile(result)
- except:
+ logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}")
+ except Exception as e:
+ logger.error(f"Failed to load result file {result}: {e}")
return 0.
worksheetr_names: List[str] = pdworkbookr.sheet_names
@@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float:
# props: dict like {attribute: {"method": str, "ref": anything}}
# supported attributes: value & those supported by utils._read_cell_style
- sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
- if sheet is None:
- return 0.
- # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
- cell: Cell = sheet[r["coordinate"]]
- metric: bool = True
- for prpt, rule in r["props"].items():
- if prpt == "value":
- val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"])
- else:
- val = _read_cell_style(prpt, cell)
+ try:
+ sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
+ if sheet is None:
+ logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}")
+ return 0.
+ # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
+ cell: Cell = sheet[r["coordinate"]]
+ metric: bool = True
+ for prpt, rule in r["props"].items():
+ if prpt == "value":
+ try:
+ parsed_result = parse_idx(r["sheet_idx"], result, expected)
+ logger.debug(f"parse_idx result: {parsed_result}")
+ val = read_cell_value(*parsed_result, r["coordinate"])
+ logger.debug(f"Cell {r['coordinate']} value: {val}")
+ except Exception as e:
+ logger.error(f"Failed to read cell value at {r['coordinate']}: {e}")
+ val = None
+ else:
+ try:
+ val = _read_cell_style(prpt, cell)
+ except Exception as e:
+ logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}")
+ val = None
- metric = metric and _match_value_to_rule(val, rule)
+ metric = metric and _match_value_to_rule(val, rule)
+ except Exception as e:
+ logger.error(f"Error in check_cell processing: {e}")
+ return 0.
logger.debug("Assertion: %s[%s] :%s - %s"
, r["sheet_idx"], r["coordinate"]
diff --git a/desktop_env/evaluators/metrics/utils.py b/desktop_env/evaluators/metrics/utils.py
index e512a26..1136655 100644
--- a/desktop_env/evaluators/metrics/utils.py
+++ b/desktop_env/evaluators/metrics/utils.py
@@ -4,6 +4,7 @@ import functools
import itertools
import logging
import operator
+import os
import re
import zipfile
#import pandas as pd
@@ -33,10 +34,11 @@ V = TypeVar("Value")
logger = logging.getLogger("desktopenv.metrics.utils")
-_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
- , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
- , ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
- ]
+_xlsx_namespaces = [
+ ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
+ ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
+ ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
+]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
@@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx
def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
# read_cell_value {{{ #
+ logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")
+
+ # Check if file exists
+ if not os.path.exists(xlsx_file):
+ logger.error(f"Excel file not found: {xlsx_file}")
+ return None
+
try:
with zipfile.ZipFile(xlsx_file, "r") as z_f:
try:
@@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
, namespaces=_xlsx_ns_mapping
)(sheet)
if len(cells) == 0:
+ logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
return None
cell: _Element = cells[0]
- except zipfile.BadZipFile:
+ except zipfile.BadZipFile as e:
+ logger.error(f"Bad zip file {xlsx_file}: {e}")
+ return None
+ except KeyError as e:
+ logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
+ return None
+ except Exception as e:
+ logger.error(f"Error reading {xlsx_file}: {e}")
return None
cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
diff --git a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json
index c74fdcf..4829d2d 100644
--- a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json
+++ b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json
@@ -1,7 +1,7 @@
{
"id": "47543840-672a-467d-80df-8f7c3b9788c9",
"snapshot": "chrome",
- "instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
+ "instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
"source": "test_task_1",
"config": [
{
diff --git a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json
index 9b37187..a93c959 100644
--- a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json
+++ b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json
@@ -57,5 +57,5 @@
}
}
},
- "proxy": true
+ "proxy": false
}
\ No newline at end of file
diff --git a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json
index 9e5d730..6bdffe9 100644
--- a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json
+++ b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json
@@ -56,5 +56,5 @@
}
}
},
- "proxy": true
+ "proxy": false
}
\ No newline at end of file
diff --git a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json
index 7773484..e6fe04f 100644
--- a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json
+++ b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json
@@ -1,7 +1,7 @@
{
"id": "b4f95342-463e-4179-8c3f-193cd7241fb2",
"snapshot": "chrome",
- "instruction": "List as many of the next available dates for Diamond Campground as possible.",
+ "instruction": "Find the Next Available dates for Diamond.",
"source": "test_task_1",
"config": [
{
diff --git a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json
index e84af23..48bf735 100644
--- a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json
+++ b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json
@@ -66,10 +66,10 @@
"goto_prefix": "https://www.",
"category": "xpath",
"xpathObject": {
- "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from",
- "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to",
- "/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city",
- "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult",
+ "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from",
+ "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to",
+ "/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city",
+ "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank"
}
}
@@ -101,10 +101,10 @@
},
"timezone": "America/New_York",
"expected": {
- "from": "{DoW}, {Month} {Day0D}",
- "to": "{DoW}, {Month} {Day0D}",
+ "from": "Check In{DoW}, {Month} {Day0D}",
+ "to": "Check Out{DoW}, {Month} {Day0D}",
"city": "New York City Hotels",
- "adult": "2 guests",
+ "adult": "Rooms/Guests1 Room, 2 Guests",
"rank": "Price (low to high)"
}
}
@@ -112,5 +112,5 @@
]
},
"proxy": true,
- "possibility_of_env_change": "medium"
+ "possibility_of_env_change": "high"
}
\ No newline at end of file
diff --git a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json
index 7fea695..5844e21 100644
--- a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json
+++ b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json
@@ -1,7 +1,7 @@
{
"id": "fc6d8143-9452-4171-9459-7f515143419a",
"snapshot": "chrome",
- "instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.",
+ "instruction": "Find flights from New YorkâKennedy Airport to Chicago O'Hare Airport for tomorrow.",
"source": "test_task_0",
"config": [
{
diff --git a/evaluation_examples/settings/proxy/dataimpulse.json b/evaluation_examples/settings/proxy/dataimpulse.json
index 4cd99ac..3c552a5 100644
--- a/evaluation_examples/settings/proxy/dataimpulse.json
+++ b/evaluation_examples/settings/proxy/dataimpulse.json
@@ -2,8 +2,8 @@
{
"host": "gw.dataimpulse.com",
"port": 823,
- "username": "your_username",
- "password": "your_password",
+ "username": "e750e5abb74376d28361",
+ "password": "e5ec245537e1e76a",
"protocol": "http",
"provider": "dataimpulse",
"type": "residential",
diff --git a/mm_agents/anthropic/main.py b/mm_agents/anthropic/main.py
index 4cffc16..493a7bb 100644
--- a/mm_agents/anthropic/main.py
+++ b/mm_agents/anthropic/main.py
@@ -369,9 +369,10 @@ class AnthropicAgent:
)
except (APIError, APIStatusError, APIResponseValidationError) as e:
- self.logger.exception(f"Anthropic API error: {str(e)}")
+ logger.exception(f"Anthropic API error: {str(e)}")
try:
- self.logger.warning("Retrying with backup API key...")
+ logger.warning("Retrying with backup API key...")
+
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
tools=tools,
betas=betas,
)
- self.logger.info("Successfully used backup API key")
+ logger.info("Successfully used backup API key")
except Exception as backup_e:
- self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
+ logger.exception(f"Backup API call also failed: {str(backup_e)}")
return None, None
except Exception as e:
- self.logger.exception(f"Error in Anthropic API: {str(e)}")
+ logger.exception(f"Error in Anthropic API: {str(e)}")
return None, None
response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
actions = ["DONE"]
return reasonings, actions
- def reset(self, *args, **kwargs):
+    def reset(self, _logger=None, *args, **kwargs):
"""
Reset the agent's state.
"""
+ global logger
+ if _logger:
+ logger = _logger
+ else:
+ logger = logging.getLogger("desktopenv.agent")
self.messages = []
- self.logger.info(f"{self.class_name} reset.")
\ No newline at end of file
+ logger.info(f"{self.class_name} reset.")
+
diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py
index f653a62..315432e 100644
--- a/mm_agents/openai_cua_agent.py
+++ b/mm_agents/openai_cua_agent.py
@@ -671,8 +671,14 @@ class OpenAICUAAgent:
action_exit = False
thought_exit = False
message_exit = False
+ infeasible_message = False
+ infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"]
for item in response.output:
parsed_item = self._handle_item(item)
+ if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list):
+ actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""})
+ infeasible_message = True
+ break
if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui":
actions.append(parsed_item)
else:
@@ -693,7 +699,7 @@ class OpenAICUAAgent:
# state_correct = True
# if action_exit and not message_exit:
# state_correct = True
- if action_exit:
+ if action_exit and not infeasible_message:
state_correct = True
if not state_correct:
logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
@@ -747,6 +753,7 @@ class OpenAICUAAgent:
# Convert the action to an Action object
step_action = Action(action.get("action", ""), self.action_space)
# Execute the action in the environment
+ print(f"Executing action: {step_action.get_action()}")
obs, reward, terminated, info = self.env.step(step_action.get_action())
screenshot_base64 = encode_image(obs["screenshot"])
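The infeasibility handling above reduces to a keyword scan over assistant messages; a self-contained sketch of that check (the surrounding OpenAI response/item types are not reproduced here):

    INFEASIBLE_WORDS = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"]

    def message_marks_infeasible(message_text: str) -> bool:
        # When a message item matches, the agent emits a single FAIL action and
        # no longer counts the turn as a normal action exit.
        text = message_text.lower()
        return any(word in text for word in INFEASIBLE_WORDS)

    assert message_marks_infeasible("Sorry, this task is impossible to complete.")
    assert not message_marks_infeasible("Opening the browser now.")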
diff --git a/monitor/.env b/monitor/.env
index 05618af..2d71a24 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -4,11 +4,11 @@
# Monitor configuration
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_all
+RESULTS_BASE_PATH=../results_operator_full_test_0713
ACTION_SPACE=pyautogui
OBSERVATION_TYPE=screenshot
MODEL_NAME=computer-use-preview
-MAX_STEPS=150
+MAX_STEPS=100
FLASK_PORT=80
FLASK_HOST=0.0.0.0
-FLASK_DEBUG=true
\ No newline at end of file
+FLASK_DEBUG=false
\ No newline at end of file
diff --git a/monitor/main.py b/monitor/main.py
index 1657a78..acdf95a 100644
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
+from functools import cache
import os
import json
import time
+import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv
+
# Load environment variables from .env file
load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
+def initialize_default_config():
+ """Initialize default configuration from the first available config in results directory"""
+ global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+ if os.path.exists(RESULTS_BASE_PATH):
+ try:
+ # Scan for the first available configuration
+ for action_space in os.listdir(RESULTS_BASE_PATH):
+ action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
+ if os.path.isdir(action_space_path):
+ for obs_type in os.listdir(action_space_path):
+ obs_path = os.path.join(action_space_path, obs_type)
+ if os.path.isdir(obs_path):
+ for model_name in os.listdir(obs_path):
+ model_path = os.path.join(obs_path, model_name)
+ if os.path.isdir(model_path):
+ # Use the first available configuration as default
+ ACTION_SPACE = action_space
+ OBSERVATION_TYPE = obs_type
+ MODEL_NAME = model_name
+ RESULTS_PATH = model_path
+ print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+ return
+ except Exception as e:
+ print(f"Error scanning results directory for default config: {e}")
+
+ # Fallback to original environment-based path if no configs found
+ RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+ print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
+
+# Initialize default configuration
+initialize_default_config()
+
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+if RESULTS_PATH not in TASK_STATUS_CACHE:
+ # Initialize cache for this results path
+ TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+@cache
def load_task_list():
with open(TASK_CONFIG_PATH, 'r') as f:
return json.load(f)
+@cache
def get_task_info(task_type, task_id):
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
# Check if the status is already cached
current_time = time.time()
last_cache_time = None
- if cache_key in TASK_STATUS_CACHE:
- cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+ if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
+ cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
last_cache_time = cached_time
# If cached status is "Done", check if it's within the stability period
if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
# Cache the status if it is done or error
if status.startswith("Done") or status == "Error":
current_time = last_cache_time if last_cache_time else current_time
- TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
+ TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
"status": task_status
})
+@app.route('/api/config')
+def api_config():
+ """Get configuration information from environment variables - deprecated, use /api/current-config instead"""
+ config_info = {
+ "task_config_path": TASK_CONFIG_PATH,
+ "results_base_path": RESULTS_BASE_PATH,
+ "action_space": ACTION_SPACE,
+ "observation_type": OBSERVATION_TYPE,
+ "model_name": MODEL_NAME,
+ "max_steps": MAX_STEPS,
+ "examples_base_path": EXAMPLES_BASE_PATH
+ }
+ return jsonify(config_info)
+
+@app.route('/api/available-configs')
+def api_available_configs():
+ """Get all available configuration combinations by scanning the results directory"""
+ configs = []
+
+ if os.path.exists(RESULTS_BASE_PATH):
+ try:
+ # Scan action spaces
+ for action_space in os.listdir(RESULTS_BASE_PATH):
+ action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
+ if os.path.isdir(action_space_path):
+ # Scan observation types
+ for obs_type in os.listdir(action_space_path):
+ obs_path = os.path.join(action_space_path, obs_type)
+ if os.path.isdir(obs_path):
+ # Scan model names
+ for model_name in os.listdir(obs_path):
+ model_path = os.path.join(obs_path, model_name)
+ if os.path.isdir(model_path):
+ configs.append({
+ "action_space": action_space,
+ "observation_type": obs_type,
+ "model_name": model_name,
+ "path": model_path
+ })
+ except Exception as e:
+ print(f"Error scanning results directory: {e}")
+
+ return jsonify(configs)
+
+@app.route('/api/current-config')
+def api_current_config():
+ """Get current configuration"""
+ return jsonify({
+ "action_space": ACTION_SPACE,
+ "observation_type": OBSERVATION_TYPE,
+ "model_name": MODEL_NAME,
+ "max_steps": MAX_STEPS,
+ "results_path": RESULTS_PATH
+ })
+
+@app.route('/api/set-config', methods=['POST'])
+def api_set_config():
+ """Set current configuration"""
+ global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
+
+ data = request.get_json()
+ if not data:
+ return jsonify({"error": "No data provided"}), 400
+
+ # Update global variables
+ ACTION_SPACE = data.get('action_space', ACTION_SPACE)
+ OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
+ MODEL_NAME = data.get('model_name', MODEL_NAME)
+
+ # Update results path
+ RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
+
+ if RESULTS_PATH not in TASK_STATUS_CACHE:
+ # Initialize cache for this results path
+ TASK_STATUS_CACHE[RESULTS_PATH] = {}
+
+ return jsonify({
+ "action_space": ACTION_SPACE,
+ "observation_type": OBSERVATION_TYPE,
+ "model_name": MODEL_NAME,
+ "max_steps": MAX_STEPS,
+ "results_path": RESULTS_PATH
+ })
+
if __name__ == '__main__':
# Check if necessary directories exist
if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
port = 8080
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
- app.run(host=host, port=port, debug=debug)
\ No newline at end of file
+ app.run(host=host, port=port, debug=debug, threaded=True)
\ No newline at end of file
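A hedged usage sketch for the new monitor endpoints, assuming the server is reachable on the port hard-coded in the __main__ block (8080) and that the requests package is installed:

    import requests

    BASE = "http://localhost:8080"

    configs = requests.get(f"{BASE}/api/available-configs").json()
    if configs:
        # Point the monitor at the first discovered
        # action_space / observation_type / model_name combination.
        current = requests.post(f"{BASE}/api/set-config", json=configs[0]).json()
        print(current["results_path"])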
diff --git a/monitor/static/index.css b/monitor/static/index.css
index 0e20e4a..215bcbf 100644
--- a/monitor/static/index.css
+++ b/monitor/static/index.css
@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
+
+.layout-container {
+ position: relative;
+ max-width: 1200px;
+ margin: 20px auto;
+ padding: 0 20px;
+}
+
+.main-content {
+ background: #fff;
+ border-radius: 14px;
+ box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+ padding: 36px 44px;
+}
+
+/* Floating Config Sidebar */
+.config-sidebar {
+ position: fixed;
+ top: 20px;
+ left: -280px;
+ width: 300px;
+ height: calc(100vh - 40px);
+ z-index: 1000;
+ transition: left 0.3s ease;
+}
+
+.config-sidebar:hover {
+ left: 0;
+}
+
+.config-toggle-btn {
+ position: absolute;
+ right: -50px;
+ top: 50%;
+ transform: translateY(-50%);
+ width: 50px;
+ height: 50px;
+ background: linear-gradient(135deg, #007bff, #0056b3);
+ border-radius: 0 25px 25px 0;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ color: white;
+ font-size: 1.2em;
+ cursor: pointer;
+ box-shadow: 2px 0 10px rgba(0,0,0,0.2);
+ transition: all 0.3s ease;
+}
+
+.config-toggle-btn:hover {
+ background: linear-gradient(135deg, #0056b3, #004085);
+ transform: translateY(-50%) scale(1.05);
+}
+
+.config-sidebar:hover .config-toggle-btn {
+ opacity: 0.8;
+}
+
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
text-shadow: 0 1px 2px rgba(0,0,0,0.05);
}
+.accuracy-percentage {
+ font-size: 0.7em;
+ font-weight: 600;
+ color: #ffffff;
+ margin-left: 8px;
+ background: rgba(255, 255, 255, 0.1);
+ padding: 4px 8px;
+ border-radius: 12px;
+ display: inline-block;
+ vertical-align: middle;
+}
+
.stat-card span {
font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
.task-type-stats {
display: flex;
- gap: 16px;
flex-wrap: wrap;
+ gap: 8px;
+ align-items: center;
}
.task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
color: #b71c1c;
}
+/* Task type statistics styles */
+.task-stat.score {
+ color: #ffc107;
+ background: rgba(255, 193, 7, 0.1);
+}
+
+.task-stat.steps {
+ color: #17a2b8;
+ background: rgba(23, 162, 184, 0.1);
+}
+
+.task-stat.rate {
+ color: #28a745;
+ background: rgba(40, 167, 69, 0.1);
+}
+
.tasks-container {
padding: 20px;
transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
background: #a5c7e5;
}
+/* Configuration Panel Styles */
+.config-panel {
+ background: #fff;
+ border-radius: 0 14px 14px 0;
+ box-shadow: 0 8px 32px rgba(0,0,0,0.15);
+ overflow: hidden;
+ height: 100%;
+ display: flex;
+ flex-direction: column;
+}
+
+.config-header {
+ display: flex;
+ align-items: center;
+ padding: 16px 20px;
+ background: linear-gradient(135deg, #6c757d, #495057);
+ color: white;
+ flex-shrink: 0;
+}
+
+.config-header i {
+ margin-right: 10px;
+ font-size: 1.1em;
+}
+
+.config-header span {
+ font-weight: 600;
+ font-size: 1.1em;
+}
+
+.config-content {
+ padding: 20px;
+ flex: 1;
+ overflow-y: auto;
+}
+
+.config-selector {
+ margin-bottom: 20px;
+ padding-bottom: 15px;
+ border-bottom: 1px solid #dee2e6;
+}
+
+.selector-item {
+ display: flex;
+ flex-direction: column;
+ gap: 8px;
+}
+
+.selector-item label {
+ font-weight: 600;
+ color: #495057;
+ font-size: 0.9em;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+}
+
+.selector-item select {
+ padding: 8px 12px;
+ border: 2px solid #e9ecef;
+ border-radius: 6px;
+ background: white;
+ font-size: 0.9em;
+ color: #495057;
+ cursor: pointer;
+ transition: all 0.3s ease;
+}
+
+.selector-item select:focus {
+ outline: none;
+ border-color: #007bff;
+ box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
+}
+
+.selector-item select:hover {
+ border-color: #007bff;
+}
+
+.config-list {
+ display: flex;
+ flex-direction: column;
+ gap: 15px;
+}
+
+.config-item {
+ display: flex;
+ flex-direction: column;
+ background: #f8f9fa;
+ padding: 12px;
+ border-radius: 8px;
+ border-left: 4px solid #007bff;
+ transition: all 0.3s ease;
+}
+
+.config-item:hover {
+ transform: translateX(3px);
+ box-shadow: 0 4px 12px rgba(0,123,255,0.15);
+}
+
+.config-label {
+    font-weight: 600;
+    color: #495057;
+    font-size: 0.85em;
+    margin-bottom: 6px;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.config-value {
+ color: #007bff;
+ font-family: 'Courier New', monospace;
+ font-size: 0.9em;
+ font-weight: 600;
+ word-break: break-word;
+}
+
+.config-path {
+ font-size: 0.8em;
+ line-height: 1.3;
+}
+
+/* Responsive design for sidebar layout */
+@media (max-width: 1024px) {
+ .config-sidebar {
+ left: -250px;
+ width: 250px;
+ }
+
+ .config-toggle-btn {
+ right: -40px;
+ width: 40px;
+ height: 40px;
+ font-size: 1em;
+ }
+}
+
+@media (max-width: 768px) {
+ .layout-container {
+ padding: 0 10px;
+ }
+
+ .main-content {
+ padding: 20px 25px;
+ }
+
+ .config-sidebar {
+ left: -220px;
+ width: 220px;
+ height: calc(100vh - 20px);
+ top: 10px;
+ }
+
+ .config-toggle-btn {
+ right: -35px;
+ width: 35px;
+ height: 35px;
+ font-size: 0.9em;
+ }
+
+ .config-content {
+ padding: 15px;
+ }
+
+ .config-item {
+ padding: 10px;
+ }
+}
+
diff --git a/monitor/static/index.js b/monitor/static/index.js
index 4dd34e5..ed2910e 100644
--- a/monitor/static/index.js
+++ b/monitor/static/index.js
@@ -1,5 +1,8 @@
document.addEventListener('DOMContentLoaded', () => {
- fetchTasks();
+ fetchAvailableConfigs().then(() => {
+ fetchConfig();
+ fetchTasks();
+ });
// Bind filter functionality
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
let allTaskData = null;
let currentFilter = 'all';
+let availableConfigs = [];
+let currentConfig = null;
+let categoryStats = {};
function refreshPage() {
// Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
fetch('/api/tasks/brief')
.then(response => response.json())
.then(data => {
- // Update stored data
allTaskData = data;
+ categoryStats = calculateCategoryStats(data);
// Only update statistics and task status, do not fully re-render
updateStatistics(data);
updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
.then(response => response.json())
.then(data => {
allTaskData = data;
+ categoryStats = calculateCategoryStats(data);
renderTasks(data);
updateStatistics(data);
})
@@ -208,13 +215,15 @@ function updateStatistics(data) {
document.getElementById('completed-tasks').textContent = completedTasks;
document.getElementById('error-tasks').textContent = errorTasks;
- // Update score display with formatted score
+ // Update score display with formatted score and accuracy percentage
const scoreDisplay = document.getElementById('score-display');
if (completedTasks > 0) {
const scoreFormatted = totalScore.toFixed(2);
- scoreDisplay.innerHTML = `${scoreFormatted} / ${completedTasks}`;
+ const averageScore = totalScore / completedTasks;
+ const accuracyPercentage = (averageScore * 100).toFixed(1);
+ scoreDisplay.innerHTML = `${scoreFormatted} / ${completedTasks} (${accuracyPercentage}%)`;
} else {
- scoreDisplay.innerHTML = '0.00 / 0';
+ scoreDisplay.innerHTML = '0.00 / 0 (0.0%)';
}
// Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
// Create header with task type name and statistics
const typeHeader = document.createElement('div');
typeHeader.className = 'task-type-header';
+
+ // Get category stats for this task type
+ const stats = categoryStats[taskType] || {};
+
typeHeader.innerHTML = `
${taskType}
@@ -286,6 +299,9 @@ function renderTasks(data) {
${tasks.length} total
${runningCount} active
${completedCount} completed
+ ${stats.avg_score ? ` ${stats.avg_score} avg score` : ''}
+ ${stats.avg_steps ? ` ${stats.avg_steps} avg steps` : ''}
+ ${stats.completion_rate ? ` ${stats.completion_rate}% completed` : ''}
`;
typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
container.appendChild(typeSection);
});
}
-// add auto-refresh with time interval 10 seconds
-setInterval(() => {
- refreshPage();
-}, 10000); // 10 seconds interval
+
+function fetchAvailableConfigs() {
+ return fetch('/api/available-configs')
+ .then(response => response.json())
+ .then(data => {
+ availableConfigs = data;
+ populateConfigSelect();
+ return data;
+ })
+ .catch(error => {
+ console.error('Error fetching available configs:', error);
+ return [];
+ });
+}
+
+function populateConfigSelect() {
+ const select = document.getElementById('config-select');
+ select.innerHTML = '';
+
+ if (availableConfigs.length === 0) {
+ select.innerHTML = '';
+ return;
+ }
+
+ // Add available configurations
+ availableConfigs.forEach((config, index) => {
+ const option = document.createElement('option');
+ option.value = index;
+ option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
+ select.appendChild(option);
+ });
+}
+
+function changeConfiguration() {
+ const select = document.getElementById('config-select');
+ const selectedIndex = select.value;
+
+ if (selectedIndex === '' || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
+ return;
+ }
+
+ const selectedConfig = availableConfigs[selectedIndex];
+
+ // Send configuration change request
+ fetch('/api/set-config', {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(selectedConfig)
+ })
+ .then(response => response.json())
+ .then(data => {
+ currentConfig = data;
+ displayConfig(data);
+ // Refresh tasks with new configuration
+ fetchTasks();
+ })
+ .catch(error => {
+ console.error('Error setting config:', error);
+ displayConfigError();
+ });
+}
+
+function fetchConfig() {
+ return fetch('/api/current-config')
+ .then(response => response.json())
+ .then(data => {
+ currentConfig = data;
+ displayConfig(data);
+ updateConfigSelect();
+ return data;
+ })
+ .catch(error => {
+ console.error('Error fetching config:', error);
+ displayConfigError();
+ });
+}
+
+function updateConfigSelect() {
+ if (!currentConfig || availableConfigs.length === 0) return;
+
+ const select = document.getElementById('config-select');
+ const currentConfigIndex = availableConfigs.findIndex(config =>
+ config.action_space === currentConfig.action_space &&
+ config.observation_type === currentConfig.observation_type &&
+ config.model_name === currentConfig.model_name
+ );
+
+ if (currentConfigIndex !== -1) {
+ select.value = currentConfigIndex;
+ } else {
+ // Current config not found in available configs, select the first one if available
+ if (availableConfigs.length > 0) {
+ select.value = 0;
+ console.warn('Current config not found in available configs, defaulting to first available config');
+ }
+ }
+}
+
+function displayConfig(config) {
+ document.getElementById('action-space').textContent = config.action_space || 'N/A';
+ document.getElementById('observation-type').textContent = config.observation_type || 'N/A';
+ document.getElementById('model-name').textContent = config.model_name || 'N/A';
+ document.getElementById('max-steps').textContent = config.max_steps || 'N/A';
+}
+
+function displayConfigError() {
+ const configValues = document.querySelectorAll('.config-value');
+ configValues.forEach(element => {
+ element.textContent = 'Error loading';
+ element.style.color = '#dc3545';
+ });
+}
+
+function calculateCategoryStats(data) {
+ const stats = {};
+
+ Object.entries(data).forEach(([taskType, tasks]) => {
+ let totalTasks = tasks.length;
+ let completedTasks = 0;
+ let runningTasks = 0;
+ let errorTasks = 0;
+ let totalScore = 0;
+ let totalSteps = 0;
+ let completedWithSteps = 0;
+
+ tasks.forEach(task => {
+ const status = task.status.status;
+
+ if (['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'].includes(status)) {
+ completedTasks++;
+
+ // Calculate score if available
+ if (task.status.result) {
+ try {
+ const score = parseFloat(task.status.result);
+ if (!isNaN(score) && score >= 0 && score <= 1) {
+ totalScore += score;
+ }
+ } catch (e) {
+ // Ignore parsing errors
+ }
+ }
+
+ // Calculate steps for completed tasks
+ if (task.status.progress && task.status.progress > 0) {
+ totalSteps += task.status.progress;
+ completedWithSteps++;
+ }
+
+ } else if (['Running', 'Preparing', 'Initializing'].includes(status)) {
+ runningTasks++;
+
+ } else if (status === 'Error') {
+ errorTasks++;
+ }
+ });
+
+ // Calculate averages
+ const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
+ const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
+ const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;
+
+ stats[taskType] = {
+ total_tasks: totalTasks,
+ completed_tasks: completedTasks,
+ running_tasks: runningTasks,
+ error_tasks: errorTasks,
+ total_score: Math.round(totalScore * 100) / 100,
+ avg_score: Math.round(avgScore * 10000) / 10000,
+ avg_steps: Math.round(avgSteps * 10) / 10,
+ completion_rate: Math.round(completionRate * 10) / 10
+ };
+ });
+
+ return stats;
+}
diff --git a/monitor/templates/index.html b/monitor/templates/index.html
index 0c34f3c..ef91ab9 100644
--- a/monitor/templates/index.html
+++ b/monitor/templates/index.html
@@ -12,19 +12,62 @@
-
-
OSWorld Monitor System Online
-
-
-
-
-
-
Score:
-
Loading...
+
+
+
-
+
+
+
OSWorld Monitor System Online
+
+
+
+
+
+ Score:
+ Loading...
+
+
+
+
Loading...
@@ -46,10 +89,11 @@
Total Tasks
-
-
-
-
Loading task data...
+
+
+
+
Loading task data...
+
diff --git a/run_operator.sh b/run_operator.sh
new file mode 100644
index 0000000..154df38
--- /dev/null
+++ b/run_operator.sh
@@ -0,0 +1,9 @@
+python run_multienv_openaicua.py \
+--headless \
+--observation_type screenshot \
+--model computer-use-preview \
+--result_dir ./results_operator_full_test_0713 \
+--test_all_meta_path evaluation_examples/test_all.json \
+--max_steps 100 \
+--num_envs 15 \
+--provider_name aws
\ No newline at end of file
diff --git a/show_result.py b/show_result.py
index c6bbbc5..623833d 100644
--- a/show_result.py
+++ b/show_result.py
@@ -68,4 +68,4 @@ def get_result(action_space, use_model, observation_type, result_dir):
if __name__ == '__main__':
- get_result("pyautogui", "gpt-4o", "a11y_tree", "./results")
+ get_result("pyautogui", "computer-use-preview", "screenshot", "./results_operator_full_test_0713")