Merge branch 'fix_chrome'

This commit is contained in:
yuanmengqi
2025-07-15 02:13:58 +00:00
parent 7c807d4f3e
commit 756ef96850
21 changed files with 922 additions and 103 deletions

View File

@@ -1,10 +1,13 @@
import os
import logging
from typing import Dict, List, Set
from typing import Optional, Any, Union
from datetime import datetime
import requests
import pandas as pd
logger = logging.getLogger("desktopenv.getter.file")
def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
"""
@@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
for i, (p, d) in enumerate(zip(paths, dests)):
_path = os.path.join(env.cache_dir, d)
file = env.controller.get_file(p)
if file is None:
try:
# Try to get file from VM
file = env.controller.get_file(p)
if file is None:
logger.warning(f"Failed to get file from VM: {p}")
if i in gives:
cache_paths.append(None)
continue
if i in gives:
cache_paths.append(_path)
# Write file with robust error handling
try:
# Ensure cache directory exists
os.makedirs(env.cache_dir, exist_ok=True)
with open(_path, "wb") as f:
f.write(file)
logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")
except IOError as e:
logger.error(f"IO error writing file {_path}: {e}")
if i in gives:
cache_paths[-1] = None # Replace the path we just added with None
except Exception as e:
logger.error(f"Unexpected error writing file {_path}: {e}")
if i in gives:
cache_paths[-1] = None
except Exception as e:
logger.error(f"Error processing file {p}: {e}")
if i in gives:
cache_paths.append(None)
continue
if i in gives:
cache_paths.append(_path)
with open(_path, "wb") as f:
f.write(file)
return cache_paths[0] if len(cache_paths)==1 else cache_paths
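
The pattern this hunk introduces (fetch from the VM, tolerate a missing file, create the cache directory, degrade to None on write failures) can be summarized in a standalone sketch; a hypothetical controller object with a get_file(path) method returning bytes or None is assumed, so this is an illustration rather than the repository's exact helper.

import logging
import os
from typing import Optional

logger = logging.getLogger("desktopenv.getter.file")

def fetch_to_cache(controller, vm_path: str, cache_dir: str, dest: str) -> Optional[str]:
    """Copy one VM file into the local cache; return the local path or None on any failure."""
    local_path = os.path.join(cache_dir, dest)
    try:
        data = controller.get_file(vm_path)       # may raise, or return None when the file is absent
        if data is None:
            logger.warning("Failed to get file from VM: %s", vm_path)
            return None
        os.makedirs(cache_dir, exist_ok=True)     # the cache directory may not exist yet
        with open(local_path, "wb") as f:
            f.write(data)
        logger.info("Saved %s (%d bytes)", local_path, len(data))
        return local_path
    except OSError as e:                          # IOError is an alias of OSError in Python 3
        logger.error("IO error writing %s: %s", local_path, e)
        return None
    except Exception as e:
        logger.error("Unexpected error for %s: %s", vm_path, e)
        return None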

View File

@@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
"""
if result is None:
logger.warning("Result file path is None, returning 0.0")
return 0.
# Check if file exists
if not os.path.exists(result):
logger.warning(f"Result file does not exist: {result}, returning 0.0")
return 0.
try:
with open(result, 'r', encoding='utf-8') as f:
if is_yaml:
try:
# Use SafeLoader instead of Loader for better security and error handling
result_data: Dict[str, Any] = yaml.safe_load(f)
if result_data is None:
logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
return 0.
except yaml.YAMLError as e:
logger.error(f"YAML parsing error in file {result}: {e}")
logger.error(f"File content might be corrupted or have invalid YAML syntax")
return 0.
except Exception as e:
logger.error(f"Unexpected error parsing YAML file {result}: {e}")
return 0.
else:
try:
result_data: Dict[str, Any] = json.load(f)
except json.JSONDecodeError as e:
logger.error(f"JSON parsing error in file {result}: {e}")
return 0.
except Exception as e:
logger.error(f"Unexpected error parsing JSON file {result}: {e}")
return 0.
except IOError as e:
logger.error(f"IO error reading file {result}: {e}")
return 0.
except Exception as e:
logger.error(f"Unexpected error reading file {result}: {e}")
return 0.
with open(result) as f:
if is_yaml:
result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
else:
result: Dict[str, Any] = json.load(f)
expect_rules = rules.get("expect", {})
unexpect_rules = rules.get("unexpect", {})
metric = True
for r in expect_rules:
value = result
for k in r["key"]:
try:
value = value[k]
except KeyError:
return 0.
metric = metric and _match_value_to_rule(value, r)
value = result_data
try:
for k in r["key"]:
try:
value = value[k]
except KeyError:
logger.debug(f"Key '{k}' not found in result data, returning 0.0")
return 0.
except TypeError:
logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
return 0.
metric = metric and _match_value_to_rule(value, r)
except Exception as e:
logger.error(f"Error processing expect rule {r}: {e}")
return 0.
for r in unexpect_rules:
value = result
for k in r["key"]:
try:
value = value[k]
except KeyError:
value = None
break
metric = metric and not _match_value_to_rule(value, r)
value = result_data
try:
for k in r["key"]:
try:
value = value[k]
except KeyError:
value = None
break
except TypeError:
value = None
break
metric = metric and not _match_value_to_rule(value, r)
except Exception as e:
logger.error(f"Error processing unexpect rule {r}: {e}")
return 0.
return float(metric)
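
Two ideas recur in this hunk: load the result file with yaml.safe_load/json.load and treat any parse failure as a score of 0, and walk nested keys while guarding both KeyError (missing key) and TypeError (intermediate value is not a mapping). A condensed sketch of both helpers, with illustrative names rather than the repository's own:

import json
import logging
from typing import Any, Dict, List, Optional

import yaml

logger = logging.getLogger("desktopenv.metrics")

def load_result(path: str, is_yaml: bool) -> Optional[Dict[str, Any]]:
    """Load a JSON or YAML result file; return None instead of raising on any error."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f) if is_yaml else json.load(f)
    except (OSError, yaml.YAMLError, json.JSONDecodeError) as e:
        logger.error("Failed to load %s: %s", path, e)
        return None

def lookup(data: Any, keys: List[Any]) -> Any:
    """Follow a key path into nested data; return None if a key is missing or a level is not indexable."""
    value = data
    for k in keys:
        try:
            value = value[k]
        except (KeyError, TypeError, IndexError):
            return None
    return value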

View File

@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]
if not original_slide_images:
return 0.
the_image = original_slide_images[0]
the_modified_image = None
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
table2 = shape2.table
if enable_debug:
debug_logger.debug(f" Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
debug_logger.debug(f" Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")
# Check if tables have the same dimensions
if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
debug_logger.debug(f" Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
debug_logger.debug(f" Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
return 0
for row_idx in range(len(table1.rows)):
for col_idx in range(len(table1.columns)):
cell1 = table1.cell(row_idx, col_idx)
cell2 = table2.cell(row_idx, col_idx)
# Check if cells have the same number of paragraphs
if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
debug_logger.debug(f" Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
debug_logger.debug(f" Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
return 0
for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
# Check if paragraphs have the same number of runs
if len(para1.runs) != len(para2.runs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
return 0
for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
# Check font color
if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if shape1.text.strip() != shape2.text.strip() and examine_text:
return 0
# check if the number of paragraphs are the same
if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
debug_logger.debug(f" Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
debug_logger.debug(f" Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
return 0
# check if the paragraphs are the same
para_idx = 0
for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
if para1.level != para2.level and examine_indent:
return 0
# check if the number of runs are the same
if len(para1.runs) != len(para2.runs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
debug_logger.debug(f" Para1 runs: {len(para1.runs)}")
debug_logger.debug(f" Para2 runs: {len(para2.runs)}")
return 0
for run1, run2 in zip(para1.runs, para2.runs):
# check if the font properties are the same
@@ -634,6 +679,12 @@ def compare_pptx_files(file1_path, file2_path, **options):
debug_logger.debug(f" MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'")
return 0
# Check if text shapes have the same number of paragraphs
if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs):
if enable_debug:
debug_logger.debug(f" MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}")
return 0
# Compare alignment of each paragraph
for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)):
from pptx.enum.text import PP_ALIGN
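
The recurring pattern in these hunks is to compare collection lengths before iterating with zip(), because zip() silently truncates to the shorter sequence and would otherwise hide extra paragraphs or runs. A condensed sketch of that check for python-pptx text frames (the helper name is illustrative):

def same_paragraph_structure(frame1, frame2, debug_logger=None):
    """Return False if two text frames differ in paragraph count or in run count per paragraph."""
    paras1, paras2 = frame1.paragraphs, frame2.paragraphs
    if len(paras1) != len(paras2):
        if debug_logger:
            debug_logger.debug("Different number of paragraphs: %d vs %d", len(paras1), len(paras2))
        return False
    for p1, p2 in zip(paras1, paras2):
        if len(p1.runs) != len(p2.runs):
            if debug_logger:
                debug_logger.debug("Different number of runs: %d vs %d", len(p1.runs), len(p2.runs))
            return False
    return True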

View File

@@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str]
# function _parse_sheet_idx {{{ #
if isinstance(sheet_idx, int):
try:
index: str = result_sheet_names[sheet_idx]
except:
if not result_sheet_names or sheet_idx >= len(result_sheet_names):
logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}")
index = ""
else:
index: str = result_sheet_names[sheet_idx]
logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}")
except Exception as e:
logger.error(f"Error resolving sheet index {sheet_idx}: {e}")
index = ""
book: BOOK = result
elif sheet_idx.startswith("RI"):
@@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float:
"""
if result is None:
logger.error("Result file path is None")
return 0.
# Check if result file exists
if not os.path.exists(result):
logger.error(f"Result file not found: {result}")
return 0.
try:
logger.info(f"Loading result file: {result}")
xlworkbookr: Workbook = openpyxl.load_workbook(filename=result)
pdworkbookr = pd.ExcelFile(result)
except:
logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}")
except Exception as e:
logger.error(f"Failed to load result file {result}: {e}")
return 0.
worksheetr_names: List[str] = pdworkbookr.sheet_names
@@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float:
# props: dict like {attribute: {"method": str, "ref": anything}}
# supported attributes: value & those supported by utils._read_cell_style
sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
if sheet is None:
return 0.
# data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
cell: Cell = sheet[r["coordinate"]]
metric: bool = True
for prpt, rule in r["props"].items():
if prpt == "value":
val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"])
else:
val = _read_cell_style(prpt, cell)
try:
sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
if sheet is None:
logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}")
return 0.
# data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
cell: Cell = sheet[r["coordinate"]]
metric: bool = True
for prpt, rule in r["props"].items():
if prpt == "value":
try:
parsed_result = parse_idx(r["sheet_idx"], result, expected)
logger.debug(f"parse_idx result: {parsed_result}")
val = read_cell_value(*parsed_result, r["coordinate"])
logger.debug(f"Cell {r['coordinate']} value: {val}")
except Exception as e:
logger.error(f"Failed to read cell value at {r['coordinate']}: {e}")
val = None
else:
try:
val = _read_cell_style(prpt, cell)
except Exception as e:
logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}")
val = None
metric = metric and _match_value_to_rule(val, rule)
metric = metric and _match_value_to_rule(val, rule)
except Exception as e:
logger.error(f"Error in check_cell processing: {e}")
return 0.
logger.debug("Assertion: %s[%s] :%s - %s"
, r["sheet_idx"], r["coordinate"]
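
The check_cell changes above wrap every per-property read so that a failure to read a value or style degrades to None before rule matching instead of aborting the whole comparison. A generic sketch of that loop, with the reader and matcher functions passed in rather than taken from the repository's utilities:

import logging

logger = logging.getLogger("desktopenv.metrics.table")

def check_cell_props(cell, props, read_value, read_style, match_rule):
    """Return False if any property fails its rule; read errors are treated as a None value."""
    ok = True
    for prop, rule in props.items():
        try:
            val = read_value(cell) if prop == "value" else read_style(prop, cell)
        except Exception as e:
            logger.error("Failed to read %s: %s", prop, e)
            val = None
        ok = ok and match_rule(val, rule)
    return ok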

View File

@@ -4,6 +4,7 @@ import functools
import itertools
import logging
import operator
import os
import re
import zipfile
#import pandas as pd
@@ -33,10 +34,11 @@ V = TypeVar("Value")
logger = logging.getLogger("desktopenv.metrics.utils")
_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
, ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
, ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_namespaces = [
("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
@@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx
def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
# read_cell_value {{{ #
logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")
# Check if file exists
if not os.path.exists(xlsx_file):
logger.error(f"Excel file not found: {xlsx_file}")
return None
try:
with zipfile.ZipFile(xlsx_file, "r") as z_f:
try:
@@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
, namespaces=_xlsx_ns_mapping
)(sheet)
if len(cells) == 0:
logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
return None
cell: _Element = cells[0]
except zipfile.BadZipFile:
except zipfile.BadZipFile as e:
logger.error(f"Bad zip file {xlsx_file}: {e}")
return None
except KeyError as e:
logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
return None
except Exception as e:
logger.error(f"Error reading {xlsx_file}: {e}")
return None
cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
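
read_cell_value now guards the three failure modes of opening an xlsx as a zip archive: a missing file, a corrupt container (BadZipFile), and a missing worksheet member (KeyError). A minimal sketch of that access pattern; the member path shown is the conventional xlsx layout, not a value verified against this repository:

import logging
import os
import zipfile
from typing import Optional

logger = logging.getLogger("desktopenv.metrics.utils")

def read_sheet_xml(xlsx_file: str, sheet_member: str) -> Optional[bytes]:
    """Return the raw XML of one worksheet, or None on any structural problem."""
    if not os.path.exists(xlsx_file):
        logger.error("Excel file not found: %s", xlsx_file)
        return None
    try:
        with zipfile.ZipFile(xlsx_file, "r") as z:
            return z.read(sheet_member)           # e.g. "xl/worksheets/sheet1.xml"
    except zipfile.BadZipFile as e:
        logger.error("Bad zip file %s: %s", xlsx_file, e)
    except KeyError as e:
        logger.error("Member %s not found in %s: %s", sheet_member, xlsx_file, e)
    except Exception as e:
        logger.error("Error reading %s: %s", xlsx_file, e)
    return None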

View File

@@ -1,7 +1,7 @@
{
"id": "47543840-672a-467d-80df-8f7c3b9788c9",
"snapshot": "chrome",
"instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
"instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
"source": "test_task_1",
"config": [
{

View File

@@ -57,5 +57,5 @@
}
}
},
"proxy": true
"proxy": false
}

View File

@@ -56,5 +56,5 @@
}
}
},
"proxy": true
"proxy": false
}

View File

@@ -1,7 +1,7 @@
{
"id": "b4f95342-463e-4179-8c3f-193cd7241fb2",
"snapshot": "chrome",
"instruction": "List as many of the next available dates for Diamond Campground as possible.",
"instruction": "Find the Next Available dates for Diamond.",
"source": "test_task_1",
"config": [
{

View File

@@ -66,10 +66,10 @@
"goto_prefix": "https://www.",
"category": "xpath",
"xpathObject": {
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to",
"/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to",
"/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult",
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank"
}
}
@@ -101,10 +101,10 @@
},
"timezone": "America/New_York",
"expected": {
"from": "{DoW}, {Month} {Day0D}",
"to": "{DoW}, {Month} {Day0D}",
"from": "Check In{DoW}, {Month} {Day0D}",
"to": "Check Out{DoW}, {Month} {Day0D}",
"city": "New York City Hotels",
"adult": "2 guests",
"adult": "Rooms/Guests1 Room, 2 Guests",
"rank": "Price (low to high)"
}
}
@@ -112,5 +112,5 @@
]
},
"proxy": true,
"possibility_of_env_change": "medium"
"possibility_of_env_change": "high"
}

View File

@@ -1,7 +1,7 @@
{
"id": "fc6d8143-9452-4171-9459-7f515143419a",
"snapshot": "chrome",
"instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.",
"instruction": "Find flights from New YorkKennedy Airport to Chicago O'Hare Airport for tomorrow.",
"source": "test_task_0",
"config": [
{

View File

@@ -2,8 +2,8 @@
{
"host": "gw.dataimpulse.com",
"port": 823,
"username": "your_username",
"password": "your_password",
"username": "e750e5abb74376d28361",
"password": "e5ec245537e1e76a",
"protocol": "http",
"provider": "dataimpulse",
"type": "residential",

View File

@@ -369,9 +369,10 @@ class AnthropicAgent:
)
except (APIError, APIStatusError, APIResponseValidationError) as e:
self.logger.exception(f"Anthropic API error: {str(e)}")
logger.exception(f"Anthropic API error: {str(e)}")
try:
self.logger.warning("Retrying with backup API key...")
logger.warning("Retrying with backup API key...")
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
tools=tools,
betas=betas,
)
self.logger.info("Successfully used backup API key")
logger.info("Successfully used backup API key")
except Exception as backup_e:
self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
logger.exception(f"Backup API call also failed: {str(backup_e)}")
return None, None
except Exception as e:
self.logger.exception(f"Error in Anthropic API: {str(e)}")
logger.exception(f"Error in Anthropic API: {str(e)}")
return None, None
response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
actions = ["DONE"]
return reasonings, actions
def reset(self, *args, **kwargs):
def reset(self, _logger = None, *args, **kwargs):
"""
Reset the agent's state.
"""
global logger
if _logger:
logger = _logger
else:
logger = logging.getLogger("desktopenv.agent")
self.messages = []
self.logger.info(f"{self.class_name} reset.")
logger.info(f"{self.class_name} reset.")

View File

@@ -671,8 +671,14 @@ class OpenAICUAAgent:
action_exit = False
thought_exit = False
message_exit = False
infeasible_message = False
infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"]
for item in response.output:
parsed_item = self._handle_item(item)
if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list):
actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""})
infeasible_message = True
break
if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui":
actions.append(parsed_item)
else:
@@ -693,7 +699,7 @@ class OpenAICUAAgent:
# state_correct = True
# if action_exit and not message_exit:
# state_correct = True
if action_exit:
if action_exit and not infeasible_message:
state_correct = True
if not state_correct:
logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
@@ -747,6 +753,7 @@ class OpenAICUAAgent:
# Convert the action to an Action object
step_action = Action(action.get("action", ""), self.action_space)
# Execute the action in the environment
print(f"Executing action: {step_action.get_action()}")
obs, reward, terminated, info = self.env.step(step_action.get_action())
screenshot_base64 = encode_image(obs["screenshot"])
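
The first hunk in this file adds an infeasibility short-circuit: if a message item mentions an infeasible-style keyword, a FAIL action is emitted and normal action-exit handling is suppressed. A condensed sketch of that logic, with the response items simplified to (type, parsed) pairs:

INFEASIBLE_WORDS = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"]

def collect_actions(items):
    """Turn parsed response items into actions, short-circuiting on an infeasible message."""
    actions = []
    infeasible_message = False
    for kind, parsed in items:
        if kind == "message" and any(w in str(parsed).lower() for w in INFEASIBLE_WORDS):
            actions.append({"action_space": "pyautogui", "action": "FAIL",
                            "pending_checks": [], "call_id": ""})
            infeasible_message = True
            break
        if isinstance(parsed, dict) and parsed.get("action_space") == "pyautogui":
            actions.append(parsed)
    return actions, infeasible_message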

View File

@@ -4,11 +4,11 @@
# Monitor configuration
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
EXAMPLES_BASE_PATH=../evaluation_examples/examples
RESULTS_BASE_PATH=../results_all
RESULTS_BASE_PATH=../results_operator_full_test_0713
ACTION_SPACE=pyautogui
OBSERVATION_TYPE=screenshot
MODEL_NAME=computer-use-preview
MAX_STEPS=150
MAX_STEPS=100
FLASK_PORT=80
FLASK_HOST=0.0.0.0
FLASK_DEBUG=true
FLASK_DEBUG=false

View File

@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from functools import cache
import os
import json
import time
import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
def initialize_default_config():
"""Initialize default configuration from the first available config in results directory"""
global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
if os.path.exists(RESULTS_BASE_PATH):
try:
# Scan for the first available configuration
for action_space in os.listdir(RESULTS_BASE_PATH):
action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
if os.path.isdir(action_space_path):
for obs_type in os.listdir(action_space_path):
obs_path = os.path.join(action_space_path, obs_type)
if os.path.isdir(obs_path):
for model_name in os.listdir(obs_path):
model_path = os.path.join(obs_path, model_name)
if os.path.isdir(model_path):
# Use the first available configuration as default
ACTION_SPACE = action_space
OBSERVATION_TYPE = obs_type
MODEL_NAME = model_name
RESULTS_PATH = model_path
print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
return
except Exception as e:
print(f"Error scanning results directory for default config: {e}")
# Fallback to original environment-based path if no configs found
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
# Initialize default configuration
initialize_default_config()
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
if RESULTS_PATH not in TASK_STATUS_CACHE:
# Initialize cache for this results path
TASK_STATUS_CACHE[RESULTS_PATH] = {}
@cache
def load_task_list():
with open(TASK_CONFIG_PATH, 'r') as f:
return json.load(f)
@cache
def get_task_info(task_type, task_id):
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
if os.path.exists(task_file):
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
# Check if the status is already cached
current_time = time.time()
last_cache_time = None
if cache_key in TASK_STATUS_CACHE:
cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
last_cache_time = cached_time
# If cached status is "Done", check if it's within the stability period
if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
# Cache the status if it is done or error
if status.startswith("Done") or status == "Error":
current_time = last_cache_time if last_cache_time else current_time
TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
return status_dict
@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
"status": task_status
})
@app.route('/api/config')
def api_config():
"""Get configuration information from environment variables - deprecated, use /api/current-config instead"""
config_info = {
"task_config_path": TASK_CONFIG_PATH,
"results_base_path": RESULTS_BASE_PATH,
"action_space": ACTION_SPACE,
"observation_type": OBSERVATION_TYPE,
"model_name": MODEL_NAME,
"max_steps": MAX_STEPS,
"examples_base_path": EXAMPLES_BASE_PATH
}
return jsonify(config_info)
@app.route('/api/available-configs')
def api_available_configs():
"""Get all available configuration combinations by scanning the results directory"""
configs = []
if os.path.exists(RESULTS_BASE_PATH):
try:
# Scan action spaces
for action_space in os.listdir(RESULTS_BASE_PATH):
action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
if os.path.isdir(action_space_path):
# Scan observation types
for obs_type in os.listdir(action_space_path):
obs_path = os.path.join(action_space_path, obs_type)
if os.path.isdir(obs_path):
# Scan model names
for model_name in os.listdir(obs_path):
model_path = os.path.join(obs_path, model_name)
if os.path.isdir(model_path):
configs.append({
"action_space": action_space,
"observation_type": obs_type,
"model_name": model_name,
"path": model_path
})
except Exception as e:
print(f"Error scanning results directory: {e}")
return jsonify(configs)
@app.route('/api/current-config')
def api_current_config():
"""Get current configuration"""
return jsonify({
"action_space": ACTION_SPACE,
"observation_type": OBSERVATION_TYPE,
"model_name": MODEL_NAME,
"max_steps": MAX_STEPS,
"results_path": RESULTS_PATH
})
@app.route('/api/set-config', methods=['POST'])
def api_set_config():
"""Set current configuration"""
global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH
data = request.get_json()
if not data:
return jsonify({"error": "No data provided"}), 400
# Update global variables
ACTION_SPACE = data.get('action_space', ACTION_SPACE)
OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
MODEL_NAME = data.get('model_name', MODEL_NAME)
# Update results path
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
if RESULTS_PATH not in TASK_STATUS_CACHE:
# Initialize cache for this results path
TASK_STATUS_CACHE[RESULTS_PATH] = {}
return jsonify({
"action_space": ACTION_SPACE,
"observation_type": OBSERVATION_TYPE,
"model_name": MODEL_NAME,
"max_steps": MAX_STEPS,
"results_path": RESULTS_PATH
})
if __name__ == '__main__':
# Check if necessary directories exist
if not os.path.exists(TASK_CONFIG_PATH):
@@ -447,4 +573,4 @@ if __name__ == '__main__':
port = 8080
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
app.run(host=host, port=port, debug=debug)
app.run(host=host, port=port, debug=debug, threaded=True)
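
The monitor changes bucket the task-status cache under the active results path, so switching configurations through /api/set-config does not surface stale entries from another run. A minimal sketch of that two-level cache; the names mirror the globals in this file but the helpers themselves are illustrative:

import time

TASK_STATUS_CACHE = {}   # {results_path: {cache_key: (status_dict, cached_at)}}

def get_cached_status(results_path, cache_key, max_age=None):
    """Return a cached status for the given results path, or None if absent or expired."""
    entry = TASK_STATUS_CACHE.setdefault(results_path, {}).get(cache_key)
    if entry is None:
        return None
    status_dict, cached_at = entry
    if max_age is not None and time.time() - cached_at > max_age:
        return None
    return status_dict

def put_cached_status(results_path, cache_key, status_dict):
    """Store a status under the cache bucket for this results path."""
    TASK_STATUS_CACHE.setdefault(results_path, {})[cache_key] = (status_dict, time.time())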

View File

@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
.layout-container {
position: relative;
max-width: 1200px;
margin: 20px auto;
padding: 0 20px;
}
.main-content {
background: #fff;
border-radius: 14px;
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
padding: 36px 44px;
}
/* Floating Config Sidebar */
.config-sidebar {
position: fixed;
top: 20px;
left: -280px;
width: 300px;
height: calc(100vh - 40px);
z-index: 1000;
transition: left 0.3s ease;
}
.config-sidebar:hover {
left: 0;
}
.config-toggle-btn {
position: absolute;
right: -50px;
top: 50%;
transform: translateY(-50%);
width: 50px;
height: 50px;
background: linear-gradient(135deg, #007bff, #0056b3);
border-radius: 0 25px 25px 0;
display: flex;
align-items: center;
justify-content: center;
color: white;
font-size: 1.2em;
cursor: pointer;
box-shadow: 2px 0 10px rgba(0,0,0,0.2);
transition: all 0.3s ease;
}
.config-toggle-btn:hover {
background: linear-gradient(135deg, #0056b3, #004085);
transform: translateY(-50%) scale(1.05);
}
.config-sidebar:hover .config-toggle-btn {
opacity: 0.8;
}
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
text-shadow: 0 1px 2px rgba(0,0,0,0.05);
}
.accuracy-percentage {
font-size: 0.7em;
font-weight: 600;
color: #ffffff;
margin-left: 8px;
background: rgba(255, 255, 255, 0.1);
padding: 4px 8px;
border-radius: 12px;
display: inline-block;
vertical-align: middle;
}
.stat-card span {
font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
.task-type-stats {
display: flex;
gap: 16px;
flex-wrap: wrap;
gap: 8px;
align-items: center;
}
.task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
color: #b71c1c;
}
/* Task type statistics styles */
.task-stat.score {
color: #ffc107;
background: rgba(255, 193, 7, 0.1);
}
.task-stat.steps {
color: #17a2b8;
background: rgba(23, 162, 184, 0.1);
}
.task-stat.rate {
color: #28a745;
background: rgba(40, 167, 69, 0.1);
}
.tasks-container {
padding: 20px;
transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
background: #a5c7e5;
}
/* Configuration Panel Styles */
.config-panel {
background: #fff;
border-radius: 0 14px 14px 0;
box-shadow: 0 8px 32px rgba(0,0,0,0.15);
overflow: hidden;
height: 100%;
display: flex;
flex-direction: column;
}
.config-header {
display: flex;
align-items: center;
padding: 16px 20px;
background: linear-gradient(135deg, #6c757d, #495057);
color: white;
flex-shrink: 0;
}
.config-header i {
margin-right: 10px;
font-size: 1.1em;
}
.config-header span {
font-weight: 600;
font-size: 1.1em;
}
.config-content {
padding: 20px;
flex: 1;
overflow-y: auto;
}
.config-selector {
margin-bottom: 20px;
padding-bottom: 15px;
border-bottom: 1px solid #dee2e6;
}
.selector-item {
display: flex;
flex-direction: column;
gap: 8px;
}
.selector-item label {
font-weight: 600;
color: #495057;
font-size: 0.9em;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.selector-item select {
padding: 8px 12px;
border: 2px solid #e9ecef;
border-radius: 6px;
background: white;
font-size: 0.9em;
color: #495057;
cursor: pointer;
transition: all 0.3s ease;
}
.selector-item select:focus {
outline: none;
border-color: #007bff;
box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
}
.selector-item select:hover {
border-color: #007bff;
}
.config-list {
display: flex;
flex-direction: column;
gap: 15px;
}
.config-item {
display: flex;
flex-direction: column;
background: #f8f9fa;
padding: 12px;
border-radius: 8px;
border-left: 4px solid #007bff;
transition: all 0.3s ease;
}
.config-item:hover {
transform: translateX(3px);
box-shadow: 0 4px 12px rgba(0,123,255,0.15);
}
.config-label {
font-weight: 600;
color: #495057;
margin-bottom: 5px;
font-size: 0.9em;
text-transform: uppercase;
color: #495057;
font-size: 0.85em;
margin-bottom: 6px;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.config-value {
color: #007bff;
font-family: 'Courier New', monospace;
font-size: 0.9em;
font-weight: 600;
word-break: break-word;
}
.config-path {
font-size: 0.8em;
line-height: 1.3;
}
/* Responsive design for sidebar layout */
@media (max-width: 1024px) {
.config-sidebar {
left: -250px;
width: 250px;
}
.config-toggle-btn {
right: -40px;
width: 40px;
height: 40px;
font-size: 1em;
}
}
@media (max-width: 768px) {
.layout-container {
padding: 0 10px;
}
.main-content {
padding: 20px 25px;
}
.config-sidebar {
left: -220px;
width: 220px;
height: calc(100vh - 20px);
top: 10px;
}
.config-toggle-btn {
right: -35px;
width: 35px;
height: 35px;
font-size: 0.9em;
}
.config-content {
padding: 15px;
}
.config-item {
padding: 10px;
}
}

View File

@@ -1,5 +1,8 @@
document.addEventListener('DOMContentLoaded', () => {
fetchTasks();
fetchAvailableConfigs().then(() => {
fetchConfig();
fetchTasks();
});
// Bind filter functionality
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
let allTaskData = null;
let currentFilter = 'all';
let availableConfigs = [];
let currentConfig = null;
let categoryStats = {};
function refreshPage() {
// Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
fetch('/api/tasks/brief')
.then(response => response.json())
.then(data => {
// Update stored data
allTaskData = data;
categoryStats = calculateCategoryStats(data);
// Only update statistics and task status, do not fully re-render
updateStatistics(data);
updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
.then(response => response.json())
.then(data => {
allTaskData = data;
categoryStats = calculateCategoryStats(data);
renderTasks(data);
updateStatistics(data);
})
@@ -208,13 +215,15 @@ function updateStatistics(data) {
document.getElementById('completed-tasks').textContent = completedTasks;
document.getElementById('error-tasks').textContent = errorTasks;
// Update score display with formatted score
// Update score display with formatted score and accuracy percentage
const scoreDisplay = document.getElementById('score-display');
if (completedTasks > 0) {
const scoreFormatted = totalScore.toFixed(2);
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
const averageScore = totalScore / completedTasks;
const accuracyPercentage = (averageScore * 100).toFixed(1);
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
} else {
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
}
// Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
// Create header with task type name and statistics
const typeHeader = document.createElement('div');
typeHeader.className = 'task-type-header';
// Get category stats for this task type
const stats = categoryStats[taskType] || {};
typeHeader.innerHTML = `
<span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
<div class="task-type-stats">
@@ -286,6 +299,9 @@ function renderTasks(data) {
<span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
<span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
<span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
${stats.avg_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
</div>
`;
typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
container.appendChild(typeSection);
});
}
// add auto-refresh with time interval 10 seconds
setInterval(() => {
refreshPage();
}, 10000); // 10 seconds interval
function fetchAvailableConfigs() {
return fetch('/api/available-configs')
.then(response => response.json())
.then(data => {
availableConfigs = data;
populateConfigSelect();
return data;
})
.catch(error => {
console.error('Error fetching available configs:', error);
return [];
});
}
function populateConfigSelect() {
const select = document.getElementById('config-select');
select.innerHTML = '';
if (availableConfigs.length === 0) {
select.innerHTML = '<option value="">No configurations found in results directory</option>';
return;
}
// Add available configurations
availableConfigs.forEach((config, index) => {
const option = document.createElement('option');
option.value = index;
option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
select.appendChild(option);
});
}
function changeConfiguration() {
const select = document.getElementById('config-select');
const selectedIndex = select.value;
if (selectedIndex === '' || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
return;
}
const selectedConfig = availableConfigs[selectedIndex];
// Send configuration change request
fetch('/api/set-config', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(selectedConfig)
})
.then(response => response.json())
.then(data => {
currentConfig = data;
displayConfig(data);
// Refresh tasks with new configuration
fetchTasks();
})
.catch(error => {
console.error('Error setting config:', error);
displayConfigError();
});
}
function fetchConfig() {
return fetch('/api/current-config')
.then(response => response.json())
.then(data => {
currentConfig = data;
displayConfig(data);
updateConfigSelect();
return data;
})
.catch(error => {
console.error('Error fetching config:', error);
displayConfigError();
});
}
function updateConfigSelect() {
if (!currentConfig || availableConfigs.length === 0) return;
const select = document.getElementById('config-select');
const currentConfigIndex = availableConfigs.findIndex(config =>
config.action_space === currentConfig.action_space &&
config.observation_type === currentConfig.observation_type &&
config.model_name === currentConfig.model_name
);
if (currentConfigIndex !== -1) {
select.value = currentConfigIndex;
} else {
// Current config not found in available configs, select the first one if available
if (availableConfigs.length > 0) {
select.value = 0;
console.warn('Current config not found in available configs, defaulting to first available config');
}
}
}
function displayConfig(config) {
document.getElementById('action-space').textContent = config.action_space || 'N/A';
document.getElementById('observation-type').textContent = config.observation_type || 'N/A';
document.getElementById('model-name').textContent = config.model_name || 'N/A';
document.getElementById('max-steps').textContent = config.max_steps || 'N/A';
}
function displayConfigError() {
const configValues = document.querySelectorAll('.config-value');
configValues.forEach(element => {
element.textContent = 'Error loading';
element.style.color = '#dc3545';
});
}
function calculateCategoryStats(data) {
const stats = {};
Object.entries(data).forEach(([taskType, tasks]) => {
let totalTasks = tasks.length;
let completedTasks = 0;
let runningTasks = 0;
let errorTasks = 0;
let totalScore = 0;
let totalSteps = 0;
let completedWithSteps = 0;
tasks.forEach(task => {
const status = task.status.status;
if (['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'].includes(status)) {
completedTasks++;
// Calculate score if available
if (task.status.result) {
try {
const score = parseFloat(task.status.result);
if (!isNaN(score) && score >= 0 && score <= 1) {
totalScore += score;
}
} catch (e) {
// Ignore parsing errors
}
}
// Calculate steps for completed tasks
if (task.status.progress && task.status.progress > 0) {
totalSteps += task.status.progress;
completedWithSteps++;
}
} else if (['Running', 'Preparing', 'Initializing'].includes(status)) {
runningTasks++;
} else if (status === 'Error') {
errorTasks++;
}
});
// Calculate averages
const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;
stats[taskType] = {
total_tasks: totalTasks,
completed_tasks: completedTasks,
running_tasks: runningTasks,
error_tasks: errorTasks,
total_score: Math.round(totalScore * 100) / 100,
avg_score: Math.round(avgScore * 10000) / 10000,
avg_steps: Math.round(avgSteps * 10) / 10,
completion_rate: Math.round(completionRate * 10) / 10
};
});
return stats;
}

View File

@@ -12,19 +12,62 @@
<link rel="stylesheet" href="/static/index.css">
</head>
<body>
<div class="main-container">
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
<!-- Score Display Banner -->
<div class="score-banner">
<div class="score-content">
<i class="fas fa-star"></i>
<span class="score-label">Score:</span>
<span id="score-display" class="score-value">Loading...</span>
<div class="layout-container">
<!-- Floating Config Button and Sidebar -->
<div class="config-sidebar" id="config-sidebar">
<div class="config-toggle-btn">
<i class="fas fa-cogs"></i>
</div>
<div class="config-panel">
<div class="config-header">
<i class="fas fa-cogs"></i>
<span>Configuration</span>
</div>
<div class="config-content">
<div class="config-selector">
<div class="selector-item">
<label for="config-select">Select Configuration:</label>
<select id="config-select" onchange="changeConfiguration()">
<option value="">Loading configurations...</option>
</select>
</div>
</div>
<div class="config-list">
<div class="config-item">
<span class="config-label">Action Space:</span>
<span class="config-value" id="action-space">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Observation:</span>
<span class="config-value" id="observation-type">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Model:</span>
<span class="config-value" id="model-name">Loading...</span>
</div>
<div class="config-item">
<span class="config-label">Max Steps:</span>
<span class="config-value" id="max-steps">Loading...</span>
</div>
</div>
</div>
</div>
</div>
<div class="dashboard-stats">
<!-- Main Content -->
<div class="main-content">
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
<!-- Score Display Banner -->
<div class="score-banner">
<div class="score-content">
<i class="fas fa-star"></i>
<span class="score-label">Score:</span>
<span id="score-display" class="score-value">Loading...</span>
</div>
</div>
<div class="dashboard-stats">
<div class="stat-card">
<i class="fas fa-running"></i>
<span id="active-tasks">Loading...</span>
@@ -46,10 +89,11 @@
<div class="stat-label">Total Tasks</div>
</div>
</div>
<div id="task-container">
<div class="loading-spinner">
<div class="spinner"></div>
<div>Loading task data...</div>
<div id="task-container">
<div class="loading-spinner">
<div class="spinner"></div>
<div>Loading task data...</div>
</div>
</div>
</div>
</div>

run_operator.sh Normal file (9 additions)
View File

@@ -0,0 +1,9 @@
python run_multienv_openaicua.py \
--headless \
--observation_type screenshot \
--model computer-use-preview \
--result_dir ./results_operator_full_test_0713 \
--test_all_meta_path evaluation_examples/test_all.json \
--max_steps 100 \
--num_envs 15 \
--provider_name aws

View File

@@ -68,4 +68,4 @@ def get_result(action_space, use_model, observation_type, result_dir):
if __name__ == '__main__':
get_result("pyautogui", "gpt-4o", "a11y_tree", "./results")
get_result("pyautogui", "computer-use-preview", "screenshot", "./results_operator_full_test_0713")