Merge branch 'fix_chrome'
@@ -1,10 +1,13 @@
import os
import logging
from typing import Dict, List, Set
from typing import Optional, Any, Union
from datetime import datetime
import requests
import pandas as pd

logger = logging.getLogger("desktopenv.getter.file")


def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
    """
@@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option

    for i, (p, d) in enumerate(zip(paths, dests)):
        _path = os.path.join(env.cache_dir, d)
        file = env.controller.get_file(p)
        if file is None:

        try:
            # Try to get file from VM
            file = env.controller.get_file(p)
            if file is None:
                logger.warning(f"Failed to get file from VM: {p}")
                if i in gives:
                    cache_paths.append(None)
                continue

            if i in gives:
                cache_paths.append(_path)

            # Write file with robust error handling
            try:
                # Ensure cache directory exists
                os.makedirs(env.cache_dir, exist_ok=True)

                with open(_path, "wb") as f:
                    f.write(file)
                logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")

            except IOError as e:
                logger.error(f"IO error writing file {_path}: {e}")
                if i in gives:
                    cache_paths[-1] = None  # Replace the path we just added with None
            except Exception as e:
                logger.error(f"Unexpected error writing file {_path}: {e}")
                if i in gives:
                    cache_paths[-1] = None

        except Exception as e:
            logger.error(f"Error processing file {p}: {e}")
            if i in gives:
                cache_paths.append(None)
            continue

        if i in gives:
            cache_paths.append(_path)
        with open(_path, "wb") as f:
            f.write(file)

    return cache_paths[0] if len(cache_paths)==1 else cache_paths
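With this change, get_vm_file inserts None placeholders into cache_paths whenever a requested file cannot be fetched from the VM or written to the cache, instead of raising. Callers should therefore expect missing entries. A minimal caller-side sketch, assuming hypothetical config keys and a placeholder handle_file helper (none of these names are part of the patch):

    # Hypothetical usage; "path"/"dest"/"gives" keys and handle_file are illustrative only.
    cached = get_vm_file(env, {"path": vm_paths, "dest": dest_names, "gives": [0, 1]})
    for local_path in (cached if isinstance(cached, list) else [cached]):
        if local_path is None:
            # Fetch or write failed for this entry; treat it as missing.
            continue
        handle_file(local_path)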
@@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
    """

    if result is None:
        logger.warning("Result file path is None, returning 0.0")
        return 0.

    # Check if file exists
    if not os.path.exists(result):
        logger.warning(f"Result file does not exist: {result}, returning 0.0")
        return 0.

    try:
        with open(result, 'r', encoding='utf-8') as f:
            if is_yaml:
                try:
                    # Use SafeLoader instead of Loader for better security and error handling
                    result_data: Dict[str, Any] = yaml.safe_load(f)
                    if result_data is None:
                        logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
                        return 0.
                except yaml.YAMLError as e:
                    logger.error(f"YAML parsing error in file {result}: {e}")
                    logger.error(f"File content might be corrupted or have invalid YAML syntax")
                    return 0.
                except Exception as e:
                    logger.error(f"Unexpected error parsing YAML file {result}: {e}")
                    return 0.
            else:
                try:
                    result_data: Dict[str, Any] = json.load(f)
                except json.JSONDecodeError as e:
                    logger.error(f"JSON parsing error in file {result}: {e}")
                    return 0.
                except Exception as e:
                    logger.error(f"Unexpected error parsing JSON file {result}: {e}")
                    return 0.
    except IOError as e:
        logger.error(f"IO error reading file {result}: {e}")
        return 0.
    except Exception as e:
        logger.error(f"Unexpected error reading file {result}: {e}")
        return 0.
    with open(result) as f:
        if is_yaml:
            result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
        else:
            result: Dict[str, Any] = json.load(f)

    expect_rules = rules.get("expect", {})
    unexpect_rules = rules.get("unexpect", {})

    metric = True
    for r in expect_rules:
        value = result
        for k in r["key"]:
            try:
                value = value[k]
            except KeyError:
                return 0.
        metric = metric and _match_value_to_rule(value, r)
        value = result_data
        try:
            for k in r["key"]:
                try:
                    value = value[k]
                except KeyError:
                    logger.debug(f"Key '{k}' not found in result data, returning 0.0")
                    return 0.
                except TypeError:
                    logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
                    return 0.
            metric = metric and _match_value_to_rule(value, r)
        except Exception as e:
            logger.error(f"Error processing expect rule {r}: {e}")
            return 0.

    for r in unexpect_rules:
        value = result
        for k in r["key"]:
            try:
                value = value[k]
            except KeyError:
                value = None
                break
        metric = metric and not _match_value_to_rule(value, r)
        value = result_data
        try:
            for k in r["key"]:
                try:
                    value = value[k]
                except KeyError:
                    value = None
                    break
                except TypeError:
                    value = None
                    break
            metric = metric and not _match_value_to_rule(value, r)
        except Exception as e:
            logger.error(f"Error processing unexpect rule {r}: {e}")
            return 0.

    return float(metric)
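For reference, check_json follows each rule's "key" path into the parsed JSON/YAML document before handing the value to _match_value_to_rule. A minimal, hypothetical rules dict illustrating that shape (field values invented for the example; the exact matching options depend on _match_value_to_rule, analogous to the {"method": ..., "ref": ...} convention noted for check_cell further below):

    rules = {
        "expect": [
            # Looked up as result_data["settings"]["theme"], then matched against the rule.
            {"key": ["settings", "theme"], "method": "eq", "ref": "dark"},
        ],
        "unexpect": [
            {"key": ["last_error"], "method": "eq", "ref": "fatal"},
        ],
    }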
@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
    original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
    modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]

    if not original_slide_images:
        return 0.

    the_image = original_slide_images[0]

    the_modified_image = None
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
            table2 = shape2.table
            if enable_debug:
                debug_logger.debug(f"  Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
                debug_logger.debug(f"  Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")

            # Check if tables have the same dimensions
            if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
                if enable_debug:
                    debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
                    debug_logger.debug(f"    Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
                    debug_logger.debug(f"    Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
                return 0

            for row_idx in range(len(table1.rows)):
                for col_idx in range(len(table1.columns)):
                    cell1 = table1.cell(row_idx, col_idx)
                    cell2 = table2.cell(row_idx, col_idx)

                    # Check if cells have the same number of paragraphs
                    if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
                        if enable_debug:
                            debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
                            debug_logger.debug(f"    Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
                            debug_logger.debug(f"    Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
                        return 0

                    for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
                        # Check if paragraphs have the same number of runs
                        if len(para1.runs) != len(para2.runs):
                            if enable_debug:
                                debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
                                debug_logger.debug(f"    Para1 runs: {len(para1.runs)}")
                                debug_logger.debug(f"    Para2 runs: {len(para2.runs)}")
                            return 0

                        for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
                            # Check font color
                            if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
            if shape1.text.strip() != shape2.text.strip() and examine_text:
                return 0

            # check if the number of paragraphs are the same
            if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
                if enable_debug:
                    debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
                    debug_logger.debug(f"    Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
                    debug_logger.debug(f"    Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
                return 0

            # check if the paragraphs are the same
            para_idx = 0
            for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
                if para1.level != para2.level and examine_indent:
                    return 0

                # check if the number of runs are the same
                if len(para1.runs) != len(para2.runs):
                    if enable_debug:
                        debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
                        debug_logger.debug(f"    Para1 runs: {len(para1.runs)}")
                        debug_logger.debug(f"    Para2 runs: {len(para2.runs)}")
                    return 0

                for run1, run2 in zip(para1.runs, para2.runs):

                    # check if the font properties are the same
@@ -634,6 +679,12 @@ def compare_pptx_files(file1_path, file2_path, **options):
                    debug_logger.debug(f"  MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'")
                    return 0

                # Check if text shapes have the same number of paragraphs
                if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs):
                    if enable_debug:
                        debug_logger.debug(f"  MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}")
                    return 0

                # Compare alignment of each paragraph
                for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)):
                    from pptx.enum.text import PP_ALIGN

@@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str]
    # function _parse_sheet_idx {{{ #
    if isinstance(sheet_idx, int):
        try:
            index: str = result_sheet_names[sheet_idx]
        except:
            if not result_sheet_names or sheet_idx >= len(result_sheet_names):
                logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}")
                index = ""
            else:
                index: str = result_sheet_names[sheet_idx]
                logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}")
        except Exception as e:
            logger.error(f"Error resolving sheet index {sheet_idx}: {e}")
            index = ""
        book: BOOK = result
    elif sheet_idx.startswith("RI"):
@@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float:
    """

    if result is None:
        logger.error("Result file path is None")
        return 0.

    # Check if result file exists
    if not os.path.exists(result):
        logger.error(f"Result file not found: {result}")
        return 0.

    try:
        logger.info(f"Loading result file: {result}")
        xlworkbookr: Workbook = openpyxl.load_workbook(filename=result)
        pdworkbookr = pd.ExcelFile(result)
    except:
        logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}")
    except Exception as e:
        logger.error(f"Failed to load result file {result}: {e}")
        return 0.
    worksheetr_names: List[str] = pdworkbookr.sheet_names

@@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float:
            # props: dict like {attribute: {"method": str, "ref": anything}}
            # supported attributes: value & those supported by utils._read_cell_style

            sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
            if sheet is None:
                return 0.
            # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
            cell: Cell = sheet[r["coordinate"]]
            metric: bool = True
            for prpt, rule in r["props"].items():
                if prpt == "value":
                    val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"])
                else:
                    val = _read_cell_style(prpt, cell)
            try:
                sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
                if sheet is None:
                    logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}")
                    return 0.
                # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
                cell: Cell = sheet[r["coordinate"]]
                metric: bool = True
                for prpt, rule in r["props"].items():
                    if prpt == "value":
                        try:
                            parsed_result = parse_idx(r["sheet_idx"], result, expected)
                            logger.debug(f"parse_idx result: {parsed_result}")
                            val = read_cell_value(*parsed_result, r["coordinate"])
                            logger.debug(f"Cell {r['coordinate']} value: {val}")
                        except Exception as e:
                            logger.error(f"Failed to read cell value at {r['coordinate']}: {e}")
                            val = None
                    else:
                        try:
                            val = _read_cell_style(prpt, cell)
                        except Exception as e:
                            logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}")
                            val = None

                    metric = metric and _match_value_to_rule(val, rule)
                metric = metric and _match_value_to_rule(val, rule)
            except Exception as e:
                logger.error(f"Error in check_cell processing: {e}")
                return 0.

            logger.debug("Assertion: %s[%s] :%s - %s"
                         , r["sheet_idx"], r["coordinate"]

@@ -4,6 +4,7 @@ import functools
import itertools
import logging
import operator
import os
import re
import zipfile
#import pandas as pd
@@ -33,10 +34,11 @@ V = TypeVar("Value")

logger = logging.getLogger("desktopenv.metrics.utils")

_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
                   , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
                   , ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
                   ]
_xlsx_namespaces = [
    ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
    ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
    ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
@@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx

def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
    # read_cell_value {{{ #
    logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")

    # Check if file exists
    if not os.path.exists(xlsx_file):
        logger.error(f"Excel file not found: {xlsx_file}")
        return None

    try:
        with zipfile.ZipFile(xlsx_file, "r") as z_f:
            try:
@@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
                                             , namespaces=_xlsx_ns_mapping
                                             )(sheet)
                if len(cells) == 0:
                    logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
                    return None
                cell: _Element = cells[0]
    except zipfile.BadZipFile:
    except zipfile.BadZipFile as e:
        logger.error(f"Bad zip file {xlsx_file}: {e}")
        return None
    except KeyError as e:
        logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error reading {xlsx_file}: {e}")
        return None

    cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")

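With these guards in place, read_cell_value returns None instead of raising when the workbook file is missing, the archive is corrupt, or the sheet or cell cannot be found, so callers can simply branch on the result. A small usage sketch (file name, sheet, coordinate, and the expected value are invented for illustration):

    value = read_cell_value("results/book.xlsx", "Sheet1", "B2")
    if value is None:
        # Missing file, bad zip, unknown sheet, or absent cell: count the check as failed.
        score = 0.0
    else:
        score = float(value == expected_value)  # expected_value comes from the evaluation config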
@@ -1,7 +1,7 @@
{
    "id": "47543840-672a-467d-80df-8f7c3b9788c9",
    "snapshot": "chrome",
    "instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
    "instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.",
    "source": "test_task_1",
    "config": [
        {
@@ -57,5 +57,5 @@
            }
        }
    },
    "proxy": true
    "proxy": false
}
@@ -56,5 +56,5 @@
            }
        }
    },
    "proxy": true
    "proxy": false
}
@@ -1,7 +1,7 @@
{
    "id": "b4f95342-463e-4179-8c3f-193cd7241fb2",
    "snapshot": "chrome",
    "instruction": "List as many of the next available dates for Diamond Campground as possible.",
    "instruction": "Find the Next Available dates for Diamond.",
    "source": "test_task_1",
    "config": [
        {
@@ -66,10 +66,10 @@
            "goto_prefix": "https://www.",
            "category": "xpath",
            "xpathObject": {
                "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from",
                "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to",
                "/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city",
                "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult",
                "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from",
                "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to",
                "/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city",
                "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult",
                "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank"
            }
        }
@@ -101,10 +101,10 @@
            },
            "timezone": "America/New_York",
            "expected": {
                "from": "{DoW}, {Month} {Day0D}",
                "to": "{DoW}, {Month} {Day0D}",
                "from": "Check In{DoW}, {Month} {Day0D}",
                "to": "Check Out{DoW}, {Month} {Day0D}",
                "city": "New York City Hotels",
                "adult": "2 guests",
                "adult": "Rooms/Guests1 Room, 2 Guests",
                "rank": "Price (low to high)"
            }
        }
@@ -112,5 +112,5 @@
        ]
    },
    "proxy": true,
    "possibility_of_env_change": "medium"
    "possibility_of_env_change": "high"
}
@@ -1,7 +1,7 @@
{
    "id": "fc6d8143-9452-4171-9459-7f515143419a",
    "snapshot": "chrome",
    "instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.",
    "instruction": "Find flights from New York–Kennedy Airport to Chicago O'Hare Airport for tomorrow.",
    "source": "test_task_0",
    "config": [
        {
@@ -2,8 +2,8 @@
    {
        "host": "gw.dataimpulse.com",
        "port": 823,
        "username": "your_username",
        "password": "your_password",
        "username": "e750e5abb74376d28361",
        "password": "e5ec245537e1e76a",
        "protocol": "http",
        "provider": "dataimpulse",
        "type": "residential",
@@ -369,9 +369,10 @@ class AnthropicAgent:
                )

            except (APIError, APIStatusError, APIResponseValidationError) as e:
                self.logger.exception(f"Anthropic API error: {str(e)}")
                logger.exception(f"Anthropic API error: {str(e)}")
                try:
                    self.logger.warning("Retrying with backup API key...")
                    logger.warning("Retrying with backup API key...")

                    backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)

                    if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
@@ -393,13 +394,13 @@ class AnthropicAgent:
                        tools=tools,
                        betas=betas,
                    )
                    self.logger.info("Successfully used backup API key")
                    logger.info("Successfully used backup API key")
                except Exception as backup_e:
                    self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
                    logger.exception(f"Backup API call also failed: {str(backup_e)}")
                    return None, None

            except Exception as e:
                self.logger.exception(f"Error in Anthropic API: {str(e)}")
                logger.exception(f"Error in Anthropic API: {str(e)}")
                return None, None

        response_params = _response_to_params(response)
@@ -434,9 +435,15 @@ class AnthropicAgent:
            actions = ["DONE"]
        return reasonings, actions

    def reset(self, *args, **kwargs):
    def reset(self, _logger = None, *args, **kwargs):
        """
        Reset the agent's state.
        """
        global logger
        if _logger:
            logger = _logger
        else:
            logger = logging.getLogger("desktopenv.agent")
        self.messages = []
        self.logger.info(f"{self.class_name} reset.")
        logger.info(f"{self.class_name} reset.")


@@ -671,8 +671,14 @@ class OpenAICUAAgent:
        action_exit = False
        thought_exit = False
        message_exit = False
        infeasible_message = False
        infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"]
        for item in response.output:
            parsed_item = self._handle_item(item)
            if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list):
                actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""})
                infeasible_message = True
                break
            if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui":
                actions.append(parsed_item)
            else:
@@ -693,7 +699,7 @@ class OpenAICUAAgent:
        # state_correct = True
        # if action_exit and not message_exit:
        #     state_correct = True
        if action_exit:
        if action_exit and not infeasible_message:
            state_correct = True
        if not state_correct:
            logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
@@ -747,6 +753,7 @@ class OpenAICUAAgent:
            # Convert the action to an Action object
            step_action = Action(action.get("action", ""), self.action_space)
            # Execute the action in the environment
            print(f"Executing action: {step_action.get_action()}")
            obs, reward, terminated, info = self.env.step(step_action.get_action())

            screenshot_base64 = encode_image(obs["screenshot"])

@@ -4,11 +4,11 @@
# Monitor configuration
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
EXAMPLES_BASE_PATH=../evaluation_examples/examples
RESULTS_BASE_PATH=../results_all
RESULTS_BASE_PATH=../results_operator_full_test_0713
ACTION_SPACE=pyautogui
OBSERVATION_TYPE=screenshot
MODEL_NAME=computer-use-preview
MAX_STEPS=150
MAX_STEPS=100
FLASK_PORT=80
FLASK_HOST=0.0.0.0
FLASK_DEBUG=true
FLASK_DEBUG=false
monitor/main.py
@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from functools import cache
import os
import json
import time
import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv


# Load environment variables from .env file
load_dotenv()

@@ -38,12 +41,51 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))

def initialize_default_config():
    """Initialize default configuration from the first available config in results directory"""
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            # Scan for the first available configuration
            for action_space in os.listdir(RESULTS_BASE_PATH):
                action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
                if os.path.isdir(action_space_path):
                    for obs_type in os.listdir(action_space_path):
                        obs_path = os.path.join(action_space_path, obs_type)
                        if os.path.isdir(obs_path):
                            for model_name in os.listdir(obs_path):
                                model_path = os.path.join(obs_path, model_name)
                                if os.path.isdir(model_path):
                                    # Use the first available configuration as default
                                    ACTION_SPACE = action_space
                                    OBSERVATION_TYPE = obs_type
                                    MODEL_NAME = model_name
                                    RESULTS_PATH = model_path
                                    print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")
                                    return
        except Exception as e:
            print(f"Error scanning results directory for default config: {e}")

    # Fallback to original environment-based path if no configs found
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME}")

# Initialize default configuration
initialize_default_config()

RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

if RESULTS_PATH not in TASK_STATUS_CACHE:
    # Initialize cache for this results path
    TASK_STATUS_CACHE[RESULTS_PATH] = {}

@cache
def load_task_list():
    with open(TASK_CONFIG_PATH, 'r') as f:
        return json.load(f)

@cache
def get_task_info(task_type, task_id):
    task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
    if os.path.exists(task_file):
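initialize_default_config assumes the results tree is laid out as RESULTS_BASE_PATH/<action_space>/<observation_type>/<model_name> and simply adopts the first leaf directory it finds as the active configuration. A sketch of a layout it would pick up (directory names below the model level are invented for illustration):

    results_operator_full_test_0713/
        pyautogui/
            screenshot/
                computer-use-preview/          <- adopted as RESULTS_PATH
                    <task_type>/<task_id>/...  (per-task outputs, names illustrative)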
@@ -183,8 +225,8 @@ def get_task_status_brief(task_type, task_id):
    # Check if the status is already cached
    current_time = time.time()
    last_cache_time = None
    if cache_key in TASK_STATUS_CACHE:
        cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
    if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
        cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
        last_cache_time = cached_time
        # If cached status is "Done", check if it's within the stability period
        if cached_status["status"].startswith("Done"):
@@ -312,7 +354,7 @@ def get_task_status_brief(task_type, task_id):
    # Cache the status if it is done or error
    if status.startswith("Done") or status == "Error":
        current_time = last_cache_time if last_cache_time else current_time
        TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
        TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)

    return status_dict

@@ -434,6 +476,90 @@ def api_task_detail(task_type, task_id):
        "status": task_status
    })

@app.route('/api/config')
def api_config():
    """Get configuration information from environment variables - deprecated, use /api/current-config instead"""
    config_info = {
        "task_config_path": TASK_CONFIG_PATH,
        "results_base_path": RESULTS_BASE_PATH,
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "examples_base_path": EXAMPLES_BASE_PATH
    }
    return jsonify(config_info)

@app.route('/api/available-configs')
def api_available_configs():
    """Get all available configuration combinations by scanning the results directory"""
    configs = []

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            # Scan action spaces
            for action_space in os.listdir(RESULTS_BASE_PATH):
                action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
                if os.path.isdir(action_space_path):
                    # Scan observation types
                    for obs_type in os.listdir(action_space_path):
                        obs_path = os.path.join(action_space_path, obs_type)
                        if os.path.isdir(obs_path):
                            # Scan model names
                            for model_name in os.listdir(obs_path):
                                model_path = os.path.join(obs_path, model_name)
                                if os.path.isdir(model_path):
                                    configs.append({
                                        "action_space": action_space,
                                        "observation_type": obs_type,
                                        "model_name": model_name,
                                        "path": model_path
                                    })
        except Exception as e:
            print(f"Error scanning results directory: {e}")

    return jsonify(configs)

@app.route('/api/current-config')
def api_current_config():
    """Get current configuration"""
    return jsonify({
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH
    })

@app.route('/api/set-config', methods=['POST'])
def api_set_config():
    """Set current configuration"""
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH

    data = request.get_json()
    if not data:
        return jsonify({"error": "No data provided"}), 400

    # Update global variables
    ACTION_SPACE = data.get('action_space', ACTION_SPACE)
    OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
    MODEL_NAME = data.get('model_name', MODEL_NAME)

    # Update results path
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

    if RESULTS_PATH not in TASK_STATUS_CACHE:
        # Initialize cache for this results path
        TASK_STATUS_CACHE[RESULTS_PATH] = {}

    return jsonify({
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH
    })

if __name__ == '__main__':
    # Check if necessary directories exist
    if not os.path.exists(TASK_CONFIG_PATH):
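The new /api/set-config route accepts a JSON body with the same action_space, observation_type, and model_name fields that /api/available-configs returns, and replies with the active configuration including the resolved results_path. A hedged usage sketch with the requests library, assuming the monitor is reachable on localhost at the port hardcoded in __main__ below (8080; adjust to your deployment):

    import requests

    new_config = {
        "action_space": "pyautogui",
        "observation_type": "screenshot",
        "model_name": "computer-use-preview",
    }
    # Switch the monitor to another results configuration and print where it now reads from.
    resp = requests.post("http://localhost:8080/api/set-config", json=new_config)
    print(resp.json()["results_path"])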
@@ -447,4 +573,4 @@ if __name__ == '__main__':
    port = 8080
    debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"

    app.run(host=host, port=port, debug=debug)
    app.run(host=host, port=port, debug=debug, threaded=True)
@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }

.layout-container {
    position: relative;
    max-width: 1200px;
    margin: 20px auto;
    padding: 0 20px;
}

.main-content {
    background: #fff;
    border-radius: 14px;
    box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    padding: 36px 44px;
}

/* Floating Config Sidebar */
.config-sidebar {
    position: fixed;
    top: 20px;
    left: -280px;
    width: 300px;
    height: calc(100vh - 40px);
    z-index: 1000;
    transition: left 0.3s ease;
}

.config-sidebar:hover {
    left: 0;
}

.config-toggle-btn {
    position: absolute;
    right: -50px;
    top: 50%;
    transform: translateY(-50%);
    width: 50px;
    height: 50px;
    background: linear-gradient(135deg, #007bff, #0056b3);
    border-radius: 0 25px 25px 0;
    display: flex;
    align-items: center;
    justify-content: center;
    color: white;
    font-size: 1.2em;
    cursor: pointer;
    box-shadow: 2px 0 10px rgba(0,0,0,0.2);
    transition: all 0.3s ease;
}

.config-toggle-btn:hover {
    background: linear-gradient(135deg, #0056b3, #004085);
    transform: translateY(-50%) scale(1.05);
}

.config-sidebar:hover .config-toggle-btn {
    opacity: 0.8;
}

.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    text-shadow: 0 1px 2px rgba(0,0,0,0.05);
}

.accuracy-percentage {
    font-size: 0.7em;
    font-weight: 600;
    color: #ffffff;
    margin-left: 8px;
    background: rgba(255, 255, 255, 0.1);
    padding: 4px 8px;
    border-radius: 12px;
    display: inline-block;
    vertical-align: middle;
}


.stat-card span {
    font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }

.task-type-stats {
    display: flex;
    gap: 16px;
    flex-wrap: wrap;
    gap: 8px;
    align-items: center;
}

.task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    color: #b71c1c;
}

/* Task type statistics styles */
.task-stat.score {
    color: #ffc107;
    background: rgba(255, 193, 7, 0.1);
}

.task-stat.steps {
    color: #17a2b8;
    background: rgba(23, 162, 184, 0.1);
}

.task-stat.rate {
    color: #28a745;
    background: rgba(40, 167, 69, 0.1);
}

.tasks-container {
    padding: 20px;
    transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    background: #a5c7e5;
}

/* Configuration Panel Styles */
.config-panel {
    background: #fff;
    border-radius: 0 14px 14px 0;
    box-shadow: 0 8px 32px rgba(0,0,0,0.15);
    overflow: hidden;
    height: 100%;
    display: flex;
    flex-direction: column;
}

.config-header {
    display: flex;
    align-items: center;
    padding: 16px 20px;
    background: linear-gradient(135deg, #6c757d, #495057);
    color: white;
    flex-shrink: 0;
}

.config-header i {
    margin-right: 10px;
    font-size: 1.1em;
}

.config-header span {
    font-weight: 600;
    font-size: 1.1em;
}

.config-content {
    padding: 20px;
    flex: 1;
    overflow-y: auto;
}

.config-selector {
    margin-bottom: 20px;
    padding-bottom: 15px;
    border-bottom: 1px solid #dee2e6;
}

.selector-item {
    display: flex;
    flex-direction: column;
    gap: 8px;
}

.selector-item label {
    font-weight: 600;
    color: #495057;
    font-size: 0.9em;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

.selector-item select {
    padding: 8px 12px;
    border: 2px solid #e9ecef;
    border-radius: 6px;
    background: white;
    font-size: 0.9em;
    color: #495057;
    cursor: pointer;
    transition: all 0.3s ease;
}

.selector-item select:focus {
    outline: none;
    border-color: #007bff;
    box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
}

.selector-item select:hover {
    border-color: #007bff;
}

.config-list {
    display: flex;
    flex-direction: column;
    gap: 15px;
}

.config-item {
    display: flex;
    flex-direction: column;
    background: #f8f9fa;
    padding: 12px;
    border-radius: 8px;
    border-left: 4px solid #007bff;
    transition: all 0.3s ease;
}

.config-item:hover {
    transform: translateX(3px);
    box-shadow: 0 4px 12px rgba(0,123,255,0.15);
}

.config-label {
    font-weight: 600;
    color: #495057;
    margin-bottom: 5px;
    font-size: 0.9em;
    text-transform: uppercase;
    color: #495057;
    font-size: 0.85em;
    margin-bottom: 6px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

.config-value {
    color: #007bff;
    font-family: 'Courier New', monospace;
    font-size: 0.9em;
    font-weight: 600;
    word-break: break-word;
}

.config-path {
    font-size: 0.8em;
    line-height: 1.3;
}

/* Responsive design for sidebar layout */
@media (max-width: 1024px) {
    .config-sidebar {
        left: -250px;
        width: 250px;
    }

    .config-toggle-btn {
        right: -40px;
        width: 40px;
        height: 40px;
        font-size: 1em;
    }
}

@media (max-width: 768px) {
    .layout-container {
        padding: 0 10px;
    }

    .main-content {
        padding: 20px 25px;
    }

    .config-sidebar {
        left: -220px;
        width: 220px;
        height: calc(100vh - 20px);
        top: 10px;
    }

    .config-toggle-btn {
        right: -35px;
        width: 35px;
        height: 35px;
        font-size: 0.9em;
    }

    .config-content {
        padding: 15px;
    }

    .config-item {
        padding: 10px;
    }
}

@@ -1,5 +1,8 @@
document.addEventListener('DOMContentLoaded', () => {
    fetchTasks();
    fetchAvailableConfigs().then(() => {
        fetchConfig();
        fetchTasks();
    });
    // Bind filter functionality
    document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
    document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {

let allTaskData = null;
let currentFilter = 'all';
let availableConfigs = [];
let currentConfig = null;
let categoryStats = {};

function refreshPage() {
    // Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
    fetch('/api/tasks/brief')
        .then(response => response.json())
        .then(data => {
            // Update stored data
            allTaskData = data;
            categoryStats = calculateCategoryStats(data);
            // Only update statistics and task status, do not fully re-render
            updateStatistics(data);
            updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
        .then(response => response.json())
        .then(data => {
            allTaskData = data;
            categoryStats = calculateCategoryStats(data);
            renderTasks(data);
            updateStatistics(data);
        })
@@ -208,13 +215,15 @@ function updateStatistics(data) {
    document.getElementById('completed-tasks').textContent = completedTasks;
    document.getElementById('error-tasks').textContent = errorTasks;

    // Update score display with formatted score
    // Update score display with formatted score and accuracy percentage
    const scoreDisplay = document.getElementById('score-display');
    if (completedTasks > 0) {
        const scoreFormatted = totalScore.toFixed(2);
        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
        const averageScore = totalScore / completedTasks;
        const accuracyPercentage = (averageScore * 100).toFixed(1);
        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
    } else {
        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
    }

    // Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
        // Create header with task type name and statistics
        const typeHeader = document.createElement('div');
        typeHeader.className = 'task-type-header';

        // Get category stats for this task type
        const stats = categoryStats[taskType] || {};

        typeHeader.innerHTML = `
            <span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
            <div class="task-type-stats">
@@ -286,6 +299,9 @@ function renderTasks(data) {
                <span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
                <span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
                <span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
                ${stats.avg_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.avg_score} avg score</span>` : ''}
                ${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
                ${stats.completion_rate ? `<span class="task-stat rate"><i class="fas fa-percentage"></i> ${stats.completion_rate}% completed</span>` : ''}
            </div>
        `;
        typeSection.appendChild(typeHeader);
@@ -453,7 +469,181 @@ function renderTasks(data) {
        container.appendChild(typeSection);
    });
}
// add auto-refresh with time interval 10 seconds
setInterval(() => {
    refreshPage();
}, 10000); // 10 seconds interval

function fetchAvailableConfigs() {
    return fetch('/api/available-configs')
        .then(response => response.json())
        .then(data => {
            availableConfigs = data;
            populateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching available configs:', error);
            return [];
        });
}

function populateConfigSelect() {
    const select = document.getElementById('config-select');
    select.innerHTML = '';

    if (availableConfigs.length === 0) {
        select.innerHTML = '<option value="">No configurations found in results directory</option>';
        return;
    }

    // Add available configurations
    availableConfigs.forEach((config, index) => {
        const option = document.createElement('option');
        option.value = index;
        option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
        select.appendChild(option);
    });
}

function changeConfiguration() {
    const select = document.getElementById('config-select');
    const selectedIndex = select.value;

    if (selectedIndex === '' || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
        return;
    }

    const selectedConfig = availableConfigs[selectedIndex];

    // Send configuration change request
    fetch('/api/set-config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(selectedConfig)
    })
        .then(response => response.json())
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            // Refresh tasks with new configuration
            fetchTasks();
        })
        .catch(error => {
            console.error('Error setting config:', error);
            displayConfigError();
        });
}

function fetchConfig() {
    return fetch('/api/current-config')
        .then(response => response.json())
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            updateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching config:', error);
            displayConfigError();
        });
}

function updateConfigSelect() {
    if (!currentConfig || availableConfigs.length === 0) return;

    const select = document.getElementById('config-select');
    const currentConfigIndex = availableConfigs.findIndex(config =>
        config.action_space === currentConfig.action_space &&
        config.observation_type === currentConfig.observation_type &&
        config.model_name === currentConfig.model_name
    );

    if (currentConfigIndex !== -1) {
        select.value = currentConfigIndex;
    } else {
        // Current config not found in available configs, select the first one if available
        if (availableConfigs.length > 0) {
            select.value = 0;
            console.warn('Current config not found in available configs, defaulting to first available config');
        }
    }
}

function displayConfig(config) {
    document.getElementById('action-space').textContent = config.action_space || 'N/A';
    document.getElementById('observation-type').textContent = config.observation_type || 'N/A';
    document.getElementById('model-name').textContent = config.model_name || 'N/A';
    document.getElementById('max-steps').textContent = config.max_steps || 'N/A';
}

function displayConfigError() {
    const configValues = document.querySelectorAll('.config-value');
    configValues.forEach(element => {
        element.textContent = 'Error loading';
        element.style.color = '#dc3545';
    });
}

function calculateCategoryStats(data) {
    const stats = {};

    Object.entries(data).forEach(([taskType, tasks]) => {
        let totalTasks = tasks.length;
        let completedTasks = 0;
        let runningTasks = 0;
        let errorTasks = 0;
        let totalScore = 0;
        let totalSteps = 0;
        let completedWithSteps = 0;

        tasks.forEach(task => {
            const status = task.status.status;

            if (['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'].includes(status)) {
                completedTasks++;

                // Calculate score if available
                if (task.status.result) {
                    try {
                        const score = parseFloat(task.status.result);
                        if (!isNaN(score) && score >= 0 && score <= 1) {
                            totalScore += score;
                        }
                    } catch (e) {
                        // Ignore parsing errors
                    }
                }

                // Calculate steps for completed tasks
                if (task.status.progress && task.status.progress > 0) {
                    totalSteps += task.status.progress;
                    completedWithSteps++;
                }

            } else if (['Running', 'Preparing', 'Initializing'].includes(status)) {
                runningTasks++;

            } else if (status === 'Error') {
                errorTasks++;
            }
        });

        // Calculate averages
        const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
        const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
        const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;

        stats[taskType] = {
            total_tasks: totalTasks,
            completed_tasks: completedTasks,
            running_tasks: runningTasks,
            error_tasks: errorTasks,
            total_score: Math.round(totalScore * 100) / 100,
            avg_score: Math.round(avgScore * 10000) / 10000,
            avg_steps: Math.round(avgSteps * 10) / 10,
            completion_rate: Math.round(completionRate * 10) / 10
        };
    });

    return stats;
}

@@ -12,19 +12,62 @@
    <link rel="stylesheet" href="/static/index.css">
</head>
<body>
    <div class="main-container">
        <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>

        <!-- Score Display Banner -->
        <div class="score-banner">
            <div class="score-content">
                <i class="fas fa-star"></i>
                <span class="score-label">Score:</span>
                <span id="score-display" class="score-value">Loading...</span>
    <div class="layout-container">
        <!-- Floating Config Button and Sidebar -->
        <div class="config-sidebar" id="config-sidebar">
            <div class="config-toggle-btn">
                <i class="fas fa-cogs"></i>
            </div>
            <div class="config-panel">
                <div class="config-header">
                    <i class="fas fa-cogs"></i>
                    <span>Configuration</span>
                </div>
                <div class="config-content">
                    <div class="config-selector">
                        <div class="selector-item">
                            <label for="config-select">Select Configuration:</label>
                            <select id="config-select" onchange="changeConfiguration()">
                                <option value="">Loading configurations...</option>
                            </select>
                        </div>
                    </div>
                    <div class="config-list">
                        <div class="config-item">
                            <span class="config-label">Action Space:</span>
                            <span class="config-value" id="action-space">Loading...</span>
                        </div>
                        <div class="config-item">
                            <span class="config-label">Observation:</span>
                            <span class="config-value" id="observation-type">Loading...</span>
                        </div>
                        <div class="config-item">
                            <span class="config-label">Model:</span>
                            <span class="config-value" id="model-name">Loading...</span>
                        </div>
                        <div class="config-item">
                            <span class="config-label">Max Steps:</span>
                            <span class="config-value" id="max-steps">Loading...</span>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="dashboard-stats">
        <!-- Main Content -->
        <div class="main-content">
            <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>

            <!-- Score Display Banner -->
            <div class="score-banner">
                <div class="score-content">
                    <i class="fas fa-star"></i>
                    <span class="score-label">Score:</span>
                    <span id="score-display" class="score-value">Loading...</span>
                </div>
            </div>

            <div class="dashboard-stats">
                <div class="stat-card">
                    <i class="fas fa-running"></i>
                    <span id="active-tasks">Loading...</span>
@@ -46,10 +89,11 @@
                    <div class="stat-label">Total Tasks</div>
                </div>
            </div>
            <div id="task-container">
                <div class="loading-spinner">
                    <div class="spinner"></div>
                    <div>Loading task data...</div>
            <div id="task-container">
                <div class="loading-spinner">
                    <div class="spinner"></div>
                    <div>Loading task data...</div>
                </div>
            </div>
        </div>
    </div>

run_operator.sh (new file)
@@ -0,0 +1,9 @@
python run_multienv_openaicua.py \
    --headless \
    --observation_type screenshot \
    --model computer-use-preview \
    --result_dir ./results_operator_full_test_0713 \
    --test_all_meta_path evaluation_examples/test_all.json \
    --max_steps 100 \
    --num_envs 15 \
    --provider_name aws
@@ -68,4 +68,4 @@ def get_result(action_space, use_model, observation_type, result_dir):


if __name__ == '__main__':
    get_result("pyautogui", "gpt-4o", "a11y_tree", "./results")
    get_result("pyautogui", "computer-use-preview", "screenshot", "./results_operator_full_test_0713")