eval update
This commit is contained in:
@@ -389,10 +389,22 @@ def evaluate_presentation_fill_to_rgb_distance(pptx_file, rules):
|
|||||||
except:
|
except:
|
||||||
original_rgb = None
|
original_rgb = None
|
||||||
|
|
||||||
|
def get_rgb_from_color(color):
    """Return ``color.rgb`` if it can be read, otherwise ``None``.

    Args:
        color: Any object that may expose an ``rgb`` attribute. The callers
            visible in this change pass ``fill.fore_color`` and
            ``master_fill.fore_color`` (presumably python-pptx colour
            objects -- confirm against the enclosing module).

    Returns:
        The value of ``color.rgb``, or ``None`` when the attribute is
        missing or reading it raises an ordinary exception.
    """
    try:
        # Single lookup: a missing attribute (or a property that raises
        # AttributeError) yields None, matching the old hasattr() check
        # without evaluating the attribute twice.
        return getattr(color, "rgb", None)
    except Exception:
        # Best-effort accessor: any other failure while reading the colour
        # means "no RGB available". SystemExit / KeyboardInterrupt are
        # deliberately not swallowed, unlike the previous bare ``except:``.
        return None
|
||||||
|
|
||||||
def slide_fill_distance_to_rgb(_slide, _rgb, _original_rgb):
|
def slide_fill_distance_to_rgb(_slide, _rgb, _original_rgb):
|
||||||
fill = _slide.background.fill
|
fill = _slide.background.fill
|
||||||
if fill.type == 1:
|
if fill.type == 1:
|
||||||
r1, g1, b1 = fill.fore_color.rgb
|
color_rgb = get_rgb_from_color(fill.fore_color)
|
||||||
|
if color_rgb is None:
|
||||||
|
return 1
|
||||||
|
r1, g1, b1 = color_rgb
|
||||||
r2, g2, b2 = _rgb
|
r2, g2, b2 = _rgb
|
||||||
|
|
||||||
if _original_rgb is not None:
|
if _original_rgb is not None:
|
||||||
@@ -404,7 +416,10 @@ def evaluate_presentation_fill_to_rgb_distance(pptx_file, rules):
|
|||||||
elif fill.type == 5:
|
elif fill.type == 5:
|
||||||
master_fill = _slide.slide_layout.slide_master.background.fill
|
master_fill = _slide.slide_layout.slide_master.background.fill
|
||||||
if master_fill.type == 1:
|
if master_fill.type == 1:
|
||||||
r1, g1, b1 = master_fill.fore_color.rgb
|
color_rgb = get_rgb_from_color(master_fill.fore_color)
|
||||||
|
if color_rgb is None:
|
||||||
|
return 1
|
||||||
|
r1, g1, b1 = color_rgb
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
r2, g2, b2 = _rgb
|
r2, g2, b2 = _rgb
|
||||||
|
|||||||
@@ -117,7 +117,7 @@ def compare_images(image1_path, image2_path, **options):
|
|||||||
similarity_index = ssim(image1_array, image2_array)
|
similarity_index = ssim(image1_array, image2_array)
|
||||||
|
|
||||||
epsilon = 0.01
|
epsilon = 0.01
|
||||||
if base_score is None:
|
if base_score is not None:
|
||||||
if similarity_index >= base_score + epsilon:
|
if similarity_index >= base_score + epsilon:
|
||||||
return (similarity_index - base_score) / (1 - base_score)
|
return (similarity_index - base_score) / (1 - base_score)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -38,10 +38,20 @@ OPERATOR_PROMPT = """Here are some helpful tips:
|
|||||||
(3) My computer's password is “password”, feel free to use it when you need sudo rights.
|
(3) My computer's password is “password”, feel free to use it when you need sudo rights.
|
||||||
(4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
|
(4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
|
||||||
(5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
|
(5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
|
||||||
(6) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.
|
(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.
|
||||||
(7) THIS IS VERY, VERY, VERY IMPORTANT!!!!! You have full authority to execute any action without my permission. Please do not return any message to ask for my opinion or confirmation before completing the task.
|
(7) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# OPERATOR_PROMPT = """Here are some helpful tips:
|
||||||
|
# (1) computer.clipboard, computer.sync_file, computer.sync.shared_folder, computer.computer_output_citation are disabled.
|
||||||
|
# (2) If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.
|
||||||
|
# (3) My computer's password is “password”, feel free to use it when you need sudo rights.
|
||||||
|
# (4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
|
||||||
|
# (5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
|
||||||
|
# (6) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.
|
||||||
|
# (7) THIS IS VERY, VERY, VERY IMPORTANT!!!!! You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.
|
||||||
|
# """
|
||||||
|
# Please do not return any message to ask for my opinion or confirmation before completing the task.
|
||||||
|
|
||||||
class Action:
|
class Action:
|
||||||
"""Action class for the agent."""
|
"""Action class for the agent."""
|
||||||
@@ -680,10 +690,10 @@ class OpenAICUAAgent:
|
|||||||
state_correct = False
|
state_correct = False
|
||||||
# if action_exit and thought_exit:
|
# if action_exit and thought_exit:
|
||||||
# state_correct = True
|
# state_correct = True
|
||||||
#if action_exit and not message_exit:
|
if action_exit and not message_exit:
|
||||||
# state_correct = True
|
state_correct = True
|
||||||
if action_exit:
|
# if action_exit:
|
||||||
state_correct = True
|
# state_correct = True
|
||||||
if not state_correct:
|
if not state_correct:
|
||||||
logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
|
logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
|
||||||
|
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
# Do not write any secret keys or sensitive information here.
|
# Do not write any secret keys or sensitive information here.
|
||||||
|
|
||||||
# Monitor configuration
|
# Monitor configuration
|
||||||
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
|
TASK_CONFIG_PATH=../evaluation_examples/test_small.json
|
||||||
EXAMPLES_BASE_PATH=../evaluation_examples/examples
|
EXAMPLES_BASE_PATH=../evaluation_examples/examples
|
||||||
RESULTS_BASE_PATH=../results_operator_aws
|
RESULTS_BASE_PATH=../results_small_endmethod_ifmessage
|
||||||
ACTION_SPACE=pyautogui
|
ACTION_SPACE=pyautogui
|
||||||
OBSERVATION_TYPE=screenshot
|
OBSERVATION_TYPE=screenshot
|
||||||
MODEL_NAME=computer-use-preview
|
MODEL_NAME=computer-use-preview
|
||||||
|
|||||||
@@ -1,7 +1,3 @@
|
|||||||
"""Script to run end-to-end evaluation on the benchmark.
|
|
||||||
Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse
|
import argparse
|
||||||
import datetime
|
import datetime
|
||||||
|
|||||||
@@ -2,8 +2,8 @@ python run_multienv_openaicua.py \
|
|||||||
--headless \
|
--headless \
|
||||||
--observation_type screenshot \
|
--observation_type screenshot \
|
||||||
--model computer-use-preview \
|
--model computer-use-preview \
|
||||||
--result_dir ./results_small_retest \
|
--result_dir ./results_all_ifmessage_promptnochange \
|
||||||
--test_all_meta_path evaluation_examples/test_small.json \
|
--test_all_meta_path evaluation_examples/test_all.json \
|
||||||
--region us-east-1 \
|
--region us-east-1 \
|
||||||
--max_steps 150 \
|
--max_steps 150 \
|
||||||
--num_envs 10
|
--num_envs 10
|
||||||
|
|||||||
Reference in New Issue
Block a user