fix error
This commit is contained in:
11
evaluation_examples/test_bug_0608.json
Normal file
11
evaluation_examples/test_bug_0608.json
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"libreoffice_writer": [
|
||||
"0b17a146-2934-46c7-8727-73ff6b6483e8"
|
||||
],
|
||||
"libreoffice_calc": [
|
||||
"1954cced-e748-45c4-9c26-9855b97fbc5e"
|
||||
],
|
||||
"vlc": [
|
||||
"fba2c100-79e8-42df-ae74-b592418d54f4"
|
||||
]
|
||||
}
|
||||
5
evaluation_examples/test_impress.json
Normal file
5
evaluation_examples/test_impress.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"libreoffice_impress": [
|
||||
"0a211154-fda0-48d0-9274-eaac4ce5486d"
|
||||
]
|
||||
}
|
||||
66
evaluation_examples/test_rest0608.json
Normal file
66
evaluation_examples/test_rest0608.json
Normal file
@@ -0,0 +1,66 @@
|
||||
{
|
||||
"chrome": [
|
||||
"1704f00f-79e6-43a7-961b-cedd3724d5fd",
|
||||
"0d8b7de3-e8de-4d86-b9fd-dd2dce58a217",
|
||||
"cabb3bae-cccb-41bd-9f5d-0f3a9fecd825"
|
||||
],
|
||||
"gimp": [
|
||||
"62f7fd55-0687-4a43-b6e1-3eda16fc6252"
|
||||
],
|
||||
"libreoffice_calc": [
|
||||
"1954cced-e748-45c4-9c26-9855b97fbc5e",
|
||||
"035f41ba-6653-43ab-aa63-c86d449d62e5",
|
||||
"8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
|
||||
"1334ca3e-f9e3-4db8-9ca7-b4c653be7d17",
|
||||
"21ab7b40-77c2-4ae6-8321-e00d3a086c73"
|
||||
],
|
||||
"libreoffice_impress": [
|
||||
"5d901039-a89c-4bfb-967b-bf66f4df075e",
|
||||
"15aece23-a215-4579-91b4-69eec72e18da",
|
||||
"a434992a-89df-4577-925c-0c58b747f0f4",
|
||||
"af2d657a-e6b3-4c6a-9f67-9e3ed015974c",
|
||||
"4ed5abd0-8b5d-47bd-839f-cacfa15ca37a",
|
||||
"04578141-1d42-4146-b9cf-6fab4ce5fd74"
|
||||
],
|
||||
"libreoffice_writer": [
|
||||
"0b17a146-2934-46c7-8727-73ff6b6483e8",
|
||||
"adf5e2c3-64c7-4644-b7b6-d2f0167927e7",
|
||||
"ecc2413d-8a48-416e-a3a2-d30106ca36cb"
|
||||
],
|
||||
"multi_apps": [
|
||||
"a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb",
|
||||
"b52b40a5-ad70-4c53-b5b0-5650a8387052",
|
||||
"f8cfa149-d1c1-4215-8dac-4a0932bad3c2",
|
||||
"337d318b-aa07-4f4f-b763-89d9a2dd013f",
|
||||
"869de13e-bef9-4b91-ba51-f6708c40b096",
|
||||
"3a93cae4-ad3e-403e-8c12-65303b271818",
|
||||
"9219480b-3aed-47fc-8bac-d2cffc5849f7",
|
||||
"7e287123-70ca-47b9-8521-47db09b69b14",
|
||||
"e2392362-125e-4f76-a2ee-524b183a3412",
|
||||
"873cafdd-a581-47f6-8b33-b9696ddb7b05",
|
||||
"b337d106-053f-4d37-8da0-7f9c4043a66b",
|
||||
"20236825-b5df-46e7-89bf-62e1d640a897",
|
||||
"02ce9a50-7af2-47ed-8596-af0c230501f8",
|
||||
"3e3fc409-bff3-4905-bf16-c968eee3f807",
|
||||
"f5c13cdd-205c-4719-a562-348ae5cd1d91",
|
||||
"7ff48d5b-2df2-49da-b500-a5150ffc7f18",
|
||||
"e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56",
|
||||
"788b3701-3ec9-4b67-b679-418bfa726c22",
|
||||
"d68204bf-11c1-4b13-b48b-d303c73d4bf6",
|
||||
"aceb0368-56b8-4073-b70e-3dc9aee184e0",
|
||||
"22a4636f-8179-4357-8e87-d1743ece1f81"
|
||||
],
|
||||
"thunderbird": [
|
||||
"08c73485-7c6d-4681-999d-919f5c32dcfa"
|
||||
],
|
||||
"vlc": [
|
||||
"59f21cfb-0120-4326-b255-a5b827b38967",
|
||||
"efcf0d81-0835-4880-b2fd-d866e8bc2294",
|
||||
"9195653c-f4aa-453d-aa95-787f6ccfaae9",
|
||||
"215dfd39-f493-4bc3-a027-8a97d72c61bf"
|
||||
],
|
||||
"vs_code": [
|
||||
"0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
|
||||
"ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae"
|
||||
]
|
||||
}
|
||||
@@ -63,6 +63,33 @@ def setup_logger(example, example_result_dir):
|
||||
runtime_logger.addHandler(logging.FileHandler(os.path.join(example_result_dir, "runtime.log")))
|
||||
return runtime_logger
|
||||
|
||||
def run_single_example_human(env, example, max_steps, instruction, args, example_result_dir, scores):
    """Reset the environment for one task, snapshot it, and score it as-is.

    No agent is stepped here: the environment is reset with ``example``,
    the initial screenshot and a one-line trajectory record are saved under
    ``example_result_dir``, and ``env.evaluate()`` is called right away.

    Args:
        env: Environment object; ``reset``, ``_get_obs`` and ``evaluate``
            are called on it.
        example: Task configuration passed to ``env.reset`` and to
            ``setup_logger``.
        max_steps: Accepted for signature parity; not read in this function.
        instruction: Task instruction text stored in ``traj.jsonl``.
        args: Accepted for signature parity; not read in this function.
        example_result_dir: Directory that receives ``initial_state.png``,
            ``traj.jsonl`` and ``result.txt``.
        scores: List to which the evaluation result is appended.
    """
    # The returned logger is not used below; the call is kept because
    # setup_logger attaches the per-example "runtime.log" file handler.
    runtime_logger = setup_logger(example, example_result_dir)

    env.reset(task_config=example)
    time.sleep(60)  # Give the environment time to become ready
    initial_obs = env._get_obs()

    # Persist the initial screenshot
    screenshot_path = os.path.join(example_result_dir, "initial_state.png")
    with open(screenshot_path, "wb") as image_file:
        image_file.write(initial_obs['screenshot'])

    # Append the trajectory header record as a single JSON line
    traj_record = {
        "instruction": instruction,
        "initial_state": "initial_state.png"
    }
    traj_path = os.path.join(example_result_dir, "traj.jsonl")
    with open(traj_path, "a") as traj_file:
        traj_file.write(json.dumps(traj_record) + "\n")

    # Score the environment state and record the outcome
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)

    result_path = os.path.join(example_result_dir, "result.txt")
    with open(result_path, "w", encoding="utf-8") as result_file:
        result_file.write(f"{result}\n")
|
||||
|
||||
|
||||
|
||||
def run_single_example_openaicua(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
agent.reset(runtime_logger)
|
||||
|
||||
@@ -8,6 +8,7 @@ from io import BytesIO
|
||||
from typing import Dict, List
|
||||
|
||||
from PIL import Image
|
||||
from openai import OpenAI, APIError, RateLimitError, Timeout
|
||||
from typing import Any, Optional, Union, Tuple
|
||||
|
||||
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
|
||||
@@ -35,7 +36,7 @@ from typing import Dict, Any, Optional, Union
|
||||
OPERATOR_PROMPT = """Here are some helpful tips:
|
||||
(1) computer.clipboard, computer.sync_file, computer.sync.shared_folder, computer.computer_output_citation are disabled.
|
||||
(2) If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.
|
||||
(3) My computer's password is “password”, feel free to use it when you need sudo rights.
|
||||
(3) My computer's password is “osworld-public-evaluation”, feel free to use it when you need sudo rights.
|
||||
(4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
|
||||
(5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
|
||||
(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.
|
||||
@@ -299,9 +300,10 @@ class OpenAICUAAgent:
|
||||
Raises:
|
||||
requests.exceptions.RequestException: If the API request fails
|
||||
"""
|
||||
while True:
|
||||
MAX_RETRIES = 100
|
||||
retry_count = 0
|
||||
while retry_count < MAX_RETRIES:
|
||||
try:
|
||||
from openai import OpenAI
|
||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA"))
|
||||
response = client.responses.create(
|
||||
model=self.model,
|
||||
@@ -317,6 +319,7 @@ class OpenAICUAAgent:
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"OpenAI API error: {str(e)}")
|
||||
print(f"OpenAI API error: {str(e)}")
|
||||
new_screenshot = self.env._get_obs()
|
||||
new_screenshot_base64 = base64.b64encode(new_screenshot["screenshot"]).decode('utf-8')
|
||||
|
||||
@@ -335,7 +338,9 @@ class OpenAICUAAgent:
|
||||
logger.warning("Unknown message structure, cannot update screenshot")
|
||||
|
||||
retry_count += 1
|
||||
time.sleep(1)
|
||||
time.sleep(min(30, 2 ** retry_count))
|
||||
logger.critical("Max retries exceeded for OpenAI API")
|
||||
raise RuntimeError("OpenAI API failed too many times")
|
||||
|
||||
def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]:
|
||||
"""Parse a response item from the OpenAI API.
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
# Do not write any secret keys or sensitive information here.
|
||||
|
||||
# Monitor configuration
|
||||
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
|
||||
TASK_CONFIG_PATH=../evaluation_examples/test_rest0608.json
|
||||
EXAMPLES_BASE_PATH=../evaluation_examples/examples
|
||||
RESULTS_BASE_PATH=../results_all_ifmessage_promptnochange
|
||||
RESULTS_BASE_PATH=../results_all_ifnoaction_promptnochange_adderror_error2
|
||||
ACTION_SPACE=pyautogui
|
||||
OBSERVATION_TYPE=screenshot
|
||||
MODEL_NAME=computer-use-preview
|
||||
|
||||
8
run_human.sh
Normal file
8
run_human.sh
Normal file
@@ -0,0 +1,8 @@
|
||||
# Launch run_multienv_human.py on the task list in
# evaluation_examples/test_impress.json with one environment (--num_envs 1)
# and --max_steps 3; results are written to ./results_human_impress.
python run_multienv_human.py \
|
||||
--headless \
|
||||
--observation_type screenshot \
|
||||
--result_dir ./results_human_impress \
|
||||
--test_all_meta_path evaluation_examples/test_impress.json \
|
||||
--region us-east-1 \
|
||||
--max_steps 3 \
|
||||
--num_envs 1
|
||||
@@ -257,7 +257,7 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(
|
||||
json.dumps(
|
||||
{"Error": f"Time limit exceeded in {domain}/{example_id}"}
|
||||
{"Error": f"{domain}/{example_id} - {e}"}
|
||||
)
|
||||
)
|
||||
f.write("\n")
|
||||
@@ -442,7 +442,7 @@ def get_result(action_space, use_model, observation_type, result_dir, total_file
|
||||
if __name__ == "__main__":
|
||||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
|
||||
# Register signal handlers for graceful termination
|
||||
signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, signal_handler) # Handle termination signal
|
||||
|
||||
@@ -2,8 +2,8 @@ python run_multienv_openaicua.py \
|
||||
--headless \
|
||||
--observation_type screenshot \
|
||||
--model computer-use-preview \
|
||||
--result_dir ./results_all_ifmessage_promptnochange \
|
||||
--test_all_meta_path evaluation_examples/test_all.json \
|
||||
--result_dir ./results_all_ifnoaction_promptnochange_adderror_error2 \
|
||||
--test_all_meta_path evaluation_examples/test_rest0608.json \
|
||||
--region us-east-1 \
|
||||
--max_steps 150 \
|
||||
--num_envs 25
|
||||
--num_envs 1
|
||||
|
||||
9
run_operator_impress.sh
Normal file
9
run_operator_impress.sh
Normal file
@@ -0,0 +1,9 @@
|
||||
# Launch run_multienv_openaicua.py with the computer-use-preview model on the
# task list in evaluation_examples/test_impress.json, using one environment
# (--num_envs 1) and --max_steps 3.
# NOTE(review): --result_dir is ./results_vlc_retest although the task list is
# the Impress one — looks like a leftover from another run; confirm the
# intended output directory.
python run_multienv_openaicua.py \
|
||||
--headless \
|
||||
--observation_type screenshot \
|
||||
--model computer-use-preview \
|
||||
--result_dir ./results_vlc_retest \
|
||||
--test_all_meta_path evaluation_examples/test_impress.json \
|
||||
--region us-east-1 \
|
||||
--max_steps 3 \
|
||||
--num_envs 1
|
||||
Reference in New Issue
Block a user