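Fix error handling: cap OpenAI API retries with backoff, record the actual exception in traj.jsonl, and add human-run scripts and task subsets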

This commit is contained in:
yuanmengqi
2025-06-09 04:20:59 +00:00
parent 3e541bb393
commit aee1207fff
10 changed files with 142 additions and 11 deletions

View File

@@ -0,0 +1,11 @@
{
"libreoffice_writer": [
"0b17a146-2934-46c7-8727-73ff6b6483e8"
],
"libreoffice_calc": [
"1954cced-e748-45c4-9c26-9855b97fbc5e"
],
"vlc": [
"fba2c100-79e8-42df-ae74-b592418d54f4"
]
}

View File

@@ -0,0 +1,5 @@
{
"libreoffice_impress": [
"0a211154-fda0-48d0-9274-eaac4ce5486d"
]
}

View File

@@ -0,0 +1,66 @@
{
"chrome": [
"1704f00f-79e6-43a7-961b-cedd3724d5fd",
"0d8b7de3-e8de-4d86-b9fd-dd2dce58a217",
"cabb3bae-cccb-41bd-9f5d-0f3a9fecd825"
],
"gimp": [
"62f7fd55-0687-4a43-b6e1-3eda16fc6252"
],
"libreoffice_calc": [
"1954cced-e748-45c4-9c26-9855b97fbc5e",
"035f41ba-6653-43ab-aa63-c86d449d62e5",
"8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
"1334ca3e-f9e3-4db8-9ca7-b4c653be7d17",
"21ab7b40-77c2-4ae6-8321-e00d3a086c73"
],
"libreoffice_impress": [
"5d901039-a89c-4bfb-967b-bf66f4df075e",
"15aece23-a215-4579-91b4-69eec72e18da",
"a434992a-89df-4577-925c-0c58b747f0f4",
"af2d657a-e6b3-4c6a-9f67-9e3ed015974c",
"4ed5abd0-8b5d-47bd-839f-cacfa15ca37a",
"04578141-1d42-4146-b9cf-6fab4ce5fd74"
],
"libreoffice_writer": [
"0b17a146-2934-46c7-8727-73ff6b6483e8",
"adf5e2c3-64c7-4644-b7b6-d2f0167927e7",
"ecc2413d-8a48-416e-a3a2-d30106ca36cb"
],
"multi_apps": [
"a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb",
"b52b40a5-ad70-4c53-b5b0-5650a8387052",
"f8cfa149-d1c1-4215-8dac-4a0932bad3c2",
"337d318b-aa07-4f4f-b763-89d9a2dd013f",
"869de13e-bef9-4b91-ba51-f6708c40b096",
"3a93cae4-ad3e-403e-8c12-65303b271818",
"9219480b-3aed-47fc-8bac-d2cffc5849f7",
"7e287123-70ca-47b9-8521-47db09b69b14",
"e2392362-125e-4f76-a2ee-524b183a3412",
"873cafdd-a581-47f6-8b33-b9696ddb7b05",
"b337d106-053f-4d37-8da0-7f9c4043a66b",
"20236825-b5df-46e7-89bf-62e1d640a897",
"02ce9a50-7af2-47ed-8596-af0c230501f8",
"3e3fc409-bff3-4905-bf16-c968eee3f807",
"f5c13cdd-205c-4719-a562-348ae5cd1d91",
"7ff48d5b-2df2-49da-b500-a5150ffc7f18",
"e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56",
"788b3701-3ec9-4b67-b679-418bfa726c22",
"d68204bf-11c1-4b13-b48b-d303c73d4bf6",
"aceb0368-56b8-4073-b70e-3dc9aee184e0",
"22a4636f-8179-4357-8e87-d1743ece1f81"
],
"thunderbird": [
"08c73485-7c6d-4681-999d-919f5c32dcfa"
],
"vlc": [
"59f21cfb-0120-4326-b255-a5b827b38967",
"efcf0d81-0835-4880-b2fd-d866e8bc2294",
"9195653c-f4aa-453d-aa95-787f6ccfaae9",
"215dfd39-f493-4bc3-a027-8a97d72c61bf"
],
"vs_code": [
"0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
"ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae"
]
}

View File

@@ -63,6 +63,33 @@ def setup_logger(example, example_result_dir):
runtime_logger.addHandler(logging.FileHandler(os.path.join(example_result_dir, "runtime.log")))
return runtime_logger
def run_single_example_human(env, example, max_steps, instruction, args, example_result_dir, scores):
    """Capture a task's starting state and score it with no agent in the loop.

    The environment is reset to ``example``, given a fixed 60-second pause,
    and then observed once. The screenshot from that observation is stored as
    ``initial_state.png``, a single record is appended to ``traj.jsonl``, and
    the value returned by ``env.evaluate()`` is logged, appended to
    ``scores`` and written to ``result.txt``.

    Args:
        env: Environment object; ``reset``, ``_get_obs`` and ``evaluate``
            are called on it.
        example: Task configuration handed to ``setup_logger`` and
            ``env.reset``.
        max_steps: Not used by this runner.
        instruction: Task instruction stored in the trajectory record.
        args: Not used by this runner.
        example_result_dir: Directory that receives every output file.
        scores: Collector with an ``append`` method; the evaluation result
            is appended to it.
    """
    def out_path(filename):
        # Every artifact of this run lives directly under the result directory.
        return os.path.join(example_result_dir, filename)

    # The returned logger is not used below; the call is kept because
    # setup_logger attaches the per-example runtime.log file handler.
    setup_logger(example, example_result_dir)

    env.reset(task_config=example)
    time.sleep(60)  # fixed pause so the environment can become ready
    first_obs = env._get_obs()

    # Persist the screenshot taken right after the reset.
    screenshot_name = "initial_state.png"
    with open(out_path(screenshot_name), "wb") as image_file:
        image_file.write(first_obs['screenshot'])

    # Append one JSON line describing the task and where its screenshot is.
    record = {
        "instruction": instruction,
        "initial_state": screenshot_name
    }
    with open(out_path("traj.jsonl"), "a") as traj_file:
        traj_file.write(json.dumps(record) + "\n")

    # Score the environment as it stands and publish the outcome.
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)
    with open(out_path("result.txt"), "w", encoding="utf-8") as result_file:
        result_file.write(f"{result}\n")
def run_single_example_openaicua(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
runtime_logger = setup_logger(example, example_result_dir)
agent.reset(runtime_logger)

View File

@@ -8,6 +8,7 @@ from io import BytesIO
from typing import Dict, List
from PIL import Image
from openai import OpenAI, APIError, RateLimitError, Timeout
from typing import Any, Optional, Union, Tuple
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
@@ -35,7 +36,7 @@ from typing import Dict, Any, Optional, Union
OPERATOR_PROMPT = """Here are some helpful tips:
(1) computer.clipboard, computer.sync_file, computer.sync.shared_folder, computer.computer_output_citation are disabled.
(2) If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.
(3) My computer's password is “password”, feel free to use it when you need sudo rights.
(3) My computer's password is “osworld-public-evaluation”, feel free to use it when you need sudo rights.
(4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
(5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.
@@ -299,9 +300,10 @@ class OpenAICUAAgent:
Raises:
requests.exceptions.RequestException: If the API request fails
"""
while True:
MAX_RETRIES = 100
retry_count = 0
while retry_count < MAX_RETRIES:
try:
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA"))
response = client.responses.create(
model=self.model,
@@ -317,6 +319,7 @@ class OpenAICUAAgent:
return response
except Exception as e:
logger.error(f"OpenAI API error: {str(e)}")
print(f"OpenAI API error: {str(e)}")
new_screenshot = self.env._get_obs()
new_screenshot_base64 = base64.b64encode(new_screenshot["screenshot"]).decode('utf-8')
@@ -335,7 +338,9 @@ class OpenAICUAAgent:
logger.warning("Unknown message structure, cannot update screenshot")
retry_count += 1
time.sleep(1)
time.sleep(min(30, 2 ** retry_count))
logger.critical("Max retries exceeded for OpenAI API")
raise RuntimeError("OpenAI API failed too many times")
def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]:
"""Parse a response item from the OpenAI API.

View File

@@ -2,9 +2,9 @@
# Do not write any secret keys or sensitive information here.
# Monitor configuration
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
TASK_CONFIG_PATH=../evaluation_examples/test_rest0608.json
EXAMPLES_BASE_PATH=../evaluation_examples/examples
RESULTS_BASE_PATH=../results_all_ifmessage_promptnochange
RESULTS_BASE_PATH=../results_all_ifnoaction_promptnochange_adderror_error2
ACTION_SPACE=pyautogui
OBSERVATION_TYPE=screenshot
MODEL_NAME=computer-use-preview

8
run_human.sh Normal file
View File

@@ -0,0 +1,8 @@
# Launch run_multienv_human.py on the task list in
# evaluation_examples/test_impress.json, with results directed to
# ./results_human_impress. The flags request headless mode, screenshot
# observations, region us-east-1, a max_steps value of 3 and one environment.
python run_multienv_human.py \
--headless \
--observation_type screenshot \
--result_dir ./results_human_impress \
--test_all_meta_path evaluation_examples/test_impress.json \
--region us-east-1 \
--max_steps 3 \
--num_envs 1

View File

@@ -257,7 +257,7 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(
json.dumps(
{"Error": f"Time limit exceeded in {domain}/{example_id}"}
{"Error": f"{domain}/{example_id} - {e}"}
)
)
f.write("\n")
@@ -442,7 +442,7 @@ def get_result(action_space, use_model, observation_type, result_dir, total_file
if __name__ == "__main__":
####### The complete version of the list of examples #######
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Register signal handlers for graceful termination
signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C
signal.signal(signal.SIGTERM, signal_handler) # Handle termination signal

View File

@@ -2,8 +2,8 @@ python run_multienv_openaicua.py \
--headless \
--observation_type screenshot \
--model computer-use-preview \
--result_dir ./results_all_ifmessage_promptnochange \
--test_all_meta_path evaluation_examples/test_all.json \
--result_dir ./results_all_ifnoaction_promptnochange_adderror_error2 \
--test_all_meta_path evaluation_examples/test_rest0608.json \
--region us-east-1 \
--max_steps 150 \
--num_envs 25
--num_envs 1

9
run_operator_impress.sh Normal file
View File

@@ -0,0 +1,9 @@
python run_multienv_openaicua.py \
--headless \
--observation_type screenshot \
--model computer-use-preview \
--result_dir ./results_vlc_retest \
--test_all_meta_path evaluation_examples/test_impress.json \
--region us-east-1 \
--max_steps 3 \
--num_envs 1