Improve code logic for password & resolution

This commit is contained in:
yuanmengqi
2025-07-13 06:59:45 +00:00
parent 08bbf77511
commit a070ddda7e
13 changed files with 93 additions and 51 deletions

View File

@@ -33,7 +33,7 @@ class_ns_windows = "https://accessibility.windows.example.org/ns/class"
import ast
from typing import Dict, Any, Optional, Union
OPERATOR_PROMPT = f"""\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{os.environ["CLIENT_PASSWORD"]}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """
# Tips text that is concatenated after the task instruction when the first
# user message is built.
# NOTE: this is a plain (non-f) string on purpose: "{CLIENT_PASSWORD}" is a
# literal placeholder that is substituted with the agent's client_password via
# str.replace() when the prompt is assembled, not at import time.
OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{CLIENT_PASSWORD}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """
class Action:
"""Action class for the agent."""
@@ -213,7 +213,11 @@ class OpenAICUAAgent:
observation_type="screenshot_a11y_tree",
# observation_type can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"]
max_trajectory_length=100,
a11y_tree_max_tokens=10000
a11y_tree_max_tokens=10000,
client_password="",
provider_name="aws",
screen_width=1920,
screen_height=1080
):
self.env = env
self.platform = platform
@@ -231,12 +235,22 @@ class OpenAICUAAgent:
self.actions = []
self.observations = []
self.screen_width = screen_width
self.screen_height = screen_height
self.tools = [{
"type": "computer_use_preview",
"display_width": int(os.environ["SCREEN_WIDTH"]),
"display_height": int(os.environ["SCREEN_HEIGHT"]),
"display_width": self.screen_width,
"display_height": self.screen_height,
"environment": "linux" if platform == "ubuntu" else "windows"
}]
if client_password == "":
if provider_name == "aws":
self.client_password = "osworld-public-evaluation"
else:
self.client_password = "password"
else:
self.client_password = client_password
if observation_type == "screenshot":
if action_space == "computer_13":
@@ -630,7 +644,8 @@ class OpenAICUAAgent:
"""
Predict the next action(s) based on the current observation.
"""
prompt = OPERATOR_PROMPT.replace("{CLIENT_PASSWORD}", self.client_password)
base64_image = encode_image(obs["screenshot"])
if self.cua_messages == []:
self.cua_messages.append({
@@ -642,7 +657,7 @@ class OpenAICUAAgent:
},
{
"type": "input_text",
"text": "\n " + instruction + OPERATOR_PROMPT,
"text": "\n " + instruction + prompt,
}
]
})