# Note: json, requests and time are required by call_openai_naive below.
import base64
import json
import math
import os
import re
import tempfile
import time
from io import BytesIO
from typing import List, Optional

import requests
from loguru import logger
from PIL import Image

def encode_image(image_content):
    """Encode raw image bytes as a base64 string."""
    return base64.b64encode(image_content).decode("utf-8")

def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 56 * 56,
    max_pixels: int = 14 * 14 * 4 * 1280,
    max_aspect_ratio_allowed: Optional[float] = None,
    size_can_be_smaller_than_factor: bool = False,
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.
    """
    if not size_can_be_smaller_than_factor and (height < factor or width < factor):
        raise ValueError(
            f"height:{height} or width:{width} must be larger than factor:{factor} "
            f"(when size_can_be_smaller_than_factor is False)"
        )
    elif (
        max_aspect_ratio_allowed is not None
        and max(height, width) / min(height, width) > max_aspect_ratio_allowed
    ):
        raise ValueError(
            f"absolute aspect ratio must be smaller than {max_aspect_ratio_allowed}, "
            f"got {max(height, width) / min(height, width)} "
            f"(when max_aspect_ratio_allowed is not None)"
        )
    h_bar = max(1, round(height / factor)) * factor
    w_bar = max(1, round(width / factor)) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = max(1, math.floor(height / beta / factor)) * factor
        w_bar = max(1, math.floor(width / beta / factor)) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

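# Illustrative self-check for smart_resize (added sketch, not part of the original
# module logic; runs only when this file is executed directly). With the defaults
# above, a 1920x1080 screenshot should map to dimensions divisible by 28 that fit
# the pixel budget (roughly 728x1316 for height=1080, width=1920).
if __name__ == "__main__":
    _h, _w = smart_resize(height=1080, width=1920)
    assert _h % 28 == 0 and _w % 28 == 0
    assert 56 * 56 <= _h * _w <= 14 * 14 * 4 * 1280
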
def call_openai_naive(model, payload, address_hint=None):
    """
    Naive OpenAI API call using requests.
    """
    # Extract fields from payload
    model = payload.get("model")
    payload["model"] = model.model_id if hasattr(model, "model_id") else "None"
    # address_hint not used here
    base_url = model.base_url
    # logger.warning(f"Base URL: {base_url}, Payload model: {payload['model']}")
    url = f"{base_url}/chat/completions"
    headers = {
        "Content-Type": "application/json",
    }
    data = {
        **payload,
        "n": 1,
    }
    max_retry = 5
    chat_completions = None
    success = False
    while success is False and max_retry > 0:
        try:
            json_data = json.dumps(data)
            response = requests.post(
                url, headers=headers, data=json_data, timeout=120, verify=False
            )
            if response.status_code == 200:
                chat_completions = response.json()
                try:
                    finish_reason = chat_completions["choices"][0].get("finish_reason")
                    if (
                        finish_reason is not None and finish_reason == "stop"
                    ):  # for most of the time, length will not exceed max_tokens
                        success = True
                    else:
                        time.sleep(5)
                        max_retry -= 1
                except Exception as e:
                    logger.error(f"Error in processing chat completion: {e}")
                    time.sleep(5)
                    max_retry -= 1
            else:
                logger.error(f"Failed to call OpenAI API: {response.text}")
                time.sleep(5)
                max_retry -= 1
        except requests.exceptions.ReadTimeout:
            # timeout is normal, don't print trace
            max_retry -= 1
            logger.warning(f"Timeout in OpenAI API call, left retries: {max_retry}")
            time.sleep(5)

        except Exception as e:
            max_retry -= 1
            logger.exception(f"Failed to call OpenAI API: {e}")
            time.sleep(5)

    if chat_completions is None:
        raise RuntimeError("Failed to call OpenAI API, max_retry used up")
    try:
        infos = {}
        if "choices" in chat_completions:
            infos["finish_reason"] = chat_completions["choices"][0].get("finish_reason")
            infos["n"] = len(chat_completions["choices"])
            if "tool_calls" in chat_completions["choices"][0]["message"]:
                infos["tool_calls"] = chat_completions["choices"][0]["message"][
                    "tool_calls"
                ]
            infos["choices"] = chat_completions["choices"]  # for the case of n > 1
        if "usage" in chat_completions:
            infos["usage"] = chat_completions["usage"]
        return chat_completions["choices"][0]["message"]["content"], infos
    except Exception as e:
        logger.error(f"Error in processing chat completion {e}")
        return "", {"n": 1, "usage": 0, "finish_reason": f"error {e}"}

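# Illustrative usage sketch (hypothetical endpoint and model id; no request is sent
# here). call_openai_naive expects payload["model"] to be an object exposing
# `base_url` and `model_id` attributes, e.g. as produced by
# preprocess_for_naive_openai below.
if __name__ == "__main__":
    from types import SimpleNamespace

    _model = SimpleNamespace(base_url="http://localhost:8000/v1", model_id="opencua-72b")
    _payload = {
        "model": _model,
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32,
    }
    # _text, _infos = call_openai_naive(_model, _payload)  # requires a running server
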
def preprocess_for_naive_openai(self, payload):
    # Note the `self` parameter: this helper is written to be attached to an agent
    # object carrying an `openai_client` attribute. A plain model-name string in the
    # payload is replaced by that client object, which call_openai_naive reads
    # `base_url` and `model_id` from.
    if isinstance(payload["model"], str):
        payload["model"] = getattr(self, "openai_client", None)
    return payload

def encoded_img_to_pil_img(data_str):
    """Decode a base64 (optionally data-URL prefixed) PNG string into a PIL Image."""
    base64_str = data_str.replace("data:image/png;base64,", "")
    image_data = base64.b64decode(base64_str)
    return Image.open(BytesIO(image_data))

def save_to_tmp_img_file(data_str):
    """Decode a base64 PNG string, save it to a temporary file, and return the path."""
    base64_str = data_str.replace("data:image/png;base64,", "")
    image_data = base64.b64decode(base64_str)
    image = Image.open(BytesIO(image_data))

    tmp_img_path = os.path.join(tempfile.mkdtemp(), "tmp_img.png")
    image.save(tmp_img_path)

    return tmp_img_path

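# Illustrative round trip for the image helpers above (added sketch; creates a tiny
# in-memory PNG, nothing is written to disk unless the last line is uncommented).
if __name__ == "__main__":
    _buf = BytesIO()
    Image.new("RGB", (2, 2), color=(255, 0, 0)).save(_buf, format="PNG")
    _data_url = f"data:image/png;base64,{encode_image(_buf.getvalue())}"
    assert encoded_img_to_pil_img(_data_url).size == (2, 2)
    # _path = save_to_tmp_img_file(_data_url)  # writes tmp_img.png under a temp dir
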
def bbox_to_center_1000(bbox: str) -> tuple[int, int]:
    """Parse an integer bbox string (0-1000 grid) and return its center point."""
    regex_list = [
        r"<\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>",  # '<|box_start|>(576,12),(592,42)<|box_end|>'
        r"<\|box_start\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|box_end\|>",  # '<|box_start|>[[576, 12, 592, 42]]<|box_end|>'
        r"<\|box_start\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]<\|box_end\|>",  # '<|box_start|>[[576, 12, 592, 42]<|box_end|>', this is actually the wrong format, but we parse it anyway
        r"<\|box_start\|>\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)<\|box_end\|>",  # '<|box_start|>(576, 12, 592, 42)<|box_end|>', this is actually the wrong format, but we parse it anyway
        r"\((\d+),(\d+)\),\((\d+),(\d+)\)",  # versions without the 'bbox' special tokens
        r"\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]",
        r"\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]",
        r"\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)",
    ]
    for regex in regex_list:
        match = re.search(regex, bbox)
        if match:
            break
    if not match:
        raise ValueError(
            f"Bounding box coordinates not found in the input string: {bbox}"
        )
    x_top_left, y_top_left, x_bottom_right, y_bottom_right = map(int, match.groups())
    x_center = (x_top_left + x_bottom_right) // 2
    y_center = (y_top_left + y_bottom_right) // 2
    return x_center, y_center

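# Illustrative example (added sketch): the first pattern above matches the
# special-token bbox format, and the center is the integer midpoint of the corners.
if __name__ == "__main__":
    assert bbox_to_center_1000("<|box_start|>(576,12),(592,42)<|box_end|>") == (584, 27)
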
def bbox_to_center_1(bbox: str) -> tuple[int, int]:
    """Parse a bbox given as floats in [0, 1] and return its center on the 0-1000 grid."""
    regex_list = [
        r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]",
    ]
    for regex in regex_list:
        match = re.search(regex, bbox)
        if match:
            break
    if not match:
        raise ValueError(
            f"Bounding box coordinates not found in the input string: {bbox}"
        )
    coordinates = tuple(map(float, match.groups()))
    coordinates = [int(coord * 1000) for coord in coordinates]
    x_center = (coordinates[0] + coordinates[2]) // 2
    y_center = (coordinates[1] + coordinates[3]) // 2
    return x_center, y_center

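# Illustrative example (added sketch): float coordinates in [0, 1] are scaled to the
# 0-1000 grid before taking the midpoint.
if __name__ == "__main__":
    assert bbox_to_center_1("[0.25, 0.25, 0.75, 0.75]") == (500, 500)
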
def _coordinate_projection(x, y, screen_width, screen_height, coordinate_type):
    """Project model-space (x, y) onto absolute screen pixels for the given coordinate type."""
    if coordinate_type == "relative":
        return int(round(x * screen_width)), int(round(y * screen_height))
    elif coordinate_type == "absolute":
        return x, y
    elif coordinate_type == "qwen25":
        height, width = smart_resize(
            height=screen_height,
            width=screen_width,
            factor=28,
            min_pixels=3136,
            max_pixels=12845056,
        )
        return int(x / width * screen_width), int(y / height * screen_height)
    elif coordinate_type == "relative1000":
        if screen_width == 0 or screen_height == 0:
            raise ValueError(
                "Screen width and height must be greater than zero for relative1000 coordinates."
            )
        x_abs = int(round(x * screen_width / 1000))
        y_abs = int(round(y * screen_height / 1000))
        return x_abs, y_abs
    else:
        raise ValueError(f"Unsupported coordinate type: {coordinate_type}")

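# Illustrative examples of the supported projections (added sketch; values chosen so
# the expected pixels are exact).
if __name__ == "__main__":
    assert _coordinate_projection(0.5, 0.5, 1920, 1080, "relative") == (960, 540)
    assert _coordinate_projection(500, 500, 1920, 1080, "relative1000") == (960, 540)
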
def rescale_coord(
    coord: tuple[int, int],
    original_width: int,
    original_height: int,
    scaled_width=1000,
    scaled_height=1000,
) -> tuple[int, int]:
    # According to https://huggingface.co/spaces/maxiw/OS-ATLAS/blob/398c3256a4fec409a074e0e4b5ac1d1d5bf7c240/app.py#L36
    # it seems that OS-ATLAS models are rescaled to output 1000x1000 images,
    # so we need to rescale the coordinates back to the original image size.
    x_scale = original_width / scaled_width
    y_scale = original_height / scaled_height
    return int(coord[0] * x_scale), int(coord[1] * y_scale)

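# Illustrative example (added sketch): a point on the model's 1000x1000 grid mapped
# back to a 1920x1080 screen.
if __name__ == "__main__":
    print(rescale_coord((500, 500), 1920, 1080))  # expected roughly (960, 540)
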
def _pyautogui_code_to_absolute_coordinates(
    pyautogui_code_relative_coordinates,
    logical_screen_size,
    coordinate_type="relative",
    model_input_size=None,
):
    """
    Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size.
    """
    import re
    import ast

    if coordinate_type not in ["relative", "relative1000", "absolute", "qwen25"]:
        raise ValueError(
            f"Invalid coordinate type: {coordinate_type}. Expected one of ['relative', 'relative1000', 'absolute', 'qwen25']."
        )

    screen_width, screen_height = logical_screen_size
    if model_input_size is not None:
        model_width, model_height = model_input_size
        width_scale, height_scale = (
            screen_width / model_width,
            screen_height / model_height,
        )
    else:
        width_scale, height_scale = 1, 1

    pattern = r"(pyautogui\.\w+\([^\)]*\))"

    matches = re.findall(pattern, pyautogui_code_relative_coordinates)

    new_code = pyautogui_code_relative_coordinates

    for full_call in matches:
        func_name_pattern = r"(pyautogui\.\w+)\((.*)\)"
        func_match = re.match(func_name_pattern, full_call, re.DOTALL)
        if not func_match:
            continue

        func_name = func_match.group(1)
        args_str = func_match.group(2)

        try:
            parsed = ast.parse(f"func({args_str})").body[0].value
            parsed_args = parsed.args
            parsed_keywords = parsed.keywords
        except SyntaxError:
            return pyautogui_code_relative_coordinates

        function_parameters = {
            "click": ["x", "y", "clicks", "interval", "button", "duration", "pause"],
            "moveTo": ["x", "y", "duration", "tween", "pause"],
            "moveRel": ["xOffset", "yOffset", "duration", "tween", "pause"],
            "dragTo": ["x", "y", "duration", "button", "mouseDownUp", "pause"],
            "dragRel": [
                "xOffset",
                "yOffset",
                "duration",
                "button",
                "mouseDownUp",
                "pause",
            ],
            "doubleClick": ["x", "y", "interval", "button", "duration", "pause"],
        }

        func_base_name = func_name.split(".")[-1]

        param_names = function_parameters.get(func_base_name, [])

        args = {}
        for idx, arg in enumerate(parsed_args):
            if idx < len(param_names):
                param_name = param_names[idx]
                arg_value = ast.literal_eval(arg)
                args[param_name] = arg_value

        try:
            for kw in parsed_keywords:
                param_name = kw.arg
                arg_value = ast.literal_eval(kw.value)
                args[param_name] = arg_value
        except Exception as e:
            logger.error(f"Error parsing keyword arguments: {e}")
            return pyautogui_code_relative_coordinates

        updated = False
        if "x" in args and "y" in args:
            try:
                x_rel = float(args["x"])
                y_rel = float(args["y"])
                x_abs, y_abs = _coordinate_projection(
                    x_rel, y_rel, screen_width, screen_height, coordinate_type
                )
                # logger.warning(f"Projecting coordinates: ({x_rel}, {y_rel}) to ({x_abs}, {y_abs}) using {coordinate_type} projection.")
                args["x"] = x_abs * width_scale
                args["y"] = y_abs * height_scale
                updated = True
            except ValueError:
                pass

        if "xOffset" in args and "yOffset" in args:
            try:
                x_rel = float(args["xOffset"])
                y_rel = float(args["yOffset"])
                x_abs, y_abs = _coordinate_projection(
                    x_rel, y_rel, screen_width, screen_height, coordinate_type
                )
                args["xOffset"] = x_abs * width_scale
                args["yOffset"] = y_abs * height_scale
                updated = True
            except ValueError:
                pass

        if updated:
            reconstructed_args = []
            for idx, param_name in enumerate(param_names):
                if param_name in args:
                    arg_value = args[param_name]
                    if isinstance(arg_value, str):
                        arg_repr = f"'{arg_value}'"
                    else:
                        arg_repr = str(arg_value)
                    reconstructed_args.append(arg_repr)
                else:
                    break

            used_params = set(param_names[: len(reconstructed_args)])
            for kw in parsed_keywords:
                if kw.arg not in used_params:
                    arg_value = args[kw.arg]
                    if isinstance(arg_value, str):
                        arg_repr = f"{kw.arg}='{arg_value}'"
                    else:
                        arg_repr = f"{kw.arg}={arg_value}"
                    reconstructed_args.append(arg_repr)

            new_args_str = ", ".join(reconstructed_args)
            new_full_call = f"{func_name}({new_args_str})"
            new_code = new_code.replace(full_call, new_full_call)

    return new_code

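# Illustrative example (added sketch): a relative click is rewritten with absolute
# pixel coordinates for a 1920x1080 logical screen. Note that keyword arguments may
# be re-emitted positionally by the reconstruction above.
if __name__ == "__main__":
    assert _pyautogui_code_to_absolute_coordinates(
        "pyautogui.click(x=0.5, y=0.5)", (1920, 1080), coordinate_type="relative"
    ) == "pyautogui.click(960, 540)"
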
def split_args(args_str: str) -> List[str]:
    """Split a comma-separated argument string, ignoring commas inside quoted strings."""
    args = []
    current_arg = ""
    within_string = False
    string_char = ""
    prev_char = ""
    for char in args_str:
        if char in ['"', "'"]:
            if not within_string:
                within_string = True
                string_char = char
            elif within_string and prev_char != "\\" and char == string_char:
                within_string = False
        if char == "," and not within_string:
            args.append(current_arg)
            current_arg = ""
        else:
            current_arg += char
        prev_char = char
    if current_arg:
        args.append(current_arg)
    return args

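# Illustrative example (added sketch): commas inside quoted strings do not split the
# argument list; surrounding whitespace is preserved.
if __name__ == "__main__":
    assert split_args("'a, b', 2") == ["'a, b'", " 2"]
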
def correct_pyautogui_arguments(code: str) -> str:
    """Rewrite commonly misused keyword arguments in generated pyautogui calls."""
    function_corrections = {
        "write": {
            "incorrect_args": ["text", "content"],
            "correct_args": [],
            "keyword_arg": "message",
        },
        "press": {
            "incorrect_args": ["key", "button"],
            "correct_args": [],
            "keyword_arg": None,
        },
        "hotkey": {
            "incorrect_args": ["key1", "key2", "keys"],
            "correct_args": [],
            "keyword_arg": None,
        },
    }

    lines = code.strip().split("\n")
    corrected_lines = []

    for line in lines:
        line = line.strip()
        match = re.match(r"(pyautogui\.(\w+))\((.*)\)", line)
        if match:
            full_func_call = match.group(1)
            func_name = match.group(2)
            args_str = match.group(3)

            if func_name in function_corrections:
                func_info = function_corrections[func_name]
                args = split_args(args_str)
                corrected_args = []

                for arg in args:
                    arg = arg.strip()
                    kwarg_match = re.match(r"(\w+)\s*=\s*(.*)", arg)
                    if kwarg_match:
                        arg_name = kwarg_match.group(1)
                        arg_value = kwarg_match.group(2)

                        if arg_name in func_info["incorrect_args"]:
                            if func_info["keyword_arg"]:
                                corrected_args.append(
                                    f"{func_info['keyword_arg']}={arg_value}"
                                )
                            else:
                                corrected_args.append(arg_value)
                        else:
                            corrected_args.append(f"{arg_name}={arg_value}")
                    else:
                        corrected_args.append(arg)

                corrected_args_str = ", ".join(corrected_args)
                corrected_line = f"{full_func_call}({corrected_args_str})"
                corrected_lines.append(corrected_line)
            else:
                corrected_lines.append(line)
        else:
            corrected_lines.append(line)

    corrected_code = "\n".join(corrected_lines)
    return corrected_code

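# Illustrative example (added sketch): the unsupported text= keyword on
# pyautogui.write is rewritten to the message= keyword that pyautogui accepts.
if __name__ == "__main__":
    assert (
        correct_pyautogui_arguments("pyautogui.write(text='hello')")
        == "pyautogui.write(message='hello')"
    )
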
def image_message_from_obs(obs, for_training=False):
    """Build an OpenAI-style image message from an observation's screenshot."""
    if not for_training:
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{encode_image(obs['screenshot'])}",
                "detail": "high",
            },
        }
    else:
        return {"type": "image_url", "image_url": {"url": obs["screenshot_path"]}}

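# Illustrative example (added sketch): build the inference-time message
# (for_training=False) from a fake observation containing raw PNG bytes.
if __name__ == "__main__":
    _buf2 = BytesIO()
    Image.new("RGB", (2, 2)).save(_buf2, format="PNG")
    _msg = image_message_from_obs({"screenshot": _buf2.getvalue()})
    assert _msg["type"] == "image_url" and _msg["image_url"]["detail"] == "high"
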