Wxy/opencua (#260)

* OpenCUA Agent code base

* update url

* debug, modify url input

* debug opencua

* show result

* debug agent history overlap

* modify opencua agent; add comment lines
Xinyuan Wang committed 2025-07-16 17:53:12 +08:00 (via GitHub)
parent 5e5058c1f2
commit 0f2655249c
4 changed files with 497 additions and 197 deletions


@@ -1,38 +1,45 @@
"""
OpenCUA Agent Implementation

This module implements an OpenCUA agent for desktop automation tasks, building upon
existing frameworks and integrating multiple coordinate mapping systems.

Framework and Implementation Sources:
- Main framework structure follows: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py
- Agent implementation adapted from: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/aguvis_agent.py
- Qwen2.5-VL coordinate mapping from: https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
"""
import re
import os
import ast
import time
import json
import math
import copy
import httpx
import base64
import backoff
import logging  # needed by reset(), which falls back to logging.getLogger when no logger is passed in
from io import BytesIO
from loguru import logger
from PIL import Image
from typing import Dict, List, Tuple, Optional
AGNET_SYS_PROMPT_L1 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}""".strip()
AGNET_SYS_PROMPT_L1 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L3 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nObservation:\n - Describe the current computer state based on the full screenshot in detail. \n - Application Context:\n - The active application\n - The active window or page\n - Overall layout and visible interface\n - Key Elements:\n - Menu items and toolbars \n - Buttons and controls\n - Text fields and content\n - Dialog boxes or popups\n - Error messages or notifications\n - Loading states\n - Other key elements\n - Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n".strip()
AGNET_SYS_PROMPT_L0 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.
For each step, output the action as PyAutoGUI code or the following functions:
- {"name": "computer.triple_click", "description": "Triple click on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "The x coordinate of the triple click"}, "y": {"type": "number", "description": "The y coordinate of the triple click"}}, "required": ["x", "y"]}}
- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, "required": ["status"]}}
""".strip()
STEP_TEMPLATE = "# Step {step_num}:\n"
INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"
STEP_TEMPLATE = "# Step {step_num}:\n"
ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
THOUGHT_HISTORY_TEMPLATE = "## Thought:\n{thought}\n\n## Action:\n{action}\n"
OBSERVATION_HISTORY_TEMPLATE = "## Observation:\n{observation}\n\n## Thought:\n{thought}\n\n## Action:\n{action}\n"
DETAIL_HISTORY_TEMPLATE = "## Thought:\n{thought}\n\n## Action:\n{action}\n\n## Code:\n{code}\n"
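# Illustrative rendering (values are made up, not from the repo): with history_type="thought_history",
# step 2 of the history would be formatted by STEP_TEMPLATE + THOUGHT_HISTORY_TEMPLATE as:
#
#   # Step 2:
#   ## Thought:
#   The Files app is open, so the next step is to create the new folder.
#
#   ## Action:
#   Right-click an empty area of the file list.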
# Function to encode the image
def encode_image(image_content):
"""Encode the image to base64"""
return base64.b64encode(image_content).decode('utf-8')
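# Illustrative usage (hypothetical path and message shape, not part of the original file): the agent
# encodes raw screenshot bytes with encode_image and embeds them as a base64 image_url item, roughly:
#
#   with open("screenshot.png", "rb") as f:
#       b64 = encode_image(f.read())
#   image_item = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}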
def parse_response_to_cot_and_action(input_string, screen_size, coordinate_type) -> Tuple[str, List[str], dict]:
@@ -40,57 +47,61 @@ def parse_response_to_cot_and_action(input_string, screen_size, coordinate_type)
try:
sections = {}
if "computer.terminate" in input_string.lower():
code_blocks = re.findall(r'```(?:code)?\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)
if code_blocks:
last_code = code_blocks[-1].strip().lower()
if "fail" in last_code:
return "FAIL", ["FAIL"], {}
elif "success" in last_code:
return "DONE", ["DONE"], {}
return "DONE", ["DONE"], {}
obs_match = re.search(r'^##\s*Observation\s*:?[\n\r]+(.*?)(?=^##\s*Thought:|^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
if obs_match:
sections['observation'] = obs_match.group(1).strip()
# logger.warning(f"Extracted Observation: {sections.get('observation', 'None')}")
thought_match = re.search(r'^##\s*Thought\s*:?[\n\r]+(.*?)(?=^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
if thought_match:
sections['thought'] = thought_match.group(1).strip()
# logger.warning(f"Extracted Thought: {sections.get('thought', 'None')}")
action_match = re.search(r'^##\s*Action\s*:?[\n\r]+(.*?)(?=^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
if action_match:
action = action_match.group(1).strip()
sections['action'] = action.strip()
# logger.warning(f"Extracted Action: {sections.get('action', 'None')}")
if "computer.terminate" in input_string.lower():
# Look for code blocks that might contain terminate command
code_blocks = re.findall(r'```(?:code|python)?\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)
if code_blocks:
last_code = code_blocks[-1].strip().lower()
if "fail" in last_code:
sections['code'] = "FAIL"
return "FAIL", ["FAIL"], sections
elif "success" in last_code:
sections['code'] = "DONE"
return "DONE", ["DONE"], sections
# Default to DONE if terminate is mentioned but no specific status
sections['code'] = "DONE"
return "DONE", ["DONE"], sections
code_blocks = re.findall(r'```(?:python)\s*(.*?)\s*```', input_string, re.DOTALL)
if code_blocks:
code = code_blocks[-1].strip()
sections['original_code'] = transform_agnet_action_to_code_block(code)
corrected_code = correct_pyautogui_arguments(code)
sections['code'] = project_coordinate_to_absolute_scale(corrected_code, screen_width=screen_size[0], screen_height=screen_size[1], coordinate_type=coordinate_type)
# logger.warning(f"Extracted Code: {sections.get('code', 'None')}")
else:
# No code blocks found
sections['code'] = "WAIT"
return "WAIT", ["WAIT"], sections
if 'code' not in sections:
logger.error("Missing required action or code section")
return None, None, {}
if 'action' not in sections:
sections['action'] = ""
return sections['action'], [sections['code']], sections
except Exception as e:
logger.exception(f"Error parsing response: {str(e)}\nInput string: {input_string}")
return None, None, {}
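# Illustrative example (assumed model output, not from the repo). A response such as:
#
#   ## Thought:
#   The browser is focused, so I can type the URL now.
#
#   ## Action:
#   Click the address bar at the top of the window.
#
#   ```python
#   pyautogui.click(x=0.5, y=0.06)
#   ```
#
# would be parsed into the action text, a single-element list holding the pyautogui command with the
# relative (0.5, 0.06) projected to roughly (960, 65) pixels on a 1920x1080 screen, and a `sections`
# dict keeping the raw thought/action/code pieces for the step history.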
def correct_pyautogui_arguments(code: str) -> str:
"""Correct the pyautogui arguments"""
function_corrections = {
'write': {
'incorrect_args': ['text', 'content'],
@@ -154,6 +165,7 @@ def correct_pyautogui_arguments(code: str) -> str:
return corrected_code
def split_args(args_str: str) -> List[str]:
"""Split the arguments string into a list of arguments"""
args = []
current_arg = ''
within_string = False
@@ -185,13 +197,15 @@ def smart_resize(
max_aspect_ratio_allowed: Optional[float] = None,
size_can_be_smaller_than_factor: bool = False,
):
"""Rescales the image so that the following conditions are met:
"""
The function is modified from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
1. Both dimensions (height and width) are divisible by 'factor'.
Qwen2.5-VL based model need this function to resize screenshots.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if not size_can_be_smaller_than_factor and (height < factor or width < factor):
@@ -218,39 +232,29 @@ def smart_resize(
return h_bar, w_bar
def _coordinate_projection(x, y, screen_width, screen_height, coordinate_type):
if coordinate_type == "relative":
"""Project the coordinates to the absolute scale"""
if coordinate_type == "relative":
return int(round(x * screen_width)), int(round(y * screen_height))
elif coordinate_type == "absolute":
return x, y
elif coordinate_type == "qwen25":
if 0 <= x <= 1 and 0 <= y <= 1:
# If already normalized, treat like "relative"
return int(round(x * screen_width)), int(round(y * screen_height))
elif coordinate_type == "absolute":
return x, y
elif coordinate_type == "qwen25":
if 0 <= x <= 1 and 0 <= y <= 1:
# If already normalized, treat like "relative"
return int(round(x * screen_width)), int(round(y * screen_height))
height, width = smart_resize(
height=screen_height,
width=screen_width,
factor=28,
min_pixels=3136,
max_pixels=12845056
)
return int(x / width * screen_width), int(y / height * screen_height)
elif coordinate_type == "relative1000":
if screen_width == 0 or screen_height == 0:
raise ValueError("Screen width and height must be greater than zero for relative1000 coordinates.")
x_abs = int(round(x * screen_width / 1000))
y_abs = int(round(y * screen_height / 1000))
return x_abs, y_abs
else:
raise ValueError(f"Unsupported coordinate type: {coordinate_type}")
height, width = smart_resize(
height=screen_height,
width=screen_width,
factor=28,
min_pixels=3136,
max_pixels=12845056 # We use this max_pixels setting in our training data
)
return int(x / width * screen_width), int(y / height * screen_height)
else:
raise ValueError(f"Unsupported coordinate type: {coordinate_type}")
def project_coordinate_to_absolute_scale(pyautogui_code_relative_coordinates, screen_width, screen_height, coordinate_type="relative"):
"""
Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size.
"""
import re
import ast
"""Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size."""
if coordinate_type not in ["relative", "relative1000", "absolute", "qwen25"]:
raise ValueError(f"Invalid coordinate type: {coordinate_type}. Expected one of ['relative', 'relative1000', 'absolute', 'qwen25'].")
@@ -426,8 +430,7 @@ def update_code_with_new_coordinates(code, updated_positions):
Returns:
str: The updated Python code.
"""
# TODO: the matching logics in 'update_code_with_new_coordinates'
# and 'extract_positions_and_instructions' are not exactly the same
lines = code.splitlines()
updated_code_lines = []
position_index = 0 # Tracks which position update to use
@@ -463,36 +466,51 @@ def update_code_with_new_coordinates(code, updated_positions):
return "\n".join(updated_code_lines)
def transform_agnet_action_to_code_block(action):
"""Transform the agent action to a code block: not used in agent, for logging only"""
if "computer.terminate" in action or "browser.select_option" in action or "browser.clear" in action:
return f"```code\n{action}\n```"
else:
return f"```python\n{action}\n```"
class OpenCUAAgent:
"""
OpenCUA Agent for desktop automation tasks.
This class implements an OpenCUA-model-based agent that can observe
desktop environments through screenshots and execute mouse/keyboard actions
via PyAutoGUI to complete automation tasks.
Attributes:
model (str): Name of the language model being used
history_type (str): Type of history recording mechanism
actions (list): History of executed actions
observations (list): History of environment observations
cots (list): Chain of thought reasoning records
"""
def __init__(
self,
model: str, # OpenCUA model name
history_type: str, # History step type: action_history, thought_history, observation_history
max_image_history_length: int = 3, # The max number of images in the history
platform: str = "ubuntu", # The platform of the computer
max_tokens: int = 1500, # The max number of tokens in the response
top_p: float = 0.9, # The top p value in the response
temperature: float = 0, # The temperature value in the response
action_space: str = "pyautogui", # The action space: pyautogui
observation_type: str = "screenshot", # The observation type: screenshot
cot_level: str = "l2", # The CoT level: l1, l2, l3
screen_size: Tuple[int, int] = (1920, 1080), # The screen size
coordinate_type: str = "relative", # The coordinate type: relative, absolute, qwen25
**kwargs
):
assert coordinate_type in ["relative", "absolute", "qwen25"]
assert action_space in ["pyautogui"], "Invalid action space"
assert observation_type in ["screenshot"], "Invalid observation type"
assert history_type in ["action_history", "thought_history", "observation_history"]
assert model is not None, "Model cannot be None"
self.model = model
self.platform = platform
self.max_tokens = max_tokens
self.top_p = top_p
self.temperature = temperature
@@ -500,19 +518,9 @@ class OpenCUAAgent:
self.observation_type = observation_type
self.history_type = history_type
self.coordinate_type = coordinate_type
assert coordinate_type in ["relative", "relative1000", "absolute", "qwen25"]
assert action_space in ["pyautogui"], "Invalid action space"
assert observation_type in ["screenshot"], "Invalid observation type"
assert history_type in ["action_history", "thought_history", "observation_history"]
self.actions = []
self.observations = []
self.cots = []
self.cot_level = cot_level
self.screen_size = screen_size
self.max_image_history_length = max_image_history_length
if history_type == "action_history":
self.HISTORY_TEMPLATE = ACTION_HISTORY_TEMPLATE
@@ -522,15 +530,27 @@ class OpenCUAAgent:
self.HISTORY_TEMPLATE = OBSERVATION_HISTORY_TEMPLATE
else:
raise ValueError(f"Invalid history type: {history_type}")
if cot_level == "l3":
self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L3
elif cot_level == "l2":
self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L2
elif cot_level == "l1":
self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L1
else:
raise ValueError(f"Invalid COT level: {cot_level}")
self.actions = []
self.observations = []
self.cots = []
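# Example construction (illustrative; the model name is a placeholder, not a recommendation):
#
#   agent = OpenCUAAgent(
#       model="opencua-7b",
#       history_type="thought_history",
#       max_image_history_length=3,
#       cot_level="l2",
#       coordinate_type="qwen25",
#       screen_size=(1920, 1080),
#   )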
def reset(self, _logger=None):
global logger
logger = _logger if _logger is not None else logging.getLogger("desktopenv.agent")
self.observations = []
self.thoughts = []
self.cots = []
self.actions = []
self.image_summaries = []
def _scale_scroll_for_windows(self, code: str, factor: int = 50) -> str:
""" pyautogui.scroll has a different scale on Ubuntu and Windows, multiple 'factor' when scrolling on Windows system"""
@@ -541,7 +561,7 @@ class OpenCUAAgent:
code = pattern_pos.sub(lambda m: f"{m.group(1)}{int(m.group(2))*factor})", code)
return code
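# Illustrative effect (assuming the positive-scroll pattern above): on Windows, a generated
# "pyautogui.scroll(3)" is rewritten to "pyautogui.scroll(150)" with the default factor of 50,
# since pyautogui scroll amounts map to much smaller increments on Windows than on Ubuntu.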
def predict(self, instruction: str, obs: Dict, **kwargs) -> Tuple[str, List[str], Dict]:
"""
Predict the next action(s) based on the current observation.
"""
@@ -557,31 +577,10 @@ class OpenCUAAgent:
print("Logical screen size", self.screen_size)
messages = []
if self.cot_level == "l3":
messages.append({
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L3
"content": self.SYSTEM_PROMPT
})
elif self.cot_level == "l2":
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L2
})
elif self.cot_level == "l1":
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L1
})
elif self.cot_level == "l0":
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L0
})
else:
raise ValueError(f"Invalid COT level: {self.cot_level}")
instruction_prompt = INSTRUTION_TEMPLATE.format(instruction=instruction)
history_step_texts = []
for i in range(len(self.actions)):
@@ -596,19 +595,11 @@ class OpenCUAAgent:
]
})
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
observation=self.cots[i].get('observation'),
thought=self.cots[i].get('thought'),
action=self.cots[i]['action']
)
messages.append({
"role": "assistant",
@@ -636,26 +627,11 @@ class OpenCUAAgent:
},
{
"type": "text",
"text": instruction_prompt
"text": INSTRUTION_TEMPLATE.format(instruction=instruction)
}
]
})
response = self.call_llm({
"model": self.model,
"messages": messages,
@@ -667,7 +643,7 @@ class OpenCUAAgent:
logger.info(f"Model Output: \n\n{response}")
if not response:
logger.error("No response found in the response.")
return "ERROR", [], {}
low_level_instruction, pyautogui_actions, other_cot = parse_response_to_cot_and_action(response, self.screen_size, self.coordinate_type)
if not pyautogui_actions:
@@ -683,13 +659,34 @@ class OpenCUAAgent:
logger.info(f"Parsed pyautogui Action: \n{pyautogui_actions}")
self.actions.append(low_level_instruction)
if 'action' not in other_cot or not other_cot['action'] or 'thought' not in other_cot or not other_cot['thought']:
logger.error("Error! no action/thought in cot")
logger.error(f"response: {response}")
logger.error(f"cot: {other_cot}")
self.cots.append(other_cot)
# Print message structure if needed
logger.info(f"\nInstruction: {instruction}")
messages_to_print = []
current_image = 1
for msg in messages:
msg_copy = copy.deepcopy(msg)
if isinstance(msg_copy['content'], list):
for content in msg_copy['content']:
if content['type'] == 'image_url':
content['image_url']['url'] = f'Image {current_image}'
current_image += 1
messages_to_print.append(msg_copy)
messages_to_print.append({
"new_step_cot": other_cot,
"response": response
})
logger.info(json.dumps(messages_to_print, indent=2))
return response, pyautogui_actions, {}
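# Sketch of a driving loop (illustrative; `env`, its step() API, and the obs layout are assumptions,
# not part of this file — the obs dict is assumed to carry raw screenshot bytes under "screenshot"):
#
#   obs = {"screenshot": screenshot_bytes}
#   response, actions, _ = agent.predict("Open a terminal window", obs)
#   for action in actions:   # e.g. ["pyautogui.click(960, 540)"], or ["DONE"] / ["FAIL"] / ["WAIT"]
#       if action in ("DONE", "FAIL", "WAIT"):
#           break
#       env.step(action)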
@backoff.on_exception(
backoff.constant,
# add more model exceptions here as needed,
@@ -703,6 +700,7 @@ class OpenCUAAgent:
max_tries=10
)
def call_llm(self, payload, model):
"""Call the LLM API"""
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ['OPENCUA_API_KEY']}"