Wxy/opencua (#274)

* OpenCUA Agent code base

* update url

* debug, modify url input

* debug opencua

* show result

* debug agent history overlap

* modify opencua agent; add comment lines

* update parallel; clean code; use sleep 3s

* ui-tars-0717
This commit is contained in:
Xinyuan Wang
2025-07-20 15:52:23 +08:00
committed by GitHub
parent bec7129fff
commit e10dd9267c
5 changed files with 320 additions and 224 deletions

View File

@@ -571,11 +571,6 @@ class OpenCUAAgent:
logger.info(f"========================== {self.model} ===================================")
logger.info(f"Instruction: \n{instruction}")
image_bytes = BytesIO(obs['screenshot'])
with Image.open(image_bytes) as img:
print("Actual screen size", img.size)
print("Logical screen size", self.screen_size)
messages = []
messages.append({
"role": "system",
@@ -598,7 +593,7 @@ class OpenCUAAgent:
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
observation=self.cots[i].get('observation'),
thought=self.cots[i].get('thought'),
action=self.cots[i]['action']
action=self.cots[i].get('action')
)
messages.append({
@@ -609,7 +604,7 @@ class OpenCUAAgent:
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
observation=self.cots[i].get('observation'),
thought=self.cots[i].get('thought'),
action=self.cots[i]['action']
action=self.cots[i].get('action')
)
history_step_texts.append(history_content)
if i == len(self.actions) - self.max_image_history_length:
@@ -640,7 +635,7 @@ class OpenCUAAgent:
"temperature": self.temperature
}, self.model)
logger.info(f"Model Output: \n\n{response}")
logger.info(f"Model Output: \n{response}")
if not response:
logger.error("No response found in the response.")
return "ERROR", [], {}
@@ -666,23 +661,23 @@ class OpenCUAAgent:
self.cots.append(other_cot)
# Print message structure if needed
logger.info(f"\nInstruction: {instruction}")
messages_to_print = []
current_image = 1
for msg in messages:
msg_copy = copy.deepcopy(msg)
if isinstance(msg_copy['content'], list):
for content in msg_copy['content']:
if content['type'] == 'image_url':
content['image_url']['url'] = f'Image {current_image}'
current_image += 1
messages_to_print.append(msg_copy)
# messages_to_print = []
# current_image = 1
# for msg in messages:
# msg_copy = copy.deepcopy(msg)
# if isinstance(msg_copy['content'], list):
# for content in msg_copy['content']:
# if content['type'] == 'image_url':
# content['image_url']['url'] = f'Image {current_image}'
# current_image += 1
# messages_to_print.append(msg_copy)
messages_to_print.append({
"new_step_cot": other_cot,
"response": response
})
logger.info(json.dumps(messages_to_print, indent=2))
# messages_to_print.append({
# "new_step_cot": other_cot,
# "response": response
# })
# logger.info(json.dumps(messages_to_print, indent=2))
logger.info(f"New step cot: {other_cot}")
return response, pyautogui_actions, {}
@@ -720,4 +715,10 @@ class OpenCUAAgent:
logger.error("Retrying...")
time.sleep(5)
else:
return response.json()['choices'][0]['message']['content']
response = response.json()
finish_reason = response["choices"][0].get("finish_reason")
if finish_reason is not None and finish_reason == "stop": # most of the time the output length will not exceed max_tokens
return response['choices'][0]['message']['content']
else:
logger.error("LLM did not finish properly, retrying...")
time.sleep(5)