Wxy/opencua (#274)
* OpenCUA Agent code base
* update url
* debug, modify url input
* debug opencua
* show result
* debug agent history overlap
* modify opencua agent; add comment lines
* update parallel; clean code; use sleep 3s
* ui-tars-0717
This commit is contained in:
@@ -571,11 +571,6 @@ class OpenCUAAgent:
|
||||
logger.info(f"========================== {self.model} ===================================")
|
||||
logger.info(f"Instruction: \n{instruction}")
|
||||
|
||||
image_bytes = BytesIO(obs['screenshot'])
|
||||
with Image.open(image_bytes) as img:
|
||||
print("Actual screen size", img.size)
|
||||
print("Logical screen size", self.screen_size)
|
||||
|
||||
messages = []
|
||||
messages.append({
|
||||
"role": "system",
|
||||
@@ -598,7 +593,7 @@ class OpenCUAAgent:
|
||||
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
|
||||
observation=self.cots[i].get('observation'),
|
||||
thought=self.cots[i].get('thought'),
|
||||
action=self.cots[i]['action']
|
||||
action=self.cots[i].get('action')
|
||||
)
|
||||
|
||||
messages.append({
|
||||
@@ -609,7 +604,7 @@ class OpenCUAAgent:
|
||||
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
|
||||
observation=self.cots[i].get('observation'),
|
||||
thought=self.cots[i].get('thought'),
|
||||
action=self.cots[i]['action']
|
||||
action=self.cots[i].get('action')
|
||||
)
|
||||
history_step_texts.append(history_content)
|
||||
if i == len(self.actions) - self.max_image_history_length:
|
||||
@@ -640,7 +635,7 @@ class OpenCUAAgent:
|
||||
"temperature": self.temperature
|
||||
}, self.model)
|
||||
|
||||
logger.info(f"Model Output: \n\n{response}")
|
||||
logger.info(f"Model Output: \n{response}")
|
||||
if not response:
|
||||
logger.error("No response found in the response.")
|
||||
return "ERROR", [], {}
|
||||
@@ -666,23 +661,23 @@ class OpenCUAAgent:
|
||||
self.cots.append(other_cot)
|
||||
|
||||
# Print message structure if needed
|
||||
logger.info(f"\nInstruction: {instruction}")
|
||||
messages_to_print = []
|
||||
current_image = 1
|
||||
for msg in messages:
|
||||
msg_copy = copy.deepcopy(msg)
|
||||
if isinstance(msg_copy['content'], list):
|
||||
for content in msg_copy['content']:
|
||||
if content['type'] == 'image_url':
|
||||
content['image_url']['url'] = f'Image {current_image}'
|
||||
current_image += 1
|
||||
messages_to_print.append(msg_copy)
|
||||
# messages_to_print = []
|
||||
# current_image = 1
|
||||
# for msg in messages:
|
||||
# msg_copy = copy.deepcopy(msg)
|
||||
# if isinstance(msg_copy['content'], list):
|
||||
# for content in msg_copy['content']:
|
||||
# if content['type'] == 'image_url':
|
||||
# content['image_url']['url'] = f'Image {current_image}'
|
||||
# current_image += 1
|
||||
# messages_to_print.append(msg_copy)
|
||||
|
||||
messages_to_print.append({
|
||||
"new_step_cot": other_cot,
|
||||
"response": response
|
||||
})
|
||||
logger.info(json.dumps(messages_to_print, indent=2))
|
||||
# messages_to_print.append({
|
||||
# "new_step_cot": other_cot,
|
||||
# "response": response
|
||||
# })
|
||||
# logger.info(json.dumps(messages_to_print, indent=2))
|
||||
logger.info(f"New step cot: {other_cot}")
|
||||
|
||||
return response, pyautogui_actions, {}
|
||||
|
||||
@@ -720,4 +715,10 @@ class OpenCUAAgent:
|
||||
logger.error("Retrying...")
|
||||
time.sleep(5)
|
||||
else:
|
||||
return response.json()['choices'][0]['message']['content']
|
||||
response = response.json()
|
||||
finish_reason = response["choices"][0].get("finish_reason")
|
||||
if finish_reason is not None and finish_reason == "stop": # for most of the time, length will not exceed max_tokens
|
||||
return response['choices'][0]['message']['content']
|
||||
else:
|
||||
logger.error("LLM did not finish properly, retrying...")
|
||||
time.sleep(5)
|
||||
|
||||
Reference in New Issue
Block a user