Wxy/opencua (#274)

* OpenCUA Agent code base
* update url
* debug, modify url input
* debug opencua
* show result
* debug agent history overlap
* modify opencua agent; add comment lines
* update parallel; clean code; use sleep 3s
* ui-tars-0717
2025-07-20 15:52:23 +08:00
parent bec7129fff
commit e10dd9267c
5 changed files with 320 additions and 224 deletions
--- a/mm_agents/opencua_agent.py
+++ b/mm_agents/opencua_agent.py
@@ -571,11 +571,6 @@ class OpenCUAAgent:
            logger.info(f"========================== {self.model} ===================================")
        logger.info(f"Instruction: \n{instruction}")

-        image_bytes = BytesIO(obs['screenshot'])
-        with Image.open(image_bytes) as img:
-            print("Actual screen size", img.size)
-            print("Logical screen size", self.screen_size)
-
        messages = []
        messages.append({
                "role": "system",
@@ -598,7 +593,7 @@ class OpenCUAAgent:
                history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
                    observation=self.cots[i].get('observation'),
                    thought=self.cots[i].get('thought'),
-                    action=self.cots[i]['action']
+                    action=self.cots[i].get('action')
                )

                messages.append({
@@ -609,7 +604,7 @@ class OpenCUAAgent:
                history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
                    observation=self.cots[i].get('observation'),
                    thought=self.cots[i].get('thought'),
-                    action=self.cots[i]['action']
+                    action=self.cots[i].get('action')
                )
                history_step_texts.append(history_content)
                if i == len(self.actions) - self.max_image_history_length:
@@ -640,7 +635,7 @@ class OpenCUAAgent:
            "temperature": self.temperature
        }, self.model)

-        logger.info(f"Model Output: \n\n{response}")
+        logger.info(f"Model Output: \n{response}")
        if not response:
            logger.error("No response found in the response.")
            return "ERROR", [], {}
@@ -666,23 +661,23 @@ class OpenCUAAgent:
        self.cots.append(other_cot)
        
        # Print message structure if needed
-        logger.info(f"\nInstruction: {instruction}")
-        messages_to_print = []
-        current_image = 1
-        for msg in messages:
-            msg_copy = copy.deepcopy(msg)
-            if isinstance(msg_copy['content'], list):
-                for content in msg_copy['content']:
-                    if content['type'] == 'image_url':
-                        content['image_url']['url'] = f'Image {current_image}'
-                        current_image += 1
-            messages_to_print.append(msg_copy)
+        # messages_to_print = []
+        # current_image = 1
+        # for msg in messages:
+        #     msg_copy = copy.deepcopy(msg)
+        #     if isinstance(msg_copy['content'], list):
+        #         for content in msg_copy['content']:
+        #             if content['type'] == 'image_url':
+        #                 content['image_url']['url'] = f'Image {current_image}'
+        #                 current_image += 1
+        #     messages_to_print.append(msg_copy)

-        messages_to_print.append({
-            "new_step_cot": other_cot,
-            "response": response
-        })
-        logger.info(json.dumps(messages_to_print, indent=2))
+        # messages_to_print.append({
+        #     "new_step_cot": other_cot,
+        #     "response": response
+        # })
+        # logger.info(json.dumps(messages_to_print, indent=2))
+        logger.info(f"New step cot: {other_cot}")

        return response, pyautogui_actions, {}
            
@@ -720,4 +715,10 @@ class OpenCUAAgent:
                logger.error("Retrying...")
                time.sleep(5)
            else:
-                return response.json()['choices'][0]['message']['content']
+                response = response.json()
+                finish_reason = response["choices"][0].get("finish_reason")
+                if finish_reason is not None and finish_reason == "stop": # for most of the time, length will not exceed max_tokens
+                    return response['choices'][0]['message']['content']
+                else:
+                    logger.error("LLM did not finish properly, retrying...")
+                    time.sleep(5)