feat/dart_gui (#371)

2025-11-07 21:50:01 +08:00
parent 6d43dbc532
commit 00b6468eb7
8 changed files with 2499 additions and 4 deletions
--- a/mm_agents/dart_gui/prompts.py
+++ b/mm_agents/dart_gui/prompts.py
@@ -0,0 +1,161 @@
+# Prompt template for the GUI agent using the "Thought / Action" output format.
+# It is filled in with str.format(); the placeholders are {language} and
+# {instruction}. The action list here has no call_user() entry.
+COMPUTER_USE_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+
+## Output Format
+```
+Thought: ...
+Action: ...
+```
+
+## Action Space
+
+click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+hotkey(key='')
+type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
+scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
+
+## Note
+- Use {language} in `Thought` part.
+- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+- My computer's password is 'password', feel free to use it when you need sudo rights.
+
+## User Instruction
+{instruction}
+"""
+
+# Same template as COMPUTER_USE_PROMPT, with one extra action: call_user().
+# Placeholders for str.format(): {language} and {instruction}.
+COMPUTER_USE_PROMPT_WITH_CALL_USER = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+
+## Output Format
+```
+Thought: ...
+Action: ...
+```
+
+## Action Space
+
+click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+hotkey(key='')
+type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
+scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
+call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
+
+## Note
+- Use {language} in `Thought` part.
+- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+- My computer's password is 'password', feel free to use it when you need sudo rights.
+
+## User Instruction
+{instruction}
+"""
+
+# Action-space listing in which finished() takes no argument and call_user()
+# is absent. Presumably meant for the {action_space} placeholder of
+# UITARS_USR_PROMPT_THOUGHT -- confirm against the callers.
+UITARS_ACTION_SPACE = """
+click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+hotkey(key='')
+type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
+scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished()
+"""
+
+# Action-space listing with an argument-less finished() plus call_user().
+# Presumably meant for the {action_space} placeholder of
+# UITARS_USR_PROMPT_THOUGHT -- confirm against the callers.
+UITARS_CALL_USR_ACTION_SPACE = """
+click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+hotkey(key='')
+type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
+scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished()
+call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
+"""
+
+# Action-space listing in which finished() carries a content argument and
+# call_user() is absent. Presumably meant for the {action_space} placeholder
+# of UITARS_USR_PROMPT_THOUGHT -- confirm against the callers.
+UITARS_NORMAL_ACTION_SPACE = """
+click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+hotkey(key='')
+type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
+scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
+"""
+
+# Prompt template whose output format is a bare "Action: ..." line (no Thought).
+# The action list is inlined; the only str.format() placeholder is {instruction}.
+UITARS_USR_PROMPT_NOTHOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 
+## Output Format
+```
+Action: ...
+```
+## Action Space
+click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+hotkey(key='')
+type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
+scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished()
+call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
+## User Instruction
+{instruction}
+"""
+
+# Prompt template with the "Thought / Action" output format and a pluggable
+# action list. Placeholders for str.format(): {action_space}, {language} and
+# {instruction}.
+UITARS_USR_PROMPT_THOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 
+
+## Output Format
+```
+Thought: ...
+Action: ...
+```
+
+## Action Space
+{action_space}
+
+## Note
+- Use {language} in `Thought` part.
+- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+## User Instruction
+{instruction}
+"""
+
+
+# Chinese phrases that signal the model gave up or could not do the task.
+# utils.parsing_response_to_pyautogui_code checks them by substring against the
+# raw response text of a finished() action and returns "FAIL" instead of "DONE"
+# when any of them occurs.
+# NOTE(review): "实现不了" is listed twice (inability + not-feasible groups);
+# harmless for a substring scan.
+FAILURE_INDICATORS = [
+    # Direct inability expressions
+    "无法", "不能", "不可以", "做不到", "实现不了", "完成不了","没法",
+    
+    # Regret/apology expressions
+    "遗憾", "抱歉", "很抱歉", "非常抱歉", "对不起",
+    
+    # Not supported/available
+    "不直接支持", "不支持", "不提供", "不具备", "没有权限", "权限不足", "不在这里面","不符合",#"不存在",
+    
+    # Cannot access/handle
+    "无权访问", "访问不了", "处理不了", "操作不了", "执行不了", "没找到", "空空如也",
+    
+    # Not possible/feasible
+    "不可能", "无法实现", "实现不了", "办不到", "做不了","找不到","存在技术限制","没有找到","没有内置",
+    
+    # System limitations
+    "超出范围", "不在我的能力范围", "能力有限", "功能限制","没有成功","没成功","硬件的问题",
+    
+    # Refusal indicators
+    "拒绝", "不允许", "禁止", "不合适", "不恰当",
+    
+    # Trying Restart
+    "从头开始", "藏在", "浪费时间","一个更合理的思路","正确的方向","没有意义",#, "重新","重启",
+]
--- a/mm_agents/dart_gui/task_loader.py
+++ b/mm_agents/dart_gui/task_loader.py
@@ -0,0 +1,202 @@
+import asyncio
+from typing import List, Optional, Union, Dict, Any
+import json
+import os
+import hashlib
+from pathlib import Path
+from omegaconf import DictConfig
+from dataclasses import dataclass, asdict
+import copy
+import logging
+import random
+
+from prompts import COMPUTER_USE_PROMPT, COMPUTER_USE_PROMPT_WITH_CALL_USER
+from log_config import setup_logging
+
+# Set up the unified logging system
+setup_logging()
+logger = logging.getLogger(__name__)
+
+class TaskLoader:
+    """Load OSWorld task definitions listed in a JSON task file.
+
+    The task file maps a task type to a list of task ids. Every
+    ``(task_type, task_id)`` pair is turned into a ``TaskInfo`` by
+    ``build_task``. When ``resume`` is enabled, tasks that already have a
+    ``reward.txt`` in a matching directory under ``storage_root`` are skipped.
+    """
+
+    def __init__(self, task_cfg: DictConfig, storage_root):
+        """Read paths and flags from ``task_cfg``.
+
+        Args:
+            task_cfg: config providing ``task_file``, ``osworld_root`` and
+                ``resume``; ``task_root`` is optional.
+            storage_root: directory holding one sub-directory per finished or
+                running task (only consulted when ``resume`` is set).
+        """
+        self.task_file = Path(task_cfg.task_file)
+        # ``task_root`` may be absent from the config. Fall back to the folder
+        # of the task file so that _find_latest_json() always has a directory
+        # to scan instead of failing with AttributeError.
+        task_root = getattr(task_cfg, "task_root", None)
+        self.task_root = Path(task_root) if task_root else self.task_file.parent
+        self.osworld_root = Path(task_cfg.osworld_root)
+
+        self._latest_sha: Optional[str] = None
+        self._tasks: List = []
+        self.storage_root = storage_root
+        self.resume = task_cfg.resume
+
+    def poll_for_tasks(self) -> List[Dict]:
+        """Reload the task file and return the tasks as dicts in random order.
+
+        Returns:
+            One ``TaskInfo.to_dict()`` result per pending task, shuffled.
+        """
+        self._maybe_refresh_dataset()
+
+        tasks_list = [task.to_dict() for task in self._tasks]
+        random.shuffle(tasks_list)
+
+        return tasks_list
+
+    @staticmethod
+    def _load_raw_tasks(json_path) -> List[Dict]:
+        """Flatten ``{task_type: [task_id, ...]}`` into a list of raw task dicts."""
+        with open(json_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        return [
+            {"task_type": task_type, "task_id": task_id}
+            for task_type, task_ids in data.items()
+            for task_id in task_ids
+        ]
+
+    @staticmethod
+    def _is_finished(raw: Dict, run_dirs: List[Path]) -> bool:
+        """Tell whether ``raw`` already has a recorded reward in ``run_dirs``.
+
+        A directory counts when its name starts with the task id (several
+        versions may exist), its ``task_config.json`` records the same task
+        type, and it contains ``reward.txt``.
+        """
+        task_id = str(raw["task_id"])
+        task_type_expected = raw["task_type"]
+
+        for d in run_dirs:
+            if not d.name.startswith(task_id):
+                continue
+
+            cfg_path = d / "task_config.json"
+            if not cfg_path.exists():
+                print("找不到config文件")
+                continue
+
+            try:
+                with cfg_path.open("r", encoding="utf-8") as cf:
+                    cfg = json.load(cf)
+            except (OSError, ValueError):
+                # Unreadable or malformed config: ignore this directory.
+                print("配置损坏，忽略此目录")
+                continue
+
+            # A different task type means this directory belongs to another task.
+            if cfg.get("raw", {}).get("task_type") != task_type_expected:
+                continue
+
+            if (d / "reward.txt").exists():
+                return True
+
+        return False
+
+    def _maybe_refresh_dataset_bak(self):
+        """Older refresh strategy: pick the newest JSON under ``task_root``.
+
+        Returns:
+            ``False`` when there is no JSON file or its SHA-1 is unchanged
+            since the last load, ``True`` after the task list was rebuilt.
+        """
+        latest_json = self._find_latest_json()
+
+        if latest_json is None:
+            return False  # no json file
+
+        sha = self._calc_sha1(latest_json)
+        if sha == self._latest_sha:
+            return False  # no change
+
+        raw_tasks = self._load_raw_tasks(latest_json)
+
+        self._tasks = [build_task(raw, self.osworld_root) for raw in raw_tasks]
+        self._latest_sha = sha
+
+        logger.info(f"当前任务文件: {str(latest_json)}")
+        logger.info(f"任务总数: {len(raw_tasks)}")
+
+        return True
+
+    def _maybe_refresh_dataset(self):
+        """Rebuild ``self._tasks`` from ``self.task_file``.
+
+        With ``resume`` set, tasks for which ``_is_finished`` is true are left
+        out. Always returns ``True``.
+        """
+        latest_json = self.task_file
+        print("Current tasks file: ", str(latest_json))
+
+        raw_tasks = self._load_raw_tasks(latest_json)
+
+        if self.resume:
+            storage_root = Path(self.storage_root)
+            # List the storage directory once rather than once per task. A
+            # missing directory simply means nothing has been run yet.
+            if storage_root.is_dir():
+                run_dirs = [d for d in storage_root.iterdir() if d.is_dir()]
+            else:
+                run_dirs = []
+
+            filtered_tasks = [
+                raw for raw in raw_tasks if not self._is_finished(raw, run_dirs)
+            ]
+            self._tasks = [build_task(raw, self.osworld_root) for raw in filtered_tasks]
+            print(f"Total number of tasks: {len(raw_tasks)}, Remained:{len(filtered_tasks)}")
+
+        else:
+            self._tasks = [build_task(raw, self.osworld_root) for raw in raw_tasks]
+            print(f"Total number of tasks: {len(raw_tasks)}")
+
+        return True
+
+    def _find_latest_json(self) -> Optional[Path]:
+        """Return the most recently modified ``*.json`` in ``task_root``, if any."""
+        files = list(self.task_root.glob("*.json"))
+        return max(files, key=lambda p: p.stat().st_mtime) if files else None
+
+    @staticmethod
+    def _calc_sha1(fp: Path, chunk_size=2<<20) -> str:
+        """Return the hex SHA-1 of file ``fp``, read in ``chunk_size`` byte pieces."""
+        h = hashlib.sha1()
+        with fp.open("rb") as f:
+            for chunk in iter(lambda: f.read(chunk_size), b""):
+                h.update(chunk)
+        return h.hexdigest()
+
+
+@dataclass
+class TaskInfo:
+    """Prompt messages, instruction text and task configuration of one task."""
+
+    messages: List
+    instruction: str
+    task_config: Dict
+
+    def to_dict(self):
+        """Return all fields as a plain dict, converted with ``asdict``."""
+        as_mapping = asdict(self)
+        return as_mapping
+
+
+def build_task(raw: Dict, osworld_root: Path, use_call_user: bool = False) -> TaskInfo:
+    """Load one OSWorld task file and wrap it in a ``TaskInfo``.
+
+    Args:
+        raw: dict with ``task_type`` and ``task_id``; the task is read from
+            ``<osworld_root>/<task_type>/<task_id>.json``.
+        osworld_root: directory containing the per-type task folders.
+        use_call_user: choose the prompt variant that offers ``call_user()``.
+
+    Returns:
+        ``TaskInfo`` holding the chat messages, the (possibly extended)
+        instruction and the loaded task data. The task data gets an extra
+        ``"raw"`` entry recording ``task_type`` and ``task_id``.
+
+    Raises:
+        OSError: the task file cannot be opened.
+        KeyError: ``raw`` or the task file lacks a required key.
+    """
+    task_type = raw["task_type"]
+    task_id = raw["task_id"]
+    # str()/f-string so that a non-string id from the task list does not raise
+    # TypeError when the file name is assembled.
+    task_path = Path(osworld_root) / str(task_type) / f"{task_id}.json"
+    with open(task_path, encoding="utf-8") as f:
+        task_data = json.load(f)
+
+    task_data["raw"] = {
+        "task_type": task_type,
+        "task_id": task_id
+    }
+
+    instruction = task_data["instruction"]
+
+    ground_truth = task_data.get("human-ground-truth")
+    if ground_truth is not None and "single-action" in ground_truth:
+        plan = ground_truth["single-action"]
+        # A plan stored as one string must not be joined character by character.
+        plan_text = plan if isinstance(plan, str) else "\n".join(plan)
+        instruction = instruction.strip() + "\nHere is an instruction to help you complete the task: \n" + plan_text
+
+    system_prompt = COMPUTER_USE_PROMPT_WITH_CALL_USER if use_call_user else COMPUTER_USE_PROMPT
+    prompt_text = system_prompt.format(instruction=instruction, language="English")
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant."
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt_text}
+            ]
+        }
+    ]
+
+    return TaskInfo(
+        messages=messages,
+        instruction=instruction,
+        task_config=task_data
+    )
--- a/mm_agents/dart_gui/utils.py
+++ b/mm_agents/dart_gui/utils.py
@@ -0,0 +1,511 @@
+import ast
+import base64
+import logging
+import math
+import re
+import xml.etree.ElementTree as ET
+from io import BytesIO
+from typing import Dict, List
+
+import numpy as np
+import openai
+
+from openai import OpenAI
+from PIL import Image
+from requests.exceptions import SSLError
+from mm_agents.dart_gui.prompts import FAILURE_INDICATORS
+
+# Set up the module logger
+logger = logging.getLogger(__name__)
+
+# Action / status keywords.
+FINISH_WORD = "finished"
+WAIT_WORD = "wait"
+ENV_FAIL_WORD = "error_env"
+CALL_USER = "call_user"
+
+# Defaults of linear_resize() and smart_resize(); MAX_RATIO is the aspect-ratio
+# limit enforced by smart_resize().
+IMAGE_FACTOR = 28
+MIN_PIXELS = 100 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+pure_text_settings = ["a11y_tree"]
+
+# XML namespace URIs of the accessibility tree, per platform.
+# NOTE(review): attributes_ns_ubuntu holds the *windows* URL while every other
+# *_ubuntu constant uses accessibility.ubuntu.example.org -- confirm against
+# desktop_env/server/main.py before relying on it.
+attributes_ns_ubuntu = "https://accessibility.windows.example.org/ns/attributes"
+attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes"
+state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
+state_ns_windows = "https://accessibility.windows.example.org/ns/state"
+component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component"
+component_ns_windows = "https://accessibility.windows.example.org/ns/component"
+value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value"
+value_ns_windows = "https://accessibility.windows.example.org/ns/value"
+class_ns_windows = "https://accessibility.windows.example.org/ns/class"
+# More namespaces defined in OSWorld, please check desktop_env/server/main.py
+
+# Parse one action string
+def parse_action(action_str):
+    """Parse a single ``name(key='value', ...)`` call string.
+
+    Args:
+        action_str: source text of one call expression.
+
+    Returns:
+        ``{'function': <name>, 'args': {<keyword>: <value>}}``. For a dotted
+        callee (``a.b(...)``) the name is the last attribute; for any other
+        callee it is ``None``. Keyword values that are not literal constants
+        are stored as ``None``; positional arguments are ignored. ``None`` is
+        returned (and the error logged) when the text is not a call
+        expression.
+    """
+    try:
+        # Parse the string into an AST node
+        node = ast.parse(action_str, mode='eval')
+
+        # The node must be an expression ...
+        if not isinstance(node, ast.Expression):
+            raise ValueError("Not an expression")
+
+        # ... whose body is a function call
+        call = node.body
+        if not isinstance(call, ast.Call):
+            raise ValueError("Not a function call")
+
+        # Function name
+        if isinstance(call.func, ast.Name):
+            func_name = call.func.id
+        elif isinstance(call.func, ast.Attribute):
+            func_name = call.func.attr
+        else:
+            func_name = None
+
+        # Keyword arguments. ast.Constant covers strings, numbers, None, ...;
+        # the former ast.Str fallback is dropped because ast.Str is deprecated
+        # and removed in Python 3.14, where merely referencing it raised
+        # AttributeError and made the whole parse fail.
+        kwargs = {}
+        for kw in call.keywords:
+            if isinstance(kw.value, ast.Constant):
+                kwargs[kw.arg] = kw.value.value
+            else:
+                kwargs[kw.arg] = None
+
+        return {
+            'function': func_name,
+            'args': kwargs
+        }
+
+    except Exception as e:
+        logger.error(f"Failed to parse action '{action_str}': {e}")
+        return None
+    
+def escape_single_quotes(text):
+    """Put a backslash before every single quote that is not already escaped."""
+    # Negative look-behind: skip quotes that directly follow a backslash.
+    unescaped_quote = re.compile(r"(?<!\\)'")
+    return unescaped_quote.sub(r"\\'", text)
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Return the multiple of 'factor' nearest to 'number' (ties follow ``round``)."""
+    multiples = round(number / factor)
+    return multiples * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Return the smallest multiple of 'factor' that is not below 'number'."""
+    multiples = math.ceil(number / factor)
+    return multiples * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Return the largest multiple of 'factor' that is not above 'number'."""
+    multiples = math.floor(number / factor)
+    return multiples * factor
+
+def linear_resize(
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+) -> tuple[int, int]:
+    """Scale (height, width) so the pixel count fits [min_pixels, max_pixels].
+
+    Both sides are multiplied by the same square-root factor, so the aspect
+    ratio is kept and relative coordinates can be reused without conversion.
+    Shrinking truncates with ``int``; enlarging rounds up with ``math.ceil``.
+    ``factor`` is accepted but not used.
+
+    Returns:
+        The new ``(height, width)``.
+    """
+    pixel_count = width * height
+    if pixel_count > max_pixels:
+        shrink = math.sqrt(max_pixels / pixel_count)
+        width = int(width * shrink)
+        height = int(height * shrink)
+
+    pixel_count = width * height
+    if pixel_count < min_pixels:
+        grow = math.sqrt(min_pixels / pixel_count)
+        width = math.ceil(width * grow)
+        height = math.ceil(height * grow)
+
+    return height, width
+
+def smart_resize(
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+) -> tuple[int, int]:
+    """
+    Compute a target (height, width) such that:
+
+    1. Both sides are multiples of 'factor'.
+
+    2. The pixel count lies within ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio stays as close to the input as possible.
+
+    Raises ValueError when the longer side exceeds MAX_RATIO times the
+    shorter one.
+    """
+    aspect = max(height, width) / min(height, width)
+    if aspect > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {aspect}"
+        )
+
+    # Start from the nearest multiples, never going below one 'factor'.
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    snapped_pixels = h_bar * w_bar
+
+    if snapped_pixels > max_pixels:
+        # Too large: shrink both sides by the same ratio and round down.
+        beta = math.sqrt((height * width) / max_pixels)
+        return floor_by_factor(height / beta, factor), floor_by_factor(width / beta, factor)
+
+    if snapped_pixels < min_pixels:
+        # Too small: enlarge both sides by the same ratio and round up.
+        beta = math.sqrt(min_pixels / (height * width))
+        return ceil_by_factor(height * beta, factor), ceil_by_factor(width * beta, factor)
+
+    return h_bar, w_bar
+
+def _requote_content(action_str, func_name):
+    """Rebuild ``func_name(content='...')`` with inner single quotes escaped.
+
+    The string is returned unchanged when it holds no single-quoted content
+    argument (for example double-quoted content), so that it is never wrapped
+    into a second, bogus call.
+    """
+    # DOTALL lets the content span several lines.
+    pattern = re.compile(func_name + r"\(content='(.*?)'\)", re.DOTALL)
+    if pattern.search(action_str) is None:
+        return action_str
+    inner = pattern.sub(lambda match: match.group(1), action_str)
+    return func_name + "(content='" + escape_single_quotes(inner) + "')"
+
+
+def parse_action_to_structure_output(text, factor, origin_resized_height, origin_resized_width, model_type, max_pixels=16384*28*28, min_pixels=100*28*28):
+    """Turn a raw model response into a list of structured actions.
+
+    Args:
+        text: model output containing an ``Action:`` section, optionally
+            preceded by ``Thought:``, ``Reflection:`` / ``Action_Summary:``.
+        factor: divisor applied to box coordinates unless ``model_type`` is
+            ``"qwen25vl"``.
+        origin_resized_height, origin_resized_width: image size passed to
+            ``smart_resize`` for ``"qwen25vl"``; coordinates are then divided
+            by the resized height/width instead of ``factor``.
+        model_type: ``"qwen25vl"`` selects the absolute-coordinate handling.
+        max_pixels, min_pixels: forwarded to ``smart_resize``.
+
+    Returns:
+        A list of dicts with keys ``reflection``, ``thought``, ``action_type``,
+        ``action_inputs`` and ``text``. ``start_box`` / ``end_box`` inputs are
+        stored as the ``str()`` of a four-float list. Actions that cannot be
+        parsed are logged and skipped.
+
+    Raises:
+        AssertionError: ``text`` contains no ``Action:`` section.
+    """
+    text = text.strip()
+    if model_type == "qwen25vl":
+        smart_resize_height, smart_resize_width = smart_resize(origin_resized_height, origin_resized_width, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels)
+
+    # Pick the regular expression for the reasoning part
+    if text.startswith("Reflection:"):
+        thought_pattern = r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)"
+    elif text.startswith("Action_Summary:"):
+        thought_pattern = r"Action_Summary: (.+?)(?=\s*Action:|$)"
+    else:
+        thought_pattern = r"Thought: (.+?)(?=\s*Action:|$)"
+
+    reflection, thought = None, None
+    thought_match = re.search(thought_pattern, text, re.DOTALL)
+    if thought_match:
+        if len(thought_match.groups()) == 1:
+            thought = thought_match.group(1).strip()
+        elif len(thought_match.groups()) == 2:
+            thought = thought_match.group(2).strip()
+            reflection = thought_match.group(1).strip()
+
+    if "Action:" not in text:
+        # Explicit raise instead of ``assert`` so the check survives
+        # ``python -O``; the exception type stays the same for callers.
+        raise AssertionError("model output contains no 'Action:' section")
+    action_block = text.split("Action:")[-1]
+
+    # Several actions may be given, separated by blank lines
+    all_action = []
+    for action_str in action_block.split("\n\n"):
+        if "type(content" in action_str:
+            action_str = _requote_content(action_str, "type")
+        if "finished(content" in action_str:
+            action_str = _requote_content(action_str, "finished")
+        all_action.append(action_str)
+
+    parsed_actions = [parse_action(action.replace("\n","\\n").lstrip()) for action in all_action]
+    actions = []
+    for action_instance, raw_str in zip(parsed_actions, all_action):
+        if action_instance is None:
+            logger.error(f"Action can't parse: {raw_str}")
+            continue
+        action_type = action_instance["function"]
+        params = action_instance["args"]
+
+        action_inputs = {}
+        for param_name, param in params.items():
+            # Skip values parse_action could not read (None) and empty strings
+            if param_name is None or param is None or param == "":
+                continue
+            if not isinstance(param, str):
+                param = str(param)
+            param = param.lstrip()
+            action_inputs[param_name.strip()] = param
+
+            if "start_box" in param_name or "end_box" in param_name:
+                # The prompt shows boxes wrapped in box tokens; drop them
+                # before reading the numbers.
+                ori_box = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
+                # Remove parentheses and split the string by commas
+                numbers = ori_box.replace("(", "").replace(")", "").split(",")
+
+                # Qwen2.5vl outputs absolute coordinates, qwen2vl outputs
+                # relative coordinates scaled by 'factor'
+                if model_type == "qwen25vl":
+                    float_numbers = []
+                    for num_idx, num in enumerate(numbers):
+                        num = float(num)
+                        if (num_idx + 1) % 2 == 0:
+                            float_numbers.append(float(num/smart_resize_height))
+                        else:
+                            float_numbers.append(float(num/smart_resize_width))
+                else:
+                    float_numbers = [float(num) / factor for num in numbers]
+
+                # A point becomes a degenerate box
+                if len(float_numbers) == 2:
+                    float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
+                action_inputs[param_name.strip()] = str(float_numbers)
+
+        actions.append(
+            {
+            "reflection": reflection,
+            "thought": thought,
+            "action_type": action_type,
+            "action_inputs": action_inputs,
+            "text": text
+        })
+    return actions
+
+def _normalize_key(key):
+    """Map model key names (``arrowleft``, ``space``, ...) to pyautogui names."""
+    aliases = {
+        "arrowleft": "left",
+        "arrowright": "right",
+        "arrowup": "up",
+        "arrowdown": "down",
+        "space": " ",
+    }
+    return aliases.get(key, key)
+
+
+def _box_center(box, image_width, image_height):
+    """Return the pixel centre ``(x, y)`` of a relative box, or ``None``.
+
+    ``box`` is a list/tuple or its string form holding ``[x1, y1, x2, y2]`` or
+    ``[x, y]``. The string is read with ``ast.literal_eval`` rather than
+    ``eval`` because it originates from model output.
+    """
+    if isinstance(box, str):
+        try:
+            box = ast.literal_eval(box)
+        except (ValueError, SyntaxError):
+            return None
+    if not isinstance(box, (list, tuple)):
+        return None
+    if len(box) == 4:
+        x1, y1, x2, y2 = box
+    elif len(box) == 2:
+        x1, y1 = box
+        x2, y2 = x1, y1
+    else:
+        return None
+    x = round(float((x1 + x2) / 2) * image_width, 3)
+    y = round(float((y1 + y2) / 2) * image_height, 3)
+    return x, y
+
+
+def parsing_response_to_pyautogui_code(responses, image_height: int, image_width:int, input_swap:bool=True) -> str:
+    """Translate parsed model actions into a pyautogui code string.
+
+    Args:
+        responses: one action dict or a list of them, shaped like
+            {
+                "action_type": "hotkey",
+                "action_inputs": {"key": "ctrl v"},
+                "thought": "...",
+                "text": "<raw model output>",
+            }
+        image_height, image_width: screen size used to turn relative box
+            coordinates into pixels.
+        input_swap: when true, text is pasted through the clipboard
+            (pyperclip + ctrl+v) instead of being typed with pyautogui.write.
+
+    Returns:
+        The generated code, or one of the sentinels "DONE", "FAIL" or "WAIT"
+        for finished / call_user / wait actions. A finished action whose raw
+        text contains a FAILURE_INDICATORS phrase yields "FAIL".
+    """
+
+    pyautogui_code = "import pyautogui\nimport time\n"
+    if isinstance(responses, dict):
+        responses = [responses]
+    for response_id, response in enumerate(responses):
+        observation = response["observation"] if "observation" in response else ""
+        thought = response["thought"] if "thought" in response else ""
+
+        if response_id == 0:
+            pyautogui_code += f"'''\nObservation:\n{observation}\n\nThought:\n{thought}\n'''\n"
+        else:
+            pyautogui_code += "\ntime.sleep(1)\n"
+
+        action_dict = response
+        response_text = action_dict.get("text", "")
+        action_type = action_dict.get("action_type")
+        action_inputs = action_dict.get("action_inputs", {})
+
+        if action_type == "hotkey":
+            # Keys are separated by spaces, e.g. "ctrl c"
+            if "key" in action_inputs:
+                hotkey = action_inputs.get("key", "")
+            else:
+                hotkey = action_inputs.get("hotkey", "")
+
+            if hotkey:
+                convert_keys = [_normalize_key(key) for key in hotkey.split()]
+                pyautogui_code += f"\npyautogui.hotkey({', '.join([repr(k) for k in convert_keys])})"
+
+        elif action_type == "press":
+            if "key" in action_inputs:
+                key_to_press = action_inputs.get("key", "")
+            else:
+                key_to_press = action_inputs.get("press", "")
+
+            # Normalise the key that is actually pressed. The previous code
+            # tested an unrelated, possibly undefined ``hotkey`` variable here.
+            key_to_press = _normalize_key(key_to_press)
+
+            if key_to_press:
+                # Simulate pressing a single key
+                pyautogui_code += f"\npyautogui.press({repr(key_to_press)})"
+
+        elif action_type == "keyup":
+            key_to_up = action_inputs.get("key", "")
+            pyautogui_code += f"\npyautogui.keyUp({repr(key_to_up)})"
+
+        elif action_type == "keydown":
+            key_to_down = action_inputs.get("key", "")
+            pyautogui_code += f"\npyautogui.keyDown({repr(key_to_down)})"
+
+        elif action_type == "type":
+            content = action_inputs.get("content", "")
+            content = escape_single_quotes(content)
+            # A trailing newline (real or written as backslash-n) means
+            # "press enter afterwards".
+            submit = content.endswith("\n") or content.endswith("\\n")
+            # Remove whole trailing newline markers only. rstrip("\\n") strips
+            # a character set and also ate trailing 'n' letters of the text.
+            stripped_content = content
+            while stripped_content.endswith(("\\n", "\n")):
+                if stripped_content.endswith("\\n"):
+                    stripped_content = stripped_content[:-2]
+                else:
+                    stripped_content = stripped_content[:-1]
+            # NOTE(review): the text is embedded in a single-quoted literal, so
+            # backslashes or newlines inside it are interpreted by, or break,
+            # the generated code -- consider repr() after checking consumers.
+            if content:
+                if input_swap:
+                    pyautogui_code += "\nimport pyperclip"
+                    pyautogui_code += f"\npyperclip.copy('{stripped_content}')"
+                    pyautogui_code += "\npyautogui.hotkey('ctrl', 'v')"
+                else:
+                    pyautogui_code += f"\npyautogui.write('{stripped_content}', interval=0.1)"
+                pyautogui_code += "\ntime.sleep(0.5)\n"
+                if submit:
+                    pyautogui_code += "\npyautogui.press('enter')"
+
+        elif action_type in ["drag", "select"]:
+            # Drag from the centre of start_box to the centre of end_box
+            start_box = action_inputs.get("start_box")
+            end_box = action_inputs.get("end_box")
+            if start_box and end_box:
+                start_point = _box_center(start_box, image_width, image_height)
+                end_point = _box_center(end_box, image_width, image_height)
+                if start_point is not None and end_point is not None:
+                    sx, sy = start_point
+                    ex, ey = end_point
+                    pyautogui_code += (
+                        f"\npyautogui.moveTo({sx}, {sy})\n"
+                        f"\npyautogui.dragTo({ex}, {ey}, duration=1.0)\n"
+                    )
+
+        elif action_type == "scroll":
+            start_box = action_inputs.get("start_box")
+            point = _box_center(start_box, image_width, image_height) if start_box else None
+            direction = action_inputs.get("direction", "").lower()
+
+            # NOTE(review): the prompts also offer 'left' and 'right', which
+            # produce no code here.
+            if "up" in direction:
+                clicks = 5
+            elif "down" in direction:
+                clicks = -5
+            else:
+                clicks = None
+
+            if clicks is not None:
+                if point is None:
+                    pyautogui_code += f"\npyautogui.scroll({clicks})"
+                else:
+                    x, y = point
+                    pyautogui_code += f"\npyautogui.scroll({clicks}, x={x}, y={y})"
+
+        elif action_type in ["click", "left_single", "left_double", "right_single", "hover"]:
+            point = _box_center(action_inputs.get("start_box"), image_width, image_height)
+            if point is None:
+                # Missing or malformed box: report it instead of crashing
+                logger.warning(f"[Warning] start_box is None and wired condition:\n{action_inputs}")
+            else:
+                x, y = point
+                if action_type == "left_single" or action_type == "click":
+                    pyautogui_code += f"\npyautogui.click({x}, {y}, button='left')"
+                elif action_type == "left_double":
+                    pyautogui_code += f"\npyautogui.doubleClick({x}, {y}, button='left')"
+                elif action_type == "right_single":
+                    pyautogui_code += f"\npyautogui.click({x}, {y}, button='right')"
+                elif action_type == "hover":
+                    pyautogui_code += f"\npyautogui.moveTo({x}, {y})"
+
+        elif action_type in ["finished"]:
+            pyautogui_code = "DONE"
+            print(f"FINISHED:response_text: {response_text}")
+            print(f"FINISHED:response: {str(response)}")
+            if any(indicator in response_text for indicator in FAILURE_INDICATORS):
+                pyautogui_code = "FAIL"
+        elif action_type in ["wait"]:
+            pyautogui_code = "WAIT"
+
+        elif action_type in ["call_user"]:
+            pyautogui_code = "FAIL"
+        else:
+            pyautogui_code += f"\n# Unrecognized action type: {action_type}"
+
+    return pyautogui_code
+
+def add_box_token(input_string):
+    """Wrap ``start_box`` / ``end_box`` coordinates in box tokens.
+
+    Every ``start_box='(x,y)'`` or ``end_box='(x,y)'`` after an ``Action: ``
+    marker becomes ``...='<|box_start|>(x,y)<|box_end|>'``. A space after the
+    comma is accepted and normalised away. Text without ``Action: `` or
+    without ``start_box=`` is passed through unchanged.
+
+    Returns:
+        A one-element list ``[{"type": "text", "text": <result>}]``.
+    """
+    if "Action: " in input_string and "start_box=" in input_string:
+        # Everything before the first marker is kept as is
+        prefix, *actions = input_string.split("Action: ")
+
+        # Substitute with the same pattern that finds the boxes, so that
+        # "(x, y)" with a space is tagged as well (a literal str.replace of
+        # "(x,y)" silently missed it).
+        box_pattern = re.compile(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'")
+        processed_actions = [
+            box_pattern.sub(r"\1='<|box_start|>(\2,\3)<|box_end|>'", action.strip())
+            for action in actions
+        ]
+
+        final_string = prefix + "Action: " + "\n\n".join(processed_actions)
+    else:
+        final_string = input_string
+    return [{"type": "text", "text": final_string}]
+
+def pil_to_base64(image):
+    """Return the base64 text of ``image``.
+
+    ``bytes`` are encoded as they are; any other object is first saved as PNG
+    into an in-memory buffer through its ``save`` method.
+    """
+    if not isinstance(image, bytes):
+        png_buffer = BytesIO()
+        image.save(png_buffer, format="PNG")
+        image = png_buffer.getvalue()
+    return base64.b64encode(image).decode("utf-8")