# Prefix -> namespace URI table for the accessibility-tree XML.  It is handed
# to lxml as the ``nsmap`` of generated elements and is also used to build the
# ``{uri}local`` qualified attribute names.  The ``win`` entry carries the
# Windows (UIA) specific attributes.
_accessibility_ns_map = {
    "st": "uri:deskat:state.at-spi.gnome.org",
    "attr": "uri:deskat:attributes.at-spi.gnome.org",
    "cp": "uri:deskat:component.at-spi.gnome.org",
    "doc": "uri:deskat:document.at-spi.gnome.org",
    "docattr": "uri:deskat:attributes.document.at-spi.gnome.org",
    "txt": "uri:deskat:text.at-spi.gnome.org",
    "val": "uri:deskat:value.at-spi.gnome.org",
    "act": "uri:deskat:action.at-spi.gnome.org",
    "win": "uri:deskat:uia.windows.microsoft.org",
}
# HYPERPARAMETER: traversal limits for the Windows UIA tree dump.
_PYWINAUTO_MAX_DEPTH = 50
_PYWINAUTO_MAX_WIDTH = 2048

# Optional ``is_<state>()`` probes, in the order their attributes are emitted.
# Not every wrapper class offers every probe, and some raise for controls that
# do not support the underlying pattern, so these are queried best-effort.
_PYWINAUTO_OPTIONAL_STATES = (
    "minimized",
    "maximized",
    "normal",
    "unicode",
    "collapsed",
    "checkable",
    "checked",
    "focused",
    "keyboard_focused",
    "selected",
    "selection_required",
    "pressable",
    "pressed",
    "expanded",
    "editable",
)

# (attribute local name, getter names in priority order).  The first getter
# that exists and succeeds provides the attribute value.
_PYWINAUTO_VALUE_GETTERS = (
    ("step", ("get_step",)),
    ("value", ("get_value", "get_position", "value")),
    ("min", ("min_value", "get_range_min")),
    ("max", ("max_value", "get_range_max")),
)

# Sentinel meaning "the method is missing or raised"; distinct from a
# legitimate ``None`` return value.
_PYWINAUTO_MISSING = object()


def _pywinauto_qname(prefix: str, local_name: str) -> str:
    """Build the ``{namespace-uri}local`` attribute name for *prefix*."""
    return "{{{:}}}{:}".format(_accessibility_ns_map[prefix], local_name)


def _pywinauto_call(node: Any, method_name: str) -> Any:
    """Call ``node.<method_name>()`` best-effort.

    Returns the call result, or ``_PYWINAUTO_MISSING`` when the attribute is
    absent, not callable, or the call raises.
    """
    method = getattr(node, method_name, None)
    if not callable(method):
        return _PYWINAUTO_MISSING
    try:
        return method()
    except Exception:
        # Deliberately best-effort: one unsupported probe must not abort the
        # dump of the whole tree.  ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        logger.debug("pywinauto: %s() failed on %s", method_name, type(node).__name__, exc_info=True)
        return _PYWINAUTO_MISSING


def _pywinauto_role_name(class_name: Optional[str]) -> str:
    """Turn a window class name into a string usable as an XML tag name.

    Characters other than letters, digits, ``_`` and ``-`` become ``-``.  An
    empty result becomes ``"unknown"``.  A result that does not start with a
    letter or ``_`` (e.g. ``#32770`` -> ``-32770``) is prefixed with ``tag``,
    because such a string is not a valid XML element name.
    """
    lowered = (class_name or "").lower().replace(" ", "-")
    cleaned = "".join(
        ch if (ch.isidentifier() or ch == "-" or ch.isalnum()) else "-"
        for ch in lowered
    )
    if cleaned.strip() == "":
        return "unknown"
    if not (cleaned[0].isalpha() or cleaned[0] == "_"):
        cleaned = "tag" + cleaned
    return cleaned


def _create_pywinauto_node(node: BaseWrapper, depth: int = 0, flag: Optional[str] = None) -> _Element:
    """Convert a pywinauto wrapper and its descendants into an XML element.

    The element's tag is derived from ``node.class_name()``; its attributes
    hold the name, state flags, screen rectangle, value information and the
    wrapper class of the control.  Children are converted recursively.

    Args:
        node: the pywinauto wrapper to convert.
        depth: current recursion depth; at ``_PYWINAUTO_MAX_DEPTH`` or deeper
            the node is emitted without children.
        flag: not used by this function; forwarded unchanged to the recursive
            calls (kept for signature parity with ``_create_atspi_node``).

    Returns:
        The lxml element for *node*, with at most ``_PYWINAUTO_MAX_WIDTH``
        child elements appended.
    """
    # function _create_pywinauto_node {{{ #
    name = node.element_info.name
    # lxml only accepts string attribute values, so a missing name becomes "".
    attribute_dict: Dict[str, Any] = {"name": "" if name is None else str(name)}

    # States {{{ #
    # These three are queried unguarded, exactly as before.
    for state in ("enabled", "visible", "active"):
        attribute_dict[_pywinauto_qname("st", state)] = str(getattr(node, "is_" + state)()).lower()

    for state in _PYWINAUTO_OPTIONAL_STATES:
        result = _pywinauto_call(node, "is_" + state)
        if result is not _PYWINAUTO_MISSING:
            attribute_dict[_pywinauto_qname("st", state)] = str(result).lower()
    # }}} States #

    # Component {{{ #
    rectangle = node.rectangle()
    attribute_dict[_pywinauto_qname("cp", "screencoord")] = "({:d}, {:d})".format(rectangle.left, rectangle.top)
    attribute_dict[_pywinauto_qname("cp", "size")] = "({:d}, {:d})".format(rectangle.width(), rectangle.height())
    # }}} Component #

    # Text {{{ #
    # The window text is only emitted when it adds something beyond the name.
    text = node.window_text()
    if text == attribute_dict["name"]:
        text = ""
    # }}} Text #

    # Selection {{{ #
    if hasattr(node, "select"):
        attribute_dict["selection"] = "true"
    # }}} Selection #

    # Value {{{ #
    for local_name, getter_names in _PYWINAUTO_VALUE_GETTERS:
        for getter_name in getter_names:
            result = _pywinauto_call(node, getter_name)
            if result is not _PYWINAUTO_MISSING:
                attribute_dict[_pywinauto_qname("val", local_name)] = str(result)
                break
    # }}} Value #

    # Emit the dotted class path rather than ``str(type(node))``: the latter
    # renders as "<class '...EditWrapper'>", which defeats consumers that
    # match on the class-name suffix (e.g. ``.endswith("EditWrapper")``).
    wrapper_class = type(node)
    attribute_dict[_pywinauto_qname("win", "class")] = "{:}.{:}".format(
        wrapper_class.__module__, wrapper_class.__qualname__
    )

    xml_node = lxml.etree.Element(
        _pywinauto_role_name(node.class_name()),
        attrib=attribute_dict,
        nsmap=_accessibility_ns_map,
    )
    if text:
        xml_node.text = text

    if depth >= _PYWINAUTO_MAX_DEPTH:
        logger.warning("Max depth reached")
        return xml_node

    for index, child in enumerate(node.children()):
        if index >= _PYWINAUTO_MAX_WIDTH:
            logger.warning("Max width reached")
            break
        xml_node.append(_create_pywinauto_node(child, depth + 1, flag))
    return xml_node
    # }}} function _create_pywinauto_node #