From 5eb541718875dc603f7ab244824792ed3679a74d Mon Sep 17 00:00:00 2001 From: eun2ce Date: Fri, 26 Sep 2025 19:25:28 +0900 Subject: [PATCH] fix #210: add a11y_tree support to UITARSAgent (#346) --- mm_agents/uitars_agent.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm_agents/uitars_agent.py b/mm_agents/uitars_agent.py index a36c628..165562f 100644 --- a/mm_agents/uitars_agent.py +++ b/mm_agents/uitars_agent.py @@ -506,17 +506,18 @@ class UITARSAgent: if last_action_after_obs is not None and self.infer_mode == "double_image": self.history_images.append(last_action_after_obs["screenshot"]) - self.history_images.append(obs["screenshot"]) - if self.observation_type in ["screenshot", "screenshot_a11y_tree"]: - base64_image = obs["screenshot"] + self.history_images.append(obs["screenshot"]) + + if self.observation_type in ["screenshot", "screenshot_a11y_tree", "a11y_tree"]: + base64_image = obs["screenshot"] if self.observation_type in ["screenshot", "screenshot_a11y_tree"] else None try: linearized_accessibility_tree = ( linearize_accessibility_tree( accessibility_tree=obs["accessibility_tree"], platform=self.platform, ) - if self.observation_type == "screenshot_a11y_tree" + if self.observation_type in ["screenshot_a11y_tree", "a11y_tree"] else None ) except: @@ -535,7 +536,14 @@ class UITARSAgent: "accessibility_tree": linearized_accessibility_tree, } ) - else: + elif self.observation_type == "a11y_tree": + self.observations.append( + { + "screenshot": None, + "accessibility_tree": linearized_accessibility_tree, + } + ) + else: # screenshot self.observations.append( {"screenshot": base64_image, "accessibility_tree": None} ) @@ -760,4 +768,4 @@ class UITARSAgent: self.actions = [] self.observations = [] self.history_images = [] - self.history_responses = [] \ No newline at end of file + self.history_responses = []