From 3db0591868321688685f400e45cf5d26de6cf55a Mon Sep 17 00:00:00 2001 From: David Chang Date: Mon, 18 Mar 2024 17:42:13 +0800 Subject: [PATCH 1/3] ver Mar18th checked Claude agent --- branch-config/filelist | 3 +++ branch_flag | 2 +- mm_agents/agent.py | 7 ++++++- run.py | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/branch-config/filelist b/branch-config/filelist index a34a418..c6aac47 100644 --- a/branch-config/filelist +++ b/branch-config/filelist @@ -5,5 +5,8 @@ evaluation_examples logs mm_agents +run.py +lib_run_single.py +settings.json quick_evaluate.py diff --git a/branch_flag b/branch_flag index 9daeafb..cb4898e 100644 --- a/branch_flag +++ b/branch_flag @@ -1 +1 @@ -test +claude diff --git a/mm_agents/agent.py b/mm_agents/agent.py index ff92673..7a454cc 100644 --- a/mm_agents/agent.py +++ b/mm_agents/agent.py @@ -256,7 +256,6 @@ class PromptAgent: if self.observation_type == "screenshot_a11y_tree": _screenshot = previous_obs["screenshot"] _linearized_accessibility_tree = previous_obs["accessibility_tree"] - logger.debug("LINEAR AT: %s", _linearized_accessibility_tree) messages.append({ "role": "user", @@ -343,6 +342,7 @@ class PromptAgent: if self.observation_type in ["screenshot", "screenshot_a11y_tree"]: base64_image = encode_image(obs["screenshot"]) linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) + logger.debug("LINEAR AT: %s", linearized_accessibility_tree) if self.observation_type == "screenshot_a11y_tree": self.observations.append({ @@ -376,6 +376,7 @@ class PromptAgent: }) elif self.observation_type == "a11y_tree": linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) + logger.debug("LINEAR AT: %s", linearized_accessibility_tree) self.observations.append({ "screenshot": None, @@ -423,6 +424,8 @@ class PromptAgent: # with open("messages.json", "w") as f: # f.write(json.dumps(messages, indent=4)) + 
#logger.info("PROMPT: %s", messages) + response = self.call_llm({ "model": self.model, "messages": messages, @@ -522,6 +525,8 @@ class PromptAgent: claude_messages[1]['content'].insert(0, claude_system_message_item) claude_messages.pop(0) + logger.debug("CLAUDE MESSAGE: %s", repr(claude_messages)) + # headers = { # "x-api-key": os.environ["ANTHROPIC_API_KEY"], # "anthropic-version": "2023-06-01", diff --git a/run.py b/run.py index 5212bc0..2277d11 100644 --- a/run.py +++ b/run.py @@ -140,6 +140,7 @@ def test( env = DesktopEnv( path_to_vm=args.path_to_vm, + snapshot_name="Snapshot 35", action_space=agent.action_space, screen_size=(args.screen_width, args.screen_height), headless=args.headless, From b5d58b8ecd12cf873b240dcbeb86f65a3f3fa80b Mon Sep 17 00:00:00 2001 From: David Chang Date: Tue, 19 Mar 2024 17:43:34 +0800 Subject: [PATCH 2/3] ver Mar19th a tiny fix --- mm_agents/accessibility_tree_wrap/heuristic_retrieve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index e37f614..9611ea3 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -40,7 +40,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool: or node.tag.endswith("textfield")\ or node.tag.endswith("textarea")\ or node.tag.endswith("menu")\ - or node.tag in [ "alert", "canvas", "check-box" + or node.tag in { "alert", "canvas", "check-box" , "combo-box", "entry", "icon" , "image", "paragraph", "scroll-bar" , "section", "slider", "static" @@ -48,7 +48,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool: , "netuiribbontab", "start", "trayclockwclass" , "traydummysearchcontrol", "uiimage", "uiproperty" , "uiribboncommandbar" - ] + } keeps = keeps and ( platform=="ubuntu"\ and node.get("{{{:}}}showing".format(state_ns), "false")=="true"\ and 
node.get("{{{:}}}visible".format(state_ns), "false")=="true"\ From 4df088e2ad982f857fc5276b08ac3d57eca971bc Mon Sep 17 00:00:00 2001 From: David Chang Date: Tue, 19 Mar 2024 18:41:55 +0800 Subject: [PATCH 3/3] ver Mar19thv2 supplemented at info back for som setting --- .../heuristic_retrieve.py | 35 +++++++++++--- mm_agents/agent.py | 48 ++++++++++++------- mm_agents/prompts.py | 2 +- 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index 9611ea3..934d8fd 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET from PIL import Image, ImageDraw, ImageFont -from typing import Tuple +from typing import Tuple, List def find_leaf_nodes(xlm_file_str): if not xlm_file_str: @@ -66,7 +66,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool: coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)")) sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)")) - keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0 + keeps = keeps and coordinates[0]>=0 and coordinates[1]>=0 and sizes[0]>0 and sizes[1]>0 return keeps def filter_nodes(root: ET, platform="ubuntu", check_image=False): @@ -86,6 +86,7 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): draw = ImageDraw.Draw(image) marks = [] drew_nodes = [] + text_informations: List[str] = ["index\ttag\tname\ttext"] try: # Adjust the path to the font file you have or use a default one @@ -135,18 +136,38 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): #draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black') draw.rectangle(text_bbox, fill='black') draw.text(text_position, 
str(index), font=font, anchor="lb", fill="white") - index += 1 # each mark is an x, y, w, h tuple marks.append([coords[0], coords[1], size[0], size[1]]) drew_nodes.append(_node) + if _node.text: + node_text = ( _node.text if '"' not in _node.text\ + else '"{:}"'.format(_node.text.replace('"', '""')) + ) + elif _node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \ + and _node.get("{uri:deskat:value.at-spi.gnome.org}value"): + node_text: str = _node.get("{uri:deskat:value.at-spi.gnome.org}value") + node_text = (node_text if '"' not in node_text\ + else '"{:}"'.format(node_text.replace('"', '""')) + ) + else: + node_text = '""' + text_information: str = "{:d}\t{:}\t{:}\t{:}"\ + .format( index, _node.tag + , _node.get("name", "") + , node_text + ) + text_informations.append(text_information) + + index += 1 + except ValueError: pass # Save the result image.save(output_image_file_path) - return marks, drew_nodes + return marks, drew_nodes, "\n".join(text_informations) def print_nodes_with_indent(nodes, indent=0): @@ -157,12 +178,12 @@ def print_nodes_with_indent(nodes, indent=0): if __name__ == '__main__': import json - with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f: + with open('3.xml', 'r', encoding='utf-8') as f: xml_file_str = f.read() filtered_nodes = filter_nodes(ET.fromstring(xml_file_str)) print(len(filtered_nodes)) - masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png' - , 'selection_sorted(imaged).ai.png' + masks = draw_bounding_boxes( filtered_nodes, '3.a.png' + , '3.png' ) # print(masks) diff --git a/mm_agents/agent.py b/mm_agents/agent.py index 9613f44..f2d4b5c 100644 --- a/mm_agents/agent.py +++ b/mm_agents/agent.py @@ -37,27 +37,36 @@ def linearize_accessibility_tree(accessibility_tree): # leaf_nodes = find_leaf_nodes(accessibility_tree) filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree)) - linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left 
x&y)\tsize (w&h)\n" + linearized_accessibility_tree = ["tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)"] # Linearize the accessibility tree nodes into a table format for node in filtered_nodes: - linearized_accessibility_tree += node.tag + "\t" - linearized_accessibility_tree += node.attrib.get('name') + "\t" + #linearized_accessibility_tree += node.tag + "\t" + #linearized_accessibility_tree += node.attrib.get('name') + "\t" if node.text: - linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format( - node.text.replace('"', '""'))) + "\t" + text = ( node.text if '"' not in node.text\ + else '"{:}"'.format(node.text.replace('"', '""')) + ) elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \ and node.get("{uri:deskat:value.at-spi.gnome.org}value"): text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value") - linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format( - text.replace('"', '""'))) + "\t" + text = (text if '"' not in text\ + else '"{:}"'.format(text.replace('"', '""')) + ) else: - linearized_accessibility_tree += '""\t' - linearized_accessibility_tree += node.attrib.get( - '{uri:deskat:component.at-spi.gnome.org}screencoord', "") + "\t" - linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n" + text = '""' + #linearized_accessibility_tree += node.attrib.get( + #, "") + "\t" + #linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n" + linearized_accessibility_tree.append( + "{:}\t{:}\t{:}\t{:}\t{:}".format( + node.tag, node.get("name", ""), text + , node.get('{uri:deskat:component.at-spi.gnome.org}screencoord', "") + , node.get('{uri:deskat:component.at-spi.gnome.org}size', "") + ) + ) - return linearized_accessibility_tree + return "\n".join(linearized_accessibility_tree) def tag_screenshot(screenshot, accessibility_tree): @@ -68,9 +77,9 @@ def 
tag_screenshot(screenshot, accessibility_tree): # nodes = filter_nodes(find_leaf_nodes(accessibility_tree)) nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True) # Make tag screenshot - marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path) + marks, drew_nodes, element_list = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path) - return marks, drew_nodes, tagged_screenshot_file_path + return marks, drew_nodes, tagged_screenshot_file_path, element_list def parse_actions_from_string(input_string): @@ -395,11 +404,13 @@ class PromptAgent: }) elif self.observation_type == "som": # Add som to the screenshot - masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"]) + masks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(obs["screenshot"], obs["accessibility_tree"]) base64_image = encode_image(tagged_screenshot) + logger.debug("LINEAR AT: %s", linearized_accessibility_tree) self.observations.append({ - "screenshot": base64_image + "screenshot": base64_image, + "accessibility_tree": linearized_accessibility_tree }) messages.append({ @@ -407,7 +418,8 @@ class PromptAgent: "content": [ { "type": "text", - "text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?" + "text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree) }, { "type": "image_url", @@ -774,7 +786,7 @@ class PromptAgent: if response.status_code == HTTPStatus.OK: try: return response.json()['output']['choices'][0]['message']['content'] - except Exception as e: + except Exception: return "" else: print(response.code) # The error code. 
diff --git a/mm_agents/prompts.py index 462aac7..c609a66 100644 --- a/mm_agents/prompts.py +++ b/mm_agents/prompts.py @@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act SYS_PROMPT_IN_SOM_OUT_TAG = """ You are an agent which follow my instruction and perform desktop computer tasks as instructed. You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. -For each step, you will get an observation of the desktop by a screenshot with interact-able elements marked with numerical tags. And you will predict the action of the computer based on the image. +For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and text information. You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You can replace x, y in the code with the tag of the element you want to operate with. such as: