Merge remote-tracking branch 'origin/main'

# Conflicts:
#   mm_agents/agent.py
#   run.py
2024-03-15 21:10:32 +08:00
parent 5cbf1b28ca 1789a28657
commit 4db207fc27
11 changed files with 215 additions and 85 deletions
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -55,12 +55,12 @@ def judge_node(node: ET, platform="ubuntu") -> bool:
                     or platform=="windows"\
                        and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
                      )\
-                    and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
-                       or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
-                       or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
-                       or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
-                        )\
-                    and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)
+                  and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
+                     or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
+                     or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
+                     or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
+                      )\
+                  and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)

    coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
    sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -5,11 +5,13 @@ import os
 import re
 import time
 import uuid
+import openai
 import xml.etree.ElementTree as ET
 from http import HTTPStatus
 from io import BytesIO
 from typing import Dict, List
-
+from google.api_core.exceptions import InvalidArgument
+import backoff
 import dashscope
 import google.generativeai as genai
 import requests
@@ -22,6 +24,8 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S
    SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
    SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT

+# todo: cross-check with visualwebarena
+
 logger = logging.getLogger("desktopenv.agent")


@@ -506,18 +510,25 @@ class PromptAgent:
        try:
            actions = self.parse_actions(response, masks)
            self.thoughts.append(response)
-        except Exception as e:
+        except ValueError as e:
            print("Failed to parse action from response", e)
            actions = None
            self.thoughts.append("")

        return actions

-    # @backoff.on_exception(
-    #     backoff.expo,
-    #     (Exception),
-    #     max_tries=5
-    # )
+    @backoff.on_exception(
+        backoff.expo,
+        # Add further model-specific exception types here as needed, but never
+        # add the generic "Exception": it must not be retried here, because it
+        # is caught by the outer caller to ensure each example won't exceed the time limit.
+        (openai.RateLimitError,
+        openai.BadRequestError,
+        openai.InternalServerError,
+        InvalidArgument),
+        max_tries=5
+    )
+
    def call_llm(self, payload):

        if self.model.startswith("gpt"):
@@ -525,7 +536,7 @@ class PromptAgent:
                "Content-Type": "application/json",
                "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
            }
-            logger.info("Generating content with GPT model: %s", self.model)
+            # logger.info("Generating content with GPT model: %s", self.model)
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,