Modify the logic of SoM agent

2024-02-01 18:58:22 +08:00
parent c31c9f4e7d
commit 32bcdd0937
2 changed files with 40 additions and 23 deletions
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -150,24 +150,21 @@ def parse_code_from_string(input_string):

 def parse_code_from_som_string(input_string, masks):
    # parse the output string by masks
-    mappings = []
+    tag_vars = ""
    for i, mask in enumerate(masks):
        x, y, w, h = mask
-        mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2))))
-
-    def replace_tags_with_mappings(text, mappings):
-        pattern = r'tag#\d+'
-        matches = re.findall(pattern, text)
-
-        for match in matches:
-            for mapping in mappings:
-                if match == mapping[0]:
-                    text = text.replace(match, mapping[1])
-                    break
-                logger.error("Predicting the tag with index {} failed.".format(match))
-                return ""
+        tag_vars += "tag_" + str(i + 1) + "=" + "({}, {})".format(int(x + w // 2), int(y + h // 2))
+        tag_vars += "\n"

    actions = parse_code_from_string(input_string)
+
+    for i, action in enumerate(actions):
+        if action.strip() in ['WAIT', 'DONE', 'FAIL']:
+            pass
+        else:
+            action = tag_vars + action
+            actions[i] = action
+
    return actions


@@ -561,19 +558,39 @@ class GPT4v_Agent:
                return response.json()['choices'][0]['message']['content']

        elif self.model.startswith("mistral"):
+            print("call mistral")
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]

+            misrtal_messages = []
+
+            for i, message in enumerate(messages):
+                mistral_message = {
+                    "role": message["role"],
+                    "content": []
+                }
+
+                for part in message["content"]:
+                    mistral_message['content'] = part['text'] if part['type'] == "text" else None
+
+                misrtal_messages.append(mistral_message)
+
+            # Our Mistral endpoint does not support system messages, so prepend the system prompt to the first user message
+            if misrtal_messages[0]['role'] == "system":
+                misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content']
+                misrtal_messages.pop(0)
+
            openai.api_base = "http://localhost:8000/v1"
            openai.api_key = "test"
            response = openai.ChatCompletion.create(
-                messages=messages,
-                model="Mixtral-8x7B-Instruct-v0.1",
-                max_tokens=max_tokens
+                messages=misrtal_messages,
+                model="Mixtral-8x7B-Instruct-v0.1"
            )
+
            try:
                return response['choices'][0]['message']['content']
            except Exception as e:
+                print("Failed to call LLM: " + str(e))
                return ""

        elif self.model.startswith("gemini"):
--- a/mm_agents/prompts.py
+++ b/mm_agents/prompts.py
@@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an
 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as:
 ```python
-pyautogui.moveTo(tag#3)
-pyautogui.click(tag#2)
-pyautogui.dragTo(tag#1, button='left')
+pyautogui.moveTo(tag_3)
+pyautogui.click(tag_2)
+pyautogui.dragTo(tag_1, button='left')
 ```
 When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. 
 But you should be careful to ensure that the coordinates are correct.
@@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """
 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as:
 ```python
-pyautogui.moveTo(tag#3)
-pyautogui.click(tag#2)
-pyautogui.dragTo(tag#1, button='left')
+pyautogui.moveTo(tag_3)
+pyautogui.click(tag_2)
+pyautogui.dragTo(tag_1, button='left')
 ```
 When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. 
 But you should be careful to ensure that the coordinates are correct.