def parse_code_from_som_string(input_string, masks):
    """Parse actions from a model response and bind ``tag_<n>`` variables.

    Every mask ``(x, y, w, h)`` becomes one Python assignment line of the
    form ``tag_<n>=(cx, cy)`` where ``n`` is the 1-based position of the mask
    and ``cx, cy`` are ``int(x + w // 2), int(y + h // 2)``. The resulting
    assignment block is prepended to each action returned by
    ``parse_code_from_string`` so that action code can reference the tags by
    name (e.g. ``pyautogui.click(tag_2)``). Actions that are exactly one of
    the control tokens ``WAIT``, ``DONE`` or ``FAIL`` (ignoring surrounding
    whitespace) are passed through unchanged.

    Args:
        input_string: Raw text handed to ``parse_code_from_string``
            (presumably the model's reply -- confirm against the caller).
        masks: Iterable of 4-item ``(x, y, w, h)`` entries.

    Returns:
        A new list of action strings, in the order produced by
        ``parse_code_from_string``.
    """
    # Build the prelude once with a single join instead of repeated `+=`.
    tag_vars = "".join(
        "tag_{}=({}, {})\n".format(index, int(x + w // 2), int(y + h // 2))
        for index, (x, y, w, h) in enumerate(masks, start=1)
    )

    # Control tokens are not executable code, so they must stay bare.
    control_tokens = ("WAIT", "DONE", "FAIL")

    return [
        action if action.strip() in control_tokens else tag_vars + action
        for action in parse_code_from_string(input_string)
    ]
"http://localhost:8000/v1" openai.api_key = "test" response = openai.ChatCompletion.create( - messages=messages, - model="Mixtral-8x7B-Instruct-v0.1", - max_tokens=max_tokens + messages=misrtal_messages, + model="Mixtral-8x7B-Instruct-v0.1" ) + try: return response['choices'][0]['message']['content'] except Exception as e: + print("Failed to call LLM: " + str(e)) return "" elif self.model.startswith("gemini"): diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py index 85295de..e23a211 100644 --- a/mm_agents/prompts.py +++ b/mm_agents/prompts.py @@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You can replace x, y in the code with the tag of the element you want to operate with. such as: ```python -pyautogui.moveTo(tag#3) -pyautogui.click(tag#2) -pyautogui.dragTo(tag#1, button='left') +pyautogui.moveTo(tag_3) +pyautogui.click(tag_2) +pyautogui.dragTo(tag_1, button='left') ``` When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. But you should be careful to ensure that the coordinates are correct. @@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """ You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You can replace x, y in the code with the tag of the element you want to operate with. 
such as: ```python -pyautogui.moveTo(tag#3) -pyautogui.click(tag#2) -pyautogui.dragTo(tag#1, button='left') +pyautogui.moveTo(tag_3) +pyautogui.click(tag_2) +pyautogui.dragTo(tag_1, button='left') ``` When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. But you should be careful to ensure that the coordinates are correct.