Modify the logic of SoM agent

This commit is contained in:
Timothyxxx
2024-02-01 18:58:22 +08:00
parent c31c9f4e7d
commit 32bcdd0937
2 changed files with 40 additions and 23 deletions

View File

@@ -150,24 +150,21 @@ def parse_code_from_string(input_string):
def parse_code_from_som_string(input_string, masks): def parse_code_from_som_string(input_string, masks):
# parse the output string by masks # parse the output string by masks
mappings = [] tag_vars = ""
for i, mask in enumerate(masks): for i, mask in enumerate(masks):
x, y, w, h = mask x, y, w, h = mask
mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2)))) tag_vars += "tag_" + str(i + 1) + "=" + "({}, {})".format(int(x + w // 2), int(y + h // 2))
tag_vars += "\n"
def replace_tags_with_mappings(text, mappings):
pattern = r'tag#\d+'
matches = re.findall(pattern, text)
for match in matches:
for mapping in mappings:
if match == mapping[0]:
text = text.replace(match, mapping[1])
break
logger.error("Predicting the tag with index {} failed.".format(match))
return ""
actions = parse_code_from_string(input_string) actions = parse_code_from_string(input_string)
for i, action in enumerate(actions):
if action.strip() in ['WAIT', 'DONE', 'FAIL']:
pass
else:
action = tag_vars + action
actions[i] = action
return actions return actions
@@ -561,19 +558,39 @@ class GPT4v_Agent:
return response.json()['choices'][0]['message']['content'] return response.json()['choices'][0]['message']['content']
elif self.model.startswith("mistral"): elif self.model.startswith("mistral"):
print("call mistral")
messages = payload["messages"] messages = payload["messages"]
max_tokens = payload["max_tokens"] max_tokens = payload["max_tokens"]
misrtal_messages = []
for i, message in enumerate(messages):
mistral_message = {
"role": message["role"],
"content": []
}
for part in message["content"]:
mistral_message['content'] = part['text'] if part['type'] == "text" else None
misrtal_messages.append(mistral_message)
# Our Mistral endpoint does not support system messages, so we prepend the system message to the first user message
if misrtal_messages[0]['role'] == "system":
misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content']
misrtal_messages.pop(0)
openai.api_base = "http://localhost:8000/v1" openai.api_base = "http://localhost:8000/v1"
openai.api_key = "test" openai.api_key = "test"
response = openai.ChatCompletion.create( response = openai.ChatCompletion.create(
messages=messages, messages=misrtal_messages,
model="Mixtral-8x7B-Instruct-v0.1", model="Mixtral-8x7B-Instruct-v0.1"
max_tokens=max_tokens
) )
try: try:
return response['choices'][0]['message']['content'] return response['choices'][0]['message']['content']
except Exception as e: except Exception as e:
print("Failed to call LLM: " + str(e))
return "" return ""
elif self.model.startswith("gemini"): elif self.model.startswith("gemini"):

View File

@@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
You can replace x, y in the code with the tag of the element you want to operate with. such as: You can replace x, y in the code with the tag of the element you want to operate with. such as:
```python ```python
pyautogui.moveTo(tag#3) pyautogui.moveTo(tag_3)
pyautogui.click(tag#2) pyautogui.click(tag_2)
pyautogui.dragTo(tag#1, button='left') pyautogui.dragTo(tag_1, button='left')
``` ```
When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly.
But you should be careful to ensure that the coordinates are correct. But you should be careful to ensure that the coordinates are correct.
@@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
You can replace x, y in the code with the tag of the element you want to operate with. such as: You can replace x, y in the code with the tag of the element you want to operate with. such as:
```python ```python
pyautogui.moveTo(tag#3) pyautogui.moveTo(tag_3)
pyautogui.click(tag#2) pyautogui.click(tag_2)
pyautogui.dragTo(tag#1, button='left') pyautogui.dragTo(tag_1, button='left')
``` ```
When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly.
But you should be careful to ensure that the coordinates are correct. But you should be careful to ensure that the coordinates are correct.