Modify the logic of SoM agent
This commit is contained in:
@@ -150,24 +150,21 @@ def parse_code_from_string(input_string):
|
||||
|
||||
def parse_code_from_som_string(input_string, masks):
    """Parse actions from a model response and bind SoM tags to coordinates.

    Every mask is unpacked as ``(x, y, w, h)``. For the i-th mask (1-based)
    one assignment line ``tag_i=(cx, cy)`` is generated, where
    ``cx = int(x + w // 2)`` and ``cy = int(y + h // 2)``. The generated lines
    are prepended to each parsed action, so an action may refer to an element
    as ``tag_i`` instead of spelling out its coordinates.

    Args:
        input_string: Raw response text; handed to ``parse_code_from_string``
            to extract the individual actions.
        masks: Iterable of 4-item ``(x, y, w, h)`` boxes, in tag order.

    Returns:
        The list produced by ``parse_code_from_string``, in which every action
        other than ``WAIT``, ``DONE`` or ``FAIL`` (compared after stripping
        whitespace) has the tag assignments prepended.
    """
    # One "tag_<n>=(cx, cy)" line per mask; tags are numbered from 1.
    tag_vars = "".join(
        "tag_{}=({}, {})\n".format(index, int(x + w // 2), int(y + h // 2))
        for index, (x, y, w, h) in enumerate(masks, start=1)
    )

    actions = parse_code_from_string(input_string)

    # Control keywords are not code, so they must stay untouched; everything
    # else gets the tag definitions in front of it.
    control_actions = ('WAIT', 'DONE', 'FAIL')
    for i, action in enumerate(actions):
        if action.strip() not in control_actions:
            actions[i] = tag_vars + action

    return actions
|
||||
|
||||
|
||||
@@ -561,19 +558,39 @@ class GPT4v_Agent:
|
||||
return response.json()['choices'][0]['message']['content']
|
||||
|
||||
elif self.model.startswith("mistral"):
|
||||
print("call mistral")
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
|
||||
misrtal_messages = []
|
||||
|
||||
for i, message in enumerate(messages):
|
||||
mistral_message = {
|
||||
"role": message["role"],
|
||||
"content": []
|
||||
}
|
||||
|
||||
for part in message["content"]:
|
||||
mistral_message['content'] = part['text'] if part['type'] == "text" else None
|
||||
|
||||
misrtal_messages.append(mistral_message)
|
||||
|
||||
# Our Mistral endpoint does not support system messages, so we prepend the system message to the first user message
|
||||
if misrtal_messages[0]['role'] == "system":
|
||||
misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content']
|
||||
misrtal_messages.pop(0)
|
||||
|
||||
openai.api_base = "http://localhost:8000/v1"
|
||||
openai.api_key = "test"
|
||||
response = openai.ChatCompletion.create(
|
||||
messages=messages,
|
||||
model="Mixtral-8x7B-Instruct-v0.1",
|
||||
max_tokens=max_tokens
|
||||
messages=misrtal_messages,
|
||||
model="Mixtral-8x7B-Instruct-v0.1"
|
||||
)
|
||||
|
||||
try:
|
||||
return response['choices'][0]['message']['content']
|
||||
except Exception as e:
|
||||
print("Failed to call LLM: " + str(e))
|
||||
return ""
|
||||
|
||||
elif self.model.startswith("gemini"):
|
||||
|
||||
@@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an
|
||||
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
|
||||
You can replace x, y in the code with the tag of the element you want to operate with. such as:
|
||||
```python
|
||||
pyautogui.moveTo(tag#3)
|
||||
pyautogui.click(tag#2)
|
||||
pyautogui.dragTo(tag#1, button='left')
|
||||
pyautogui.moveTo(tag_3)
|
||||
pyautogui.click(tag_2)
|
||||
pyautogui.dragTo(tag_1, button='left')
|
||||
```
|
||||
When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly.
|
||||
But you should be careful to ensure that the coordinates are correct.
|
||||
@@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """
|
||||
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
|
||||
You can replace x, y in the code with the tag of the element you want to operate with. such as:
|
||||
```python
|
||||
pyautogui.moveTo(tag#3)
|
||||
pyautogui.click(tag#2)
|
||||
pyautogui.dragTo(tag#1, button='left')
|
||||
pyautogui.moveTo(tag_3)
|
||||
pyautogui.click(tag_2)
|
||||
pyautogui.dragTo(tag_1, button='left')
|
||||
```
|
||||
When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly.
|
||||
But you should be careful to ensure that the coordinates are correct.
|
||||
|
||||
Reference in New Issue
Block a user