flsol demo: fix top_p/claude/gemini, force coordinates, add reflection comments, screenshot mode
This commit is contained in:
@@ -752,7 +752,6 @@ class PromptAgent:
|
||||
elif self.model.startswith("claude"):
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
top_p = payload["top_p"]
|
||||
temperature = payload["temperature"]
|
||||
|
||||
claude_messages = []
|
||||
@@ -796,11 +795,10 @@ class PromptAgent:
|
||||
"max_tokens": max_tokens,
|
||||
"messages": claude_messages,
|
||||
"temperature": temperature,
|
||||
"top_p": top_p
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
"https://api.apiyi.com/v1/messages",
|
||||
os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1").rstrip("/") + "/messages",
|
||||
headers=headers,
|
||||
json=payload
|
||||
)
|
||||
@@ -816,7 +814,7 @@ class PromptAgent:
|
||||
elif self.model.startswith("mistral"):
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
top_p = payload["top_p"]
|
||||
top_p = payload.get("top_p", 0.9)
|
||||
temperature = payload["temperature"]
|
||||
|
||||
assert self.observation_type in pure_text_settings, f"The model {self.model} can only support text-based input, please consider change based model or settings"
|
||||
@@ -871,7 +869,7 @@ class PromptAgent:
|
||||
# THUDM/cogagent-chat-hf
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
top_p = payload["top_p"]
|
||||
top_p = payload.get("top_p", 0.9)
|
||||
temperature = payload["temperature"]
|
||||
|
||||
cog_messages = []
|
||||
@@ -920,7 +918,7 @@ class PromptAgent:
|
||||
elif self.model in ["gemini-pro", "gemini-pro-vision"]:
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
top_p = payload["top_p"]
|
||||
top_p = payload.get("top_p", 0.9)
|
||||
temperature = payload["temperature"]
|
||||
|
||||
if self.model == "gemini-pro":
|
||||
@@ -989,10 +987,10 @@ class PromptAgent:
|
||||
)
|
||||
return response.text
|
||||
|
||||
elif self.model.startswith("gemini"):
|
||||
elif self.model in ["gemini-pro", "gemini-pro-vision", "gemini-1.5-pro", "gemini-1.5-flash"]:
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
top_p = payload["top_p"]
|
||||
top_p = payload.get("top_p", 0.9)
|
||||
temperature = payload["temperature"]
|
||||
|
||||
gemini_messages = []
|
||||
@@ -1068,7 +1066,7 @@ class PromptAgent:
|
||||
elif self.model == "llama3-70b":
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
top_p = payload["top_p"]
|
||||
top_p = payload.get("top_p", 0.9)
|
||||
temperature = payload["temperature"]
|
||||
|
||||
assert self.observation_type in pure_text_settings, f"The model {self.model} can only support text-based input, please consider change based model or settings"
|
||||
@@ -1121,7 +1119,7 @@ class PromptAgent:
|
||||
elif self.model.startswith("qwen"):
|
||||
messages = payload["messages"]
|
||||
max_tokens = payload["max_tokens"]
|
||||
top_p = payload["top_p"]
|
||||
top_p = payload.get("top_p", 0.9)
|
||||
temperature = payload["temperature"]
|
||||
|
||||
qwen_messages = []
|
||||
@@ -1200,7 +1198,21 @@ class PromptAgent:
|
||||
return ""
|
||||
|
||||
else:
|
||||
raise ValueError("Invalid model: " + self.model)
|
||||
# Fallback: openai-compatible for any unrecognized model (e.g. gemini-3.1 via apiyi)
|
||||
base_url = os.environ.get('OPENAI_BASE_URL', os.environ.get('OPENAI_API_BASE', 'https://api.openai.com'))
|
||||
api_url = f"{base_url}/chat/completions" if base_url.endswith('/v1') else f"{base_url}/v1/chat/completions"
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
|
||||
}
|
||||
logger.info("Generating content with openai-compatible model: %s", self.model)
|
||||
response = requests.post(api_url, headers=headers, json=payload)
|
||||
if response.status_code != 200:
|
||||
logger.error("Failed to call LLM: " + response.text)
|
||||
time.sleep(5)
|
||||
return ""
|
||||
else:
|
||||
return response.json()['choices'][0]['message']['content']
|
||||
|
||||
def parse_actions(self, response: str, masks=None):
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ For each step, you will get an observation of an image, which is the screenshot
|
||||
|
||||
You are required to use `pyautogui` to perform the action grounded to the observation, but DO NOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with, since we have no image of the element you want to operate with. DO NOT USE `pyautogui.screenshot()` to take a screenshot.
|
||||
Return one line or multiple lines of python code to perform the action each time, and be time efficient. When predicting multiple lines of code, add a small sleep such as `time.sleep(0.5);` between them so that the machine has time to respond. Each time you need to predict complete code; no variables or functions can be shared from history.
|
||||
You need to specify the coordinates yourself based on your observation of the current screenshot, but you should be careful to ensure that the coordinates are correct.
|
||||
If the task instruction contains explicit coordinates like (x,y), use those exact coordinates directly without re-estimating from the screenshot. Otherwise, specify coordinates based on your observation.
|
||||
You ONLY need to return the code inside a code block, like this:
|
||||
```python
|
||||
# your code here
|
||||
@@ -18,7 +18,11 @@ When you think the task is done, return ```DONE```.
|
||||
IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give a short reflection on the current screenshot and the previous things we did, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
Before writing any code, you MUST add Chinese comments in this exact format:
|
||||
# 【观察】Describe what you currently see on the screen in 1-2 sentences
|
||||
# 【判断】Explain which step you are on and what needs to be done next
|
||||
# 【动作】Describe the specific action you are about to take
|
||||
Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
|
||||
""".strip()
|
||||
|
||||
SYS_PROMPT_IN_SCREENSHOT_OUT_CODE_FEW_SHOT = """
|
||||
@@ -557,7 +561,11 @@ When you think the task is done, return ```DONE```.
|
||||
IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give a short reflection on the current screenshot and the previous things we did, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
Before writing any code, you MUST add Chinese comments in this exact format:
|
||||
# 【观察】Describe what you currently see on the screen in 1-2 sentences
|
||||
# 【判断】Explain which step you are on and what needs to be done next
|
||||
# 【动作】Describe the specific action you are about to take
|
||||
Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
|
||||
""".strip()
|
||||
|
||||
SYS_PROMPT_IN_A11Y_OUT_ACTION = """
|
||||
@@ -826,7 +834,11 @@ When you think the task is done, return ```DONE```.
|
||||
IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give a short reflection on the current screenshot and the previous things we did, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
Before writing any code, you MUST add Chinese comments in this exact format:
|
||||
# 【观察】Describe what you currently see on the screen in 1-2 sentences
|
||||
# 【判断】Explain which step you are on and what needs to be done next
|
||||
# 【动作】Describe the specific action you are about to take
|
||||
Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
|
||||
""".strip()
|
||||
|
||||
SYS_PROMPT_IN_BOTH_OUT_ACTION = """
|
||||
@@ -1101,7 +1113,11 @@ When you think the task can not be done, return ```FAIL```, don't easily say ```
|
||||
When you think the task is done, return ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give a short reflection on the current screenshot and the previous things we did, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
Before writing any code, you MUST add Chinese comments in this exact format:
|
||||
# 【观察】Describe what you currently see on the screen in 1-2 sentences
|
||||
# 【判断】Explain which step you are on and what needs to be done next
|
||||
# 【动作】Describe the specific action you are about to take
|
||||
Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
|
||||
""".strip()
|
||||
|
||||
SYS_PROMPT_SEEACT = """
|
||||
@@ -1151,7 +1167,11 @@ When you think the task can not be done, return ```FAIL```, don't easily say ```
|
||||
When you think the task is done, return ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give a short reflection on the current screenshot and the previous things we did, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
Before writing any code, you MUST add Chinese comments in this exact format:
|
||||
# 【观察】Describe what you currently see on the screen in 1-2 sentences
|
||||
# 【判断】Explain which step you are on and what needs to be done next
|
||||
# 【动作】Describe the specific action you are about to take
|
||||
Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
|
||||
"""
|
||||
|
||||
AGUVIS_PLANNER_SYS_PROMPT = """
|
||||
@@ -1177,7 +1197,11 @@ Here are some guidelines for you:
|
||||
3. Return ```Done``` when you think the task is done. Return ```Fail``` when you think the task can not be done.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give a short reflection on the current screenshot and the previous things we did, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
Before writing any code, you MUST add Chinese comments in this exact format:
|
||||
# 【观察】Describe what you currently see on the screen in 1-2 sentences
|
||||
# 【判断】Explain which step you are on and what needs to be done next
|
||||
# 【动作】Describe the specific action you are about to take
|
||||
Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
|
||||
""".strip()
|
||||
|
||||
AGUVIS_SYS_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.
|
||||
|
||||
Reference in New Issue
Block a user