flsol demo: fix top_p/claude/gemini, force coordinates, add reflection comments, screenshot mode

2026-03-22 13:57:33 +08:00
parent 093b779045
commit f32e5f9e64
8 changed files with 881 additions and 50 deletions
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -752,7 +752,6 @@ class PromptAgent:
        elif self.model.startswith("claude"):
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]
-            top_p = payload["top_p"]
            temperature = payload["temperature"]

            claude_messages = []
@@ -796,11 +795,10 @@ class PromptAgent:
                "max_tokens": max_tokens,
                "messages": claude_messages,
                "temperature": temperature,
-                "top_p": top_p
            }

            response = requests.post(
-                "https://api.apiyi.com/v1/messages",
+                os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1").rstrip("/") + "/messages",
                headers=headers,
                json=payload
            )
@@ -816,7 +814,7 @@ class PromptAgent:
        elif self.model.startswith("mistral"):
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]
-            top_p = payload["top_p"]
+            top_p = payload.get("top_p", 0.9)
            temperature = payload["temperature"]

            assert self.observation_type in pure_text_settings, f"The model {self.model} can only support text-based input, please consider change based model or settings"
@@ -871,7 +869,7 @@ class PromptAgent:
            # THUDM/cogagent-chat-hf
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]
-            top_p = payload["top_p"]
+            top_p = payload.get("top_p", 0.9)
            temperature = payload["temperature"]

            cog_messages = []
@@ -920,7 +918,7 @@ class PromptAgent:
        elif self.model in ["gemini-pro", "gemini-pro-vision"]:
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]
-            top_p = payload["top_p"]
+            top_p = payload.get("top_p", 0.9)
            temperature = payload["temperature"]

            if self.model == "gemini-pro":
@@ -989,10 +987,10 @@ class PromptAgent:
            )
            return response.text

-        elif self.model.startswith("gemini"):
+        elif self.model in ["gemini-pro", "gemini-pro-vision", "gemini-1.5-pro", "gemini-1.5-flash"]:
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]
-            top_p = payload["top_p"]
+            top_p = payload.get("top_p", 0.9)
            temperature = payload["temperature"]

            gemini_messages = []
@@ -1068,7 +1066,7 @@ class PromptAgent:
        elif self.model == "llama3-70b":
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]
-            top_p = payload["top_p"]
+            top_p = payload.get("top_p", 0.9)
            temperature = payload["temperature"]

            assert self.observation_type in pure_text_settings, f"The model {self.model} can only support text-based input, please consider change based model or settings"
@@ -1121,7 +1119,7 @@ class PromptAgent:
        elif self.model.startswith("qwen"):
            messages = payload["messages"]
            max_tokens = payload["max_tokens"]
-            top_p = payload["top_p"]
+            top_p = payload.get("top_p", 0.9)
            temperature = payload["temperature"]

            qwen_messages = []
@@ -1200,7 +1198,21 @@ class PromptAgent:
                return ""

        else:
-            raise ValueError("Invalid model: " + self.model)
+            # Fallback: treat any unrecognized model as OpenAI-compatible and POST the payload to its chat-completions endpoint (e.g. gemini-3.1 via apiyi)
+            base_url = os.environ.get('OPENAI_BASE_URL', os.environ.get('OPENAI_API_BASE', 'https://api.openai.com'))
+            api_url = f"{base_url}/chat/completions" if base_url.endswith('/v1') else f"{base_url}/v1/chat/completions"
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
+            }
+            logger.info("Generating content with openai-compatible model: %s", self.model)
+            response = requests.post(api_url, headers=headers, json=payload)
+            if response.status_code != 200:
+                logger.error("Failed to call LLM: " + response.text)
+                time.sleep(5)
+                return ""
+            else:
+                return response.json()['choices'][0]['message']['content']

    def parse_actions(self, response: str, masks=None):

--- a/mm_agents/prompts.py
+++ b/mm_agents/prompts.py
@@ -5,7 +5,7 @@ For each step, you will get an observation of an image, which is the screenshot

 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 Return one line or multiple lines of python code to perform the action each time, be time efficient. When predicting multiple lines of code, make some small sleep like `time.sleep(0.5);` interval so that the machine could take; Each time you need to predict a complete code, no variables or function can be shared from history
-You need to to specify the coordinates of by yourself based on your observation of current observation, but you should be careful to ensure that the coordinates are correct.
+If the task instruction contains explicit coordinates like (x,y), use those exact coordinates directly without re-estimating from the screenshot. Otherwise, specify coordinates based on your observation.
 You ONLY need to return the code inside a code block, like this:
 ```python
 # your code here
@@ -18,7 +18,11 @@ When you think the task is done, return ```DONE```.
 IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.

 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
-First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+Before writing any code, you MUST add Chinese comments in this exact format:
+# 【观察】Describe what you currently see on the screen in 1-2 sentences
+# 【判断】Explain which step you are on and what needs to be done next
+# 【动作】Describe the specific action you are about to take
+Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
 """.strip()

 SYS_PROMPT_IN_SCREENSHOT_OUT_CODE_FEW_SHOT = """
@@ -557,7 +561,11 @@ When you think the task is done, return ```DONE```.
 IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.

 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
-First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+Before writing any code, you MUST add Chinese comments in this exact format:
+# 【观察】Describe what you currently see on the screen in 1-2 sentences
+# 【判断】Explain which step you are on and what needs to be done next
+# 【动作】Describe the specific action you are about to take
+Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
 """.strip()

 SYS_PROMPT_IN_A11Y_OUT_ACTION = """
@@ -826,7 +834,11 @@ When you think the task is done, return ```DONE```.
 IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.

 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
-First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+Before writing any code, you MUST add Chinese comments in this exact format:
+# 【观察】Describe what you currently see on the screen in 1-2 sentences
+# 【判断】Explain which step you are on and what needs to be done next
+# 【动作】Describe the specific action you are about to take
+Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
 """.strip()

 SYS_PROMPT_IN_BOTH_OUT_ACTION = """
@@ -1101,7 +1113,11 @@ When you think the task can not be done, return ```FAIL```, don't easily say ```
 When you think the task is done, return ```DONE```.

 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
-First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+Before writing any code, you MUST add Chinese comments in this exact format:
+# 【观察】Describe what you currently see on the screen in 1-2 sentences
+# 【判断】Explain which step you are on and what needs to be done next
+# 【动作】Describe the specific action you are about to take
+Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
 """.strip()

 SYS_PROMPT_SEEACT = """
@@ -1151,7 +1167,11 @@ When you think the task can not be done, return ```FAIL```, don't easily say ```
 When you think the task is done, return ```DONE```.

 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
-First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+Before writing any code, you MUST add Chinese comments in this exact format:
+# 【观察】Describe what you currently see on the screen in 1-2 sentences
+# 【判断】Explain which step you are on and what needs to be done next
+# 【动作】Describe the specific action you are about to take
+Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
 """

 AGUVIS_PLANNER_SYS_PROMPT = """
@@ -1177,7 +1197,11 @@ Here are some guidelines for you:
 3. Return ```Done``` when you think the task is done. Return ```Fail``` when you think the task can not be done.

 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
-First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
+Before writing any code, you MUST add Chinese comments in this exact format:
+# 【观察】Describe what you currently see on the screen in 1-2 sentences
+# 【判断】Explain which step you are on and what needs to be done next
+# 【动作】Describe the specific action you are about to take
+Then provide the pyautogui code. This reflection format is mandatory for every response and makes the demo easy to follow.
 """.strip()

 AGUVIS_SYS_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.