feat: enhance run_coact.py and related agents with improved task handling and configuration

- Updated TASK_DESCRIPTION in run_coact.py to clarify task-solving steps and requirements.
- Modified configuration parameters for provider name and client password for better security and flexibility.
- Enhanced OrchestratorUserProxyAgent to include user instruction in the auto-reply and improved screenshot handling.
- Adjusted coding_agent.py to ensure proper verification of results before saving changes.
- Improved CUA agent prompts to maintain application state and handle user instructions more effectively.
- Ensured existing code logic remains unchanged while enhancing functionality and usability.
2025-08-13 09:04:09 +00:00
parent d2ae0f697d
commit 7fb5860da0
4 changed files with 100 additions and 59 deletions
--- a/mm_agents/coact/cua_agent.py
+++ b/mm_agents/coact/cua_agent.py
@@ -19,10 +19,11 @@ PROMPT_TEMPLATE = """# Task

 # Hints
 - Sudo password is "{CLIENT_PASSWORD}".
- If you meet "Authentication required" dialog, enter the "{CLIENT_PASSWORD}" to continue.
- Do not close the any application or window or tab that is already opened.
- Do not close the window at the end of the task.
+- Keep the windows/applications opened at the end of the task.
+- Do not use shortcut to reload the application except for the browser, just close and reopen.
+- If "The document has been changed by others" pops out, you should click "cancel" and reopen the file.
 - If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'.
+- If you don't know how to continue the task, reply your concern or question along with 'IDK'.
 """.strip()
 DEFAULT_REPLY = "Please continue the user task. If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'."

@@ -118,7 +119,9 @@ def call_openai_cua(client: OpenAI,
                    "environment": environment,
                }],
                input=history_inputs,
-                reasoning={"summary": "concise"},
+                reasoning={
+                    "summary": "concise"
+                },
                tool_choice="required",
                truncation="auto",
            )
@@ -205,6 +208,10 @@ def run_cua(
                    reasoning = "My thinking process\n" + "\n- ".join(reasoning_list) + '\nPlease check the screenshot and see if it fulfills your requirements.'
                    breakflag = True
                    break
+                if 'IDK' in o.content[0].text:
+                    reasoning = f"{o.content[0].text}. I don't know how to complete the task. Please check the current screenshot."
+                    breakflag = True
+                    break
                try:
                    json.loads(o.content[0].text)
                    history_inputs.pop(len(history_inputs) - len(response.output) + i)