From af47ed8fb1d505109431d4c57513304027609063 Mon Sep 17 00:00:00 2001 From: Yuan Mengqi <100453613+yuanmengqi@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:02:42 +0800 Subject: [PATCH] fix infeasible&chrome tasks (#258) * fix chrome * fix: fix proxy setup * feat&fix: add proxy support in setup and remove hardcoded proxy from example * fix tasks * fix chrome finished * fix * clean chrome_fix code * clean chrome_fix code * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix multiapps * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix some multi_apps tasks * fix some multi_apps tasks * fix password&resolution * fix password&resolution * Improve code logic for password & resolution * edit * Merge branch 'main' into fix_chrome * fix chrome tasks * Merge branch 'fix_chrome' * fix insensible&chrome tasks --------- Co-authored-by: adlsdztony --- .../47543840-672a-467d-80df-8f7c3b9788c9.json | 2 +- .../9f935cce-0a9f-435f-8007-817732bfc0a5.json | 2 +- .../a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json | 2 +- .../b4f95342-463e-4179-8c3f-193cd7241fb2.json | 2 +- .../b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json | 16 ++++++++-------- .../fc6d8143-9452-4171-9459-7f515143419a.json | 2 +- mm_agents/openai_cua_agent.py | 8 +++++++- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json index c74fdcf..4829d2d 100644 --- a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json +++ b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json @@ -1,7 +1,7 @@ { "id": "47543840-672a-467d-80df-8f7c3b9788c9", "snapshot": "chrome", - "instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", + "instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json index 9b37187..a93c959 100644 --- a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json +++ b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json @@ -57,5 +57,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json index 9e5d730..6bdffe9 100644 --- a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json +++ b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json @@ -56,5 +56,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json index 7773484..e6fe04f 100644 --- a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json +++ b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json @@ -1,7 +1,7 @@ { "id": "b4f95342-463e-4179-8c3f-193cd7241fb2", "snapshot": "chrome", - "instruction": "List as many of the next available dates for Diamond Campground as possible.", + "instruction": "Find the Next Available dates for Diamond.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json index e84af23..48bf735 100644 --- a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json +++ b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json @@ -66,10 +66,10 @@ "goto_prefix": "https://www.", "category": "xpath", "xpathObject": { - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to", - "/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to", + "/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult", "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank" } } @@ -101,10 +101,10 @@ }, "timezone": "America/New_York", "expected": { - "from": "{DoW}, {Month} {Day0D}", - "to": "{DoW}, {Month} {Day0D}", + "from": "Check In{DoW}, {Month} {Day0D}", + "to": "Check Out{DoW}, {Month} {Day0D}", "city": "New York City Hotels", - "adult": "2 guests", + "adult": "Rooms/Guests1 Room, 2 Guests", "rank": "Price (low to high)" } } @@ -112,5 +112,5 @@ ] }, "proxy": true, - "possibility_of_env_change": "medium" + "possibility_of_env_change": "high" } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json index 7fea695..5844e21 100644 --- a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json +++ b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json @@ -1,7 +1,7 @@ { "id": "fc6d8143-9452-4171-9459-7f515143419a", "snapshot": "chrome", - "instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.", + "instruction": "Find flights from New York–Kennedy Airport to Chicago O'Hare Airport for tomorrow.", "source": "test_task_0", "config": [ { diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index f653a62..e615308 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -671,8 +671,14 @@ class OpenAICUAAgent: action_exit = False thought_exit = False message_exit = False + infeasible_message = False + infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"] for item in response.output: parsed_item = self._handle_item(item) + if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list): + actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""}) + infeasible_message = True + break if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui": actions.append(parsed_item) else: @@ -693,7 +699,7 @@ class OpenAICUAAgent: # state_correct = True # if action_exit and not message_exit: # state_correct = True - if action_exit: + if action_exit and not infeasible_message: state_correct = True if not state_correct: logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)