diff --git a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json index c74fdcf..4829d2d 100644 --- a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json +++ b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json @@ -1,7 +1,7 @@ { "id": "47543840-672a-467d-80df-8f7c3b9788c9", "snapshot": "chrome", - "instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", + "instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json index 9b37187..a93c959 100644 --- a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json +++ b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json @@ -57,5 +57,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json index 9e5d730..6bdffe9 100644 --- a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json +++ b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json @@ -56,5 +56,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json index 7773484..e6fe04f 100644 --- a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json +++ b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json @@ -1,7 +1,7 @@ { "id": "b4f95342-463e-4179-8c3f-193cd7241fb2", "snapshot": "chrome", - "instruction": "List as many of the next available dates for Diamond Campground as possible.", + "instruction": "Find the Next Available dates for Diamond.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json index e84af23..48bf735 100644 --- a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json +++ b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json @@ -66,10 +66,10 @@ "goto_prefix": "https://www.", "category": "xpath", "xpathObject": { - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to", - "/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to", + "/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult", "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank" } } @@ -101,10 +101,10 @@ }, "timezone": "America/New_York", "expected": { - "from": "{DoW}, {Month} {Day0D}", - "to": "{DoW}, {Month} {Day0D}", + "from": "Check In{DoW}, {Month} {Day0D}", + "to": "Check Out{DoW}, {Month} {Day0D}", "city": "New York City Hotels", - "adult": "2 guests", + "adult": "Rooms/Guests1 Room, 2 Guests", "rank": "Price (low to high)" } } @@ -112,5 +112,5 @@ ] }, "proxy": true, - "possibility_of_env_change": "medium" + "possibility_of_env_change": "high" } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json index 7fea695..5844e21 100644 --- a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json +++ b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json @@ -1,7 +1,7 @@ { "id": "fc6d8143-9452-4171-9459-7f515143419a", "snapshot": "chrome", - "instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.", + "instruction": "Find flights from New York–Kennedy Airport to Chicago O'Hare Airport for tomorrow.", "source": "test_task_0", "config": [ { diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index f653a62..e615308 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -671,8 +671,14 @@ class OpenAICUAAgent: action_exit = False thought_exit = False message_exit = False + infeasible_message = False + infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"] for item in response.output: parsed_item = self._handle_item(item) + if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list): + actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""}) + infeasible_message = True + break if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui": actions.append(parsed_item) else: @@ -693,7 +699,7 @@ class OpenAICUAAgent: # state_correct = True # if action_exit and not message_exit: # state_correct = True - if action_exit: + if action_exit and not infeasible_message: state_correct = True if not state_correct: logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)