From ac24ccce99a60b8a7e9b93c235de09b0f0c3c68d Mon Sep 17 00:00:00 2001 From: XXZ <108875830+ThisisXXZ@users.noreply.github.com> Date: Thu, 3 Jul 2025 21:53:58 +0800 Subject: [PATCH 1/2] fix: fix multiapp tasks (#229) Co-authored-by: adlsdztony --- desktop_env/evaluators/metrics/gimp.py | 8 ++++--- desktop_env/evaluators/metrics/others.py | 6 +++-- .../00fa164e-2612-4439-992e-157d019a8436.json | 2 +- .../02ce9a50-7af2-47ed-8596-af0c230501f8.json | 4 ++-- .../26660ad1-6ebb-4f59-8cba-a8432dfe8d38.json | 19 +++++++++------ .../3680a5ee-6870-426a-a997-eba929a0d25c.json | 2 +- .../3f05f3b9-29ba-4b6b-95aa-2204697ffc06.json | 2 +- .../48d05431-6cd5-4e76-82eb-12b60d823f7d.json | 9 ++++---- .../4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json | 7 ++++-- .../58565672-7bfe-48ab-b828-db349231de6b.json | 2 +- .../5990457f-2adb-467b-a4af-5c857c92d762.json | 1 + .../6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a.json | 23 +++++++++++++++---- 12 files changed, 57 insertions(+), 28 deletions(-) diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py index c87453b..9f374e1 100644 --- a/desktop_env/evaluators/metrics/gimp.py +++ b/desktop_env/evaluators/metrics/gimp.py @@ -193,7 +193,7 @@ def structure_check_by_mse(img1, img2, threshold=0.03): (np.array(img1, dtype=np.float32) / 255 - np.array(img2, dtype=np.float32) / 255) ** 2) structure_same = True if mse < threshold else False - print("MSE: ", mse) + print(f"MSE: {mse}, threshold: {threshold}") return structure_same @@ -204,7 +204,7 @@ def structure_check_by_ssim(img1, img2, threshold=0.9): return similarity >= threshold -def check_brightness_decrease_and_structure_sim(src_path, tgt_path): +def check_brightness_decrease_and_structure_sim(src_path, tgt_path, threshold=0.03): """ Check the brightness of src is lower than tgt and the structures are similar gimp:7a4deb26-d57d-4ea9-9a73-630f66a7b568 @@ -219,13 +219,15 @@ def check_brightness_decrease_and_structure_sim(src_path, tgt_path): brightness_src = 
calculate_brightness(img_src) brightness_tgt = calculate_brightness(img_tgt) brightness_reduced = brightness_tgt > brightness_src + + # print(f"Brightness src: {brightness_src}, tgt: {brightness_tgt}, reduced: {brightness_reduced}") # Normalize and compare images target_brightness = 128 img_src_normalized = normalize_brightness(img_src, target_brightness) img_tgt_normalized = normalize_brightness(img_tgt, target_brightness) - structure_same = structure_check_by_mse(img_src_normalized, img_tgt_normalized) + structure_same = structure_check_by_mse(img_src_normalized, img_tgt_normalized, threshold=threshold) if brightness_reduced and structure_same: return 1. else: diff --git a/desktop_env/evaluators/metrics/others.py b/desktop_env/evaluators/metrics/others.py index ebb5994..eb0bf0c 100644 --- a/desktop_env/evaluators/metrics/others.py +++ b/desktop_env/evaluators/metrics/others.py @@ -63,11 +63,13 @@ def compare_epub(result: str, expected: str) -> float: result_files: List[str] = process_epub(result) expected_files: List[str] = process_epub(expected) - metric: float = 1. + metric: float = 0. 
for f1, f2 in zip(result_files, expected_files): current_metric: float = diff_text_file(f1, f2) logger.debug("%s vs %s: %f", f1, f2, current_metric) - metric *= current_metric + metric += current_metric + if len(result_files) > 0: + metric /= len(result_files) return metric diff --git a/evaluation_examples/examples/multi_apps/00fa164e-2612-4439-992e-157d019a8436.json b/evaluation_examples/examples/multi_apps/00fa164e-2612-4439-992e-157d019a8436.json index ae3eac0..0eab05e 100644 --- a/evaluation_examples/examples/multi_apps/00fa164e-2612-4439-992e-157d019a8436.json +++ b/evaluation_examples/examples/multi_apps/00fa164e-2612-4439-992e-157d019a8436.json @@ -1,7 +1,7 @@ { "id": "00fa164e-2612-4439-992e-157d019a8436", "snapshot": "libreoffice_writer", - "instruction": "I need to include the experiment results from \"~/Documents/awesome-desktop/expe-results.xlsx\" into the currently writing report. Specifically, extract the results of LLM-based models and insert a table into the \"Main Results\" section of my report.", + "instruction": "I need to include the experiment results from \"~/Documents/awesome-desktop/expe-results.xlsx\" into the currently writing report. Specifically, extract the results of GPT-4 and insert a table into the \"Main Results\" section of my report.", "source": "authors", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json index e520e5b..8658658 100644 --- a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json +++ b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json @@ -1,7 +1,7 @@ { "id": "02ce9a50-7af2-47ed-8596-af0c230501f8", "snapshot": "libreoffice_writer", - "instruction": "I am currently utilizing LibreOffice Writer to compose a Linux tutorial, and I intend to display the outcomes generated by executing the \"ls\" command in /home/user. 
Kindly execute this command and save the screenshot as 'ls.png' on the Desktop.", + "instruction": "I am currently utilizing LibreOffice Writer to compose a Linux tutorial, and I intend to display the outcomes generated by executing the \"ls\" command in /home/user. Kindly execute this command and save the screenshot of the terminal as 'ls.png' on the Desktop.", "source": "authors", "config": [ { @@ -54,7 +54,7 @@ "type": "rule", "rules": { "type": "text", - "text": " Ls" + "text": "ls" } } }, diff --git a/evaluation_examples/examples/multi_apps/26660ad1-6ebb-4f59-8cba-a8432dfe8d38.json b/evaluation_examples/examples/multi_apps/26660ad1-6ebb-4f59-8cba-a8432dfe8d38.json index 8be2de5..6e2204b 100644 --- a/evaluation_examples/examples/multi_apps/26660ad1-6ebb-4f59-8cba-a8432dfe8d38.json +++ b/evaluation_examples/examples/multi_apps/26660ad1-6ebb-4f59-8cba-a8432dfe8d38.json @@ -1,7 +1,7 @@ { "id": "26660ad1-6ebb-4f59-8cba-a8432dfe8d38", "snapshot": "multiapps", - "instruction": "I want to test the quality of the network environment my laptop is currently in. Please measure my network situation through speedtest.net, export the measurement results, and save them to ~/Test/Speed (if the dir does not exist, create it).", + "instruction": "I want to test the quality of the network environment my laptop is currently in. Please measure my network situation through speedtest.net, copy the results in speedtest.net/results, and save them to ~/Test/Speed/results.txt (if the dir does not exist, create it). 
Each metric occupies one line, with the metric name and its value separated by a single space.", "source": "https://www.speedtest.net/", "config": [ { @@ -54,16 +54,21 @@ "browser" ], "evaluator": { - "func": "compare_time_in_speedtest_results", + "func": "file_contains", "result": { "type": "vm_file", - "path": "/home/user/Test/Speed/Speedtest Results Export-.csv", - "dest": "Speedtest Results Export-.csv", - "time_suffix": true + "path": "/home/user/Test/Speed/results.txt", + "dest": "results.txt" }, "expected": { - "type": "time_diff_range", - "diff_range_in_minutes": "60" + "type": "rule", + "rules": { + "expected": [ + "Ping", + "Download", + "Upload" + ] + } } }, "proxy": true diff --git a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json index 379908a..3ad3704 100644 --- a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json +++ b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json @@ -1,7 +1,7 @@ { "id": "3680a5ee-6870-426a-a997-eba929a0d25c", "snapshot": "libreoffice_calc", - "instruction": "I have file1.xlsx and file2.ods on the Desktop and each has one column. Help me use only the command line to merge these two columns into one LibreOffice Calc file called output.csv and open it from terminal.", + "instruction": "I have file1.xlsx and file2.ods on my Desktop, each containing a single column. 
Using only the command line, help me merge these two columns into a single column by concatenating the strings from both rows, save the result as ~/Desktop/output.csv, and open it in LibreOffice Calc from the terminal", "source": "https://unix.stackexchange.com/questions/510850/how-to-open-calc-from-terminal-and-insert-files", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/3f05f3b9-29ba-4b6b-95aa-2204697ffc06.json b/evaluation_examples/examples/multi_apps/3f05f3b9-29ba-4b6b-95aa-2204697ffc06.json index 8c389c9..ae0ac1c 100644 --- a/evaluation_examples/examples/multi_apps/3f05f3b9-29ba-4b6b-95aa-2204697ffc06.json +++ b/evaluation_examples/examples/multi_apps/3f05f3b9-29ba-4b6b-95aa-2204697ffc06.json @@ -1,7 +1,7 @@ { "id": "3f05f3b9-29ba-4b6b-95aa-2204697ffc06", "snapshot": "os", - "instruction": "I have a collection of MP3s with blank meta data, but already named with their artists and titles. I've heard that Picard or Kid3 may help, but I'm unfamiliar with them. Can you help me to fix the meta data?", + "instruction": "I have a collection of MP3s with blank meta data, but already named with their artists and titles. I've heard that Picard or Kid3 may help, but I'm unfamiliar with them. Can you help me to fix the meta data \"title\" and \"artist\"?", "source": "authors", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json index 0344766..bf3f492 100644 --- a/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json +++ b/evaluation_examples/examples/multi_apps/48d05431-6cd5-4e76-82eb-12b60d823f7d.json @@ -1,7 +1,7 @@ { "id": "48d05431-6cd5-4e76-82eb-12b60d823f7d", "snapshot": "os", - "instruction": "When I ran \"conda install datasets\" in terminal, I got \"conda: command not found\". 
Could you help me solve it?", + "instruction": "When I ran \"conda install datasets\" in terminal, I got \"conda: command not found\". Could you help me solve it so that I can use conda command right away?", "source": "authors", "config": [ { @@ -45,17 +45,18 @@ "os", "chrome" ], + "evaluator": { - "func": "is_in_list", + "func": "exact_match", "result": { "type": "vm_command_line", - "command": "conda list", + "command": "grep -q 'conda initialize' ~/.bashrc && echo 1 || echo 0", "shell": true }, "expected": { "type": "rule", "rules": { - "expected": "packages in environment at" + "expected": "1\n" } } }, diff --git a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json index b7d220c..69fb707 100644 --- a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json +++ b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json @@ -30,15 +30,18 @@ ], "evaluator": { "func": "check_brightness_decrease_and_structure_sim", - "result": { + "expected": { "type": "vm_file", "path": "/home/user/Desktop/background.png", "dest": "background.png" }, - "expected": { + "result": { "type": "cloud_file", "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487/back.png", "dest": "image_original.png" + }, + "options": { + "threshold": 0.15 } }, "proxy": false diff --git a/evaluation_examples/examples/multi_apps/58565672-7bfe-48ab-b828-db349231de6b.json b/evaluation_examples/examples/multi_apps/58565672-7bfe-48ab-b828-db349231de6b.json index 32bf621..97bd872 100644 --- a/evaluation_examples/examples/multi_apps/58565672-7bfe-48ab-b828-db349231de6b.json +++ b/evaluation_examples/examples/multi_apps/58565672-7bfe-48ab-b828-db349231de6b.json @@ -1,7 +1,7 @@ { "id": "58565672-7bfe-48ab-b828-db349231de6b", "snapshot": "chrome", - "instruction": "Can you 
assist me by opening the first link in the latest email in Bills folder from Thunderbird and displaying it in a new Chrome tab?", + "instruction": "Can you assist me by opening the first link in the latest email in Bills folder and displaying it in a new Chrome tab?", "source": "https://superuser.com/questions/1792660/open-link-from-other-application-does-not-open-the-url-in-firefox", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json b/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json index 695a9d2..6a3e3de 100644 --- a/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json +++ b/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json @@ -9,6 +9,7 @@ "parameters": { "command": [ "google-chrome", + "--proxy-server=http://127.0.0.1:18888", "--remote-debugging-port=1337" ] } diff --git a/evaluation_examples/examples/multi_apps/6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a.json b/evaluation_examples/examples/multi_apps/6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a.json index 2727d8b..1ae393e 100644 --- a/evaluation_examples/examples/multi_apps/6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a.json +++ b/evaluation_examples/examples/multi_apps/6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a.json @@ -65,8 +65,15 @@ "expected": [ "Scottsdale", "Atlanta", - "Lake Tahoe", - "Banff", + [ + "Lake Tahoe", + "Stateline" + ], + [ + "Banff", + "Alberta's Rockies", + "Alberta’s Rockies" + ], "Beijing", [ "Montreal", @@ -87,7 +94,11 @@ "Barcelona", "Toulon", "Sydney", - "Long Beach", + [ + "Los Angeles", + "Long Beach", + "LA" + ], "Vancouver", "Stockholm", [ @@ -95,7 +106,11 @@ "Montréal" ], "New Orleans", - "Long Beach", + [ + "Los Angeles", + "Long Beach", + "LA" + ], "Vancouver" ] } From adc9ad88c2c2980cd401e7968985e056edfc6811 Mon Sep 17 00:00:00 2001 From: Danyang Zhang Date: Thu, 3 Jul 2025 21:55:55 +0800 Subject: [PATCH 2/2] Thunderbird eval fix (#233) * ver Jul2nd updated 
task requiring set up new email account * ver Jul3rd fixed several tasks --- .../multi_apps/415ef462-bed3-493a-ac36-ca8c6d23bf1b.json | 4 ++-- .../thunderbird/15c3b339-88f7-4a86-ab16-e71c58dcb01e.json | 8 ++++---- .../thunderbird/3f28fe4f-5d9d-4994-a456-efd78cfae1a3.json | 6 +++--- .../thunderbird/7b1e1ff9-bb85-49be-b01d-d6424be18cd0.json | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/evaluation_examples/examples/multi_apps/415ef462-bed3-493a-ac36-ca8c6d23bf1b.json b/evaluation_examples/examples/multi_apps/415ef462-bed3-493a-ac36-ca8c6d23bf1b.json index 8bcd85e..04f43a6 100644 --- a/evaluation_examples/examples/multi_apps/415ef462-bed3-493a-ac36-ca8c6d23bf1b.json +++ b/evaluation_examples/examples/multi_apps/415ef462-bed3-493a-ac36-ca8c6d23bf1b.json @@ -1,7 +1,7 @@ { "id": "415ef462-bed3-493a-ac36-ca8c6d23bf1b", "snapshot": "thunderbird", - "instruction": "There's an e-mail containing the AWS invoice for December saved in local \"Bills\" folder. Extract the invoice PDF to the my receipts folder. Follow the file name pattern of the old files and update a record in my tally book.", + "instruction": "There's an e-mail containing the AWS invoice for December saved in local \"Bills\" folder. Extract the invoice PDF to the my receipts folder. 
Follow the file name pattern of the old files and append a record at the end of my tally book.", "source": "authors", "config": [ { @@ -202,4 +202,4 @@ ] }, "proxy": false -} \ No newline at end of file +} diff --git a/evaluation_examples/examples/thunderbird/15c3b339-88f7-4a86-ab16-e71c58dcb01e.json b/evaluation_examples/examples/thunderbird/15c3b339-88f7-4a86-ab16-e71c58dcb01e.json index e0848eb..59f8ce0 100644 --- a/evaluation_examples/examples/thunderbird/15c3b339-88f7-4a86-ab16-e71c58dcb01e.json +++ b/evaluation_examples/examples/thunderbird/15c3b339-88f7-4a86-ab16-e71c58dcb01e.json @@ -1,7 +1,7 @@ { "id": "15c3b339-88f7-4a86-ab16-e71c58dcb01e", "snapshot": "thunderbird", - "instruction": "Help me access my outlook account with address \"anonym-x2024@outlook.com\" and password 'Wlv(z._6y|a,rrjfQuQhIi\\$;' (without ')", + "instruction": "Help me access my outlook account with address \"anonym-x2024@outlook.com\" and password 'password' (without ') in Thunderbird. It doesn't matter if Thunderbird reports a login or connection failure. 
Just finish the account setup and I will check things like the password manually later.", "source": "https://www.wikihow.com/Access-Gmail-With-Mozilla-Thunderbird", "config": [ { @@ -85,16 +85,16 @@ { "url": "imap://outlook.office365.com", "user": "anonym-x2024@outlook.com", - "password": "Wlv(z._6y|a,rrjfQuQhIi\\$;" + "password": "password" }, { "url": "smtp://smtp.office365.com", "user": "anonym-x2024@outlook.com", - "password": "Wlv(z._6y|a,rrjfQuQhIi\\$;" + "password": "password" } ] } } }, "proxy": false -} \ No newline at end of file +} diff --git a/evaluation_examples/examples/thunderbird/3f28fe4f-5d9d-4994-a456-efd78cfae1a3.json b/evaluation_examples/examples/thunderbird/3f28fe4f-5d9d-4994-a456-efd78cfae1a3.json index 2409586..3863c2a 100644 --- a/evaluation_examples/examples/thunderbird/3f28fe4f-5d9d-4994-a456-efd78cfae1a3.json +++ b/evaluation_examples/examples/thunderbird/3f28fe4f-5d9d-4994-a456-efd78cfae1a3.json @@ -1,7 +1,7 @@ { "id": "3f28fe4f-5d9d-4994-a456-efd78cfae1a3", "snapshot": "thunderbird", - "instruction": "Set up a signature using my name and affiliation. My name is Anonym and my affiliation is XYZ Lab.", + "instruction": "Set up a plain text signature for my email account in Thunderbird. 
The first line is my name \"Anonym\" and the second line is my affiliation \"XYZ Lab\".", "source": "https://www.adsigner.com/user-manual/signatures/setup-email-client-thunderbird/#:~:text=is%20probably%20hidden.-,Right%20click%20on%20the%20empty%20space%20at%20the%20top%20of,signature%20from%20a%20file%20instead.", "config": [ { @@ -70,7 +70,7 @@ "expect": { "mail.identity.id1.htmlSigText": { "method": "re.S", - "ref": "Anonym.+XYZ Lab" + "ref": "Anonym\\nXYZ Lab" } } } @@ -78,4 +78,4 @@ "func": "check_thunderbird_prefs" }, "proxy": false -} \ No newline at end of file +} diff --git a/evaluation_examples/examples/thunderbird/7b1e1ff9-bb85-49be-b01d-d6424be18cd0.json b/evaluation_examples/examples/thunderbird/7b1e1ff9-bb85-49be-b01d-d6424be18cd0.json index 939a016..def2b5d 100644 --- a/evaluation_examples/examples/thunderbird/7b1e1ff9-bb85-49be-b01d-d6424be18cd0.json +++ b/evaluation_examples/examples/thunderbird/7b1e1ff9-bb85-49be-b01d-d6424be18cd0.json @@ -1,7 +1,7 @@ { "id": "7b1e1ff9-bb85-49be-b01d-d6424be18cd0", "snapshot": "thunderbird", - "instruction": "Could you help me open up the Thunderbird profile manager utility?", + "instruction": "Could you help me open up the profile management tabpage in Thunderbird? I want the profile management tabpage inside Thunderbird app, but not the profile chooser dialog during app launch.", "source": "https://www.quora.com/How-do-I-open-a-Thunderbird-profile-manager-utility", "config": [ { @@ -58,4 +58,4 @@ "func": "check_accessibility_tree" }, "proxy": false -} \ No newline at end of file +}