From ee851aeb543174ce791d1073daaf13d26f0440b7 Mon Sep 17 00:00:00 2001 From: tsuky_chen <91684733+chenjix@users.noreply.github.com> Date: Thu, 1 Feb 2024 16:09:24 +0800 Subject: [PATCH 1/6] Update 0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json --- .../libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json b/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json index 97ad2a1..56ba374 100644 --- a/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json +++ b/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json @@ -1,7 +1,7 @@ { "id": "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", "snapshot": "libreoffice_calc", - "instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a which space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".", + "instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a white space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".", "source": "https://www.libreofficehelp.com/add-insert-delete-copy-move-rename-a-worksheet-in-libreoffice-calc/", "config": [ { From 62f50cdc26ceb67600806a38a0497f8a9c4eedaa Mon Sep 17 00:00:00 2001 From: tsuky_chen <91684733+chenjix@users.noreply.github.com> Date: Thu, 1 Feb 2024 16:10:47 +0800 Subject: [PATCH 2/6] Update 7a4e4bc8-922c-4c84-865c-25ba34136be1.json --- .../libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json b/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json index 99133d0..58857fa 100644 --- a/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json +++ b/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json @@ -1,7 +1,7 @@ { "id": "7a4e4bc8-922c-4c84-865c-25ba34136be1", "snapshot": "libreoffice_calc", - "instruction": "Reorder the columns to be \"Data\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"", + "instruction": "Reorder the columns to be \"Date\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"", "source": "https://www.youtube.com/shorts/bvUhr1AHs44", "config": [ { From 660cbe908153a40876bd611547eedc6b1f3c29b7 Mon Sep 17 00:00:00 2001 From: rhythmcao Date: Thu, 1 Feb 2024 16:21:19 +0800 Subject: [PATCH 3/6] expert human test for multi-app finished, fix some small issues --- .../46407397-a7d5-4c6b-92c6-dbe038b1457b.json | 2 +- .../897e3b53-5d4d-444b-85cb-2cdc8a97d903.json | 2 +- .../b52b40a5-ad70-4c53-b5b0-5650a8387052.json | 14 +++++++------- .../settings/googledrive/credentials.json | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json b/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json index f4fd1bf..db41601 100644 --- a/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json +++ b/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json @@ -68,7 +68,7 @@ "parameters": { "command": [ "tar", - "-xzv", + "-xz", "--recursive-unlink", "-f", "/home/user/thunderbird-profile.tar.gz", diff --git a/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json b/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json index 9da5cc0..6417237 100644 --- a/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json +++ b/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json @@ -1,7 +1,7 @@ { "id": "897e3b53-5d4d-444b-85cb-2cdc8a97d903", "snapshot": "chrome", - "instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store it in the forms/ folder in my Google Drive.", + "instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store the PDF in the forms/ folder in my Google Drive.", "source": "https://marketplace.uipath.com/listings/convert-word-file-to-pdf-and-store-in-onedrive", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json b/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json index 25b9024..e9e879d 100644 --- a/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json +++ b/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json @@ -68,13 +68,13 @@ "type": "execute", "parameters": { "command": [ - "tar", - "-xzv", - "--recursive-unlink", - "-f", - "/home/user/thunderbird-profile.tar.gz", - "-C", - "/home/user/" + "tar", + "-xz", + "--recursive-unlink", + "-f", + "/home/user/thunderbird-profile.tar.gz", + "-C", + "/home/user/" ] } }, diff --git a/evaluation_examples/settings/googledrive/credentials.json b/evaluation_examples/settings/googledrive/credentials.json index 1be4912..34d7be4 100644 --- a/evaluation_examples/settings/googledrive/credentials.json +++ b/evaluation_examples/settings/googledrive/credentials.json @@ -1 +1 @@ -{"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-01-31T14:41:25Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"} \ No newline at end of file +{"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-02-01T08:29:08Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"} \ No newline at end of file From fe21c4753390ebbbdde8bbec084ed228e5ea657e Mon Sep 17 00:00:00 2001 From: BlankCheng <913501223@qq.com> Date: Thu, 1 Feb 2024 16:27:24 +0800 Subject: [PATCH 4/6] Fix samples of Impress and OS --- .../21760ecb-8f62-40d2-8d85-0cee5725cb72.json | 2 +- .../5d901039-a89c-4bfb-967b-bf66f4df075e.json | 2 +- .../9ec204e4-f0a3-42f8-8458-b772a6797cab.json | 2 +- .../af23762e-2bfd-4a1d-aada-20fa8de9ce07.json | 4 ++-- .../examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json | 2 +- .../examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json index 70bd696..173c5ac 100644 --- a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json +++ b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json @@ -1,7 +1,7 @@ { "id": "21760ecb-8f62-40d2-8d85-0cee5725cb72", "snapshot": "libreoffice_impress", - "instruction": "Could you help me add silde transition \"dissolve\" to my first page?", + "instruction": "Could you help me add slide transition \"dissolve\" to my first page?", "source": "https://www.libreofficehelp.com/add-animations-transitions-libreoffice-impress-slides/", "config": [ { diff --git a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json index 03daf6c..9c4f050 100644 --- a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json +++ b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json @@ -1,7 +1,7 @@ { "id": "5d901039-a89c-4bfb-967b-bf66f4df075e", "snapshot": "libreoffice_impress", - "instruction": "I want to make this page my cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image.", + "instruction": "I want to turn the rectangular image of Columbus on the first page into a cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image?", "source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag", "config": [ { diff --git a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json index 7c85a91..07be4b5 100644 --- a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json +++ b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json @@ -62,7 +62,7 @@ "expected": { "type": "cloud_file", "path": "https://drive.usercontent.google.com/download?id=1otbzscpOZ0tCXMvsMC0MmNWUC7Pv71of&export=download&authuser=0&confirm=t&uuid=faa0b0c1-6b14-4bce-a1fd-ccf824ee1e60&at=APZUnTXw6TlBOlrPPZ2OhfGnNPf0:1705338135842", - "dest": "MLA_Workshop_061X_Works_Cited_Gold.docx" + "dest": "MLA_Workshop_061X_Works_Cited_Gold.pptx" }, "result": { "type": "vm_file", diff --git a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json index 8ca8792..e93015a 100644 --- a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json +++ b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json @@ -2,7 +2,7 @@ "id": "af23762e-2bfd-4a1d-aada-20fa8de9ce07", "snapshot": "libreoffice_impress", "instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. I need to summarize contents on one slide use Impress \"Summary Slide\" feature. Could you make that for me?", - "source": "https://www.libreofficehelp.com/export-libreoffice-impress-slides-images/#:~:text=Exporting%20a%20single%20slide%20as.jpg%2C.png%2C%20etc%20image%20is,on%20the%20checkbox%20Selection.%20Provide%20jpg%20quality%20options.", + "source": "https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom", "config": [ { "type": "download", @@ -62,7 +62,7 @@ "expected": { "type": "cloud_file", "path": "https://drive.usercontent.google.com/download?id=1nRwmFgYdskv3EiriZZFoT8TzM9CsG5B0&export=download&authuser=0&confirm=t&uuid=f2f919df-2867-4bc3-8bb9-dabd51108ebb&at=APZUnTWzw9LJWWXvH0cvdaWL-Ij-:1705319339474", - "dest": "Forests_Gold.docx" + "dest": "Forests_Gold.pptx" }, "result": { "type": "vm_file", diff --git a/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json b/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json index da8f5c4..2c0d1d1 100644 --- a/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json +++ b/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json @@ -1,7 +1,7 @@ { "id": "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", "snapshot": "os", - "instruction": "Can you move the file with the path 'todo.txt' on the Desktop to the directory with the path 'done' on the Desktop?", + "instruction": "Can you move the file 'todo.txt' on the Desktop to the directory 'done/' on the Desktop?", "source": "https://ubuntu.com/tutorials/command-line-for-beginners#5-moving-and-manipulating-files", "config": [ { diff --git a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json index 6c67c0c..97e8545 100644 --- a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json +++ b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json @@ -1,7 +1,7 @@ { "id": "bedcedc4-4d72-425e-ad62-21960b11fe0d", "snapshot": "os", - "instruction": "Could you set the 'Dim screen when inactive' to on in setting?", + "instruction": "Could you set the 'Dim screen when inactive' to off in setting?", "source": "https://www.youtube.com/watch?v=D4WyNjt_hbQ&t=2s", "trajectory": "trajectories/", "config": [ From 59e2417a081d857c5f2a961d8a3bbc5a41650301 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Thu, 1 Feb 2024 16:55:38 +0800 Subject: [PATCH 5/6] Add Mistral, Qwen, Gemini support; Fix minor bugs --- desktop_env/envs/desktop_env.py | 2 +- desktop_env/evaluators/metrics/gimp.py | 3 + .../66399b0d-8fda-4618-95c4-bfc6191617e9.json | 2 +- .../6ada715d-3aae-4a32-a6a7-429b2e43fb93.json | 2 +- mm_agents/gemini_pro_agent.py | 136 ------------- mm_agents/gemini_pro_vision_agent.py | 115 ----------- mm_agents/gpt_4v_agent.py | 183 ++++++++++++++---- 7 files changed, 156 insertions(+), 287 deletions(-) delete mode 100644 mm_agents/gemini_pro_agent.py delete mode 100644 mm_agents/gemini_pro_vision_agent.py diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index e870b21..b12fbca 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -30,7 +30,7 @@ def _execute_command(command: List[str]) -> None: p = subprocess.Popen(command) p.wait() else: - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True) + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True, encoding="utf-8") if result.returncode != 0: raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m") return result.stdout diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py index a1d3a82..30c9b68 100644 --- a/desktop_env/evaluators/metrics/gimp.py +++ b/desktop_env/evaluators/metrics/gimp.py @@ -328,6 +328,9 @@ def check_structure_sim(src_path, tgt_path): Check if the structure of the two images are similar gimp:2a729ded-3296-423d-aec4-7dd55ed5fbb3 """ + if src_path is None or tgt_path is None: + return 0. + img_src = Image.open(src_path) img_tgt = Image.open(tgt_path) structure_same = structure_check_by_ssim(img_src, img_tgt) diff --git a/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json b/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json index a7951e4..33b8a1a 100644 --- a/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json +++ b/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json @@ -27,7 +27,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=40, interval=0.1); time.sleep(1); pyautogui.scroll(-2)" + "import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=40, interval=10); time.sleep(1); pyautogui.scroll(-2)" ] } } diff --git a/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json b/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json index a7024d6..7151032 100644 --- a/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json +++ b/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json @@ -38,7 +38,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=8); time.sleep(1); pyautogui.scroll(-2)" + "import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=8, interval=3); time.sleep(1); pyautogui.scroll(-2)" ] } } diff --git a/mm_agents/gemini_pro_agent.py b/mm_agents/gemini_pro_agent.py deleted file mode 100644 index ce84488..0000000 --- a/mm_agents/gemini_pro_agent.py +++ /dev/null @@ -1,136 +0,0 @@ -# todo: needs to be refactored - -import time -from typing import Dict, List - -import google.generativeai as genai - -from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes -from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION -from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE -from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string - - -class GeminiPro_Agent: - def __init__(self, api_key, instruction, model='gemini-pro', max_tokens=300, temperature=0.0, - action_space="computer_13"): - genai.configure(api_key=api_key) - self.instruction = instruction - self.model = genai.GenerativeModel(model) - self.max_tokens = max_tokens - self.temperature = temperature - self.action_space = action_space - - self.trajectory = [ - { - "role": "system", - "parts": [ - { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) - ] - } - ] - - def predict(self, obs: Dict) -> List: - """ - Predict the next action(s) based on the current observation. - Only support single-round conversation, only fill-in the last desktop screenshot. - """ - accessibility_tree = obs["accessibility_tree"] - - leaf_nodes = find_leaf_nodes(accessibility_tree) - filtered_nodes = filter_nodes(leaf_nodes) - - linearized_accessibility_tree = "tag\ttext\tposition\tsize\n" - # Linearize the accessibility tree nodes into a table format - - for node in filtered_nodes: - linearized_accessibility_tree += node.tag + "\t" - linearized_accessibility_tree += node.attrib.get('name') + "\t" - linearized_accessibility_tree += node.attrib.get( - '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" - linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" - - self.trajectory.append({ - "role": "user", - "parts": [ - "Given the XML format of accessibility tree (convert and formatted into table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format( - linearized_accessibility_tree)] - }) - - # todo: Remove this step once the Gemini supports multi-round conversation - all_message_str = "" - for i in range(len(self.trajectory)): - if i == 0: - all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n" - elif i % 2 == 1: - all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n" - else: - all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n" - - all_message_str += all_message_template.format(self.trajectory[i]["parts"][0]) - - print("All message: >>>>>>>>>>>>>>>> ") - print( - all_message_str - ) - - message_for_gemini = { - "role": "user", - "parts": [all_message_str] - } - - traj_to_show = [] - for i in range(len(self.trajectory)): - traj_to_show.append(self.trajectory[i]["parts"][0]) - if len(self.trajectory[i]["parts"]) > 1: - traj_to_show.append("screenshot_obs") - - print("Trajectory:", traj_to_show) - - while True: - try: - response = self.model.generate_content( - message_for_gemini, - generation_config={ - "max_output_tokens": self.max_tokens, - "temperature": self.temperature - } - ) - break - except: - print("Failed to generate response, retrying...") - time.sleep(5) - pass - - try: - response_text = response.text - except: - return [] - - try: - actions = self.parse_actions(response_text) - except: - print("Failed to parse action from response:", response_text) - actions = [] - - return actions - - def parse_actions(self, response: str): - # parse from the response - if self.action_space == "computer_13": - actions = parse_actions_from_string(response) - elif self.action_space == "pyautogui": - actions = parse_code_from_string(response) - else: - raise ValueError("Invalid action space: " + self.action_space) - - # add action into the trajectory - self.trajectory.append({ - "role": "assistant", - "parts": [response] - }) - - return actions diff --git a/mm_agents/gemini_pro_vision_agent.py b/mm_agents/gemini_pro_vision_agent.py deleted file mode 100644 index 4a537db..0000000 --- a/mm_agents/gemini_pro_vision_agent.py +++ /dev/null @@ -1,115 +0,0 @@ -# todo: needs to be refactored - -import time -from typing import Dict, List - -import PIL.Image -import google.generativeai as genai - -from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string -from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION -from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE - - -class GeminiProV_Agent: - def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0, - action_space="computer_13"): - genai.configure(api_key=api_key) - self.instruction = instruction - self.model = genai.GenerativeModel(model) - self.max_tokens = max_tokens - self.temperature = temperature - self.action_space = action_space - - self.trajectory = [ - { - "role": "system", - "parts": [ - { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) - ] - } - ] - - def predict(self, obs: Dict) -> List: - """ - Predict the next action(s) based on the current observation. - Only support single-round conversation, only fill-in the last desktop screenshot. - """ - img = PIL.Image.open(obs["screenshot"]) - self.trajectory.append({ - "role": "user", - "parts": ["What's the next step that you will do to help with the task?", img] - }) - - # todo: Remove this step once the Gemini supports multi-round conversation - all_message_str = "" - for i in range(len(self.trajectory)): - if i == 0: - all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n" - elif i % 2 == 1: - all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n" - else: - all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n" - - all_message_str += all_message_template.format(self.trajectory[i]["parts"][0]) - - message_for_gemini = { - "role": "user", - "parts": [all_message_str, img] - } - - traj_to_show = [] - for i in range(len(self.trajectory)): - traj_to_show.append(self.trajectory[i]["parts"][0]) - if len(self.trajectory[i]["parts"]) > 1: - traj_to_show.append("screenshot_obs") - - print("Trajectory:", traj_to_show) - - while True: - try: - response = self.model.generate_content( - message_for_gemini, - generation_config={ - "max_output_tokens": self.max_tokens, - "temperature": self.temperature - } - ) - break - except: - print("Failed to generate response, retrying...") - time.sleep(5) - pass - - try: - response_text = response.text - except: - return [] - - try: - actions = self.parse_actions(response_text) - except: - print("Failed to parse action from response:", response_text) - actions = [] - - return actions - - def parse_actions(self, response: str): - # parse from the response - if self.action_space == "computer_13": - actions = parse_actions_from_string(response) - elif self.action_space == "pyautogui": - actions = parse_code_from_string(response) - else: - raise ValueError("Invalid action space: " + self.action_space) - - # add action into the trajectory - self.trajectory.append({ - "role": "assistant", - "parts": [response] - }) - - return actions diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index 0dd15cf..68c07f3 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -1,12 +1,20 @@ import base64 import json +import logging import os import re +import time import uuid +from http import HTTPStatus +from io import BytesIO from typing import Dict, List import backoff +import dashscope +import google.generativeai as genai +import openai import requests +from PIL import Image from openai.error import ( APIConnectionError, APIError, @@ -22,8 +30,6 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \ SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT -import logging - logger = logging.getLogger("desktopenv.agent") @@ -44,11 +50,13 @@ def linearize_accessibility_tree(accessibility_tree): linearized_accessibility_tree += node.tag + "\t" linearized_accessibility_tree += node.attrib.get('name') + "\t" if node.text: - linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(node.text.replace('"', '""'))) + "\t" - elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper")\ + linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format( + node.text.replace('"', '""'))) + "\t" + elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \ and node.get("{uri:deskat:value.at-spi.gnome.org}value"): text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value") - linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(text.replace('"', '""'))) + "\t" + linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format( + text.replace('"', '""'))) + "\t" else: linearized_accessibility_tree += '""\t' linearized_accessibility_tree += node.attrib.get( @@ -145,10 +153,21 @@ def parse_code_from_som_string(input_string, masks): x, y, w, h = mask mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2)))) - # reverse the mappings - for mapping in mappings[::-1]: - input_string = input_string.replace(mapping[0], mapping[1]) + def replace_tags_with_mappings(text, mappings): + pattern = r'tag#\d+' + matches = re.findall(pattern, text) + for match in matches: + for mapping in mappings: + if match == mapping[0]: + text = text.replace(match, mapping[1]) + break + logger.error("Predicting the tag with index {} failed.".format(match)) + return "" + + return text + + input_string = replace_tags_with_mappings(input_string, mappings) actions = parse_code_from_string(input_string) return actions @@ -295,7 +314,7 @@ class GPT4v_Agent: { "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{_screenshot}", + "url": f"data:image/png;base64,{_screenshot}", "detail": "high" } } @@ -314,7 +333,7 @@ class GPT4v_Agent: { "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{_screenshot}", + "url": f"data:image/png;base64,{_screenshot}", "detail": "high" } } @@ -375,7 +394,7 @@ class GPT4v_Agent: { "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}", + "url": f"data:image/png;base64,{base64_image}", "detail": "high" } } @@ -421,7 +440,7 @@ class GPT4v_Agent: { "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}", + "url": f"data:image/png;base64,{base64_image}", "detail": "high" } } @@ -448,7 +467,7 @@ class GPT4v_Agent: { "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}", + "url": f"data:image/png;base64,{base64_image}", "detail": "high" } } @@ -510,32 +529,130 @@ class GPT4v_Agent: @backoff.on_exception( backoff.expo, (APIError, RateLimitError, APIConnectionError, ServiceUnavailableError, InvalidRequestError), - max_tries=3 + max_tries=10 ) def call_llm(self, payload): - response = requests.post( - "https://api.openai.com/v1/chat/completions", - headers=self.headers, - json=payload - ) + if self.model.startswith("gpt"): + response = requests.post( + "https://api.openai.com/v1/chat/completions", + headers=self.headers, + json=payload + ) - if response.status_code != 200: - if response.json()['error']['code'] == "context_length_exceeded": - print("Context length exceeded. Retrying with a smaller context.") - payload["messages"] = payload["messages"][-1:] - retry_response = requests.post( - "https://api.openai.com/v1/chat/completions", - headers=self.headers, - json=payload - ) - if retry_response.status_code != 200: - print("Failed to call LLM: " + retry_response.text) + if response.status_code != 200: + if response.json()['error']['code'] == "context_length_exceeded": + print("Context length exceeded. Retrying with a smaller context.") + payload["messages"] = payload["messages"][-1:] + retry_response = requests.post( + "https://api.openai.com/v1/chat/completions", + headers=self.headers, + json=payload + ) + if retry_response.status_code != 200: + print("Failed to call LLM: " + retry_response.text) + return "" + + print("Failed to call LLM: " + response.text) + time.sleep(5) + return "" + else: + return response.json()['choices'][0]['message']['content'] + + elif self.model.startswith("mistral"): + messages = payload["messages"] + max_tokens = payload["max_tokens"] + + openai.api_base = "http://localhost:8000/v1" + openai.api_key = "test" + response = openai.ChatCompletion.create( + messages=messages, + model="Mixtral-8x7B-Instruct-v0.1", + max_tokens=max_tokens + ) + try: + return response['choices'][0]['message']['content'] + except Exception as e: + return "" + + elif self.model.startswith("gemini"): + + api_key = os.environ.get("GENAI_API_KEY") + genai.api_key = api_key + def encoded_img_to_pil_img(data_str): + base64_str = data_str.replace("data:image/png;base64,", "") + image_data = base64.b64decode(base64_str) + image = Image.open(BytesIO(image_data)) + + return image + + messages = payload["messages"] + max_tokens = payload["max_tokens"] + + gemini_messages = [] + for i, message in enumerate(messages): + gemini_message = { + "role": message["role"], + "parts": [] + } + assert len(message["content"]) in [1, 2], "One text, or one text with one image" + + # The gemini only support the last image as single image input + if i == len(messages) - 1: + for part in message["content"]: + gemini_message['parts'].append(part['text']) if part['type'] == "text" \ + else gemini_message['parts'].append(encoded_img_to_pil_img(part['image_url']['url'])) + else: + for part in message["content"]: + gemini_message['parts'].append(part['text']) if part['type'] == "text" else None + + gemini_messages.append(gemini_message) + + response = genai.GenerativeModel(self.model).generate_content( + gemini_messages, + generation_config={ + "max_output_tokens": max_tokens + } + ) + + try: + return response.text + except Exception as e: + return "" + elif self.model.startswith("qwen"): + messages = payload["messages"] + max_tokens = payload["max_tokens"] + + qwen_messages = [] + + for i, message in enumerate(messages): + qwen_message = { + "role": message["role"], + "content": [] + } + assert len(message["content"]) in [1, 2], "One text, or one text with one image" + for part in message["content"]: + qwen_message['content'].append({"image": part['image_url']['url']}) if part['type'] == "image_url" else None + qwen_message['content'].append({"text": part['text']}) if part['type'] == "text" else None + + qwen_messages.append(qwen_message) + + response = dashscope.MultiModalConversation.call(model='qwen-vl-plus', + messages=messages) + # The response status_code is HTTPStatus.OK indicate success, + # otherwise indicate request is failed, you can get error code + # and message from code and message. + if response.status_code == HTTPStatus.OK: + try: + return response.json()['output']['choices'][0]['message']['content'] + except Exception as e: return "" + else: + print(response.code) # The error code. + print(response.message) # The error message. + return "" - print("Failed to call LLM: " + response.text) - return "" else: - return response.json()['choices'][0]['message']['content'] + raise ValueError("Invalid model: " + self.model) def parse_actions(self, response: str, masks=None): From 32bcdd093761033a99d4b374d65a750489047004 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Thu, 1 Feb 2024 18:58:22 +0800 Subject: [PATCH 6/6] Modify the logic of SoM agent --- mm_agents/gpt_4v_agent.py | 51 ++++++++++++++++++++++++++------------- mm_agents/prompts.py | 12 ++++----- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index 596f5ee..b94a4f9 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -150,24 +150,21 @@ def parse_code_from_string(input_string): def parse_code_from_som_string(input_string, masks): # parse the output string by masks - mappings = [] + tag_vars = "" for i, mask in enumerate(masks): x, y, w, h = mask - mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2)))) - - def replace_tags_with_mappings(text, mappings): - pattern = r'tag#\d+' - matches = re.findall(pattern, text) - - for match in matches: - for mapping in mappings: - if match == mapping[0]: - text = text.replace(match, mapping[1]) - break - logger.error("Predicting the tag with index {} failed.".format(match)) - return "" + tag_vars += "tag_" + str(i + 1) + "=" + "({}, {})".format(int(x + w // 2), int(y + h // 2)) + tag_vars += "\n" actions = parse_code_from_string(input_string) + + for i, action in enumerate(actions): + if action.strip() in ['WAIT', 'DONE', 'FAIL']: + pass + else: + action = tag_vars + action + actions[i] = action + return actions @@ -561,19 +558,39 @@ class GPT4v_Agent: return response.json()['choices'][0]['message']['content'] elif self.model.startswith("mistral"): + print("call mistral") messages = payload["messages"] max_tokens = payload["max_tokens"] + misrtal_messages = [] + + for i, message in enumerate(messages): + mistral_message = { + "role": message["role"], + "content": [] + } + + for part in message["content"]: + mistral_message['content'] = part['text'] if part['type'] == "text" else None + + misrtal_messages.append(mistral_message) + + # the mistral not support system message in our endpoint, so we concatenate it at the first user message + if misrtal_messages[0]['role'] == "system": + misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content'] + misrtal_messages.pop(0) + openai.api_base = "http://localhost:8000/v1" openai.api_key = "test" response = openai.ChatCompletion.create( - messages=messages, - model="Mixtral-8x7B-Instruct-v0.1", - max_tokens=max_tokens + messages=misrtal_messages, + model="Mixtral-8x7B-Instruct-v0.1" ) + try: return response['choices'][0]['message']['content'] except Exception as e: + print("Failed to call LLM: " + str(e)) return "" elif self.model.startswith("gemini"): diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py index 85295de..e23a211 100644 --- a/mm_agents/prompts.py +++ b/mm_agents/prompts.py @@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You can replace x, y in the code with the tag of the element you want to operate with. such as: ```python -pyautogui.moveTo(tag#3) -pyautogui.click(tag#2) -pyautogui.dragTo(tag#1, button='left') +pyautogui.moveTo(tag_3) +pyautogui.click(tag_2) +pyautogui.dragTo(tag_1, button='left') ``` When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. But you should be careful to ensure that the coordinates are correct. @@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """ You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You can replace x, y in the code with the tag of the element you want to operate with. such as: ```python -pyautogui.moveTo(tag#3) -pyautogui.click(tag#2) -pyautogui.dragTo(tag#1, button='left') +pyautogui.moveTo(tag_3) +pyautogui.click(tag_2) +pyautogui.dragTo(tag_1, button='left') ``` When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. But you should be careful to ensure that the coordinates are correct.