From ee851aeb543174ce791d1073daaf13d26f0440b7 Mon Sep 17 00:00:00 2001
From: tsuky_chen <91684733+chenjix@users.noreply.github.com>
Date: Thu, 1 Feb 2024 16:09:24 +0800
Subject: [PATCH 1/6] Update 0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json

---
 .../libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json b/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json
index 97ad2a1..56ba374 100644
--- a/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json
+++ b/evaluation_examples/examples/libreoffice_calc/0cecd4f3-74de-457b-ba94-29ad6b5dafb6.json
@@ -1,7 +1,7 @@
 {
   "id": "0cecd4f3-74de-457b-ba94-29ad6b5dafb6",
   "snapshot": "libreoffice_calc",
-  "instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a which space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".",
+  "instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a white space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".",
   "source": "https://www.libreofficehelp.com/add-insert-delete-copy-move-rename-a-worksheet-in-libreoffice-calc/",
   "config": [
     {

From 62f50cdc26ceb67600806a38a0497f8a9c4eedaa Mon Sep 17 00:00:00 2001
From: tsuky_chen <91684733+chenjix@users.noreply.github.com>
Date: Thu, 1 Feb 2024 16:10:47 +0800
Subject: [PATCH 2/6] Update 7a4e4bc8-922c-4c84-865c-25ba34136be1.json

---
 .../libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json b/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json
index 99133d0..58857fa 100644
--- a/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json
+++ b/evaluation_examples/examples/libreoffice_calc/7a4e4bc8-922c-4c84-865c-25ba34136be1.json
@@ -1,7 +1,7 @@
 {
   "id": "7a4e4bc8-922c-4c84-865c-25ba34136be1",
   "snapshot": "libreoffice_calc",
-  "instruction": "Reorder the columns to be \"Data\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
+  "instruction": "Reorder the columns to be \"Date\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
   "source": "https://www.youtube.com/shorts/bvUhr1AHs44",
   "config": [
     {

From 660cbe908153a40876bd611547eedc6b1f3c29b7 Mon Sep 17 00:00:00 2001
From: rhythmcao <ruishengcao@gmail.com>
Date: Thu, 1 Feb 2024 16:21:19 +0800
Subject: [PATCH 3/6] expert human test for multi-app finished, fix some small
 issues

---
 .../46407397-a7d5-4c6b-92c6-dbe038b1457b.json      |  2 +-
 .../897e3b53-5d4d-444b-85cb-2cdc8a97d903.json      |  2 +-
 .../b52b40a5-ad70-4c53-b5b0-5650a8387052.json      | 14 +++++++-------
 .../settings/googledrive/credentials.json          |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json b/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json
index f4fd1bf..db41601 100644
--- a/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json
+++ b/evaluation_examples/examples/multi_apps/46407397-a7d5-4c6b-92c6-dbe038b1457b.json
@@ -68,7 +68,7 @@
             "parameters": {
                 "command": [
                 "tar",
-                "-xzv",
+                "-xz",
                 "--recursive-unlink",
                 "-f",
                 "/home/user/thunderbird-profile.tar.gz",
diff --git a/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json b/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json
index 9da5cc0..6417237 100644
--- a/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json
+++ b/evaluation_examples/examples/multi_apps/897e3b53-5d4d-444b-85cb-2cdc8a97d903.json
@@ -1,7 +1,7 @@
 {
     "id": "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
     "snapshot": "chrome",
-    "instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store it in the forms/ folder in my Google Drive.",
+    "instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store the PDF in the forms/ folder in my Google Drive.",
     "source": "https://marketplace.uipath.com/listings/convert-word-file-to-pdf-and-store-in-onedrive",
     "config": [
         {
diff --git a/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json b/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json
index 25b9024..e9e879d 100644
--- a/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json
+++ b/evaluation_examples/examples/multi_apps/b52b40a5-ad70-4c53-b5b0-5650a8387052.json
@@ -68,13 +68,13 @@
             "type": "execute",
             "parameters": {
                 "command": [
-                "tar",
-                "-xzv",
-                "--recursive-unlink",
-                "-f",
-                "/home/user/thunderbird-profile.tar.gz",
-                "-C",
-                "/home/user/"
+                    "tar",
+                    "-xz",
+                    "--recursive-unlink",
+                    "-f",
+                    "/home/user/thunderbird-profile.tar.gz",
+                    "-C",
+                    "/home/user/"
                 ]
             }
         },
diff --git a/evaluation_examples/settings/googledrive/credentials.json b/evaluation_examples/settings/googledrive/credentials.json
index 1be4912..34d7be4 100644
--- a/evaluation_examples/settings/googledrive/credentials.json
+++ b/evaluation_examples/settings/googledrive/credentials.json
@@ -1 +1 @@
-{"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-01-31T14:41:25Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}
\ No newline at end of file
+{"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-02-01T08:29:08Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}
\ No newline at end of file

From fe21c4753390ebbbdde8bbec084ed228e5ea657e Mon Sep 17 00:00:00 2001
From: BlankCheng <913501223@qq.com>
Date: Thu, 1 Feb 2024 16:27:24 +0800
Subject: [PATCH 4/6] Fix samples of Impress and OS

---
 .../21760ecb-8f62-40d2-8d85-0cee5725cb72.json                 | 2 +-
 .../5d901039-a89c-4bfb-967b-bf66f4df075e.json                 | 2 +-
 .../9ec204e4-f0a3-42f8-8458-b772a6797cab.json                 | 2 +-
 .../af23762e-2bfd-4a1d-aada-20fa8de9ce07.json                 | 4 ++--
 .../examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json     | 2 +-
 .../examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json     | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json
index 70bd696..173c5ac 100644
--- a/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json
+++ b/evaluation_examples/examples/libreoffice_impress/21760ecb-8f62-40d2-8d85-0cee5725cb72.json
@@ -1,7 +1,7 @@
 {
   "id": "21760ecb-8f62-40d2-8d85-0cee5725cb72",
   "snapshot": "libreoffice_impress",
-  "instruction": "Could you help me add silde transition \"dissolve\" to my first page?",
+  "instruction": "Could you help me add slide transition \"dissolve\" to my first page?",
   "source": "https://www.libreofficehelp.com/add-animations-transitions-libreoffice-impress-slides/",
   "config": [
     {
diff --git a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json
index 03daf6c..9c4f050 100644
--- a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json
+++ b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json
@@ -1,7 +1,7 @@
 {
   "id": "5d901039-a89c-4bfb-967b-bf66f4df075e",
   "snapshot": "libreoffice_impress",
-  "instruction": "I want to make this page my cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image.",
+  "instruction": "I want to turn the rectangular image of Columbus on the first page into a cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image?",
   "source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag",
   "config": [
     {
diff --git a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json
index 7c85a91..07be4b5 100644
--- a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json
+++ b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json
@@ -62,7 +62,7 @@
     "expected": {
       "type": "cloud_file",
       "path": "https://drive.usercontent.google.com/download?id=1otbzscpOZ0tCXMvsMC0MmNWUC7Pv71of&export=download&authuser=0&confirm=t&uuid=faa0b0c1-6b14-4bce-a1fd-ccf824ee1e60&at=APZUnTXw6TlBOlrPPZ2OhfGnNPf0:1705338135842",
-      "dest": "MLA_Workshop_061X_Works_Cited_Gold.docx"
+      "dest": "MLA_Workshop_061X_Works_Cited_Gold.pptx"
     },
     "result": {
       "type": "vm_file",
diff --git a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json
index 8ca8792..e93015a 100644
--- a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json
+++ b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json
@@ -2,7 +2,7 @@
   "id": "af23762e-2bfd-4a1d-aada-20fa8de9ce07",
   "snapshot": "libreoffice_impress",
   "instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. I need to summarize contents on one slide use Impress \"Summary Slide\" feature. Could you make that for me?",
-  "source": "https://www.libreofficehelp.com/export-libreoffice-impress-slides-images/#:~:text=Exporting%20a%20single%20slide%20as.jpg%2C.png%2C%20etc%20image%20is,on%20the%20checkbox%20Selection.%20Provide%20jpg%20quality%20options.",
+  "source": "https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom",
   "config": [
     {
       "type": "download",
@@ -62,7 +62,7 @@
     "expected": {
       "type": "cloud_file",
       "path": "https://drive.usercontent.google.com/download?id=1nRwmFgYdskv3EiriZZFoT8TzM9CsG5B0&export=download&authuser=0&confirm=t&uuid=f2f919df-2867-4bc3-8bb9-dabd51108ebb&at=APZUnTWzw9LJWWXvH0cvdaWL-Ij-:1705319339474",
-      "dest": "Forests_Gold.docx"
+      "dest": "Forests_Gold.pptx"
     },
     "result": {
       "type": "vm_file",
diff --git a/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json b/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json
index da8f5c4..2c0d1d1 100644
--- a/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json
+++ b/evaluation_examples/examples/os/7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82.json
@@ -1,7 +1,7 @@
 {
     "id": "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82",
     "snapshot": "os",
-    "instruction": "Can you move the file with the path 'todo.txt' on the Desktop to the directory with the path 'done' on the Desktop?",
+    "instruction": "Can you move the file 'todo.txt' on the Desktop to the directory 'done/' on the Desktop?",
     "source": "https://ubuntu.com/tutorials/command-line-for-beginners#5-moving-and-manipulating-files",
     "config": [
       {
diff --git a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json
index 6c67c0c..97e8545 100644
--- a/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json
+++ b/evaluation_examples/examples/os/bedcedc4-4d72-425e-ad62-21960b11fe0d.json
@@ -1,7 +1,7 @@
 {
   "id": "bedcedc4-4d72-425e-ad62-21960b11fe0d",
   "snapshot": "os",
-  "instruction": "Could you set the 'Dim screen when inactive' to on in setting?",
+  "instruction": "Could you set the 'Dim screen when inactive' to off in setting?",
   "source": "https://www.youtube.com/watch?v=D4WyNjt_hbQ&t=2s",
   "trajectory": "trajectories/",
   "config": [

From 59e2417a081d857c5f2a961d8a3bbc5a41650301 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Thu, 1 Feb 2024 16:55:38 +0800
Subject: [PATCH 5/6] Add Mistral, Qwen, Gemini support; Fix minor bugs

---
 desktop_env/envs/desktop_env.py               |   2 +-
 desktop_env/evaluators/metrics/gimp.py        |   3 +
 .../66399b0d-8fda-4618-95c4-bfc6191617e9.json |   2 +-
 .../6ada715d-3aae-4a32-a6a7-429b2e43fb93.json |   2 +-
 mm_agents/gemini_pro_agent.py                 | 136 -------------
 mm_agents/gemini_pro_vision_agent.py          | 115 -----------
 mm_agents/gpt_4v_agent.py                     | 183 ++++++++++++++----
 7 files changed, 156 insertions(+), 287 deletions(-)
 delete mode 100644 mm_agents/gemini_pro_agent.py
 delete mode 100644 mm_agents/gemini_pro_vision_agent.py

diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index e870b21..b12fbca 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -30,7 +30,7 @@ def _execute_command(command: List[str]) -> None:
         p = subprocess.Popen(command)
         p.wait()
     else:
-        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
+        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True, encoding="utf-8")
         if result.returncode != 0:
             raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
         return result.stdout
diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py
index a1d3a82..30c9b68 100644
--- a/desktop_env/evaluators/metrics/gimp.py
+++ b/desktop_env/evaluators/metrics/gimp.py
@@ -328,6 +328,9 @@ def check_structure_sim(src_path, tgt_path):
     Check if the structure of the two images are similar
     gimp:2a729ded-3296-423d-aec4-7dd55ed5fbb3
     """
+    if src_path is None or tgt_path is None:
+        return 0.
+
     img_src = Image.open(src_path)
     img_tgt = Image.open(tgt_path)
     structure_same = structure_check_by_ssim(img_src, img_tgt)
diff --git a/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json b/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json
index a7951e4..33b8a1a 100644
--- a/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json
+++ b/evaluation_examples/examples/libreoffice_writer/66399b0d-8fda-4618-95c4-bfc6191617e9.json
@@ -27,7 +27,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=40, interval=0.1); time.sleep(1); pyautogui.scroll(-2)"
+          "import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=40, interval=10); time.sleep(1); pyautogui.scroll(-2)"
         ]
       }
     }
diff --git a/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json b/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json
index a7024d6..7151032 100644
--- a/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json
+++ b/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json
@@ -38,7 +38,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=8); time.sleep(1); pyautogui.scroll(-2)"
+          "import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=8, interval=3); time.sleep(1); pyautogui.scroll(-2)"
         ]
       }
     }
diff --git a/mm_agents/gemini_pro_agent.py b/mm_agents/gemini_pro_agent.py
deleted file mode 100644
index ce84488..0000000
--- a/mm_agents/gemini_pro_agent.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# todo: needs to be refactored
-
-import time
-from typing import Dict, List
-
-import google.generativeai as genai
-
-from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
-from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
-from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
-from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
-
-
-class GeminiPro_Agent:
-    def __init__(self, api_key, instruction, model='gemini-pro', max_tokens=300, temperature=0.0,
-                 action_space="computer_13"):
-        genai.configure(api_key=api_key)
-        self.instruction = instruction
-        self.model = genai.GenerativeModel(model)
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-        self.action_space = action_space
-
-        self.trajectory = [
-            {
-                "role": "system",
-                "parts": [
-                    {
-                        "computer_13": SYS_PROMPT_ACTION,
-                        "pyautogui": SYS_PROMPT_CODE
-                    }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
-                ]
-            }
-        ]
-
-    def predict(self, obs: Dict) -> List:
-        """
-        Predict the next action(s) based on the current observation.
-        Only support single-round conversation, only fill-in the last desktop screenshot.
-        """
-        accessibility_tree = obs["accessibility_tree"]
-
-        leaf_nodes = find_leaf_nodes(accessibility_tree)
-        filtered_nodes = filter_nodes(leaf_nodes)
-
-        linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
-        # Linearize the accessibility tree nodes into a table format
-
-        for node in filtered_nodes:
-            linearized_accessibility_tree += node.tag + "\t"
-            linearized_accessibility_tree += node.attrib.get('name') + "\t"
-            linearized_accessibility_tree += node.attrib.get(
-                '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
-            linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
-
-        self.trajectory.append({
-            "role": "user",
-            "parts": [
-                "Given the XML format of accessibility tree (convert and formatted into table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
-                    linearized_accessibility_tree)]
-        })
-
-        # todo: Remove this step once the Gemini supports multi-round conversation
-        all_message_str = ""
-        for i in range(len(self.trajectory)):
-            if i == 0:
-                all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
-            elif i % 2 == 1:
-                all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
-            else:
-                all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
-
-            all_message_str += all_message_template.format(self.trajectory[i]["parts"][0])
-
-        print("All message: >>>>>>>>>>>>>>>> ")
-        print(
-            all_message_str
-        )
-
-        message_for_gemini = {
-            "role": "user",
-            "parts": [all_message_str]
-        }
-
-        traj_to_show = []
-        for i in range(len(self.trajectory)):
-            traj_to_show.append(self.trajectory[i]["parts"][0])
-            if len(self.trajectory[i]["parts"]) > 1:
-                traj_to_show.append("screenshot_obs")
-
-        print("Trajectory:", traj_to_show)
-
-        while True:
-            try:
-                response = self.model.generate_content(
-                    message_for_gemini,
-                    generation_config={
-                        "max_output_tokens": self.max_tokens,
-                        "temperature": self.temperature
-                    }
-                )
-                break
-            except:
-                print("Failed to generate response, retrying...")
-                time.sleep(5)
-                pass
-
-        try:
-            response_text = response.text
-        except:
-            return []
-
-        try:
-            actions = self.parse_actions(response_text)
-        except:
-            print("Failed to parse action from response:", response_text)
-            actions = []
-
-        return actions
-
-    def parse_actions(self, response: str):
-        # parse from the response
-        if self.action_space == "computer_13":
-            actions = parse_actions_from_string(response)
-        elif self.action_space == "pyautogui":
-            actions = parse_code_from_string(response)
-        else:
-            raise ValueError("Invalid action space: " + self.action_space)
-
-        # add action into the trajectory
-        self.trajectory.append({
-            "role": "assistant",
-            "parts": [response]
-        })
-
-        return actions
diff --git a/mm_agents/gemini_pro_vision_agent.py b/mm_agents/gemini_pro_vision_agent.py
deleted file mode 100644
index 4a537db..0000000
--- a/mm_agents/gemini_pro_vision_agent.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# todo: needs to be refactored
-
-import time
-from typing import Dict, List
-
-import PIL.Image
-import google.generativeai as genai
-
-from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
-from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
-from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
-
-
-class GeminiProV_Agent:
-    def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0,
-                 action_space="computer_13"):
-        genai.configure(api_key=api_key)
-        self.instruction = instruction
-        self.model = genai.GenerativeModel(model)
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-        self.action_space = action_space
-
-        self.trajectory = [
-            {
-                "role": "system",
-                "parts": [
-                    {
-                        "computer_13": SYS_PROMPT_ACTION,
-                        "pyautogui": SYS_PROMPT_CODE
-                    }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
-                ]
-            }
-        ]
-
-    def predict(self, obs: Dict) -> List:
-        """
-        Predict the next action(s) based on the current observation.
-        Only support single-round conversation, only fill-in the last desktop screenshot.
-        """
-        img = PIL.Image.open(obs["screenshot"])
-        self.trajectory.append({
-            "role": "user",
-            "parts": ["What's the next step that you will do to help with the task?", img]
-        })
-
-        # todo: Remove this step once the Gemini supports multi-round conversation
-        all_message_str = ""
-        for i in range(len(self.trajectory)):
-            if i == 0:
-                all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
-            elif i % 2 == 1:
-                all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
-            else:
-                all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
-
-            all_message_str += all_message_template.format(self.trajectory[i]["parts"][0])
-
-        message_for_gemini = {
-            "role": "user",
-            "parts": [all_message_str, img]
-        }
-
-        traj_to_show = []
-        for i in range(len(self.trajectory)):
-            traj_to_show.append(self.trajectory[i]["parts"][0])
-            if len(self.trajectory[i]["parts"]) > 1:
-                traj_to_show.append("screenshot_obs")
-
-        print("Trajectory:", traj_to_show)
-
-        while True:
-            try:
-                response = self.model.generate_content(
-                    message_for_gemini,
-                    generation_config={
-                        "max_output_tokens": self.max_tokens,
-                        "temperature": self.temperature
-                    }
-                )
-                break
-            except:
-                print("Failed to generate response, retrying...")
-                time.sleep(5)
-                pass
-
-        try:
-            response_text = response.text
-        except:
-            return []
-
-        try:
-            actions = self.parse_actions(response_text)
-        except:
-            print("Failed to parse action from response:", response_text)
-            actions = []
-
-        return actions
-
-    def parse_actions(self, response: str):
-        # parse from the response
-        if self.action_space == "computer_13":
-            actions = parse_actions_from_string(response)
-        elif self.action_space == "pyautogui":
-            actions = parse_code_from_string(response)
-        else:
-            raise ValueError("Invalid action space: " + self.action_space)
-
-        # add action into the trajectory
-        self.trajectory.append({
-            "role": "assistant",
-            "parts": [response]
-        })
-
-        return actions
diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py
index 0dd15cf..68c07f3 100644
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,12 +1,20 @@
 import base64
 import json
+import logging
 import os
 import re
+import time
 import uuid
+from http import HTTPStatus
+from io import BytesIO
 from typing import Dict, List
 
 import backoff
+import dashscope
+import google.generativeai as genai
+import openai
 import requests
+from PIL import Image
 from openai.error import (
     APIConnectionError,
     APIError,
@@ -22,8 +30,6 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S
     SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
     SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT
 
-import logging
-
 logger = logging.getLogger("desktopenv.agent")
 
 
@@ -44,11 +50,13 @@ def linearize_accessibility_tree(accessibility_tree):
         linearized_accessibility_tree += node.tag + "\t"
         linearized_accessibility_tree += node.attrib.get('name') + "\t"
         if node.text:
-            linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(node.text.replace('"', '""'))) + "\t"
-        elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper")\
+            linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
+                node.text.replace('"', '""'))) + "\t"
+        elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
                 and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
             text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
-            linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(text.replace('"', '""'))) + "\t"
+            linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
+                text.replace('"', '""'))) + "\t"
         else:
             linearized_accessibility_tree += '""\t'
         linearized_accessibility_tree += node.attrib.get(
@@ -145,10 +153,21 @@ def parse_code_from_som_string(input_string, masks):
         x, y, w, h = mask
         mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2))))
 
-    # reverse the mappings
-    for mapping in mappings[::-1]:
-        input_string = input_string.replace(mapping[0], mapping[1])
+    def replace_tags_with_mappings(text, mappings):
+        """Swap each `tag#N` in text for its mapped "x, y" centre; log and return "" on an unmapped tag."""
+        centres = dict(mappings)
 
+        def _lookup(found):
+            # Whole-token lookup: avoids str.replace turning tag#10 into "<tag#1 centre>0".
+            return centres[found.group(0)]
+
+        try:
+            return re.sub(r'tag#\d+', _lookup, text)
+        except KeyError as missing:
+            logger.error("Predicting the tag with index {} failed.".format(missing.args[0]))
+            return ""
+
+    input_string = replace_tags_with_mappings(input_string, mappings)
     actions = parse_code_from_string(input_string)
     return actions
 
@@ -295,7 +314,7 @@ class GPT4v_Agent:
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/jpeg;base64,{_screenshot}",
+                                "url": f"data:image/png;base64,{_screenshot}",
                                 "detail": "high"
                             }
                         }
@@ -314,7 +333,7 @@ class GPT4v_Agent:
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/jpeg;base64,{_screenshot}",
+                                "url": f"data:image/png;base64,{_screenshot}",
                                 "detail": "high"
                             }
                         }
@@ -375,7 +394,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{base64_image}",
+                            "url": f"data:image/png;base64,{base64_image}",
                             "detail": "high"
                         }
                     }
@@ -421,7 +440,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{base64_image}",
+                            "url": f"data:image/png;base64,{base64_image}",
                             "detail": "high"
                         }
                     }
@@ -448,7 +467,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{base64_image}",
+                            "url": f"data:image/png;base64,{base64_image}",
                             "detail": "high"
                         }
                     }
@@ -510,32 +529,130 @@ class GPT4v_Agent:
     @backoff.on_exception(
         backoff.expo,
         (APIError, RateLimitError, APIConnectionError, ServiceUnavailableError, InvalidRequestError),
-        max_tries=3
+        max_tries=10
     )
     def call_llm(self, payload):
-        response = requests.post(
-            "https://api.openai.com/v1/chat/completions",
-            headers=self.headers,
-            json=payload
-        )
+        if self.model.startswith("gpt"):
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=self.headers,
+                json=payload
+            )
 
-        if response.status_code != 200:
-            if response.json()['error']['code'] == "context_length_exceeded":
-                print("Context length exceeded. Retrying with a smaller context.")
-                payload["messages"] = payload["messages"][-1:]
-                retry_response = requests.post(
-                    "https://api.openai.com/v1/chat/completions",
-                    headers=self.headers,
-                    json=payload
-                )
-                if retry_response.status_code != 200:
-                    print("Failed to call LLM: " + retry_response.text)
+            if response.status_code != 200:
+                if response.json()['error']['code'] == "context_length_exceeded":
+                    print("Context length exceeded. Retrying with a smaller context.")
+                    payload["messages"] = payload["messages"][-1:]
+                    retry_response = requests.post(
+                        "https://api.openai.com/v1/chat/completions",
+                        headers=self.headers,
+                        json=payload
+                    )
+                    if retry_response.status_code != 200:
+                        print("Failed to call LLM: " + retry_response.text)
+                        return ""
+
+                print("Failed to call LLM: " + response.text)
+                time.sleep(5)
+                return ""
+            else:
+                return response.json()['choices'][0]['message']['content']
+
+        elif self.model.startswith("mistral"):
+            messages = payload["messages"]
+            max_tokens = payload["max_tokens"]
+
+            openai.api_base = "http://localhost:8000/v1"
+            openai.api_key = "test"
+            response = openai.ChatCompletion.create(
+                messages=messages,
+                model="Mixtral-8x7B-Instruct-v0.1",
+                max_tokens=max_tokens
+            )
+            try:
+                return response['choices'][0]['message']['content']
+            except Exception as e:
+                return ""
+
+        elif self.model.startswith("gemini"):
+
+            api_key = os.environ.get("GENAI_API_KEY")
+            genai.api_key = api_key
+            def encoded_img_to_pil_img(data_str):
+                base64_str = data_str.replace("data:image/png;base64,", "")
+                image_data = base64.b64decode(base64_str)
+                image = Image.open(BytesIO(image_data))
+
+                return image
+
+            messages = payload["messages"]
+            max_tokens = payload["max_tokens"]
+
+            gemini_messages = []
+            for i, message in enumerate(messages):
+                gemini_message = {
+                    "role": message["role"],
+                    "parts": []
+                }
+                assert len(message["content"]) in [1, 2], "One text, or one text with one image"
+
+                # Gemini only supports a single image input, so only the last message keeps its image
+                if i == len(messages) - 1:
+                    for part in message["content"]:
+                        gemini_message['parts'].append(part['text']) if part['type'] == "text" \
+                            else gemini_message['parts'].append(encoded_img_to_pil_img(part['image_url']['url']))
+                else:
+                    for part in message["content"]:
+                        gemini_message['parts'].append(part['text']) if part['type'] == "text" else None
+
+                gemini_messages.append(gemini_message)
+
+            response = genai.GenerativeModel(self.model).generate_content(
+                gemini_messages,
+                generation_config={
+                    "max_output_tokens": max_tokens
+                }
+            )
+
+            try:
+                return response.text
+            except Exception as e:
+                return ""
+        elif self.model.startswith("qwen"):
+            messages = payload["messages"]
+            max_tokens = payload["max_tokens"]
+
+            qwen_messages = []
+
+            for i, message in enumerate(messages):
+                qwen_message = {
+                    "role": message["role"],
+                    "content": []
+                }
+                assert len(message["content"]) in [1, 2], "One text, or one text with one image"
+                for part in message["content"]:
+                    qwen_message['content'].append({"image": part['image_url']['url']}) if part['type'] == "image_url" else None
+                    qwen_message['content'].append({"text": part['text']}) if part['type'] == "text" else None
+
+                qwen_messages.append(qwen_message)
+
+            response = dashscope.MultiModalConversation.call(model='qwen-vl-plus',
+                                                             messages=messages)
+            # A status_code of HTTPStatus.OK indicates success; otherwise the
+            # request failed, and the error code and message are available
+            # from response.code and response.message.
+            if response.status_code == HTTPStatus.OK:
+                try:
+                    return response.json()['output']['choices'][0]['message']['content']
+                except Exception as e:
                     return ""
+            else:
+                print(response.code)  # The error code.
+                print(response.message)  # The error message.
+                return ""
 
-            print("Failed to call LLM: " + response.text)
-            return ""
         else:
-            return response.json()['choices'][0]['message']['content']
+            raise ValueError("Invalid model: " + self.model)
 
     def parse_actions(self, response: str, masks=None):
 

From 32bcdd093761033a99d4b374d65a750489047004 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Thu, 1 Feb 2024 18:58:22 +0800
Subject: [PATCH 6/6] Modify the logic of SoM agent

---
 mm_agents/gpt_4v_agent.py | 51 ++++++++++++++++++++++++++-------------
 mm_agents/prompts.py      | 12 ++++-----
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py
index 596f5ee..b94a4f9 100644
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -150,24 +150,21 @@ def parse_code_from_string(input_string):
 
 def parse_code_from_som_string(input_string, masks):
     # parse the output string by masks
-    mappings = []
+    tag_vars = ""
     for i, mask in enumerate(masks):
         x, y, w, h = mask
-        mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2))))
-
-    def replace_tags_with_mappings(text, mappings):
-        pattern = r'tag#\d+'
-        matches = re.findall(pattern, text)
-
-        for match in matches:
-            for mapping in mappings:
-                if match == mapping[0]:
-                    text = text.replace(match, mapping[1])
-                    break
-                logger.error("Predicting the tag with index {} failed.".format(match))
-                return ""
+        tag_vars += "tag_" + str(i + 1) + "=" + "({}, {})".format(int(x + w // 2), int(y + h // 2))
+        tag_vars += "\n"
 
     actions = parse_code_from_string(input_string)
+
+    for i, action in enumerate(actions):
+        if action.strip() in ['WAIT', 'DONE', 'FAIL']:
+            pass
+        else:
+            action = tag_vars + action
+            actions[i] = action
+
     return actions
 
 
@@ -561,19 +558,39 @@ class GPT4v_Agent:
                 return response.json()['choices'][0]['message']['content']
 
         elif self.model.startswith("mistral"):
+            print("call mistral")
             messages = payload["messages"]
             max_tokens = payload["max_tokens"]
 
+            misrtal_messages = []
+
+            for i, message in enumerate(messages):
+                mistral_message = {
+                    "role": message["role"],
+                    "content": []
+                }
+
+                for part in message["content"]:
+                    mistral_message['content'] = part['text'] if part['type'] == "text" else None
+
+                misrtal_messages.append(mistral_message)
+
+            # Our Mistral endpoint does not support system messages, so we prepend the system message to the first user message
+            if misrtal_messages[0]['role'] == "system":
+                misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content']
+                misrtal_messages.pop(0)
+
             openai.api_base = "http://localhost:8000/v1"
             openai.api_key = "test"
             response = openai.ChatCompletion.create(
-                messages=messages,
-                model="Mixtral-8x7B-Instruct-v0.1",
-                max_tokens=max_tokens
+                messages=misrtal_messages,
+                model="Mixtral-8x7B-Instruct-v0.1"
             )
+
             try:
                 return response['choices'][0]['message']['content']
             except Exception as e:
+                print("Failed to call LLM: " + str(e))
                 return ""
 
         elif self.model.startswith("gemini"):
diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py
index 85295de..e23a211 100644
--- a/mm_agents/prompts.py
+++ b/mm_agents/prompts.py
@@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an
 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as:
 ```python
-pyautogui.moveTo(tag#3)
-pyautogui.click(tag#2)
-pyautogui.dragTo(tag#1, button='left')
+pyautogui.moveTo(tag_3)
+pyautogui.click(tag_2)
+pyautogui.dragTo(tag_1, button='left')
 ```
 When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. 
 But you should be careful to ensure that the coordinates are correct.
@@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """
 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as:
 ```python
-pyautogui.moveTo(tag#3)
-pyautogui.click(tag#2)
-pyautogui.dragTo(tag#1, button='left')
+pyautogui.moveTo(tag_3)
+pyautogui.click(tag_2)
+pyautogui.dragTo(tag_1, button='left')
 ```
 When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. 
 But you should be careful to ensure that the coordinates are correct.