From 97b567a287e036744f6868957fabf7d38f072bf2 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Fri, 26 Apr 2024 13:32:41 +0800
Subject: [PATCH] Update README and ROADMAP; fix typos; optimize the code for LLM calling in agent.py

---
 README.md                        |  13 ++-
 ROADMAP.md                       |   2 +-
 desktop_env/controllers/setup.py |   2 +-
 mm_agents/agent.py               | 162 +++++++++++++++----------------
 4 files changed, 90 insertions(+), 89 deletions(-)

diff --git a/README.md b/README.md
index be0dfcb..0a92158 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ pip install desktop-env
 vmrun -T ws list
 ```
 If the installation and the environment variable setup succeeded, you will see a message listing the currently running virtual machines.
+> **Note:** We also plan to support [VirtualBox](https://www.virtualbox.org/) in the near future for those who have issues with VMware Pro. However, features such as parallelism and macOS on Apple chips will not be supported on VirtualBox.
 
 All set! Our setup script will automatically download the necessary virtual machines and configure the environment for you.
 
@@ -135,6 +136,14 @@ Correctly implement the agent interface and import your customized version in the
 Afterward, you can execute a command similar to the one in the previous section to run the benchmark on your agent.
 
 ## ❓ FAQ
+### What is the username and password for the virtual machines?
+The username and password for the virtual machines are as follows:
+- **Ubuntu:** `user` / `password`
+
+### How can I configure a proxy for the VM if I'm behind a GFW?
+
+See [Proxy Guideline](PROXY_GUIDELINE.md).
+
 ### What are the running times and costs under different settings?
 | Setting                        | Expected Time* | Budget Cost (Full Test Set/Small Test Set) |
 | ------------------------------ | -------------- | ------------------------------------------ |
@@ -145,10 +154,6 @@ Afterward, you can execute a command similar to the one in the previous section
 
 \*No environment parallelism. Calculated in April 2024.
 
-### How can I configure a proxy for the VM if I'm behind a GFW?
-
-See [Proxy Guideline](PROXY_GUIDELINE.md)
-
 ## 📄 Citation
 If you find this environment useful, please consider citing our work:
 ```
diff --git a/ROADMAP.md b/ROADMAP.md
index 5b152b2..f842205 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -24,7 +24,7 @@ If you are interested in contributing to the project, please check the [CONTRIBU
 - [ ] VPN setup doc for those who need it
 - [ ] Support running on platforms that have nested virtualization, e.g. Google Cloud, AWS, etc.
 - [ ] Prepare for the first release of Windows vm image for the environment
-- [ ] Be able to run without virtual machine platform VMware
+- [ ] Be able to run without the VMware Pro virtual machine platform, e.g. on VirtualBox or other platforms
 
 ## Road Map of Annotation Tool
diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py
index 1de4ec0..3bacb25 100644
--- a/desktop_env/controllers/setup.py
+++ b/desktop_env/controllers/setup.py
@@ -242,7 +242,7 @@ class SetupController:
             until: Optional[Dict[str, Any]] = None
     ):
         if not command:
-            raise Exception("Empty comman to launch.")
+            raise Exception("Empty command to launch.")
 
         until: Dict[str, Any] = until or {}
         terminates: bool = False
diff --git a/mm_agents/agent.py b/mm_agents/agent.py
index 8d9494d..da28ea8 100644
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -1,5 +1,4 @@
 import base64
-import hashlib
 import json
 import logging
 import os
@@ -9,7 +8,6 @@ import time
 import xml.etree.ElementTree as ET
 from http import HTTPStatus
 from io import BytesIO
-from pathlib import Path
 from typing import Dict, List
 
 import backoff
@@ -19,7 +17,7 @@ import openai
 import requests
 import tiktoken
 from PIL import Image
-from google.api_core.exceptions import InvalidArgument
+from google.api_core.exceptions import InvalidArgument, ResourceExhausted, InternalServerError, BadRequest
 
 from mm_agents.accessibility_tree_wrap.heuristic_retrieve import filter_nodes, draw_bounding_boxes
 from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
@@ -487,13 +485,17 @@ class PromptAgent:
 
         # logger.info("PROMPT: %s", messages)
 
-        response = self.call_llm({
-            "model": self.model,
-            "messages": messages,
-            "max_tokens": self.max_tokens,
-            "top_p": self.top_p,
-            "temperature": self.temperature
-        })
+        try:
+            response = self.call_llm({
+                "model": self.model,
+                "messages": messages,
+                "max_tokens": self.max_tokens,
+                "top_p": self.top_p,
+                "temperature": self.temperature
+            })
+        except Exception as e:
+            logger.error("Failed to call %s, Error: %s", self.model, e)
+            response = ""
 
         logger.info("RESPONSE: %s", response)
 
@@ -512,10 +514,18 @@ class PromptAgent:
         # here you can add more model-specific exceptions as needed,
         # but you must not add the generic "Exception" itself,
         # because we want to catch it outside to ensure each example won't exceed the time limit
-        (openai.RateLimitError,
-         openai.BadRequestError,
-         openai.InternalServerError,
-         InvalidArgument),
+        (
+            # OpenAI exceptions
+            openai.RateLimitError,
+            openai.BadRequestError,
+            openai.InternalServerError,
+
+            # Google exceptions
+            InvalidArgument,
+            ResourceExhausted,
+            InternalServerError,
+            BadRequest,
+        ),
         max_tries=5
     )
     def call_llm(self, payload):
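The retry policy above is worth spelling out: `backoff.on_exception` retries only the exception types listed in the tuple, so splitting it into OpenAI and Google exceptions controls exactly which API failures get retried, while anything else (including a bare `Exception`) propagates to the caller, where the new `try`/`except` around `call_llm` logs the failure and falls back to an empty response. A minimal, self-contained sketch of this behavior (`TransientError`, `flaky_call`, and the failure sequence are hypothetical; an exponential wait generator is assumed):

```python
import backoff


class TransientError(Exception):
    """Stands in for a retryable API error such as openai.RateLimitError."""


attempts = {"n": 0}


@backoff.on_exception(
    backoff.expo,        # assumed wait strategy; the decorator in agent.py may differ
    (TransientError,),   # only these types are retried; others propagate immediately
    max_tries=5
)
def flaky_call() -> str:
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise TransientError("simulated rate limit")
    return "ok"


try:
    print(flaky_call())  # fails twice, succeeds on the third attempt: prints "ok"
except Exception as e:
    # mirrors the fallback added around call_llm: log and degrade to ""
    print("Failed to call model, Error: %s" % e)
```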
"candidate_count": 1, + "max_output_tokens": max_tokens, + "top_p": top_p, + "temperature": temperature + }, + safety_settings={ + "harassment": "block_none", + "hate": "block_none", + "sex": "block_none", + "danger": "block_none" + }, + request_options=request_options + ) + return response.text + elif self.model == "gemini-1.5-pro-latest": messages = payload["messages"] @@ -797,19 +803,6 @@ class PromptAgent: top_p = payload["top_p"] temperature = payload["temperature"] - uploaded_files = [] - - # def upload_if_needed(pathname: str) -> list[str]: - # path = Path(pathname) - # hash_id = hashlib.sha256(path.read_bytes()).hexdigest() - # try: - # existing_file = genai.get_file(name=hash_id) - # return [existing_file.uri] - # except: - # pass - # uploaded_files.append(genai.upload_file(path=path, display_name=hash_id)) - # return [uploaded_files[-1].uri] - gemini_messages = [] for i, message in enumerate(messages): role_mapping = { @@ -818,21 +811,23 @@ class PromptAgent: "system": "system" } assert len(message["content"]) in [1, 2], "One text, or one text with one image" + gemini_message = { + "role": role_mapping[message["role"]], + "parts": [] + } # The gemini only support the last image as single image input for part in message["content"]: - gemini_message = { - "role": role_mapping[message["role"]], - "parts": [] - } + if part['type'] == "image_url": - gemini_message['parts'].append(encoded_img_to_pil_img(part['image_url']['url'])) + # Put the image at the beginning of the message + gemini_message['parts'].insert(0, encoded_img_to_pil_img(part['image_url']['url'])) elif part['type'] == "text": gemini_message['parts'].append(part['text']) else: raise ValueError("Invalid content type: " + part['type']) - gemini_messages.append(gemini_message) + gemini_messages.append(gemini_message) # the system message of gemini-1.5-pro-latest need to be inputted through model initialization parameter system_instruction = None @@ -849,33 +844,34 @@ class PromptAgent: self.model, system_instruction=system_instruction ) - try: - response = gemini_model.generate_content( - gemini_messages, - generation_config={ - "candidate_count": 1, - "max_output_tokens": max_tokens, - "top_p": top_p, - "temperature": temperature - }, - safety_settings={ - "harassment": "block_none", - "hate": "block_none", - "sex": "block_none", - "danger": "block_none" - }, - request_options=request_options - ) - for uploaded_file in uploaded_files: - genai.delete_file(name=uploaded_file.name) - return response.text - except Exception as e: - logger.error("Meet exception when calling Gemini API, " + str(e.__class__.__name__) + str(e)) - logger.error(f"count_tokens: {gemini_model.count_tokens(gemini_messages)}") - logger.error(f"generation_config: {max_tokens}, {top_p}, {temperature}") - for uploaded_file in uploaded_files: - genai.delete_file(name=uploaded_file.name) - return "" + + with open("response.json", "w") as f: + messages_to_save = [] + for message in gemini_messages: + messages_to_save.append({ + "role": message["role"], + "content": [part if isinstance(part, str) else "image" for part in message["parts"]] + }) + json.dump(messages_to_save, f, indent=4) + + response = gemini_model.generate_content( + gemini_messages, + generation_config={ + "candidate_count": 1, + "max_output_tokens": max_tokens, + "top_p": top_p, + "temperature": temperature + }, + safety_settings={ + "harassment": "block_none", + "hate": "block_none", + "sex": "block_none", + "danger": "block_none" + }, + request_options=request_options + ) + + return 
@@ -849,33 +844,34 @@ class PromptAgent:
                 self.model,
                 system_instruction=system_instruction
             )
-            try:
-                response = gemini_model.generate_content(
-                    gemini_messages,
-                    generation_config={
-                        "candidate_count": 1,
-                        "max_output_tokens": max_tokens,
-                        "top_p": top_p,
-                        "temperature": temperature
-                    },
-                    safety_settings={
-                        "harassment": "block_none",
-                        "hate": "block_none",
-                        "sex": "block_none",
-                        "danger": "block_none"
-                    },
-                    request_options=request_options
-                )
-                for uploaded_file in uploaded_files:
-                    genai.delete_file(name=uploaded_file.name)
-                return response.text
-            except Exception as e:
-                logger.error("Meet exception when calling Gemini API, " + str(e.__class__.__name__) + str(e))
-                logger.error(f"count_tokens: {gemini_model.count_tokens(gemini_messages)}")
-                logger.error(f"generation_config: {max_tokens}, {top_p}, {temperature}")
-                for uploaded_file in uploaded_files:
-                    genai.delete_file(name=uploaded_file.name)
-                return ""
+            # Dump the converted messages for debugging; images are replaced by a placeholder
+            with open("response.json", "w") as f:
+                messages_to_save = []
+                for message in gemini_messages:
+                    messages_to_save.append({
+                        "role": message["role"],
+                        "content": [part if isinstance(part, str) else "image" for part in message["parts"]]
+                    })
+                json.dump(messages_to_save, f, indent=4)
+
+            response = gemini_model.generate_content(
+                gemini_messages,
+                generation_config={
+                    "candidate_count": 1,
+                    "max_output_tokens": max_tokens,
+                    "top_p": top_p,
+                    "temperature": temperature
+                },
+                safety_settings={
+                    "harassment": "block_none",
+                    "hate": "block_none",
+                    "sex": "block_none",
+                    "danger": "block_none"
+                },
+                request_options=request_options
+            )
+
+            return response.text
 
         elif self.model.startswith("qwen"):
             messages = payload["messages"]
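The message conversion above relies on `encoded_img_to_pil_img`, a helper defined elsewhere in `agent.py`. Presumably it decodes a base64 image (optionally a data URL) into a PIL image, along the lines of this sketch; the repo's actual implementation may differ:

```python
import base64
from io import BytesIO

from PIL import Image


def encoded_img_to_pil_img(data_str: str) -> Image.Image:
    """Decode a base64 image (optionally a data URL) into a PIL image.

    Sketch of what a helper like agent.py's encoded_img_to_pil_img might do.
    """
    # Strip a "data:image/...;base64," prefix if present
    if data_str.startswith("data:") and "," in data_str:
        data_str = data_str.split(",", 1)[1]
    return Image.open(BytesIO(base64.b64decode(data_str)))


# Round-trip demo: encode a tiny image, then decode it back
buf = BytesIO()
Image.new("RGB", (2, 2), "red").save(buf, format="PNG")
encoded = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
img = encoded_img_to_pil_img(encoded)
print(img.size)  # (2, 2)
```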