From 493b71982197971304212c630b3fcb9a2a99d940 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Mon, 15 Jan 2024 21:58:33 +0800
Subject: [PATCH] Add gemini agent implementation; Add missing requirements;
 Fix some minor bugs

---
 desktop_env/envs/desktop_env.py   |  7 +++-
 experiment.py                     | 26 ++++++-------
 mm_agents/fuyu_test.py            | 20 ----------
 mm_agents/gemini_agent.py         | 65 ++++++++++++++++++++-----------
 mm_agents/gemini_agent_text.py    |  0
 mm_agents/gpt_4v_agent.py         | 25 ++++--------
 mm_agents/gpt_4v_agent_text.py    |  0
 mm_agents/gpt_4v_prompt_action.py |  2 +-
 mm_agents/gpt_4v_prompt_code.py   | 18 ++++++---
 requirements.txt                  |  2 +
 10 files changed, 82 insertions(+), 83 deletions(-)
 delete mode 100644 mm_agents/fuyu_test.py
 create mode 100644 mm_agents/gemini_agent_text.py
 create mode 100644 mm_agents/gpt_4v_agent_text.py

diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index e2ef08b..8a79d72 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -231,8 +231,11 @@ class DesktopEnv(gym.Env):
             # the set of all possible actions defined in the action representation
             self.controller.execute_action(action)
         elif self.action_space == "pyautogui":
-            # the set of all possible python commands insides `pyautogui`
-            self.controller.execute_python_command(action)
+            if action in ['WAIT', 'FAIL', 'DONE']:
+                self.controller.execute_action(action)
+            else:
+                # the set of all possible python commands inside `pyautogui`
+                self.controller.execute_python_command(action)
 
         observation = {
             "screenshot": self._get_obs(),
diff --git a/experiment.py b/experiment.py
index 48bb1dc..f6d1232 100644
--- a/experiment.py
+++ b/experiment.py
@@ -6,6 +6,7 @@ import sys
 
 from desktop_env.envs.desktop_env import DesktopEnv
 from mm_agents.gpt_4v_agent import GPT4v_Agent
+from mm_agents.gemini_agent import GeminiPro_Agent
 
 # Logger Configs {{{ #
 logger = logging.getLogger()
@@ -44,7 +45,7 @@ logger = logging.getLogger("desktopenv.experiment")
 PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
 
 
-def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_trajectory", recording=True):
+def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
     trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
     env = DesktopEnv(
         path_to_vm=PATH_TO_VM,
@@ -53,7 +54,6 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tra
     )
     # reset the environment to certain snapshot
    observation = env.reset()
-    observation['instruction'] = example['instruction']
 
     done = False
     step_num = 0
@@ -63,17 +63,14 @@
 
     while not done and step_num < max_steps:
         actions = agent.predict(observation)
+        step_num += 1
         for action in actions:
-            step_num += 1
 
             # Capture the timestamp before executing the action
             action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+            logger.info("Step %d: %s", step_num, action)
             observation, reward, done, info = env.step(action)
 
-            observation['instruction'] = example['instruction']
-
-            # Logging
-            logger.info("Step %d: %s", step_num, action)
             logger.info("Reward: %.2f", reward)
             logger.info("Done: %s", done)
             logger.info("Info: %s", info)
@@ -114,19 +111,22 @@
 
 if __name__ == "__main__":
     action_space = "pyautogui"
-    example_class = "vlc"
-    example_id = 
"8f080098-ddb1-424c-b438-4e96e5e4786e" + example_class = "thunderbird" + example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) - example["snapshot"] = "exp_setup" + example["snapshot"] = "exp_setup2" - api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, action_space=action_space) + # api_key = os.environ.get("OPENAI_API_KEY") + # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + + api_key = os.environ.get("GENAI_API_KEY") + agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) root_trajectory_dir = "exp_trajectory" example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) os.makedirs(example_trajectory_dir, exist_ok=True) - run_one_example(example, agent, 2, example_trajectory_dir) + run_one_example(example, agent, 10, example_trajectory_dir) diff --git a/mm_agents/fuyu_test.py b/mm_agents/fuyu_test.py deleted file mode 100644 index ea77186..0000000 --- a/mm_agents/fuyu_test.py +++ /dev/null @@ -1,20 +0,0 @@ -from transformers import FuyuProcessor, FuyuForCausalLM -from PIL import Image - -image = Image.open("stackoverflow.png").convert("RGB") - -# load model and processor -model_id = "adept/fuyu-8b" -processor = FuyuProcessor.from_pretrained(model_id) -model = FuyuForCausalLM.from_pretrained(model_id, device_map="cuda:0") - -# prepare inputs for the model -text_prompt = "Description:\n" - -inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0") - -# autoregressively generate text -generation_output = model.generate(**inputs, max_new_tokens=100) -generation_text = processor.batch_decode(generation_output[:, -100:], skip_special_tokens=True) - -print(generation_text) diff --git a/mm_agents/gemini_agent.py b/mm_agents/gemini_agent.py index 37e22f2..1593c9a 100644 --- a/mm_agents/gemini_agent.py +++ b/mm_agents/gemini_agent.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, List import PIL.Image import google.generativeai as genai @@ -9,10 +9,13 @@ from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE class GeminiPro_Agent: - def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"): - genai.configure(api_key) + def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0, + action_space="computer_13"): + genai.configure(api_key=api_key) + self.instruction = instruction self.model = genai.GenerativeModel(model) self.max_tokens = max_tokens + self.temperature = temperature self.action_space = action_space self.trajectory = [ @@ -22,22 +25,39 @@ class GeminiPro_Agent: { "computer_13": SYS_PROMPT_ACTION, "pyautogui": SYS_PROMPT_CODE - }[action_space] + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) ] } ] - def predict(self, obs: Dict): + def predict(self, obs: Dict) -> List: """ Predict the next action(s) based on the current observation. + Only support single-round conversation, only fill-in the last desktop screenshot. 
""" img = PIL.Image.open(obs["screenshot"]) self.trajectory.append({ "role": "user", - "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format( - obs["instruction"]), img] + "parts": ["What's the next step that you will do to help with the task?", img] }) + # todo: Remove this step once the Gemini supports multi-round conversation + all_message_str = "" + for i in range(len(self.trajectory)): + if i == 0: + all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n" + elif i % 2 == 1: + all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n" + else: + all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n" + + all_message_str += all_message_template.format(self.trajectory[i]["parts"][0]) + + message_for_gemini = { + "role": "user", + "parts": [all_message_str, img] + } + traj_to_show = [] for i in range(len(self.trajectory)): traj_to_show.append(self.trajectory[i]["parts"][0]) @@ -46,29 +66,28 @@ class GeminiPro_Agent: print("Trajectory:", traj_to_show) - response = self.model.generate_content(self.trajectory, max_tokens=self.max_tokens) + response = self.model.generate_content( + message_for_gemini, + generation_config={ + "max_output_tokens": self.max_tokens, + "temperature": self.temperature + } + ) try: - # fixme: change to fit the new response format from gemini pro - actions = self.parse_actions(response.json()['choices'][0]['message']['content']) + response_text = response.text except: - # todo: add error handling - print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) - actions = None + return [] + + try: + actions = self.parse_actions(response_text) + except: + print("Failed to parse action from response:", response_text) + actions = [] return actions def parse_actions(self, response: str): - # response example - """ - ```json - { - "action_type": "CLICK", - "click_type": "RIGHT" - } - ``` - """ - # parse from the response if self.action_space == "computer_13": actions = parse_actions_from_string(response) diff --git a/mm_agents/gemini_agent_text.py b/mm_agents/gemini_agent_text.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index 203b40c..81d128e 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -1,7 +1,7 @@ import base64 import json import re -from typing import Dict +from typing import Dict, List import requests @@ -63,7 +63,8 @@ def parse_code_from_string(input_string): class GPT4v_Agent: - def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): + def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): + self.instruction = instruction self.model = model self.max_tokens = max_tokens self.action_space = action_space @@ -82,13 +83,13 @@ class GPT4v_Agent: "text": { "computer_13": SYS_PROMPT_ACTION, "pyautogui": SYS_PROMPT_CODE - }[action_space] + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) }, ] } ] - def predict(self, obs: Dict): + def predict(self, obs: Dict) -> List: """ Predict the next action(s) based on the current observation. """ @@ -98,8 +99,7 @@ class GPT4v_Agent: "content": [ { "type": "text", - "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format( - obs["instruction"]) + "text": "What's the next step that you will do to help with the task?" 
}, { "type": "image_url", @@ -128,23 +128,12 @@ class GPT4v_Agent: try: actions = self.parse_actions(response.json()['choices'][0]['message']['content']) except: - # todo: add error handling - print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) + print("Failed to parse action from response:", response.json()) actions = None return actions def parse_actions(self, response: str): - # response example - """ - ```json - { - "action_type": "CLICK", - "click_type": "RIGHT" - } - ``` - """ - # parse from the response if self.action_space == "computer_13": actions = parse_actions_from_string(response) diff --git a/mm_agents/gpt_4v_agent_text.py b/mm_agents/gpt_4v_agent_text.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/gpt_4v_prompt_action.py b/mm_agents/gpt_4v_prompt_action.py index 650b136..4323df6 100644 --- a/mm_agents/gpt_4v_prompt_action.py +++ b/mm_agents/gpt_4v_prompt_action.py @@ -237,7 +237,7 @@ for example, format as: ``` REMEMBER: -For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. You MUST wrap the dict with backticks (\`). You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. You CAN predict multiple actions at one step, but you should only return one action for each step. diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py index 17e8c9d..aa768e9 100644 --- a/mm_agents/gpt_4v_prompt_code.py +++ b/mm_agents/gpt_4v_prompt_code.py @@ -1,11 +1,17 @@ SYS_PROMPT = """ -You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. -For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. - +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image. You are required to use `pyautogui` to perform the action. Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```; +When you think the task is done, return ```DONE```. -When you think you have to wait for some time, return `WAIT`. -When you think the task can not be done, return `FAIL`. -When you think the task is done, return `DONE`. +First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
""" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a13f733..f2c4cc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,5 @@ ImageHash scikit-image librosa pymupdf +chardet +playwright