From 8c0525c20e24e710fe4c2ac0cc7e9fbe0ddf2b65 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Mon, 27 Nov 2023 00:29:09 +0800
Subject: [PATCH] Adapt for Windows os; Refine README
---
README.md | 12 +++----
SERVER_SETUP.md | 23 ++++++++++++
SSH_SERVER_SETUP.md | 50 --------------------------
desktop_env/controllers/xdotool.py | 6 ++--
desktop_env/envs/desktop_env.py | 56 ++++++++++++++++--------------
requirements.txt | 3 ++
6 files changed, 66 insertions(+), 84 deletions(-)
create mode 100644 SERVER_SETUP.md
delete mode 100644 SSH_SERVER_SETUP.md
diff --git a/README.md b/README.md
index cf9e69d..bc12ca1 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
3. Set up bridge for connecting to VM
1. Option 1: Install [xdotool](https://github.com/jordansissel/xdotool) on VM
2. Option 2: Install [mouse](https://github.com/boppreh/mouse/)
-4. Set up SSH server on VM | [Guide](./SSH_SERVER_SETUP.md)
+4. Set up SSH server on VM | [Guide](./SERVER_SETUP.md)
5. Install screenshot tool (in vm)
1. `sudo apt install imagemagick-6.q16hdri`
2. `DISPLAY=:0 import -window root screenshot.png`
@@ -22,12 +22,7 @@
2. `rm -rf ~/screenshot.png`
7. Set up python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/jordansissel/xdotool)
-## Windows setup guide
-1. Copy and paste the file `windows_server/main.py` to the windows vm
-2. Make sure `mouse` and `keyboard` are installed
-3. Run the file `pythonw main.py`
-4. `ipconfig /all` and find the ip address
## Road map (Proposed)
@@ -36,6 +31,11 @@
- MacOS is closed source and cannot be legally installed
- Windows is available legally and can be installed
- [x] Build gym-like python interface for controlling the VM
+- [ ] Make configuration much easier from code perspective
+ - [ ] README
+ - [ ] Make it easier to install the dependencies
+ - [ ] Make it easier to install the VM
+ - [ ] Make it easier to set up the VM
- [ ] Recording of actions (mouse movement, click, keyboard) for human to annotate, and we can replay it
- [ ] This part may be conflict with work from [Aran Komatsuzaki](https://twitter.com/arankomatsuzaki) team, a.k.a. [Duck AI](https://duckai.org/)
- [ ] Build a simple task, e.g. open a browser, open a website, click on a button, and close the browser
diff --git a/SERVER_SETUP.md b/SERVER_SETUP.md
new file mode 100644
index 0000000..f67c01e
--- /dev/null
+++ b/SERVER_SETUP.md
@@ -0,0 +1,23 @@
+# Server Setup Guide
+
+- [Linux](#linux)
+- [Windows](#windows)
+
+## Linux
+
+
+
+1. `sudo apt install openssh-server`
+2. `sudo systemctl enable ssh --now`
+3. `sudo ufw disable` (disable firewall - safe for local network, otherwise `sudo ufw allow ssh`)
+4. `ip a` - find ip address
+5. `ssh username@<ip_address>`
+6. On host, run `ssh-copy-id <username>@<ip_address>`
+
+
+## Windows
+
+1. Copy and paste the file `windows_server/main.py` to the windows vm
+2. Make sure `mouse` and `keyboard` are installed
+3. Run the file `python main.py`
+4. `ipconfig /all` and find the ip address
\ No newline at end of file
diff --git a/SSH_SERVER_SETUP.md b/SSH_SERVER_SETUP.md
deleted file mode 100644
index 83c5507..0000000
--- a/SSH_SERVER_SETUP.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# SSH Server Setup Guide
-
-- [Linux](#linux)
-- [Windows](#windows)
-
-## Linux
-
-
-
-1. `sudo apt install openssh-server`
-2. `sudo systemctl enable ssh --now`
-3. `sudo ufw disable` (disable firewall - safe for local network, otherwise `sudo ufw allow ssh`)
-4. `ip a` - find ip address
-5. ssh username@
-6. On host, run `ssh-copy-id @`
-
-## Windows
-
-To SSH into a Windows machine and control it using the terminal, you need to set up an SSH server on the Windows machine and then connect to it from an SSH client. Microsoft has integrated an OpenSSH server and client in Windows 10 and later, which makes this process more straightforward. Here's how you can do it:
-
-> Make sure that your windows account has a password
-
-### Setting Up SSH Server on Windows
-
-1. **Enable OpenSSH Server:**
-
- - Open **Settings** → **Apps** → **Optional Features**.
- - Scan the list to see if OpenSSH Server is installed. If it's not, click on **Add a feature**, then find **OpenSSH Server**, and click **Install**.
-
-2. **Start the SSH Service:**
-
- - Open **Services** from the Start menu.
- - Find the **OpenSSH SSH Server** service, right-click it, and select **Properties**.
- - Set the startup type to **Automatic** and then start the service.
-
-3. **Configure the Firewall (if necessary):**
-
- - In most cases, Windows Firewall will automatically allow SSH connections. However, if you have a third-party firewall or if connections are being blocked, you may need to manually open port 22 (default SSH port).
-
-4. **Add ssh key to windows**
-
- - Add the public ssh key to `C:\ProgramData\ssh\administrators_authorized_keys` and `~/.ssh/authorized_keys`
-
-### Connecting to the Windows Machine from SSH Client
-
-1. **From a Linux/Mac Client:**
-
- - Open the terminal.
- - Use the command `ssh username@windows-ip-address`. Replace `username` with the Windows account username and `windows-ip-address` with the IP address of the Windows machine.
- - Accept the fingerprint (if it's the first time connecting) and enter the password when prompted.
diff --git a/desktop_env/controllers/xdotool.py b/desktop_env/controllers/xdotool.py
index 620bee6..abb268f 100644
--- a/desktop_env/controllers/xdotool.py
+++ b/desktop_env/controllers/xdotool.py
@@ -1,9 +1,11 @@
from fabric import Connection
+from typing import List
+
class XDoToolController:
def __init__(self, ssh_connection: Connection):
self.ssh_connection = ssh_connection
- def _execute_xdotool_command(self, command: list[str]) -> None:
+ def _execute_xdotool_command(self, command: List[str]) -> None:
result = self.ssh_connection.run(f"DISPLAY=:0 xdotool {command}", hide=True)
- return result.stdout.strip()
\ No newline at end of file
+ return result.stdout.strip()
diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index 98fa586..962d176 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -1,5 +1,5 @@
from enum import Enum
-from typing import Literal
+from typing import Literal, List, Tuple
import subprocess
from fabric import Connection
import time
@@ -22,19 +22,21 @@ class Action(Enum):
KEY_UP = 6
TYPE = 7
+
VM_TYPE = Literal['ubuntu', 'windows']
+
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
- def __init__(self, path_to_vm: str, username: str, password: str,
- host: str, snapshot_path: str = "snapshot", vm_os: VM_TYPE = "ubuntu"):
+ def __init__(self, path_to_vm: str, username: str, password: str,
+ host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
self.path_to_vm = path_to_vm
self.username = username
self.password = password
self.host = host
- self.snapshot_path = snapshot_path
-
+ self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
+
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
@@ -51,13 +53,15 @@ class DesktopEnv(gym.Env):
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
+
+ # Initialize emulator
+ print("Initializing...")
self._start_emulator()
- self._wait_for_emulator_load()
# set up controllers
self.mouse_controller, self.keyboard_controller = self._create_controllers(vm_os)
- def _create_controllers(self, vm_os: VM_TYPE) -> tuple[AbstractMouseController, AbstractKeyboardController]:
+ def _create_controllers(self, vm_os: VM_TYPE) -> Tuple[AbstractMouseController, AbstractKeyboardController]:
if vm_os == "ubuntu":
ssh_connection = Connection(host=self.host, user=self.username, connect_kwargs={"password": self.password})
mouse_controller = XDoToolMouseController(ssh_connection)
@@ -67,33 +71,29 @@ class DesktopEnv(gym.Env):
keyboard_controller = PythonKeyboardController(http_server=self.host)
else:
raise NotImplementedError(vm_os)
-
+
return mouse_controller, keyboard_controller
def _start_emulator(self):
- self._execute_command(["vmrun", "start", self.path_to_vm])
-
- def _wait_for_emulator_load(self):
while True:
try:
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
output = output.decode()
if self.path_to_vm.lstrip("~/") in output:
print("VM is running.")
- return
+ break
else:
- print("Waiting for VM to start...")
+ print("Starting VM...")
+ self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
time.sleep(5)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
- return
- def _execute_command(self, command: list[str]) -> None:
- process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ def _execute_command(self, command: List[str]) -> None:
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
print(f"Error executing command: {command}")
- print(stderr.decode())
return None
else:
return stdout.decode()
@@ -103,23 +103,27 @@ class DesktopEnv(gym.Env):
def _get_screenshot(self):
image_path = "./screenshot.png"
- self._execute_command(["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm, image_path])
+ self._execute_command(
+ ["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
+ image_path])
return image_path
-
+
def _get_obs(self):
screenshot_image_path = self._get_screenshot()
with Image.open(screenshot_image_path) as img:
return np.array(img)
def reset(self):
- input()
- self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
- input()
- self._start_emulator()
- input()
- self._wait_for_emulator_load()
- observation = self._get_obs()
+ print("Resetting environment...")
+ print("Reverting to snapshot to {}...".format(self.snapshot_path))
+ self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
+
+ print("Starting emulator...")
+ self._start_emulator()
+ print("Emulator started.")
+
+ observation = self._get_obs()
return observation
def step(self, action):
diff --git a/requirements.txt b/requirements.txt
index e390586..cb56195 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,6 @@ Pillow
fabric
gymnasium
requests
+transformers
+torch
+accelerate