Adapt for Windows os; Refine README

This commit is contained in:
Timothyxxx
2023-11-27 00:29:09 +08:00
parent 6dee58252e
commit 8c0525c20e
6 changed files with 66 additions and 84 deletions

View File

@@ -13,7 +13,7 @@
3. Set up bridge for connecting to VM
1. Option 1: Install [xdotool](https://github.com/jordansissel/xdotool) on VM
2. Option 2: Install [mouse](https://github.com/boppreh/mouse/)
4. Set up SSH server on VM | [Guide](./SSH_SERVER_SETUP.md)
4. Set up SSH server on VM | [Guide](./SERVER_SETUP.md)
5. Install screenshot tool (in vm)
1. `sudo apt install imagemagick-6.q16hdri`
2. `DISPLAY=:0 import -window root screenshot.png`
@@ -22,12 +22,7 @@
2. `rm -rf ~/screenshot.png`
7. Set up Python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/boppreh/keyboard)
## Windows setup guide
1. Copy and paste the file `windows_server/main.py` to the windows vm
2. Make sure `mouse` and `keyboard` are installed
3. Run the file `pythonw main.py`
4. `ipconfig /all` and find the ip address
## Road map (Proposed)
@@ -36,6 +31,11 @@
- MacOS is closed source and cannot be legally installed
- Windows is available legally and can be installed
- [x] Build gym-like python interface for controlling the VM
- [ ] Make configuration much easier from code perspective
- [ ] README
- [ ] Make it easier to install the dependencies
- [ ] Make it easier to install the VM
- [ ] Make it easier to set up the VM
- [ ] Recording of actions (mouse movement, click, keyboard) for human to annotate, and we can replay it
- [ ] This part may conflict with work from [Aran Komatsuzaki](https://twitter.com/arankomatsuzaki)'s team, a.k.a. [Duck AI](https://duckai.org/)
- [ ] Build a simple task, e.g. open a browser, open a website, click on a button, and close the browser

23
SERVER_SETUP.md Normal file
View File

@@ -0,0 +1,23 @@
# Server Setup Guide
- [Linux](#linux)
- [Windows](#windows)
## Linux
<https://averagelinuxuser.com/ssh-into-virtualbox/>
1. `sudo apt install openssh-server`
2. `sudo systemctl enable ssh --now`
3. `sudo ufw disable` (disable firewall - safe for local network, otherwise `sudo ufw allow ssh`)
4. `ip a` - find ip address
5. `ssh <username>@<ip_address>`
6. On host, run `ssh-copy-id <username>@<ip_address>`
## Windows
1. Copy and paste the file `windows_server/main.py` to the windows vm
2. Make sure `mouse` and `keyboard` are installed
3. Run the file `python main.py`
4. `ipconfig /all` and find the ip address

View File

@@ -1,50 +0,0 @@
# SSH Server Setup Guide
- [Linux](#linux)
- [Windows](#windows)
## Linux
<https://averagelinuxuser.com/ssh-into-virtualbox/>
1. `sudo apt install openssh-server`
2. `sudo systemctl enable ssh --now`
3. `sudo ufw disable` (disable firewall - safe for local network, otherwise `sudo ufw allow ssh`)
4. `ip a` - find ip address
5. ssh username@<ip_address>
6. On host, run `ssh-copy-id <username>@<ip_address>`
## Windows
To SSH into a Windows machine and control it using the terminal, you need to set up an SSH server on the Windows machine and then connect to it from an SSH client. Microsoft has integrated an OpenSSH server and client in Windows 10 and later, which makes this process more straightforward. Here's how you can do it:
> Make sure that your windows account has a password
### Setting Up SSH Server on Windows
1. **Enable OpenSSH Server:**
- Open **Settings** → **Apps** → **Optional Features**.
- Scan the list to see if OpenSSH Server is installed. If it's not, click on **Add a feature**, then find **OpenSSH Server**, and click **Install**.
2. **Start the SSH Service:**
- Open **Services** from the Start menu.
- Find the **OpenSSH SSH Server** service, right-click it, and select **Properties**.
- Set the startup type to **Automatic** and then start the service.
3. **Configure the Firewall (if necessary):**
- In most cases, Windows Firewall will automatically allow SSH connections. However, if you have a third-party firewall or if connections are being blocked, you may need to manually open port 22 (default SSH port).
4. **Add ssh key to windows**
- Add the public ssh key to `C:\ProgramData\ssh\administrators_authorized_keys` and `~/.ssh/authorized_keys`
### Connecting to the Windows Machine from SSH Client
1. **From a Linux/Mac Client:**
- Open the terminal.
- Use the command `ssh username@windows-ip-address`. Replace `username` with the Windows account username and `windows-ip-address` with the IP address of the Windows machine.
- Accept the fingerprint (if it's the first time connecting) and enter the password when prompted.

View File

@@ -1,9 +1,11 @@
from fabric import Connection
from typing import List
class XDoToolController:
    """Base helper that runs ``xdotool`` commands through an SSH connection."""

    def __init__(self, ssh_connection: "Connection"):
        """Remember the connection used to run commands.

        Args:
            ssh_connection: Object whose ``run(command, hide=True)`` returns a
                result exposing ``stdout``.
        """
        self.ssh_connection = ssh_connection

    def _execute_xdotool_command(self, command: List[str]) -> str:
        """Run ``xdotool`` with the given arguments on display ``:0``.

        Args:
            command: The xdotool arguments as a list of tokens. A string that
                is already a complete argument line is also accepted and is
                passed through unchanged.

        Returns:
            The command's standard output with surrounding whitespace removed.
        """
        if isinstance(command, str):
            arguments = command
        else:
            # Local import keeps this block self-contained.
            import shlex

            # Interpolating the list itself would embed its Python repr
            # ("['mousemove', '10']") into the shell line; join the tokens
            # instead, quoting each so embedded spaces survive the shell.
            arguments = " ".join(shlex.quote(str(token)) for token in command)

        result = self.ssh_connection.run(f"DISPLAY=:0 xdotool {arguments}", hide=True)
        return result.stdout.strip()

View File

@@ -1,5 +1,5 @@
from enum import Enum
from typing import Literal
from typing import Literal, List, Tuple
import subprocess
from fabric import Connection
import time
@@ -22,19 +22,21 @@ class Action(Enum):
KEY_UP = 6
TYPE = 7
VM_TYPE = Literal['ubuntu', 'windows']
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
def __init__(self, path_to_vm: str, username: str, password: str,
host: str, snapshot_path: str = "snapshot", vm_os: VM_TYPE = "ubuntu"):
def __init__(self, path_to_vm: str, username: str, password: str,
host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
self.path_to_vm = path_to_vm
self.username = username
self.password = password
self.host = host
self.snapshot_path = snapshot_path
self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
@@ -51,13 +53,15 @@ class DesktopEnv(gym.Env):
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
# Initialize emulator
print("Initializing...")
self._start_emulator()
self._wait_for_emulator_load()
# set up controllers
self.mouse_controller, self.keyboard_controller = self._create_controllers(vm_os)
def _create_controllers(self, vm_os: VM_TYPE) -> tuple[AbstractMouseController, AbstractKeyboardController]:
def _create_controllers(self, vm_os: VM_TYPE) -> Tuple[AbstractMouseController, AbstractKeyboardController]:
if vm_os == "ubuntu":
ssh_connection = Connection(host=self.host, user=self.username, connect_kwargs={"password": self.password})
mouse_controller = XDoToolMouseController(ssh_connection)
@@ -67,33 +71,29 @@ class DesktopEnv(gym.Env):
keyboard_controller = PythonKeyboardController(http_server=self.host)
else:
raise NotImplementedError(vm_os)
return mouse_controller, keyboard_controller
def _start_emulator(self):
self._execute_command(["vmrun", "start", self.path_to_vm])
def _wait_for_emulator_load(self):
while True:
try:
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
output = output.decode()
if self.path_to_vm.lstrip("~/") in output:
print("VM is running.")
return
break
else:
print("Waiting for VM to start...")
print("Starting VM...")
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
time.sleep(5)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
return
def _execute_command(self, command: list[str]) -> None:
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
def _execute_command(self, command: List[str]) -> None:
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
print(f"Error executing command: {command}")
print(stderr.decode())
return None
else:
return stdout.decode()
@@ -103,23 +103,27 @@ class DesktopEnv(gym.Env):
def _get_screenshot(self):
image_path = "./screenshot.png"
self._execute_command(["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm, image_path])
self._execute_command(
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
image_path])
return image_path
def _get_obs(self):
    """Take a screenshot and return it as a NumPy array.

    Returns:
        ``np.array`` built from the image opened at the path returned by
        ``_get_screenshot``.
    """
    path = self._get_screenshot()
    with Image.open(path) as frame:
        pixels = np.array(frame)
    return pixels
def reset(self):
    """Revert the VM to the configured snapshot, restart it, and observe.

    Progress is reported with ``print``; the method never waits for keyboard
    input, so it can run unattended.

    Returns:
        The observation produced by ``_get_obs`` after the emulator start
        call returns.
    """
    snapshot = self.snapshot_path

    print("Resetting environment...")
    print("Reverting to snapshot to {}...".format(snapshot))
    revert_cmd = ["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, snapshot]
    self._execute_command(revert_cmd)

    print("Starting emulator...")
    self._start_emulator()
    print("Emulator started.")

    return self._get_obs()
def step(self, action):

View File

@@ -3,3 +3,6 @@ Pillow
fabric
gymnasium
requests
transformers
torch
accelerate