Merge branch 'main' into zdy

This commit is contained in:
David Chang
2024-01-15 12:12:05 +08:00
46 changed files with 1585 additions and 457 deletions

View File

@@ -20,7 +20,7 @@ todo
- [x] Set up a pipeline and build agents implementation (zero-shot) for the task - [x] Set up a pipeline and build agents implementation (zero-shot) for the task
- [x] Start to design on which tasks inside the DesktopENv to focus on, start to wrap up the environment to be public - [x] Start to design on which tasks inside the DesktopENv to focus on, start to wrap up the environment to be public
- [x] Start to annotate the examples for ~~training~~ and testing - [x] Start to annotate the examples for ~~training~~ and testing
- [ ] Error handling during file passing and file opening, etc. - [x] Error handling during file passing and file opening, etc.
- [ ] Add accessibility tree from the OS into the observation space - [x] Add accessibility tree from the OS into the observation space
- [ ] Add pre-process and post-process action support for benchmarking setup and evaluation - [ ] Add pre-process and post-process action support for benchmarking setup and evaluation
- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient - [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient

View File

@@ -197,8 +197,10 @@ class PythonController:
if "text" not in parameters: if "text" not in parameters:
raise Exception(f"Unknown parameters: {parameters}") raise Exception(f"Unknown parameters: {parameters}")
# deal with special ' and \ characters # deal with special ' and \ characters
text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'") # text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'")
self.execute_python_command(f"pyautogui.typewrite('{text}')") # self.execute_python_command(f"pyautogui.typewrite('{text}')")
text = parameters["text"]
self.execute_python_command("pyautogui.typewrite({:})".format(repr(text)))
elif action_type == "PRESS": elif action_type == "PRESS":
if "key" not in parameters: if "key" not in parameters:
@@ -237,6 +239,9 @@ class PythonController:
keys_para_rep = "', '".join(keys) keys_para_rep = "', '".join(keys)
self.execute_python_command(f"pyautogui.hotkey('{keys_para_rep}')") self.execute_python_command(f"pyautogui.hotkey('{keys_para_rep}')")
elif action_type in ['WAIT', 'FAIL', 'DONE']:
pass
else: else:
raise Exception(f"Unknown action type: {action_type}") raise Exception(f"Unknown action type: {action_type}")
@@ -280,3 +285,31 @@ class PythonController:
else: else:
logger.error("Failed to get wallpaper. Status code: %d", response.status_code) logger.error("Failed to get wallpaper. Status code: %d", response.status_code)
return None return None
def get_vm_desktop_path(self):
    """
    Ask the VM-side HTTP server for the path of the desktop folder.

    Sends a POST to ``<http_server>/desktop_path`` and returns the
    ``"desktop_path"`` field of the JSON reply, or ``None`` (after logging
    an error) when the server does not answer with status 200.
    """
    reply = requests.post(self.http_server + "/desktop_path")

    # Guard clause: anything other than 200 is reported and mapped to None.
    if reply.status_code != 200:
        logger.error("Failed to get desktop path. Status code: %d", reply.status_code)
        return None

    logger.info("Desktop path downloaded successfully")
    return reply.json()["desktop_path"]
def get_vm_directory_tree(self, path):
    """
    Ask the VM-side HTTP server for the directory tree rooted at ``path``.

    Sends ``{"path": path}`` as JSON to ``<http_server>/list_directory`` and
    returns the ``"directory_tree"`` field of the JSON reply, or ``None``
    (after logging an error) when the server does not answer with status 200.
    """
    body = json.dumps({"path": path})
    reply = requests.post(
        self.http_server + "/list_directory",
        headers={'Content-Type': 'application/json'},
        data=body,
    )

    # Guard clause: anything other than 200 is reported and mapped to None.
    if reply.status_code != 200:
        logger.error("Failed to get directory tree. Status code: %d", reply.status_code)
        return None

    logger.info("Directory tree downloaded successfully")
    return reply.json()["directory_tree"]

View File

@@ -1,18 +1,18 @@
import json import json
import time import logging
import os.path import os.path
import time
import traceback import traceback
import uuid import uuid
from typing import Dict, List
from typing import Any, Union, Optional from typing import Any, Union, Optional
from typing import Dict, List
import requests import requests
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from requests_toolbelt.multipart.encoder import MultipartEncoder from requests_toolbelt.multipart.encoder import MultipartEncoder
from desktop_env.evaluators.metrics.utils import compare_urls from desktop_env.evaluators.metrics.utils import compare_urls
import logging
logger = logging.getLogger("desktopenv.setup") logger = logging.getLogger("desktopenv.setup")
@@ -20,6 +20,7 @@ class SetupController:
def __init__(self, vm_ip: str, cache_dir: str): def __init__(self, vm_ip: str, cache_dir: str):
self.vm_ip: str = vm_ip self.vm_ip: str = vm_ip
self.http_server: str = f"http://{vm_ip}:5000" self.http_server: str = f"http://{vm_ip}:5000"
self.http_server_setup_root: str = f"http://{vm_ip}:5000/setup"
self.cache_dir: str = cache_dir self.cache_dir: str = cache_dir
def reset_cache_dir(self, cache_dir: str): def reset_cache_dir(self, cache_dir: str):
@@ -57,31 +58,31 @@ class SetupController:
# can add other setup steps # can add other setup steps
# ZDY_COMMENT: merged with launch # ZDY_COMMENT: merged with launch
#def _command_setup(self, command: str): # def _command_setup(self, command: str):
#""" # """
#Directly send a command into the virtual machine os for setting up. # Directly send a command into the virtual machine os for setting up.
#""" # """
#payload = json.dumps({"command": command}) # payload = json.dumps({"command": command})
#headers = { # headers = {
#'Content-Type': 'application/json' # 'Content-Type': 'application/json'
#} # }
#timeout = 5 # timeout = 5
#timout_whitelist = ["vlc"] # timout_whitelist = ["vlc"]
# #
#try: # try:
# #
#response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout) # response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout)
#if response.status_code == 200: # if response.status_code == 200:
#print("Command executed successfully:", response.text) # print("Command executed successfully:", response.text)
#else: # else:
#print("Failed to execute command. Status code:", response.status_code) # print("Failed to execute command. Status code:", response.status_code)
#except requests.exceptions.Timeout as e: # except requests.exceptions.Timeout as e:
#if command in timout_whitelist: # if command in timout_whitelist:
#print("Command executed successfully:", command) # print("Command executed successfully:", command)
#else: # else:
#print("An error occurred while trying to execute the command:", e) # print("An error occurred while trying to execute the command:", e)
#except requests.exceptions.RequestException as e: # except requests.exceptions.RequestException as e:
#print("An error occurred while trying to execute the command:", e) # print("An error occurred while trying to execute the command:", e)
def _download_setup(self, files: List[Dict[str, str]]): def _download_setup(self, files: List[Dict[str, str]]):
""" """
@@ -224,9 +225,14 @@ class SetupController:
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
logger.error("An error occurred while trying to send the request: %s", e) logger.error("An error occurred while trying to send the request: %s", e)
def _execute_setup( self, command: List[str] def _execute_setup(
, stdout: str = "", stderr: str = "" self,
, shell: bool = False, until: Optional[Dict[str, Any]] = None): command: List[str],
stdout: str = "",
stderr: str = "",
shell: bool = False,
until: Optional[Dict[str, Any]] = None
):
if not command: if not command:
raise Exception("Empty comman to launch.") raise Exception("Empty comman to launch.")
@@ -248,10 +254,10 @@ class SetupController:
if stderr: if stderr:
with open(os.path.join(self.cache_dir, stderr), "w") as f: with open(os.path.join(self.cache_dir, stderr), "w") as f:
f.write(results["error"]) f.write(results["error"])
logger.info( "Command executed successfully: %s -> %s" logger.info("Command executed successfully: %s -> %s"
, " ".join(command) , " ".join(command)
, response.text , response.text
) )
else: else:
logger.error("Failed to launch application. Status code: %s", response.text) logger.error("Failed to launch application. Status code: %s", response.text)
results = None results = None
@@ -263,13 +269,13 @@ class SetupController:
results = None results = None
nb_failings += 1 nb_failings += 1
if len(until)==0: if len(until) == 0:
terminates = True terminates = True
elif results is not None: elif results is not None:
terminates = "returncode" in until and results["returncode"]==until["returncode"]\ terminates = "returncode" in until and results["returncode"] == until["returncode"] \
or "stdout" in until and until["stdout"] in results["output"]\ or "stdout" in until and until["stdout"] in results["output"] \
or "stderr" in until and until["stderr"] in results["error"] or "stderr" in until and until["stderr"] in results["error"]
terminates = terminates or nb_failings>=5 terminates = terminates or nb_failings >= 5
if not terminates: if not terminates:
time.sleep(0.3) time.sleep(0.3)
@@ -292,6 +298,25 @@ class SetupController:
# TODO # TODO
raise NotImplementedError() raise NotImplementedError()
def _activate_window_setup(self, window_name: str):
if not window_name:
raise Exception(f"Setup Open - Invalid path ({window_name}).")
payload = json.dumps({"window_name": window_name})
headers = {
'Content-Type': 'application/json'
}
# send request to server to open file
try:
response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload)
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
logger.error(f"Failed to activate window {window_name}. Status code: %s", response.text)
except requests.exceptions.RequestException as e:
logger.error("An error occurred while trying to send the request: %s", e)
# Chrome setup # Chrome setup
def _chrome_open_tabs_setup(self, urls_to_open: List[str]): def _chrome_open_tabs_setup(self, urls_to_open: List[str]):
host = self.vm_ip host = self.vm_ip

View File

@@ -186,5 +186,18 @@ ACTION_SPACE = [
"optional": False, "optional": False,
} }
} }
},
############################################################################################################
{
"action_type": "WAIT",
"note": "wait until the next action",
},
{
"action_type": "FAIL",
"note": "decide the task can not be performed",
},
{
"action_type": "DONE",
"note": "decide the task is done",
} }
] ]

View File

@@ -1,28 +1,30 @@
from __future__ import annotations from __future__ import annotations
import logging
import os import os
import subprocess import subprocess
import tempfile
import time import time
from typing import Callable, Any, Optional
# import uuid # import uuid
# import platform # import platform
from typing import List, Dict from typing import List, Dict
from typing import Callable, Any, Optional
import tempfile
import gymnasium as gym import gymnasium as gym
# import requests
from desktop_env.controllers.python import PythonController from desktop_env.controllers.python import PythonController
from desktop_env.controllers.setup import SetupController from desktop_env.controllers.setup import SetupController
# from desktop_env.evaluators import eval_funcs # from desktop_env.evaluators import eval_funcs
from desktop_env.evaluators import metrics, getters from desktop_env.evaluators import metrics, getters
import logging # import requests
logger = logging.getLogger("desktopenv.env") logger = logging.getLogger("desktopenv.env")
Metric = Callable[[Any, Any], float] Metric = Callable[[Any, Any], float]
Getter = Callable[[gym.Env, Dict[str, Any]], Any] Getter = Callable[[gym.Env, Dict[str, Any]], Any]
def _execute_command(command: List[str]) -> None: def _execute_command(command: List[str]) -> None:
if command[:4] == ["vmrun", "-T", "ws", "start"]: if command[:4] == ["vmrun", "-T", "ws", "start"]:
p = subprocess.Popen(command) p = subprocess.Popen(command)
@@ -84,8 +86,8 @@ class DesktopEnv(gym.Env):
self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir) self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir)
# Meta info of the VM, move to the reset() function # Meta info of the VM, move to the reset() function
self.vm_platform: str = "" # self.controller.get_vm_platform() self.vm_platform: str = "" # self.controller.get_vm_platform()
self.vm_screen_size = None # self.controller.get_vm_screen_size() self.vm_screen_size = None # self.controller.get_vm_screen_size()
# mode: human or machine # mode: human or machine
assert action_space in ["computer_13", "pyautogui"] assert action_space in ["computer_13", "pyautogui"]
@@ -164,7 +166,7 @@ class DesktopEnv(gym.Env):
self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None
self.metric_options: Dict[str, Any] = self.evaluator.get("options", {}) self.metric_options: Dict[str, Any] = self.evaluator.get("options", {})
def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None): def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
logger.info("Resetting environment...") logger.info("Resetting environment...")
logger.info("Switching task...") logger.info("Switching task...")
@@ -202,11 +204,27 @@ class DesktopEnv(gym.Env):
time.sleep(5) time.sleep(5)
logger.info("Environment setup complete.") logger.info("Environment setup complete.")
observation = self._get_obs() observation = {"screenshot": self._get_obs()}
return observation return observation
def step(self, action, pause=0.5): def step(self, action, pause=0.5):
self._step_no += 1 self._step_no += 1
self.action_history.append(action)
reward = 0 # todo: Define reward calculation for each example
done = False # todo: Define episode termination condition for each example
info = {}
# handle the special actions
if action in ['WAIT', 'FAIL', 'DONE']:
if action == 'WAIT':
time.sleep(pause)
elif action == 'FAIL':
done = True
info = {"fail": True}
elif action == 'DONE':
done = True
info = {"done": True}
# fixme: add reminding logic here, decide if the action is valid for the current action_space # fixme: add reminding logic here, decide if the action is valid for the current action_space
if self.action_space == "computer_13": if self.action_space == "computer_13":
@@ -215,18 +233,14 @@ class DesktopEnv(gym.Env):
elif self.action_space == "pyautogui": elif self.action_space == "pyautogui":
# the set of all possible python commands insides `pyautogui` # the set of all possible python commands insides `pyautogui`
self.controller.execute_python_command(action) self.controller.execute_python_command(action)
self.action_history.append(action)
# todo: maybe for the better here we need to add a logic to wait until the rendering is done
time.sleep(pause)
observation = { observation = {
"screenshot": self._get_obs(), "screenshot": self._get_obs(),
"accessibility_tree": self.controller.get_accessibility_tree(),
"terminal": self.controller.get_terminal_output(), "terminal": self.controller.get_terminal_output(),
"instruction": self.instruction "instruction": self.instruction
} }
reward = 0 # todo: Define reward calculation for each example
done = False # todo: Define episode termination condition for each example
info = {}
return observation, reward, done, info return observation, reward, done, info
def evaluate(self): def evaluate(self):

View File

@@ -1,5 +1,9 @@
from .chrome import get_default_search_engine, get_cookie_data, get_bookmarks, get_open_tabs_info, get_pdf_from_url, \
get_shortcuts_on_desktop
from .file import get_cloud_file, get_vm_file, get_cache_file from .file import get_cloud_file, get_vm_file, get_cache_file
from .general import get_vm_command_line
from .info import get_vm_screen_size, get_vm_window_size, get_vm_wallpaper from .info import get_vm_screen_size, get_vm_window_size, get_vm_wallpaper
from .misc import get_rule, get_accessibility_tree from .misc import get_rule, get_accessibility_tree
from .replay import get_replay
from .vlc import get_vlc_playing_info, get_vlc_config from .vlc import get_vlc_playing_info, get_vlc_config
from .chrome import get_default_search_engine, get_bookmarks, get_open_tabs_info from .vscode import get_vscode_config

View File

@@ -46,6 +46,10 @@ def get_default_search_engine(env, config: Dict[str, str]):
def get_cookie_data(env, config: Dict[str, str]): def get_cookie_data(env, config: Dict[str, str]):
"""
Get the cookies from the Chrome browser.
Assume the cookies are stored in the default location, not encrypted and not large in size.
"""
os_type = env.vm_platform os_type = env.vm_platform
if os_type == 'Windows': if os_type == 'Windows':
chrome_cookie_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), chrome_cookie_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
@@ -61,21 +65,23 @@ def get_cookie_data(env, config: Dict[str, str]):
else: else:
raise Exception('Unsupported operating system') raise Exception('Unsupported operating system')
# todo: add a new controller function to connect the cookie database
#############
try: try:
conn = sqlite3.connect(chrome_cookie_file_path) content = env.controller.get_file(chrome_cookie_file_path)
_path = os.path.join(env.cache_dir, config["dest"])
with open(_path, "wb") as f:
f.write(content)
conn = sqlite3.connect(_path)
cursor = conn.cursor() cursor = conn.cursor()
# Query to check for OpenAI cookies # Query to check for OpenAI cookies
cursor.execute("SELECT * FROM cookies") cursor.execute("SELECT * FROM cookies")
cookies = cursor.fetchall() cookies = cursor.fetchall()
return cookies return cookies
except Exception as e: except Exception as e:
logger.error(f"Error: {e}") logger.error(f"Error: {e}")
return None return None
#############
def get_bookmarks(env, config: Dict[str, str]): def get_bookmarks(env, config: Dict[str, str]):
@@ -94,17 +100,12 @@ def get_bookmarks(env, config: Dict[str, str]):
else: else:
raise Exception('Unsupported operating system') raise Exception('Unsupported operating system')
try: content = env.controller.get_file(preference_file_path)
content = env.controller.get_file(preference_file_path) if not content:
# make content json variable return []
data = json.load(content) data = json.loads(content)
bookmarks = data.get('roots', {})
bookmarks = data.get('roots', {}) return bookmarks
return bookmarks
except Exception as e:
logger.error(f"Error: {e}")
return None
# todo: move this to the main.py # todo: move this to the main.py
@@ -190,3 +191,83 @@ def get_active_tab_info(env, config: Dict[str, str]):
browser.close() browser.close()
return active_tab_info return active_tab_info
def get_pdf_from_url(env, config: Dict[str, str]) -> str:
    """
    Render a web page to a PDF file through the VM's Chrome instance.

    Config:
        path (str): URL of the page to open
        dest (str): file name, relative to ``env.cache_dir``, to write the PDF to

    Returns the local path of the written PDF.
    """
    target_url = config["path"]
    output_path = os.path.join(env.cache_dir, config["dest"])

    # fixme: this port is hard-coded, need to be changed from config file
    debugging_port = 9222
    cdp_endpoint = f"http://{env.vm_ip}:{debugging_port}"

    with sync_playwright() as playwright:
        # Attach to the already-running Chrome over the DevTools protocol.
        chrome = playwright.chromium.connect_over_cdp(cdp_endpoint)
        tab = chrome.new_page()
        tab.goto(target_url)
        tab.pdf(path=output_path)
        chrome.close()

    return output_path
# fixme: needs to be changed (maybe through post-processing) since it's not working
def get_chrome_saved_address(env, config: Dict[str, str]):
    """
    Fetch the HTML of Chrome's saved-addresses settings page from the VM.

    Connects to the Chrome instance exposed on the VM's remote-debugging
    port, opens ``chrome://settings/addresses`` in a new page and returns
    the page's HTML content as a string.
    """
    # Use the VM address of the current environment instead of a fixed,
    # machine-specific IP so the getter works on any deployment.
    host = env.vm_ip
    port = 9222  # fixme: this port is hard-coded, need to be changed from config file

    remote_debugging_url = f"http://{host}:{port}"
    with sync_playwright() as p:
        # connect to remote Chrome instance
        browser = p.chromium.connect_over_cdp(remote_debugging_url)
        try:
            page = browser.new_page()

            # Navigate to Chrome's settings page for autofill
            page.goto("chrome://settings/addresses")

            # Get the HTML content of the page
            content = page.content()
        finally:
            # Always release the CDP connection, even if navigation fails.
            browser.close()

    return content
def get_shortcuts_on_desktop(env, config: Dict[str, str]):
    """
    Collect the shortcut files found on the VM's desktop.

    The shortcut extension depends on ``env.vm_platform``: ``.lnk`` on
    Windows, ``.webloc`` on Darwin, ``.desktop`` on Linux.

    Returns a dict mapping each shortcut file name to its content decoded as
    UTF-8. An empty dict is returned when the platform is unsupported or the
    desktop listing cannot be obtained; files that cannot be fetched are
    skipped.
    """
    extension_by_platform = {
        'Windows': '.lnk',      # Windows shortcuts are typically .url or .lnk files
        'Darwin': '.webloc',    # macOS's shortcuts are .webloc files
        'Linux': '.desktop',    # Linux (Ubuntu, etc.) shortcuts are typically .desktop files
    }
    os_name = env.vm_platform
    shortcut_extension = extension_by_platform.get(os_name)
    if shortcut_extension is None:
        logger.error(f"Unsupported operating system: {os_name}")
        # Same type as the success path, so callers can always use .items().
        return {}

    # Both controller calls return None when the VM server request fails.
    desktop_path = env.controller.get_vm_desktop_path()
    if not desktop_path:
        logger.error("Could not determine the desktop path of the VM")
        return {}
    desktop_directory_tree = env.controller.get_vm_directory_tree(desktop_path)
    if not desktop_directory_tree:
        logger.error("Could not list the desktop directory: %s", desktop_path)
        return {}

    shortcut_names = [entry['name'] for entry in desktop_directory_tree.get('children', [])
                      if entry['name'].endswith(shortcut_extension)]

    short_cuts = {}
    for shortcut_name in shortcut_names:
        # The path is joined on the VM so the VM's own path separator is used.
        # repr() yields a correctly quoted literal even when the file name
        # contains quotes or backslashes.
        command = ("import os; print(os.path.join(os.path.expanduser('~'), 'Desktop', "
                   f"{shortcut_name!r}))")
        vm_path = env.controller.execute_python_command(command)['output'].strip()
        content = env.controller.get_file(vm_path)
        if content is None:
            logger.error("Failed to fetch shortcut file: %s", vm_path)
            continue
        short_cuts[shortcut_name] = content.decode('utf-8')

    return short_cuts

View File

@@ -40,7 +40,7 @@ def get_vm_file(env, config: Dict[str, str]) -> Optional[str]:
file = env.controller.get_file(config["path"]) file = env.controller.get_file(config["path"])
if file is None: if file is None:
return None return None
#raise FileNotFoundError("File not found on VM: {:}".format(config["path"])) # raise FileNotFoundError("File not found on VM: {:}".format(config["path"]))
with open(_path, "wb") as f: with open(_path, "wb") as f:
f.write(file) f.write(file)

View File

@@ -1,23 +1,19 @@
import logging
from typing import Dict from typing import Dict
import os
import requests import requests
logger = logging.getLogger("desktopenv.getters.general")
def get_string(env, config: Dict[str, str]) -> str:
"""
Config:
string (str)
"""
return config["string"] def get_vm_command_line(env, config: Dict[str, str]):
vm_ip = env.vm_ip
port = 5000
command = config["command"]
def get_command_line(env, config: Dict[str, str]) -> str: response = requests.post(f"http://{vm_ip}:{port}/execute", json={"command": command})
"""
Config: if response.status_code == 200:
string (str) return response.json()["output"]
""" else:
logger.error("Failed to get vm command line. Status code: %d", response.status_code)
f = os.popen(config["command"]) return None
return f.read()

View File

@@ -1,6 +1,5 @@
import logging import logging
from typing import TypeVar from typing import TypeVar
#from typing import Dict, List
logger = logging.getLogger("desktopenv.getters.misc") logger = logging.getLogger("desktopenv.getters.misc")
@@ -13,6 +12,7 @@ def get_rule(env, config: R) -> R:
""" """
return config["rules"] return config["rules"]
def get_accessibility_tree(env, *args) -> str: def get_accessibility_tree(env, *args) -> str:
accessibility_tree: str = env.controller.get_accessibility_tree() accessibility_tree: str = env.controller.get_accessibility_tree()
logger.debug("AT@eval: %s", accessibility_tree) logger.debug("AT@eval: %s", accessibility_tree)

View File

@@ -0,0 +1,20 @@
from typing import List, Dict, Any
def get_replay(env, trajectory: List[Dict[str, Any]]) -> None:
    """
    Replay a list of keyboard actions on the VM through pyautogui.

    Each action is a dict with a ``"type"`` (``"hotkey"``, ``"typewrite"`` or
    ``"press"``) and a ``"param"`` (a list of key names for ``hotkey``, a
    string otherwise). Every action is turned into one pyautogui statement
    and sent to ``env.controller.execute_python_command``.

    Raises:
        ValueError: if an action has an unknown ``"type"``.
    """

    # fixme: need to be combined with the accessibility tree to activate the selection of the target window
    def parse(action):
        # repr() produces a properly quoted/escaped Python literal, so text
        # containing quotes or backslashes cannot break the generated code.
        action_type = action["type"]
        if action_type == "hotkey":
            keys = ", ".join(repr(key) for key in action["param"])
            return f"pyautogui.hotkey({keys})"

        if action_type == "typewrite":
            return f"pyautogui.typewrite({action['param']!r})"

        if action_type == "press":
            return f"pyautogui.press({action['param']!r})"

        # Fail loudly instead of sending None to the controller.
        raise ValueError(f"Unknown replay action type: {action_type}")

    for action in trajectory:
        env.controller.execute_python_command(parse(action))

View File

@@ -0,0 +1,34 @@
import logging
from typing import Any, Dict
from .file import get_vm_file
from .replay import get_replay
logger = logging.getLogger("desktopenv.getters.vscode")
def get_vscode_config(env, config: Dict[str, Any]) -> str:
    """
    Run a command from the VS Code command palette, then fetch a file from the VM.

    Config:
        vscode_extension_command (str): text typed into the command palette
        path (str): path of the file on the VM to fetch afterwards
        dest (str): destination file name passed on to ``get_vm_file``

    Returns whatever ``get_vm_file`` returns for ``path``/``dest``.
    """
    os_type = env.vm_platform
    vscode_extension_command = config["vscode_extension_command"]

    # fixme: depends on how we config and install the vscode in virtual machine, need to be aligned and double-checked

    # Other getters identify macOS by the platform name "Darwin"; "MacOS" is
    # still accepted so existing callers keep working.
    palette_modifier = "command" if os_type in ("Darwin", "MacOS") else "ctrl"

    # Open the command palette, type the command and confirm it.
    trajectory = [
        {"type": "hotkey", "param": [palette_modifier, "shift", "p"]},
        {"type": "typewrite", "param": vscode_extension_command},
        {"type": "press", "param": "enter"}
    ]

    get_replay(env, trajectory)

    return get_vm_file(env, {
        "path": config["path"],
        "dest": config["dest"]
    })

View File

@@ -1,4 +1,4 @@
from .chrome import is_expected_tabs, is_expected_bookmarks from .chrome import is_expected_tabs, is_expected_bookmarks, compare_pdfs, is_cookie_deleted, is_shortcut_on_desktop
from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers
from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, \ from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, \
compare_insert_equation compare_insert_equation
@@ -13,4 +13,5 @@ from .vlc import is_vlc_playing, is_vlc_recordings_folder, is_vlc_fullscreen, co
from .gimp import increase_saturation, decrease_brightness, check_file_exists, compare_triangle_positions from .gimp import increase_saturation, decrease_brightness, check_file_exists, compare_triangle_positions
from .general import check_csv, check_accessibility_tree, check_list, run_sqlite3 from .general import check_csv, check_accessibility_tree, check_list, run_sqlite3
from .thunderbird import check_thunderbird_prefs, check_thunderbird_filter from .thunderbird import check_thunderbird_prefs, check_thunderbird_filter
from .vscode import compare_text_file, compare_config, compare_answer, is_extension_installed
from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines

View File

@@ -1,5 +1,9 @@
import logging import logging
from typing import Any, Dict, List from typing import Any, Dict, List
import fitz # PyMuPDF
import rapidfuzz.fuzz as fuzz
from desktop_env.evaluators.metrics.utils import are_lists_equal, compare_urls from desktop_env.evaluators.metrics.utils import are_lists_equal, compare_urls
logger = logging.getLogger("desktopenv.metrics.chrome") logger = logging.getLogger("desktopenv.metrics.chrome")
@@ -22,18 +26,72 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f
return 0 return 0
def is_expected_bookmarks(bookmarks: List[Dict[str, Any]], rule: Dict[str, Any]) -> float: def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:
""" """
Checks if the expected bookmarks are in Chrome. Checks if the expected bookmarks are in Chrome.
""" """
if not bookmarks:
# todo return 0.
match_type = rule['type'] elif rule['type'] == "bookmark_bar_folders_names":
bookmark_bar_folders_names = [bookmark['name'] for bookmark in bookmarks['bookmark_bar']['children'] if
if match_type == "url": bookmark['type'] == 'folder']
expected_urls = rule['urls'] return 1. if set(bookmark_bar_folders_names) == set(rule['names']) else 0.
actual_urls = [bookmark['url'] for bookmark in bookmarks] elif rule['type'] == "bookmark_bar_websites_urls":
return 1 if are_lists_equal(expected_urls, actual_urls, compare_urls) else 0 bookmark_bar_websites_urls = [bookmark['url'] for bookmark in bookmarks['bookmark_bar']['children'] if
bookmark['type'] == 'url']
return 1. if set(bookmark_bar_websites_urls) == set(rule['urls']) else 0.
else: else:
logger.error(f"Unknown type: {match_type}") raise TypeError(f"{rule['type']} not support yet!")
return 0
def compare_pdfs(pdf1_path, pdf2_path):
    """
    Score the textual similarity of two PDF files.

    The text of every page of each file is extracted and concatenated, and
    the fuzzy-match ratio of the two texts is returned divided by 100.
    """

    def extract_text_from_pdf(pdf_path):
        """Return the stripped, concatenated text of all pages of the PDF."""
        with fitz.open(pdf_path) as document:
            return "".join(page.get_text() for page in document).strip()

    first_text = extract_text_from_pdf(pdf1_path)
    second_text = extract_text_from_pdf(pdf2_path)

    return fuzz.ratio(first_text, second_text) / 100
def is_cookie_deleted(cookie_data, rule):
    """
    Check that no remaining cookie belongs to any of the given domains.

    For ``rule['type'] == 'domains'``, element 1 of every cookie row is
    compared (via ``compare_urls``) against each entry of ``rule['domains']``;
    the result is 0. as soon as one pair matches and 1. otherwise.

    Raises:
        TypeError: for any other rule type.
    """
    if rule['type'] != 'domains':
        raise TypeError(f"{rule['type']} not support yet!")

    remaining_hosts = [row[1] for row in cookie_data]
    still_present = any(
        compare_urls(domain, host)
        for domain in rule['domains']
        for host in remaining_hosts
    )
    return 0. if still_present else 1.
def is_shortcut_on_desktop(shortcuts: Dict[str, str], rule):
    """
    Check whether a shortcut with the expected name exists on the desktop.

    ``shortcuts`` maps shortcut file names to their text content. For
    ``rule['type'] == 'name'`` the result is 1. if any content contains the
    line ``Name=<rule['name']>`` followed by a newline, else 0.

    Raises:
        TypeError: for every other rule type ('url' and 'id' included).
    """
    # fixme: if the name of the website changed in the future, this will not work; can be replaced with url
    if rule['type'] != 'name':
        # 'url', 'id' and unknown types are all unsupported for now.
        raise TypeError(f"{rule['type']} not support yet!")

    name_found = any(
        "Name=" + rule['name'] + "\n" in content
        for content in shortcuts.values()
    )
    return 1. if name_found else 0.

View File

@@ -1,12 +1,14 @@
import xml.etree.ElementTree as ET import logging
import os import os
import xml.etree.ElementTree as ET
from typing import List, Dict, Any from typing import List, Dict, Any
from docx import Document from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import logging
logger = logging.getLogger("desktopenv.metric.docs") logger = logging.getLogger("desktopenv.metric.docs")
def find_default_font(config_file_path, rules): def find_default_font(config_file_path, rules):
"""Find the default font in LibreOffice Writer.""" """Find the default font in LibreOffice Writer."""
default_font = None default_font = None

View File

@@ -1,4 +1,75 @@
from pptx import Presentation from pptx import Presentation
import os
def is_red_color(color):
    """
    Return True if ``color`` is pure red, i.e. its ``rgb`` equals (255, 0, 0).

    ``color`` may be None, in which case the result is False.
    """
    # Check for None before touching .rgb; the former debug print read
    # color.rgb first and crashed on a missing color.
    return color is not None and color.rgb == (255, 0, 0)
def get_master_placeholder_color(prs):
    """
    Find the font color of the "<number>" placeholder on the slide masters.

    Walks the placeholders of every slide master of ``prs`` and returns the
    font color of the first paragraph of the first placeholder whose text is
    exactly ``"<number>"`` and which has at least one paragraph. Returns None
    when no such placeholder exists.
    """
    for master in prs.slide_masters:
        for placeholder in master.placeholders:
            is_number_field = placeholder.has_text_frame and placeholder.text == "<number>"
            if not is_number_field:
                continue
            paragraphs = placeholder.text_frame.paragraphs
            if paragraphs:
                return paragraphs[0].font.color
    return None
def check_slide_numbers_color(pptx_file_path):
    """
    Check whether the slide-number color of a presentation is red.

    The presentation is scanned for the first shape whose text consists only
    of digits (taken to be a slide number). When one is found, the color of
    the "<number>" placeholder on the slide masters decides the result.

    Returns 1 if that color exists and is red, otherwise 0. 0 is also
    returned when no slide-number shape is found.
    """
    presentation = Presentation(pptx_file_path)

    for slide in presentation.slides:
        for shape in slide.shapes:
            # only shapes that carry text can be a slide number
            if hasattr(shape, "text") and shape.text.isdigit():
                # The number's formatting is taken from the master placeholder.
                font_color = get_master_placeholder_color(presentation)
                return 1 if font_color is not None and is_red_color(font_color) else 0

    # No slide number anywhere: report failure explicitly rather than None.
    return 0
def compare_pptx_files(file1_path, file2_path):
    """
    Compare the text content of two presentations.

    Returns 0 when the slide counts differ or when any pair of same-position
    shapes that both carry text have different text; otherwise returns 1.
    Shapes are paired with zip(), so surplus shapes on one side are not
    compared.
    """
    first = Presentation(file1_path)
    second = Presentation(file2_path)

    # Different slide counts can never match.
    if len(first.slides) != len(second.slides):
        return 0

    for left_slide, right_slide in zip(first.slides, second.slides):
        for left_shape, right_shape in zip(left_slide.shapes, right_slide.shapes):
            both_textual = hasattr(left_shape, "text") and hasattr(right_shape, "text")
            if both_textual and left_shape.text != right_shape.text:
                return 0

    return 1
def has_two_lines_on_page(slide):
    """
    Return True as soon as two shapes of type 1 are found on ``slide``.

    Shapes are examined in order and the scan stops at the second match;
    False is returned when fewer than two such shapes exist.
    """
    still_needed = 2
    for shape in slide.shapes:
        # 1 is treated as the Line shape type.
        # NOTE(review): python-pptx's MSO_SHAPE_TYPE appears to define 1 as
        # AUTO_SHAPE and 9 as LINE - confirm which value is intended.
        if shape.shape_type != 1:
            continue
        still_needed -= 1
        if still_needed == 0:
            return True
    return False
def check_for_two_lines(prs):
    """
    Return 1 if any slide of the presentation file ``prs`` passes
    ``has_two_lines_on_page``, otherwise 0.
    """
    presentation = Presentation(prs)
    found = any(has_two_lines_on_page(slide) for slide in presentation.slides)
    return 1 if found else 0
def check_file_exists(directory, filename):
    """Return 1 if *filename* inside *directory* is an existing regular file, else 0.

    Directories and missing paths both yield 0.
    """
    return int(os.path.isfile(os.path.join(directory, filename)))
if __name__ == "__main__": if __name__ == "__main__":
path1 = "../../任务数据/LibreOffice Impress/Change_Color_Slide_Number_gold_textbox.pptx" path1 = "../../任务数据/LibreOffice Impress/Change_Color_Slide_Number_gold_textbox.pptx"

View File

@@ -1,37 +1,38 @@
import lxml.cssselect
from lxml.etree import _Element as Element
import lxml.etree
import fnmatch import fnmatch
from typing import Dict, List from typing import Dict, List
import lxml.cssselect
import lxml.etree
from lxml.etree import _Element as Element
_libconf_namespaces = [("oor", "http://openoffice.org/2001/registry")] _libconf_namespaces = [("oor", "http://openoffice.org/2001/registry")]
_libconf_ns_mapping = dict(_libconf_namespaces) _libconf_ns_mapping = dict(_libconf_namespaces)
_setup_locale_selector = lxml.cssselect.CSSSelector( 'item[oor|path$=L10N]>prop[oor|name=ooSetupSystemLocale]>value' _setup_locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooSetupSystemLocale]>value',
, namespaces=_libconf_ns_mapping namespaces=_libconf_ns_mapping)
) _locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooLocale]>value',
_locale_selector = lxml.cssselect.CSSSelector( 'item[oor|path$=L10N]>prop[oor|name=ooLocale]>value' namespaces=_libconf_ns_mapping)
, namespaces=_libconf_ns_mapping
)
def check_libre_locale(config_file: str, rules: Dict[str, List[str]]) -> float:
    """Check the locale stored in a LibreOffice registry file against patterns.

    The value selected by ``_setup_locale_selector`` (``ooSetupSystemLocale``)
    is preferred; when that selector matches nothing, the value selected by
    ``_locale_selector`` (``ooLocale``) is used instead.

    Args:
        config_file: path of the XML registry file to parse.
        rules: dict with key ``"locale_set"`` holding a list of
            ``fnmatch``-style patterns, e.g. ``["ru-*", "de-*"]``.

    Returns:
        float: 1.0 if the locale matches at least one pattern
        (case-sensitively), otherwise 0.0. This includes the cases where the
        file defines neither locale or the chosen value is empty; both used to
        raise (IndexError / TypeError) instead of scoring 0.
    """
    config: Element = lxml.etree.parse(config_file).getroot()

    # fall back to ooLocale only when ooSetupSystemLocale is absent
    candidates: List[Element] = _setup_locale_selector(config) or _locale_selector(config)
    if not candidates:
        return 0.

    locale_value = candidates[0].text
    if not locale_value:
        # an empty <value/> has text None, which fnmatch cannot handle
        return 0.

    return float(any(fnmatch.fnmatchcase(locale_value, ptn)
                     for ptn in rules["locale_set"]))
if __name__ == "__main__": if __name__ == "__main__":
path1 = "../../任务数据/LibreOffice Calc/registrymodifications.ru.xcu" path1 = "../../任务数据/LibreOffice Calc/registrymodifications.ru.xcu"
print( check_libre_locale( path1, { "locale_set": [ "ru-*", "de-*", "fr-*" print(check_libre_locale(path1, {"locale_set": ["ru-*", "de-*", "fr-*"
, "pt-*", "es-*", "it-*" , "pt-*", "es-*", "it-*"
] ]
} }
) )
) )

View File

@@ -1,13 +1,11 @@
from pypdf import PdfReader
import operator import operator
from typing import Dict
from typing import Any from typing import Any
from typing import Dict
from pypdf import PdfReader
def check_pdf_pages(pdf_file: str, rules: Dict[str, Any]) -> float:
    """Compare the page count of a PDF against a reference value.

    Args:
        pdf_file: path of the PDF to open with ``PdfReader``.
        rules: dict with
            - ``"relation"``: name of a function in the ``operator`` module
              (e.g. ``"eq"``, ``"lt"``), called as ``op(page_count, ref_value)``
            - ``"ref_value"``: the value to compare the page count with

    Returns:
        float: the comparison result converted to float.
    """
    page_count: int = len(PdfReader(pdf_file).pages)
    relation = getattr(operator, rules["relation"])
    return float(relation(page_count, rules["ref_value"]))

View File

@@ -1,18 +1,19 @@
import pandas as pd import logging
import operator
from numbers import Number
from typing import Any, Union
from typing import Dict, List
import openpyxl import openpyxl
import pandas as pd
from openpyxl import Workbook from openpyxl import Workbook
from openpyxl.worksheet.worksheet import Worksheet from openpyxl.worksheet.worksheet import Worksheet
from .utils import load_charts, load_sparklines from .utils import load_charts, load_sparklines
import operator
from typing import Dict, List
from typing import Any, Union
from numbers import Number
import logging
logger = logging.getLogger("desktopenv.metric.table") logger = logging.getLogger("desktopenv.metric.table")
def compare_table(actual: str, expected: str, **options) -> float: def compare_table(actual: str, expected: str, **options) -> float:
""" """
Args: Args:
@@ -44,28 +45,28 @@ def compare_table(actual: str, expected: str, **options) -> float:
workbook1: Workbook = openpyxl.load_workbook(actual) workbook1: Workbook = openpyxl.load_workbook(actual)
workbook2: Workbook = openpyxl.load_workbook(expected) workbook2: Workbook = openpyxl.load_workbook(expected)
if ftr=="sparkline": if ftr == "sparkline":
sp1 = load_sparklines(actual) sp1 = load_sparklines(actual)
sp2 = load_sparklines(expected) sp2 = load_sparklines(expected)
new_metric: bool = sp1 == sp2 new_metric: bool = sp1 == sp2
logger.debug("Sparkline Metric: {:}".format(new_metric)) logger.debug("Sparkline Metric: {:}".format(new_metric))
elif ftr=="chart": elif ftr == "chart":
charts1 = load_charts(workbook1, **options) charts1 = load_charts(workbook1, **options)
charts2 = load_charts(workbook2, **options) charts2 = load_charts(workbook2, **options)
new_metric: bool = charts1 == charts2 new_metric: bool = charts1 == charts2
logger.debug("Chart Metric: {:}".format(new_metric)) logger.debug("Chart Metric: {:}".format(new_metric))
elif ftr=="number_format": elif ftr == "number_format":
number_formats1: List[str] = [ c.number_format.lower()\ number_formats1: List[str] = [c.number_format.lower() \
for col in workbook1.active.iter_cols()\ for col in workbook1.active.iter_cols() \
for c in col\ for c in col \
if c.data_type=="n" if c.data_type == "n"
] ]
number_formats2: List[str] = [ c.number_format.lower()\ number_formats2: List[str] = [c.number_format.lower() \
for col in workbook2.active.iter_cols()\ for col in workbook2.active.iter_cols() \
for c in col\ for c in col \
if c.data_type=="n" if c.data_type == "n"
] ]
new_metric: bool = number_formats1==number_formats2 new_metric: bool = number_formats1 == number_formats2
logger.debug("Number Format Metric: {:}".format(new_metric)) logger.debug("Number Format Metric: {:}".format(new_metric))
else: else:
raise NotImplementedError("Unsupported xlsx feature: {:}".format(ftr)) raise NotImplementedError("Unsupported xlsx feature: {:}".format(ftr))
@@ -73,6 +74,7 @@ def compare_table(actual: str, expected: str, **options) -> float:
return float(metric) return float(metric)
def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float: def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float:
if result is None: if result is None:
return 0. return 0.
@@ -114,6 +116,7 @@ def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float:
return float(passes) return float(passes)
def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float: def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float:
if result is None: if result is None:
return 0. return 0.
@@ -121,16 +124,18 @@ def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float:
worksheet: Worksheet = openpyxl.load_workbook(filename=result).active worksheet: Worksheet = openpyxl.load_workbook(filename=result).active
return float(worksheet.freeze_panes == rules["position"]) return float(worksheet.freeze_panes == rules["position"])
def check_xlsx_zoom(result: str, rules: Dict[str, Union[str, Number]]) -> float:
    """Compare the zoom scale of a workbook's active sheet with a reference.

    Args:
        result: path of the xlsx file, or ``None`` when no result is available.
        rules: dict with
            - ``"relation"``: name of a function in the ``operator`` module,
              called as ``op(zoom_scale, ref_value)``
            - ``"ref_value"``: the value to compare the zoom scale with

    Returns:
        float: 0.0 when *result* is ``None``, otherwise the comparison result
        converted to float.
    """
    if result is None:
        return 0.

    active_sheet = openpyxl.load_workbook(filename=result).active
    # a falsy zoomScale (e.g. unset) is treated as 100
    zoom_scale: Number = active_sheet.sheet_view.zoomScale or 100.
    relation = getattr(operator, rules["relation"])
    return float(relation(zoom_scale, rules["ref_value"]))
if __name__ == '__main__': if __name__ == '__main__':
# path1 = "" # path1 = ""
@@ -168,51 +173,51 @@ if __name__ == '__main__':
# ] # ]
# print(check_sheet_list(path1, rule)) # print(check_sheet_list(path1, rule))
#path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx" # path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx"
#path2 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold2.xlsx" # path2 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold2.xlsx"
#print(compare_table(path1, path2, features=["chart"], chart_props=["type", "direction"])) # print(compare_table(path1, path2, features=["chart"], chart_props=["type", "direction"]))
#path1 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold.xlsx" # path1 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold.xlsx"
#path2 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold3.xlsx" # path2 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold3.xlsx"
#path1 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot.xlsx" # path1 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot.xlsx"
#path2 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot_gold.xlsx" # path2 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot_gold.xlsx"
#workbook1: Workbook = openpyxl.load_workbook(filename=path1) # workbook1: Workbook = openpyxl.load_workbook(filename=path1)
#worksheet1: Worksheet = workbook1.active # worksheet1: Worksheet = workbook1.active
#import itertools # import itertools
#for col, r in itertools.product( ['A', 'B'] # for col, r in itertools.product( ['A', 'B']
#, range(1, 20) # , range(1, 20)
#): # ):
#position: str = "{:}{:d}".format(col, r) # position: str = "{:}{:d}".format(col, r)
#print(worksheet1[position]) # print(worksheet1[position])
#print(worksheet1[position].value) # print(worksheet1[position].value)
#print(worksheet1[position].number_format) # print(worksheet1[position].number_format)
#workbook2: Workbook = openpyxl.load_workbook(filename=path2) # workbook2: Workbook = openpyxl.load_workbook(filename=path2)
#worksheet2: Worksheet = workbook2.active # worksheet2: Worksheet = workbook2.active
#for col, r in itertools.product( ['A', 'B'] # for col, r in itertools.product( ['A', 'B']
#, range(1, 20) # , range(1, 20)
#): # ):
#position: str = "{:}{:d}".format(col, r) # position: str = "{:}{:d}".format(col, r)
#print(worksheet2[position]) # print(worksheet2[position])
#print(worksheet2[position].value) # print(worksheet2[position].value)
#print(worksheet2[position].number_format) # print(worksheet2[position].number_format)
#print(compare_table(path1, path2, features=["number_format"])) # print(compare_table(path1, path2, features=["number_format"]))
#path1 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells_gold.xlsx" # path1 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells_gold.xlsx"
#path2 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells.xlsx" # path2 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells.xlsx"
#workbook1: Workbook = openpyxl.load_workbook(filename=path1) # workbook1: Workbook = openpyxl.load_workbook(filename=path1)
#worksheet1: Worksheet = workbook1.active # worksheet1: Worksheet = workbook1.active
#print(worksheet1.sheet_view.zoomScale) # print(worksheet1.sheet_view.zoomScale)
#print(type(worksheet1.sheet_view.zoomScale)) # print(type(worksheet1.sheet_view.zoomScale))
# #
#import os # import os
#import os.path # import os.path
#for wb in filter( lambda f: f.endswith(".xlsx") # for wb in filter( lambda f: f.endswith(".xlsx")
#, os.listdir("../../任务数据/LibreOffice Calc/") # , os.listdir("../../任务数据/LibreOffice Calc/")
#): # ):
#path = os.path.join("../../任务数据/LibreOffice Calc/", wb) # path = os.path.join("../../任务数据/LibreOffice Calc/", wb)
#print(wb, openpyxl.load_workbook(filename=path).active.sheet_view.zoomScale) # print(wb, openpyxl.load_workbook(filename=path).active.sheet_view.zoomScale)
#print(check_zoom(path1, {"relation": "lt", "ref_value": 100})) # print(check_zoom(path1, {"relation": "lt", "ref_value": 100}))
#print(check_zoom(path2, {"relation": "lt", "ref_value": 100})) # print(check_zoom(path2, {"relation": "lt", "ref_value": 100}))
path1 = "../../任务数据/LibreOffice Calc/Padding_Decimals_In_Formular_gold.xlsx" path1 = "../../任务数据/LibreOffice Calc/Padding_Decimals_In_Formular_gold.xlsx"
data_frame: pd.DataFrame = pd.read_excel(path1) data_frame: pd.DataFrame = pd.read_excel(path1)

View File

@@ -1,16 +1,18 @@
from typing import Dict
def compare_text_file(actual: str, expected: str, **options) -> float: def compare_text_file(actual: str, expected: str, **options) -> float:
""" """
Args: Args:
actual (str): path to result xlsx actual (str): path to result text file
expected (str): path to gold xlsx expected (str): path to gold text file
options (Dict[str, List[str]]): dict like
{
}
Return: Return:
float: the score float: the score
""" """
if not actual:
return 0.
with open(actual) as f1: with open(actual) as f1:
actual_text = f1.read() actual_text = f1.read()
with open(expected) as f2: with open(expected) as f2:
@@ -20,13 +22,46 @@ def compare_text_file(actual: str, expected: str, **options) -> float:
return 1.0 return 1.0
return 0.0 return 0.0
def compare_answer(actual: str, expected: str, **options) -> float:
def compare_config(actual: str, rules: Dict, **options) -> float:
    """Compare the full content of a config file with an expected string.

    Args:
        actual (str): path to the result file; a falsy value scores 0
        rules (Dict): dict like
          {
            "expect": str, the exact text the file must contain
          }
        options: accepted but not used

    Return:
        float: 1.0 on an exact match, otherwise 0.0
    """

    if not actual:
        return 0.

    with open(actual) as config_file:
        is_match = config_file.read() == rules['expect']
    return 1.0 if is_match else 0.0
def compare_answer(actual: str, rules: Dict, **options) -> float:
    """Score an answer string by exact equality with the expected one.

    Args:
        actual (str): result string; a falsy value scores 0
        rules (Dict): dict like
          {
            "expect": str, the gold string
          }
        options: accepted but not used

    Return:
        float: 1.0 on an exact match, otherwise 0.0
    """

    # TODO: can use text embedding to get non-zero return
    return 1.0 if actual and actual == rules['expect'] else 0.0
if __name__ == '__main__':
def is_extension_installed(actual: str, rules: Dict, **options):
    """Check whether an expected substring is (or is not) contained in *actual*.

    Args:
        actual (str): the text to search in
        rules (Dict): dict like
          {
            "type": "contain" | "not_contain",
            "expected": str, the substring to look for
          }
        options: accepted but not used

    Return:
        float: 1.0 when the containment check demanded by ``rules["type"]``
          holds, otherwise 0.0

    Raises:
        NotImplementedError: for any other ``rules["type"]``
    """
    rule_type = rules['type']
    if rule_type not in ('contain', 'not_contain'):
        raise NotImplementedError

    is_present = rules['expected'] in actual
    wanted_present = rule_type == 'contain'
    return 1.0 if is_present == wanted_present else 0.0

View File

@@ -71,3 +71,10 @@ You can use accerciser to check the accessibility tree on GNOME VM.
```sh ```sh
sudo apt install accerciser sudo apt install accerciser
``` ```
### Additional Installation
Activating the window manager control requires the installation of `wmctrl`:
```bash
sudo apt install wmctrl
```

View File

@@ -3,29 +3,26 @@ import os
import platform import platform
import subprocess import subprocess
from pathlib import Path from pathlib import Path
from typing import Any, Optional
from typing import List, Dict
import Xlib
import lxml.etree import lxml.etree
from lxml.etree import _Element
import pyatspi import pyatspi
import pyautogui
import requests
from PIL import Image
from Xlib import display, X
from flask import Flask, request, jsonify, send_file, abort
from lxml.etree import _Element
from pyatspi import Accessible, StateType from pyatspi import Accessible, StateType
from pyatspi import Action as ATAction
from pyatspi import Component, Document from pyatspi import Component, Document
from pyatspi import Text as ATText from pyatspi import Text as ATText
from pyatspi import Value as ATValue from pyatspi import Value as ATValue
from pyatspi import Action as ATAction
from typing import List, Dict
from typing import Any, Optional
import Xlib
import pyautogui
from PIL import Image
from Xlib import display, X
from pyxcursor import Xcursor from pyxcursor import Xcursor
import requests
from flask import Flask, request, jsonify, send_file, abort
from werkzeug.utils import secure_filename
app = Flask(__name__) app = Flask(__name__)
pyautogui.PAUSE = 0 pyautogui.PAUSE = 0
@@ -141,22 +138,24 @@ def get_terminal_output():
xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]' xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]'
terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map) terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map)
output = terminals[0].text.rstrip() if len(terminals) == 1 else None output = terminals[0].text.rstrip() if len(terminals) == 1 else None
else: # windows and macos platform is not implemented currently else: # windows and macos platform is not implemented currently
raise NotImplementedError raise NotImplementedError
return jsonify({"output": output, "status": "success"}) return jsonify({"output": output, "status": "success"})
except: except:
return jsonify({"output": None, "status": "error"}) return jsonify({"output": None, "status": "error"})
_accessibility_ns_map = { "st": "uri:deskat:state.at-spi.gnome.org" _accessibility_ns_map = {"st": "uri:deskat:state.at-spi.gnome.org"
, "attr": "uri:deskat:attributes.at-spi.gnome.org" , "attr": "uri:deskat:attributes.at-spi.gnome.org"
, "cp": "uri:deskat:component.at-spi.gnome.org" , "cp": "uri:deskat:component.at-spi.gnome.org"
, "doc": "uri:deskat:document.at-spi.gnome.org" , "doc": "uri:deskat:document.at-spi.gnome.org"
, "docattr": "uri:deskat:attributes.document.at-spi.gnome.org" , "docattr": "uri:deskat:attributes.document.at-spi.gnome.org"
, "txt": "uri:deskat:text.at-spi.gnome.org" , "txt": "uri:deskat:text.at-spi.gnome.org"
, "val": "uri:deskat:value.at-spi.gnome.org" , "val": "uri:deskat:value.at-spi.gnome.org"
, "act": "uri:deskat:action.at-spi.gnome.org" , "act": "uri:deskat:action.at-spi.gnome.org"
} }
def _create_node(node: Accessible) -> _Element: def _create_node(node: Accessible) -> _Element:
attribute_dict: Dict[str, Any] = {"name": node.name} attribute_dict: Dict[str, Any] = {"name": node.name}
@@ -164,11 +163,11 @@ def _create_node(node: Accessible) -> _Element:
states: List[StateType] = node.getState().get_states() states: List[StateType] = node.getState().get_states()
for st in states: for st in states:
state_name: str = StateType._enum_lookup[st] state_name: str = StateType._enum_lookup[st]
attribute_dict[ "{{{:}}}{:}"\ attribute_dict["{{{:}}}{:}" \
.format( _accessibility_ns_map["st"] .format(_accessibility_ns_map["st"]
, state_name.split("_", maxsplit=1)[1].lower() , state_name.split("_", maxsplit=1)[1].lower()
) )
] = "true" ] = "true"
# }}} States # # }}} States #
# Attributes {{{ # # Attributes {{{ #
@@ -177,11 +176,11 @@ def _create_node(node: Accessible) -> _Element:
attribute_name: str attribute_name: str
attribute_value: str attribute_value: str
attribute_name, attribute_value = attrbt.split(":", maxsplit=1) attribute_name, attribute_value = attrbt.split(":", maxsplit=1)
attribute_dict[ "{{{:}}}{:}"\ attribute_dict["{{{:}}}{:}" \
.format( _accessibility_ns_map["attr"] .format(_accessibility_ns_map["attr"]
, attribute_name , attribute_name
) )
] = attribute_value ] = attribute_value
# }}} Attributes # # }}} Attributes #
# Component {{{ # # Component {{{ #
@@ -190,9 +189,12 @@ def _create_node(node: Accessible) -> _Element:
except NotImplementedError: except NotImplementedError:
pass pass
else: else:
attribute_dict["{{{:}}}screencoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_SCREEN)) attribute_dict["{{{:}}}screencoord".format(_accessibility_ns_map["cp"])] = str(
attribute_dict["{{{:}}}windowcoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_WINDOW)) component.getPosition(pyatspi.XY_SCREEN))
attribute_dict["{{{:}}}parentcoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_PARENT)) attribute_dict["{{{:}}}windowcoord".format(_accessibility_ns_map["cp"])] = str(
component.getPosition(pyatspi.XY_WINDOW))
attribute_dict["{{{:}}}parentcoord".format(_accessibility_ns_map["cp"])] = str(
component.getPosition(pyatspi.XY_PARENT))
attribute_dict["{{{:}}}size".format(_accessibility_ns_map["cp"])] = str(component.getSize()) attribute_dict["{{{:}}}size".format(_accessibility_ns_map["cp"])] = str(component.getSize())
# }}} Component # # }}} Component #
@@ -209,11 +211,11 @@ def _create_node(node: Accessible) -> _Element:
attribute_name: str attribute_name: str
attribute_value: str attribute_value: str
attribute_name, attribute_value = attrbt.split(":", maxsplit=1) attribute_name, attribute_value = attrbt.split(":", maxsplit=1)
attribute_dict[ "{{{:}}}{:}"\ attribute_dict["{{{:}}}{:}" \
.format( _accessibility_ns_map["docattr"] .format(_accessibility_ns_map["docattr"]
, attribute_name , attribute_name
) )
] = attribute_value ] = attribute_value
# }}} Document # # }}} Document #
# Text {{{ # # Text {{{ #
@@ -223,13 +225,13 @@ def _create_node(node: Accessible) -> _Element:
pass pass
else: else:
# only text shown on current screen is available # only text shown on current screen is available
#attribute_dict["txt:text"] = text_obj.getText(0, text_obj.characterCount) # attribute_dict["txt:text"] = text_obj.getText(0, text_obj.characterCount)
text: str = text_obj.getText(0, text_obj.characterCount) text: str = text_obj.getText(0, text_obj.characterCount)
# }}} Text # # }}} Text #
# Selection {{{ # # Selection {{{ #
try: try:
node.querySelection() node.querySelection()
except NotImplementedError: except NotImplementedError:
pass pass
else: else:
@@ -256,34 +258,36 @@ def _create_node(node: Accessible) -> _Element:
else: else:
for i in range(action.nActions): for i in range(action.nActions):
action_name: str = action.getName(i).replace(" ", "-") action_name: str = action.getName(i).replace(" ", "-")
attribute_dict[ "{{{:}}}{:}_desc"\ attribute_dict["{{{:}}}{:}_desc" \
.format( _accessibility_ns_map["act"] .format(_accessibility_ns_map["act"]
, action_name , action_name
) )
] = action.getDescription(i) ] = action.getDescription(i)
attribute_dict[ "{{{:}}}{:}_kb"\ attribute_dict["{{{:}}}{:}_kb" \
.format( _accessibility_ns_map["act"] .format(_accessibility_ns_map["act"]
, action_name , action_name
) )
] = action.getKeyBinding(i) ] = action.getKeyBinding(i)
# }}} Action # # }}} Action #
xml_node = lxml.etree.Element( node.getRoleName().replace(" ", "-") xml_node = lxml.etree.Element(node.getRoleName().replace(" ", "-")
, attrib=attribute_dict , attrib=attribute_dict
, nsmap=_accessibility_ns_map , nsmap=_accessibility_ns_map
) )
if "text" in locals() and len(text)>0: if "text" in locals() and len(text) > 0:
xml_node.text = text xml_node.text = text
for ch in node: for ch in node:
xml_node.append(_create_node(ch)) xml_node.append(_create_node(ch))
return xml_node return xml_node
@app.route("/accessibility", methods=["GET"])
def get_accessibility_tree():
    """Return the accessibility tree of desktop 0 as an XML string.

    The desktop obtained from ``pyatspi.Registry.getDesktop(0)`` is converted
    by ``_create_node`` and serialized to a unicode XML string, which is sent
    back as JSON under the key ``"AT"``.
    """
    root_xml: _Element = _create_node(pyatspi.Registry.getDesktop(0))
    serialized: str = lxml.etree.tostring(root_xml, encoding="unicode")
    return jsonify({"AT": serialized})
@app.route('/screen_size', methods=['POST']) @app.route('/screen_size', methods=['POST'])
def get_screen_size(): def get_screen_size():
d = display.Display() d = display.Display()
@@ -563,5 +567,43 @@ def open_file():
return f"Failed to open {path}. Error: {e}", 500 return f"Failed to open {path}. Error: {e}", 500
@app.route("/setup/activate_window", methods=['POST'])
def activate_window():
    """Bring the window whose title matches ``window_name`` to the front.

    Expects a JSON body like ``{"window_name": "<title>"}``.

    Platform handling:
      - Windows / macOS: the first window returned by
        ``pygetwindow.getWindowsWithTitle`` is activated (on macOS it is
        un-minimized first).
      - Linux: ``wmctrl -a <window_name>`` is launched without waiting for it,
        so a failure of wmctrl itself is not reported back.

    Returns:
        A ``(message, status)`` tuple: 200 on success, 400 when
        ``window_name`` is missing or the OS is unsupported, 404 when no
        matching window exists (Windows / macOS only).
    """
    data = request.json
    window_name = data.get('window_name', None)
    if not window_name:
        # without a title there is nothing to look up; passing None on to
        # wmctrl / pygetwindow would fail with an unrelated error
        return "window_name required", 400

    os_name = platform.system()

    if os_name in ('Windows', 'Darwin'):
        import pygetwindow as gw
        try:
            # take the first window matching the requested title
            window = gw.getWindowsWithTitle(window_name)[0]
            if os_name == 'Darwin':
                # un-minimize the window before bringing it to the front
                window.unminimize()
            window.activate()
        except IndexError:
            return f"Window '{window_name}' not found.", 404

    elif os_name == 'Linux':
        # fire-and-forget, as before; wmctrl must be installed on the VM
        subprocess.Popen(["wmctrl", "-a", window_name])

    else:
        return f"Operating system {os_name} not supported.", 400

    return "Window activated successfully", 200
if __name__ == '__main__': if __name__ == '__main__':
app.run(debug=True, host="0.0.0.0") app.run(debug=True, host="0.0.0.0")

View File

@@ -36,7 +36,8 @@
"expected": { "expected": {
"type": "rule", "type": "rule",
"rules": { "rules": {
"type": "bookmark_bar_folders_names",
"names": ["Favorites"]
} }
} }
} }

View File

@@ -3,16 +3,50 @@
"snapshot": "chrome", "snapshot": "chrome",
"instruction": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?", "instruction": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
"source": "https://www.laptopmag.com/articles/how-to-create-desktop-shortcuts-for-web-pages-using-chrome", "source": "https://www.laptopmag.com/articles/how-to-create-desktop-shortcuts-for-web-pages-using-chrome",
"config": [], "config": [
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "chrome_open_tabs",
"parameters": {
"urls_to_open": [
"https://www.mathsisfun.com/games/2048.html"
]
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"chrome" "chrome"
], ],
"evaluator": { "evaluator": {
"func": "", "func": "is_shortcut_on_desktop",
"result": { "result": {
"type": "shortcuts_on_desktop"
}, },
"expected": { "expected": {
"type": "rule",
"rules": {
"type": "name",
"name": "Play Puzzle Game 2048"
}
} }
} }
} }

View File

@@ -1,18 +1,53 @@
{ {
"id": "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", "id": "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
"snapshot": "chrome", "snapshot": "chrome",
"instruction": "Can you save this webpage I'm looking at to my bookmarks so I can come back to it later?", "instruction": "Can you save this webpage I'm looking at to the bookmarks bar so I can come back to it later?",
"source": "https://www.youtube.com/watch?v=ZaZ8GcTxjXA", "source": "https://www.youtube.com/watch?v=ZaZ8GcTxjXA",
"config": [], "config": [
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "chrome_open_tabs",
"parameters": {
"urls_to_open": [
"https://blog.eleuther.ai/rotary-embeddings/",
"https://jalammar.github.io/illustrated-transformer/"
]
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"chrome" "chrome"
], ],
"evaluator": { "evaluator": {
"func": "", "func": "is_expected_bookmarks",
"result": { "result": {
"type": "bookmarks"
}, },
"expected": { "expected": {
"type": "rule",
"rules": {
"type": "bookmark_bar_websites_urls",
"urls": ["https://jalammar.github.io/illustrated-transformer/"]
}
} }
} }
} }

View File

@@ -1,18 +1,54 @@
{ {
"id": "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", "id": "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
"snapshot": "chrome", "snapshot": "chrome",
"instruction": "Can you help me clean up my computer by getting rid of all the tracking things that websites like Amazon or eBay might have saved? I want to make sure my browsing is private and those sites don't remember me.", "instruction": "Can you help me clean up my computer by getting rid of all the tracking things that Amazon might have saved? I want to make sure my browsing is private and those sites don't remember me.",
"source": "https://support.google.com/chrome/answer/95647?hl=en&ref_topic=7438325&sjid=16867045591165135686-AP#zippy=%2Cdelete-cookies-from-a-site", "source": "https://support.google.com/chrome/answer/95647?hl=en&ref_topic=7438325&sjid=16867045591165135686-AP#zippy=%2Cdelete-cookies-from-a-site",
"config": [], "config": [
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "chrome_open_tabs",
"parameters": {
"urls_to_open": [
"https://www.amazon.com",
"https://www.amazon.com/s?k=huggingface+transformers+book"
]
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"chrome" "chrome"
], ],
"evaluator": { "evaluator": {
"func": "", "func": "is_cookie_deleted",
"result": { "result": {
"type": "cookie_data",
"dest": "Cookies"
}, },
"expected": { "expected": {
"type": "rule",
"rules": {
"type": "domains",
"domains": [".amazon.com"]
}
} }
} }
} }

View File

@@ -3,16 +3,50 @@
"snapshot": "chrome", "snapshot": "chrome",
"instruction": "Computer, can you turn the webpage I'm looking at into a PDF file and put it on my main screen, you know, the Desktop?", "instruction": "Computer, can you turn the webpage I'm looking at into a PDF file and put it on my main screen, you know, the Desktop?",
"source": "https://in5stepstutorials.com/google-chrome/save-web-page-as-pdf-in-chrome.php", "source": "https://in5stepstutorials.com/google-chrome/save-web-page-as-pdf-in-chrome.php",
"config": [], "config": [
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "chrome_open_tabs",
"parameters": {
"urls_to_open": [
"https://lilianweng.github.io/posts/2023-06-23-agent/"
]
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"chrome" "chrome"
], ],
"evaluator": { "evaluator": {
"func": "", "func": "compare_pdfs",
"result": { "result": {
"type": "vm_file",
"path": "Desktop/LLM Powered Autonomous Agents _ Lil'Log.pdf",
"dest": "LLM Powered Autonomous Agents _ Lil'Log.pdf"
}, },
"expected": { "expected": {
"type": "pdf_from_url",
"path": "https://lilianweng.github.io/posts/2023-06-23-agent/",
"dest": "LLM Powered Autonomous Agents _ Lil'Log_gold.pdf"
} }
} }
} }

View File

@@ -1,12 +1,34 @@
{ {
"id": "455d3c66-7dc6-4537-a39a-36d3e9119df7", "id": "455d3c66-7dc6-4537-a39a-36d3e9119df7",
"snapshot": "libreoffice_impress", "snapshot": "libreoffice_impress",
"instruction": "Could you help me export impress file to image jpg file?", "instruction": "Could you help me export an Impress file to a .jpg image file and save it as res.jpg on the Desktop? ",
"source": "https://stackoverflow.com/questions/75626383/how-export-libreoffice-impress-to-image", "source": "https://stackoverflow.com/questions/75626383/how-export-libreoffice-impress-to-image",
"config": [], "config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=12MxMjw28_t1nTLihlDpToKebjsSDsjwx&export=download&authuser=0&confirm=t&uuid=1ccc1da0-d7c7-494f-a0e3-59eb55f54e3f&at=APZUnTXvNIRMsF2cjZuFxmQzByhC:1705253210291",
"path": "Desktop/wssf-project-plan-on-a-page.pptx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/wssf-project-plan-on-a-page.pptx"
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"" "libreoffice_impress"
], ],
"evaluator": "evaluation_dir" "evaluator": {
} "func": "check_file_exists",
"file_name": "res.jpg",
"directory": "/home/user/Desktop/"
}
}

View File

@@ -1,12 +1,37 @@
{ {
"id": "550ce7e7-747b-495f-b122-acdc4d0b8e54", "id": "550ce7e7-747b-495f-b122-acdc4d0b8e54",
"snapshot": "libreoffice_impress", "snapshot": "libreoffice_impress",
"instruction": "Could you help me add a strike-through on this text", "instruction": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?",
"source": "https://superuser.com/questions/1211035/libreoffice-impress-animations-how-to-strikethrough-on-click?rq=1", "source": "https://superuser.com/questions/1211035/libreoffice-impress-animations-how-to-strikethrough-on-click?rq=1",
"config": [], "config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1fw0baVZ15s0r1WGEBftgED2H0ljZgYtu&export=download&authuser=0&confirm=t&uuid=df03788a-81ef-4e55-b33a-2fba7ab28cb8&at=APZUnTXPb-sm88KNwmNeugbhPrzx:17052529805399",
"path": "Desktop/New_Club_Spring_2018_Training.pptx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/New_Club_Spring_2018_Training.pptx"
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"" "libreoffice_impress"
], ],
"evaluator": "evaluation_dir" "evaluator": {
} "func": "check_for_two_lines",
"result": {
"type": "vm_file",
"path": "Desktop/New_Club_Spring_2018_Training.pptx",
"dest": "New_Club_Spring_2018_Training.pptx"
}
}
}

View File

@@ -1,12 +1,42 @@
{ {
"id": "5d901039-a89c-4bfb-967b-bf66f4df075e", "id": "5d901039-a89c-4bfb-967b-bf66f4df075e",
"snapshot": "libreoffice_impress", "snapshot": "libreoffice_impress",
"instruction": "Help me stretch the image to fill the entire page, keeping its proportion and centering the image", "instruction": "I want to make this page my cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image.",
"source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag", "source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag",
"config": [], "config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=16K6TpGIRZpqOJUu-mtJQ_78kIwLcn-4D&export=download&authuser=0&confirm=t&uuid=945b6f33-53d2-4e87-ada9-efa8b938a499&at=APZUnTVw4fKyJPW0vAAJURruAJIP:1705250184439",
"path": "Desktop/CPD_Background_Investigation_Process.pptx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/CPD_Background_Investigation_Process.pptx"
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"" "libreoffice_impress"
], ],
"evaluator": "evaluation_dir" "evaluator": {
} "func": "compare_pptx_files",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1rsvFPyHYiIPh1c8Nj8say0NJCG2VIDr7&export=download&authuser=0&confirm=t&uuid=aac08a92-6595-47d8-84dc-8f1ab1df987f&at=APZUnTXIWCn5B0CpLttvG2bsr_a7:1705250423565",
"dest": "CPD_Background_Investigation_Process_Gold.pptx"
},
"result": {
"type": "vm_file",
"path": "Desktop/CPD_Background_Investigation_Process.pptx",
"dest": "CPD_Background_Investigation_Process.pptx"
}
}
}

View File

@@ -1,7 +1,7 @@
{ {
"id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", "id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
"snapshot": "vscode", "snapshot": "vscode",
"instruction": "Could you help me find and replace \"text\" with \"test\" in this file?", "instruction": "Please change all the places that say \"text\" to \"test\" in this document for me.",
"source": "https://www.quora.com/How-do-you-find-and-replace-text-in-Visual-Studio-Code", "source": "https://www.quora.com/How-do-you-find-and-replace-text-in-Visual-Studio-Code",
"config": [ "config": [
{ {
@@ -16,9 +16,15 @@
} }
}, },
{ {
"type": "open", "type": "launch",
"parameters": { "parameters": {
"path": "Desktop/vscode_replace_text.txt" "command": ["code", "Desktop/vscode_replace_text.txt"]
}
},
{
"type": "activate_window",
"parameters": {
"window_name": "Visual Studio Code"
} }
} }
], ],

View File

@@ -1,13 +1,50 @@
{ {
"id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb", "id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
"snapshot": "vscode", "snapshot": "vscode",
"instruction": "Could you help me open the project at /home/user/project?", "instruction": "I'd like the \"project\" in the \"user\" folder under \"home\" to be opened with VS Code, please.",
"source": "https://www.youtube.com/watch?v=VqCgcpAypFQ", "source": "https://www.youtube.com/watch?v=VqCgcpAypFQ",
"config": [ "config": [
{
"type": "launch",
"parameters": {
"command": [
"code"
]
}
},
{ {
"type": "command", "type": "command",
"parameters": { "parameters": {
"command": ["mkdir", "-p", "/home/user/project"] "command": [
"mkdir",
"-p",
"/home/user/project/.vscode"
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1akdsiRVdq6CUtT-FX8Dpf8ruPTq6DcFn&export=download&authuser=0&confirm=t&uuid=ce2fa96a-454e-43d9-bbe3-98553b7eed0d&at=APZUnTVw_YQ1URTvP34vrmKcw0b4:1705222451052",
"path": "/home/user/project/main.py"
},
{
"url": "https://drive.usercontent.google.com/download?id=1BkwtqtAzv_K2CrTbJZ0HbMHBffzdD9vc&export=download&authuser=0&confirm=t&uuid=28f77090-deef-49a1-b156-91317881e75e&at=APZUnTXuaR6i_3t3Prslk535GaO5:1705222457290",
"path": "/home/user/project/README.md"
},
{
"url": "https://drive.usercontent.google.com/download?id=1ea_zF2tbcXOB8w9neBV-U5xI2nnPzIw_&export=download&authuser=0&confirm=t&uuid=9cf8c5bb-a880-475c-b80b-967a0c4fbea4&at=APZUnTUdjIj80F3Mbgi72eZDTZLO:1705222462443",
"path": "/home/user/project/.vscode/settings.json"
}
]
}
},
{
"type": "activate_window",
"parameters": {
"window_name": "Visual Studio Code"
} }
} }
], ],
@@ -15,5 +52,27 @@
"related_apps": [ "related_apps": [
"vscode" "vscode"
], ],
"evaluator": "evaluation_dir" "evaluator": {
"postconfig": [
{
"type": "activate_window",
"parameters": {
"window_name": "Visual Studio Code"
}
}
],
"func": "compare_config",
"expected": {
"type": "rule",
"rules": {
"expect": "project"
}
},
"result": {
"type": "vscode_config",
"vscode_extension_command": "OpenProject",
"path": "OpenProject.txt",
"dest": "OpenProject.txt"
}
}
} }

View File

@@ -1,12 +1,50 @@
{ {
"id": "59ed65c7-e9a6-43db-833f-76d6730c0004", "id": "59ed65c7-e9a6-43db-833f-76d6730c0004",
"snapshot": "vscode", "snapshot": "vscode",
"instruction": "Could you help me start debugging with the breakpoint at line 15?", "instruction": "Could you help me start debugging with the breakpoint at line 100?",
"source": "https://www.youtube.com/watch?v=7qZBwhSlfOo", "source": "https://www.youtube.com/watch?v=7qZBwhSlfOo",
"config": [], "config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1eLlB7UqRjh55vm0SIxb96aU1WbbK3H3T&export=download&authuser=0&confirm=t&uuid=379d1cbf-cca1-454a-a5a6-c5389024f728&at=APZUnTWn4vJZhfvrdfYZ6byVfaSj:1705159150342",
"path": "Desktop/main.py"
}
]
}
},
{
"type": "launch",
"parameters": {
"command": ["code", "Desktop/main.py"]
}
},
{
"type": "activate_window",
"parameters": {
"window_name": "Visual Studio Code"
}
}
],
"trajectory": "trajectories/", "trajectory": "trajectories/",
"related_apps": [ "related_apps": [
"vscode" "vscode"
], ],
"evaluator": "evaluation_dir" "evaluator": {
"func": "compare_config",
"expected": {
"type": "rule",
"rules": {
"expect": "100"
}
},
"result": {
"type": "vscode_config",
"vscode_extension_command": "GetBreakPoint",
"path": "GetBreakPoint.txt",
"dest": "GetBreakPoint.txt"
}
}
} }

View File

@@ -3,10 +3,39 @@
"snapshot": "vscode", "snapshot": "vscode",
"instruction": "Could you help me change the color theme to Dark?", "instruction": "Could you help me change the color theme to Dark?",
"source": "https://www.youtube.com/watch?v=ORrELERGIHs", "source": "https://www.youtube.com/watch?v=ORrELERGIHs",
"config": [], "config": [
{
"type": "launch",
"parameters": {
"command": [
"code"
]
}
},
{
"type": "activate_window",
"parameters": {
"window_name": "Visual Studio Code"
}
}
],
"trajectory": "trajectories/982d12a5-beab-424f-8d38-d2a48429e511", "trajectory": "trajectories/982d12a5-beab-424f-8d38-d2a48429e511",
"related_apps": [ "related_apps": [
"vscode" "vscode"
], ],
"evaluator": "evaluation_dir" "evaluator": {
"func": "compare_config",
"expected": {
"type": "rule",
"rules": {
"expect": "2"
}
},
"result": {
"type": "vscode_config",
"vscode_extension_command": "GetColorTheme",
"path": "GetColorTheme.txt",
"dest": "GetColorTheme.txt"
}
}
} }

View File

@@ -3,20 +3,44 @@
"snapshot": "vscode", "snapshot": "vscode",
"instruction": "Help me install the extension Python.", "instruction": "Help me install the extension Python.",
"source": "https://www.youtube.com/watch?v=VqCgcpAypFQ", "source": "https://www.youtube.com/watch?v=VqCgcpAypFQ",
"config": [], "config": [
{
"type": "launch",
"parameters": {
"command": [
"code"
]
}
},
{
"type": "activate_window",
"parameters": {
"window_name": "Visual Studio Code"
}
}
],
"trajectory": "trajectories/eabc805a-bfcf-4460-b250-ac92135819f6", "trajectory": "trajectories/eabc805a-bfcf-4460-b250-ac92135819f6",
"related_apps": [ "related_apps": [
"vscode" "vscode"
], ],
"evaluator": { "evaluator": {
"func": "compare_answer", "func": "is_extension_installed",
"expected": {
"type": "string",
"string": "ms-python.python\n"
},
"result": { "result": {
"type": "command_line", "type": "vm_command_line",
"command": "code --list-extensions | grep ms-python.python" "command": [
"code",
"--list-extensions",
"|",
"grep",
"ms-python.python"
]
},
"expected": {
"type": "rule",
"rules": {
"type": "contain",
"expected": "ms-python.python"
}
} }
} }
} }

104
experiment.py Normal file
View File

@@ -0,0 +1,104 @@
import datetime
import json
import logging
import os
import sys
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
# Logger Configs {{{ #
# The root logger accepts every record; each handler then applies its own
# level threshold (and, for two of them, a logger-name filter).
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")

# FileHandler opens its target file as soon as it is constructed, so the
# directory has to exist beforehand or the import of this module fails.
os.makedirs("logs", exist_ok=True)

file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")

file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)

# The format string embeds ANSI colour escape sequences around each field.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)

# Only records emitted by the "desktopenv" logger hierarchy reach stdout and
# the "sdebug" file; the other two files receive records from every logger.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))

logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #

# Module-level logger; its name sits under "desktopenv" so it passes the
# name filters installed above.
logger = logging.getLogger("desktopenv.experiment")

# Location of the VMware .vmx file handed to DesktopEnv as path_to_vm.
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
def run_one_example(example, agent, max_steps=20, example_trajectory_dir="exp_trajectory"):
    """Run ``agent`` on a single task inside a DesktopEnv and log the evaluation result.

    Args:
        example: Task configuration dict. Its ``"instruction"`` entry is
            injected into every observation, and the whole dict is passed to
            ``DesktopEnv`` as ``task_config``.
        agent: Object exposing ``action_space`` and ``predict(observation)``.
            ``predict`` is expected to return an iterable of actions; ``None``
            or an empty result is treated as "no action this round".
        max_steps: Maximum number of steps to spend on the episode. Every
            executed action, and every prediction round that produced no
            action, counts as one step.
        example_trajectory_dir: Directory reserved for this example's
            recordings. Nothing is written there yet (see the todo below).
    """
    env = DesktopEnv(
        path_to_vm=PATH_TO_VM,
        action_space=agent.action_space,
        task_config=example
    )
    # reset the environment to certain snapshot
    observation = env.reset()
    observation['instruction'] = example['instruction']
    done = False
    step_num = 0

    # todo: save the screenshots and actions to a folder
    while not done and step_num < max_steps:
        actions = agent.predict(observation)

        if not actions:
            # The agents return None when the model reply cannot be parsed.
            # Count the round as a step so that a persistently failing agent
            # cannot keep this loop spinning forever.
            step_num += 1
            logger.warning("Step %d: agent returned no actions.", step_num)
            continue

        for action in actions:
            observation, reward, done, info = env.step(action)
            observation['instruction'] = example['instruction']
            step_num += 1

            logger.info("Step %d", step_num)
            logger.info("Action: %s", actions)
            # Drop the accessibility tree before logging the observation;
            # the default keeps this from raising when the key is absent.
            observation.pop("accessibility_tree", None)
            logger.info("Observation: %s", observation)
            logger.info("Reward: %.2f", reward)
            logger.info("Info: %s", info)

            logger.info("================================\n")

            if done:
                logger.info("The episode is done.")
                break

            if step_num >= max_steps:
                # Enforce the step budget inside a multi-action batch too.
                break

    result = env.evaluate()
    logger.info("Result: %.2f", result)

    # env.close()
    logger.info("Environment closed.")
if __name__ == "__main__":
    # Script entry point: run the GPT-4V agent on one hard-coded example.
    action_space = "pyautogui"
    example_class = "vlc"
    example_id = "8f080098-ddb1-424c-b438-4e96e5e4786e"

    # Read the task config as UTF-8 explicitly instead of relying on the
    # platform default encoding, matching the encoding used for the log files.
    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
        example = json.load(f)
    # Override whatever snapshot the example file names.
    example["snapshot"] = "chrome_setup"

    api_key = os.environ.get("OPENAI_API_KEY")
    agent = GPT4v_Agent(api_key=api_key, action_space=action_space)

    # Per-example output directory: <root>/<example_class>/<example_id>
    root_trajectory_dir = "exp_trajectory"
    example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
    os.makedirs(example_trajectory_dir, exist_ok=True)

    run_one_example(example, agent, 20, example_trajectory_dir)

23
main.py
View File

@@ -1,10 +1,10 @@
import datetime
import json import json
from desktop_env.envs.desktop_env import DesktopEnv
import logging import logging
import os import os
import sys import sys
import datetime
from desktop_env.envs.desktop_env import DesktopEnv
# Logger Configs {{{ # # Logger Configs {{{ #
logger = logging.getLogger() logger = logging.getLogger()
@@ -12,17 +12,18 @@ logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str))) file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str))) debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout) stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str))) sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
file_handler.setLevel(logging.INFO) file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG) debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO) stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG) sdebug_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") formatter = logging.Formatter(
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter) file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter) debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter) stdout_handler.setFormatter(formatter)
@@ -39,6 +40,7 @@ logger.addHandler(sdebug_handler)
logger = logging.getLogger("desktopenv.main") logger = logging.getLogger("desktopenv.main")
def human_agent(): def human_agent():
""" """
Runs the Gym environment with human input. Runs the Gym environment with human input.
@@ -76,7 +78,8 @@ def human_agent():
# } # }
logger.info(trajectory[i]) logger.info(trajectory[i])
observation, reward, done, info = env.step(trajectory[i], pause=5) observation, reward, done, info = env.step(trajectory[i])
observation.pop("accessibility_tree")
logger.info("Observation: %s", observation) logger.info("Observation: %s", observation)
logger.info("Reward: %.2f", reward) logger.info("Reward: %.2f", reward)
logger.info("Info: %s", info) logger.info("Info: %s", info)
@@ -87,12 +90,14 @@ def human_agent():
logger.info("The episode is done.") logger.info("The episode is done.")
break break
#input("PAUSING")
result = env.evaluate() result = env.evaluate()
logger.info("Result: %.2f", result) logger.info("Result: %.2f", result)
#input("PAUSING") #input("PAUSING")
#env.close() # env.close()
logger.info("Environment closed.") logger.info("Environment closed.")

84
mm_agents/gemini_agent.py Normal file
View File

@@ -0,0 +1,84 @@
from typing import Dict
import PIL.Image
import google.generativeai as genai
from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
class GeminiPro_Agent:
    """Agent that asks a Gemini vision model for the next desktop action(s).

    The running conversation is kept in ``self.trajectory`` as a list of
    ``{"role": ..., "parts": [...]}`` dicts and is sent in full on every call.
    """

    def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"):
        """Configure the Gemini client and seed the trajectory with the system prompt.

        Args:
            api_key: Google Generative AI API key.
            model: Name of the Gemini model to instantiate.
            max_tokens: Upper bound on generated tokens per reply.
            action_space: ``"computer_13"`` (structured actions) or
                ``"pyautogui"`` (Python code). Any other value raises
                ``KeyError`` when the system prompt is looked up.
        """
        # genai.configure() only accepts keyword arguments.
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model)
        self.max_tokens = max_tokens
        self.action_space = action_space

        # NOTE(review): the Gemini API may only accept the roles "user" and
        # "model"; confirm that the "system" entry below and the "assistant"
        # entries added by parse_actions() are accepted by the service.
        self.trajectory = [
            {
                "role": "system",
                "parts": [
                    {
                        "computer_13": SYS_PROMPT_ACTION,
                        "pyautogui": SYS_PROMPT_CODE
                    }[action_space]
                ]
            }
        ]

    def predict(self, obs: Dict):
        """
        Predict the next action(s) based on the current observation.

        Args:
            obs: Observation dict; ``obs["screenshot"]`` is opened with
                ``PIL.Image.open`` and ``obs["instruction"]`` is the task text.

        Returns:
            The parsed actions, or ``None`` when the model reply contains no
            text or cannot be parsed.
        """
        img = PIL.Image.open(obs["screenshot"])
        self.trajectory.append({
            "role": "user",
            "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
                obs["instruction"]), img]
        })

        # Printable view of the conversation: the text part of every message,
        # with a placeholder wherever a second part (the image) is attached.
        traj_to_show = []
        for message in self.trajectory:
            traj_to_show.append(message["parts"][0])
            if len(message["parts"]) > 1:
                traj_to_show.append("screenshot_obs")

        print("Trajectory:", traj_to_show)

        # The output-length limit is passed through generation_config;
        # generate_content() has no max_tokens parameter.
        response = self.model.generate_content(
            self.trajectory,
            generation_config=genai.types.GenerationConfig(max_output_tokens=self.max_tokens)
        )

        try:
            # response.text raises ValueError when the reply carries no text
            # part (for example when the candidate was blocked).
            response_text = response.text
        except ValueError as e:
            print("Failed to read text from response:", e)
            return None

        try:
            actions = self.parse_actions(response_text)
        except Exception:
            # todo: add error handling
            print("Failed to parse action from response:", response_text)
            actions = None

        return actions

    def parse_actions(self, response: str):
        """Parse the model reply into actions and record the reply in the trajectory.

        Response example::

            ```json
            {
              "action_type": "CLICK",
              "click_type": "RIGHT"
            }
            ```

        Args:
            response: Raw text returned by the model.

        Returns:
            The result of ``parse_actions_from_string`` for ``"computer_13"``
            or of ``parse_code_from_string`` for ``"pyautogui"``.

        Raises:
            ValueError: If ``self.action_space`` is neither of the two
                supported values.
        """
        # parse from the response
        if self.action_space == "computer_13":
            actions = parse_actions_from_string(response)
        elif self.action_space == "pyautogui":
            actions = parse_code_from_string(response)
        else:
            raise ValueError("Unknown action space: {}".format(self.action_space))

        # add action into the trajectory
        self.trajectory.append({
            "role": "assistant",
            "parts": [response]
        })

        return actions

View File

@@ -1,19 +0,0 @@
import PIL.Image
import google.generativeai as genai
genai.configure(api_key="AIzaSyANsETKHVo-D8jZu1SnTSaQgLOJEDgnj9Q")
# for m in genai.list_models():
# if 'generateContent' in m.supported_generation_methods:
# print(m.name)
model = genai.GenerativeModel('gemini-pro-vision')
img = PIL.Image.open('image.jpg')
messages = [
{'role':'user',
'parts': ["Explain this image.", img]}
]
response = model.generate_content(messages)

View File

@@ -1,12 +1,12 @@
# fixme: Need to be rewrite on new action space
import os
import re
import base64 import base64
from desktop_env.envs.desktop_env import Action, MouseClick
import json import json
import re
from typing import Dict
import requests import requests
from mm_agents.gpt_4v_prompt import SYS_PROMPT
from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
# Function to encode the image # Function to encode the image
@@ -47,11 +47,26 @@ def parse_actions_from_string(input_string):
raise ValueError("Invalid response format: " + input_string) raise ValueError("Invalid response format: " + input_string)
def parse_code_from_string(input_string):
    """Return the contents of every triple-backtick fenced block in ``input_string``.

    An optional language tag right after the opening fence is dropped. A tag
    is recognised either when it sits alone on the fence line (``` ```json ```
    followed by a newline) or when it is ``python``/``python3``/``py`` followed
    by whitespace (``` ```python code``` ```). Anything else after the fence is
    treated as code, so an untagged inline snippet such as ``` ```x = 1``` ```
    keeps its first word.

    Args:
        input_string: Text that may contain fenced code blocks.

    Returns:
        A list with the captured code of each block, in order of appearance;
        empty when there is no fenced block.
    """
    # The content group is non-greedy so that each block ends at the first
    # closing fence that follows it.
    pattern = r"```(?:\w+[ \t]*\r?\n|(?:python3?|py)\s+)?(.*?)```"
    # Find all non-overlapping matches in the string.
    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
    # so the code inside backticks can span multiple lines.
    matches = re.findall(pattern, input_string, re.DOTALL)

    # matches now contains all the captured code snippets
    return matches
class GPT4v_Agent: class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300): def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"):
self.instruction = instruction
self.model = model self.model = model
self.max_tokens = max_tokens self.max_tokens = max_tokens
self.action_space = action_space
self.headers = { self.headers = {
"Content-Type": "application/json", "Content-Type": "application/json",
@@ -64,20 +79,27 @@ class GPT4v_Agent:
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": SYS_PROMPT "text": {
"computer_13": SYS_PROMPT_ACTION,
"pyautogui": SYS_PROMPT_CODE
}[action_space]
}, },
] ]
} }
] ]
def predict(self, obs): def predict(self, obs: Dict):
base64_image = encode_image(obs) """
Predict the next action(s) based on the current observation.
"""
base64_image = encode_image(obs["screenshot"])
self.trajectory.append({ self.trajectory.append({
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "What's the next step for instruction '{}'?".format(self.instruction) "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
obs["instruction"])
}, },
{ {
"type": "image_url", "type": "image_url",
@@ -87,12 +109,15 @@ class GPT4v_Agent:
} }
] ]
}) })
traj_to_show = [] traj_to_show = []
for i in range(len(self.trajectory)): for i in range(len(self.trajectory)):
traj_to_show.append(self.trajectory[i]["content"][0]["text"]) traj_to_show.append(self.trajectory[i]["content"][0]["text"])
if len(self.trajectory[i]["content"]) > 1: if len(self.trajectory[i]["content"]) > 1:
traj_to_show.append("screenshot_obs") traj_to_show.append("screenshot_obs")
print("Trajectory:", traj_to_show) print("Trajectory:", traj_to_show)
payload = { payload = {
"model": self.model, "model": self.model,
"messages": self.trajectory, "messages": self.trajectory,
@@ -103,6 +128,7 @@ class GPT4v_Agent:
try: try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content']) actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
except: except:
# todo: add error handling
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
actions = None actions = None
@@ -120,7 +146,10 @@ class GPT4v_Agent:
""" """
# parse from the response # parse from the response
actions = parse_actions_from_string(response) if self.action_space == "computer_13":
actions = parse_actions_from_string(response)
elif self.action_space == "pyautogui":
actions = parse_code_from_string(response)
# add action into the trajectory # add action into the trajectory
self.trajectory.append({ self.trajectory.append({
@@ -133,34 +162,4 @@ class GPT4v_Agent:
] ]
}) })
# parse action return actions
parsed_actions = []
for action in actions:
parsed_action = {}
action_type = Action[action['action_type']].value
parsed_action["action_type"] = action_type
if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
parsed_action["click_type"] = MouseClick[action['click_type']].value
if action_type == Action.MOUSE_MOVE.value:
parsed_action["x"] = action["x"]
parsed_action["y"] = action["y"]
if action_type == Action.KEY.value:
parsed_action["key"] = action["key"] # handle the condition of single key and multiple keys
if action_type == Action.TYPE.value:
parsed_action["text"] = action["text"]
parsed_actions.append(parsed_action)
return parsed_actions
if __name__ == '__main__':
# OpenAI API Key
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
print(agent.predict(obs="stackoverflow.png"))

View File

@@ -1,52 +0,0 @@
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
Here is the description of the action space:
Firstly you need to predict the class of your action, select from one below:
- **MOUSE_MOVE**: move the mouse to a specific position
- **CLICK**: click on the screen
- **MOUSE_DOWN**: press the mouse button
- **MOUSE_UP**: release the mouse button
- **KEY**: press a key on the keyboard
- **KEY_DOWN**: press a key on the keyboard
- **KEY_UP**: release a key on the keyboard
- **TYPE**: type a string on the keyboard
Then you need to predict the parameters of your action:
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
for example, format as:
```
{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
for example, format as:
```
{
"action_type": "CLICK",
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
for example, format as:
```
{
"action_type": "TYPE",
"text": [
"w",
"i",
"k",
"i",
"p",
"e",
"d",
"i",
"a"
]
}
```
For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.

View File

@@ -1,19 +1,207 @@
SYS_PROMPT = """ SYS_PROMPT = """
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
Here is the description of the action space:
Firstly you need to predict the class of your action, select from one below: HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
- **MOUSE_MOVE**: move the mouse to a specific position ACTION_SPACE = [
- **CLICK**: click on the screen {
- **MOUSE_DOWN**: press the mouse button "action_type": "MOVE_TO",
- **MOUSE_UP**: release the mouse button "note": "move the cursor to the specified position",
- **KEY**: press a key on the keyboard "parameters": {
- **KEY_DOWN**: press a key on the keyboard "x": {
- **KEY_UP**: release a key on the keyboard "type": float,
- **TYPE**: type a string on the keyboard "range": [0, X_MAX],
"optional": False,
Then you need to predict the parameters of your action: },
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
},
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
},
"num_clicks": {
"type": int,
"range": [1, 2, 3],
"optional": True,
},
}
},
{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
"parameters": {
"dx": {
"type": int,
"range": None,
"optional": False,
},
"dy": {
"type": int,
"range": None,
"optional": False,
}
}
},
{
"action_type": "TYPING",
"note": "type the specified text",
"parameters": {
"text": {
"type": str,
"range": None,
"optional": False,
}
}
},
{
"action_type": "PRESS",
"note": "press the specified key and release it",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_DOWN",
"note": "press the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_UP",
"note": "release the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "HOTKEY",
"note": "press the specified key combination",
"parameters": {
"keys": {
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
}
}
},
############################################################################################################
{
"action_type": "WAIT",
"note": "wait until the next action",
},
{
"action_type": "FAIL",
"note": "decide the task can not be performed",
},
{
"action_type": "DONE",
"note": "decide the task is done",
}
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) - For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
for example, format as: for example, format as:
``` ```
@@ -48,7 +236,9 @@ for example, format as:
} }
``` ```
For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`). REMEMBER:
You can predict multiple actions at one step, but you should only return one action for each step. For every step, you should only return the action_type and the parameters of your action as a dict, without any other things.
You MUST wrap the dict with backticks (\`).
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
You CAN predict multiple actions at one step, but you should only return one action for each step.
""" """

View File

@@ -4,5 +4,8 @@ For each step, you will get an observation of an image, which is the screenshot
You are required to use `pyautogui` to perform the action. You are required to use `pyautogui` to perform the action.
Return one line or multiple lines of python code to perform the action each time, be time efficient. Return one line or multiple lines of python code to perform the action each time, be time efficient.
Return `None` if you cannot perform the action.
When you think you have to wait for some time, return `WAIT`.
When you think the task can not be done, return `FAIL`.
When you think the task is done, return `DONE`.
""" """

View File

@@ -29,3 +29,4 @@ opencv-python
ImageHash ImageHash
scikit-image scikit-image
librosa librosa
pymupdf