From ab71ebb2ba6e6e0165f399166da6129e381693dc Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Thu, 4 Jan 2024 17:05:17 +0800 Subject: [PATCH 1/4] Initialize VLC getters and metrics, fix some bugs in infra logic, needs to be refactored later on --- desktop_env/controllers/python.py | 19 +++++- desktop_env/controllers/setup.py | 66 ++++++++++++------- desktop_env/envs/desktop_env.py | 3 +- desktop_env/evaluators/getters/__init__.py | 1 + desktop_env/evaluators/getters/file.py | 5 +- desktop_env/evaluators/metrics/__init__.py | 3 +- .../59f21cfb-0120-4326-b255-a5b827b38967.json | 36 +++++++++- 7 files changed, 101 insertions(+), 32 deletions(-) diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py index 2b0dfb7..5469800 100644 --- a/desktop_env/controllers/python.py +++ b/desktop_env/controllers/python.py @@ -80,7 +80,8 @@ class PythonController: y = parameters["y"] if "num_clicks" in parameters: num_clicks = parameters["num_clicks"] - self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y}, clicks={num_clicks})") + self.execute_python_command( + f"pyautogui.click(button='{button}', x={x}, y={y}, clicks={num_clicks})") else: self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y})") elif "button" in parameters and "x" not in parameters and "y" not in parameters: @@ -143,7 +144,8 @@ class PythonController: if "x" in parameters and "y" in parameters: x = parameters["x"] y = parameters["y"] - self.execute_python_command(f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)") + self.execute_python_command( + f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)") elif action_type == "SCROLL": # todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out @@ -206,3 +208,16 @@ class PythonController: else: raise Exception(f"Unknown action type: {action_type}") + + + def get_vlc_status(self, host='localhost', port=8080, password='password'): + url = f'http://{host}:{port}/requests/status.xml' + + response = requests.get(url, auth=('', password)) + + if response.status_code == 200: + print("File downloaded successfully") + return response.content + else: + print("Failed to get vlc status. Status code:", response.status_code) + return None diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index e28287b..2320e99 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -10,12 +10,11 @@ from typing import Any class SetupController: - def __init__( self - , http_server: str - , cache_dir: str - ): - self.http_server = http_server + "/setup" + def __init__(self, http_server: str, cache_dir: str): + self.http_server: str = http_server + self.http_server_setup_root = http_server + "/setup" self.cache_dir: str = cache_dir + def reset_cache_dir(self, cache_dir: str): self.cache_dir = cache_dir @@ -48,6 +47,32 @@ class SetupController: # self._open_setup(config) # can add other setup steps + def _command_setup(self, command: str): + """ + Directly send a command into the virtual machine os for setting up. + """ + payload = json.dumps({"command": command}) + headers = { + 'Content-Type': 'application/json' + } + timeout = 5 + timout_whitelist = ["vlc"] + + try: + + response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout) + if response.status_code == 200: + print("Command executed successfully:", response.text) + else: + print("Failed to execute command. Status code:", response.status_code) + except requests.exceptions.Timeout as e: + if command in timout_whitelist: + print("Command executed successfully:", command) + else: + print("An error occurred while trying to execute the command:", e) + except requests.exceptions.RequestException as e: + print("An error occurred while trying to execute the command:", e) + def _download_setup(self, files: List[Dict[str, str]]): """ Args: @@ -66,12 +91,9 @@ class SetupController: for f in files: url: str = f["url"] path: str = f["path"] - cache_path: str = os.path.join( self.cache_dir - , "{:}_{:}".format( - uuid.uuid5(uuid.NAMESPACE_URL, url) - , os.path.basename(path) - ) - ) + cache_path: str = os.path.join(self.cache_dir, "{:}_{:}".format( + uuid.uuid5(uuid.NAMESPACE_URL, url), + os.path.basename(path))) if not url or not path: raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).") @@ -97,21 +119,21 @@ class SetupController: if not downloaded: raise requests.RequestException(f"Failed to download {url}. No retries left. Error: {e}") - #payload = json.dumps({"url": url, "path": path}) - #headers = { - #'Content-Type': 'application/json' - #} + # payload = json.dumps({"url": url, "path": path}) + # headers = { + # 'Content-Type': 'application/json' + # } - form = MultipartEncoder( { "file_path": path - , "file_data": (os.path.basename(path), open(cache_path, "rb")) - } - ) + form = MultipartEncoder({ + "file_path": path, + "file_data": (os.path.basename(path), open(cache_path, "rb")) + }) headers = {"Content-Type": form.content_type} print(form.content_type) # send request to server to upload file try: - response = requests.post(self.http_server + "/upload", headers=headers, data=form) + response = requests.post(self.http_server_setup_root + "/upload", headers=headers, data=form) if response.status_code == 200: print("Command executed successfully:", response.text) else: @@ -136,7 +158,7 @@ class SetupController: # send request to server to change wallpaper try: - response = requests.post(self.http_server + "/change_wallpaper", headers=headers, data=payload) + response = requests.post(self.http_server_setup_root + "/change_wallpaper", headers=headers, data=payload) if response.status_code == 200: print("Command executed successfully:", response.text) else: @@ -163,7 +185,7 @@ class SetupController: # send request to server to open file try: - response = requests.post(self.http_server + "/open_file", headers=headers, data=payload) + response = requests.post(self.http_server_setup_root + "/open_file", headers=headers, data=payload) if response.status_code == 200: print("Command executed successfully:", response.text) else: diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index 03086ff..c953f3c 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -85,7 +85,8 @@ class DesktopEnv(gym.Env): # Initialize emulator and controller print("Initializing...") self._start_emulator() - self.host = f"http://{self._get_vm_ip()}:5000" + self.vm_ip = self._get_vm_ip() + self.host = f"http://{self.vm_ip}:5000" self.controller = PythonController(http_server=self.host) self.setup_controller = SetupController(http_server=self.host, cache_dir=self.cache_dir) diff --git a/desktop_env/evaluators/getters/__init__.py b/desktop_env/evaluators/getters/__init__.py index 81a23fd..770c30c 100644 --- a/desktop_env/evaluators/getters/__init__.py +++ b/desktop_env/evaluators/getters/__init__.py @@ -1,2 +1,3 @@ from .file import get_cloud_file, get_vm_file from .misc import get_rule +from .vlc import get_vlc_playing_info diff --git a/desktop_env/evaluators/getters/file.py b/desktop_env/evaluators/getters/file.py index a9be430..25fd081 100644 --- a/desktop_env/evaluators/getters/file.py +++ b/desktop_env/evaluators/getters/file.py @@ -3,6 +3,7 @@ from typing import Dict import os import requests + def get_cloud_file(env, config: Dict[str, str]) -> str: """ Config: @@ -25,6 +26,7 @@ def get_cloud_file(env, config: Dict[str, str]) -> str: return _path + def get_vm_file(env, config: Dict[str, str]) -> str: """ Config: @@ -33,12 +35,9 @@ def get_vm_file(env, config: Dict[str, str]) -> str: """ _path = os.path.join(env.cache_dir, config["dest"]) - if os.path.exists(_path): - return _path file = env.controller.get_file(config["path"]) with open(_path, "wb") as f: f.write(file) return _path - diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index 498df17..80a26cf 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -2,4 +2,5 @@ from .table import compare_table from .table import check_sheet_list, check_xlsx_freeze, check_zoom from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, compare_insert_equation from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers -from .docs import is_first_line_centered, check_file_exists, compare_contains_image \ No newline at end of file +from .docs import is_first_line_centered, check_file_exists, compare_contains_image +from .vlc import is_vlc_playing diff --git a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json index 512427e..fabd42b 100644 --- a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json +++ b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json @@ -1,12 +1,42 @@ { "id": "59f21cfb-0120-4326-b255-a5b827b38967", "snapshot": "base_setup", - "instruction": "Could you help me play the file at FILE_PATH?", + "instruction": "Play the music video on my desktop", "source": "https://docs.videolan.me/vlc-user/desktop/3.0/en/basic/media.html#playing-a-file", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=14-vhVMVw53e0l-MDVBFbngFAE1jMqvgm&export=download&authuser=0&confirm=t&uuid=d31607ed-0075-4fe5-b68c-b24b6eec356e&at=APZUnTV0Wy0672VFGrQChgHmd1Ba:1704337791613", + "path": "Desktop/Rick Astley - Never Gonna Give You Up (Official Music Video).mp4" + } + ] + } + }, + { + "type": "command", + "parameters": { + "command": "vlc" + } + } + ], "trajectory": "trajectories/", "related_apps": [ "vlc" ], - "evaluator": "evaluation_dir" + "evaluator": { + "func": "is_vlc_playing", + "expected": { + "type": "rule", + "rules": { + "file_path": "Desktop/Rick Astley - Never Gonna Give You Up (Official Music Video).mp4" + } + }, + "result": { + "type": "vlc_playing_info", + "dest": "status.xml" + } + } } From 2401513c1961ec7c976c3eae28347081d028012c Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Thu, 4 Jan 2024 17:55:07 +0800 Subject: [PATCH 2/4] Initialize VLC getters and metrics, fix some bugs in infra logic, needs to be refactored later on --- desktop_env/evaluators/getters/vlc.py | 20 +++++++++++++++ desktop_env/evaluators/metrics/gimp.py | 22 ++++++++++++++++ desktop_env/evaluators/metrics/vlc.py | 35 ++++++++++++-------------- 3 files changed, 58 insertions(+), 19 deletions(-) create mode 100644 desktop_env/evaluators/getters/vlc.py create mode 100644 desktop_env/evaluators/metrics/gimp.py diff --git a/desktop_env/evaluators/getters/vlc.py b/desktop_env/evaluators/getters/vlc.py new file mode 100644 index 0000000..e00ce61 --- /dev/null +++ b/desktop_env/evaluators/getters/vlc.py @@ -0,0 +1,20 @@ +import os +from typing import Dict + + +def get_vlc_playing_info(env, config: Dict[str, str]): + """ + Gets the current playing information from VLC's HTTP interface. + """ + _path = os.path.join(env.cache_dir, config["dest"]) + + host = env.vm_ip + port = 8080 + password = 'password' + + content = env.controller.get_vlc_status(host, port, password) + print("content: ", content) + with open(_path, "wb") as f: + f.write(content) + + return _path diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py new file mode 100644 index 0000000..fbc328c --- /dev/null +++ b/desktop_env/evaluators/metrics/gimp.py @@ -0,0 +1,22 @@ +import os + + +def get_gimp_export_path(): + # Path to GIMP's configuration file. This example assumes GIMP version 2.10. + # You need to adjust the path according to the GIMP version and user's file system. + gimp_config_file = os.path.expanduser("~/.config/GIMP/2.10/gimprc") + + try: + # Open and read the configuration file + with open(gimp_config_file, 'r') as file: + for line in file: + # Search for the default export path setting + if "default-export-path" in line: + # Extract the current path from the line (assuming it's enclosed in quotes) + current_path = line.split('"')[1] + # Compare the current path with the expected path + return current_path + except FileNotFoundError: + # Handle the case where the configuration file is not found + print("GIMP configuration file not found") + return False diff --git a/desktop_env/evaluators/metrics/vlc.py b/desktop_env/evaluators/metrics/vlc.py index d3083c2..b5272eb 100644 --- a/desktop_env/evaluators/metrics/vlc.py +++ b/desktop_env/evaluators/metrics/vlc.py @@ -1,11 +1,12 @@ import os import platform -import requests from xml.etree import ElementTree import pygetwindow as gw import pyautogui +from typing import Dict -def read_vlc_config(setting_name): + +def get_vlc_config(setting_name): """ Reads the VLC configuration file to check for a specific setting. @@ -38,24 +39,22 @@ def read_vlc_config(setting_name): return None -def get_vlc_playing_info(host='localhost', port=8080, password='password'): +def is_vlc_playing(actual: str, rule: Dict[str, str]) -> float: """ - Gets the current playing information from VLC's HTTP interface. + Checks if VLC is currently playing a file. """ - url = f'http://{host}:{port}/requests/status.xml' - try: - response = requests.get(url, auth=('', password)) - if response.status_code == 200: - tree = ElementTree.fromstring(response.content) - status = tree.find('state').text - if status == 'playing': - file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text - return status, file_info - return status, None - except Exception as e: - print(f"Error: {e}") + with open(actual, 'rb') as file: + actual_status = file.read().decode('utf-8') - return None, None + tree = ElementTree.fromstring(actual_status) + status = tree.find('state').text + if status == 'playing': + file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text + print("file_info: ", file_info) + if file_info: + return 1 if file_info.endswith(rule['expected']) else 0 + else: + return 0 def is_vlc_fullscreen(): @@ -83,5 +82,3 @@ def is_vlc_fullscreen(): except Exception as e: print(f"An error occurred: {e}") return False - - From 3cbb57f24c4a041dd1c29c975abefe7ef51f5daf Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Fri, 5 Jan 2024 11:00:31 +0800 Subject: [PATCH 3/4] Add the GUI set-of-mark object detector data collection script --- mm_agents/gui_som/__init__.py | 0 mm_agents/gui_som/data_preparation/README.md | 8 + .../gui_som/data_preparation/__init__.py | 0 .../data_preparation/get_tag_elem_dict.js | 158 ++++++++++++++++++ .../majestic_million_download.py | 39 +++++ .../data_preparation/scrape_crawler.py | 119 +++++++++++++ 6 files changed, 324 insertions(+) create mode 100644 mm_agents/gui_som/__init__.py create mode 100644 mm_agents/gui_som/data_preparation/README.md create mode 100644 mm_agents/gui_som/data_preparation/__init__.py create mode 100644 mm_agents/gui_som/data_preparation/get_tag_elem_dict.js create mode 100644 mm_agents/gui_som/data_preparation/majestic_million_download.py create mode 100644 mm_agents/gui_som/data_preparation/scrape_crawler.py diff --git a/mm_agents/gui_som/__init__.py b/mm_agents/gui_som/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/gui_som/data_preparation/README.md b/mm_agents/gui_som/data_preparation/README.md new file mode 100644 index 0000000..cf95798 --- /dev/null +++ b/mm_agents/gui_som/data_preparation/README.md @@ -0,0 +1,8 @@ +1. Get the URLs from majestic_million and save them to `majestic_million.csv` +```bash +python3 majestic_million.py +``` +2. Run scrapy spider to get the data from the URLs +```bash +python scrapy_crawler.py +``` \ No newline at end of file diff --git a/mm_agents/gui_som/data_preparation/__init__.py b/mm_agents/gui_som/data_preparation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/gui_som/data_preparation/get_tag_elem_dict.js b/mm_agents/gui_som/data_preparation/get_tag_elem_dict.js new file mode 100644 index 0000000..f838ada --- /dev/null +++ b/mm_agents/gui_som/data_preparation/get_tag_elem_dict.js @@ -0,0 +1,158 @@ +(() => { + let labels = []; + let selector_id_table = {}; + var generateQuerySelector = function (el) { + function cssEscape(value) { + if (!value) return ''; + // Escape all CSS special characters, including the colon. + return value.replace(/([!"#$%&'()*+,./:;<=>?@[\]^`{|}~])/g, '\\$&'); + } + + function getChildIndex(el) { + var siblings = Array.from(el.parentNode.children); + var sameTagSiblings = siblings.filter(sibling => sibling.tagName === el.tagName); + return sameTagSiblings.indexOf(el); + } + + if (el.tagName.toLowerCase() === "html") { + return "HTML"; + } + + var str = el.tagName; + var idPresent = false; // Add a flag to check if an ID is present + + if (el.id !== "") { + str += "#" + cssEscape(el.id); + idPresent = true; // Set the flag to true if there's an ID + } + + if (el.className) { + var classes = el.className.split(/\s+/).filter(Boolean); // Filter out empty strings + for (var i = 0; i < classes.length; i++) { + str += "." + cssEscape(classes[i]); + } + } + + // Only add :nth-of-type() if no ID is present + if (!idPresent) { + str += ":nth-of-type(" + (getChildIndex(el) + 1) + ")"; + } + + // Use '>' combinator if parent is not 'HTML' + var parentSelector = generateQuerySelector(el.parentNode); + return parentSelector === "HTML" ? str : parentSelector + " > " + str; + } + + + function unmarkPage() { + for (const label of labels) { + document.body.removeChild(label); + } + labels = []; + } + + // Expose the unmarkPage function globally + window.unmarkPage = unmarkPage; + + function markPage() { + unmarkPage(); + + var bodyRect = document.body.getBoundingClientRect(); + + var items = Array.prototype.slice.call( + document.querySelectorAll('*') + ).map(function (element) { + var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0); + var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0); + + var rects = [...element.getClientRects()].filter(bb => { + var center_x = bb.left + bb.width / 2; + var center_y = bb.top + bb.height / 2; + var elAtCenter = document.elementFromPoint(center_x, center_y); + + return elAtCenter === element || element.contains(elAtCenter) + }).map(bb => { + const rect = { + left: Math.max(0, bb.left), + top: Math.max(0, bb.top), + right: Math.min(vw, bb.right), + bottom: Math.min(vh, bb.bottom) + }; + return { + ...rect, + width: rect.right - rect.left, + height: rect.bottom - rect.top + } + }); + + var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0); + + return { + element: element, + include: + (element.tagName === "INPUT" || element.tagName === "TEXTAREA" || element.tagName === "SELECT") || + (element.tagName === "BUTTON" || element.tagName === "A" || (element.onclick != null) || window.getComputedStyle(element).cursor == "pointer") || + (element.tagName === "IFRAME" || element.tagName === "VIDEO") + , + area, + rects, + text: element.textContent.trim().replace(/\s{2,}/g, ' ') + }; + }).filter(item => + item.include && (item.area >= 20) + ); + + // Only keep inner clickable items + items = items.filter(x => !items.some(y => x.element.contains(y.element) && !(x == y))) + + // Function to generate random colors + function getRandomColor() { + var letters = '0123456789ABCDEF'; + var color = '#'; + for (var i = 0; i < 6; i++) { + color += letters[Math.floor(Math.random() * 16)]; + } + return color; + } + + // Lets create a floating border on top of these elements that will always be visible + items.forEach(function (item, index) { + selector_id_table[index.toString()] = item.rects; + item.rects.forEach((bbox) => { + newElement = document.createElement("div"); + var borderColor = getRandomColor(); + newElement.style.outline = `2px dashed ${borderColor}`; + newElement.style.position = "fixed"; + newElement.style.left = bbox.left + "px"; + newElement.style.top = bbox.top + "px"; + newElement.style.width = bbox.width + "px"; + newElement.style.height = bbox.height + "px"; + newElement.style.pointerEvents = "none"; + newElement.style.boxSizing = "border-box"; + newElement.style.zIndex = 2147483647; + // newElement.style.background = `${borderColor}80`; + + // Add floating label at the corner + var label = document.createElement("span"); + label.textContent = index; + label.style.position = "absolute"; + label.style.top = "-19px"; + label.style.left = "0px"; + label.style.background = borderColor; + label.style.color = "white"; + label.style.padding = "2px 4px"; + label.style.fontSize = "12px"; + label.style.borderRadius = "2px"; + newElement.appendChild(label); + + document.body.appendChild(newElement); + labels.push(newElement); + // item.element.setAttribute("-ai-label", label.textContent); + }); + }) + return selector_id_table; + } + + return markPage(); +})() + diff --git a/mm_agents/gui_som/data_preparation/majestic_million_download.py b/mm_agents/gui_som/data_preparation/majestic_million_download.py new file mode 100644 index 0000000..b76d934 --- /dev/null +++ b/mm_agents/gui_som/data_preparation/majestic_million_download.py @@ -0,0 +1,39 @@ +import csv + +import requests + + +# Latest run on 2024.1.4 +def download_csv(url, file_path): + response = requests.get(url) + with open(file_path, 'w', newline='', encoding='utf-8') as file: + file.write(response.text) + + +def read_csv(file_path): + urls = [] + with open(file_path, newline='', encoding='utf-8') as csvfile: + reader = csv.reader(csvfile) + next(reader, None) # Skip the header + for row in reader: + urls.append(row[2]) # Assuming the URL is in the third column + return urls + + +def main(): + url = 'http://downloads.majestic.com/majestic_million.csv' + file_path = 'majestic_million.csv' + + print("Downloading Majestic Million CSV...") + download_csv(url, file_path) + + print("Reading URLs from CSV...") + urls = read_csv(file_path) + + # Print the first 10 URLs as a sample + for url in urls[:10]: + print(url) + + +if __name__ == "__main__": + main() diff --git a/mm_agents/gui_som/data_preparation/scrape_crawler.py b/mm_agents/gui_som/data_preparation/scrape_crawler.py new file mode 100644 index 0000000..4ba93c5 --- /dev/null +++ b/mm_agents/gui_som/data_preparation/scrape_crawler.py @@ -0,0 +1,119 @@ +import json +import os +import random +import time +import uuid +from multiprocessing import Pool + +from playwright.sync_api import sync_playwright + +from majestic_million_download import read_csv + +# JavaScript code as a string +with open('get_tag_elem_dict.js', 'r') as f: + get_tag_elem_dict_js_code = f.read() + + +def scrape_data(website_url, action_depth=10): + # if file exists, skip + if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])): + print("Data already exists, skipping...") + return + + def click_random_link(page): + links = page.query_selector_all("a") + if links: + random_link = random.choice(links) + try: + page.evaluate("window.unmarkPage()") + + # Capture the initial HTML content of the body + initial_content = page.inner_html("body") + + # Click the link and wait for potential navigation + random_link.click() + page.wait_for_timeout(5000) # wait for 5 seconds to allow page changes to occur + + # Capture the new HTML content of the body + new_content = page.inner_html("body") + + # Compare the contents + if new_content != initial_content: + print("Content change detected.") + return True + else: + print("No content change detected.") + return False + + except Exception as e: + print("Error occurred:", e) + return False + else: + print("No links found on the page.") + return False + + return False + + with sync_playwright() as p: + # Launch the browser + browser = p.chromium.launch() + context = browser.new_context(viewport={'width': 1920, 'height': 1080}, locale='en-US') + context.set_extra_http_headers({'Accept-Language': 'en-US'}) + page = context.new_page() + + # Navigate to Google + page.goto(website_url, timeout=60000, wait_until='networkidle') + + data_id = str(uuid.uuid4()) + data_dir = os.path.join('collected_data', website_url.split("//")[1], data_id) + os.makedirs(data_dir, exist_ok=True) + page.screenshot(path=os.path.join(data_dir, 'screenshot_0.png')) + tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code) + with open(os.path.join(data_dir, 'meta_data_0.json'), 'w') as f: + json.dump({ + 'timestamp': time.time(), + 'url': website_url, + 'data_id': data_id, + 'tag_elem_dict': tag_elem_dict + }, f, indent=4) + page.screenshot(path=os.path.join(data_dir, 'screenshot_som_0.png')) + + for i in range(action_depth): + if not click_random_link(page): + print("Invalid click or no navigation, stopping random clicks.") + break + page.screenshot(path=os.path.join(data_dir, f'screenshot_{i + 1}.png')) + tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code) + with open(os.path.join(data_dir, f'meta_data_{i + 1}.json'), 'w') as f: + json.dump({ + 'timestamp': time.time(), + 'url': website_url, + 'data_id': data_id, + 'tag_elem_dict': tag_elem_dict + }, f, indent=4) + page.screenshot(path=os.path.join(data_dir, f'screenshot_som_{i + 1}.png')) + + # Close the browser + browser.close() + + +def run_one(url): + try: + scrape_data("https://" + url, action_depth=5) + except Exception as e: + print("Error scraping data:", e) + print("Start next one...") + + +def main(): + urls = read_csv("majestic_million.csv")[:20000] + + # Number of processes + num_processes = 50 # Adjust based on your system's capability, on my i9-13900k, 50 processes can be used + + with Pool(num_processes) as pool: + pool.map(run_one, urls) + + +if __name__ == '__main__': + main() From fbb49187343b2361481db835684949d6e14603b6 Mon Sep 17 00:00:00 2001 From: David Chang Date: Fri, 5 Jan 2024 16:08:29 +0800 Subject: [PATCH 4/4] ver Jan5thv2 tested correctness of merging --- .gitignore | 1 + desktop_env/evaluators/metrics/__init__.py | 2 +- requirements.txt | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 37c4ba6..1672492 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ snapshots branch_flag branch-config *.syncthing.*.tmp +cache diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index 05bbcbd..18c9477 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -1,5 +1,5 @@ from .table import compare_table -from .table import check_sheet_list, check_xlsx_freeze, check_zoom +from .table import check_sheet_list, check_xlsx_freeze, check_xlsx_zoom from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, compare_insert_equation from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers from .docs import is_first_line_centered, check_file_exists, compare_contains_image diff --git a/requirements.txt b/requirements.txt index 5a3707b..d97aedd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ openpyxl python-docx python-pptx pypdf +PyGetWindow