Merge branch 'main' into zdy
.gitignore (vendored, 5 lines changed)
@@ -1,3 +1,7 @@
# Model checkpoints
*.pth
*.pt

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -166,3 +170,4 @@ tags
tags-opts
snapshots
*.syncthing.*.tmp
cache

@@ -82,7 +82,8 @@ class PythonController:
y = parameters["y"]
if "num_clicks" in parameters:
num_clicks = parameters["num_clicks"]
self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y}, clicks={num_clicks})")
self.execute_python_command(
f"pyautogui.click(button='{button}', x={x}, y={y}, clicks={num_clicks})")
else:
self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y})")
elif "button" in parameters and "x" not in parameters and "y" not in parameters:
@@ -145,7 +146,8 @@ class PythonController:
if "x" in parameters and "y" in parameters:
x = parameters["x"]
y = parameters["y"]
self.execute_python_command(f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)")
self.execute_python_command(
f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)")

elif action_type == "SCROLL":
# todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out
@@ -208,3 +210,16 @@ class PythonController:

else:
raise Exception(f"Unknown action type: {action_type}")


def get_vlc_status(self, host='localhost', port=8080, password='password'):
url = f'http://{host}:{port}/requests/status.xml'

response = requests.get(url, auth=('', password))

if response.status_code == 200:
print("File downloaded successfully")
return response.content
else:
print("Failed to get vlc status. Status code:", response.status_code)
return None

@@ -14,12 +14,11 @@ logger = logging.getLogger("desktopenv.setup")
import traceback

class SetupController:
def __init__( self
, http_server: str
, cache_dir: str
):
self.http_server = http_server + "/setup"
def __init__(self, http_server: str, cache_dir: str):
self.http_server: str = http_server
self.http_server_setup_root = http_server + "/setup"
self.cache_dir: str = cache_dir

def reset_cache_dir(self, cache_dir: str):
self.cache_dir = cache_dir

@@ -52,6 +51,33 @@ class SetupController:
# self._open_setup(config)
# can add other setup steps

# ZDY_COMMENT: merged with launch
#def _command_setup(self, command: str):
#"""
#Directly send a command into the virtual machine os for setting up.
#"""
#payload = json.dumps({"command": command})
#headers = {
#'Content-Type': 'application/json'
#}
#timeout = 5
#timout_whitelist = ["vlc"]
#
#try:
#
#response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout)
#if response.status_code == 200:
#print("Command executed successfully:", response.text)
#else:
#print("Failed to execute command. Status code:", response.status_code)
#except requests.exceptions.Timeout as e:
#if command in timout_whitelist:
#print("Command executed successfully:", command)
#else:
#print("An error occurred while trying to execute the command:", e)
#except requests.exceptions.RequestException as e:
#print("An error occurred while trying to execute the command:", e)

def _download_setup(self, files: List[Dict[str, str]]):
"""
Args:
@@ -70,12 +96,9 @@ class SetupController:
for f in files:
url: str = f["url"]
path: str = f["path"]
cache_path: str = os.path.join( self.cache_dir
, "{:}_{:}".format(
uuid.uuid5(uuid.NAMESPACE_URL, url)
, os.path.basename(path)
)
)
cache_path: str = os.path.join(self.cache_dir, "{:}_{:}".format(
uuid.uuid5(uuid.NAMESPACE_URL, url),
os.path.basename(path)))

if not url or not path:
raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).")
@@ -101,22 +124,22 @@ class SetupController:
if not downloaded:
raise requests.RequestException(f"Failed to download {url}. No retries left. Error: {e}")

#payload = json.dumps({"url": url, "path": path})
#headers = {
#'Content-Type': 'application/json'
#}
# payload = json.dumps({"url": url, "path": path})
# headers = {
# 'Content-Type': 'application/json'
# }

form = MultipartEncoder( { "file_path": path
, "file_data": (os.path.basename(path), open(cache_path, "rb"))
}
)
form = MultipartEncoder({
"file_path": path,
"file_data": (os.path.basename(path), open(cache_path, "rb"))
})
headers = {"Content-Type": form.content_type}
logger.debug(form.content_type)

# send request to server to upload file
try:
logger.debug("REQUEST ADDRESS: %s", self.http_server + "/upload")
response = requests.post(self.http_server + "/upload", headers=headers, data=form)
logger.debug("REQUEST ADDRESS: %s", self.http_server_setup_root + "/upload")
response = requests.post(self.http_server_setup_root + "/upload", headers=headers, data=form)
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
@@ -141,7 +164,7 @@ class SetupController:

# send request to server to change wallpaper
try:
response = requests.post(self.http_server + "/change_wallpaper", headers=headers, data=payload)
response = requests.post(self.http_server_setup_root + "/change_wallpaper", headers=headers, data=payload)
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
@@ -168,7 +191,7 @@ class SetupController:

# send request to server to open file
try:
response = requests.post(self.http_server + "/open_file", headers=headers, data=payload)
response = requests.post(self.http_server_setup_root + "/open_file", headers=headers, data=payload)
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
@@ -184,7 +207,7 @@ class SetupController:
headers = {"Content-Type": "application/json"}

try:
response = requests.post(self.http_server + "/launch", headers=headers, data=payload)
response = requests.post(self.http_server_setup_root + "/launch", headers=headers, data=payload)
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
@@ -200,7 +223,7 @@ class SetupController:
headers = {"Content-Type": "application/json"}

try:
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
response = requests.post(self.http_server_setup_root + "/execute", headers=headers, data=payload)
if response.status_code == 200:
results: Dict[str, str] = response.json()
if stdout:
@@ -76,7 +76,8 @@ class DesktopEnv(gym.Env):
# Initialize emulator and controller
logger.info("Initializing...")
self._start_emulator()
self.host = f"http://{self._get_vm_ip()}:5000"
self.vm_ip = self._get_vm_ip()
self.host = f"http://{self.vm_ip}:5000"
self.controller = PythonController(http_server=self.host)
self.setup_controller = SetupController(http_server=self.host, cache_dir=self.cache_dir)


@@ -1,2 +1,3 @@
from .file import get_cloud_file, get_vm_file, get_cache_file
from .misc import get_rule
from .vlc import get_vlc_playing_info

@@ -3,6 +3,7 @@ from typing import Dict
import os
import requests


def get_cloud_file(env, config: Dict[str, str]) -> str:
"""
Config:
@@ -25,6 +26,7 @@ def get_cloud_file(env, config: Dict[str, str]) -> str:

return _path


def get_vm_file(env, config: Dict[str, str]) -> str:
"""
Config:

desktop_env/evaluators/getters/vlc.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import os
from typing import Dict


def get_vlc_playing_info(env, config: Dict[str, str]):
    """
    Gets the current playing information from VLC's HTTP interface.
    """
    _path = os.path.join(env.cache_dir, config["dest"])

    host = env.vm_ip
    port = 8080
    password = 'password'

    content = env.controller.get_vlc_status(host, port, password)
    print("content: ", content)
    with open(_path, "wb") as f:
        f.write(content)

    return _path
@@ -5,4 +5,5 @@ from .docs import compare_font_names, compare_subscript_contains, has_page_numbe
from .docs import is_first_line_centered, check_file_exists, compare_contains_image
from .pdf import check_pdf_pages
from .libreoffice import check_libre_locale
#from .vlc import is_vlc_playing
from .general import check_csv

desktop_env/evaluators/metrics/gimp.py (new file, 22 lines)
@@ -0,0 +1,22 @@
import os


def get_gimp_export_path():
    # Path to GIMP's configuration file. This example assumes GIMP version 2.10.
    # You need to adjust the path according to the GIMP version and user's file system.
    gimp_config_file = os.path.expanduser("~/.config/GIMP/2.10/gimprc")

    try:
        # Open and read the configuration file
        with open(gimp_config_file, 'r') as file:
            for line in file:
                # Search for the default export path setting
                if "default-export-path" in line:
                    # Extract the current path from the line (assuming it's enclosed in quotes)
                    current_path = line.split('"')[1]
                    # Compare the current path with the expected path
                    return current_path
    except FileNotFoundError:
        # Handle the case where the configuration file is not found
        print("GIMP configuration file not found")
        return False
@@ -1,14 +1,14 @@
import os
import platform
import requests
from xml.etree import ElementTree
import pygetwindow as gw
import pyautogui
from typing import Dict

import logging
logger = logging.getLogger("desktopenv.metrics.vlc")

def read_vlc_config(setting_name):
def get_vlc_config(setting_name):
"""
Reads the VLC configuration file to check for a specific setting.

@@ -41,24 +41,22 @@ def read_vlc_config(setting_name):
return None


def get_vlc_playing_info(host='localhost', port=8080, password='password'):
def is_vlc_playing(actual: str, rule: Dict[str, str]) -> float:
"""
Gets the current playing information from VLC's HTTP interface.
Checks if VLC is currently playing a file.
"""
url = f'http://{host}:{port}/requests/status.xml'
try:
response = requests.get(url, auth=('', password))
if response.status_code == 200:
tree = ElementTree.fromstring(response.content)
status = tree.find('state').text
if status == 'playing':
file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text
return status, file_info
return status, None
except Exception as e:
logger.error(f"Error: {e}")
with open(actual, 'rb') as file:
actual_status = file.read().decode('utf-8')

return None, None
tree = ElementTree.fromstring(actual_status)
status = tree.find('state').text
if status == 'playing':
file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text
print("file_info: ", file_info)
if file_info:
return 1 if file_info.endswith(rule['expected']) else 0
else:
return 0


def is_vlc_fullscreen():
@@ -86,5 +84,3 @@ def is_vlc_fullscreen():
except Exception as e:
logger.error(f"An error occurred: {e}")
return False


@@ -1,12 +1,42 @@
{
"id": "59f21cfb-0120-4326-b255-a5b827b38967",
"snapshot": "base_setup",
"instruction": "Could you help me play the file at FILE_PATH?",
"instruction": "Play the music video on my desktop",
"source": "https://docs.videolan.me/vlc-user/desktop/3.0/en/basic/media.html#playing-a-file",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=14-vhVMVw53e0l-MDVBFbngFAE1jMqvgm&export=download&authuser=0&confirm=t&uuid=d31607ed-0075-4fe5-b68c-b24b6eec356e&at=APZUnTV0Wy0672VFGrQChgHmd1Ba:1704337791613",
"path": "Desktop/Rick Astley - Never Gonna Give You Up (Official Music Video).mp4"
}
]
}
},
{
"type": "launch",
"parameters": {
"command": "vlc"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"vlc"
],
"evaluator": "evaluation_dir"
"evaluator": {
"func": "is_vlc_playing",
"expected": {
"type": "rule",
"rules": {
"file_path": "Desktop/Rick Astley - Never Gonna Give You Up (Official Music Video).mp4"
}
},
"result": {
"type": "vlc_playing_info",
"dest": "status.xml"
}
}
}
mm_agents/gui_som/__init__.py (new file, 0 lines)
mm_agents/gui_som/data_preparation/README.md (new file, 8 lines)
@@ -0,0 +1,8 @@
1. Get the URLs from majestic_million and save them to `majestic_million.csv`
```bash
python3 majestic_million.py
```
2. Run scrapy spider to get the data from the URLs
```bash
python scrapy_crawler.py
```
mm_agents/gui_som/data_preparation/__init__.py (new file, 0 lines)
mm_agents/gui_som/data_preparation/get_tag_elem_dict.js (new file, 158 lines)
@@ -0,0 +1,158 @@
(() => {
  let labels = [];
  let selector_id_table = {};
  var generateQuerySelector = function (el) {
    function cssEscape(value) {
      if (!value) return '';
      // Escape all CSS special characters, including the colon.
      return value.replace(/([!"#$%&'()*+,./:;<=>?@[\]^`{|}~])/g, '\\$&');
    }

    function getChildIndex(el) {
      var siblings = Array.from(el.parentNode.children);
      var sameTagSiblings = siblings.filter(sibling => sibling.tagName === el.tagName);
      return sameTagSiblings.indexOf(el);
    }

    if (el.tagName.toLowerCase() === "html") {
      return "HTML";
    }

    var str = el.tagName;
    var idPresent = false; // Add a flag to check if an ID is present

    if (el.id !== "") {
      str += "#" + cssEscape(el.id);
      idPresent = true; // Set the flag to true if there's an ID
    }

    if (el.className) {
      var classes = el.className.split(/\s+/).filter(Boolean); // Filter out empty strings
      for (var i = 0; i < classes.length; i++) {
        str += "." + cssEscape(classes[i]);
      }
    }

    // Only add :nth-of-type() if no ID is present
    if (!idPresent) {
      str += ":nth-of-type(" + (getChildIndex(el) + 1) + ")";
    }

    // Use '>' combinator if parent is not 'HTML'
    var parentSelector = generateQuerySelector(el.parentNode);
    return parentSelector === "HTML" ? str : parentSelector + " > " + str;
  }


  function unmarkPage() {
    for (const label of labels) {
      document.body.removeChild(label);
    }
    labels = [];
  }

  // Expose the unmarkPage function globally
  window.unmarkPage = unmarkPage;

  function markPage() {
    unmarkPage();

    var bodyRect = document.body.getBoundingClientRect();

    var items = Array.prototype.slice.call(
      document.querySelectorAll('*')
    ).map(function (element) {
      var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
      var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);

      var rects = [...element.getClientRects()].filter(bb => {
        var center_x = bb.left + bb.width / 2;
        var center_y = bb.top + bb.height / 2;
        var elAtCenter = document.elementFromPoint(center_x, center_y);

        return elAtCenter === element || element.contains(elAtCenter)
      }).map(bb => {
        const rect = {
          left: Math.max(0, bb.left),
          top: Math.max(0, bb.top),
          right: Math.min(vw, bb.right),
          bottom: Math.min(vh, bb.bottom)
        };
        return {
          ...rect,
          width: rect.right - rect.left,
          height: rect.bottom - rect.top
        }
      });

      var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);

      return {
        element: element,
        include:
          (element.tagName === "INPUT" || element.tagName === "TEXTAREA" || element.tagName === "SELECT") ||
          (element.tagName === "BUTTON" || element.tagName === "A" || (element.onclick != null) || window.getComputedStyle(element).cursor == "pointer") ||
          (element.tagName === "IFRAME" || element.tagName === "VIDEO")
        ,
        area,
        rects,
        text: element.textContent.trim().replace(/\s{2,}/g, ' ')
      };
    }).filter(item =>
      item.include && (item.area >= 20)
    );

    // Only keep inner clickable items
    items = items.filter(x => !items.some(y => x.element.contains(y.element) && !(x == y)))

    // Function to generate random colors
    function getRandomColor() {
      var letters = '0123456789ABCDEF';
      var color = '#';
      for (var i = 0; i < 6; i++) {
        color += letters[Math.floor(Math.random() * 16)];
      }
      return color;
    }

    // Lets create a floating border on top of these elements that will always be visible
    items.forEach(function (item, index) {
      selector_id_table[index.toString()] = item.rects;
      item.rects.forEach((bbox) => {
        newElement = document.createElement("div");
        var borderColor = getRandomColor();
        newElement.style.outline = `2px dashed ${borderColor}`;
        newElement.style.position = "fixed";
        newElement.style.left = bbox.left + "px";
        newElement.style.top = bbox.top + "px";
        newElement.style.width = bbox.width + "px";
        newElement.style.height = bbox.height + "px";
        newElement.style.pointerEvents = "none";
        newElement.style.boxSizing = "border-box";
        newElement.style.zIndex = 2147483647;
        // newElement.style.background = `${borderColor}80`;

        // Add floating label at the corner
        var label = document.createElement("span");
        label.textContent = index;
        label.style.position = "absolute";
        label.style.top = "-19px";
        label.style.left = "0px";
        label.style.background = borderColor;
        label.style.color = "white";
        label.style.padding = "2px 4px";
        label.style.fontSize = "12px";
        label.style.borderRadius = "2px";
        newElement.appendChild(label);

        document.body.appendChild(newElement);
        labels.push(newElement);
        // item.element.setAttribute("-ai-label", label.textContent);
      });
    })
    return selector_id_table;
  }

  return markPage();
})()
@@ -0,0 +1,39 @@
import csv

import requests


# Latest run on 2024.1.4
def download_csv(url, file_path):
    response = requests.get(url)
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(response.text)


def read_csv(file_path):
    urls = []
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header
        for row in reader:
            urls.append(row[2])  # Assuming the URL is in the third column
    return urls


def main():
    url = 'http://downloads.majestic.com/majestic_million.csv'
    file_path = 'majestic_million.csv'

    print("Downloading Majestic Million CSV...")
    download_csv(url, file_path)

    print("Reading URLs from CSV...")
    urls = read_csv(file_path)

    # Print the first 10 URLs as a sample
    for url in urls[:10]:
        print(url)


if __name__ == "__main__":
    main()
mm_agents/gui_som/data_preparation/scrape_crawler.py (new file, 119 lines)
@@ -0,0 +1,119 @@
import json
import os
import random
import time
import uuid
from multiprocessing import Pool

from playwright.sync_api import sync_playwright

from majestic_million_download import read_csv

# JavaScript code as a string
with open('get_tag_elem_dict.js', 'r') as f:
    get_tag_elem_dict_js_code = f.read()


def scrape_data(website_url, action_depth=10):
    # if file exists, skip
    if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])):
        print("Data already exists, skipping...")
        return

    def click_random_link(page):
        links = page.query_selector_all("a")
        if links:
            random_link = random.choice(links)
            try:
                page.evaluate("window.unmarkPage()")

                # Capture the initial HTML content of the body
                initial_content = page.inner_html("body")

                # Click the link and wait for potential navigation
                random_link.click()
                page.wait_for_timeout(5000)  # wait for 5 seconds to allow page changes to occur

                # Capture the new HTML content of the body
                new_content = page.inner_html("body")

                # Compare the contents
                if new_content != initial_content:
                    print("Content change detected.")
                    return True
                else:
                    print("No content change detected.")
                    return False

            except Exception as e:
                print("Error occurred:", e)
                return False
        else:
            print("No links found on the page.")
            return False

        return False

    with sync_playwright() as p:
        # Launch the browser
        browser = p.chromium.launch()
        context = browser.new_context(viewport={'width': 1920, 'height': 1080}, locale='en-US')
        context.set_extra_http_headers({'Accept-Language': 'en-US'})
        page = context.new_page()

        # Navigate to Google
        page.goto(website_url, timeout=60000, wait_until='networkidle')

        data_id = str(uuid.uuid4())
        data_dir = os.path.join('collected_data', website_url.split("//")[1], data_id)
        os.makedirs(data_dir, exist_ok=True)
        page.screenshot(path=os.path.join(data_dir, 'screenshot_0.png'))
        tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
        with open(os.path.join(data_dir, 'meta_data_0.json'), 'w') as f:
            json.dump({
                'timestamp': time.time(),
                'url': website_url,
                'data_id': data_id,
                'tag_elem_dict': tag_elem_dict
            }, f, indent=4)
        page.screenshot(path=os.path.join(data_dir, 'screenshot_som_0.png'))

        for i in range(action_depth):
            if not click_random_link(page):
                print("Invalid click or no navigation, stopping random clicks.")
                break
            page.screenshot(path=os.path.join(data_dir, f'screenshot_{i + 1}.png'))
            tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
            with open(os.path.join(data_dir, f'meta_data_{i + 1}.json'), 'w') as f:
                json.dump({
                    'timestamp': time.time(),
                    'url': website_url,
                    'data_id': data_id,
                    'tag_elem_dict': tag_elem_dict
                }, f, indent=4)
            page.screenshot(path=os.path.join(data_dir, f'screenshot_som_{i + 1}.png'))

        # Close the browser
        browser.close()


def run_one(url):
    try:
        scrape_data("https://" + url, action_depth=5)
    except Exception as e:
        print("Error scraping data:", e)
        print("Start next one...")


def main():
    urls = read_csv("majestic_million.csv")[:20000]

    # Number of processes
    num_processes = 50  # Adjust based on your system's capability, on my i9-13900k, 50 processes can be used

    with Pool(num_processes) as pool:
        pool.map(run_one, urls)


if __name__ == '__main__':
    main()
@@ -22,3 +22,4 @@ openpyxl
python-docx
python-pptx
pypdf
PyGetWindow