diff --git a/mm_agents/gui_som/__init__.py b/mm_agents/gui_som/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mm_agents/gui_som/data_preparation/README.md b/mm_agents/gui_som/data_preparation/README.md
new file mode 100644
index 0000000..cf95798
--- /dev/null
+++ b/mm_agents/gui_som/data_preparation/README.md
@@ -0,0 +1,8 @@
+1. Download the URL list from Majestic Million and save it to `majestic_million.csv`
+```bash
+python3 majestic_million_download.py
+```
+2. Run the Playwright crawler to collect data from the URLs
+```bash
+python scrape_crawler.py
+```
\ No newline at end of file
diff --git a/mm_agents/gui_som/data_preparation/__init__.py b/mm_agents/gui_som/data_preparation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mm_agents/gui_som/data_preparation/get_tag_elem_dict.js b/mm_agents/gui_som/data_preparation/get_tag_elem_dict.js
new file mode 100644
index 0000000..f838ada
--- /dev/null
+++ b/mm_agents/gui_som/data_preparation/get_tag_elem_dict.js
@@ -0,0 +1,158 @@
+(() => {
+    let labels = [];
+    let selector_id_table = {};
+    var generateQuerySelector = function (el) {
+        function cssEscape(value) {
+            if (!value) return '';
+            // Escape all CSS special characters, including the colon.
+            return value.replace(/([!"#$%&'()*+,./:;<=>?@[\]^`{|}~])/g, '\\$&');
+        }
+
+        function getChildIndex(el) {
+            var siblings = Array.from(el.parentNode.children);
+            var sameTagSiblings = siblings.filter(sibling => sibling.tagName === el.tagName);
+            return sameTagSiblings.indexOf(el);
+        }
+
+        if (el.tagName.toLowerCase() === "html") {
+            return "HTML";
+        }
+
+        var str = el.tagName;
+        var idPresent = false; // Flag: does this element have an ID?
+
+        if (el.id !== "") {
+            str += "#" + cssEscape(el.id);
+            idPresent = true; // IDs are unique, so :nth-of-type() is unnecessary
+        }
+
+        if (el.className) {
+            var classes = el.className.split(/\s+/).filter(Boolean); // Filter out empty strings
+            for (var i = 0; i < classes.length; i++) {
+                str += "." + cssEscape(classes[i]);
+            }
+        }
+
+        // Only add :nth-of-type() if no ID is present
+        if (!idPresent) {
+            str += ":nth-of-type(" + (getChildIndex(el) + 1) + ")";
+        }
+
+        // Use the '>' combinator unless the parent is 'HTML'
+        var parentSelector = generateQuerySelector(el.parentNode);
+        return parentSelector === "HTML" ? str : parentSelector + " > " + str;
+    }
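+    // Illustrative example (not from the original patch): given markup like
+    //   <body><div id="nav"><a class="item active">Home</a></div></body>
+    // the <a> element resolves to something like
+    //   "BODY:nth-of-type(1) > DIV#nav > A.item.active:nth-of-type(1)"
+    // -- the ID suppresses :nth-of-type(), and the "HTML" root is dropped.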
+
+
+    function unmarkPage() {
+        for (const label of labels) {
+            document.body.removeChild(label);
+        }
+        labels = [];
+    }
+
+    // Expose the unmarkPage function globally
+    window.unmarkPage = unmarkPage;
+
+    function markPage() {
+        unmarkPage();
+
+        var bodyRect = document.body.getBoundingClientRect();
+
+        var items = Array.prototype.slice.call(
+            document.querySelectorAll('*')
+        ).map(function (element) {
+            var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
+            var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
+
+            // Keep only client rects whose center point is actually hit-testable
+            // (i.e. not covered by another element), then clip each to the viewport.
+            var rects = [...element.getClientRects()].filter(bb => {
+                var center_x = bb.left + bb.width / 2;
+                var center_y = bb.top + bb.height / 2;
+                var elAtCenter = document.elementFromPoint(center_x, center_y);
+
+                return elAtCenter === element || element.contains(elAtCenter)
+            }).map(bb => {
+                const rect = {
+                    left: Math.max(0, bb.left),
+                    top: Math.max(0, bb.top),
+                    right: Math.min(vw, bb.right),
+                    bottom: Math.min(vh, bb.bottom)
+                };
+                return {
+                    ...rect,
+                    width: rect.right - rect.left,
+                    height: rect.bottom - rect.top
+                }
+            });
+
+            var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);
+
+            return {
+                element: element,
+                include:
+                    (element.tagName === "INPUT" || element.tagName === "TEXTAREA" || element.tagName === "SELECT") ||
+                    (element.tagName === "BUTTON" || element.tagName === "A" || (element.onclick != null) || window.getComputedStyle(element).cursor == "pointer") ||
+                    (element.tagName === "IFRAME" || element.tagName === "VIDEO")
+                ,
+                area,
+                rects,
+                text: element.textContent.trim().replace(/\s{2,}/g, ' ')
+            };
+        }).filter(item =>
+            item.include && (item.area >= 20)
+        );
+
+        // Only keep innermost clickable items
+        items = items.filter(x => !items.some(y => x.element.contains(y.element) && !(x == y)))
+
+        // Generate a random hex color for each mark
+        function getRandomColor() {
+            var letters = '0123456789ABCDEF';
+            var color = '#';
+            for (var i = 0; i < 6; i++) {
+                color += letters[Math.floor(Math.random() * 16)];
+            }
+            return color;
+        }
+
+        // Create a floating border on top of these elements that will always be visible
+        items.forEach(function (item, index) {
+            selector_id_table[index.toString()] = item.rects;
+            item.rects.forEach((bbox) => {
+                let newElement = document.createElement("div");
+                var borderColor = getRandomColor();
+                newElement.style.outline = `2px dashed ${borderColor}`;
+                newElement.style.position = "fixed";
+                newElement.style.left = bbox.left + "px";
+                newElement.style.top = bbox.top + "px";
+                newElement.style.width = bbox.width + "px";
+                newElement.style.height = bbox.height + "px";
+                newElement.style.pointerEvents = "none";
+                newElement.style.boxSizing = "border-box";
+                newElement.style.zIndex = 2147483647;
+                // newElement.style.background = `${borderColor}80`;
+
+                // Add a floating label at the corner
+                var label = document.createElement("span");
+                label.textContent = index;
+                label.style.position = "absolute";
+                label.style.top = "-19px";
+                label.style.left = "0px";
+                label.style.background = borderColor;
+                label.style.color = "white";
+                label.style.padding = "2px 4px";
+                label.style.fontSize = "12px";
+                label.style.borderRadius = "2px";
+                newElement.appendChild(label);
+
+                document.body.appendChild(newElement);
+                labels.push(newElement);
+                // item.element.setAttribute("-ai-label", label.textContent);
+            });
+        })
+        return selector_id_table;
+    }
+
+    return markPage();
+})()
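For orientation (not part of the patch itself): the IIFE returns `selector_id_table`, a mapping from each on-screen label to the list of viewport-clipped rects it covers. A minimal sketch of driving the script outside the crawler — the target URL is illustrative, but the `evaluate()` pattern mirrors `scrape_crawler.py` below:

```python
from playwright.sync_api import sync_playwright

# Read the marking script; evaluating the IIFE returns selector_id_table.
with open("get_tag_elem_dict.js") as f:
    mark_js = f.read()

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page(viewport={"width": 1920, "height": 1080})
    page.goto("https://example.com", wait_until="networkidle")
    table = page.evaluate(mark_js)
    for label, rects in table.items():
        # Each rect carries left/top/right/bottom plus derived width/height.
        print(label, rects[0]["left"], rects[0]["top"])
    page.screenshot(path="som_overlay.png")  # the marks are fixed-position divs
    browser.close()
```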
diff --git a/mm_agents/gui_som/data_preparation/majestic_million_download.py b/mm_agents/gui_som/data_preparation/majestic_million_download.py
new file mode 100644
index 0000000..b76d934
--- /dev/null
+++ b/mm_agents/gui_som/data_preparation/majestic_million_download.py
@@ -0,0 +1,39 @@
+import csv
+
+import requests
+
+
+# Last run on 2024-01-04
+def download_csv(url, file_path):
+    response = requests.get(url)
+    with open(file_path, 'w', newline='', encoding='utf-8') as file:
+        file.write(response.text)
+
+
+def read_csv(file_path):
+    urls = []
+    with open(file_path, newline='', encoding='utf-8') as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader, None)  # Skip the header row
+        for row in reader:
+            urls.append(row[2])  # The domain is in the third column
+    return urls
+
+
+def main():
+    url = 'http://downloads.majestic.com/majestic_million.csv'
+    file_path = 'majestic_million.csv'
+
+    print("Downloading Majestic Million CSV...")
+    download_csv(url, file_path)
+
+    print("Reading URLs from CSV...")
+    urls = read_csv(file_path)
+
+    # Print the first 10 URLs as a sample
+    for url in urls[:10]:
+        print(url)
+
+
+if __name__ == "__main__":
+    main()
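A quick sanity check of the downloader (the header layout — `GlobalRank,TldRank,Domain,...` — is assumed from current Majestic Million downloads, which is why `read_csv` takes `row[2]`):

```python
from majestic_million_download import read_csv

domains = read_csv("majestic_million.csv")
print(domains[:3])  # bare domains such as 'google.com'; the crawler prepends "https://"
```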
diff --git a/mm_agents/gui_som/data_preparation/scrape_crawler.py b/mm_agents/gui_som/data_preparation/scrape_crawler.py
new file mode 100644
index 0000000..4ba93c5
--- /dev/null
+++ b/mm_agents/gui_som/data_preparation/scrape_crawler.py
@@ -0,0 +1,119 @@
+import json
+import os
+import random
+import time
+import uuid
+from multiprocessing import Pool
+
+from playwright.sync_api import sync_playwright
+
+from majestic_million_download import read_csv
+
+# Load the Set-of-Marks JavaScript as a string
+with open('get_tag_elem_dict.js', 'r') as f:
+    get_tag_elem_dict_js_code = f.read()
+
+
+def scrape_data(website_url, action_depth=10):
+    # Skip this site if data for it already exists
+    if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])):
+        print("Data already exists, skipping...")
+        return
+
+    def click_random_link(page):
+        links = page.query_selector_all("a")
+        if links:
+            random_link = random.choice(links)
+            try:
+                page.evaluate("window.unmarkPage()")
+
+                # Capture the initial HTML content of the body
+                initial_content = page.inner_html("body")
+
+                # Click the link and wait for potential navigation
+                random_link.click()
+                page.wait_for_timeout(5000)  # wait 5 seconds to allow page changes to occur
+
+                # Capture the new HTML content of the body
+                new_content = page.inner_html("body")
+
+                # Compare the contents
+                if new_content != initial_content:
+                    print("Content change detected.")
+                    return True
+                else:
+                    print("No content change detected.")
+                    return False
+
+            except Exception as e:
+                print("Error occurred:", e)
+                return False
+        else:
+            print("No links found on the page.")
+            return False
+
+    with sync_playwright() as p:
+        # Launch the browser
+        browser = p.chromium.launch()
+        context = browser.new_context(viewport={'width': 1920, 'height': 1080}, locale='en-US')
+        context.set_extra_http_headers({'Accept-Language': 'en-US'})
+        page = context.new_page()
+
+        # Navigate to the target website
+        page.goto(website_url, timeout=60000, wait_until='networkidle')
+
+        data_id = str(uuid.uuid4())
+        data_dir = os.path.join('collected_data', website_url.split("//")[1], data_id)
+        os.makedirs(data_dir, exist_ok=True)
+        page.screenshot(path=os.path.join(data_dir, 'screenshot_0.png'))
+        tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
+        with open(os.path.join(data_dir, 'meta_data_0.json'), 'w') as f:
+            json.dump({
+                'timestamp': time.time(),
+                'url': website_url,
+                'data_id': data_id,
+                'tag_elem_dict': tag_elem_dict
+            }, f, indent=4)
+        page.screenshot(path=os.path.join(data_dir, 'screenshot_som_0.png'))
+
+        for i in range(action_depth):
+            if not click_random_link(page):
+                print("Invalid click or no navigation, stopping random clicks.")
+                break
+            page.screenshot(path=os.path.join(data_dir, f'screenshot_{i + 1}.png'))
+            tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
+            with open(os.path.join(data_dir, f'meta_data_{i + 1}.json'), 'w') as f:
+                json.dump({
+                    'timestamp': time.time(),
+                    'url': website_url,
+                    'data_id': data_id,
+                    'tag_elem_dict': tag_elem_dict
+                }, f, indent=4)
+            page.screenshot(path=os.path.join(data_dir, f'screenshot_som_{i + 1}.png'))
+
+        # Close the browser
+        browser.close()
+
+
+def run_one(url):
+    try:
+        scrape_data("https://" + url, action_depth=5)
+    except Exception as e:
+        print("Error scraping data:", e)
+        print("Starting the next one...")
+
+
+def main():
+    urls = read_csv("majestic_million.csv")[:20000]
+
+    # Number of worker processes; adjust to your machine (50 worked on an i9-13900K)
+    num_processes = 50
+
+    with Pool(num_processes) as pool:
+        pool.map(run_one, urls)
+
+
+if __name__ == '__main__':
+    main()
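For reference, the on-disk layout the crawler produces (directory names shown are illustrative):

```
collected_data/
└── example.com/                  # website_url.split("//")[1]
    └── 9f2c.../                  # one uuid4 (data_id) per visit
        ├── screenshot_0.png      # raw page, before marking
        ├── screenshot_som_0.png  # page with Set-of-Marks overlays drawn
        ├── meta_data_0.json      # timestamp, url, data_id, tag_elem_dict
        └── ...                   # suffixes _1, _2, ... for each successful random click
```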