Add the GUI set-of-mark object detector data collection script
This commit is contained in:
0
mm_agents/gui_som/__init__.py
Normal file
0
mm_agents/gui_som/__init__.py
Normal file
8
mm_agents/gui_som/data_preparation/README.md
Normal file
8
mm_agents/gui_som/data_preparation/README.md
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
1. Get the URLs from majestic_million and save them to `majestic_million.csv`
|
||||||
|
```bash
|
||||||
|
python3 majestic_million.py
|
||||||
|
```
|
||||||
|
2. Run the Playwright crawler to get the data from the URLs
|
||||||
|
```bash
|
||||||
|
python scrape_crawler.py
|
||||||
|
```
|
||||||
0
mm_agents/gui_som/data_preparation/__init__.py
Normal file
0
mm_agents/gui_som/data_preparation/__init__.py
Normal file
158
mm_agents/gui_som/data_preparation/get_tag_elem_dict.js
Normal file
158
mm_agents/gui_som/data_preparation/get_tag_elem_dict.js
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
(() => {
    // Overlay <div>s currently drawn on the page; removed by unmarkPage().
    let labels = [];
    // Maps label index (stringified) -> array of clipped client rects for that element.
    let selector_id_table = {};

    // NOTE(review): generateQuerySelector is not called by markPage() below
    // (the table stores rects keyed by numeric index instead of selectors).
    // Kept intact in case callers evaluate it separately — confirm before removing.
    var generateQuerySelector = function (el) {
        function cssEscape(value) {
            if (!value) return '';
            // Escape all CSS special characters, including the colon.
            return value.replace(/([!"#$%&'()*+,./:;<=>?@[\]^`{|}~])/g, '\\$&');
        }

        // 1-based-free index of `el` among siblings sharing its tag name,
        // used for :nth-of-type() disambiguation.
        function getChildIndex(el) {
            var siblings = Array.from(el.parentNode.children);
            var sameTagSiblings = siblings.filter(sibling => sibling.tagName === el.tagName);
            return sameTagSiblings.indexOf(el);
        }

        if (el.tagName.toLowerCase() === "html") {
            return "HTML";
        }

        var str = el.tagName;
        var idPresent = false; // Add a flag to check if an ID is present

        if (el.id !== "") {
            str += "#" + cssEscape(el.id);
            idPresent = true; // Set the flag to true if there's an ID
        }

        if (el.className) {
            var classes = el.className.split(/\s+/).filter(Boolean); // Filter out empty strings
            for (var i = 0; i < classes.length; i++) {
                str += "." + cssEscape(classes[i]);
            }
        }

        // Only add :nth-of-type() if no ID is present
        if (!idPresent) {
            str += ":nth-of-type(" + (getChildIndex(el) + 1) + ")";
        }

        // Use '>' combinator if parent is not 'HTML'
        var parentSelector = generateQuerySelector(el.parentNode);
        return parentSelector === "HTML" ? str : parentSelector + " > " + str;
    }

    // Remove every overlay previously added by markPage().
    function unmarkPage() {
        for (const label of labels) {
            document.body.removeChild(label);
        }
        labels = [];
    }

    // Expose the unmarkPage function globally
    window.unmarkPage = unmarkPage;

    // Draw a numbered, colored outline over every interactable element and
    // return the {index: rects} table describing what was marked.
    function markPage() {
        unmarkPage();

        var bodyRect = document.body.getBoundingClientRect();

        var items = Array.prototype.slice.call(
            document.querySelectorAll('*')
        ).map(function (element) {
            var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
            var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);

            // Keep only rects whose center actually hits this element (i.e. the
            // element is visible and not fully covered), clipped to the viewport.
            var rects = [...element.getClientRects()].filter(bb => {
                var center_x = bb.left + bb.width / 2;
                var center_y = bb.top + bb.height / 2;
                var elAtCenter = document.elementFromPoint(center_x, center_y);

                return elAtCenter === element || element.contains(elAtCenter)
            }).map(bb => {
                const rect = {
                    left: Math.max(0, bb.left),
                    top: Math.max(0, bb.top),
                    right: Math.min(vw, bb.right),
                    bottom: Math.min(vh, bb.bottom)
                };
                return {
                    ...rect,
                    width: rect.right - rect.left,
                    height: rect.bottom - rect.top
                }
            });

            var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);

            return {
                element: element,
                include:
                    (element.tagName === "INPUT" || element.tagName === "TEXTAREA" || element.tagName === "SELECT") ||
                    (element.tagName === "BUTTON" || element.tagName === "A" || (element.onclick != null) || window.getComputedStyle(element).cursor == "pointer") ||
                    (element.tagName === "IFRAME" || element.tagName === "VIDEO")
                ,
                area,
                rects,
                text: element.textContent.trim().replace(/\s{2,}/g, ' ')
            };
        }).filter(item =>
            item.include && (item.area >= 20)
        );

        // Only keep inner clickable items
        items = items.filter(x => !items.some(y => x.element.contains(y.element) && !(x == y)))

        // Function to generate random colors
        function getRandomColor() {
            var letters = '0123456789ABCDEF';
            var color = '#';
            for (var i = 0; i < 6; i++) {
                color += letters[Math.floor(Math.random() * 16)];
            }
            return color;
        }

        // Lets create a floating border on top of these elements that will always be visible
        items.forEach(function (item, index) {
            selector_id_table[index.toString()] = item.rects;
            item.rects.forEach((bbox) => {
                // BUGFIX: was an undeclared assignment (`newElement = ...`),
                // which leaks a global in sloppy mode and throws in strict mode.
                const newElement = document.createElement("div");
                var borderColor = getRandomColor();
                newElement.style.outline = `2px dashed ${borderColor}`;
                newElement.style.position = "fixed";
                newElement.style.left = bbox.left + "px";
                newElement.style.top = bbox.top + "px";
                newElement.style.width = bbox.width + "px";
                newElement.style.height = bbox.height + "px";
                newElement.style.pointerEvents = "none";
                newElement.style.boxSizing = "border-box";
                newElement.style.zIndex = 2147483647;
                // newElement.style.background = `${borderColor}80`;

                // Add floating label at the corner
                var label = document.createElement("span");
                label.textContent = index;
                label.style.position = "absolute";
                label.style.top = "-19px";
                label.style.left = "0px";
                label.style.background = borderColor;
                label.style.color = "white";
                label.style.padding = "2px 4px";
                label.style.fontSize = "12px";
                label.style.borderRadius = "2px";
                newElement.appendChild(label);

                document.body.appendChild(newElement);
                labels.push(newElement);
                // item.element.setAttribute("-ai-label", label.textContent);
            });
        })
        return selector_id_table;
    }

    return markPage();
})()
|
||||||
|
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
import csv
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# Latest run on 2024.1.4
def download_csv(url, file_path):
    """Download a CSV document from *url* and write its text to *file_path*.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            instead of silently saving an error page to disk.
    """
    response = requests.get(url)
    # BUGFIX: without this check, a failed request would write the HTML
    # error body into the .csv file and downstream parsing would misbehave.
    response.raise_for_status()
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(response.text)
|
||||||
|
|
||||||
|
|
||||||
|
def read_csv(file_path, url_column=2):
    """Read URLs out of a CSV file, skipping the header row.

    Args:
        file_path: Path to the CSV file.
        url_column: Zero-based index of the column holding the URL.
            Defaults to 2, which matches the Majestic Million layout
            (the URL/domain is in the third column).

    Returns:
        List of URL strings.  Rows too short to contain *url_column*
        are skipped instead of raising IndexError.
    """
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header
        return [row[url_column] for row in reader if len(row) > url_column]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Download the Majestic Million CSV and print a small sample of its URLs."""
    source = 'http://downloads.majestic.com/majestic_million.csv'
    destination = 'majestic_million.csv'

    print("Downloading Majestic Million CSV...")
    download_csv(source, destination)

    print("Reading URLs from CSV...")
    url_list = read_csv(destination)

    # Show the first few entries so the operator can sanity-check the data.
    for sample_url in url_list[:10]:
        print(sample_url)


if __name__ == "__main__":
    main()
|
||||||
119
mm_agents/gui_som/data_preparation/scrape_crawler.py
Normal file
119
mm_agents/gui_som/data_preparation/scrape_crawler.py
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
from majestic_million_download import read_csv
|
||||||
|
|
||||||
|
# JavaScript code as a string
# Loaded once at import time; evaluated in each page via page.evaluate()
# to draw set-of-mark overlays and return the {index: rects} table.
# NOTE(review): path is relative — assumes the script is run from this
# directory; confirm the working directory before deploying elsewhere.
with open('get_tag_elem_dict.js', 'r') as f:
    get_tag_elem_dict_js_code = f.read()
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_data(website_url, action_depth=10):
    """Visit *website_url*, capture screenshot + element-rect metadata, then
    perform up to *action_depth* random link clicks, capturing after each one.

    Output layout: collected_data/<host>/<uuid>/screenshot_{i}.png,
    screenshot_som_{i}.png and meta_data_{i}.json per step.
    Returns early (no-op) if data for the host already exists.
    """
    # if file exists, skip
    # NOTE(review): split("//")[1] assumes the URL has a scheme ("https://"
    # is prepended by run_one); a bare domain would raise IndexError.
    if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])):
        print("Data already exists, skipping...")
        return

    def click_random_link(page):
        # Click one random <a> and report True only if the body HTML changed.
        links = page.query_selector_all("a")
        if links:
            random_link = random.choice(links)
            try:
                # Clear overlays first so they don't pollute the content diff
                # (unmarkPage is exposed globally by get_tag_elem_dict.js).
                page.evaluate("window.unmarkPage()")

                # Capture the initial HTML content of the body
                initial_content = page.inner_html("body")

                # Click the link and wait for potential navigation
                random_link.click()
                page.wait_for_timeout(5000)  # wait for 5 seconds to allow page changes to occur

                # Capture the new HTML content of the body
                new_content = page.inner_html("body")

                # Compare the contents
                if new_content != initial_content:
                    print("Content change detected.")
                    return True
                else:
                    print("No content change detected.")
                    return False

            except Exception as e:
                # Best-effort: stale handles / navigation races are expected;
                # treat any failure as "no useful click happened".
                print("Error occurred:", e)
                return False
        else:
            print("No links found on the page.")
            return False

        return False

    with sync_playwright() as p:
        # Launch the browser
        browser = p.chromium.launch()
        context = browser.new_context(viewport={'width': 1920, 'height': 1080}, locale='en-US')
        context.set_extra_http_headers({'Accept-Language': 'en-US'})
        page = context.new_page()

        # Navigate to the target website and wait for the network to go idle
        page.goto(website_url, timeout=60000, wait_until='networkidle')

        data_id = str(uuid.uuid4())
        data_dir = os.path.join('collected_data', website_url.split("//")[1], data_id)
        os.makedirs(data_dir, exist_ok=True)
        # Step 0: clean screenshot, then metadata, then set-of-mark screenshot.
        # Evaluating the JS draws the overlays as a side effect, so the
        # second screenshot shows the numbered boxes.
        page.screenshot(path=os.path.join(data_dir, 'screenshot_0.png'))
        tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
        with open(os.path.join(data_dir, 'meta_data_0.json'), 'w') as f:
            json.dump({
                'timestamp': time.time(),
                'url': website_url,
                'data_id': data_id,
                'tag_elem_dict': tag_elem_dict
            }, f, indent=4)
        page.screenshot(path=os.path.join(data_dir, 'screenshot_som_0.png'))

        # Steps 1..action_depth: random click, then the same capture sequence.
        for i in range(action_depth):
            if not click_random_link(page):
                print("Invalid click or no navigation, stopping random clicks.")
                break
            page.screenshot(path=os.path.join(data_dir, f'screenshot_{i + 1}.png'))
            tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
            with open(os.path.join(data_dir, f'meta_data_{i + 1}.json'), 'w') as f:
                json.dump({
                    'timestamp': time.time(),
                    'url': website_url,
                    'data_id': data_id,
                    'tag_elem_dict': tag_elem_dict
                }, f, indent=4)
            page.screenshot(path=os.path.join(data_dir, f'screenshot_som_{i + 1}.png'))

        # Close the browser
        browser.close()
|
||||||
|
|
||||||
|
|
||||||
|
def run_one(url):
    """Best-effort scrape of a single domain.

    Exceptions are logged and swallowed on purpose so one broken site
    never takes down its pool worker.
    """
    full_url = "https://" + url
    try:
        scrape_data(full_url, action_depth=5)
    except Exception as e:
        print("Error scraping data:", e)
        print("Start next one...")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fan the top Majestic Million domains out across a process pool."""
    target_urls = read_csv("majestic_million.csv")[:20000]

    # Number of processes
    # Adjust based on your system's capability; on an i9-13900k,
    # 50 processes can be used.
    worker_count = 50

    with Pool(worker_count) as worker_pool:
        worker_pool.map(run_one, target_urls)


if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user