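"""Collect web-interaction data by random-walking popular websites.

For each site from the Majestic Million list, load the page in Playwright,
repeatedly click a random link, and after every step save a screenshot, a
JSON dump of element metadata produced by an injected JavaScript snippet,
and a second screenshot taken after that snippet has marked the page
("som" presumably stands for set-of-marks).
"""
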
import json
import os
import random
import time
import uuid
from multiprocessing import Pool

from playwright.sync_api import sync_playwright

from majestic_million_download import read_csv

# JavaScript code as a string
with open('get_tag_elem_dict.js', 'r') as f:
    get_tag_elem_dict_js_code = f.read()
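
# NOTE: get_tag_elem_dict.js is not included in this file. Judging from how it
# is used below, it is assumed to (a) return a dict describing the page's
# elements, (b) draw visible marks on them (captured in the "screenshot_som_*"
# screenshots), and (c) define window.unmarkPage() to remove those marks
# before the next click.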


def scrape_data(website_url, action_depth=10):
    # If data for this host has already been collected, skip it.
    if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])):
        # print("Data already exists, skipping...")
        return

    def click_random_link(page):
        links = page.query_selector_all("a")
        if links:
            random_link = random.choice(links)
            try:
                page.evaluate("window.unmarkPage()")

                # Capture the initial HTML content of the body
                initial_content = page.inner_html("body")

                # Click the link and wait for potential navigation
                random_link.click()
                page.wait_for_timeout(5000)  # wait for 5 seconds to allow page changes to occur

                # Capture the new HTML content of the body
                new_content = page.inner_html("body")

                # Compare the contents
                if new_content != initial_content:
                    print("Content change detected.")
                    return True
                else:
                    print("No content change detected.")
                    return False
            except Exception as e:
                print("Error occurred:", e)
                return False
        else:
            print("No links found on the page.")
            return False

    with sync_playwright() as p:
        # Launch the browser
        browser = p.chromium.launch()
        context = browser.new_context(viewport={'width': 1920, 'height': 1080}, locale='en-US')
        context.set_extra_http_headers({'Accept-Language': 'en-US'})
        page = context.new_page()

        # Navigate to the target website
        page.goto(website_url, timeout=60000, wait_until='networkidle')

        data_id = str(uuid.uuid4())
        data_dir = os.path.join('collected_data', website_url.split("//")[1], data_id)
        os.makedirs(data_dir, exist_ok=True)

        # Step 0: clean screenshot, element metadata, then marked screenshot
        page.screenshot(path=os.path.join(data_dir, 'screenshot_0.png'))
        tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
        with open(os.path.join(data_dir, 'meta_data_0.json'), 'w') as f:
            json.dump({
                'timestamp': time.time(),
                'url': website_url,
                'data_id': data_id,
                'tag_elem_dict': tag_elem_dict
            }, f, indent=4)
        page.screenshot(path=os.path.join(data_dir, 'screenshot_som_0.png'))

        for i in range(action_depth):
            if not click_random_link(page):
                print("Invalid click or no navigation, stopping random clicks.")
                break
            page.screenshot(path=os.path.join(data_dir, f'screenshot_{i + 1}.png'))
            tag_elem_dict = page.evaluate(get_tag_elem_dict_js_code)
            with open(os.path.join(data_dir, f'meta_data_{i + 1}.json'), 'w') as f:
                json.dump({
                    'timestamp': time.time(),
                    'url': website_url,
                    'data_id': data_id,
                    'tag_elem_dict': tag_elem_dict
                }, f, indent=4)
            page.screenshot(path=os.path.join(data_dir, f'screenshot_som_{i + 1}.png'))

        # Close the browser
        browser.close()


def run_one(url):
    # Try HTTPS first, then HTTP; scrape_data returns early on the second call
    # if the first attempt already created a data directory for this host.
    try:
        scrape_data("https://" + url, action_depth=5)
        scrape_data("http://" + url, action_depth=5)
    except Exception as e:
        print("Error scraping data:", e)
        print("Start next one...")


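# For reference: read_csv comes from majestic_million_download (not shown
# here). It is assumed to return the list of domains from the Majestic
# Million CSV, i.e. roughly this hypothetical sketch:
#
#   def read_csv(path):
#       with open(path) as f:
#           rows = f.readlines()[1:]  # skip the header row
#       return [row.split(',')[2] for row in rows]  # 'Domain' is the 3rd column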
def main():
    urls = read_csv("majestic_million.csv")[:20000]
    random.shuffle(urls)

    # Number of processes; adjust to your system's capability. On my
    # i9-13900K, 50 processes work.
    num_processes = 50

    with Pool(num_processes) as pool:
        pool.map(run_one, urls)


if __name__ == '__main__':
    main()
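
# Example usage (the file name is hypothetical):
#
#   python scrape_random_walk.py
#
# Each visit writes to collected_data/<host>/<uuid>/ as screenshot_{i}.png,
# meta_data_{i}.json, and screenshot_som_{i}.png for step i.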