VLC updates and some infra bug fixes

This commit is contained in:
Timothyxxx
2024-01-09 09:30:11 +08:00
parent 2b09b7ce41
commit fa84b20ea5
13 changed files with 278 additions and 71 deletions

View File

@@ -17,7 +17,7 @@ with open('get_tag_elem_dict.js', 'r') as f:
def scrape_data(website_url, action_depth=10):
# if file exists, skip
if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])):
print("Data already exists, skipping...")
# print("Data already exists, skipping...")
return
def click_random_link(page):
@@ -100,6 +100,7 @@ def scrape_data(website_url, action_depth=10):
def run_one(url):
try:
scrape_data("https://" + url, action_depth=5)
scrape_data("http://" + url, action_depth=5)
except Exception as e:
print("Error scraping data:", e)
print("Start next one...")
@@ -107,6 +108,7 @@ def run_one(url):
def main():
urls = read_csv("majestic_million.csv")[:20000]
random.shuffle(urls)
# Number of processes
num_processes = 50 # Adjust based on your system's capability, on my i9-13900k, 50 processes can be used