VLC updates and some infra bug fixes
This commit is contained in:
@@ -17,7 +17,7 @@ with open('get_tag_elem_dict.js', 'r') as f:
|
||||
def scrape_data(website_url, action_depth=10):
|
||||
# if file exists, skip
|
||||
if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])):
|
||||
print("Data already exists, skipping...")
|
||||
# print("Data already exists, skipping...")
|
||||
return
|
||||
|
||||
def click_random_link(page):
|
||||
@@ -100,6 +100,7 @@ def scrape_data(website_url, action_depth=10):
|
||||
def run_one(url):
|
||||
try:
|
||||
scrape_data("https://" + url, action_depth=5)
|
||||
scrape_data("http://" + url, action_depth=5)
|
||||
except Exception as e:
|
||||
print("Error scraping data:", e)
|
||||
print("Start next one...")
|
||||
@@ -107,6 +108,7 @@ def run_one(url):
|
||||
|
||||
def main():
|
||||
urls = read_csv("majestic_million.csv")[:20000]
|
||||
random.shuffle(urls)
|
||||
|
||||
# Number of processes
|
||||
num_processes = 50 # Adjust based on your system's capability, on my i9-13900k, 50 processes can be used
|
||||
|
||||
Reference in New Issue
Block a user