VLC updates and some infra bug fixes

2024-01-09 09:30:11 +08:00
parent 2b09b7ce41
commit fa84b20ea5
13 changed files with 278 additions and 71 deletions
--- a/mm_agents/gui_som/data_preparation/scrape_crawler.py
+++ b/mm_agents/gui_som/data_preparation/scrape_crawler.py
@@ -17,7 +17,7 @@ with open('get_tag_elem_dict.js', 'r') as f:
 def scrape_data(website_url, action_depth=10):
    # if file exists, skip
    if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])):
-        print("Data already exists, skipping...")
+        # print("Data already exists, skipping...")
        return

    def click_random_link(page):
@@ -100,6 +100,7 @@ def scrape_data(website_url, action_depth=10):
 def run_one(url):
    try:
        scrape_data("https://" + url, action_depth=5)
+        scrape_data("http://" + url, action_depth=5)
    except Exception as e:
        print("Error scraping data:", e)
        print("Start next one...")
@@ -107,6 +108,7 @@ def run_one(url):

 def main():
    urls = read_csv("majestic_million.csv")[:20000]
+    random.shuffle(urls)

    # Number of processes
    num_processes = 50  # Adjust based on your system's capability, on my i9-13900k, 50 processes can be used