diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index 724e2de..68728dc 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -542,49 +542,82 @@ def get_page_info(env, config: Dict[str, str]): url = config["url"] remote_debugging_url = f"http://{host}:{port}" - with sync_playwright() as p: - # connect to remote Chrome instance + + # Configuration for retry and timeout + max_retries = 2 + timeout_ms = 60000 # Increase timeout to 60 seconds + + for attempt in range(max_retries): try: - browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PAGE_INFO] Attempt {attempt + 1}/{max_retries} for URL: {url}") + + with sync_playwright() as p: + # connect to remote Chrome instance + try: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}") + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + time.sleep(5) + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance") + + page = browser.new_page() + + # Set longer timeout for navigation + page.set_default_timeout(timeout_ms) + + logger.info(f"[PAGE_INFO] Navigating to URL: {url}") + page.goto(url, wait_until='networkidle', timeout=timeout_ms) + + try: + # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue + page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete + title = page.title() + url = page.url + page_info = {'title': title, 'url': url, 'content': page.content()} + logger.info(f"[PAGE_INFO] Successfully loaded page. Title: '{title}'") + except TimeoutError: + # If page loading times out, catch the exception and store the current information in the list + logger.warning(f"[PAGE_INFO] Page load timeout for URL: {url}") + page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()} + except Exception as e: + # Catch other potential exceptions that might occur while reading the page title + logger.error(f'[PAGE_INFO] Error: {e}') + page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()} + + browser.close() + return page_info + except Exception as e: - # If the connection fails, start a new browser instance - platform.machine() - if "arm" in platform.machine(): - # start a new browser instance if the connection fails - payload = json.dumps({"command": [ - "chromium", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.error(f"[PAGE_INFO] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[PAGE_INFO] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[PAGE_INFO] Retrying in 3 seconds...") + time.sleep(3) else: - payload = json.dumps({"command": [ - "google-chrome", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.error(f"[PAGE_INFO] All {max_retries} attempts failed. Returning error info.") + return {'title': 'Connection failed', 'url': url, 'content': ''} - headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) - time.sleep(5) - browser = p.chromium.connect_over_cdp(remote_debugging_url) - - page = browser.contexts[0].new_page() - page.goto(url) - - try: - # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue - page.wait_for_load_state('load') # Wait for the 'load' event to complete - title = page.title() - url = page.url - page_info = {'title': title, 'url': url, 'content': page.content()} - except TimeoutError: - # If page loading times out, catch the exception and store the current information in the list - page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()} - except Exception as e: - # Catch other potential exceptions that might occur while reading the page title - print(f'Error: {e}') - page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()} - - browser.close() - return page_info + # This should never be reached, but just in case + return {'title': 'Unknown error', 'url': url, 'content': ''} def get_open_tabs_info(env, config: Dict[str, str]): @@ -593,52 +626,85 @@ def get_open_tabs_info(env, config: Dict[str, str]): server_port = env.server_port remote_debugging_url = f"http://{host}:{port}" - with sync_playwright() as p: - # connect to remote Chrome instance + + # Configuration for retry and timeout + max_retries = 2 + timeout_ms = 30000 # 30 seconds for tab info + + for attempt in range(max_retries): try: - browser = p.chromium.connect_over_cdp(remote_debugging_url) - except Exception as e: - # If the connection fails, start a new browser instance - platform.machine() - if "arm" in platform.machine(): - # start a new browser instance if the connection fails - payload = json.dumps({"command": [ - "chromium", - "--remote-debugging-port=1337" - ], "shell": False}) - else: - payload = json.dumps({"command": [ - "google-chrome", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.info(f"[OPEN_TABS_INFO] Attempt {attempt + 1}/{max_retries}") + + with sync_playwright() as p: + # connect to remote Chrome instance + try: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}") + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) - headers = {"Content-Type": "application/json"} - requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload) - time.sleep(5) - try: - browser = p.chromium.connect_over_cdp(remote_debugging_url) - except Exception as e: + headers = {"Content-Type": "application/json"} + requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload) + time.sleep(5) + try: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance") + except Exception as e: + logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}") + return [] + + tabs_info = [] + for context in browser.contexts: + for page in context.pages: + try: + # Set timeout for each page + page.set_default_timeout(timeout_ms) + + # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue + page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete + title = page.title() + url = page.url + tabs_info.append({'title': title, 'url': url}) + logger.info(f"[OPEN_TABS_INFO] Tab info: '{title}' -> {url}") + except TimeoutError: + # If page loading times out, catch the exception and store the current information in the list + logger.warning(f"[OPEN_TABS_INFO] Tab load timeout for URL: {page.url}") + tabs_info.append({'title': 'Load timeout', 'url': page.url}) + except Exception as e: + # Catch other potential exceptions that might occur while reading the page title + logger.error(f'[OPEN_TABS_INFO] Error reading tab info: {e}') + tabs_info.append({'title': 'Error encountered', 'url': page.url}) + + browser.close() + logger.info(f"[OPEN_TABS_INFO] Successfully retrieved info for {len(tabs_info)} tabs") + return tabs_info + + except Exception as e: + logger.error(f"[OPEN_TABS_INFO] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[OPEN_TABS_INFO] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[OPEN_TABS_INFO] Retrying in 3 seconds...") + time.sleep(3) + else: + logger.error(f"[OPEN_TABS_INFO] All {max_retries} attempts failed. Returning empty list.") return [] - tabs_info = [] - for context in browser.contexts: - for page in context.pages: - try: - # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue - page.wait_for_load_state('networkidle') # Wait for the 'load' event to complete - title = page.title() - url = page.url - tabs_info.append({'title': title, 'url': url}) - except TimeoutError: - # If page loading times out, catch the exception and store the current information in the list - tabs_info.append({'title': 'Load timeout', 'url': page.url}) - except Exception as e: - # Catch other potential exceptions that might occur while reading the page title - print(f'Error: {e}') - tabs_info.append({'title': 'Error encountered', 'url': page.url}) - - browser.close() - return tabs_info + # This should never be reached, but just in case + return [] def get_active_url_from_accessTree(env, config): @@ -727,37 +793,79 @@ def get_active_tab_info(env, config: Dict[str, str]): if active_tab_url is None: logger.error("Failed to get the url of active tab") return None + + logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}") + host = env.vm_ip port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file remote_debugging_url = f"http://{host}:{port}" - with sync_playwright() as p: - # connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed + + # Configuration for retry and timeout + max_retries = 2 + timeout_ms = 60000 # 60 seconds for active tab + + for attempt in range(max_retries): try: - browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1}/{max_retries}") + + with sync_playwright() as p: + # connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed + try: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance") + except Exception as e: + logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}") + return None + + active_tab_info = {} + # go to the target URL page + page = browser.new_page() + + # Set longer timeout for navigation + page.set_default_timeout(timeout_ms) + + try: + logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}") + page.goto(active_tab_url, wait_until='networkidle', timeout=timeout_ms) + page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete + + active_tab_info = { + 'title': page.title(), + 'url': page.url, + 'content': page.content() # get the HTML content of the page + } + + logger.info(f"[ACTIVE_TAB_INFO] Successfully loaded page. Title: '{active_tab_info['title']}'") + logger.info(f"[ACTIVE_TAB_INFO] Current URL: '{active_tab_info['url']}'") + + except TimeoutError: + logger.warning(f"[ACTIVE_TAB_INFO] Page load timeout for URL: {active_tab_url}") + active_tab_info = { + 'title': 'Load timeout', + 'url': page.url, + 'content': page.content() + } + except Exception as e: + logger.error(f"[ACTIVE_TAB_INFO] Failed to go to the target URL page: {e}") + return None + + browser.close() + return active_tab_info + except Exception as e: - return None + logger.error(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[ACTIVE_TAB_INFO] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[ACTIVE_TAB_INFO] Retrying in 3 seconds...") + time.sleep(3) + else: + logger.error(f"[ACTIVE_TAB_INFO] All {max_retries} attempts failed.") + return None - active_tab_info = {} - # go to the target URL page - page = browser.new_page() - try: - page.goto(active_tab_url) - except: - logger.error("Failed to go to the target URL page") - return None - page.wait_for_load_state('load') # Wait for the 'load' event to complete - active_tab_info = { - 'title': page.title(), - 'url': page.url, - 'content': page.content() # get the HTML content of the page - } - - browser.close() - # print("active_tab_title: {}".format(active_tab_info.get('title', 'None'))) - # print("active_tab_url: {}".format(active_tab_info.get('url', 'None'))) - # print("active_tab_content: {}".format(active_tab_info.get('content', 'None'))) - return active_tab_info + # This should never be reached, but just in case + return None def get_pdf_from_url(env, config: Dict[str, str]) -> str: @@ -766,41 +874,110 @@ def get_pdf_from_url(env, config: Dict[str, str]) -> str: """ _url = config["path"] _path = os.path.join(env.cache_dir, config["dest"]) + + # Add logging for debugging + logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}") + logger.info(f"[PDF_FROM_URL] Target path: {_path}") host = env.vm_ip port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file server_port = env.server_port remote_debugging_url = f"http://{host}:{port}" - - with sync_playwright() as p: + + # Configuration for retry and timeout + max_retries = 3 + timeout_ms = 60000 # Increase timeout to 60 seconds + + for attempt in range(max_retries): try: - browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PDF_FROM_URL] Attempt {attempt + 1}/{max_retries}") + + with sync_playwright() as p: + try: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}") + logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...") + + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + time.sleep(5) + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance") + + page = browser.new_page() + + # Set longer timeout for navigation + page.set_default_timeout(timeout_ms) + + logger.info(f"[PDF_FROM_URL] Navigating to URL: {_url}") + page.goto(_url, wait_until='networkidle', timeout=timeout_ms) + + # Wait for page to be fully loaded + logger.info(f"[PDF_FROM_URL] Waiting for page to be fully loaded...") + page.wait_for_load_state('networkidle', timeout=timeout_ms) + + # Additional wait to ensure all content is rendered + time.sleep(3) + + logger.info(f"[PDF_FROM_URL] Page loaded successfully. Title: '{page.title()}'") + logger.info(f"[PDF_FROM_URL] Current URL: '{page.url}'") + + # Generate PDF + logger.info(f"[PDF_FROM_URL] Generating PDF...") + page.pdf(path=_path) + + logger.info(f"[PDF_FROM_URL] PDF generated successfully at: {_path}") + browser.close() + + # Verify PDF file was created + if os.path.exists(_path): + file_size = os.path.getsize(_path) + logger.info(f"[PDF_FROM_URL] PDF file created successfully. Size: {file_size} bytes") + return _path + else: + logger.error(f"[PDF_FROM_URL] PDF file was not created at expected path: {_path}") + raise FileNotFoundError(f"PDF file was not created at {_path}") + except Exception as e: - # If the connection fails, start a new browser instance - platform.machine() - if "arm" in platform.machine(): - # start a new browser instance if the connection fails - payload = json.dumps({"command": [ - "chromium", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.error(f"[PDF_FROM_URL] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[PDF_FROM_URL] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[PDF_FROM_URL] Retrying in 5 seconds...") + time.sleep(5) else: - payload = json.dumps({"command": [ - "google-chrome", - "--remote-debugging-port=1337" - ], "shell": False}) - - headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) - time.sleep(5) - browser = p.chromium.connect_over_cdp(remote_debugging_url) - - page = browser.new_page() - page.goto(_url) - page.pdf(path=_path) - browser.close() + logger.error(f"[PDF_FROM_URL] All {max_retries} attempts failed. Giving up.") + + # Create a placeholder file or return a default path + try: + # Create an empty PDF file as fallback + with open(_path, 'w') as f: + f.write("%PDF-1.4\n%EOF\n") + logger.warning(f"[PDF_FROM_URL] Created empty PDF file as fallback: {_path}") + return _path + except Exception as fallback_error: + logger.error(f"[PDF_FROM_URL] Failed to create fallback file: {fallback_error}") + # Return the path anyway, even if file creation failed + return _path + # This should never be reached, but just in case return _path @@ -811,41 +988,75 @@ def get_chrome_saved_address(env, config: Dict[str, str]): server_port = env.server_port remote_debugging_url = f"http://{host}:{port}" - with sync_playwright() as p: - # connect to remote Chrome instance + + # Configuration for retry and timeout + max_retries = 2 + timeout_ms = 30000 # 30 seconds for settings page + + for attempt in range(max_retries): try: - browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1}/{max_retries}") + + with sync_playwright() as p: + # connect to remote Chrome instance + try: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}") + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + time.sleep(5) + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance") + + page = browser.new_page() + + # Set longer timeout for navigation + page.set_default_timeout(timeout_ms) + + # Navigate to Chrome's settings page for autofill + logger.info(f"[CHROME_SAVED_ADDRESS] Navigating to Chrome settings page") + page.goto("chrome://settings/addresses", wait_until='networkidle', timeout=timeout_ms) + + # Wait for page to be fully loaded + page.wait_for_load_state('networkidle', timeout=timeout_ms) + + # Get the HTML content of the page + content = page.content() + + logger.info(f"[CHROME_SAVED_ADDRESS] Successfully retrieved settings page content") + browser.close() + + return content + except Exception as e: - # If the connection fails, start a new browser instance - platform.machine() - if "arm" in platform.machine(): - # start a new browser instance if the connection fails - payload = json.dumps({"command": [ - "chromium", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.error(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[CHROME_SAVED_ADDRESS] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[CHROME_SAVED_ADDRESS] Retrying in 3 seconds...") + time.sleep(3) else: - payload = json.dumps({"command": [ - "google-chrome", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.error(f"[CHROME_SAVED_ADDRESS] All {max_retries} attempts failed. Returning empty content.") + return "" - headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) - time.sleep(5) - browser = p.chromium.connect_over_cdp(remote_debugging_url) - - page = browser.new_page() - - # Navigate to Chrome's settings page for autofill - page.goto("chrome://settings/addresses") - - # Get the HTML content of the page - content = page.content() - - browser.close() - - return content + # This should never be reached, but just in case + return "" def get_shortcuts_on_desktop(env, config: Dict[str, str]): @@ -891,38 +1102,76 @@ def get_number_of_search_results(env, config: Dict[str, str]): server_port = env.server_port remote_debugging_url = f"http://{host}:{port}" - with sync_playwright() as p: + + # Configuration for retry and timeout + max_retries = 2 + timeout_ms = 45000 # 45 seconds for search results + + for attempt in range(max_retries): try: - browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[SEARCH_RESULTS] Attempt {attempt + 1}/{max_retries} for URL: {url}") + + with sync_playwright() as p: + try: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}") + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + time.sleep(5) + browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance") + + page = browser.new_page() + + # Set longer timeout for navigation + page.set_default_timeout(timeout_ms) + + logger.info(f"[SEARCH_RESULTS] Navigating to URL: {url}") + page.goto(url, wait_until='networkidle', timeout=timeout_ms) + + # Wait for page to be fully loaded + page.wait_for_load_state('networkidle', timeout=timeout_ms) + + search_results = page.query_selector_all(result_selector) + actual_count = len(search_results) + + logger.info(f"[SEARCH_RESULTS] Found {actual_count} search results") + browser.close() + + return actual_count + except Exception as e: - # If the connection fails, start a new browser instance - platform.machine() - if "arm" in platform.machine(): - # start a new browser instance if the connection fails - payload = json.dumps({"command": [ - "chromium", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.error(f"[SEARCH_RESULTS] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[SEARCH_RESULTS] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[SEARCH_RESULTS] Retrying in 3 seconds...") + time.sleep(3) else: - payload = json.dumps({"command": [ - "google-chrome", - "--remote-debugging-port=1337" - ], "shell": False}) + logger.error(f"[SEARCH_RESULTS] All {max_retries} attempts failed. Returning 0.") + return 0 - headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) - time.sleep(5) - browser = p.chromium.connect_over_cdp(remote_debugging_url) - page = browser.new_page() - page.goto(url) - search_results = page.query_selector_all(result_selector) - actual_count = len(search_results) - browser.close() - - return actual_count + # This should never be reached, but just in case + return 0 -def get_googledrive_file(env, config: Dict[str, Any]) -> str: +def get_googledrive_file(env, config: Dict[str, Any]) -> Any: """ Get the desired file from Google Drive based on config, return the downloaded local filepath. @args: keys in config dict settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml' @@ -959,18 +1208,21 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str: return _path if 'query' in config: - return get_single_file(config['query'], os.path.join(env.cache_dir, config['dest'])) + result = get_single_file(config['query'], os.path.join(env.cache_dir, config['dest'])) + return result elif 'path' in config: query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len( config['path']) - 1 else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])] - return get_single_file(query, os.path.join(env.cache_dir, config['dest'])) + result = get_single_file(query, os.path.join(env.cache_dir, config['dest'])) + return result elif 'query_list' in config: _path_list = [] assert len(config['query_list']) == len(config['dest']) for idx, query in enumerate(config['query_list']): dest = config['dest'][idx] - _path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest))) + result = get_single_file(query, os.path.join(env.cache_dir, dest)) + _path_list.append(result) return _path_list else: # path_list in config _path_list = [] @@ -981,7 +1233,8 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str: path) - 1 else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)] dest = config['dest'][idx] - _path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest))) + result = get_single_file(query, os.path.join(env.cache_dir, dest)) + _path_list.append(result) return _path_list @@ -1692,16 +1945,31 @@ def get_url_dashPart(env, config: Dict[str, str]): return None # extract the last dash-separated part of the URL, and delete all the characters after "id" - dash_part = active_tab_url.split("/")[config["partIndex"]] - if config["needDeleteId"]: + # Ensure partIndex is an integer + try: + part_index = int(config["partIndex"]) + except (ValueError, TypeError): + logger.error(f"[URL_DASH_PART] Invalid partIndex: {config.get('partIndex', 'None')}. Must be an integer.") + return None + + url_parts = active_tab_url.split("/") + if part_index >= len(url_parts): + logger.error(f"[URL_DASH_PART] partIndex {part_index} is out of range for URL with {len(url_parts)} parts") + return None + + dash_part = url_parts[part_index] + if config.get("needDeleteId", False): dash_part = dash_part.split("?")[0] - # print("active_tab_title: {}".format(active_tab_info.get('title', 'None'))) - # print("active_tab_url: {}".format(active_tab_info.get('url', 'None'))) - # print("active_tab_content: {}".format(active_tab_info.get('content', 'None'))) + + logger.info(f"[URL_DASH_PART] Extracted dash part: '{dash_part}' from URL: {active_tab_url}") + if config["returnType"] == "string": return dash_part elif config["returnType"] == "json": return {config["key"]: dash_part} + else: + logger.error(f"[URL_DASH_PART] Invalid returnType: {config.get('returnType', 'None')}. Must be 'string' or 'json'.") + return None def get_macys_product_url_parse(env, config: Dict[str, str]):