feat: enhance Chrome evaluator with improved retry logic and logging

- Implemented a retry mechanism for connecting to Chrome instances, allowing up to two connection attempts (three for PDF downloads) before giving up.
- Increased page navigation and load timeouts to 30-60 seconds, depending on the function, to improve reliability.
- Added detailed logging of connection attempts, page loading status, and error handling to aid debugging and improve the user experience.
- Preserved the existing logic while hardening error handling and operational robustness.

These changes improve the overall reliability and maintainability of the Chrome evaluator functions.
Author: yuanmengqi
Date: 2025-07-17 11:15:47 +00:00
Parent: 2c51950e73
Commit: 9d04624e41

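The hunks below repeat the same connect-retry-fallback pattern inline in each evaluator function. As a reading aid, here is a minimal sketch of the retry-and-log pattern the commit wraps around each Playwright CDP connection; in the actual diff the retry also covers the page work and the fallback Chrome launch, which this sketch omits, and the helper name `connect_with_retry` is an illustrative assumption, not part of the commit.

import logging
import time

logger = logging.getLogger(__name__)

def connect_with_retry(p, remote_debugging_url, max_retries=2, delay_s=3.0):
    """Attach to an already-running Chrome over CDP, retrying on failure.

    `p` is an active sync_playwright() handle. Returns a Browser, or None when
    every attempt fails, so callers can fall back to their per-function default.
    """
    for attempt in range(max_retries):
        try:
            logger.info(f"Attempt {attempt + 1}/{max_retries}: connecting to {remote_debugging_url}")
            return p.chromium.connect_over_cdp(remote_debugging_url)
        except Exception as e:
            logger.error(f"Attempt {attempt + 1} failed: {type(e).__name__}: {e}")
            if attempt < max_retries - 1:
                logger.info(f"Retrying in {delay_s} seconds...")
                time.sleep(delay_s)
    logger.error(f"All {max_retries} attempts failed")
    return None

# Usage (hypothetical):
#   with sync_playwright() as p:
#       browser = connect_with_retry(p, "http://127.0.0.1:1337")
#       if browser is None:
#           return {'title': 'Connection failed', 'url': url, 'content': ''}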

@@ -542,49 +542,82 @@ def get_page_info(env, config: Dict[str, str]):
url = config["url"]
remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 60000 # Increase timeout to 60 seconds
for attempt in range(max_retries):
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PAGE_INFO] Attempt {attempt + 1}/{max_retries} for URL: {url}")
with sync_playwright() as p:
# connect to remote Chrome instance
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[PAGE_INFO] Navigating to URL: {url}")
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
try:
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
title = page.title()
url = page.url
page_info = {'title': title, 'url': url, 'content': page.content()}
logger.info(f"[PAGE_INFO] Successfully loaded page. Title: '{title}'")
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
logger.warning(f"[PAGE_INFO] Page load timeout for URL: {url}")
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
logger.error(f'[PAGE_INFO] Error: {e}')
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
browser.close()
return page_info
except Exception as e:
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
logger.error(f"[PAGE_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[PAGE_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[PAGE_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
logger.error(f"[PAGE_INFO] All {max_retries} attempts failed. Returning error info.")
return {'title': 'Connection failed', 'url': url, 'content': ''}
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
page = browser.contexts[0].new_page()
page.goto(url)
try:
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('load') # Wait for the 'load' event to complete
title = page.title()
url = page.url
page_info = {'title': title, 'url': url, 'content': page.content()}
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
print(f'Error: {e}')
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
browser.close()
return page_info
# This should never be reached, but just in case
return {'title': 'Unknown error', 'url': url, 'content': ''}
def get_open_tabs_info(env, config: Dict[str, str]):
@@ -593,52 +626,85 @@ def get_open_tabs_info(env, config: Dict[str, str]):
server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 30000 # 30 seconds for tab info
for attempt in range(max_retries):
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
except Exception as e:
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
logger.info(f"[OPEN_TABS_INFO] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p:
# connect to remote Chrome instance
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
time.sleep(5)
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
except Exception as e:
headers = {"Content-Type": "application/json"}
requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
time.sleep(5)
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance")
except Exception as e:
logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}")
return []
tabs_info = []
for context in browser.contexts:
for page in context.pages:
try:
# Set timeout for each page
page.set_default_timeout(timeout_ms)
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
title = page.title()
url = page.url
tabs_info.append({'title': title, 'url': url})
logger.info(f"[OPEN_TABS_INFO] Tab info: '{title}' -> {url}")
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
logger.warning(f"[OPEN_TABS_INFO] Tab load timeout for URL: {page.url}")
tabs_info.append({'title': 'Load timeout', 'url': page.url})
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
logger.error(f'[OPEN_TABS_INFO] Error reading tab info: {e}')
tabs_info.append({'title': 'Error encountered', 'url': page.url})
browser.close()
logger.info(f"[OPEN_TABS_INFO] Successfully retrieved info for {len(tabs_info)} tabs")
return tabs_info
except Exception as e:
logger.error(f"[OPEN_TABS_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[OPEN_TABS_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[OPEN_TABS_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[OPEN_TABS_INFO] All {max_retries} attempts failed. Returning empty list.")
return []
tabs_info = []
for context in browser.contexts:
for page in context.pages:
try:
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('networkidle') # Wait for the 'load' event to complete
title = page.title()
url = page.url
tabs_info.append({'title': title, 'url': url})
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
tabs_info.append({'title': 'Load timeout', 'url': page.url})
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
print(f'Error: {e}')
tabs_info.append({'title': 'Error encountered', 'url': page.url})
browser.close()
return tabs_info
# This should never be reached, but just in case
return []
def get_active_url_from_accessTree(env, config):
@@ -727,37 +793,79 @@ def get_active_tab_info(env, config: Dict[str, str]):
if active_tab_url is None:
logger.error("Failed to get the url of active tab")
return None
logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}")
host = env.vm_ip
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 60000 # 60 seconds for active tab
for attempt in range(max_retries):
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p:
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance")
except Exception as e:
logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}")
return None
active_tab_info = {}
# go to the target URL page
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
try:
logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}")
page.goto(active_tab_url, wait_until='networkidle', timeout=timeout_ms)
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
active_tab_info = {
'title': page.title(),
'url': page.url,
'content': page.content() # get the HTML content of the page
}
logger.info(f"[ACTIVE_TAB_INFO] Successfully loaded page. Title: '{active_tab_info['title']}'")
logger.info(f"[ACTIVE_TAB_INFO] Current URL: '{active_tab_info['url']}'")
except TimeoutError:
logger.warning(f"[ACTIVE_TAB_INFO] Page load timeout for URL: {active_tab_url}")
active_tab_info = {
'title': 'Load timeout',
'url': page.url,
'content': page.content()
}
except Exception as e:
logger.error(f"[ACTIVE_TAB_INFO] Failed to go to the target URL page: {e}")
return None
browser.close()
return active_tab_info
except Exception as e:
return None
logger.error(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[ACTIVE_TAB_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[ACTIVE_TAB_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[ACTIVE_TAB_INFO] All {max_retries} attempts failed.")
return None
active_tab_info = {}
# go to the target URL page
page = browser.new_page()
try:
page.goto(active_tab_url)
except:
logger.error("Failed to go to the target URL page")
return None
page.wait_for_load_state('load') # Wait for the 'load' event to complete
active_tab_info = {
'title': page.title(),
'url': page.url,
'content': page.content() # get the HTML content of the page
}
browser.close()
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
return active_tab_info
# This should never be reached, but just in case
return None
def get_pdf_from_url(env, config: Dict[str, str]) -> str:
@@ -766,41 +874,110 @@ def get_pdf_from_url(env, config: Dict[str, str]) -> str:
"""
_url = config["path"]
_path = os.path.join(env.cache_dir, config["dest"])
# Add logging for debugging
logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}")
logger.info(f"[PDF_FROM_URL] Target path: {_path}")
host = env.vm_ip
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# Configuration for retry and timeout
max_retries = 3
timeout_ms = 60000 # Increase timeout to 60 seconds
for attempt in range(max_retries):
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PDF_FROM_URL] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p:
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}")
logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[PDF_FROM_URL] Navigating to URL: {_url}")
page.goto(_url, wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
logger.info(f"[PDF_FROM_URL] Waiting for page to be fully loaded...")
page.wait_for_load_state('networkidle', timeout=timeout_ms)
# Additional wait to ensure all content is rendered
time.sleep(3)
logger.info(f"[PDF_FROM_URL] Page loaded successfully. Title: '{page.title()}'")
logger.info(f"[PDF_FROM_URL] Current URL: '{page.url}'")
# Generate PDF
logger.info(f"[PDF_FROM_URL] Generating PDF...")
page.pdf(path=_path)
logger.info(f"[PDF_FROM_URL] PDF generated successfully at: {_path}")
browser.close()
# Verify PDF file was created
if os.path.exists(_path):
file_size = os.path.getsize(_path)
logger.info(f"[PDF_FROM_URL] PDF file created successfully. Size: {file_size} bytes")
return _path
else:
logger.error(f"[PDF_FROM_URL] PDF file was not created at expected path: {_path}")
raise FileNotFoundError(f"PDF file was not created at {_path}")
except Exception as e:
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
logger.error(f"[PDF_FROM_URL] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[PDF_FROM_URL] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[PDF_FROM_URL] Retrying in 5 seconds...")
time.sleep(5)
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
page = browser.new_page()
page.goto(_url)
page.pdf(path=_path)
browser.close()
logger.error(f"[PDF_FROM_URL] All {max_retries} attempts failed. Giving up.")
# Create a placeholder file or return a default path
try:
# Create an empty PDF file as fallback
with open(_path, 'w') as f:
f.write("%PDF-1.4\n%EOF\n")
logger.warning(f"[PDF_FROM_URL] Created empty PDF file as fallback: {_path}")
return _path
except Exception as fallback_error:
logger.error(f"[PDF_FROM_URL] Failed to create fallback file: {fallback_error}")
# Return the path anyway, even if file creation failed
return _path
# This should never be reached, but just in case
return _path
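Note that on total failure the hunk above writes a minimal placeholder PDF ("%PDF-1.4\n%EOF\n") and still returns the path, so callers cannot rely on the return value alone to know whether a real document was saved. A possible downstream check is sketched below; the helper name and the size threshold are assumptions, not part of the commit.

import os

def looks_like_real_pdf(path, min_size_bytes=1024):
    """Heuristic check that a file is a genuinely downloaded PDF rather than
    the tiny placeholder written by the fallback path (about 15 bytes)."""
    if not os.path.exists(path):
        return False
    if os.path.getsize(path) < min_size_bytes:
        return False
    with open(path, "rb") as f:
        return f.read(5) == b"%PDF-"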
@@ -811,41 +988,75 @@ def get_chrome_saved_address(env, config: Dict[str, str]):
server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 30000 # 30 seconds for settings page
for attempt in range(max_retries):
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p:
# connect to remote Chrome instance
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
# Navigate to Chrome's settings page for autofill
logger.info(f"[CHROME_SAVED_ADDRESS] Navigating to Chrome settings page")
page.goto("chrome://settings/addresses", wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
page.wait_for_load_state('networkidle', timeout=timeout_ms)
# Get the HTML content of the page
content = page.content()
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully retrieved settings page content")
browser.close()
return content
except Exception as e:
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
logger.error(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[CHROME_SAVED_ADDRESS] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[CHROME_SAVED_ADDRESS] Retrying in 3 seconds...")
time.sleep(3)
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
logger.error(f"[CHROME_SAVED_ADDRESS] All {max_retries} attempts failed. Returning empty content.")
return ""
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
page = browser.new_page()
# Navigate to Chrome's settings page for autofill
page.goto("chrome://settings/addresses")
# Get the HTML content of the page
content = page.content()
browser.close()
return content
# This should never be reached, but just in case
return ""
def get_shortcuts_on_desktop(env, config: Dict[str, str]):
@@ -891,38 +1102,76 @@ def get_number_of_search_results(env, config: Dict[str, str]):
server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 45000 # 45 seconds for search results
for attempt in range(max_retries):
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[SEARCH_RESULTS] Attempt {attempt + 1}/{max_retries} for URL: {url}")
with sync_playwright() as p:
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[SEARCH_RESULTS] Navigating to URL: {url}")
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
page.wait_for_load_state('networkidle', timeout=timeout_ms)
search_results = page.query_selector_all(result_selector)
actual_count = len(search_results)
logger.info(f"[SEARCH_RESULTS] Found {actual_count} search results")
browser.close()
return actual_count
except Exception as e:
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
logger.error(f"[SEARCH_RESULTS] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[SEARCH_RESULTS] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[SEARCH_RESULTS] Retrying in 3 seconds...")
time.sleep(3)
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
logger.error(f"[SEARCH_RESULTS] All {max_retries} attempts failed. Returning 0.")
return 0
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
page = browser.new_page()
page.goto(url)
search_results = page.query_selector_all(result_selector)
actual_count = len(search_results)
browser.close()
return actual_count
# This should never be reached, but just in case
return 0
def get_googledrive_file(env, config: Dict[str, Any]) -> str:
def get_googledrive_file(env, config: Dict[str, Any]) -> Any:
""" Get the desired file from Google Drive based on config, return the downloaded local filepath.
@args: keys in config dict
settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml'
@@ -959,18 +1208,21 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
return _path
if 'query' in config:
return get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
result = get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
return result
elif 'path' in config:
query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(
config['path']) - 1
else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
return get_single_file(query, os.path.join(env.cache_dir, config['dest']))
result = get_single_file(query, os.path.join(env.cache_dir, config['dest']))
return result
elif 'query_list' in config:
_path_list = []
assert len(config['query_list']) == len(config['dest'])
for idx, query in enumerate(config['query_list']):
dest = config['dest'][idx]
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
result = get_single_file(query, os.path.join(env.cache_dir, dest))
_path_list.append(result)
return _path_list
else: # path_list in config
_path_list = []
@@ -981,7 +1233,8 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
path) - 1
else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
dest = config['dest'][idx]
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
result = get_single_file(query, os.path.join(env.cache_dir, dest))
_path_list.append(result)
return _path_list
@@ -1692,16 +1945,31 @@ def get_url_dashPart(env, config: Dict[str, str]):
return None
# extract the last dash-separated part of the URL, and delete all the characters after "id"
dash_part = active_tab_url.split("/")[config["partIndex"]]
if config["needDeleteId"]:
# Ensure partIndex is an integer
try:
part_index = int(config["partIndex"])
except (ValueError, TypeError):
logger.error(f"[URL_DASH_PART] Invalid partIndex: {config.get('partIndex', 'None')}. Must be an integer.")
return None
url_parts = active_tab_url.split("/")
if part_index >= len(url_parts):
logger.error(f"[URL_DASH_PART] partIndex {part_index} is out of range for URL with {len(url_parts)} parts")
return None
dash_part = url_parts[part_index]
if config.get("needDeleteId", False):
dash_part = dash_part.split("?")[0]
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
logger.info(f"[URL_DASH_PART] Extracted dash part: '{dash_part}' from URL: {active_tab_url}")
if config["returnType"] == "string":
return dash_part
elif config["returnType"] == "json":
return {config["key"]: dash_part}
else:
logger.error(f"[URL_DASH_PART] Invalid returnType: {config.get('returnType', 'None')}. Must be 'string' or 'json'.")
return None
def get_macys_product_url_parse(env, config: Dict[str, str]):
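For reference, the validated dash-part extraction added in the hunk above behaves roughly as follows when pulled out of the evaluator; the URL and index in the usage comment are made-up examples, and the returnType handling is omitted.

def extract_dash_part(url, part_index, need_delete_id=False):
    """Standalone mirror of the checks in get_url_dashPart: cast the index to
    int, bounds-check it against the '/'-separated URL parts, and optionally
    strip everything from the first '?' onward."""
    try:
        idx = int(part_index)
    except (ValueError, TypeError):
        return None
    parts = url.split("/")
    if idx >= len(parts):
        return None
    part = parts[idx]
    return part.split("?")[0] if need_delete_id else part

# extract_dash_part("https://example.com/shop/product/widget-123?id=456", 5, True)
# -> "widget-123"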