feat: enhance Chrome evaluator with improved retry logic and logging

- Implemented a retry mechanism for connecting to Chrome instances, allowing up to two attempts before reporting failure.
- Increased timeout settings for page navigation and loading to enhance reliability.
- Added detailed logging for connection attempts, page loading status, and error handling to improve debugging and user experience.
- Preserved the existing logic while strengthening error handling and operational robustness.

These changes improve the overall reliability and maintainability of the Chrome evaluator functions.
This commit is contained in:
yuanmengqi
2025-07-17 11:15:47 +00:00
parent 2c51950e73
commit 9d04624e41

View File

@@ -542,49 +542,82 @@ def get_page_info(env, config: Dict[str, str]):
url = config["url"] url = config["url"]
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance # Configuration for retry and timeout
max_retries = 2
timeout_ms = 60000 # Increase timeout to 60 seconds
for attempt in range(max_retries):
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) logger.info(f"[PAGE_INFO] Attempt {attempt + 1}/{max_retries} for URL: {url}")
with sync_playwright() as p:
# connect to remote Chrome instance
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[PAGE_INFO] Navigating to URL: {url}")
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
try:
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
title = page.title()
url = page.url
page_info = {'title': title, 'url': url, 'content': page.content()}
logger.info(f"[PAGE_INFO] Successfully loaded page. Title: '{title}'")
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
logger.warning(f"[PAGE_INFO] Page load timeout for URL: {url}")
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
logger.error(f'[PAGE_INFO] Error: {e}')
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
browser.close()
return page_info
except Exception as e: except Exception as e:
# If the connection fails, start a new browser instance logger.error(f"[PAGE_INFO] Attempt {attempt + 1} failed: {str(e)}")
platform.machine() logger.error(f"[PAGE_INFO] Exception type: {type(e).__name__}")
if "arm" in platform.machine():
# start a new browser instance if the connection fails if attempt < max_retries - 1:
payload = json.dumps({"command": [ logger.info(f"[PAGE_INFO] Retrying in 3 seconds...")
"chromium", time.sleep(3)
"--remote-debugging-port=1337"
], "shell": False})
else: else:
payload = json.dumps({"command": [ logger.error(f"[PAGE_INFO] All {max_retries} attempts failed. Returning error info.")
"google-chrome", return {'title': 'Connection failed', 'url': url, 'content': ''}
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"} # This should never be reached, but just in case
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) return {'title': 'Unknown error', 'url': url, 'content': ''}
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
page = browser.contexts[0].new_page()
page.goto(url)
try:
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('load') # Wait for the 'load' event to complete
title = page.title()
url = page.url
page_info = {'title': title, 'url': url, 'content': page.content()}
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
print(f'Error: {e}')
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
browser.close()
return page_info
def get_open_tabs_info(env, config: Dict[str, str]): def get_open_tabs_info(env, config: Dict[str, str]):
@@ -593,52 +626,85 @@ def get_open_tabs_info(env, config: Dict[str, str]):
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance # Configuration for retry and timeout
max_retries = 2
timeout_ms = 30000 # 30 seconds for tab info
for attempt in range(max_retries):
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) logger.info(f"[OPEN_TABS_INFO] Attempt {attempt + 1}/{max_retries}")
except Exception as e:
# If the connection fails, start a new browser instance with sync_playwright() as p:
platform.machine() # connect to remote Chrome instance
if "arm" in platform.machine(): try:
# start a new browser instance if the connection fails browser = p.chromium.connect_over_cdp(remote_debugging_url)
payload = json.dumps({"command": [ logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance")
"chromium", except Exception as e:
"--remote-debugging-port=1337" logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}")
], "shell": False}) # If the connection fails, start a new browser instance
else: platform.machine()
payload = json.dumps({"command": [ if "arm" in platform.machine():
"google-chrome", # start a new browser instance if the connection fails
"--remote-debugging-port=1337" payload = json.dumps({"command": [
], "shell": False}) "chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload) requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
time.sleep(5) time.sleep(5)
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
except Exception as e: logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance")
except Exception as e:
logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}")
return []
tabs_info = []
for context in browser.contexts:
for page in context.pages:
try:
# Set timeout for each page
page.set_default_timeout(timeout_ms)
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
title = page.title()
url = page.url
tabs_info.append({'title': title, 'url': url})
logger.info(f"[OPEN_TABS_INFO] Tab info: '{title}' -> {url}")
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
logger.warning(f"[OPEN_TABS_INFO] Tab load timeout for URL: {page.url}")
tabs_info.append({'title': 'Load timeout', 'url': page.url})
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
logger.error(f'[OPEN_TABS_INFO] Error reading tab info: {e}')
tabs_info.append({'title': 'Error encountered', 'url': page.url})
browser.close()
logger.info(f"[OPEN_TABS_INFO] Successfully retrieved info for {len(tabs_info)} tabs")
return tabs_info
except Exception as e:
logger.error(f"[OPEN_TABS_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[OPEN_TABS_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[OPEN_TABS_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[OPEN_TABS_INFO] All {max_retries} attempts failed. Returning empty list.")
return [] return []
tabs_info = [] # This should never be reached, but just in case
for context in browser.contexts: return []
for page in context.pages:
try:
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('networkidle') # Wait for the 'load' event to complete
title = page.title()
url = page.url
tabs_info.append({'title': title, 'url': url})
except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list
tabs_info.append({'title': 'Load timeout', 'url': page.url})
except Exception as e:
# Catch other potential exceptions that might occur while reading the page title
print(f'Error: {e}')
tabs_info.append({'title': 'Error encountered', 'url': page.url})
browser.close()
return tabs_info
def get_active_url_from_accessTree(env, config): def get_active_url_from_accessTree(env, config):
@@ -727,37 +793,79 @@ def get_active_tab_info(env, config: Dict[str, str]):
if active_tab_url is None: if active_tab_url is None:
logger.error("Failed to get the url of active tab") logger.error("Failed to get the url of active tab")
return None return None
logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}")
host = env.vm_ip host = env.vm_ip
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed # Configuration for retry and timeout
max_retries = 2
timeout_ms = 60000 # 60 seconds for active tab
for attempt in range(max_retries):
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) logger.info(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p:
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance")
except Exception as e:
logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}")
return None
active_tab_info = {}
# go to the target URL page
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
try:
logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}")
page.goto(active_tab_url, wait_until='networkidle', timeout=timeout_ms)
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
active_tab_info = {
'title': page.title(),
'url': page.url,
'content': page.content() # get the HTML content of the page
}
logger.info(f"[ACTIVE_TAB_INFO] Successfully loaded page. Title: '{active_tab_info['title']}'")
logger.info(f"[ACTIVE_TAB_INFO] Current URL: '{active_tab_info['url']}'")
except TimeoutError:
logger.warning(f"[ACTIVE_TAB_INFO] Page load timeout for URL: {active_tab_url}")
active_tab_info = {
'title': 'Load timeout',
'url': page.url,
'content': page.content()
}
except Exception as e:
logger.error(f"[ACTIVE_TAB_INFO] Failed to go to the target URL page: {e}")
return None
browser.close()
return active_tab_info
except Exception as e: except Exception as e:
return None logger.error(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[ACTIVE_TAB_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[ACTIVE_TAB_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[ACTIVE_TAB_INFO] All {max_retries} attempts failed.")
return None
active_tab_info = {} # This should never be reached, but just in case
# go to the target URL page return None
page = browser.new_page()
try:
page.goto(active_tab_url)
except:
logger.error("Failed to go to the target URL page")
return None
page.wait_for_load_state('load') # Wait for the 'load' event to complete
active_tab_info = {
'title': page.title(),
'url': page.url,
'content': page.content() # get the HTML content of the page
}
browser.close()
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
return active_tab_info
def get_pdf_from_url(env, config: Dict[str, str]) -> str: def get_pdf_from_url(env, config: Dict[str, str]) -> str:
@@ -766,41 +874,110 @@ def get_pdf_from_url(env, config: Dict[str, str]) -> str:
""" """
_url = config["path"] _url = config["path"]
_path = os.path.join(env.cache_dir, config["dest"]) _path = os.path.join(env.cache_dir, config["dest"])
# Add logging for debugging
logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}")
logger.info(f"[PDF_FROM_URL] Target path: {_path}")
host = env.vm_ip host = env.vm_ip
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p: # Configuration for retry and timeout
max_retries = 3
timeout_ms = 60000 # Increase timeout to 60 seconds
for attempt in range(max_retries):
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) logger.info(f"[PDF_FROM_URL] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p:
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}")
logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[PDF_FROM_URL] Navigating to URL: {_url}")
page.goto(_url, wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
logger.info(f"[PDF_FROM_URL] Waiting for page to be fully loaded...")
page.wait_for_load_state('networkidle', timeout=timeout_ms)
# Additional wait to ensure all content is rendered
time.sleep(3)
logger.info(f"[PDF_FROM_URL] Page loaded successfully. Title: '{page.title()}'")
logger.info(f"[PDF_FROM_URL] Current URL: '{page.url}'")
# Generate PDF
logger.info(f"[PDF_FROM_URL] Generating PDF...")
page.pdf(path=_path)
logger.info(f"[PDF_FROM_URL] PDF generated successfully at: {_path}")
browser.close()
# Verify PDF file was created
if os.path.exists(_path):
file_size = os.path.getsize(_path)
logger.info(f"[PDF_FROM_URL] PDF file created successfully. Size: {file_size} bytes")
return _path
else:
logger.error(f"[PDF_FROM_URL] PDF file was not created at expected path: {_path}")
raise FileNotFoundError(f"PDF file was not created at {_path}")
except Exception as e: except Exception as e:
# If the connection fails, start a new browser instance logger.error(f"[PDF_FROM_URL] Attempt {attempt + 1} failed: {str(e)}")
platform.machine() logger.error(f"[PDF_FROM_URL] Exception type: {type(e).__name__}")
if "arm" in platform.machine():
# start a new browser instance if the connection fails if attempt < max_retries - 1:
payload = json.dumps({"command": [ logger.info(f"[PDF_FROM_URL] Retrying in 5 seconds...")
"chromium", time.sleep(5)
"--remote-debugging-port=1337"
], "shell": False})
else: else:
payload = json.dumps({"command": [ logger.error(f"[PDF_FROM_URL] All {max_retries} attempts failed. Giving up.")
"google-chrome",
"--remote-debugging-port=1337" # Create a placeholder file or return a default path
], "shell": False}) try:
# Create an empty PDF file as fallback
headers = {"Content-Type": "application/json"} with open(_path, 'w') as f:
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) f.write("%PDF-1.4\n%EOF\n")
time.sleep(5) logger.warning(f"[PDF_FROM_URL] Created empty PDF file as fallback: {_path}")
browser = p.chromium.connect_over_cdp(remote_debugging_url) return _path
except Exception as fallback_error:
page = browser.new_page() logger.error(f"[PDF_FROM_URL] Failed to create fallback file: {fallback_error}")
page.goto(_url) # Return the path anyway, even if file creation failed
page.pdf(path=_path) return _path
browser.close()
# This should never be reached, but just in case
return _path return _path
@@ -811,41 +988,75 @@ def get_chrome_saved_address(env, config: Dict[str, str]):
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# connect to remote Chrome instance # Configuration for retry and timeout
max_retries = 2
timeout_ms = 30000 # 30 seconds for settings page
for attempt in range(max_retries):
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) logger.info(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p:
# connect to remote Chrome instance
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
# Navigate to Chrome's settings page for autofill
logger.info(f"[CHROME_SAVED_ADDRESS] Navigating to Chrome settings page")
page.goto("chrome://settings/addresses", wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
page.wait_for_load_state('networkidle', timeout=timeout_ms)
# Get the HTML content of the page
content = page.content()
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully retrieved settings page content")
browser.close()
return content
except Exception as e: except Exception as e:
# If the connection fails, start a new browser instance logger.error(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1} failed: {str(e)}")
platform.machine() logger.error(f"[CHROME_SAVED_ADDRESS] Exception type: {type(e).__name__}")
if "arm" in platform.machine():
# start a new browser instance if the connection fails if attempt < max_retries - 1:
payload = json.dumps({"command": [ logger.info(f"[CHROME_SAVED_ADDRESS] Retrying in 3 seconds...")
"chromium", time.sleep(3)
"--remote-debugging-port=1337"
], "shell": False})
else: else:
payload = json.dumps({"command": [ logger.error(f"[CHROME_SAVED_ADDRESS] All {max_retries} attempts failed. Returning empty content.")
"google-chrome", return ""
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"} # This should never be reached, but just in case
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) return ""
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
page = browser.new_page()
# Navigate to Chrome's settings page for autofill
page.goto("chrome://settings/addresses")
# Get the HTML content of the page
content = page.content()
browser.close()
return content
def get_shortcuts_on_desktop(env, config: Dict[str, str]): def get_shortcuts_on_desktop(env, config: Dict[str, str]):
@@ -891,38 +1102,76 @@ def get_number_of_search_results(env, config: Dict[str, str]):
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
with sync_playwright() as p:
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 45000 # 45 seconds for search results
for attempt in range(max_retries):
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) logger.info(f"[SEARCH_RESULTS] Attempt {attempt + 1}/{max_retries} for URL: {url}")
with sync_playwright() as p:
try:
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance")
except Exception as e:
logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance
platform.machine()
if "arm" in platform.machine():
# start a new browser instance if the connection fails
payload = json.dumps({"command": [
"chromium",
"--remote-debugging-port=1337"
], "shell": False})
else:
payload = json.dumps({"command": [
"google-chrome",
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"}
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance")
page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[SEARCH_RESULTS] Navigating to URL: {url}")
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
page.wait_for_load_state('networkidle', timeout=timeout_ms)
search_results = page.query_selector_all(result_selector)
actual_count = len(search_results)
logger.info(f"[SEARCH_RESULTS] Found {actual_count} search results")
browser.close()
return actual_count
except Exception as e: except Exception as e:
# If the connection fails, start a new browser instance logger.error(f"[SEARCH_RESULTS] Attempt {attempt + 1} failed: {str(e)}")
platform.machine() logger.error(f"[SEARCH_RESULTS] Exception type: {type(e).__name__}")
if "arm" in platform.machine():
# start a new browser instance if the connection fails if attempt < max_retries - 1:
payload = json.dumps({"command": [ logger.info(f"[SEARCH_RESULTS] Retrying in 3 seconds...")
"chromium", time.sleep(3)
"--remote-debugging-port=1337"
], "shell": False})
else: else:
payload = json.dumps({"command": [ logger.error(f"[SEARCH_RESULTS] All {max_retries} attempts failed. Returning 0.")
"google-chrome", return 0
"--remote-debugging-port=1337"
], "shell": False})
headers = {"Content-Type": "application/json"} # This should never be reached, but just in case
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) return 0
time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url)
page = browser.new_page()
page.goto(url)
search_results = page.query_selector_all(result_selector)
actual_count = len(search_results)
browser.close()
return actual_count
def get_googledrive_file(env, config: Dict[str, Any]) -> str: def get_googledrive_file(env, config: Dict[str, Any]) -> Any:
""" Get the desired file from Google Drive based on config, return the downloaded local filepath. """ Get the desired file from Google Drive based on config, return the downloaded local filepath.
@args: keys in config dict @args: keys in config dict
settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml' settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml'
@@ -959,18 +1208,21 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
return _path return _path
if 'query' in config: if 'query' in config:
return get_single_file(config['query'], os.path.join(env.cache_dir, config['dest'])) result = get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
return result
elif 'path' in config: elif 'path' in config:
query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len( query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(
config['path']) - 1 config['path']) - 1
else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])] else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
return get_single_file(query, os.path.join(env.cache_dir, config['dest'])) result = get_single_file(query, os.path.join(env.cache_dir, config['dest']))
return result
elif 'query_list' in config: elif 'query_list' in config:
_path_list = [] _path_list = []
assert len(config['query_list']) == len(config['dest']) assert len(config['query_list']) == len(config['dest'])
for idx, query in enumerate(config['query_list']): for idx, query in enumerate(config['query_list']):
dest = config['dest'][idx] dest = config['dest'][idx]
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest))) result = get_single_file(query, os.path.join(env.cache_dir, dest))
_path_list.append(result)
return _path_list return _path_list
else: # path_list in config else: # path_list in config
_path_list = [] _path_list = []
@@ -981,7 +1233,8 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
path) - 1 path) - 1
else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)] else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
dest = config['dest'][idx] dest = config['dest'][idx]
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest))) result = get_single_file(query, os.path.join(env.cache_dir, dest))
_path_list.append(result)
return _path_list return _path_list
@@ -1692,16 +1945,31 @@ def get_url_dashPart(env, config: Dict[str, str]):
return None return None
# extract the last dash-separated part of the URL, and delete all the characters after "id" # extract the last dash-separated part of the URL, and delete all the characters after "id"
dash_part = active_tab_url.split("/")[config["partIndex"]] # Ensure partIndex is an integer
if config["needDeleteId"]: try:
part_index = int(config["partIndex"])
except (ValueError, TypeError):
logger.error(f"[URL_DASH_PART] Invalid partIndex: {config.get('partIndex', 'None')}. Must be an integer.")
return None
url_parts = active_tab_url.split("/")
if part_index >= len(url_parts):
logger.error(f"[URL_DASH_PART] partIndex {part_index} is out of range for URL with {len(url_parts)} parts")
return None
dash_part = url_parts[part_index]
if config.get("needDeleteId", False):
dash_part = dash_part.split("?")[0] dash_part = dash_part.split("?")[0]
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None'))) logger.info(f"[URL_DASH_PART] Extracted dash part: '{dash_part}' from URL: {active_tab_url}")
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
if config["returnType"] == "string": if config["returnType"] == "string":
return dash_part return dash_part
elif config["returnType"] == "json": elif config["returnType"] == "json":
return {config["key"]: dash_part} return {config["key"]: dash_part}
else:
logger.error(f"[URL_DASH_PART] Invalid returnType: {config.get('returnType', 'None')}. Must be 'string' or 'json'.")
return None
def get_macys_product_url_parse(env, config: Dict[str, str]): def get_macys_product_url_parse(env, config: Dict[str, str]):