feat: enhance Chrome evaluator with improved retry logic and logging
- Implemented a retry mechanism for connecting to Chrome instances, allowing up to two attempts before failure (three for PDF download).
- Increased timeout settings for page navigation and loading to enhance reliability.
- Added detailed logging for connection attempts, page loading status, and error handling to improve debugging and user experience.
- Ensured existing logic is preserved while enhancing error handling and operational robustness.

These changes improve the overall reliability and maintainability of the Chrome evaluator functions.
This commit is contained in:
@@ -542,49 +542,82 @@ def get_page_info(env, config: Dict[str, str]):
|
||||
url = config["url"]
|
||||
|
||||
remote_debugging_url = f"http://{host}:{port}"
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance
|
||||
|
||||
# Configuration for retry and timeout
|
||||
max_retries = 2
|
||||
timeout_ms = 60000 # Increase timeout to 60 seconds
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[PAGE_INFO] Attempt {attempt + 1}/{max_retries} for URL: {url}")
|
||||
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance")
|
||||
except Exception as e:
|
||||
logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}")
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance")
|
||||
|
||||
page = browser.new_page()
|
||||
|
||||
# Set longer timeout for navigation
|
||||
page.set_default_timeout(timeout_ms)
|
||||
|
||||
logger.info(f"[PAGE_INFO] Navigating to URL: {url}")
|
||||
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
|
||||
|
||||
try:
|
||||
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
||||
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||
title = page.title()
|
||||
url = page.url
|
||||
page_info = {'title': title, 'url': url, 'content': page.content()}
|
||||
logger.info(f"[PAGE_INFO] Successfully loaded page. Title: '{title}'")
|
||||
except TimeoutError:
|
||||
# If page loading times out, catch the exception and store the current information in the list
|
||||
logger.warning(f"[PAGE_INFO] Page load timeout for URL: {url}")
|
||||
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
|
||||
except Exception as e:
|
||||
# Catch other potential exceptions that might occur while reading the page title
|
||||
logger.error(f'[PAGE_INFO] Error: {e}')
|
||||
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
|
||||
|
||||
browser.close()
|
||||
return page_info
|
||||
|
||||
except Exception as e:
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.error(f"[PAGE_INFO] Attempt {attempt + 1} failed: {str(e)}")
|
||||
logger.error(f"[PAGE_INFO] Exception type: {type(e).__name__}")
|
||||
|
||||
if attempt < max_retries - 1:
|
||||
logger.info(f"[PAGE_INFO] Retrying in 3 seconds...")
|
||||
time.sleep(3)
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.error(f"[PAGE_INFO] All {max_retries} attempts failed. Returning error info.")
|
||||
return {'title': 'Connection failed', 'url': url, 'content': ''}
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
|
||||
page = browser.contexts[0].new_page()
|
||||
page.goto(url)
|
||||
|
||||
try:
|
||||
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
||||
page.wait_for_load_state('load') # Wait for the 'load' event to complete
|
||||
title = page.title()
|
||||
url = page.url
|
||||
page_info = {'title': title, 'url': url, 'content': page.content()}
|
||||
except TimeoutError:
|
||||
# If page loading times out, catch the exception and store the current information in the list
|
||||
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
|
||||
except Exception as e:
|
||||
# Catch other potential exceptions that might occur while reading the page title
|
||||
print(f'Error: {e}')
|
||||
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
|
||||
|
||||
browser.close()
|
||||
return page_info
|
||||
# This should never be reached, but just in case
|
||||
return {'title': 'Unknown error', 'url': url, 'content': ''}
|
||||
|
||||
|
||||
def get_open_tabs_info(env, config: Dict[str, str]):
|
||||
@@ -593,52 +626,85 @@ def get_open_tabs_info(env, config: Dict[str, str]):
|
||||
server_port = env.server_port
|
||||
|
||||
remote_debugging_url = f"http://{host}:{port}"
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance
|
||||
|
||||
# Configuration for retry and timeout
|
||||
max_retries = 2
|
||||
timeout_ms = 30000 # 30 seconds for tab info
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
except Exception as e:
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.info(f"[OPEN_TABS_INFO] Attempt {attempt + 1}/{max_retries}")
|
||||
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance")
|
||||
except Exception as e:
|
||||
logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}")
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
except Exception as e:
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance")
|
||||
except Exception as e:
|
||||
logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}")
|
||||
return []
|
||||
|
||||
tabs_info = []
|
||||
for context in browser.contexts:
|
||||
for page in context.pages:
|
||||
try:
|
||||
# Set timeout for each page
|
||||
page.set_default_timeout(timeout_ms)
|
||||
|
||||
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
||||
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||
title = page.title()
|
||||
url = page.url
|
||||
tabs_info.append({'title': title, 'url': url})
|
||||
logger.info(f"[OPEN_TABS_INFO] Tab info: '{title}' -> {url}")
|
||||
except TimeoutError:
|
||||
# If page loading times out, catch the exception and store the current information in the list
|
||||
logger.warning(f"[OPEN_TABS_INFO] Tab load timeout for URL: {page.url}")
|
||||
tabs_info.append({'title': 'Load timeout', 'url': page.url})
|
||||
except Exception as e:
|
||||
# Catch other potential exceptions that might occur while reading the page title
|
||||
logger.error(f'[OPEN_TABS_INFO] Error reading tab info: {e}')
|
||||
tabs_info.append({'title': 'Error encountered', 'url': page.url})
|
||||
|
||||
browser.close()
|
||||
logger.info(f"[OPEN_TABS_INFO] Successfully retrieved info for {len(tabs_info)} tabs")
|
||||
return tabs_info
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[OPEN_TABS_INFO] Attempt {attempt + 1} failed: {str(e)}")
|
||||
logger.error(f"[OPEN_TABS_INFO] Exception type: {type(e).__name__}")
|
||||
|
||||
if attempt < max_retries - 1:
|
||||
logger.info(f"[OPEN_TABS_INFO] Retrying in 3 seconds...")
|
||||
time.sleep(3)
|
||||
else:
|
||||
logger.error(f"[OPEN_TABS_INFO] All {max_retries} attempts failed. Returning empty list.")
|
||||
return []
|
||||
|
||||
tabs_info = []
|
||||
for context in browser.contexts:
|
||||
for page in context.pages:
|
||||
try:
|
||||
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
||||
page.wait_for_load_state('networkidle') # Wait for the 'load' event to complete
|
||||
title = page.title()
|
||||
url = page.url
|
||||
tabs_info.append({'title': title, 'url': url})
|
||||
except TimeoutError:
|
||||
# If page loading times out, catch the exception and store the current information in the list
|
||||
tabs_info.append({'title': 'Load timeout', 'url': page.url})
|
||||
except Exception as e:
|
||||
# Catch other potential exceptions that might occur while reading the page title
|
||||
print(f'Error: {e}')
|
||||
tabs_info.append({'title': 'Error encountered', 'url': page.url})
|
||||
|
||||
browser.close()
|
||||
return tabs_info
|
||||
# This should never be reached, but just in case
|
||||
return []
|
||||
|
||||
|
||||
def get_active_url_from_accessTree(env, config):
|
||||
@@ -727,37 +793,79 @@ def get_active_tab_info(env, config: Dict[str, str]):
|
||||
if active_tab_url is None:
|
||||
logger.error("Failed to get the url of active tab")
|
||||
return None
|
||||
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}")
|
||||
|
||||
host = env.vm_ip
|
||||
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
|
||||
|
||||
remote_debugging_url = f"http://{host}:{port}"
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
|
||||
|
||||
# Configuration for retry and timeout
|
||||
max_retries = 2
|
||||
timeout_ms = 60000 # 60 seconds for active tab
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1}/{max_retries}")
|
||||
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance")
|
||||
except Exception as e:
|
||||
logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}")
|
||||
return None
|
||||
|
||||
active_tab_info = {}
|
||||
# go to the target URL page
|
||||
page = browser.new_page()
|
||||
|
||||
# Set longer timeout for navigation
|
||||
page.set_default_timeout(timeout_ms)
|
||||
|
||||
try:
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}")
|
||||
page.goto(active_tab_url, wait_until='networkidle', timeout=timeout_ms)
|
||||
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||
|
||||
active_tab_info = {
|
||||
'title': page.title(),
|
||||
'url': page.url,
|
||||
'content': page.content() # get the HTML content of the page
|
||||
}
|
||||
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Successfully loaded page. Title: '{active_tab_info['title']}'")
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Current URL: '{active_tab_info['url']}'")
|
||||
|
||||
except TimeoutError:
|
||||
logger.warning(f"[ACTIVE_TAB_INFO] Page load timeout for URL: {active_tab_url}")
|
||||
active_tab_info = {
|
||||
'title': 'Load timeout',
|
||||
'url': page.url,
|
||||
'content': page.content()
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"[ACTIVE_TAB_INFO] Failed to go to the target URL page: {e}")
|
||||
return None
|
||||
|
||||
browser.close()
|
||||
return active_tab_info
|
||||
|
||||
except Exception as e:
|
||||
return None
|
||||
logger.error(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1} failed: {str(e)}")
|
||||
logger.error(f"[ACTIVE_TAB_INFO] Exception type: {type(e).__name__}")
|
||||
|
||||
if attempt < max_retries - 1:
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Retrying in 3 seconds...")
|
||||
time.sleep(3)
|
||||
else:
|
||||
logger.error(f"[ACTIVE_TAB_INFO] All {max_retries} attempts failed.")
|
||||
return None
|
||||
|
||||
active_tab_info = {}
|
||||
# go to the target URL page
|
||||
page = browser.new_page()
|
||||
try:
|
||||
page.goto(active_tab_url)
|
||||
except:
|
||||
logger.error("Failed to go to the target URL page")
|
||||
return None
|
||||
page.wait_for_load_state('load') # Wait for the 'load' event to complete
|
||||
active_tab_info = {
|
||||
'title': page.title(),
|
||||
'url': page.url,
|
||||
'content': page.content() # get the HTML content of the page
|
||||
}
|
||||
|
||||
browser.close()
|
||||
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
|
||||
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
|
||||
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
|
||||
return active_tab_info
|
||||
# This should never be reached, but just in case
|
||||
return None
|
||||
|
||||
|
||||
def get_pdf_from_url(env, config: Dict[str, str]) -> str:
|
||||
@@ -766,41 +874,110 @@ def get_pdf_from_url(env, config: Dict[str, str]) -> str:
|
||||
"""
|
||||
_url = config["path"]
|
||||
_path = os.path.join(env.cache_dir, config["dest"])
|
||||
|
||||
# Add logging for debugging
|
||||
logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}")
|
||||
logger.info(f"[PDF_FROM_URL] Target path: {_path}")
|
||||
|
||||
host = env.vm_ip
|
||||
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
|
||||
server_port = env.server_port
|
||||
|
||||
remote_debugging_url = f"http://{host}:{port}"
|
||||
|
||||
with sync_playwright() as p:
|
||||
|
||||
# Configuration for retry and timeout
|
||||
max_retries = 3
|
||||
timeout_ms = 60000 # Increase timeout to 60 seconds
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[PDF_FROM_URL] Attempt {attempt + 1}/{max_retries}")
|
||||
|
||||
with sync_playwright() as p:
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance")
|
||||
except Exception as e:
|
||||
logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}")
|
||||
logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...")
|
||||
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance")
|
||||
|
||||
page = browser.new_page()
|
||||
|
||||
# Set longer timeout for navigation
|
||||
page.set_default_timeout(timeout_ms)
|
||||
|
||||
logger.info(f"[PDF_FROM_URL] Navigating to URL: {_url}")
|
||||
page.goto(_url, wait_until='networkidle', timeout=timeout_ms)
|
||||
|
||||
# Wait for page to be fully loaded
|
||||
logger.info(f"[PDF_FROM_URL] Waiting for page to be fully loaded...")
|
||||
page.wait_for_load_state('networkidle', timeout=timeout_ms)
|
||||
|
||||
# Additional wait to ensure all content is rendered
|
||||
time.sleep(3)
|
||||
|
||||
logger.info(f"[PDF_FROM_URL] Page loaded successfully. Title: '{page.title()}'")
|
||||
logger.info(f"[PDF_FROM_URL] Current URL: '{page.url}'")
|
||||
|
||||
# Generate PDF
|
||||
logger.info(f"[PDF_FROM_URL] Generating PDF...")
|
||||
page.pdf(path=_path)
|
||||
|
||||
logger.info(f"[PDF_FROM_URL] PDF generated successfully at: {_path}")
|
||||
browser.close()
|
||||
|
||||
# Verify PDF file was created
|
||||
if os.path.exists(_path):
|
||||
file_size = os.path.getsize(_path)
|
||||
logger.info(f"[PDF_FROM_URL] PDF file created successfully. Size: {file_size} bytes")
|
||||
return _path
|
||||
else:
|
||||
logger.error(f"[PDF_FROM_URL] PDF file was not created at expected path: {_path}")
|
||||
raise FileNotFoundError(f"PDF file was not created at {_path}")
|
||||
|
||||
except Exception as e:
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.error(f"[PDF_FROM_URL] Attempt {attempt + 1} failed: {str(e)}")
|
||||
logger.error(f"[PDF_FROM_URL] Exception type: {type(e).__name__}")
|
||||
|
||||
if attempt < max_retries - 1:
|
||||
logger.info(f"[PDF_FROM_URL] Retrying in 5 seconds...")
|
||||
time.sleep(5)
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
|
||||
page = browser.new_page()
|
||||
page.goto(_url)
|
||||
page.pdf(path=_path)
|
||||
browser.close()
|
||||
logger.error(f"[PDF_FROM_URL] All {max_retries} attempts failed. Giving up.")
|
||||
|
||||
# Create a placeholder file or return a default path
|
||||
try:
|
||||
# Create an empty PDF file as fallback
|
||||
with open(_path, 'w') as f:
|
||||
f.write("%PDF-1.4\n%EOF\n")
|
||||
logger.warning(f"[PDF_FROM_URL] Created empty PDF file as fallback: {_path}")
|
||||
return _path
|
||||
except Exception as fallback_error:
|
||||
logger.error(f"[PDF_FROM_URL] Failed to create fallback file: {fallback_error}")
|
||||
# Return the path anyway, even if file creation failed
|
||||
return _path
|
||||
|
||||
# This should never be reached, but just in case
|
||||
return _path
|
||||
|
||||
|
||||
@@ -811,41 +988,75 @@ def get_chrome_saved_address(env, config: Dict[str, str]):
|
||||
server_port = env.server_port
|
||||
|
||||
remote_debugging_url = f"http://{host}:{port}"
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance
|
||||
|
||||
# Configuration for retry and timeout
|
||||
max_retries = 2
|
||||
timeout_ms = 30000 # 30 seconds for settings page
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1}/{max_retries}")
|
||||
|
||||
with sync_playwright() as p:
|
||||
# connect to remote Chrome instance
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance")
|
||||
except Exception as e:
|
||||
logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}")
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance")
|
||||
|
||||
page = browser.new_page()
|
||||
|
||||
# Set longer timeout for navigation
|
||||
page.set_default_timeout(timeout_ms)
|
||||
|
||||
# Navigate to Chrome's settings page for autofill
|
||||
logger.info(f"[CHROME_SAVED_ADDRESS] Navigating to Chrome settings page")
|
||||
page.goto("chrome://settings/addresses", wait_until='networkidle', timeout=timeout_ms)
|
||||
|
||||
# Wait for page to be fully loaded
|
||||
page.wait_for_load_state('networkidle', timeout=timeout_ms)
|
||||
|
||||
# Get the HTML content of the page
|
||||
content = page.content()
|
||||
|
||||
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully retrieved settings page content")
|
||||
browser.close()
|
||||
|
||||
return content
|
||||
|
||||
except Exception as e:
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.error(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1} failed: {str(e)}")
|
||||
logger.error(f"[CHROME_SAVED_ADDRESS] Exception type: {type(e).__name__}")
|
||||
|
||||
if attempt < max_retries - 1:
|
||||
logger.info(f"[CHROME_SAVED_ADDRESS] Retrying in 3 seconds...")
|
||||
time.sleep(3)
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.error(f"[CHROME_SAVED_ADDRESS] All {max_retries} attempts failed. Returning empty content.")
|
||||
return ""
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
|
||||
page = browser.new_page()
|
||||
|
||||
# Navigate to Chrome's settings page for autofill
|
||||
page.goto("chrome://settings/addresses")
|
||||
|
||||
# Get the HTML content of the page
|
||||
content = page.content()
|
||||
|
||||
browser.close()
|
||||
|
||||
return content
|
||||
# This should never be reached, but just in case
|
||||
return ""
|
||||
|
||||
|
||||
def get_shortcuts_on_desktop(env, config: Dict[str, str]):
|
||||
@@ -891,38 +1102,76 @@ def get_number_of_search_results(env, config: Dict[str, str]):
|
||||
server_port = env.server_port
|
||||
|
||||
remote_debugging_url = f"http://{host}:{port}"
|
||||
with sync_playwright() as p:
|
||||
|
||||
# Configuration for retry and timeout
|
||||
max_retries = 2
|
||||
timeout_ms = 45000 # 45 seconds for search results
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[SEARCH_RESULTS] Attempt {attempt + 1}/{max_retries} for URL: {url}")
|
||||
|
||||
with sync_playwright() as p:
|
||||
try:
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance")
|
||||
except Exception as e:
|
||||
logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}")
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance")
|
||||
|
||||
page = browser.new_page()
|
||||
|
||||
# Set longer timeout for navigation
|
||||
page.set_default_timeout(timeout_ms)
|
||||
|
||||
logger.info(f"[SEARCH_RESULTS] Navigating to URL: {url}")
|
||||
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
|
||||
|
||||
# Wait for page to be fully loaded
|
||||
page.wait_for_load_state('networkidle', timeout=timeout_ms)
|
||||
|
||||
search_results = page.query_selector_all(result_selector)
|
||||
actual_count = len(search_results)
|
||||
|
||||
logger.info(f"[SEARCH_RESULTS] Found {actual_count} search results")
|
||||
browser.close()
|
||||
|
||||
return actual_count
|
||||
|
||||
except Exception as e:
|
||||
# If the connection fails, start a new browser instance
|
||||
platform.machine()
|
||||
if "arm" in platform.machine():
|
||||
# start a new browser instance if the connection fails
|
||||
payload = json.dumps({"command": [
|
||||
"chromium",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.error(f"[SEARCH_RESULTS] Attempt {attempt + 1} failed: {str(e)}")
|
||||
logger.error(f"[SEARCH_RESULTS] Exception type: {type(e).__name__}")
|
||||
|
||||
if attempt < max_retries - 1:
|
||||
logger.info(f"[SEARCH_RESULTS] Retrying in 3 seconds...")
|
||||
time.sleep(3)
|
||||
else:
|
||||
payload = json.dumps({"command": [
|
||||
"google-chrome",
|
||||
"--remote-debugging-port=1337"
|
||||
], "shell": False})
|
||||
logger.error(f"[SEARCH_RESULTS] All {max_retries} attempts failed. Returning 0.")
|
||||
return 0
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||
time.sleep(5)
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
page = browser.new_page()
|
||||
page.goto(url)
|
||||
search_results = page.query_selector_all(result_selector)
|
||||
actual_count = len(search_results)
|
||||
browser.close()
|
||||
|
||||
return actual_count
|
||||
# This should never be reached, but just in case
|
||||
return 0
|
||||
|
||||
|
||||
def get_googledrive_file(env, config: Dict[str, Any]) -> str:
|
||||
def get_googledrive_file(env, config: Dict[str, Any]) -> Any:
|
||||
""" Get the desired file from Google Drive based on config, return the downloaded local filepath.
|
||||
@args: keys in config dict
|
||||
settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml'
|
||||
@@ -959,18 +1208,21 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
|
||||
return _path
|
||||
|
||||
if 'query' in config:
|
||||
return get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
|
||||
result = get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
|
||||
return result
|
||||
elif 'path' in config:
|
||||
query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(
|
||||
config['path']) - 1
|
||||
else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
|
||||
return get_single_file(query, os.path.join(env.cache_dir, config['dest']))
|
||||
result = get_single_file(query, os.path.join(env.cache_dir, config['dest']))
|
||||
return result
|
||||
elif 'query_list' in config:
|
||||
_path_list = []
|
||||
assert len(config['query_list']) == len(config['dest'])
|
||||
for idx, query in enumerate(config['query_list']):
|
||||
dest = config['dest'][idx]
|
||||
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
|
||||
result = get_single_file(query, os.path.join(env.cache_dir, dest))
|
||||
_path_list.append(result)
|
||||
return _path_list
|
||||
else: # path_list in config
|
||||
_path_list = []
|
||||
@@ -981,7 +1233,8 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
|
||||
path) - 1
|
||||
else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
|
||||
dest = config['dest'][idx]
|
||||
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
|
||||
result = get_single_file(query, os.path.join(env.cache_dir, dest))
|
||||
_path_list.append(result)
|
||||
return _path_list
|
||||
|
||||
|
||||
@@ -1692,16 +1945,31 @@ def get_url_dashPart(env, config: Dict[str, str]):
|
||||
return None
|
||||
|
||||
# extract the last dash-separated part of the URL, and delete all the characters after "id"
|
||||
dash_part = active_tab_url.split("/")[config["partIndex"]]
|
||||
if config["needDeleteId"]:
|
||||
# Ensure partIndex is an integer
|
||||
try:
|
||||
part_index = int(config["partIndex"])
|
||||
except (ValueError, TypeError):
|
||||
logger.error(f"[URL_DASH_PART] Invalid partIndex: {config.get('partIndex', 'None')}. Must be an integer.")
|
||||
return None
|
||||
|
||||
url_parts = active_tab_url.split("/")
|
||||
if part_index >= len(url_parts):
|
||||
logger.error(f"[URL_DASH_PART] partIndex {part_index} is out of range for URL with {len(url_parts)} parts")
|
||||
return None
|
||||
|
||||
dash_part = url_parts[part_index]
|
||||
if config.get("needDeleteId", False):
|
||||
dash_part = dash_part.split("?")[0]
|
||||
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
|
||||
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
|
||||
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
|
||||
|
||||
logger.info(f"[URL_DASH_PART] Extracted dash part: '{dash_part}' from URL: {active_tab_url}")
|
||||
|
||||
if config["returnType"] == "string":
|
||||
return dash_part
|
||||
elif config["returnType"] == "json":
|
||||
return {config["key"]: dash_part}
|
||||
else:
|
||||
logger.error(f"[URL_DASH_PART] Invalid returnType: {config.get('returnType', 'None')}. Must be 'string' or 'json'.")
|
||||
return None
|
||||
|
||||
|
||||
def get_macys_product_url_parse(env, config: Dict[str, str]):
|
||||
|
||||
Reference in New Issue
Block a user