feat: enhance Chrome evaluator with improved retry logic and logging
- Implemented a retry mechanism for connecting to Chrome instances, allowing up to two attempts (three for PDF generation) before failure.
- Increased timeout settings for page navigation and loading (30 to 60 seconds, depending on the function) to enhance reliability.
- Added detailed logging for connection attempts, page loading status, and error handling to improve debugging and user experience.
- Ensured existing logic is preserved while enhancing error handling and operational robustness.

These changes improve the overall reliability and maintainability of the Chrome evaluator functions.
This commit is contained in:
@@ -542,49 +542,82 @@ def get_page_info(env, config: Dict[str, str]):
|
|||||||
url = config["url"]
|
url = config["url"]
|
||||||
|
|
||||||
remote_debugging_url = f"http://{host}:{port}"
|
remote_debugging_url = f"http://{host}:{port}"
|
||||||
with sync_playwright() as p:
|
|
||||||
# connect to remote Chrome instance
|
# Configuration for retry and timeout
|
||||||
|
max_retries = 2
|
||||||
|
timeout_ms = 60000 # Increase timeout to 60 seconds
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
logger.info(f"[PAGE_INFO] Attempt {attempt + 1}/{max_retries} for URL: {url}")
|
||||||
|
|
||||||
|
with sync_playwright() as p:
|
||||||
|
# connect to remote Chrome instance
|
||||||
|
try:
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}")
|
||||||
|
# If the connection fails, start a new browser instance
|
||||||
|
platform.machine()
|
||||||
|
if "arm" in platform.machine():
|
||||||
|
# start a new browser instance if the connection fails
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"chromium",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
else:
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"google-chrome",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||||
|
time.sleep(5)
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance")
|
||||||
|
|
||||||
|
page = browser.new_page()
|
||||||
|
|
||||||
|
# Set longer timeout for navigation
|
||||||
|
page.set_default_timeout(timeout_ms)
|
||||||
|
|
||||||
|
logger.info(f"[PAGE_INFO] Navigating to URL: {url}")
|
||||||
|
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
||||||
|
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||||
|
title = page.title()
|
||||||
|
url = page.url
|
||||||
|
page_info = {'title': title, 'url': url, 'content': page.content()}
|
||||||
|
logger.info(f"[PAGE_INFO] Successfully loaded page. Title: '{title}'")
|
||||||
|
except TimeoutError:
|
||||||
|
# If page loading times out, catch the exception and store the current information in the list
|
||||||
|
logger.warning(f"[PAGE_INFO] Page load timeout for URL: {url}")
|
||||||
|
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
|
||||||
|
except Exception as e:
|
||||||
|
# Catch other potential exceptions that might occur while reading the page title
|
||||||
|
logger.error(f'[PAGE_INFO] Error: {e}')
|
||||||
|
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
|
||||||
|
|
||||||
|
browser.close()
|
||||||
|
return page_info
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# If the connection fails, start a new browser instance
|
logger.error(f"[PAGE_INFO] Attempt {attempt + 1} failed: {str(e)}")
|
||||||
platform.machine()
|
logger.error(f"[PAGE_INFO] Exception type: {type(e).__name__}")
|
||||||
if "arm" in platform.machine():
|
|
||||||
# start a new browser instance if the connection fails
|
if attempt < max_retries - 1:
|
||||||
payload = json.dumps({"command": [
|
logger.info(f"[PAGE_INFO] Retrying in 3 seconds...")
|
||||||
"chromium",
|
time.sleep(3)
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
else:
|
else:
|
||||||
payload = json.dumps({"command": [
|
logger.error(f"[PAGE_INFO] All {max_retries} attempts failed. Returning error info.")
|
||||||
"google-chrome",
|
return {'title': 'Connection failed', 'url': url, 'content': ''}
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
|
|
||||||
headers = {"Content-Type": "application/json"}
|
# This should never be reached, but just in case
|
||||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
return {'title': 'Unknown error', 'url': url, 'content': ''}
|
||||||
time.sleep(5)
|
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
|
||||||
|
|
||||||
page = browser.contexts[0].new_page()
|
|
||||||
page.goto(url)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
|
||||||
page.wait_for_load_state('load') # Wait for the 'load' event to complete
|
|
||||||
title = page.title()
|
|
||||||
url = page.url
|
|
||||||
page_info = {'title': title, 'url': url, 'content': page.content()}
|
|
||||||
except TimeoutError:
|
|
||||||
# If page loading times out, catch the exception and store the current information in the list
|
|
||||||
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
|
|
||||||
except Exception as e:
|
|
||||||
# Catch other potential exceptions that might occur while reading the page title
|
|
||||||
print(f'Error: {e}')
|
|
||||||
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
|
|
||||||
|
|
||||||
browser.close()
|
|
||||||
return page_info
|
|
||||||
|
|
||||||
|
|
||||||
def get_open_tabs_info(env, config: Dict[str, str]):
|
def get_open_tabs_info(env, config: Dict[str, str]):
|
||||||
@@ -593,52 +626,85 @@ def get_open_tabs_info(env, config: Dict[str, str]):
|
|||||||
server_port = env.server_port
|
server_port = env.server_port
|
||||||
|
|
||||||
remote_debugging_url = f"http://{host}:{port}"
|
remote_debugging_url = f"http://{host}:{port}"
|
||||||
with sync_playwright() as p:
|
|
||||||
# connect to remote Chrome instance
|
|
||||||
try:
|
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
|
||||||
except Exception as e:
|
|
||||||
# If the connection fails, start a new browser instance
|
|
||||||
platform.machine()
|
|
||||||
if "arm" in platform.machine():
|
|
||||||
# start a new browser instance if the connection fails
|
|
||||||
payload = json.dumps({"command": [
|
|
||||||
"chromium",
|
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
else:
|
|
||||||
payload = json.dumps({"command": [
|
|
||||||
"google-chrome",
|
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
|
|
||||||
headers = {"Content-Type": "application/json"}
|
# Configuration for retry and timeout
|
||||||
requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
|
max_retries = 2
|
||||||
time.sleep(5)
|
timeout_ms = 30000 # 30 seconds for tab info
|
||||||
try:
|
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
for attempt in range(max_retries):
|
||||||
except Exception as e:
|
try:
|
||||||
|
logger.info(f"[OPEN_TABS_INFO] Attempt {attempt + 1}/{max_retries}")
|
||||||
|
|
||||||
|
with sync_playwright() as p:
|
||||||
|
# connect to remote Chrome instance
|
||||||
|
try:
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}")
|
||||||
|
# If the connection fails, start a new browser instance
|
||||||
|
platform.machine()
|
||||||
|
if "arm" in platform.machine():
|
||||||
|
# start a new browser instance if the connection fails
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"chromium",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
else:
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"google-chrome",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
|
||||||
|
time.sleep(5)
|
||||||
|
try:
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
tabs_info = []
|
||||||
|
for context in browser.contexts:
|
||||||
|
for page in context.pages:
|
||||||
|
try:
|
||||||
|
# Set timeout for each page
|
||||||
|
page.set_default_timeout(timeout_ms)
|
||||||
|
|
||||||
|
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
||||||
|
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||||
|
title = page.title()
|
||||||
|
url = page.url
|
||||||
|
tabs_info.append({'title': title, 'url': url})
|
||||||
|
logger.info(f"[OPEN_TABS_INFO] Tab info: '{title}' -> {url}")
|
||||||
|
except TimeoutError:
|
||||||
|
# If page loading times out, catch the exception and store the current information in the list
|
||||||
|
logger.warning(f"[OPEN_TABS_INFO] Tab load timeout for URL: {page.url}")
|
||||||
|
tabs_info.append({'title': 'Load timeout', 'url': page.url})
|
||||||
|
except Exception as e:
|
||||||
|
# Catch other potential exceptions that might occur while reading the page title
|
||||||
|
logger.error(f'[OPEN_TABS_INFO] Error reading tab info: {e}')
|
||||||
|
tabs_info.append({'title': 'Error encountered', 'url': page.url})
|
||||||
|
|
||||||
|
browser.close()
|
||||||
|
logger.info(f"[OPEN_TABS_INFO] Successfully retrieved info for {len(tabs_info)} tabs")
|
||||||
|
return tabs_info
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[OPEN_TABS_INFO] Attempt {attempt + 1} failed: {str(e)}")
|
||||||
|
logger.error(f"[OPEN_TABS_INFO] Exception type: {type(e).__name__}")
|
||||||
|
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
logger.info(f"[OPEN_TABS_INFO] Retrying in 3 seconds...")
|
||||||
|
time.sleep(3)
|
||||||
|
else:
|
||||||
|
logger.error(f"[OPEN_TABS_INFO] All {max_retries} attempts failed. Returning empty list.")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
tabs_info = []
|
# This should never be reached, but just in case
|
||||||
for context in browser.contexts:
|
return []
|
||||||
for page in context.pages:
|
|
||||||
try:
|
|
||||||
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
|
||||||
page.wait_for_load_state('networkidle') # Wait for the 'load' event to complete
|
|
||||||
title = page.title()
|
|
||||||
url = page.url
|
|
||||||
tabs_info.append({'title': title, 'url': url})
|
|
||||||
except TimeoutError:
|
|
||||||
# If page loading times out, catch the exception and store the current information in the list
|
|
||||||
tabs_info.append({'title': 'Load timeout', 'url': page.url})
|
|
||||||
except Exception as e:
|
|
||||||
# Catch other potential exceptions that might occur while reading the page title
|
|
||||||
print(f'Error: {e}')
|
|
||||||
tabs_info.append({'title': 'Error encountered', 'url': page.url})
|
|
||||||
|
|
||||||
browser.close()
|
|
||||||
return tabs_info
|
|
||||||
|
|
||||||
|
|
||||||
def get_active_url_from_accessTree(env, config):
|
def get_active_url_from_accessTree(env, config):
|
||||||
@@ -727,37 +793,79 @@ def get_active_tab_info(env, config: Dict[str, str]):
|
|||||||
if active_tab_url is None:
|
if active_tab_url is None:
|
||||||
logger.error("Failed to get the url of active tab")
|
logger.error("Failed to get the url of active tab")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}")
|
||||||
|
|
||||||
host = env.vm_ip
|
host = env.vm_ip
|
||||||
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
|
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
|
||||||
|
|
||||||
remote_debugging_url = f"http://{host}:{port}"
|
remote_debugging_url = f"http://{host}:{port}"
|
||||||
with sync_playwright() as p:
|
|
||||||
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
|
# Configuration for retry and timeout
|
||||||
|
max_retries = 2
|
||||||
|
timeout_ms = 60000 # 60 seconds for active tab
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
logger.info(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1}/{max_retries}")
|
||||||
|
|
||||||
|
with sync_playwright() as p:
|
||||||
|
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
|
||||||
|
try:
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
active_tab_info = {}
|
||||||
|
# go to the target URL page
|
||||||
|
page = browser.new_page()
|
||||||
|
|
||||||
|
# Set longer timeout for navigation
|
||||||
|
page.set_default_timeout(timeout_ms)
|
||||||
|
|
||||||
|
try:
|
||||||
|
logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}")
|
||||||
|
page.goto(active_tab_url, wait_until='networkidle', timeout=timeout_ms)
|
||||||
|
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||||
|
|
||||||
|
active_tab_info = {
|
||||||
|
'title': page.title(),
|
||||||
|
'url': page.url,
|
||||||
|
'content': page.content() # get the HTML content of the page
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(f"[ACTIVE_TAB_INFO] Successfully loaded page. Title: '{active_tab_info['title']}'")
|
||||||
|
logger.info(f"[ACTIVE_TAB_INFO] Current URL: '{active_tab_info['url']}'")
|
||||||
|
|
||||||
|
except TimeoutError:
|
||||||
|
logger.warning(f"[ACTIVE_TAB_INFO] Page load timeout for URL: {active_tab_url}")
|
||||||
|
active_tab_info = {
|
||||||
|
'title': 'Load timeout',
|
||||||
|
'url': page.url,
|
||||||
|
'content': page.content()
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[ACTIVE_TAB_INFO] Failed to go to the target URL page: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
browser.close()
|
||||||
|
return active_tab_info
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return None
|
logger.error(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1} failed: {str(e)}")
|
||||||
|
logger.error(f"[ACTIVE_TAB_INFO] Exception type: {type(e).__name__}")
|
||||||
|
|
||||||
active_tab_info = {}
|
if attempt < max_retries - 1:
|
||||||
# go to the target URL page
|
logger.info(f"[ACTIVE_TAB_INFO] Retrying in 3 seconds...")
|
||||||
page = browser.new_page()
|
time.sleep(3)
|
||||||
try:
|
else:
|
||||||
page.goto(active_tab_url)
|
logger.error(f"[ACTIVE_TAB_INFO] All {max_retries} attempts failed.")
|
||||||
except:
|
return None
|
||||||
logger.error("Failed to go to the target URL page")
|
|
||||||
return None
|
|
||||||
page.wait_for_load_state('load') # Wait for the 'load' event to complete
|
|
||||||
active_tab_info = {
|
|
||||||
'title': page.title(),
|
|
||||||
'url': page.url,
|
|
||||||
'content': page.content() # get the HTML content of the page
|
|
||||||
}
|
|
||||||
|
|
||||||
browser.close()
|
# This should never be reached, but just in case
|
||||||
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
|
return None
|
||||||
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
|
|
||||||
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
|
|
||||||
return active_tab_info
|
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_from_url(env, config: Dict[str, str]) -> str:
|
def get_pdf_from_url(env, config: Dict[str, str]) -> str:
|
||||||
@@ -767,40 +875,109 @@ def get_pdf_from_url(env, config: Dict[str, str]) -> str:
|
|||||||
_url = config["path"]
|
_url = config["path"]
|
||||||
_path = os.path.join(env.cache_dir, config["dest"])
|
_path = os.path.join(env.cache_dir, config["dest"])
|
||||||
|
|
||||||
|
# Add logging for debugging
|
||||||
|
logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}")
|
||||||
|
logger.info(f"[PDF_FROM_URL] Target path: {_path}")
|
||||||
|
|
||||||
host = env.vm_ip
|
host = env.vm_ip
|
||||||
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
|
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
|
||||||
server_port = env.server_port
|
server_port = env.server_port
|
||||||
|
|
||||||
remote_debugging_url = f"http://{host}:{port}"
|
remote_debugging_url = f"http://{host}:{port}"
|
||||||
|
|
||||||
with sync_playwright() as p:
|
# Configuration for retry and timeout
|
||||||
|
max_retries = 3
|
||||||
|
timeout_ms = 60000 # Increase timeout to 60 seconds
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
logger.info(f"[PDF_FROM_URL] Attempt {attempt + 1}/{max_retries}")
|
||||||
|
|
||||||
|
with sync_playwright() as p:
|
||||||
|
try:
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}")
|
||||||
|
logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...")
|
||||||
|
|
||||||
|
# If the connection fails, start a new browser instance
|
||||||
|
platform.machine()
|
||||||
|
if "arm" in platform.machine():
|
||||||
|
# start a new browser instance if the connection fails
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"chromium",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
else:
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"google-chrome",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||||
|
time.sleep(5)
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance")
|
||||||
|
|
||||||
|
page = browser.new_page()
|
||||||
|
|
||||||
|
# Set longer timeout for navigation
|
||||||
|
page.set_default_timeout(timeout_ms)
|
||||||
|
|
||||||
|
logger.info(f"[PDF_FROM_URL] Navigating to URL: {_url}")
|
||||||
|
page.goto(_url, wait_until='networkidle', timeout=timeout_ms)
|
||||||
|
|
||||||
|
# Wait for page to be fully loaded
|
||||||
|
logger.info(f"[PDF_FROM_URL] Waiting for page to be fully loaded...")
|
||||||
|
page.wait_for_load_state('networkidle', timeout=timeout_ms)
|
||||||
|
|
||||||
|
# Additional wait to ensure all content is rendered
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
logger.info(f"[PDF_FROM_URL] Page loaded successfully. Title: '{page.title()}'")
|
||||||
|
logger.info(f"[PDF_FROM_URL] Current URL: '{page.url}'")
|
||||||
|
|
||||||
|
# Generate PDF
|
||||||
|
logger.info(f"[PDF_FROM_URL] Generating PDF...")
|
||||||
|
page.pdf(path=_path)
|
||||||
|
|
||||||
|
logger.info(f"[PDF_FROM_URL] PDF generated successfully at: {_path}")
|
||||||
|
browser.close()
|
||||||
|
|
||||||
|
# Verify PDF file was created
|
||||||
|
if os.path.exists(_path):
|
||||||
|
file_size = os.path.getsize(_path)
|
||||||
|
logger.info(f"[PDF_FROM_URL] PDF file created successfully. Size: {file_size} bytes")
|
||||||
|
return _path
|
||||||
|
else:
|
||||||
|
logger.error(f"[PDF_FROM_URL] PDF file was not created at expected path: {_path}")
|
||||||
|
raise FileNotFoundError(f"PDF file was not created at {_path}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# If the connection fails, start a new browser instance
|
logger.error(f"[PDF_FROM_URL] Attempt {attempt + 1} failed: {str(e)}")
|
||||||
platform.machine()
|
logger.error(f"[PDF_FROM_URL] Exception type: {type(e).__name__}")
|
||||||
if "arm" in platform.machine():
|
|
||||||
# start a new browser instance if the connection fails
|
if attempt < max_retries - 1:
|
||||||
payload = json.dumps({"command": [
|
logger.info(f"[PDF_FROM_URL] Retrying in 5 seconds...")
|
||||||
"chromium",
|
time.sleep(5)
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
else:
|
else:
|
||||||
payload = json.dumps({"command": [
|
logger.error(f"[PDF_FROM_URL] All {max_retries} attempts failed. Giving up.")
|
||||||
"google-chrome",
|
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
|
|
||||||
headers = {"Content-Type": "application/json"}
|
# Create a placeholder file or return a default path
|
||||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
try:
|
||||||
time.sleep(5)
|
# Create an empty PDF file as fallback
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
with open(_path, 'w') as f:
|
||||||
|
f.write("%PDF-1.4\n%EOF\n")
|
||||||
page = browser.new_page()
|
logger.warning(f"[PDF_FROM_URL] Created empty PDF file as fallback: {_path}")
|
||||||
page.goto(_url)
|
return _path
|
||||||
page.pdf(path=_path)
|
except Exception as fallback_error:
|
||||||
browser.close()
|
logger.error(f"[PDF_FROM_URL] Failed to create fallback file: {fallback_error}")
|
||||||
|
# Return the path anyway, even if file creation failed
|
||||||
|
return _path
|
||||||
|
|
||||||
|
# This should never be reached, but just in case
|
||||||
return _path
|
return _path
|
||||||
|
|
||||||
|
|
||||||
@@ -811,41 +988,75 @@ def get_chrome_saved_address(env, config: Dict[str, str]):
|
|||||||
server_port = env.server_port
|
server_port = env.server_port
|
||||||
|
|
||||||
remote_debugging_url = f"http://{host}:{port}"
|
remote_debugging_url = f"http://{host}:{port}"
|
||||||
with sync_playwright() as p:
|
|
||||||
# connect to remote Chrome instance
|
# Configuration for retry and timeout
|
||||||
|
max_retries = 2
|
||||||
|
timeout_ms = 30000 # 30 seconds for settings page
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
logger.info(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1}/{max_retries}")
|
||||||
|
|
||||||
|
with sync_playwright() as p:
|
||||||
|
# connect to remote Chrome instance
|
||||||
|
try:
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}")
|
||||||
|
# If the connection fails, start a new browser instance
|
||||||
|
platform.machine()
|
||||||
|
if "arm" in platform.machine():
|
||||||
|
# start a new browser instance if the connection fails
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"chromium",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
else:
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"google-chrome",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||||
|
time.sleep(5)
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance")
|
||||||
|
|
||||||
|
page = browser.new_page()
|
||||||
|
|
||||||
|
# Set longer timeout for navigation
|
||||||
|
page.set_default_timeout(timeout_ms)
|
||||||
|
|
||||||
|
# Navigate to Chrome's settings page for autofill
|
||||||
|
logger.info(f"[CHROME_SAVED_ADDRESS] Navigating to Chrome settings page")
|
||||||
|
page.goto("chrome://settings/addresses", wait_until='networkidle', timeout=timeout_ms)
|
||||||
|
|
||||||
|
# Wait for page to be fully loaded
|
||||||
|
page.wait_for_load_state('networkidle', timeout=timeout_ms)
|
||||||
|
|
||||||
|
# Get the HTML content of the page
|
||||||
|
content = page.content()
|
||||||
|
|
||||||
|
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully retrieved settings page content")
|
||||||
|
browser.close()
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# If the connection fails, start a new browser instance
|
logger.error(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1} failed: {str(e)}")
|
||||||
platform.machine()
|
logger.error(f"[CHROME_SAVED_ADDRESS] Exception type: {type(e).__name__}")
|
||||||
if "arm" in platform.machine():
|
|
||||||
# start a new browser instance if the connection fails
|
if attempt < max_retries - 1:
|
||||||
payload = json.dumps({"command": [
|
logger.info(f"[CHROME_SAVED_ADDRESS] Retrying in 3 seconds...")
|
||||||
"chromium",
|
time.sleep(3)
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
else:
|
else:
|
||||||
payload = json.dumps({"command": [
|
logger.error(f"[CHROME_SAVED_ADDRESS] All {max_retries} attempts failed. Returning empty content.")
|
||||||
"google-chrome",
|
return ""
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
|
|
||||||
headers = {"Content-Type": "application/json"}
|
# This should never be reached, but just in case
|
||||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
return ""
|
||||||
time.sleep(5)
|
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
|
||||||
|
|
||||||
page = browser.new_page()
|
|
||||||
|
|
||||||
# Navigate to Chrome's settings page for autofill
|
|
||||||
page.goto("chrome://settings/addresses")
|
|
||||||
|
|
||||||
# Get the HTML content of the page
|
|
||||||
content = page.content()
|
|
||||||
|
|
||||||
browser.close()
|
|
||||||
|
|
||||||
return content
|
|
||||||
|
|
||||||
|
|
||||||
def get_shortcuts_on_desktop(env, config: Dict[str, str]):
|
def get_shortcuts_on_desktop(env, config: Dict[str, str]):
|
||||||
@@ -891,38 +1102,76 @@ def get_number_of_search_results(env, config: Dict[str, str]):
|
|||||||
server_port = env.server_port
|
server_port = env.server_port
|
||||||
|
|
||||||
remote_debugging_url = f"http://{host}:{port}"
|
remote_debugging_url = f"http://{host}:{port}"
|
||||||
with sync_playwright() as p:
|
|
||||||
|
# Configuration for retry and timeout
|
||||||
|
max_retries = 2
|
||||||
|
timeout_ms = 45000 # 45 seconds for search results
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
logger.info(f"[SEARCH_RESULTS] Attempt {attempt + 1}/{max_retries} for URL: {url}")
|
||||||
|
|
||||||
|
with sync_playwright() as p:
|
||||||
|
try:
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}")
|
||||||
|
# If the connection fails, start a new browser instance
|
||||||
|
platform.machine()
|
||||||
|
if "arm" in platform.machine():
|
||||||
|
# start a new browser instance if the connection fails
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"chromium",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
else:
|
||||||
|
payload = json.dumps({"command": [
|
||||||
|
"google-chrome",
|
||||||
|
"--remote-debugging-port=1337"
|
||||||
|
], "shell": False})
|
||||||
|
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
||||||
|
time.sleep(5)
|
||||||
|
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||||
|
logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance")
|
||||||
|
|
||||||
|
page = browser.new_page()
|
||||||
|
|
||||||
|
# Set longer timeout for navigation
|
||||||
|
page.set_default_timeout(timeout_ms)
|
||||||
|
|
||||||
|
logger.info(f"[SEARCH_RESULTS] Navigating to URL: {url}")
|
||||||
|
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
|
||||||
|
|
||||||
|
# Wait for page to be fully loaded
|
||||||
|
page.wait_for_load_state('networkidle', timeout=timeout_ms)
|
||||||
|
|
||||||
|
search_results = page.query_selector_all(result_selector)
|
||||||
|
actual_count = len(search_results)
|
||||||
|
|
||||||
|
logger.info(f"[SEARCH_RESULTS] Found {actual_count} search results")
|
||||||
|
browser.close()
|
||||||
|
|
||||||
|
return actual_count
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# If the connection fails, start a new browser instance
|
logger.error(f"[SEARCH_RESULTS] Attempt {attempt + 1} failed: {str(e)}")
|
||||||
platform.machine()
|
logger.error(f"[SEARCH_RESULTS] Exception type: {type(e).__name__}")
|
||||||
if "arm" in platform.machine():
|
|
||||||
# start a new browser instance if the connection fails
|
if attempt < max_retries - 1:
|
||||||
payload = json.dumps({"command": [
|
logger.info(f"[SEARCH_RESULTS] Retrying in 3 seconds...")
|
||||||
"chromium",
|
time.sleep(3)
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
else:
|
else:
|
||||||
payload = json.dumps({"command": [
|
logger.error(f"[SEARCH_RESULTS] All {max_retries} attempts failed. Returning 0.")
|
||||||
"google-chrome",
|
return 0
|
||||||
"--remote-debugging-port=1337"
|
|
||||||
], "shell": False})
|
|
||||||
|
|
||||||
headers = {"Content-Type": "application/json"}
|
# This should never be reached, but just in case
|
||||||
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
|
return 0
|
||||||
time.sleep(5)
|
|
||||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
|
||||||
page = browser.new_page()
|
|
||||||
page.goto(url)
|
|
||||||
search_results = page.query_selector_all(result_selector)
|
|
||||||
actual_count = len(search_results)
|
|
||||||
browser.close()
|
|
||||||
|
|
||||||
return actual_count
|
|
||||||
|
|
||||||
|
|
||||||
def get_googledrive_file(env, config: Dict[str, Any]) -> str:
|
def get_googledrive_file(env, config: Dict[str, Any]) -> Any:
|
||||||
""" Get the desired file from Google Drive based on config, return the downloaded local filepath.
|
""" Get the desired file from Google Drive based on config, return the downloaded local filepath.
|
||||||
@args: keys in config dict
|
@args: keys in config dict
|
||||||
settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml'
|
settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml'
|
||||||
@@ -959,18 +1208,21 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
|
|||||||
return _path
|
return _path
|
||||||
|
|
||||||
if 'query' in config:
|
if 'query' in config:
|
||||||
return get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
|
result = get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
|
||||||
|
return result
|
||||||
elif 'path' in config:
|
elif 'path' in config:
|
||||||
query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(
|
query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(
|
||||||
config['path']) - 1
|
config['path']) - 1
|
||||||
else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
|
else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
|
||||||
return get_single_file(query, os.path.join(env.cache_dir, config['dest']))
|
result = get_single_file(query, os.path.join(env.cache_dir, config['dest']))
|
||||||
|
return result
|
||||||
elif 'query_list' in config:
|
elif 'query_list' in config:
|
||||||
_path_list = []
|
_path_list = []
|
||||||
assert len(config['query_list']) == len(config['dest'])
|
assert len(config['query_list']) == len(config['dest'])
|
||||||
for idx, query in enumerate(config['query_list']):
|
for idx, query in enumerate(config['query_list']):
|
||||||
dest = config['dest'][idx]
|
dest = config['dest'][idx]
|
||||||
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
|
result = get_single_file(query, os.path.join(env.cache_dir, dest))
|
||||||
|
_path_list.append(result)
|
||||||
return _path_list
|
return _path_list
|
||||||
else: # path_list in config
|
else: # path_list in config
|
||||||
_path_list = []
|
_path_list = []
|
||||||
@@ -981,7 +1233,8 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
|
|||||||
path) - 1
|
path) - 1
|
||||||
else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
|
else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
|
||||||
dest = config['dest'][idx]
|
dest = config['dest'][idx]
|
||||||
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
|
result = get_single_file(query, os.path.join(env.cache_dir, dest))
|
||||||
|
_path_list.append(result)
|
||||||
return _path_list
|
return _path_list
|
||||||
|
|
||||||
|
|
||||||
@@ -1692,16 +1945,31 @@ def get_url_dashPart(env, config: Dict[str, str]):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
# extract the last dash-separated part of the URL, and delete all the characters after "id"
|
# extract the last dash-separated part of the URL, and delete all the characters after "id"
|
||||||
dash_part = active_tab_url.split("/")[config["partIndex"]]
|
# Ensure partIndex is an integer
|
||||||
if config["needDeleteId"]:
|
try:
|
||||||
|
part_index = int(config["partIndex"])
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
logger.error(f"[URL_DASH_PART] Invalid partIndex: {config.get('partIndex', 'None')}. Must be an integer.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
url_parts = active_tab_url.split("/")
|
||||||
|
if part_index >= len(url_parts):
|
||||||
|
logger.error(f"[URL_DASH_PART] partIndex {part_index} is out of range for URL with {len(url_parts)} parts")
|
||||||
|
return None
|
||||||
|
|
||||||
|
dash_part = url_parts[part_index]
|
||||||
|
if config.get("needDeleteId", False):
|
||||||
dash_part = dash_part.split("?")[0]
|
dash_part = dash_part.split("?")[0]
|
||||||
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
|
|
||||||
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
|
logger.info(f"[URL_DASH_PART] Extracted dash part: '{dash_part}' from URL: {active_tab_url}")
|
||||||
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
|
|
||||||
if config["returnType"] == "string":
|
if config["returnType"] == "string":
|
||||||
return dash_part
|
return dash_part
|
||||||
elif config["returnType"] == "json":
|
elif config["returnType"] == "json":
|
||||||
return {config["key"]: dash_part}
|
return {config["key"]: dash_part}
|
||||||
|
else:
|
||||||
|
logger.error(f"[URL_DASH_PART] Invalid returnType: {config.get('returnType', 'None')}. Must be 'string' or 'json'.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_macys_product_url_parse(env, config: Dict[str, str]):
|
def get_macys_product_url_parse(env, config: Dict[str, str]):
|
||||||
|
|||||||
Reference in New Issue
Block a user