feat: enhance Chrome evaluator with improved retry logic and logging

- Implemented a retry mechanism for connecting to Chrome instances, allowing up to two attempts (three for PDF downloads from a URL) before failure.
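- Set explicit timeouts for page navigation and loading (30 to 60 seconds, depending on the function) and wait for the 'networkidle' state instead of the 'load' event to enhance reliability.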
- Added detailed logging for connection attempts, page loading status, and error handling to improve debugging and user experience.
- Ensured existing logic is preserved while enhancing error handling and operational robustness.

These changes improve the overall reliability and maintainability of the Chrome evaluator functions.
This commit is contained in:
yuanmengqi
2025-07-17 11:15:47 +00:00
parent 2c51950e73
commit 9d04624e41

View File

@@ -542,11 +542,22 @@ def get_page_info(env, config: Dict[str, str]):
url = config["url"] url = config["url"]
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 60000 # Increase timeout to 60 seconds
for attempt in range(max_retries):
try:
logger.info(f"[PAGE_INFO] Attempt {attempt + 1}/{max_retries} for URL: {url}")
with sync_playwright() as p: with sync_playwright() as p:
# connect to remote Chrome instance # connect to remote Chrome instance
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance")
except Exception as e: except Exception as e:
logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance # If the connection fails, start a new browser instance
platform.machine() platform.machine()
if "arm" in platform.machine(): if "arm" in platform.machine():
@@ -565,27 +576,49 @@ def get_page_info(env, config: Dict[str, str]):
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5) time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance")
page = browser.contexts[0].new_page() page = browser.new_page()
page.goto(url)
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[PAGE_INFO] Navigating to URL: {url}")
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
try: try:
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('load') # Wait for the 'load' event to complete page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
title = page.title() title = page.title()
url = page.url url = page.url
page_info = {'title': title, 'url': url, 'content': page.content()} page_info = {'title': title, 'url': url, 'content': page.content()}
logger.info(f"[PAGE_INFO] Successfully loaded page. Title: '{title}'")
except TimeoutError: except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list # If page loading times out, catch the exception and store the current information in the list
logger.warning(f"[PAGE_INFO] Page load timeout for URL: {url}")
page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()} page_info = {'title': 'Load timeout', 'url': page.url, 'content': page.content()}
except Exception as e: except Exception as e:
# Catch other potential exceptions that might occur while reading the page title # Catch other potential exceptions that might occur while reading the page title
print(f'Error: {e}') logger.error(f'[PAGE_INFO] Error: {e}')
page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()} page_info = {'title': 'Error encountered', 'url': page.url, 'content': page.content()}
browser.close() browser.close()
return page_info return page_info
except Exception as e:
logger.error(f"[PAGE_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[PAGE_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[PAGE_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[PAGE_INFO] All {max_retries} attempts failed. Returning error info.")
return {'title': 'Connection failed', 'url': url, 'content': ''}
# This should never be reached, but just in case
return {'title': 'Unknown error', 'url': url, 'content': ''}
def get_open_tabs_info(env, config: Dict[str, str]): def get_open_tabs_info(env, config: Dict[str, str]):
host = env.vm_ip host = env.vm_ip
@@ -593,11 +626,22 @@ def get_open_tabs_info(env, config: Dict[str, str]):
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 30000 # 30 seconds for tab info
for attempt in range(max_retries):
try:
logger.info(f"[OPEN_TABS_INFO] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p: with sync_playwright() as p:
# connect to remote Chrome instance # connect to remote Chrome instance
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance")
except Exception as e: except Exception as e:
logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance # If the connection fails, start a new browser instance
platform.machine() platform.machine()
if "arm" in platform.machine(): if "arm" in platform.machine():
@@ -617,29 +661,51 @@ def get_open_tabs_info(env, config: Dict[str, str]):
time.sleep(5) time.sleep(5)
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance")
except Exception as e: except Exception as e:
logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}")
return [] return []
tabs_info = [] tabs_info = []
for context in browser.contexts: for context in browser.contexts:
for page in context.pages: for page in context.pages:
try: try:
# Set timeout for each page
page.set_default_timeout(timeout_ms)
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
page.wait_for_load_state('networkidle') # Wait for the 'load' event to complete page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
title = page.title() title = page.title()
url = page.url url = page.url
tabs_info.append({'title': title, 'url': url}) tabs_info.append({'title': title, 'url': url})
logger.info(f"[OPEN_TABS_INFO] Tab info: '{title}' -> {url}")
except TimeoutError: except TimeoutError:
# If page loading times out, catch the exception and store the current information in the list # If page loading times out, catch the exception and store the current information in the list
logger.warning(f"[OPEN_TABS_INFO] Tab load timeout for URL: {page.url}")
tabs_info.append({'title': 'Load timeout', 'url': page.url}) tabs_info.append({'title': 'Load timeout', 'url': page.url})
except Exception as e: except Exception as e:
# Catch other potential exceptions that might occur while reading the page title # Catch other potential exceptions that might occur while reading the page title
print(f'Error: {e}') logger.error(f'[OPEN_TABS_INFO] Error reading tab info: {e}')
tabs_info.append({'title': 'Error encountered', 'url': page.url}) tabs_info.append({'title': 'Error encountered', 'url': page.url})
browser.close() browser.close()
logger.info(f"[OPEN_TABS_INFO] Successfully retrieved info for {len(tabs_info)} tabs")
return tabs_info return tabs_info
except Exception as e:
logger.error(f"[OPEN_TABS_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[OPEN_TABS_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[OPEN_TABS_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[OPEN_TABS_INFO] All {max_retries} attempts failed. Returning empty list.")
return []
# This should never be reached, but just in case
return []
def get_active_url_from_accessTree(env, config): def get_active_url_from_accessTree(env, config):
""" """
@@ -727,38 +793,80 @@ def get_active_tab_info(env, config: Dict[str, str]):
if active_tab_url is None: if active_tab_url is None:
logger.error("Failed to get the url of active tab") logger.error("Failed to get the url of active tab")
return None return None
logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}")
host = env.vm_ip host = env.vm_ip
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 60000 # 60 seconds for active tab
for attempt in range(max_retries):
try:
logger.info(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p: with sync_playwright() as p:
# connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed # connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance")
except Exception as e: except Exception as e:
logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}")
return None return None
active_tab_info = {} active_tab_info = {}
# go to the target URL page # go to the target URL page
page = browser.new_page() page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
try: try:
page.goto(active_tab_url) logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}")
except: page.goto(active_tab_url, wait_until='networkidle', timeout=timeout_ms)
logger.error("Failed to go to the target URL page") page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
return None
page.wait_for_load_state('load') # Wait for the 'load' event to complete
active_tab_info = { active_tab_info = {
'title': page.title(), 'title': page.title(),
'url': page.url, 'url': page.url,
'content': page.content() # get the HTML content of the page 'content': page.content() # get the HTML content of the page
} }
logger.info(f"[ACTIVE_TAB_INFO] Successfully loaded page. Title: '{active_tab_info['title']}'")
logger.info(f"[ACTIVE_TAB_INFO] Current URL: '{active_tab_info['url']}'")
except TimeoutError:
logger.warning(f"[ACTIVE_TAB_INFO] Page load timeout for URL: {active_tab_url}")
active_tab_info = {
'title': 'Load timeout',
'url': page.url,
'content': page.content()
}
except Exception as e:
logger.error(f"[ACTIVE_TAB_INFO] Failed to go to the target URL page: {e}")
return None
browser.close() browser.close()
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None')))
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
return active_tab_info return active_tab_info
except Exception as e:
logger.error(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[ACTIVE_TAB_INFO] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[ACTIVE_TAB_INFO] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[ACTIVE_TAB_INFO] All {max_retries} attempts failed.")
return None
# This should never be reached, but just in case
return None
def get_pdf_from_url(env, config: Dict[str, str]) -> str: def get_pdf_from_url(env, config: Dict[str, str]) -> str:
""" """
@@ -767,16 +875,32 @@ def get_pdf_from_url(env, config: Dict[str, str]) -> str:
_url = config["path"] _url = config["path"]
_path = os.path.join(env.cache_dir, config["dest"]) _path = os.path.join(env.cache_dir, config["dest"])
# Add logging for debugging
logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}")
logger.info(f"[PDF_FROM_URL] Target path: {_path}")
host = env.vm_ip host = env.vm_ip
port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
# Configuration for retry and timeout
max_retries = 3
timeout_ms = 60000 # Increase timeout to 60 seconds
for attempt in range(max_retries):
try:
logger.info(f"[PDF_FROM_URL] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p: with sync_playwright() as p:
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance")
except Exception as e: except Exception as e:
logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}")
logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...")
# If the connection fails, start a new browser instance # If the connection fails, start a new browser instance
platform.machine() platform.machine()
if "arm" in platform.machine(): if "arm" in platform.machine():
@@ -795,12 +919,65 @@ def get_pdf_from_url(env, config: Dict[str, str]) -> str:
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5) time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance")
page = browser.new_page() page = browser.new_page()
page.goto(_url)
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[PDF_FROM_URL] Navigating to URL: {_url}")
page.goto(_url, wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
logger.info(f"[PDF_FROM_URL] Waiting for page to be fully loaded...")
page.wait_for_load_state('networkidle', timeout=timeout_ms)
# Additional wait to ensure all content is rendered
time.sleep(3)
logger.info(f"[PDF_FROM_URL] Page loaded successfully. Title: '{page.title()}'")
logger.info(f"[PDF_FROM_URL] Current URL: '{page.url}'")
# Generate PDF
logger.info(f"[PDF_FROM_URL] Generating PDF...")
page.pdf(path=_path) page.pdf(path=_path)
logger.info(f"[PDF_FROM_URL] PDF generated successfully at: {_path}")
browser.close() browser.close()
# Verify PDF file was created
if os.path.exists(_path):
file_size = os.path.getsize(_path)
logger.info(f"[PDF_FROM_URL] PDF file created successfully. Size: {file_size} bytes")
return _path
else:
logger.error(f"[PDF_FROM_URL] PDF file was not created at expected path: {_path}")
raise FileNotFoundError(f"PDF file was not created at {_path}")
except Exception as e:
logger.error(f"[PDF_FROM_URL] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[PDF_FROM_URL] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[PDF_FROM_URL] Retrying in 5 seconds...")
time.sleep(5)
else:
logger.error(f"[PDF_FROM_URL] All {max_retries} attempts failed. Giving up.")
# Create a placeholder file or return a default path
try:
# Create an empty PDF file as fallback
with open(_path, 'w') as f:
f.write("%PDF-1.4\n%EOF\n")
logger.warning(f"[PDF_FROM_URL] Created empty PDF file as fallback: {_path}")
return _path
except Exception as fallback_error:
logger.error(f"[PDF_FROM_URL] Failed to create fallback file: {fallback_error}")
# Return the path anyway, even if file creation failed
return _path
# This should never be reached, but just in case
return _path return _path
@@ -811,11 +988,22 @@ def get_chrome_saved_address(env, config: Dict[str, str]):
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 30000 # 30 seconds for settings page
for attempt in range(max_retries):
try:
logger.info(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1}/{max_retries}")
with sync_playwright() as p: with sync_playwright() as p:
# connect to remote Chrome instance # connect to remote Chrome instance
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance")
except Exception as e: except Exception as e:
logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance # If the connection fails, start a new browser instance
platform.machine() platform.machine()
if "arm" in platform.machine(): if "arm" in platform.machine():
@@ -834,19 +1022,42 @@ def get_chrome_saved_address(env, config: Dict[str, str]):
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5) time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance")
page = browser.new_page() page = browser.new_page()
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
# Navigate to Chrome's settings page for autofill # Navigate to Chrome's settings page for autofill
page.goto("chrome://settings/addresses") logger.info(f"[CHROME_SAVED_ADDRESS] Navigating to Chrome settings page")
page.goto("chrome://settings/addresses", wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
page.wait_for_load_state('networkidle', timeout=timeout_ms)
# Get the HTML content of the page # Get the HTML content of the page
content = page.content() content = page.content()
logger.info(f"[CHROME_SAVED_ADDRESS] Successfully retrieved settings page content")
browser.close() browser.close()
return content return content
except Exception as e:
logger.error(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[CHROME_SAVED_ADDRESS] Exception type: {type(e).__name__}")
if attempt < max_retries - 1:
logger.info(f"[CHROME_SAVED_ADDRESS] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[CHROME_SAVED_ADDRESS] All {max_retries} attempts failed. Returning empty content.")
return ""
# This should never be reached, but just in case
return ""
def get_shortcuts_on_desktop(env, config: Dict[str, str]): def get_shortcuts_on_desktop(env, config: Dict[str, str]):
# Find out the operating system # Find out the operating system
@@ -891,10 +1102,21 @@ def get_number_of_search_results(env, config: Dict[str, str]):
server_port = env.server_port server_port = env.server_port
remote_debugging_url = f"http://{host}:{port}" remote_debugging_url = f"http://{host}:{port}"
# Configuration for retry and timeout
max_retries = 2
timeout_ms = 45000 # 45 seconds for search results
for attempt in range(max_retries):
try:
logger.info(f"[SEARCH_RESULTS] Attempt {attempt + 1}/{max_retries} for URL: {url}")
with sync_playwright() as p: with sync_playwright() as p:
try: try:
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance")
except Exception as e: except Exception as e:
logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}")
# If the connection fails, start a new browser instance # If the connection fails, start a new browser instance
platform.machine() platform.machine()
if "arm" in platform.machine(): if "arm" in platform.machine():
@@ -913,16 +1135,43 @@ def get_number_of_search_results(env, config: Dict[str, str]):
requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
time.sleep(5) time.sleep(5)
browser = p.chromium.connect_over_cdp(remote_debugging_url) browser = p.chromium.connect_over_cdp(remote_debugging_url)
logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance")
page = browser.new_page() page = browser.new_page()
page.goto(url)
# Set longer timeout for navigation
page.set_default_timeout(timeout_ms)
logger.info(f"[SEARCH_RESULTS] Navigating to URL: {url}")
page.goto(url, wait_until='networkidle', timeout=timeout_ms)
# Wait for page to be fully loaded
page.wait_for_load_state('networkidle', timeout=timeout_ms)
search_results = page.query_selector_all(result_selector) search_results = page.query_selector_all(result_selector)
actual_count = len(search_results) actual_count = len(search_results)
logger.info(f"[SEARCH_RESULTS] Found {actual_count} search results")
browser.close() browser.close()
return actual_count return actual_count
except Exception as e:
logger.error(f"[SEARCH_RESULTS] Attempt {attempt + 1} failed: {str(e)}")
logger.error(f"[SEARCH_RESULTS] Exception type: {type(e).__name__}")
def get_googledrive_file(env, config: Dict[str, Any]) -> str: if attempt < max_retries - 1:
logger.info(f"[SEARCH_RESULTS] Retrying in 3 seconds...")
time.sleep(3)
else:
logger.error(f"[SEARCH_RESULTS] All {max_retries} attempts failed. Returning 0.")
return 0
# This should never be reached, but just in case
return 0
def get_googledrive_file(env, config: Dict[str, Any]) -> Any:
""" Get the desired file from Google Drive based on config, return the downloaded local filepath. """ Get the desired file from Google Drive based on config, return the downloaded local filepath.
@args: keys in config dict @args: keys in config dict
settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml' settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml'
@@ -959,18 +1208,21 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
return _path return _path
if 'query' in config: if 'query' in config:
return get_single_file(config['query'], os.path.join(env.cache_dir, config['dest'])) result = get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
return result
elif 'path' in config: elif 'path' in config:
query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len( query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(
config['path']) - 1 config['path']) - 1
else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])] else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
return get_single_file(query, os.path.join(env.cache_dir, config['dest'])) result = get_single_file(query, os.path.join(env.cache_dir, config['dest']))
return result
elif 'query_list' in config: elif 'query_list' in config:
_path_list = [] _path_list = []
assert len(config['query_list']) == len(config['dest']) assert len(config['query_list']) == len(config['dest'])
for idx, query in enumerate(config['query_list']): for idx, query in enumerate(config['query_list']):
dest = config['dest'][idx] dest = config['dest'][idx]
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest))) result = get_single_file(query, os.path.join(env.cache_dir, dest))
_path_list.append(result)
return _path_list return _path_list
else: # path_list in config else: # path_list in config
_path_list = [] _path_list = []
@@ -981,7 +1233,8 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
path) - 1 path) - 1
else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)] else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
dest = config['dest'][idx] dest = config['dest'][idx]
_path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest))) result = get_single_file(query, os.path.join(env.cache_dir, dest))
_path_list.append(result)
return _path_list return _path_list
@@ -1692,16 +1945,31 @@ def get_url_dashPart(env, config: Dict[str, str]):
return None return None
# extract the last dash-separated part of the URL, and delete all the characters after "id" # extract the last dash-separated part of the URL, and delete all the characters after "id"
dash_part = active_tab_url.split("/")[config["partIndex"]] # Ensure partIndex is an integer
if config["needDeleteId"]: try:
part_index = int(config["partIndex"])
except (ValueError, TypeError):
logger.error(f"[URL_DASH_PART] Invalid partIndex: {config.get('partIndex', 'None')}. Must be an integer.")
return None
url_parts = active_tab_url.split("/")
if part_index >= len(url_parts):
logger.error(f"[URL_DASH_PART] partIndex {part_index} is out of range for URL with {len(url_parts)} parts")
return None
dash_part = url_parts[part_index]
if config.get("needDeleteId", False):
dash_part = dash_part.split("?")[0] dash_part = dash_part.split("?")[0]
# print("active_tab_title: {}".format(active_tab_info.get('title', 'None')))
# print("active_tab_url: {}".format(active_tab_info.get('url', 'None'))) logger.info(f"[URL_DASH_PART] Extracted dash part: '{dash_part}' from URL: {active_tab_url}")
# print("active_tab_content: {}".format(active_tab_info.get('content', 'None')))
if config["returnType"] == "string": if config["returnType"] == "string":
return dash_part return dash_part
elif config["returnType"] == "json": elif config["returnType"] == "json":
return {config["key"]: dash_part} return {config["key"]: dash_part}
else:
logger.error(f"[URL_DASH_PART] Invalid returnType: {config.get('returnType', 'None')}. Must be 'string' or 'json'.")
return None
def get_macys_product_url_parse(env, config: Dict[str, str]): def get_macys_product_url_parse(env, config: Dict[str, str]):