fix chrome
This commit is contained in:
@@ -1200,6 +1200,32 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]):
|
||||
elements = target_page.query_selector_all(selector)
|
||||
return [element.text_content().strip() for element in elements if element]
|
||||
|
||||
def safely_get_direct_text_nodes_playwright(selector):
    """
    Return the direct text-node contents of the first element matching ``selector``.

    Only text nodes that are immediate children of the element are collected;
    text nested inside child elements (div, span, ...) is ignored. Each text is
    trimmed and empty strings are dropped.

    Suitable for structures like: <div>SEA<div class="aura-separator"></div>NYC</div>
    which yields ["SEA", "NYC"].

    The lookup runs on ``target_page``, which is taken from the surrounding scope.

    Args:
        selector: CSS selector of the element(s) to inspect.

    Returns:
        A flat list of strings for the first matching element, or an empty
        list when nothing matches the selector.
    """
    elements = target_page.query_selector_all(selector)
    if not elements:
        # Previously this fell through to `results[0]` and raised IndexError.
        return []
    # Only the first match is ever returned, so only that one is evaluated.
    return elements[0].evaluate('''
        (node) => Array.from(node.childNodes)
            .filter(n => n.nodeType === Node.TEXT_NODE)
            .map(n => n.textContent.trim())
            .filter(Boolean)
    ''')
|
||||
|
||||
def safely_get_direct_li_playwright(selector):
    """
    Collect the text of the first <span> inside every ``li.catAllProducts``
    found under ``selector``.

    The lookup runs on ``target_page``, which is taken from the surrounding scope.

    Args:
        selector: CSS selector of the container; `` li.catAllProducts`` is
            appended to it to select the list items.

    Returns:
        A list with the stripped ``inner_text()`` of each item's first span.
        Items that contain no span are skipped.
    """
    items = target_page.query_selector_all(selector + " li.catAllProducts")
    texts = []
    for item in items:
        # Query once and reuse the handle instead of looking the span up a
        # second time after the existence check.
        span = item.query_selector('span')
        if span:
            texts.append(span.inner_text().strip())
    return texts
|
||||
|
||||
def safely_get_only_child_text_content(selector):
    """
    Collect the text of the first <h3> inside every element matching ``selector``.

    The lookup runs on ``target_page``, which is taken from the surrounding scope.

    Args:
        selector: CSS selector of the elements to inspect.

    Returns:
        A list with the stripped ``text_content()`` of each element's first
        h3. Elements that contain no h3 are skipped.
    """
    containers = target_page.query_selector_all(selector)
    texts = []
    for container in containers:
        # Query once and reuse the handle instead of looking the heading up a
        # second time after the existence check.
        heading = container.query_selector('h3')
        if heading:
            texts.append(heading.text_content().strip())
    return texts
|
||||
|
||||
if config["category"] == "class":
|
||||
class_multiObject = config.get("class_multiObject", {})
|
||||
for class_name, object_dict in class_multiObject.items():
|
||||
@@ -1208,6 +1234,41 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]):
|
||||
index = int(order_key)
|
||||
if len(elements_texts) > index:
|
||||
return_json[key] = elements_texts[index]
|
||||
|
||||
class_multiObject_child = config.get("class_multiObject_child", {})
|
||||
for class_name, object_dict in class_multiObject_child.items():
|
||||
elements_texts = safely_get_direct_text_nodes_playwright("." + class_name)
|
||||
for order_key, key in object_dict.items():
|
||||
index = int(order_key)
|
||||
if len(elements_texts) > index:
|
||||
return_json[key] = elements_texts[index]
|
||||
|
||||
class_multiObject_only_child = config.get("class_multiObject_only_child", {})
|
||||
for class_name, object_dict in class_multiObject_only_child.items():
|
||||
elements_texts = safely_get_only_child_text_content("." + class_name)
|
||||
for order_key, key in object_dict.items():
|
||||
index = int(order_key)
|
||||
if len(elements_texts) > index:
|
||||
return_json[key] = elements_texts[index]
|
||||
|
||||
class_multiObject_search_exist = config.get("class_multiObject_search_exist", {})
|
||||
for class_name, object_list in class_multiObject_search_exist.items():
|
||||
elements_texts = safely_get_text_content("." + class_name)
|
||||
for each_object in object_list:
|
||||
if each_object == "is_other_exist":
|
||||
continue
|
||||
if each_object in elements_texts:
|
||||
return_json[each_object] = True
|
||||
else:
|
||||
return_json[each_object] = False
|
||||
if "is_other_exist" in object_list:
|
||||
for each_element in elements_texts:
|
||||
if each_element not in object_list:
|
||||
return_json["is_other_exist"] = True
|
||||
break
|
||||
if "is_other_exist" not in return_json.keys():
|
||||
return_json["is_other_exist"] = False
|
||||
|
||||
|
||||
class_singleObject = config.get("class_singleObject", {})
|
||||
for class_name, key in class_singleObject.items():
|
||||
@@ -1236,6 +1297,55 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]):
|
||||
inputs = target_page.locator(f"xpath={xpath}")
|
||||
if inputs.count() > 0:
|
||||
return_json[key] = inputs.first.input_value().strip()
|
||||
|
||||
elif config["category"] == "class&url":
|
||||
class_multiObject = config.get("class_multiObject", {})
|
||||
for class_name, object_list in class_multiObject.items():
|
||||
elements_texts = safely_get_text_content("." + class_name)
|
||||
for each_key in object_list:
|
||||
if any(each_key.lower() == text.lower() for text in elements_texts):
|
||||
return_json[each_key.lower()] = True
|
||||
|
||||
for each_key in elements_texts:
|
||||
# each_key.lower() not in object_list.lower():
|
||||
if all(each_key.lower() not in item.lower() for item in object_list):
|
||||
return_json["is_other_exist"] = True
|
||||
break
|
||||
if "is_other_exist" not in return_json.keys():
|
||||
return_json["is_other_exist"] = False
|
||||
|
||||
class_multiObject_li = config.get("class_multiObject_li", {})
|
||||
for class_name, object_list in class_multiObject_li.items():
|
||||
elements_texts = safely_get_direct_li_playwright("." + class_name)
|
||||
for each_key in object_list:
|
||||
if any(each_key.lower() == text.lower() for text in elements_texts):
|
||||
return_json[each_key.lower()] = True
|
||||
|
||||
for each_key in elements_texts:
|
||||
# each_key.lower() not in object_list.lower():
|
||||
if all(each_key.lower() not in item.lower() for item in object_list):
|
||||
return_json["is_other_exist"] = True
|
||||
break
|
||||
if "is_other_exist" not in return_json.keys():
|
||||
return_json["is_other_exist"] = False
|
||||
|
||||
url_include_expected = config.get("url_include_expected", [])
|
||||
for key in url_include_expected:
|
||||
if key.lower() in target_page.url.lower():
|
||||
if key.lower() not in return_json.keys():
|
||||
return_json[key.lower()] = True
|
||||
else:
|
||||
if key.lower() not in return_json.keys():
|
||||
return_json[key.lower()] = False
|
||||
|
||||
url_include_expected_multichoice = config.get("url_include_expected_multichoice", {})
|
||||
for key, value in url_include_expected_multichoice.items():
|
||||
if key.lower() in target_page.url.lower():
|
||||
if value.lower() not in return_json.keys():
|
||||
return_json[value.lower()] = True
|
||||
else:
|
||||
if value.lower() not in return_json.keys():
|
||||
return_json[value.lower()] = False
|
||||
|
||||
browser.close()
|
||||
return return_json
|
||||
@@ -1274,13 +1384,14 @@ def get_gotoRecreationPage_and_get_html_content(env, config: Dict[str, Any]):
|
||||
browser = p.chromium.connect_over_cdp(remote_debugging_url)
|
||||
page = browser.new_page()
|
||||
page.goto("https://www.recreation.gov/")
|
||||
page.fill("input#hero-search-input", "Albion Basin")
|
||||
page.fill("input#hero-search-input", "Diamond")
|
||||
page.click("button.nav-search-button")
|
||||
print("after first click")
|
||||
time.sleep(2)
|
||||
time.sleep(10)
|
||||
# Assuming .search-result-highlight--success leads to a new page or requires page load
|
||||
with page.expect_popup() as popup_info:
|
||||
page.click(".search-result-highlight--success")
|
||||
time.sleep(30)
|
||||
print("after second click")
|
||||
newpage = popup_info.value
|
||||
newpage.wait_for_load_state()
|
||||
@@ -1362,3 +1473,57 @@ def get_url_dashPart(env, config: Dict[str, str]):
|
||||
return dash_part
|
||||
elif config["returnType"] == "json":
|
||||
return {config["key"]: dash_part}
|
||||
|
||||
|
||||
def get_url_path_parse(env, config: Dict[str, Any]):
    """
    Parse the path of the active tab's URL (Macy's product listing) into a dict.

    The path is scanned for two adjacent, comma-separated segments: the first
    lists filter names and the second lists the matching values, e.g.
    ``.../Top_style,Sleeve_length/T-shirts,Short Sleeve``. Names and values are
    paired positionally.

    Args:
        env: forwarded to ``get_active_url_from_accessTree``.
        config: forwarded to ``get_active_url_from_accessTree``; must also hold
            ``"parse_keys"``, the filter names to copy into the result.

    Returns:
        ``None`` when the active URL cannot be obtained. Otherwise a dict with:
        - ``mens_clothing``: True if 'mens-clothing' is in the path, else None.
        - ``t_shirts``: True if 'mens-t-shirts' is in the path or a
          ``Top_style`` / ``Product_department`` filter equals 'T-shirts',
          'T-Shirts' or 'T-Shirt'; else None.
        - one entry per name in ``config["parse_keys"]`` that was found in the
          path or is one of ``Men_regular_size_t``, ``Price_discount_range``,
          ``Sleeve_length`` (these three default to None when absent).
          ``Price_discount_range`` is reduced to '50_PERCENT_ off & more' when
          that is selected without the 30/20 percent options,
          'not_50_PERCENT_ off & more' for any other selection, and None when
          the filter is missing.
    """
    from urllib.parse import urlparse, unquote

    active_tab_url = get_active_url_from_accessTree(env, config)
    if active_tab_url is None:
        return None

    path = unquote(urlparse(active_tab_url).path)
    result = {'mens_clothing': True if 'mens-clothing' in path else None}

    path_parts = path.strip('/').split('/')
    key_value_json = {}
    tshirts_flag = "mens-t-shirts" in path

    # Walk adjacent segment pairs looking for "names,.../values,...".
    for names_part, values_part in zip(path_parts, path_parts[1:]):
        if ',' not in names_part or ',' not in values_part:
            continue
        keys = [k.strip() for k in names_part.split(',')]
        values = [v.strip() for v in values_part.split(',')]
        for k, v in zip(keys, values):
            if k == "Price_discount_range":
                # Multiple selected ranges are '|'-separated.
                key_value_json[k] = [item.strip() for item in v.split('|')] if v else None
            else:
                key_value_json[k] = v if v else None
            if k in ('Top_style', 'Product_department') and v in ('T-shirts', 'T-Shirts', 'T-Shirt'):
                tshirts_flag = True
        # Only the first names/values pair is used.
        # NOTE(review): confirm the `break` belongs at this level (after the
        # whole pair has been read) and not inside the T-shirts check.
        break

    for field in ('Men_regular_size_t', 'Price_discount_range', 'Sleeve_length'):
        key_value_json.setdefault(field, None)

    result['t_shirts'] = True if tshirts_flag else None

    for key in config["parse_keys"]:
        if key not in key_value_json:
            continue
        value = key_value_json[key]
        if key == "Price_discount_range" and value is not None:
            only_fifty = (
                '50_PERCENT_ off & more' in value
                and '30_PERCENT_ off & more' not in value
                and '20_PERCENT_ off & more' not in value
            )
            result[key] = '50_PERCENT_ off & more' if only_fifty else 'not_50_PERCENT_ off & more'
        else:
            # Also covers a missing Price_discount_range: report None rather
            # than raising TypeError on `in None`.
            result[key] = value
    return result
|
||||
|
||||
Reference in New Issue
Block a user