Merge branch 'main' into xiaochuanli/addChromeExtensions
This commit is contained in:
@@ -12,7 +12,7 @@ import lxml.etree
|
||||
import requests
|
||||
from lxml.cssselect import CSSSelector
|
||||
from lxml.etree import _Element
|
||||
from playwright.sync_api import sync_playwright
|
||||
from playwright.sync_api import sync_playwright, expect
|
||||
from pydrive.auth import GoogleAuth
|
||||
from pydrive.drive import GoogleDrive, GoogleDriveFileList, GoogleDriveFile
|
||||
|
||||
@@ -36,6 +36,89 @@ WARNING:
|
||||
"""
|
||||
|
||||
|
||||
def _as_selector_chain(selector: Any) -> list:
    """Return *selector* as a list; a bare string counts as ONE selector, not a sequence of characters."""
    if isinstance(selector, str):
        return [selector]
    return list(selector)


def _visible_locator(page, selector: Any):
    """Locate *selector* on *page* and assert it is visible before it is used."""
    element = page.locator(selector)
    expect(element).to_be_visible()
    return element


def _extract_info(page, info_dict: Dict[str, Any]) -> list:
    """Execute one ``infos`` entry on *page*.

    Returns a list holding the extracted value, or an empty list when a
    ``click_and_*`` action is given an empty selector list (nothing to click
    and nothing to read).

    Raises:
        NotImplementedError: if ``action`` is not one of the supported actions.
    """
    action = info_dict.get('action', 'inner_text')
    if action in ('inner_text', 'attribute'):
        # Single-element actions hand the selector to the locator as-is.
        chain = [info_dict['selector']]
    elif action in ('click_and_inner_text', 'click_and_attribute'):
        chain = _as_selector_chain(info_dict['selector'])
    else:
        raise NotImplementedError(f'The action {action} is not supported yet.')

    if not chain:
        return []

    # Every selector but the last is a link to click through; the last one is
    # the element whose text/attribute is extracted.
    *links, target = chain
    for link_selector in links:
        _visible_locator(page, link_selector).click()
        page.wait_for_load_state('load')

    element = _visible_locator(page, target)
    if action in ('inner_text', 'click_and_inner_text'):
        return [element.inner_text()]
    return [element.get_attribute(info_dict['attribute'])]


def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
    """ Get information from a website. Especially useful when the information may be updated through time.
    Args:
        env (Any): The environment object; its ``vm_ip`` attribute is used as the browser host.
        config (Dict[Any, Any]): The configuration dictionary.
            - url (str): The URL of the website to visit
            - infos (List[Dict[str, str]]): The list of information to be extracted from the website. Each dictionary contains:
                - action (str): chosen from 'inner_text', 'attribute', 'click_and_inner_text', 'click_and_attribute'
                  (defaults to 'inner_text'), concretely,
                    - inner_text: extract the inner text of the element specified by the selector
                    - attribute: extract the attribute of the element specified by the selector
                    - click_and_inner_text: click elements following the selector and then extract the inner text of the last element
                    - click_and_attribute: click elements following the selector and then extract the attribute of the last element
                - selector (Union[str, List[str]]): The CSS selector(s) of the element(s) to be extracted.
                  For the click_and_* actions a single string is treated as a one-element list.
                - attribute (str): required for 'attribute' and 'click_and_attribute', the attribute to be extracted.
            - backups (Any): The backup information to be returned if the extraction fails.
    Returns:
        The list of extracted values, in the order of ``infos``. If anything fails
        (connection, navigation, visibility check, unsupported action, ...),
        ``config.get('backups', None)`` is returned instead.
    """
    try:
        host = env.vm_ip
        port = 9222  # fixme: this port is hard-coded, need to be changed from config file
        remote_debugging_url = f"http://{host}:{port}"
        with sync_playwright() as p:
            # connect to remote Chrome instance
            try:
                browser = p.chromium.connect_over_cdp(remote_debugging_url)
            except Exception:
                # If the connection fails (e.g., the agent closed the browser instance), start a new browser instance
                # NOTE(review): the browser is launched with debugging port 1337 while we connect to 9222 —
                # presumably the VM forwards 9222 -> 1337; confirm against the VM setup.
                app = 'chromium' if 'arm' in platform.machine() else 'google-chrome'
                payload = json.dumps({"command": [
                    app,
                    "--remote-debugging-port=1337"
                ], "shell": False})
                headers = {"Content-Type": "application/json"}
                requests.post(f"http://{host}:5000/setup/launch", headers=headers, data=payload)
                time.sleep(5)  # give the freshly launched browser time to come up
                browser = p.chromium.connect_over_cdp(remote_debugging_url)

            page = browser.contexts[0].new_page()
            page.goto(config["url"])
            page.wait_for_load_state('load')
            infos = []
            for info_dict in config.get('infos', []):
                # A previous click_and_* action may have navigated away; start each entry from the configured URL.
                if page.url != config["url"]:
                    page.goto(config["url"])
                    page.wait_for_load_state('load')
                infos.extend(_extract_info(page, info_dict))
            return infos
    except Exception:
        # config.get avoids raising a KeyError from inside the handler when 'url' itself is missing;
        # exc_info keeps the real cause in the log instead of silently discarding it.
        logger.error(
            f'[ERROR]: failed to obtain information from the website: {config.get("url")}. Use backup results instead.',
            exc_info=True,
        )
        return config.get('backups', None)
|
||||
|
||||
|
||||
# The following ones just need to load info from the files of software, no need to connect to the software
|
||||
def get_default_search_engine(env, config: Dict[str, str]):
|
||||
os_type = env.vm_platform
|
||||
@@ -507,6 +590,10 @@ def get_active_url_from_accessTree(env, config):
|
||||
if len(elements) == 0:
|
||||
print("no elements found")
|
||||
return None
|
||||
elif elements[-1].text is None:
|
||||
print("no text found")
|
||||
return None
|
||||
|
||||
active_tab_url = config["goto_prefix"] + elements[0].text if "goto_prefix" in config.keys() else "https://" + \
|
||||
elements[0].text
|
||||
print("active tab url now: {}".format(active_tab_url))
|
||||
@@ -722,15 +809,20 @@ def get_number_of_search_results(env, config: Dict[str, str]):
|
||||
|
||||
def get_googledrive_file(env, config: Dict[str, Any]) -> str:
|
||||
""" Get the desired file from Google Drive based on config, return the downloaded local filepath.
|
||||
To retrieve the file, we provide two options in config dict:
|
||||
1. query: a list of queries to search the file, each query is a string that follows the format of Google Drive search query
|
||||
2. path: a list of path to the file, 'folder/subfolder/filename' -> ['folder', 'subfolder', 'filename']
|
||||
3. query_list: query extends to list to download multiple files
|
||||
4. path_list: path extends to list to download multiple files
|
||||
dest: target file name or list. If *_list is used, dest should also be a list of the same length.
|
||||
Return the downloaded filepath locally.
|
||||
@args: keys in config dict
|
||||
settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml'
|
||||
query/path[_list](Union[str, List[str]]): the query or path [list] to the file(s) on Google Drive. To retrieve the file, we provide multiple key options to specify the filepath on drive in config dict:
|
||||
1) query: a list of queries to search the file, each query is a string that follows the format of Google Drive search query. The documentation is available here: (support more complex search but too complicated to use)
|
||||
https://developers.google.com/drive/api/guides/search-files?hl=en
|
||||
2) path: a str list pointing to file path on googledrive, e.g., 'folder/subfolder/filename.txt' ->
|
||||
config contain one key-value pair "path": ['folder', 'subfolder', 'filename.txt']
|
||||
3) query_list: query extends to list to download multiple files
|
||||
4) path_list: path extends to list to download multiple files, e.g.,
|
||||
"path_list": [['folder', 'subfolder', 'filename1.txt'], ['folder', 'subfolder', 'filename2.txt']]
|
||||
@return:
|
||||
dest(Union[List[str], str]): target file name or list. If *_list is used in input config, dest should also be a list of the same length. Return the downloaded local filepath.
|
||||
"""
|
||||
settings_file = config.get('settings_file', 'evaluation_examples/settings/googledrive/settings.json')
|
||||
settings_file = config.get('settings_file', 'evaluation_examples/settings/googledrive/settings.yml')
|
||||
auth = GoogleAuth(settings_file=settings_file)
|
||||
drive = GoogleDrive(auth)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user