Update loaded Chrome examples

2024-02-23 14:15:16 +08:00
parent 81863b26dd
commit f812436ad3
10 changed files with 396 additions and 35 deletions
--- a/desktop_env/evaluators/getters/init.py
+++ b/desktop_env/evaluators/getters/init.py
@@ -6,6 +6,7 @@ from .chrome import (
    get_pdf_from_url,
    get_shortcuts_on_desktop,
    get_history,
+    get_page_info,
    get_enabled_experiments,
    get_chrome_language,
    get_chrome_font_size,
--- a/desktop_env/evaluators/getters/chrome.py
+++ b/desktop_env/evaluators/getters/chrome.py
@@ -3,9 +3,10 @@ import logging
 import os
 import sqlite3
 from typing import Dict, Any
+
+from playwright.sync_api import sync_playwright
 from pydrive.auth import GoogleAuth
 from pydrive.drive import GoogleDrive, GoogleDriveFileList, GoogleDriveFile
-from playwright.sync_api import sync_playwright

 logger = logging.getLogger("desktopenv.getters.chrome")

@@ -310,6 +311,36 @@ def get_extensions_installed_from_shop(env, config: Dict[str, str]):
 # The following ones require Playwright to be installed on the target machine, and the chrome needs to be pre-config on
 # port info to allow remote debugging, see README.md for details

+def get_page_info(env, config: Dict[str, str]):
+    """Open ``config["url"]`` in a new tab of the remote Chrome and return what was loaded.
+
+    Connects over CDP to ``http://{env.vm_ip}:9222``, opens a new page in the first browser context,
+    navigates to the URL and collects the page's title, final URL and HTML.
+
+    Args:
+        env: environment object; only ``env.vm_ip`` is read here.
+        config: must contain the key ``"url"`` with the address to open.
+
+    Returns:
+        A dict with the keys ``'title'``, ``'url'`` and ``'content'``. When navigation or loading times
+        out the title is ``'Load timeout'``; on any other error while loading or reading the page it is
+        ``'Error encountered'``. In both cases ``'url'`` and ``'content'`` are read from the page as it
+        currently is.
+    """
+    # Playwright raises its own TimeoutError, which is NOT a subclass of the builtin TimeoutError, so the
+    # builtin name would never match a page-load timeout. Imported locally to keep this block self-contained.
+    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
+
+    host = env.vm_ip
+    port = 9222  # fixme: this port is hard-coded, need to be changed from config file
+    url = config["url"]
+
+    remote_debugging_url = f"http://{host}:{port}"
+    with sync_playwright() as p:
+        # connect to remote Chrome instance
+        browser = p.chromium.connect_over_cdp(remote_debugging_url)
+        try:
+            page = browser.contexts[0].new_page()
+            try:
+                # goto() itself waits for the page to load, so it must sit inside the try for a
+                # navigation timeout to be reported as 'Load timeout' instead of escaping the getter.
+                page.goto(url)
+                # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
+                page.wait_for_load_state('load')  # Wait for the 'load' event to complete
+                return {'title': page.title(), 'url': page.url, 'content': page.content()}
+            except PlaywrightTimeoutError:
+                # Page loading timed out: report whatever the page currently holds
+                fallback_title = 'Load timeout'
+            except Exception as e:
+                # Any other failure while navigating or reading the page
+                logger.error(f"Error: {e}")
+                fallback_title = 'Error encountered'
+            return {'title': fallback_title, 'url': page.url, 'content': page.content()}
+        finally:
+            # Always release the CDP connection, including when an exception propagates
+            browser.close()
+
+
 def get_open_tabs_info(env, config: Dict[str, str]):
    host = env.vm_ip
    port = 9222  # fixme: this port is hard-coded, need to be changed from config file
@@ -487,9 +518,9 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
            for q in _query:
                search = f'( {q} ) and "{parent_id}" in parents'
                filelist: GoogleDriveFileList = drive.ListFile({'q': search}).GetList()
-                if len(filelist) == 0: # target file not found
+                if len(filelist) == 0:  # target file not found
                    return None
-                file: GoogleDriveFile = filelist[0] # HACK: if multiple candidates, just use the first one
+                file: GoogleDriveFile = filelist[0]  # HACK: if multiple candidates, just use the first one
                parent_id = file['id']

            file.GetContentFile(_path, mimetype=file['mimeType'])
@@ -501,8 +532,9 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
    if 'query' in config:
        return get_single_file(config['query'], os.path.join(env.cache_dir, config['dest']))
    elif 'path' in config:
-        query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(config['path']) - 1
-                    else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
+        query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len(
+            config['path']) - 1
+                 else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])]
        return get_single_file(query, os.path.join(env.cache_dir, config['dest']))
    elif 'query_list' in config:
        _path_list = []
@@ -511,12 +543,14 @@ def get_googledrive_file(env, config: Dict[str, Any]) -> str:
            dest = config['dest'][idx]
            _path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
        return _path_list
-    else: # path_list in config
+    else:  # path_list in config
        _path_list = []
        assert len(config['path_list']) == len(config['dest'])
        for idx, path in enumerate(config['path_list']):
-            query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if jdx < len(path) - 1
-                        else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
+            query = [
+                f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if jdx < len(
+                    path) - 1
+                else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)]
            dest = config['dest'][idx]
            _path_list.append(get_single_file(query, os.path.join(env.cache_dir, dest)))
        return _path_list
@@ -545,7 +579,7 @@ def get_enable_do_not_track(env, config: Dict[str, str]):
        content = env.controller.get_file(preference_file_path)
        data = json.loads(content)

-        if_enable_do_not_track = data.get('enable_do_not_track', {}) # bool
+        if_enable_do_not_track = data.get('enable_do_not_track', {})  # bool
        return "true" if if_enable_do_not_track else "false"
    except Exception as e:
        logger.error(f"Error: {e}")
@@ -575,7 +609,7 @@ def get_enable_enhanced_safety_browsing(env, config: Dict[str, str]):
        content = env.controller.get_file(preference_file_path)
        data = json.loads(content)

-        if_enable_do_not_track = data.get('safebrowsing', {}).get('enhanced', {}) # bool
+        if_enable_do_not_track = data.get('safebrowsing', {}).get('enhanced', {})  # bool
        return "true" if if_enable_do_not_track else "false"
    except Exception as e:
        logger.error(f"Error: {e}")
@@ -610,7 +644,7 @@ def get_new_startup_page(env, config: Dict[str, str]):
        if "session" not in data.keys():
            return "true"
        else:
-            if_enable_do_not_track = data.get('session', {}).get('restore_on_startup', {}) # int, need to be 5
+            if_enable_do_not_track = data.get('session', {}).get('restore_on_startup', {})  # int, need to be 5
            return "true" if if_enable_do_not_track == 5 else "false"
    except Exception as e:
        logger.error(f"Error: {e}")
@@ -648,4 +682,4 @@ def get_find_unpacked_extension_path(env, config: Dict[str, str]):
        return all_extensions_path
    except Exception as e:
        logger.error(f"Error: {e}")
-        return "Google"
+        return "Google"