diff --git a/branch-config/filelist b/branch-config/filelist index 03e1bc0..0ba2d96 100644 --- a/branch-config/filelist +++ b/branch-config/filelist @@ -3,3 +3,10 @@ main.py requirements.txt evaluation_examples logs + +mm_agents +experiment_a11y_tree.py +experiment_screenshot.py +experiment_screenshot_a11y_tree.py +experiment_screenshot_seeact.py +experiment_screenshot_som.py diff --git a/branch_flag b/branch_flag index 9daeafb..180b47c 100644 --- a/branch_flag +++ b/branch_flag @@ -1 +1 @@ -test +baseline diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index ef7a1ad..f8cd082 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -76,6 +76,8 @@ class DesktopEnv(gym.Env): self.cache_dir_base: str = cache_dir self.vm_screen_size = screen_size + os.makedirs(self.tmp_dir_base, exist_ok=True) + # task-aware stuffs # todo: handling the logic of snapshot directory self._set_task_info(task_config) diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py index 1bbdc60..d7e4faf 100644 --- a/desktop_env/evaluators/metrics/table.py +++ b/desktop_env/evaluators/metrics/table.py @@ -92,8 +92,11 @@ def compare_table(result: str, expected: str, **options) -> float: if result is None: return 0. - xlworkbookr: Workbook = openpyxl.load_workbook(filename=result) - pdworkbookr = pd.ExcelFile(result) + try: + xlworkbookr: Workbook = openpyxl.load_workbook(filename=result) + pdworkbookr = pd.ExcelFile(result) + except: + return 0. worksheetr_names: List[str] = pdworkbookr.sheet_names xlworkbooke: Workbook = openpyxl.load_workbook(filename=expected) diff --git a/desktop_env/evaluators/metrics/utils.py b/desktop_env/evaluators/metrics/utils.py index 08a3ca8..4f35b65 100644 --- a/desktop_env/evaluators/metrics/utils.py +++ b/desktop_env/evaluators/metrics/utils.py @@ -52,14 +52,17 @@ def load_sparklines(xlsx_file: str, sheet_name: str) -> Dict[str, str]: """ # read xlsx - with zipfile.ZipFile(xlsx_file, "r") as z_f: - with z_f.open("xl/workbook.xml") as f: - workbook_database: _Element = lxml.etree.fromstring(f.read()) - sheets: List[_Element] = _sheet_name_selector(workbook_database) - sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets} - with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f: - sheet: _Element = lxml.etree.fromstring(f.read()) - sparklines: List[_Element] = _sparklines_selector(sheet) + try: + with zipfile.ZipFile(xlsx_file, "r") as z_f: + with z_f.open("xl/workbook.xml") as f: + workbook_database: _Element = lxml.etree.fromstring(f.read()) + sheets: List[_Element] = _sheet_name_selector(workbook_database) + sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets} + with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f: + sheet: _Element = lxml.etree.fromstring(f.read()) + sparklines: List[_Element] = _sparklines_selector(sheet) + except zipfile.BadZipFile: + return {} sparklines_dict: Dict[str, str] = {} for sp_l in sparklines: @@ -158,29 +161,32 @@ def load_charts(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, An _shared_str_selector = lxml.cssselect.CSSSelector("oo|sst>oo|si>oo|t", namespaces=_xlsx_ns_mapping) def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: # read_cell_value {{{ # - with zipfile.ZipFile(xlsx_file, "r") as z_f: - try: - with z_f.open("xl/sharedStrings.xml") as f: - shared_str_xml: _Element = lxml.etree.fromstring(f.read()) - str_elements: List[_Element] = _shared_str_selector(shared_str_xml) - shared_strs: List[str] = [elm.text for elm in str_elements] - except: - logger.debug("Read shared strings error: %s", xlsx_file) + try: + with zipfile.ZipFile(xlsx_file, "r") as z_f: + try: + with z_f.open("xl/sharedStrings.xml") as f: + shared_str_xml: _Element = lxml.etree.fromstring(f.read()) + str_elements: List[_Element] = _shared_str_selector(shared_str_xml) + shared_strs: List[str] = [elm.text for elm in str_elements] + except: + logger.debug("Read shared strings error: %s", xlsx_file) - with z_f.open("xl/workbook.xml") as f: - workbook_database: _Element = lxml.etree.fromstring(f.read()) - sheets: List[_Element] = _sheet_name_selector(workbook_database) - sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets} + with z_f.open("xl/workbook.xml") as f: + workbook_database: _Element = lxml.etree.fromstring(f.read()) + sheets: List[_Element] = _sheet_name_selector(workbook_database) + sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets} - with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f: - sheet: _Element = lxml.etree.fromstring(f.read()) - cells: List[_Element] =\ - lxml.cssselect.CSSSelector( 'oo|row>oo|c[r="{:}"]'.format(coordinate) - , namespaces=_xlsx_ns_mapping - )(sheet) - if len(cells)==0: - return None - cell: _Element = cells[0] + with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f: + sheet: _Element = lxml.etree.fromstring(f.read()) + cells: List[_Element] =\ + lxml.cssselect.CSSSelector( 'oo|row>oo|c[r="{:}"]'.format(coordinate) + , namespaces=_xlsx_ns_mapping + )(sheet) + if len(cells)==0: + return None + cell: _Element = cells[0] + except zipfile.BadZipFile: + return None cell: Dict[str, str] = xmltodict.parse( lxml.etree.tostring(cell, encoding="unicode") , process_namespaces=True diff --git a/experiment_screenshot_a11y_tree.py b/experiment_screenshot_a11y_tree.py index 60c81b6..c48a2d7 100644 --- a/experiment_screenshot_a11y_tree.py +++ b/experiment_screenshot_a11y_tree.py @@ -41,7 +41,8 @@ logger.addHandler(sdebug_handler) logger = logging.getLogger("desktopenv.experiment") -PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" +#PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" +PATH_TO_VM = "../../../../大文件/镜像/Ubuntu-1218/Ubuntu/Ubuntu.vmx" def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): @@ -109,8 +110,9 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr if __name__ == "__main__": action_space = "pyautogui" - example_class = "chrome" - example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + example_class = "libreoffice_calc" + #example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + example_id = "01b269ae-2111-4a07-81fd-3fcd711993b0" gpt4_model = "gpt-4-vision-preview" gemini_model = "gemini-pro-vision" @@ -120,7 +122,8 @@ if __name__ == "__main__": with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) - example["snapshot"] = "exp_setup4" + #example["snapshot"] = "exp_setup4" + example["snapshot"] = "Snapshot 30" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], diff --git a/main.py b/main.py index 2027736..a3a01df 100644 --- a/main.py +++ b/main.py @@ -46,9 +46,9 @@ def human_agent(): Runs the Gym environment with human input. """ - with open("evaluation_examples/examples/libreoffice_calc/a01fbce3-2793-461f-ab86-43680ccbae25.json.nosetup", "r") as f: + with open("evaluation_examples/examples/thunderbird/7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3.json", "r") as f: example = json.load(f) - example["snapshot"] = "Snapshot 26" + example["snapshot"] = "Snapshot 30" env = DesktopEnv( path_to_vm="../../../../大文件/镜像/Ubuntu-1218/Ubuntu/Ubuntu.vmx" , action_space="computer_13" diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index cc08b79..55fd2a5 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -22,6 +22,10 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \ SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT +import logging + +logger = logging.getLogger("desktopenv.agent") + # Function to encode the image def encode_image(image_path): @@ -247,6 +251,7 @@ class GPT4v_Agent: if self.exp == "both": _screenshot = previous_obs["screenshot"] _linearized_accessibility_tree = previous_obs["accessibility_tree"] + logger.debug("LINEAR AT: %s", _linearized_accessibility_tree) messages.append({ "role": "user", @@ -451,7 +456,7 @@ class GPT4v_Agent: "max_tokens": self.max_tokens }) - print(response) + logger.debug("RESPONSE: %s", response) if self.exp == "seeact": messages.append({ diff --git a/requirements.txt b/requirements.txt index 68ca347..b494102 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,6 @@ playwright backoff formulas pydrive +fastdtw + +openai