diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py index 57e5d4a..914211a 100644 --- a/desktop_env/evaluators/metrics/docs.py +++ b/desktop_env/evaluators/metrics/docs.py @@ -284,18 +284,6 @@ def compare_contains_image(docx_file1, docx_file2): return 1 -# file1 = 'path/to/file1.docx' -# file2 = 'path/to/file2.docx' - -# print(are_docx_files_same(file1, file2)) -# Replace 'your_document.docx' with the path to your document -# result = contains_page_break('your_document.docx') -# print(result) - -# config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu" -# print(find_default_font("Ani", config_path)) - - def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs): if not compare_docx_files(file_path1, file_path2): return 0 @@ -538,12 +526,3 @@ def compare_highlighted_text(file1, file2): return 1 else: return 0 - -if __name__ == '__main__': - print( - compare_docx_files( - r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\88fe4b2d-3040-4c70-9a70-546a47764b48\CCCH9003_Tutorial_guidelines.docx", - r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\88fe4b2d-3040-4c70-9a70-546a47764b48\CCCH9003_Tutorial_guidelines_Gold.docx", - ignore_blanks=False - ) - ) diff --git a/desktop_env/evaluators/metrics/utils.py b/desktop_env/evaluators/metrics/utils.py index 00deb09..6f444f8 100644 --- a/desktop_env/evaluators/metrics/utils.py +++ b/desktop_env/evaluators/metrics/utils.py @@ -1,46 +1,47 @@ +import builtins +import functools +import itertools import logging +import operator +import re import zipfile from typing import Any, TypeVar, Union, Iterable, Optional, Callable from typing import Dict, List, Set, Match, Tuple, Pattern from urllib.parse import urlparse, urlunparse -import re -import functools -import operator -import builtins -import itertools +import formulas import lxml.cssselect import lxml.etree -import openpyxl import xmltodict from lxml.etree import _Element from openpyxl import Workbook +from openpyxl.cell.cell import Cell from openpyxl.chart._chart import ChartBase -from openpyxl.worksheet.worksheet import Worksheet -from openpyxl.worksheet.filters import AutoFilter, SortState +from openpyxl.formatting.formatting import ConditionalFormattingList +from openpyxl.pivot.cache import CacheSource as PivotCacheSource +from openpyxl.pivot.table import TableDefinition as PivotTableDefinition +from openpyxl.styles.differential import DifferentialStyle +from openpyxl.utils import coordinate_to_tuple, get_column_letter from openpyxl.worksheet.cell_range import MultiCellRange, CellRange from openpyxl.worksheet.dimensions import DimensionHolder -from openpyxl.formatting.formatting import ConditionalFormattingList -from openpyxl.utils import coordinate_to_tuple, get_column_letter -from openpyxl.cell.cell import Cell -from openpyxl.styles.differential import DifferentialStyle -from openpyxl.pivot.table import TableDefinition as PivotTableDefinition -from openpyxl.pivot.cache import CacheSource as PivotCacheSource -import formulas +from openpyxl.worksheet.filters import AutoFilter, SortState +from openpyxl.worksheet.worksheet import Worksheet V = TypeVar("Value") logger = logging.getLogger("desktopenv.metrics.utils") -_xlsx_namespaces = [ ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main") - , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main") - , ("xm", "http://schemas.microsoft.com/office/excel/2006/main") - ] +_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main") + , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main") + , ("xm", "http://schemas.microsoft.com/office/excel/2006/main") + ] _xlsx_ns_mapping = dict(_xlsx_namespaces) _xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces)) _xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None _sheet_name_selector = lxml.cssselect.CSSSelector("oo|sheets>oo|sheet", namespaces=_xlsx_ns_mapping) _sparklines_selector = lxml.cssselect.CSSSelector("x14|sparkline", namespaces=_xlsx_ns_mapping) + + def load_sparklines(xlsx_file: str, sheet_name: str) -> Dict[str, str]: # function load_sparklines {{{ # """ @@ -174,6 +175,7 @@ def load_charts(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, An return chart_set # }}} function load_charts # + # Available Pivot Properties: # name: str # show_total, show_empty_row, show_empty_col, show_headers: bool @@ -210,23 +212,26 @@ def load_pivot_tables(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[s pivot_set: Dict[str, Any] = {} pivot_props: Set[str] = set(options.get("pivot_props", [])) for pvt in pivots: - raw_selection: List[List[tuple[Optional[bool], int]]] =\ - [ [(itm.h, itm.x) for itm in f.items if itm.x is not None]\ - for f in pvt.pivotFields - ] - raw__selection: List[List[tuple[Optional[bool], int]]] = list(itertools.dropwhile(lambda r: len(r)==0, raw_selection)) - left_bias = len(raw_selection)-len(raw__selection) - selection: List[List[tuple[Optional[bool], int]]] = list((itertools.dropwhile(lambda r: len(r)==0, reversed(raw__selection))))[::-1] - right_bias = len(raw__selection)-len(selection) + raw_selection: List[List[tuple[Optional[bool], int]]] = \ + [[(itm.h, itm.x) for itm in f.items if itm.x is not None] \ + for f in pvt.pivotFields + ] + raw__selection: List[List[tuple[Optional[bool], int]]] = list( + itertools.dropwhile(lambda r: len(r) == 0, raw_selection)) + left_bias = len(raw_selection) - len(raw__selection) + selection: List[List[tuple[Optional[bool], int]]] = list( + (itertools.dropwhile(lambda r: len(r) == 0, reversed(raw__selection))))[::-1] + right_bias = len(raw__selection) - len(selection) cache_source: PivotCacheSource = pvt.cache.cacheSource cell_range1: str cell_range2: str cell_range1, cell_range2 = cache_source.worksheetSource.ref.split(":") cell_range1: Tuple[int, int] = coordinate_to_tuple(cell_range1) - cell_range1 = (cell_range1[0], cell_range1[1]+left_bias) + cell_range1 = (cell_range1[0], cell_range1[1] + left_bias) cell_range2: Tuple[int, int] = coordinate_to_tuple(cell_range2) - cell_range2 = (cell_range2[0], cell_range2[1]-right_bias) - source: str = "{:};{:}:{:};{:}".format(cache_source.type, cell_range1, cell_range2, cache_source.worksheetSource.sheet) + cell_range2 = (cell_range2[0], cell_range2[1] - right_bias) + source: str = "{:};{:}:{:};{:}".format(cache_source.type, cell_range1, cell_range2, + cache_source.worksheetSource.sheet) info: Dict[str, Any] = {} if "name" in pivot_props: @@ -248,22 +253,26 @@ def load_pivot_tables(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[s if "filter" in pivot_props: info["filter_fields"] = set(f.fld for f in pvt.pageFields) if "col_fields" in pivot_props: - info["col_fields"] = [f.x-left_bias for f in pvt.colFields] + info["col_fields"] = [f.x - left_bias for f in pvt.colFields] if "row_fields" in pivot_props: - info["row_fields"] = [f.x-left_bias for f in pvt.rowFields] + info["row_fields"] = [f.x - left_bias for f in pvt.rowFields] if "data_fields" in pivot_props: - info["data_fields"] = [ "{:d};{:};{:};{:}".format( f.fld-left_bias, f.name if "data_fields_name" in pivot_props else "" - , f.subtotal, f.showDataAs - )\ - for f in pvt.dataFields - ] + info["data_fields"] = [ + "{:d};{:};{:};{:}".format(f.fld - left_bias, f.name if "data_fields_name" in pivot_props else "" + , f.subtotal, f.showDataAs + ) \ + for f in pvt.dataFields + ] pivot_set[source] = info logger.debug(".[%s].pivots: %s", sheet_name, repr(pivot_set)) return pivot_set # }}} function load_pivot_tables # + _shared_str_selector = lxml.cssselect.CSSSelector("oo|sst>oo|si>oo|t", namespaces=_xlsx_ns_mapping) + + def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: # read_cell_value {{{ # try: @@ -283,20 +292,20 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f: sheet: _Element = lxml.etree.fromstring(f.read()) - cells: List[_Element] =\ - lxml.cssselect.CSSSelector( 'oo|row>oo|c[r="{:}"]'.format(coordinate) - , namespaces=_xlsx_ns_mapping - )(sheet) - if len(cells)==0: + cells: List[_Element] = \ + lxml.cssselect.CSSSelector('oo|row>oo|c[r="{:}"]'.format(coordinate) + , namespaces=_xlsx_ns_mapping + )(sheet) + if len(cells) == 0: return None cell: _Element = cells[0] except zipfile.BadZipFile: return None - cell: Dict[str, str] = xmltodict.parse( lxml.etree.tostring(cell, encoding="unicode") - , process_namespaces=True - , namespaces=_xlsx_ns_imapping - ) + cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode") + , process_namespaces=True + , namespaces=_xlsx_ns_imapping + ) logger.debug("%s.%s[%s]: %s", xlsx_file, sheet_name, coordinate, repr(cell)) if "@t" not in cell["c"]: return None @@ -308,6 +317,7 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: return cell["c"]["v"] # }}} read_cell_value # + # Supported Styles: # number_format # font_name - str @@ -322,50 +332,53 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: # fgcolor - in aRGB, e.g., FF00FFFF is yellow # hyperlink - str def _read_cell_style(style_name: str, cell: Cell, diff_style: Optional[DifferentialStyle] = None) -> Any: - if style_name=="number_format": - return (cell.number_format if diff_style is None else diff_style.numFmt.formatCode)\ - if cell.value is not None and cell.data_type=="n" else None - elif style_name=="font_name": + if style_name == "number_format": + return (cell.number_format if diff_style is None else diff_style.numFmt.formatCode) \ + if cell.value is not None and cell.data_type == "n" else None + elif style_name == "font_name": return (diff_style or cell).font.name if cell.value is not None else None - elif style_name=="font_family": + elif style_name == "font_family": return (diff_style or cell).font.family if cell.value is not None else None - elif style_name=="font_color": + elif style_name == "font_color": return (diff_style or cell).font.color.rgb if cell.value is not None else None - elif style_name=="font_bold": + elif style_name == "font_bold": return (diff_style or cell).font.bold if cell.value is not None else None - elif style_name=="font_italic": + elif style_name == "font_italic": return (diff_style or cell).font.italic if cell.value is not None else None - elif style_name=="font_underline": + elif style_name == "font_underline": return (diff_style or cell).font.underline if cell.value is not None else None - elif style_name=="font_size": + elif style_name == "font_size": return (diff_style or cell).font.size if cell.value is not None else None - elif style_name=="fill_type": + elif style_name == "fill_type": try: return (diff_style or cell).fill.tagname except: return None - elif style_name=="bgcolor": + elif style_name == "bgcolor": try: return (diff_style or cell).fill.bgColor.rgb except: return None - elif style_name=="fgcolor": + elif style_name == "fgcolor": try: return (diff_style or cell).fill.fgColor.rgb except: return None - elif style_name=="hyperlink": + elif style_name == "hyperlink": return cell.hyperlink or "" if cell.value is not None else None else: raise NotImplementedError("Unsupported Style: {:}".format(style_name)) -_absolute_range_pattern: Pattern[str] = re.compile( r"""\$(?P[A-Z]{1,3})\$(?P\d+) # coord1 + +_absolute_range_pattern: Pattern[str] = re.compile(r"""\$(?P[A-Z]{1,3})\$(?P\d+) # coord1 (?:: \$(?P[A-Z]{1,3})\$(?P\d+) # coord2 )? """ - , re.X - ) + , re.X + ) + + def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **options) -> Dict[str, List[Any]]: # function load_xlsx_styles {{{ # """ @@ -417,24 +430,24 @@ def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **opt if m[2] is None and m[3] is None: arguments.append(read_cell_value(book_name, sheet_name, coordinate="{:}{:}".format(m[0], m[1]))) else: - arguments.append( [ read_cell_value( book_name, sheet_name - , coordinate="{:}{:}".format( get_column_letter(c[1]) + arguments.append([read_cell_value(book_name, sheet_name + , coordinate="{:}{:}".format(get_column_letter(c[1]) , c[0] ) - )\ - for c in CellRange("{:}{:}:{:}{:}".format(m[0], m[1], m[2], m[3])).cells\ + ) \ + for c in CellRange("{:}{:}:{:}{:}".format(m[0], m[1], m[2], m[3])).cells \ ] - ) + ) logger.debug("Absolute range arguments: %s", repr(arguments)) for rge in fmt.cells: for c in rge.cells: cell: Cell = worksheet.cell(row=c[0], column=c[1]) - cell_value = read_cell_value( book_name, sheet_name - , coordinate="{:}{:d}".format( get_column_letter(c[1]) - , c[0] - ) - ) + cell_value = read_cell_value(book_name, sheet_name + , coordinate="{:}{:d}".format(get_column_letter(c[1]) + , c[0] + ) + ) if condition(cell_value, *arguments): logger.debug("Active Cell %s(%s) for %s", repr(cell), str(cell_value), r.formula[0]) active_cells.append(cell) @@ -448,6 +461,7 @@ def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **opt return style_dict # }}} function load_xlsx_styles # + # Available Row Properties: # hidden # collapsed @@ -460,7 +474,7 @@ def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **opt # collapsed # min # max -def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options)\ +def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options) \ -> Dict[Union[int, str], Dict[str, Any]]: # function load_rows_or_cols {{{ # """ @@ -491,6 +505,7 @@ def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options)\ return obj_set # }}} function load_rows_or_cols # + def load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]: # function load_filters {{{ # try: @@ -514,16 +529,16 @@ def load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, A filter_column["filters"] = set(flt_clm.filters.filter) if flt_clm.customFilters is not None: filter_column["custom_filters_op"] = flt_clm.customFilters._and - filter_column["custom_filters"] = set( ( flt.operator + filter_column["custom_filters"] = set((flt.operator , flt.val - )\ - for flt in flt_clm.customFilters.customFilter - ) + ) \ + for flt in flt_clm.customFilters.customFilter + ) filter_column_set.append(filter_column) - filter_column_set = list( sorted( filter_column_set + filter_column_set = list(sorted(filter_column_set , key=(lambda d: d["col_id"]) ) - ) + ) filter_dict["filter_column"] = filter_column_set # sortState @@ -534,26 +549,30 @@ def load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, A sort_state_dict["case"] = sort_state.caseSensitive sort_state_dict["method"] = sort_state.sortMethod sort_state_dict["ref"] = sort_state.ref - sort_state_dict["condition"] = list( { "descending": cdt.descending - , "key": cdt.sortBy - , "ref": cdt.ref - , "custom_list": cdt.customList - , "dxf_id": cdt.dxfId - , "icon": cdt.iconSet - , "iconid": cdt.iconId - }\ - for cdt in sort_state.sortCondition - ) + sort_state_dict["condition"] = list({"descending": cdt.descending + , "key": cdt.sortBy + , "ref": cdt.ref + , "custom_list": cdt.customList + , "dxf_id": cdt.dxfId + , "icon": cdt.iconSet + , "iconid": cdt.iconId + } \ + for cdt in sort_state.sortCondition + ) filter_dict["sort_state"] = sort_state_dict return filter_dict # }}} function load_filters # + def _match_record(pattern: Dict[str, Any], item: Dict[str, Any]) -> bool: return all(k in item and item[k] == val for k, val in pattern.items()) + def _multicellrange_containsby(subset_candidate: MultiCellRange, superset_candidate: MultiCellRange) -> bool: return all(r in superset_candidate for r in subset_candidate) + + def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool: """ Args: @@ -576,10 +595,10 @@ def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool: match_: Optional[Match[str]] = re.search(rule["ref"], value, flag) return match_ is not None - if rule["method"] in { "eq", "ne" - , "le", "lt" - , "ge", "gt" - }: + if rule["method"] in {"eq", "ne" + , "le", "lt" + , "ge", "gt" + }: return getattr(operator, rule["method"])(value, rule["ref"]) if rule["method"].startswith("approx"): threshold: float = float(rule["method"].split(":")[1]) @@ -589,26 +608,27 @@ def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool: except (ValueError, TypeError): return False else: - return abs(value-rule["ref"])<=threshold + return abs(value - rule["ref"]) <= threshold if rule["method"] == "spreadsheet_range": subset_limit = MultiCellRange(rule["ref"][0]) superset_limit = MultiCellRange(rule["ref"][1]) - return _multicellrange_containsby(subset_limit, value)\ - and _multicellrange_containsby(value, superset_limit) - if rule["method"].startswith("range."): # e.g., range.te [0, 2] -> 0 < x <= 2 + return _multicellrange_containsby(subset_limit, value) \ + and _multicellrange_containsby(value, superset_limit) + if rule["method"].startswith("range."): # e.g., range.te [0, 2] -> 0 < x <= 2 left_et = rule["method"][6] right_et = rule["method"][7] - return getattr(operator, "l" + left_et)(rule["ref"][0], value)\ - and getattr(operator, "l" + right_et)(value, rule["ref"][1]) + return getattr(operator, "l" + left_et)(rule["ref"][0], value) \ + and getattr(operator, "l" + right_et)(value, rule["ref"][1]) if rule["method"] in {"str_list_eq", "str_set_eq"}: container_type_str: str = rule["method"][4:-3] container_type = getattr(builtins, container_type_str) value: container_type = container_type(value.strip("\"'").split(",")) ref: container_type = container_type(rule["ref"]) - return value==ref + return value == ref raise NotImplementedError() + def are_lists_equal(list1, list2, comparison_func): # First check if both lists have the same length if len(list1) != len(list2): @@ -652,114 +672,3 @@ def compare_urls(url1, url2): # Compare the normalized URLs return norm_url1 == norm_url2 - - -if __name__ == "__main__": - path1 = "test.xlsx" - #path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx" - path1 = "../../任务集/SheetCopilot/dataset/task_sheet_answers_v2/BoomerangSales/2_BoomerangSales/2_BoomerangSales_gt1.xlsx" - workbook1: Workbook = openpyxl.load_workbook(filename=path1) - worksheet1: Worksheet = workbook1.active - #charts: List[ChartBase] = worksheet1._charts - # print(len(charts)) - # print(type(charts[0])) - # - # print(len(charts[0].series)) - # print(type(charts[0].series[0])) - # print(type(charts[0].series[0].val)) - ##print(charts[0].series[0].val) - # print(charts[0].series[0].val.numRef.f) - # - # print(type(charts[0].series[0].cat)) - ##print(charts[0].series[0].cat) - # print(charts[0].series[0].cat.numRef) - # print(charts[0].series[0].cat.strRef) - # print(charts[0].series[0].cat.strRef.f) - - # print(type(charts[0].title.tx.strRef)) - # print(type(charts[0].title.tx.rich)) - # print(type(charts[0].title.txPr)) - # print(len(charts[0].title.tx.rich.p)) - # print(len(charts[0].title.tx.rich.p[0].r)) - # print(type(charts[0].title.tx.rich.p[0].r[0])) - # print(type(charts[0].title.tx.rich.p[0].r[0].t)) - # print(charts[0].title.tx.rich.p[0].r[0].t) - - # print(type(charts[0].anchor)) - # print(charts[0].anchor.editAs) - # print(charts[0].anchor._from.col, charts[0].anchor.to.row) - # print(charts[0].anchor.to.col, charts[0].anchor.to.row) - - # df1 = pd.read_excel(path1) - # print(df1) - #print(load_charts(path1, chart_props=["title", "xtitle", "ytitle", "type"])) - #print(type(worksheet1["A1"].hyperlink)) - #print(worksheet1["A1"].hyperlink) - #print(worksheet1._charts[0].legend) - #print(worksheet1._charts[0].legend.position) - #for entr in worksheet1._charts[0].legend.legendEntry: - #print("Entr", entr.txPr.p[0].r[0].t) - #print(load_filters(workbook1, "工作表1")) - #print(worksheet1.auto_filter) - #for pvt in worksheet1._pivots: - ##print(type(pvt)) - ##print(pvt) - #print(type(pvt.cache)) - ##print(pvt.cache) - #print(pvt.cache.cacheSource.type) - #print(pvt.cache.cacheSource.worksheetSource.ref) - #print(pvt.cache.cacheSource.worksheetSource.sheet) -# - #print(type(pvt.location)) - #print(pvt.location) - #for f in pvt.pivotFields: - #print(type(f)) - #print([(itm.h, itm.x) for itm in f.items]) - ##for f_itm in f.items: - ##print(f_itm.n) - ##print(f_itm.t) - ##print(f_itm.h) - ##print(f_itm.s) - ##print(f_itm.sd) - ##print(f_itm.f) - ##print(f_itm.m) - ##print(f_itm.c) - ##print(f_itm.x) - ##print(f_itm.d) - ##print(f_itm.e) - ##print(f.countASubtotal) - ##print(f.countSubtotal) - ##for f in pvt.dataFields: - ##print(f.name) - ##print(f.fld) - ###print(f.baseField) - ##print(f.subtotal) - ##print(f.showDataAs) - ##for f in pvt.rowFields: - ##print(1, f.x) - ##for f in pvt.rowItems: - ##print(2, f.t, f.r, f.i, f.x) - ##for f in pvt.colFields: - ##print(3, f.x) - ##for f in pvt.colItems: - ##print(4, f.t, f.r, f.i, f.x) - #for f in pvt.pageFields: - #print(5, f.fld) - #for flt in pvt.filters: - #print(5, flt.fld) - #print(6, flt.mpFld) - #print(7, flt.type) - #print(8, flt.evalOrder) - #print(9, flt.id) - #print(10, flt.stringValue1) - #print(11, flt.stringValue2) - #print(load_charts(workbook1, "Sheet2", chart_props=["title", "type", "legend"])) - #print(load_filters(workbook1, "透视表_工作表1_1")) - #workbook1.save("test2.xlsx") - print( load_pivot_tables( workbook1, "Sheet2", pivot_props=[ "col_fields" - , "filter" - , "row_fields" - , "data_fields" - ] - ) - )