"""Helpers for inspecting xlsx workbooks (sparklines, charts), matching values
against rule dicts, and comparing URLs."""

import builtins
import functools
import logging
import operator
import re
import zipfile
from typing import Any, TypeVar, Union, Iterable, Optional
from typing import Dict, List, Set, Match
from urllib.parse import urlparse, urlunparse

import lxml.cssselect
import lxml.etree
import openpyxl
import xmltodict
from lxml.etree import _Element
from openpyxl import Workbook
from openpyxl.chart._chart import ChartBase
from openpyxl.worksheet.cell_range import MultiCellRange
from openpyxl.worksheet.worksheet import Worksheet

# The TypeVar name must match the variable it is bound to.
V = TypeVar("V")

logger = logging.getLogger("desktopenv.metrics.utils")

# (prefix, namespace URI) pairs used both for CSS selection and for xmltodict.
_xlsx_namespaces = [
    ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
    ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
    ("xm", "http://schemas.microsoft.com/office/excel/2006/main"),
]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
# Inverse mapping (URI -> prefix), as expected by xmltodict's `namespaces`.
_xlsx_ns_imapping = {uri: prefix for prefix, uri in _xlsx_namespaces}
_sheet_name_selector = lxml.cssselect.CSSSelector("oo|sheets>oo|sheet", namespaces=_xlsx_ns_mapping)
_sparklines_selector = lxml.cssselect.CSSSelector("x14|sparkline", namespaces=_xlsx_ns_mapping)


def load_sparklines(xlsx_file: str, sheet_name: str) -> Dict[str, str]:
    """
    Read the sparkline definitions of one worksheet directly from the xlsx
    archive.

    Args:
        xlsx_file (str): path to xlsx
        sheet_name (str): sheet name

    Returns:
        Dict[str, str]: sparkline definitions in form of
          {
            "F3": "Sheet1!C3:E3"
          }

    Raises:
        KeyError: if `sheet_name` is not listed in the workbook, or the
          worksheet part is not present in the archive.
    """

    # read xlsx
    with zipfile.ZipFile(xlsx_file, "r") as z_f:
        with z_f.open("xl/workbook.xml") as f:
            workbook_database: _Element = lxml.etree.fromstring(f.read())
        sheets: List[_Element] = _sheet_name_selector(workbook_database)
        sheet_ids: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets}

        # NOTE(review): the worksheet part name is derived from `sheetId`;
        # this assumes the part is stored as sheet<sheetId>.xml -- confirm
        # for workbooks whose sheets were reordered or deleted.
        with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_ids[sheet_name])) as f:
            sheet: _Element = lxml.etree.fromstring(f.read())

    sparklines: List[_Element] = _sparklines_selector(sheet)

    sparklines_dict: Dict[str, str] = {}
    for sp_l in sparklines:
        sparkline_xml: str = lxml.etree.tostring(sp_l, encoding="unicode")
        sparkline: Dict[str, Dict[str, str]] = xmltodict.parse(
            sparkline_xml,
            process_namespaces=True,
            namespaces=_xlsx_ns_imapping,
        )
        definition = sparkline["x14:sparkline"]
        sparklines_dict[definition["xm:sqref"]] = definition["xm:f"]
    return sparklines_dict


def _reference_formula(data_source: Any, ref_kind: str) -> Any:
    """Return `data_source.<ref_kind>.f`, or "" when either attribute is absent."""
    return getattr(getattr(data_source, ref_kind, None), "f", "")


def _title_text(titled: Any) -> Optional[str]:
    """Return the first rich-text run of `titled.title`, or None when there is none."""
    try:
        return titled.title.tx.rich.p[0].r[0].t
    except (AttributeError, IndexError, TypeError):
        # Missing object/title, empty paragraph or run list, or a None list.
        return None


# Available Chart Properties:
# title: str
# anchor: ["oneCell" | "twoCell" | "absolute", col0, row0, col1, row1]
# width: number
# height: number
# type: "scatterChart" | "lineChart" | "barChart"
# direction: "bar" (hori) | "col" (vert)
# xtitle, ytitle, ztitle: str
def load_charts(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]:
    """
    Collect the requested properties of every chart on a worksheet.

    Args:
        xlsx_file (Workbook): concerned excel book
        sheet_name (str): sheet name
        options (Dict[str, List[str]]): dict like {"chart_props": list of str}
          giving the concerned chart properties

    Returns:
        Dict[str, Any]: information of charts, keyed by a string describing
          the chart's series: one "value_num,value_str,categ_num,categ_str"
          entry per series, joined by ";". Title properties are None when the
          chart (or axis) has no rich-text title.
    """

    worksheet: Worksheet = xlsx_file[sheet_name]
    charts: List[ChartBase] = worksheet._charts

    chart_props: Set[str] = set(options.get("chart_props", ()))

    chart_set: Dict[str, Any] = {}
    for ch in charts:
        series_refs: List[str] = []
        for ser in ch.series:
            series_refs.append(
                "{:},{:},{:},{:}".format(
                    _reference_formula(ser.val, "numRef"),
                    _reference_formula(ser.val, "strRef"),
                    _reference_formula(ser.cat, "numRef"),
                    _reference_formula(ser.cat, "strRef"),
                )
            )
        series_key: str = ";".join(series_refs)

        # TODO: maybe more aspects, like chart type
        info: Dict[str, Any] = {}

        if "title" in chart_props:
            info["title"] = _title_text(ch)
        if "anchor" in chart_props:
            # [editAs, col0, row0, col1, row1]: the start row comes from
            # `_from`, not from `to`.
            info["anchor"] = [
                ch.anchor.editAs,
                ch.anchor._from.col, ch.anchor._from.row,
                ch.anchor.to.col, ch.anchor.to.row,
            ]
        if "width" in chart_props:
            info["width"] = ch.width
        if "height" in chart_props:
            info["height"] = ch.height
        if "type" in chart_props:
            info["type"] = ch.tagname
        if "direction" in chart_props:
            info["direction"] = ch.barDir

        if "xtitle" in chart_props:
            info["xtitle"] = _title_text(getattr(ch, "x_axis", None))
        if "ytitle" in chart_props:
            info["ytitle"] = _title_text(getattr(ch, "y_axis", None))
        if "ztitle" in chart_props:
            info["ztitle"] = _title_text(getattr(ch, "z_axis", None))
        chart_set[series_key] = info
    return chart_set


def _match_record(pattern: Dict[str, Any], item: Dict[str, Any]) -> bool:
    """Return True if every key of `pattern` is in `item` with an equal value."""
    return all(k in item and item[k] == val for k, val in pattern.items())


def _multicellrange_containsby(subset_candidate: MultiCellRange, superset_candidate: MultiCellRange) -> bool:
    """Return True if every range of `subset_candidate` is in `superset_candidate`."""
    return all(r in superset_candidate for r in subset_candidate)


def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool:
    """
    Check `value` against a rule.

    Supported methods:
      * "re" / "re.<FLAG>[.<FLAG>...]": `re.search` of pattern `ref` in value
      * "eq", "ne", "le", "lt", "ge", "gt": `operator.<method>(value, ref)`
      * "spreadsheet_range": ref[0] within value and value within ref[1]
      * "range.<l><r>": l, r in {"t", "e"}, e.g. range.te [0, 2] -> 0 < x <= 2
      * "str_list_eq", "str_set_eq": value is a comma-separated string
        compared with `ref` as a list / set

    Args:
        value (V): value to match
        rule (Dict[str, Union[str, V]]): rule dict like
          {
            "method": str
            "ref": V as ref value
          }

    Returns:
        bool

    Raises:
        NotImplementedError: if the method is not one of the above.
    """

    method: str = rule["method"]

    if method.startswith("re"):
        flag_names: List[str] = method.split(".")[1:]
        flags: Iterable[re.RegexFlag] = (getattr(re, name) for name in flag_names)
        flag: re.RegexFlag = functools.reduce(operator.or_, flags, re.RegexFlag(0))
        logger.debug("REFLAG: %s", repr(flag))

        match_: Optional[Match[str]] = re.search(rule["ref"], value, flag)
        return match_ is not None
    if method in {"eq", "ne", "le", "lt", "ge", "gt"}:
        return getattr(operator, method)(value, rule["ref"])
    if method == "spreadsheet_range":
        subset_limit = MultiCellRange(rule["ref"][0])
        superset_limit = MultiCellRange(rule["ref"][1])
        return _multicellrange_containsby(subset_limit, value) \
            and _multicellrange_containsby(value, superset_limit)
    if method.startswith("range."):  # e.g., range.te [0, 2] -> 0 < x <= 2
        left_et = method[6]
        right_et = method[7]
        return getattr(operator, "l" + left_et)(rule["ref"][0], value) \
            and getattr(operator, "l" + right_et)(value, rule["ref"][1])
    if method in {"str_list_eq", "str_set_eq"}:
        container_type = {"str_list_eq": list, "str_set_eq": set}[method]
        # Surrounding quotes are stripped before splitting on commas.
        parsed_value = container_type(value.strip("\"'").split(","))
        ref = container_type(rule["ref"])
        return parsed_value == ref
    raise NotImplementedError("Unsupported rule method: {!r}".format(method))


def are_lists_equal(list1, list2, comparison_func):
    """
    Return True if the two lists hold pairwise-equal items, in any order.

    Each item of `list2` can be matched by at most one item of `list1`, so
    duplicates are counted ([a, a] does not equal [a, b]). The matching is
    greedy, which is exact when `comparison_func` behaves like an
    equivalence relation.

    Args:
        list1, list2: the lists to compare
        comparison_func: callable (item1, item2) -> bool
    """
    # First check if both lists have the same length
    if len(list1) != len(list2):
        return False

    unmatched = list(list2)
    for item1 in list1:
        for idx, item2 in enumerate(unmatched):
            # Use the supplied function to test for an equal item
            if comparison_func(item1, item2):
                del unmatched[idx]
                break
        else:
            return False

    # If all items match, the lists are equal
    return True


def compare_urls(url1, url2):
    """
    Return True if the two URLs are equal after normalization.

    Normalization assumes "http" when no scheme is given, lowercases the
    scheme and network location, drops a leading "www." host label and
    treats a bare "/" path as empty. If either URL is None, the result is
    True only when both are None.
    """
    if url1 is None or url2 is None:
        return url1 == url2

    def normalize_url(url):
        parsed_url = urlparse(url)
        if not parsed_url.scheme and not parsed_url.netloc:
            # Without a scheme or a leading "//", urlparse puts the host into
            # the path; reparse so that it lands in netloc.
            parsed_url = urlparse("//" + url)

        # If no scheme is present, assume 'http'
        scheme = parsed_url.scheme.lower() if parsed_url.scheme else 'http'

        # Strip "www." only as the leading host label, keeping any userinfo.
        userinfo, at_sign, host = parsed_url.netloc.lower().rpartition("@")
        if host.startswith("www."):
            host = host[len("www."):]
        normalized_netloc = userinfo + at_sign + host

        normalized_path = parsed_url.path if parsed_url.path != '/' else ''

        # Reassemble the URL with normalized components
        normalized_parsed_url = parsed_url._replace(scheme=scheme, netloc=normalized_netloc,
                                                    path=normalized_path)
        return urlunparse(normalized_parsed_url)

    # Compare the normalized URLs
    return normalize_url(url1) == normalize_url(url2)


if __name__ == "__main__":
    path1 = "../../../../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold_line_scatter.xlsx"
    workbook1: Workbook = openpyxl.load_workbook(filename=path1)
    worksheet1: Worksheet = workbook1.active

    # load_charts takes the loaded workbook plus a sheet name, not a file path.
    print(load_charts(workbook1, worksheet1.title, chart_props=["title", "xtitle", "ytitle", "type"]))