Files
sci-gui-agent-benchmark/desktop_env/evaluators/metrics/utils.py
David Chang 93229ce98c ver Jan22ndv3
updated style metric to compare_table
2024-01-22 23:45:15 +08:00

412 lines
16 KiB
Python

import logging
import zipfile
from typing import Any, TypeVar, Union, Iterable, Optional, Callable
from typing import Dict, List, Set, Match
from urllib.parse import urlparse, urlunparse
import re
import functools
import operator
import builtins
import lxml.cssselect
import lxml.etree
import openpyxl
import xmltodict
from lxml.etree import _Element
from openpyxl import Workbook
from openpyxl.chart._chart import ChartBase
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.worksheet.cell_range import MultiCellRange
from openpyxl.worksheet.dimensions import DimensionHolder
from openpyxl.formatting.formatting import ConditionalFormattingList
#from openpyxl.utils import get_column_letter
from openpyxl.cell.cell import Cell
from openpyxl.styles.differential import DifferentialStyle
import formulas
V = TypeVar("Value")
logger = logging.getLogger("desktopenv.metrics.utils")
_xlsx_namespaces = [ ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
, ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
, ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
_sheet_name_selector = lxml.cssselect.CSSSelector("oo|sheets>oo|sheet", namespaces=_xlsx_ns_mapping)
_sparklines_selector = lxml.cssselect.CSSSelector("x14|sparkline", namespaces=_xlsx_ns_mapping)
def load_sparklines(xlsx_file: str, sheet_name: str) -> Dict[str, str]:
# function load_sparklines {{{ #
"""
Args:
xlsx_file (str): path to xlsx
sheet_name (str): sheet name
Returns:
List[Dict[str, str]]: sparkline definitions in form of
{
"F3": "Sheet1!C3:E3"
}
"""
# read xlsx
with zipfile.ZipFile(xlsx_file, "r") as z_f:
with z_f.open("xl/workbook.xml") as f:
workbook_database: _Element = lxml.etree.fromstring(f.read())
sheets: List[_Element] = _sheet_name_selector(workbook_database)
sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets}
with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f:
sheet: _Element = lxml.etree.fromstring(f.read())
sparklines: List[_Element] = _sparklines_selector(sheet)
sparklines_dict: Dict[str, str] = {}
for sp_l in sparklines:
sparkline_xml: str = lxml.etree.tostring(sp_l, encoding="unicode")
sparkline: Dict[str, Dict[str, str]] = xmltodict.parse(sparkline_xml
, process_namespaces=True
, namespaces=_xlsx_ns_imapping
)
sparklines_dict[sparkline["x14:sparkline"]["xm:sqref"]] = sparkline["x14:sparkline"]["xm:f"]
return sparklines_dict
# }}} function load_sparklines #
# Available Chart Properties:
# title: str
# anchor: ["oneCell" | "twoCell" | "absolute", col0, row0, col1, row1]
# width: number
# height: number
# type: "scatterChart" | "lineChart" | "barChart"
# direction: "bar" (hori) | "col" (vert)
# xtitle, ytitle, ztitle: str
def load_charts(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]:
# function load_charts {{{ #
"""
Args:
xlsx_file (Workbook): concerned excel book
sheet_name (str): sheet name
options (Dict[str, List[str]]): dict like {"chart_props": list of str}
giving the concerned chart properties
Returns:
Dict[str, Any]: information of charts, dict like
{
<str representing data source>: {
<str as property>: anything
}
}
"""
# workbook: Workbook = openpyxl.load_workbook(filename=xlsx_file)
worksheet: Worksheet = xlsx_file[sheet_name]
charts: List[ChartBase] = worksheet._charts
chart_set: Dict[str, Any] = {}
chart_props: Set[str] = set(options["chart_props"]) if "chart_props" in options else set()
for ch in charts:
series: List[str] = []
for ser in ch.series:
value_num = ser.val.numRef.f \
if hasattr(ser.val, "numRef") and hasattr(ser.val.numRef, "f") \
else ""
value_str = ser.val.strRef.f \
if hasattr(ser.val, "strRef") and hasattr(ser.val.strRef, "f") \
else ""
categ_num = ser.cat.numRef.f \
if hasattr(ser.cat, "numRef") and hasattr(ser.cat.numRef, "f") \
else ""
categ_str = ser.cat.strRef.f \
if hasattr(ser.cat, "strRef") and hasattr(ser.cat.strRef, "f") \
else ""
series.append("{:},{:},{:},{:}".format(value_num, value_str
, categ_num, categ_str
)
)
series: str = ";".join(series)
# TODO: maybe more aspects, like chart type
info: Dict[str, Any] = {}
if "title" in chart_props:
info["title"] = ch.title.tx.rich.p[0].r[0].t
if "anchor" in chart_props:
info["anchor"] = [ch.anchor.editAs
, ch.anchor._from.col, ch.anchor.to.row
, ch.anchor.to.col, ch.anchor.to.row
]
if "width" in chart_props:
info["width"] = ch.width
if "height" in chart_props:
info["height"] = ch.height
if "type" in chart_props:
info["type"] = ch.tagname
if "direction" in chart_props:
info["direction"] = ch.barDir
if "xtitle" in chart_props:
info["xtitle"] = ch.x_axis.title.tx.rich.p[0].r[0].t
if "ytitle" in chart_props:
info["ytitle"] = ch.y_axis.title.tx.rich.p[0].r[0].t
if "ztitle" in chart_props:
info["ztitle"] = ch.z_axis.title.tx.rich.p[0].r[0].t
chart_set[series] = info
return chart_set
# }}} function load_charts #
# Supported Styles:
# number_format
# font_name - str
# font_family - float
# font_color - in aRGB, e.g., FF000000 is black
# font_bold - bool
# font_italic - bool
# fill_type - "patternFill" | "gradientFill"
# bgcolor - in aRGB, e.g., FFFF0000 is red
# fgcolor - in aRGB, e.g., FF00FFFF is yellow
def _read_cell_style(style_name: str, cell: Cell, diff_style: Optional[DifferentialStyle] = None) -> Any:
if style_name=="number_format":
return (cell.number_format if diff_style is None else diff_style.numFmt.formatCode)\
if cell.value is not None and cell.data_type=="n" else None
elif style_name=="font_name":
return (diff_style or cell).font.name if cell.value is not None else None
elif style_name=="font_family":
return (diff_style or cell).font.family if cell.value is not None else None
elif style_name=="font_color":
return (diff_style or cell).font.color.rgb if cell.value is not None else None
elif style_name=="font_bold":
return (diff_style or cell).font.bold if cell.value is not None else None
elif style_name=="font_italic":
return (diff_style or cell).font.italic if cell.value is not None else None
elif style_name=="fill_type":
return (diff_style or cell).fill.tagname
elif style_name=="bgcolor":
return (diff_style or cell).fill.bgColor.rgb
elif style_name=="fgcolor":
return (diff_style or cell).fill.fgColor.rgb
else:
raise NotImplementedError("Unsupported Style: {:}".format(style_name))
def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, List[Any]]:
# function load_xlsx_styles {{{ #
"""
Args:
xlsx_file (Workbook): concerned excel book
sheet_name (str): sheet name
options (Dict[str, List[str]): dick like {"props": list of str} giving
the concerned styles
Returns:
Dict[str, List[Any]]: dict like
{
<str as cell coordinates>: list of anything indicating concerned
property values
}
"""
worksheet: Worksheet = xlsx_file[sheet_name]
style_dict: Dict[str, List[Any]] = {}
concerned_styles: List[str] = options.get("props", [])
# Handles Cell Styles
for col in worksheet.iter_cols():
for c in col:
style_list: List[Any] = []
for st in concerned_styles:
style_list.append(_read_cell_style(st, c))
style_dict[c.coordinate] = style_list
# Handles Conditional Formatting
conditional_formattings: ConditionalFormattingList = worksheet.conditional_formatting
formula_parser = formulas.Parser()
for fmt in conditional_formattings:
for r in fmt.rules:
active_cells: List[Cell] = []
if r.type == "expression":
condition: Callable[[str], bool] = formula_parser.ast("=" + r.formula[0])[1].compile()
for rge in fmt.cells:
for c in rge.cells:
cell: Cell = worksheet.cell(row=c[0], column=c[1])
if condition(str(cell.value)):
active_cells.append(cell)
else:
raise NotImplementedError("Not Implemented Condition Type: {:}".format(r.type))
for c in active_cells:
style_dict[c.coordinate] = [_read_cell_style(st, c, r.dxf) for st in concerned_styles]
return style_dict
# }}} function load_xlsx_styles #
# Available Row Properties:
# hidden
# collapsed
# height
#
# Available Column Properties:
# width
# auto_size
# hidden
# collapsed
# min
# max
def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options)\
-> Dict[Union[int, str], Dict[str, Any]]:
# function load_rows_or_cols {{{ #
"""
Args:
xlsx_file (Workbook): concerned excel book
sheet_name (str): sheet name
options (Dict[str, List[str]]): dict like
{"obj": "row" | "column", "props": list of str} giving the concerned
row/column properties
Returns:
Dict[Union[int, str], Dict[str, Any]]: row/column information
"""
worksheet: Worksheet = xlsx_file[sheet_name]
objs: DimensionHolder = getattr(worksheet, "{:}_dimensions".format(options["obj"]))
obj_set: Dict[int, Any] = {}
obj_props: Set[str] = set(options.get("props", []))
for obj_no, obj_dms in objs.items():
info_dict: Dict[str, Any] = {}
for prop in obj_props:
info_dict[prop] = getattr(obj_dms, prop)
obj_set[obj_no] = info_dict
return obj_set
# }}} function load_rows_or_cols #
def _match_record(pattern: Dict[str, Any], item: Dict[str, Any]) -> bool:
return all(k in item and item[k] == val for k, val in pattern.items())
def _multicellrange_containsby(subset_candidate: MultiCellRange, superset_candidate: MultiCellRange) -> bool:
return all(r in superset_candidate for r in subset_candidate)
def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool:
"""
Args:
value (V): value to match
rule (Dict[str, Union[str, V]]): rule dict like
{
"method": str
"ref": V as ref value
}
Returns:
bool
"""
if rule["method"].startswith("re"):
flags: List[str] = rule["method"].split(".")[1:]
flags: Iterable[re.RegexFlag] = (getattr(re, fl) for fl in flags)
flag: re.RegexFlag = functools.reduce(operator.or_, flags, re.RegexFlag(0))
logger.debug("REFLAG: %s", repr(flag))
match_: Optional[Match[str]] = re.search(rule["ref"], value, flag)
return match_ is not None
if rule["method"] in { "eq", "ne"
, "le", "lt"
, "ge", "gt"
}:
return getattr(operator, rule["method"])(value, rule["ref"])
if rule["method"] == "spreadsheet_range":
subset_limit = MultiCellRange(rule["ref"][0])
superset_limit = MultiCellRange(rule["ref"][1])
return _multicellrange_containsby(subset_limit, value)\
and _multicellrange_containsby(value, superset_limit)
if rule["method"].startswith("range."): # e.g., range.te [0, 2] -> 0 < x <= 2
left_et = rule["method"][6]
right_et = rule["method"][7]
return getattr(operator, "l" + left_et)(rule["ref"][0], value)\
and getattr(operator, "l" + right_et)(value, rule["ref"][1])
if rule["method"] in {"str_list_eq", "str_set_eq"}:
container_type_str: str = rule["method"][4:-3]
container_type = getattr(builtins, container_type_str)
value: container_type = container_type(value.strip("\"'").split(","))
ref: container_type = container_type(rule["ref"])
return value==ref
raise NotImplementedError()
def are_lists_equal(list1, list2, comparison_func):
# First check if both lists have the same length
if len(list1) != len(list2):
return False
# Now make sure each element in one list has an equal element in the other list
for item1 in list1:
# Use the supplied function to test for an equal item
if not any(comparison_func(item1, item2) for item2 in list2):
return False
# If all items match, the lists are equal
return True
def compare_urls(url1, url2):
def normalize_url(url):
# Parse the URL
parsed_url = urlparse(url)
# If no scheme is present, assume 'http'
scheme = parsed_url.scheme if parsed_url.scheme else 'http'
# Lowercase the scheme and netloc, remove 'www.', and handle trailing slash
normalized_netloc = parsed_url.netloc.lower().replace("www.", "")
normalized_path = parsed_url.path if parsed_url.path != '/' else ''
# Reassemble the URL with normalized components
normalized_parsed_url = parsed_url._replace(scheme=scheme.lower(), netloc=normalized_netloc,
path=normalized_path)
normalized_url = urlunparse(normalized_parsed_url)
return normalized_url
# Normalize both URLs for comparison
norm_url1 = normalize_url(url1)
norm_url2 = normalize_url(url2)
# Compare the normalized URLs
return norm_url1 == norm_url2
if __name__ == "__main__":
path1 = "../../../../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold_line_scatter.xlsx"
workbook1: Workbook = openpyxl.load_workbook(filename=path1)
worksheet1: Worksheet = workbook1.active
charts: List[ChartBase] = worksheet1._charts
# print(len(charts))
# print(type(charts[0]))
#
# print(len(charts[0].series))
# print(type(charts[0].series[0]))
# print(type(charts[0].series[0].val))
##print(charts[0].series[0].val)
# print(charts[0].series[0].val.numRef.f)
#
# print(type(charts[0].series[0].cat))
##print(charts[0].series[0].cat)
# print(charts[0].series[0].cat.numRef)
# print(charts[0].series[0].cat.strRef)
# print(charts[0].series[0].cat.strRef.f)
# print(type(charts[0].title.tx.strRef))
# print(type(charts[0].title.tx.rich))
# print(type(charts[0].title.txPr))
# print(len(charts[0].title.tx.rich.p))
# print(len(charts[0].title.tx.rich.p[0].r))
# print(type(charts[0].title.tx.rich.p[0].r[0]))
# print(type(charts[0].title.tx.rich.p[0].r[0].t))
# print(charts[0].title.tx.rich.p[0].r[0].t)
# print(type(charts[0].anchor))
# print(charts[0].anchor.editAs)
# print(charts[0].anchor._from.col, charts[0].anchor.to.row)
# print(charts[0].anchor.to.col, charts[0].anchor.to.row)
# df1 = pd.read_excel(path1)
# print(df1)
print(load_charts(path1, chart_props=["title", "xtitle", "ytitle", "type"]))