# sci-gui-agent-benchmark/desktop_env/evaluators/metrics/utils.py

import logging
import zipfile
from typing import Any
from typing import Dict, List, Set
from urllib.parse import urlparse, urlunparse

import lxml.cssselect
import lxml.etree
import openpyxl
import xmltodict
from lxml.etree import _Element
from openpyxl import Workbook
from openpyxl.chart._chart import ChartBase
from openpyxl.worksheet.worksheet import Worksheet

logger = logging.getLogger("desktopenv.metrics.utils")

# (prefix, URI) pairs for the XML namespaces referenced when selecting and
# parsing sparkline elements.
_xlsx_namespaces = [
    ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
    ("xm", "http://schemas.microsoft.com/office/excel/2006/main"),
]
# prefix -> URI, handed to the CSS selector.
_xlsx_ns_mapping = {prefix: uri for prefix, uri in _xlsx_namespaces}
# URI -> prefix, handed to xmltodict so parsed keys read "prefix:tag".
_xlsx_ns_imapping = {uri: prefix for prefix, uri in _xlsx_namespaces}
_sparklines_selector = lxml.cssselect.CSSSelector(
    "x14|sparkline",
    namespaces=_xlsx_ns_mapping,
)


# print(_sparklines_selector.css)
def load_sparklines(xlsx_file: str) -> Dict[str, str]:
    """
    Read the sparkline definitions stored in "xl/worksheets/sheet1.xml" of
    the given xlsx archive.

    Args:
        xlsx_file (str): path to xlsx

    Returns:
        Dict[str, str]: sparkline definitions mapping each sparkline's
          "xm:sqref" value to its "xm:f" value, in form of
          {
            "F3": "Sheet1!C3:E3"
          }
    """

    # An xlsx file is a zip archive; pull the worksheet XML straight out of it.
    with zipfile.ZipFile(xlsx_file, "r") as archive:
        with archive.open("xl/worksheets/sheet1.xml") as sheet_stream:
            sheet_root: _Element = lxml.etree.fromstring(sheet_stream.read())
            sparkline_nodes: List[_Element] = _sparklines_selector(sheet_root)

    definitions: Dict[str, str] = {}
    for node in sparkline_nodes:
        # Round-trip each element through text so xmltodict can turn it into
        # a dict whose keys carry the short namespace prefixes.
        parsed: Dict[str, Dict[str, str]] = xmltodict.parse(
            lxml.etree.tostring(node, encoding="unicode"),
            process_namespaces=True,
            namespaces=_xlsx_ns_imapping,
        )
        entry = parsed["x14:sparkline"]
        definitions[entry["xm:sqref"]] = entry["xm:f"]
    return definitions


def _ref_formula(data_source: Any, ref_name: str) -> Any:
    """Return ``data_source.<ref_name>.f``, or "" if either attribute is missing."""
    reference = getattr(data_source, ref_name, None)
    return getattr(reference, "f", "")


def _title_text(title: Any) -> Any:
    """Return the text of the first run in the first paragraph of a rich-text title."""
    return title.tx.rich.p[0].r[0].t


def load_charts(xlsx_file: Workbook, **options) -> Dict[str, Any]:
    """
    Collect information about the charts on the active worksheet of a workbook.

    Every chart is keyed by a string built from its series: each series
    contributes "value_num,value_str,categ_num,categ_str" (the `f` formulas
    of `val.numRef`, `val.strRef`, `cat.numRef` and `cat.strRef`, with ""
    where a reference is missing) and the series entries are joined by ";".
    Charts that produce the same key overwrite each other.

    Available chart properties (names accepted in "chart_props"):
        title: str
        anchor: ["oneCell" | "twoCell" | "absolute", col0, row0, col1, row1]
        width: number
        height: number
        type: "scatterChart" | "lineChart" | "barChart"
        direction: "bar" (hori) | "col" (vert)
        xtitle, ytitle, ztitle: str

    Args:
        xlsx_file (Workbook): concerned excel book
        options (Dict[str, List[str]]): dict like {"chart_props": list of str}
          giving the concerned chart properties; when absent, no property is
          extracted and every chart maps to an empty dict

    Returns:
        Dict[str, Any]: information of charts, mapping the series key of each
          chart to a dict holding the requested properties

    Raises:
        AttributeError: if a requested property is read through an attribute
          that the chart does not provide (e.g. a missing title).
    """

    worksheet: Worksheet = xlsx_file.active
    charts: List[ChartBase] = worksheet._charts

    chart_props: Set[str] = set(options.get("chart_props", ()))
    chart_set: Dict[str, Any] = {}
    for ch in charts:
        series_parts: List[str] = []
        for ser in ch.series:
            value_num = _ref_formula(ser.val, "numRef")
            value_str = _ref_formula(ser.val, "strRef")
            categ_num = _ref_formula(ser.cat, "numRef")
            categ_str = _ref_formula(ser.cat, "strRef")
            series_parts.append(f"{value_num},{value_str},{categ_num},{categ_str}")
        series_key: str = ";".join(series_parts)

        # TODO: maybe more aspects, like chart type
        info: Dict[str, Any] = {}

        if "title" in chart_props:
            info["title"] = _title_text(ch.title)
        if "anchor" in chart_props:
            anchor = ch.anchor
            # Layout is [editAs, col0, row0, col1, row1]: the first pair comes
            # from the `_from` marker, the second pair from the `to` marker.
            info["anchor"] = [
                anchor.editAs,
                anchor._from.col,
                anchor._from.row,
                anchor.to.col,
                anchor.to.row,
            ]
        if "width" in chart_props:
            info["width"] = ch.width
        if "height" in chart_props:
            info["height"] = ch.height
        if "type" in chart_props:
            info["type"] = ch.tagname
        if "direction" in chart_props:
            info["direction"] = ch.barDir

        if "xtitle" in chart_props:
            info["xtitle"] = _title_text(ch.x_axis.title)
        if "ytitle" in chart_props:
            info["ytitle"] = _title_text(ch.y_axis.title)
        if "ztitle" in chart_props:
            info["ztitle"] = _title_text(ch.z_axis.title)
        chart_set[series_key] = info
    return chart_set


def _match_record(pattern: Dict[str, Any], item: Dict[str, Any]) -> bool:
    return all(k in item and item[k] == val for k, val in pattern.items())


def are_lists_equal(list1, list2, comparison_func):
    """
    Check whether two lists hold the same elements, ignoring order.

    Elements are paired one-to-one: each element of `list2` may serve as the
    match for at most one element of `list1`, so duplicates must occur the
    same number of times on both sides.

    Args:
        list1: first list
        list2: second list
        comparison_func: callable invoked as comparison_func(item1, item2)
          with item1 from `list1` and item2 from `list2`; a truthy result
          means the two items are considered equal

    Returns:
        bool: True if the lists have the same length and every element of
          `list1` can be paired with a distinct element of `list2`
    """
    # First check if both lists have the same length
    if len(list1) != len(list2):
        return False

    # Work on a copy so matched elements can be consumed without touching
    # the caller's list.
    unmatched = list(list2)
    for item1 in list1:
        for idx, item2 in enumerate(unmatched):
            # Use the supplied function to test for an equal item
            if comparison_func(item1, item2):
                # Consume the partner so it cannot be matched a second time.
                del unmatched[idx]
                break
        else:
            # No remaining element of list2 matches item1.
            return False

    # If all items match, the lists are equal
    return True


def compare_urls(url1, url2):
    """
    Compare two URLs after normalizing them.

    Normalization assumes 'http' when no scheme is given, lowercases the
    scheme and the network location, drops a leading "www." from the network
    location, and treats a path of "/" the same as an empty path. Params,
    query and fragment are compared as they are.

    Args:
        url1 (str): first URL
        url2 (str): second URL

    Returns:
        bool: True if both URLs are identical after normalization
    """

    def normalize_url(url):
        # urlparse only recognizes a network location after "//"; without it
        # the host would land in `path`. Supply the assumed 'http' scheme up
        # front so scheme-less URLs are split the same way as full ones.
        if url.startswith("//"):
            url = "http:" + url
        elif "://" not in url:
            url = "http://" + url

        # Parse the URL
        parsed_url = urlparse(url)

        # If no scheme is present, assume 'http'
        scheme = parsed_url.scheme.lower() if parsed_url.scheme else "http"

        # Lowercase the netloc and remove only a leading 'www.' -- an
        # occurrence elsewhere in the host is part of the host name.
        normalized_netloc = parsed_url.netloc.lower()
        if normalized_netloc.startswith("www."):
            normalized_netloc = normalized_netloc[len("www."):]

        # A bare trailing slash ("http://host/") equals no path at all.
        normalized_path = parsed_url.path if parsed_url.path != "/" else ""

        # Reassemble the URL with normalized components
        return urlunparse(
            parsed_url._replace(
                scheme=scheme,
                netloc=normalized_netloc,
                path=normalized_path,
            )
        )

    # Compare the normalized URLs
    return normalize_url(url1) == normalize_url(url2)


if __name__ == "__main__":
    # Manual smoke check: load a sample workbook and print selected chart
    # properties of its active worksheet.
    path1 = "../../../../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold_line_scatter.xlsx"
    workbook1: Workbook = openpyxl.load_workbook(filename=path1)
    # load_charts reads `.active` from its argument, so it must receive the
    # loaded workbook rather than the path string.
    print(load_charts(workbook1, chart_props=["title", "xtitle", "ytitle", "type"]))