sci-gui-agent-benchmark/desktop_env/evaluators/metrics/table.py

import pandas as pd
import zipfile
import lxml.etree
import lxml.cssselect
from lxml.etree import _Element
import xmltodict
#import pylightxl
import openpyxl
from openpyxl import Workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.chart._chart import ChartBase

from typing import Dict, List
from typing import Any

def compare_table(actual, expected):
    """
    Compare the contents of two spreadsheets as loaded by pandas.

    Args:
        actual: path (or anything pd.read_excel accepts) of the file under test
        expected: path (or anything pd.read_excel accepts) of the reference file

    Returns:
        int: 1 if both files load into equal DataFrames, 0 otherwise
    """

    # the reference file is loaded first, then the file under test
    expected_frame = pd.read_excel(expected)
    actual_frame = pd.read_excel(actual)

    return int(expected_frame.equals(actual_frame))

# (prefix, URI) pairs of the XML namespaces needed to locate sparklines
_xlsx_namespaces = [
    ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
    ("xm", "http://schemas.microsoft.com/office/excel/2006/main"),
]
# prefix -> URI, used to resolve the prefix in the CSS selector below
_xlsx_ns_mapping = {prefix: uri for prefix, uri in _xlsx_namespaces}
# URI -> prefix, passed to xmltodict so that parsed keys read "prefix:tag"
_xlsx_ns_imapping = {uri: prefix for prefix, uri in _xlsx_namespaces}
# selects every <x14:sparkline> element below the node it is applied to
_sparklines_selector = lxml.cssselect.CSSSelector(
    "x14|sparkline",
    namespaces=_xlsx_ns_mapping,
)
def _load_sparklines(xlsx_file: str) -> Dict[str, str]:
    """
    Read the sparkline definitions stored in the worksheet part
    "xl/worksheets/sheet1.xml" of an xlsx file.

    Args:
        xlsx_file (str): path to xlsx

    Returns:
        Dict[str, str]: sparkline definitions, keyed by the text of each
          sparkline's <xm:sqref> with the text of its <xm:f> as value, in
          form of
          {
            "F3": "Sheet1!C3:E3"
          }
          The dict is empty if the worksheet holds no sparkline.
    """

    # an xlsx is a zip archive; only the sheet1 part is inspected
    with zipfile.ZipFile(xlsx_file, "r") as archive:
        with archive.open("xl/worksheets/sheet1.xml") as sheet_part:
            sheet_root: _Element = lxml.etree.fromstring(sheet_part.read())
    sparkline_nodes: List[_Element] = _sparklines_selector(sheet_root)

    definitions: Dict[str, str] = {}
    for node in sparkline_nodes:
        # serialise the element again so that xmltodict can turn it into a
        # plain dict whose keys carry the short namespace prefixes
        node_xml: str = lxml.etree.tostring(node, encoding="unicode")
        parsed: Dict[str, Dict[str, str]] = xmltodict.parse(
            node_xml,
            process_namespaces=True,
            namespaces=_xlsx_ns_imapping,
        )
        entry: Dict[str, str] = parsed["x14:sparkline"]
        definitions[entry["xm:sqref"]] = entry["xm:f"]
    return definitions

def compare_with_sparklines(actual: str, expected: str) -> float:
    """
    Compare two xlsx files on both their cell contents and their sparklines.

    Args:
        actual (str): path to the xlsx under test
        expected (str): path to the reference xlsx

    Returns:
        float: 1.0 if the DataFrames loaded by pandas are equal and the
          sparkline definitions are equal, 0.0 otherwise. Both partial
          results are printed.
    """

    contents_match: bool = pd.read_excel(actual).equals(pd.read_excel(expected))
    print("Normal Contents Metric: {:}".format(contents_match))

    sparklines_match: bool = _load_sparklines(actual) == _load_sparklines(expected)
    print("Sparkline Metric: {:}".format(sparklines_match))

    if contents_match and sparklines_match:
        return 1.0
    return 0.0

def _load_charts(xlsx_file: str) -> Dict[str, Any]:
    """
    Summarise the charts found on the active worksheet of an xlsx file.

    Args:
        xlsx_file (str): path to xlsx

    Returns:
        Dict[str, Any]: information of charts. Each key describes one chart
          by joining its series with ";", where a series is rendered as
          "<val.numRef.f>,<val.strRef.f>,<cat.numRef.f>,<cat.strRef.f>" and
          a reference that cannot be reached contributes an empty string.
          Charts with the same key share one entry. The value is an empty
          dict for now.
    """

    def formula_of(data_source: Any, ref_attr: str) -> Any:
        # data_source.<ref_attr>.f if both attributes exist, "" otherwise
        if not hasattr(data_source, ref_attr):
            return ""
        reference = getattr(data_source, ref_attr)
        return reference.f if hasattr(reference, "f") else ""

    active_sheet: Worksheet = openpyxl.load_workbook(filename=xlsx_file).active

    summary: Dict[str, Any] = {}
    for chart in active_sheet._charts:
        descriptors: List[str] = [
            "{:},{:},{:},{:}".format(
                formula_of(ser.val, "numRef"),
                formula_of(ser.val, "strRef"),
                formula_of(ser.cat, "numRef"),
                formula_of(ser.cat, "strRef"),
            )
            for ser in chart.series
        ]

        # TODO: maybe more aspects, like chart type
        summary[";".join(descriptors)] = {}
    return summary

def compare_with_charts(actual: str, expected: str) -> float:
    """
    Compare two xlsx files on both their cell contents and their charts.

    Args:
        actual (str): path to the xlsx under test
        expected (str): path to the reference xlsx

    Returns:
        float: 1.0 if the DataFrames loaded by pandas are equal and the
          chart summaries are equal, 0.0 otherwise. Both partial results
          are printed.
    """

    contents_match: bool = pd.read_excel(actual).equals(pd.read_excel(expected))
    print("Normal Contents Metric: {:}".format(contents_match))

    charts_match: bool = _load_charts(actual) == _load_charts(expected)
    print("Chart Metric: {:}".format(charts_match))

    if contents_match and charts_match:
        return 1.0
    return 0.0

def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float:
    """
    Check the sheets of a workbook against a list of rules.

    Args:
        result (str): path to the xlsx under test
        rules (List[Dict[str, Any]]): rules to check, each in form of either
          {
            "type": "sheet_name",
            "sheet_idx": 0,          # position of the sheet in `result`
            "sheet_name": "Sheet1"   # name that sheet has to carry
          }
          or
          {
            "type": "sheet_data",
            "sheet_idx0": 1,               # int: a sheet of `result`
            "sheet_idx1": "other.xlsx@0"   # "<path>@<idx>": a sheet of another file
          }
          Both "sheet_idx0" and "sheet_idx1" accept either notation.

    Returns:
        float: 1.0 if all the rules hold, 0.0 otherwise. Every rule is
          evaluated and reported even after an earlier one has failed.

    Raises:
        NotImplementedError: if a rule has an unknown "type"
    """

    def load_sheet(workbook: pd.ExcelFile, sheet_ref: Any) -> pd.DataFrame:
        # an int addresses a sheet of the workbook under test
        if isinstance(sheet_ref, int):
            return pd.read_excel(workbook, sheet_ref)
        # otherwise "<path>@<idx>"; split at the last "@" so that the path
        # itself may contain that character
        file_name, sheet_idx = sheet_ref.rsplit("@", maxsplit=1)
        return pd.read_excel(file_name, int(sheet_idx))

    passes = True
    # the context manager releases the file handle, also when a rule raises
    with pd.ExcelFile(result) as workbook:
        worksheet_names: List[str] = workbook.sheet_names

        for r in rules:
            if r["type"] == "sheet_name":
                found_name: str = worksheet_names[r["sheet_idx"]]
                required_name: str = r["sheet_name"]
                metric: bool = found_name == required_name
                print("Assertion: {:d}.{:} is {:} - {:}".format(r["sheet_idx"], required_name, found_name, metric))
                passes = passes and metric
            elif r["type"] == "sheet_data":
                df1: pd.DataFrame = load_sheet(workbook, r["sheet_idx0"])
                df2: pd.DataFrame = load_sheet(workbook, r["sheet_idx1"])
                metric = df1.equals(df2)
                print("Assertion: {:} == {:} - {:}".format(r["sheet_idx0"], r["sheet_idx1"], metric))
                passes = passes and metric
            else:
                raise NotImplementedError("Unimplemented sheet check: {:}".format(r["type"]))

    return float(passes)

def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float:
    """
    Check where the panes of the active worksheet are frozen.

    Args:
        result (str): path to xlsx
        rules (Dict[str, str]): expected freeze position in form of
          {
            "position": "C6"
          }

    Returns:
        float: 1.0 if the active worksheet's freeze_panes equals
          rules["position"], 0.0 otherwise
    """

    workbook: Workbook = openpyxl.load_workbook(filename=result)
    active_sheet: Worksheet = workbook.active
    frozen_at = active_sheet.freeze_panes
    return 1.0 if frozen_at == rules["position"] else 0.0

if __name__ == '__main__':
    # Ad-hoc manual check of the chart metric. The spreadsheets are addressed
    # by hard-coded relative paths, so they are resolved against the current
    # working directory and have to exist there for this to run.
    #
    # The other metrics of this module can be tried in the same way, e.g.
    #   compare_table(xlsx_a, xlsx_b)
    #   compare_with_sparklines(xlsx_a, xlsx_b)
    #   check_xlsx_freeze(xlsx, {"position": "C6"})
    #   check_sheet_list( xlsx
    #                   , [ {"type": "sheet_name", "sheet_idx": 0, "sheet_name": "Sheet1"}
    #                     , {"type": "sheet_data", "sheet_idx0": "other.xlsx@0", "sheet_idx1": 1}
    #                     , {"type": "sheet_name", "sheet_idx": 2, "sheet_name": "Sheet2"}
    #                     ]
    #                   )
    path1 = "../../../../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx"
    path2 = "../../../../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold2.xlsx"
    chart_score = compare_with_charts(path1, path2)
    print(chart_score)