sci-gui-agent-benchmark/desktop_env/evaluators/metrics/table.py

import logging
import operator
from numbers import Number
from typing import Any, Union
from typing import Dict, List
import os.path
import itertools

import openpyxl
import pandas as pd
from openpyxl import Workbook
from openpyxl.worksheet.worksheet import Worksheet
#from openpyxl.worksheet.cell_range import MultiCellRange
from openpyxl.worksheet.datavalidation import DataValidation

from .utils import load_charts, load_sparklines, _match_value_to_rule

logger = logging.getLogger("desktopenv.metric.table")


def _read_shown_lines(path: str, ignore_case: bool) -> List[str]:
    """Read a CSV dump as stripped lines with trailing blank lines removed."""
    with open(path) as f:
        lines: List[str] = [line.strip() for line in f.read().splitlines()]
    # Blank rows at the end of the dump carry no content and must not make
    # two otherwise identical tables differ.
    while lines and not lines[-1]:
        lines.pop()
    if ignore_case:
        lines = [line.lower() for line in lines]
    return lines


def _number_formats(workbook: "Workbook") -> List[str]:
    """Collect the lower-cased number formats of all numeric cells of the active sheet."""
    return [
        cell.number_format.lower()
        for column in workbook.active.iter_cols()
        for cell in column
        if cell.data_type == "n"
    ]


def compare_table(actual: str, expected: str, **options) -> float:
    """
    Compare a result workbook against a gold workbook.

    Args:
        actual (str): path to result xlsx
        expected (str): path to gold xlsx
        options (Dict[str, Any]): dict like
          {
            "features": list of str for other features, supports:
                * sparkline
                * chart
                * number_format
            "chart_props": list of str, giving the concerned chart properties
              (the whole options dict is forwarded to `load_charts`)
            "as_shown": bool, if true the content is compared through the
              ".csv" files that share the stem of `actual` / `expected`
              (line-wise, stripped, trailing blank lines ignored) instead of
              through `pd.read_excel`
            "ignore_case": bool, lower-case the CSV lines before comparing;
              only consulted when "as_shown" is true
          }

    Return:
        float: 1. if the content and every requested feature match, else 0.
          0. is also returned when `actual` is None.

    Raises:
        NotImplementedError: if "features" contains an unsupported name
    """

    if actual is None:
        return 0.

    if options.get("as_shown", False):
        ignore_case: bool = bool(options.get("ignore_case", False))
        expected_lines: List[str] = _read_shown_lines(
            os.path.splitext(expected)[0] + ".csv", ignore_case
        )
        actual_lines: List[str] = _read_shown_lines(
            os.path.splitext(actual)[0] + ".csv", ignore_case
        )
        metric: bool = expected_lines == actual_lines
        logger.debug("Content Metric just as shown: %s", metric)
    else:
        df1 = pd.read_excel(expected)
        df2 = pd.read_excel(actual)
        metric = df1.equals(df2)
        logger.debug("Normal Content Metric: %s", metric)

    features: List[str] = options.get("features", [])
    if features:
        # The workbooks do not change between features: parse them once
        # instead of once per requested feature.
        workbook1: Workbook = openpyxl.load_workbook(actual)
        workbook2: Workbook = openpyxl.load_workbook(expected)

    for ftr in features:
        if ftr == "sparkline":
            new_metric = load_sparklines(actual) == load_sparklines(expected)
            logger.debug("Sparkline Metric: %s", new_metric)
        elif ftr == "chart":
            charts1 = load_charts(workbook1, **options)
            charts2 = load_charts(workbook2, **options)
            new_metric = charts1 == charts2
            logger.debug("Chart Metric: %s", new_metric)
        elif ftr == "number_format":
            new_metric = _number_formats(workbook1) == _number_formats(workbook2)
            logger.debug("Number Format Metric: %s", new_metric)
        else:
            raise NotImplementedError("Unsupported xlsx feature: {:}".format(ftr))
        metric = metric and new_metric

    return float(metric)


def _load_sheet(workbook: "pd.ExcelFile", sheet_ref: Union[int, str]) -> pd.DataFrame:
    """Load a sheet given an int index into `workbook` or a "<path>@<index>" reference."""
    if isinstance(sheet_ref, int):
        return pd.read_excel(workbook, sheet_ref)
    # Split on the last "@" only, so the file path itself may contain "@".
    file_name, sheet_idx = sheet_ref.rsplit("@", maxsplit=1)
    return pd.read_excel(file_name, int(sheet_idx))


def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float:
    """
    Check the sheets of a workbook against a list of rules.

    Args:
        result (str): path to the concerned xlsx file
        rules (List[Dict[str, Any]]): list of dict, each being one of
          {
            "type": "sheet_name"
            "sheet_idx": int, index of the sheet in `result`
            "sheet_name": str, the name that sheet is required to have
          }
          {
            "type": "sheet_data"
            "sheet_idx0": int or str
            "sheet_idx1": int or str
          }
          where an int is a sheet index in `result` and a str has the form
          "<path to another xlsx>@<sheet index>"; the two referenced sheets
          are required to be equal according to `DataFrame.equals`

    Returns:
        float: 1. if every rule holds, else 0.; 0. if `result` is None

    Raises:
        NotImplementedError: if a rule has an unknown "type"
    """

    if result is None:
        return 0.

    passes = True
    # The context manager releases the file handle that ExcelFile keeps open.
    with pd.ExcelFile(result) as workbook:
        worksheet_names: List[str] = workbook.sheet_names

        for r in rules:
            if r["type"] == "sheet_name":
                expected_name: str = r["sheet_name"]
                try:
                    actual_name = worksheet_names[r["sheet_idx"]]
                except IndexError:
                    # The workbook has no sheet at that position, so the
                    # rule cannot hold; fail it instead of crashing.
                    actual_name = None
                metric: bool = actual_name == expected_name
                logger.debug("Assertion: %d.%s is %s - %s", r["sheet_idx"], expected_name, actual_name, metric)
                passes = passes and metric
            elif r["type"] == "sheet_data":
                df1: pd.DataFrame = _load_sheet(workbook, r["sheet_idx0"])
                df2: pd.DataFrame = _load_sheet(workbook, r["sheet_idx1"])
                metric = df1.equals(df2)
                logger.debug("Assertion: %s == %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
                passes = passes and metric
            else:
                raise NotImplementedError("Unimplemented sheet check: {:}".format(r["type"]))

    return float(passes)


def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float:
    """
    Check where the panes of the active sheet are frozen.

    Args:
        result (str): path to the concerned xlsx file
        rules (Dict[str, str]): dict like
          {
            "position": value the sheet's `freeze_panes` is compared to, e.g. "C6"
          }

    Returns:
        float: 1. if `freeze_panes` of the active sheet equals
          rules["position"], else 0.; 0. if `result` is None
    """

    if result is None:
        return 0.

    workbook: Workbook = openpyxl.load_workbook(filename=result)
    frozen_at = workbook.active.freeze_panes
    is_expected: bool = frozen_at == rules["position"]
    return float(is_expected)


def check_xlsx_zoom(result: str, rules: Dict[str, Union[str, Number]]) -> float:
    """
    Check the zoom scale of the active sheet against a reference value.

    Args:
        result (str): path to the concerned xlsx file
        rules (Dict[str, Union[str, Number]]): dict like
          {
            "relation": str, name of a function of the `operator` module, e.g. "lt"
            "ref_value": the value the zoom scale is compared to
          }

    Returns:
        float: the result of `operator.<relation>(zoom_scale, ref_value)`
          as float; 0. if `result` is None
    """

    if result is None:
        return 0.

    sheet_view = openpyxl.load_workbook(filename=result).active.sheet_view
    zoom_scale: Number = sheet_view.zoomScale
    if not zoom_scale:
        # No (or a zero) zoom scale stored in the sheet view: fall back to 100.
        zoom_scale = 100.

    compare = getattr(operator, rules["relation"])
    return float(compare(zoom_scale, rules["ref_value"]))

def check_data_validations(result: str, rules: List[Dict[str, Dict[str, Any]]]) -> float:
    """
    Check that every data validation of the active sheet satisfies one of the rules.

    Args:
        result (str): path to the concerned xlsx file
        rules (List[Dict[str, Dict[str, Any]]]): list of dict like
          {
            <str as attribute>: {
                "method": str
                "ref": something
            }
          }
          Each attribute value of a data validation is handed to
          `_match_value_to_rule` together with the corresponding
          {"method", "ref"} dict. A data validation satisfies a rule if all
          attributes listed in the rule match.
          Available attributes:
          * ranges
          * type
          * formula1
          * formula2
          * operator
          * allowBlank
          * showDropDown
          * showInputMessage
          * showErrorMessage
          * error
          * errorTitle
          * errorStyle
          * prompt
          * promptTitle
          * imeMode

    Returns:
        float: 1. if every data validation of the active sheet satisfies at
          least one rule, else 0. 0. is also returned if `result` is None,
          or if rules are given but the sheet has no data validation at all.
    """

    # Same convention as the other checkers in this module: a missing result
    # file is a failed check, not a crash.
    if result is None:
        return 0.

    workbook: Workbook = openpyxl.load_workbook(result)
    worksheet: Worksheet = workbook.active
    data_validators: List[DataValidation] = worksheet.data_validations.dataValidation

    # Without this guard a sheet with no data validation would pass
    # vacuously, whatever the rules require.
    if rules and not data_validators:
        return 0.

    total_metric: bool = all(
        any(
            all(
                _match_value_to_rule(getattr(dat_vldt, attrbt), mr)
                for attrbt, mr in r.items()
            )
            for r in rules
        )
        for dat_vldt in data_validators
    )
    return float(total_metric)

if __name__ == '__main__':
    # Manual smoke check of `check_data_validations` against a local gold
    # file. The path is relative to the developer's working directory and is
    # not expected to exist elsewhere.
    path = "../../任务数据/LibreOffice Calc/Order_Id_Mark_Pass_Fail_gold.xlsx"

    # One rule: every data validation of the active sheet has to match all
    # three attribute checks below. How "method" / "ref" are interpreted is
    # decided by `_match_value_to_rule`.
    validation_rules = [
        {
            "ranges": {
                "method": "spreadsheet_range",
                "ref": ["D2:D29", "D2:D1048576"],
            },
            "type": {
                "method": "eq",
                "ref": "list",
            },
            "formula1": {
                "method": "str_set_eq",
                "ref": ["Pass", "Fail", "Held"],
            },
        },
    ]

    print(check_data_validations(path, validation_rules))