ver Mar4th

updated range-wise fuzzy match mode for compare_table
2024-03-04 15:08:41 +08:00
parent dab7f196c4
commit e98cd6b701
3 changed files with 111 additions and 31 deletions
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -13,7 +13,7 @@ from odf.text import P
 from odf.text import Span
 from skimage.color import deltaE_ciede2000
 from skimage.color import rgb2lab
-from fuzzywuzzy import fuzz
+from rapidfuzz import fuzz

 logger = logging.getLogger("desktopenv.metric.docs")

--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -5,19 +5,20 @@ import os.path
 # import operator
 from numbers import Number
 from typing import Any, Union, cast, Callable, Iterable
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Set

 import openpyxl
 import pandas as pd
 from openpyxl import Workbook
 from openpyxl.cell.cell import Cell
-# from openpyxl.worksheet.cell_range import MultiCellRange
+from openpyxl.worksheet.cell_range import MultiCellRange
 from openpyxl.worksheet.datavalidation import DataValidation
 from openpyxl.worksheet.worksheet import Worksheet

 from .utils import _match_value_to_rule, _read_cell_style, read_cell_value
 from .utils import load_charts, load_sparklines, load_rows_or_cols, load_xlsx_styles\
                 , load_filters, load_pivot_tables
+from rapidfuzz import fuzz

 # from openpyxl.utils import coordinate_to_tuple

@@ -157,8 +158,8 @@ def compare_table(result: str, expected: str = None, **options) -> float:
                return 0.
            sheet2: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx1"], pdworkbookr, pdworkbooke))

-            sheet1 = sheet1.round()
-            sheet2 = sheet2.round()
+            sheet1 = sheet1.round(error_limit)
+            sheet2 = sheet2.round(error_limit)
            metric: bool = sheet1.equals(sheet2)
            logger.debug("Sheet1: \n%s", str(sheet1))
            logger.debug("Sheet2: \n%s", str(sheet2))
@@ -186,6 +187,61 @@ def compare_table(result: str, expected: str = None, **options) -> float:
            logger.debug("Assertion: %s =p= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
            #  }}} Compare Sheet Data by Printed Value # 

+        elif r["type"] == "sheet_fuzzy":
+            #  Fuzzy Match for Ranges {{{ # 
+            # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1"
+            # sheet_idx1: as sheet_idx0
+            # rules: list of dict, each dict is like
+            #   { "range": ["A1:B6", "C2:E5"],
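+            #     "type": "includes" | "includes_by" | "fuzzy_match" | "exact_match", # includes: the sheet_idx0 value is a substring of the sheet_idx1 value; includes_by: the reverse
+            #     "threshold": 85, # minimum fuzz.ratio score for "fuzzy_match"; defaults to 85
+            #     "ignore_case": true | false, # lowercase both values before comparing; defaults to false
+            #     "ignore_chars": " ()", # every listed character is removed from both values
+            #     "trim_leadings": "+ ", # listed characters are stripped from the start of both values (lstrip)
+            #     "trim_trailings": "", # listed characters are stripped from the end of both values (rstrip)
+            #     "normalization": [["Rd", "Road"]], # [old, new] pairs applied with str.replace before trimming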
+            #   }
+
+            sheet1: Tuple[BOOK, str] = parse_idx(r["sheet_idx0"], result, expected)
+            sheet2: Tuple[BOOK, str] = parse_idx(r["sheet_idx1"], result, expected)
+            total_metric = True
+            for rl in r["rules"]:
+                for rng in MultiCellRange(rl["range"]):
+                    for cdn in rng.cells:
+                        value1: str = str(read_cell_value(*sheet1, cdn))
+                        value2: str = str(read_cell_value(*sheet2, cdn))
+
+                        for rplc in rl.get("normalization", []):
+                            value1 = value1.replace(rplc[0], rplc[1])
+                            value2 = value2.replace(rplc[0], rplc[1])
+                        if "trim_leadings" in rl:
+                            value1 = value1.lstrip(rl["trim_leadings"])
+                            value2 = value2.lstrip(rl["trim_leadings"])
+                        if "trim_trailings" in rl:
+                            value1 = value1.rstrip(rl["trim_trailings"])
+                            value2 = value2.rstrip(rl["trim_trailings"])
+                        if "ignore_chars" in rl:
+                            ignore_chars: Set[str] = set(rl["ignore_chars"])
+                            value1 = "".join(filter(lambda ch: ch not in ignore_chars, value1))
+                            value2 = "".join(filter(lambda ch: ch not in ignore_chars, value2))
+                        if rl.get("ignore_case", False):
+                            value1 = value1.lower()
+                            value2 = value2.lower()
+
+                        if rl["type"]=="includes":
+                            metric: bool = value1 in value2
+                        if rl["type"]=="includes_by":
+                            metric: bool = value2 in value1
+                        if rl["type"]=="fuzzy_match":
+                            metric: bool = fuzz.ratio(value1, value2) >= rl.get("threshold", 85.)
+                        if rl["type"]=="exact_match":
+                            metric: bool = value1==value2
+                        total_metric = total_metric and metric
+
+            metric: bool = total_metric
+            logger.debug("Assertion: %s =~= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
+            #  }}} Fuzzy Match for Ranges # 
+
        elif r["type"] == "sparkline":
            #  Compare Sparklines {{{ # 
            # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1"