merged from zdy

2024-03-04 15:16:26 +08:00
parent 0eb37d26f3 e98cd6b701
commit 2a6ff83dd6
3 changed files with 111 additions and 31 deletions
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -13,7 +13,7 @@ from odf.text import P
 from odf.text import Span
 from skimage.color import deltaE_ciede2000
 from skimage.color import rgb2lab
-from fuzzywuzzy import fuzz
+from rapidfuzz import fuzz

 logger = logging.getLogger("desktopenv.metric.docs")

--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -5,19 +5,20 @@ import os.path
 # import operator
 from numbers import Number
 from typing import Any, Union, cast, Callable, Iterable
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Set

 import openpyxl
 import pandas as pd
 from openpyxl import Workbook
 from openpyxl.cell.cell import Cell
-# from openpyxl.worksheet.cell_range import MultiCellRange
+from openpyxl.worksheet.cell_range import MultiCellRange
 from openpyxl.worksheet.datavalidation import DataValidation
 from openpyxl.worksheet.worksheet import Worksheet

 from .utils import _match_value_to_rule, _read_cell_style, read_cell_value
 from .utils import load_charts, load_sparklines, load_rows_or_cols, load_xlsx_styles\
                 , load_filters, load_pivot_tables
+from rapidfuzz import fuzz

 # from openpyxl.utils import coordinate_to_tuple

@@ -157,8 +158,8 @@ def compare_table(result: str, expected: str = None, **options) -> float:
                return 0.
            sheet2: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx1"], pdworkbookr, pdworkbooke))

-            sheet1 = sheet1.round()
-            sheet2 = sheet2.round()
+            sheet1 = sheet1.round(error_limit)
+            sheet2 = sheet2.round(error_limit)
            metric: bool = sheet1.equals(sheet2)
            logger.debug("Sheet1: \n%s", str(sheet1))
            logger.debug("Sheet2: \n%s", str(sheet2))
@@ -186,6 +187,61 @@ def compare_table(result: str, expected: str = None, **options) -> float:
            logger.debug("Assertion: %s =p= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
            #  }}} Compare Sheet Data by Printed Value # 

+        elif r["type"] == "sheet_fuzzy":
+            #  Fuzzy Match for Ranges {{{ # 
+            # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1"
+            # sheet_idx1: as sheet_idx0
+            # rules: list of dict, each dict is like
+            #   { "range": ["A1:B6", "C2:E5"],
+            #     "type": "includes" | "includes_by" | "fuzzy_match" | "exact_match", # "includes": cell of sheet_idx0 is a substring of cell of sheet_idx1; "includes_by": the reverse
+            #     "threshold": 85, // for fuzzy match
+            #     "ignore_case": true | false,
+            #     "ignore_chars": " ()", # filtered out
+            #     "trim_leadings": "+ ", # filtered by lstrip
+            #     "trim_trailings": "", # filtered by rstrip
+            #     "normalization": [["Rd", "Road"]], # filtered by replace
+            #   }
+
+            sheet1: Tuple[BOOK, str] = parse_idx(r["sheet_idx0"], result, expected)
+            sheet2: Tuple[BOOK, str] = parse_idx(r["sheet_idx1"], result, expected)
+            total_metric = True
+            for rl in r["rules"]:
+                for rng in MultiCellRange(rl["range"]):
+                    for cdn in rng.cells:
+                        value1: str = str(read_cell_value(*sheet1, cdn))
+                        value2: str = str(read_cell_value(*sheet2, cdn))
+
+                        for rplc in rl.get("normalization", []):
+                            value1 = value1.replace(rplc[0], rplc[1])
+                            value2 = value2.replace(rplc[0], rplc[1])
+                        if "trim_leadings" in rl:
+                            value1 = value1.lstrip(rl["trim_leadings"])
+                            value2 = value2.lstrip(rl["trim_leadings"])
+                        if "trim_trailings" in rl:
+                            value1 = value1.rstrip(rl["trim_trailings"])
+                            value2 = value2.rstrip(rl["trim_trailings"])
+                        if "ignore_chars" in rl:
+                            ignore_chars: Set[str] = set(rl["ignore_chars"])
+                            value1 = "".join(filter(lambda ch: ch not in ignore_chars, value1))
+                            value2 = "".join(filter(lambda ch: ch not in ignore_chars, value2))
+                        if rl.get("ignore_case", False):
+                            value1 = value1.lower()
+                            value2 = value2.lower()
+
+                        if rl["type"]=="includes":
+                            metric: bool = value1 in value2
+                        if rl["type"]=="includes_by":
+                            metric: bool = value2 in value1
+                        if rl["type"]=="fuzzy_match":
+                            metric: bool = fuzz.ratio(value1, value2) >= rl.get("threshold", 85.)
+                        if rl["type"]=="exact_match":
+                            metric: bool = value1==value2
+                        total_metric = total_metric and metric
+
+            metric: bool = total_metric
+            logger.debug("Assertion: %s =~= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
+            #  }}} Fuzzy Match for Ranges # 
+
        elif r["type"] == "sparkline":
            #  Compare Sparklines {{{ # 
            # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1"
--- a/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json
+++ b/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json
@@ -53,51 +53,75 @@
  ],
  "evaluator": {
    "postconfig": [
+      {
+        "type": "activate_window",
+        "parameters": {
+          "window_name": "MUST_VISIT.xlsx - LibreOffice Calc",
+          "strict": true
+        }
+      },
+      {
+        "type": "sleep",
+        "parameters": {
+          "seconds": 0.5
+        }
+      },
      {
        "type": "execute",
        "parameters": {
          "command": [
-            "libreoffice",
-            "--convert-to",
-            "csv:Text - txt - csv (StarCalc):44,34,UTF-8,,,,false,true,true,false,false,1",
-            "--outdir",
-            "/home/user/Desktop",
-            "/home/user/Desktop/MUST_VISIT.xlsx"
+            "python",
+            "-c",
+            "import pyautogui; pyautogui.hotkey(\"ctrl\", \"s\");"
          ]
        }
+      },
+      {
+        "type": "sleep",
+        "parameters": {
+          "seconds": 1.0
+        }
      }
    ],
    "func": "compare_table",
    "result": {
      "type": "vm_file",
-      "path": [
-        "/home/user/Desktop/MUST_VISIT.xlsx",
-        "/home/user/Desktop/MUST_VISIT-Sheet1.csv"
-      ],
-      "dest": [
-        "MUST_VISIT.xlsx",
-        "MUST_VISIT-Sheet1.csv"
-      ],
-      "multi": true
+      "path": "/home/user/Desktop/MUST_VISIT.xlsx",
+      "dest": "MUST_VISIT.xlsx"
    },
    "expected": {
      "type": "cloud_file",
-      "path": [
-        "https://drive.google.com/uc?id=1MV6jBvRbbYwPqeFTd_nX40xzyltNhphl&export=download",
-        "https://drive.google.com/uc?id=1CGoRQDLw9-Ai7daq3qCz0o9kYSZB2WNn&export=download"
-      ],
-      "dest": [
-        "MUST_VISIT-gt.xlsx",
-        "MUST_VISIT-gt-Sheet1.csv"
-      ],
-      "multi": true
+      "path": "https://drive.google.com/uc?id=1MV6jBvRbbYwPqeFTd_nX40xzyltNhphl&export=download",
+      "dest": "MUST_VISIT-gt.xlsx"
    },
    "options": {
      "rules": [
        {
-          "type": "sheet_print",
+          "type": "sheet_fuzzy",
          "sheet_idx0": "RNSheet1",
-          "sheet_idx1": "ENSheet1"
+          "sheet_idx1": "ENSheet1",
+          "rules": [
+            {
+              "range": ["A1:A6", "D1:D6"],
+              "type": "exact_match"
+            },
+            {
+              "range": ["B1:B6"],
+              "type": "fuzzy_match",
+              "threshold": 85,
+              "normalization": [
+                ["Rd", "Road"],
+                ["St", "Street"]
+              ],
+              "ignore_case": true
+            },
+            {
+              "range": ["C1:C6"],
+              "type": "includes",
+              "trim_leadings": "+ ",
+              "ignore_chars": " ()-"
+            }
+          ]
        }
      ]
    }