Merge remote-tracking branch 'upstream/main' into fix_chrome

2025-07-08 10:45:57 +00:00
parent 5778078596 b6d9a804fa
commit 349c31fa55
93 changed files with 1158 additions and 495 deletions
--- a/desktop_env/evaluators/metrics/chrome.py
+++ b/desktop_env/evaluators/metrics/chrome.py
@@ -2,6 +2,7 @@ import logging
 import os
 import re
 import shutil
+import io
 from itertools import product
 from typing import Any, Dict, List, Union

@@ -200,6 +201,7 @@ import fitz
 from PIL import Image
 from borb.pdf import Document
 from borb.pdf import PDF
+import imagehash

 from pathlib import Path
 import typing
@@ -208,6 +210,9 @@ import typing
 def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
    if not pdf1_path or not pdf2_path:
        return 0.
+    if not all(map(os.path.exists, [pdf1_path, pdf2_path])):
+        logger.warning(f"PDF file does not exist: {pdf1_path} or {pdf2_path}")
+        return 0.

    def extract_images_from_pdf(pdf_path):
        pdf_document = fitz.open(pdf_path)
@@ -215,35 +220,61 @@ def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:

        for page_number in range(pdf_document.page_count):
            page = pdf_document[page_number]
-            pixmap = page.get_pixmap()
-
-            img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
-
-            images.append(img)
+            for img_index, img in enumerate(page.get_images(full=True)):
+                xref = img[0]
+                base_image = pdf_document.extract_image(xref)
+                image_bytes = base_image["image"]
+                
+                # convert to PIL Image
+                try:
+                    pil_image = Image.open(io.BytesIO(image_bytes))
+                    images.append(pil_image)
+                except Exception as e:
+                    logger.error(f"Failed to process image in {pdf_path} on page {page_number}: {e}")

        return images
+    
+    temp_dir = Path(pdf1_path).parent / "temp_pdf_comparison"
+    os.makedirs(temp_dir, exist_ok=True)
+    
+    temp_pdf1 = temp_dir / Path(pdf1_path).name
+    temp_pdf2 = temp_dir / Path(pdf2_path).name

-    def fix_pdf(in_path: Path, out_path: Path) -> None:
-        doc: typing.Optional[Document] = None
-        with open(in_path, "rb") as fh:
-            doc = PDF.loads(fh)
-        with open(out_path, "wb") as fh:
-            PDF.dumps(fh, doc)
+    shutil.copy(pdf1_path, temp_pdf1)
+    shutil.copy(pdf2_path, temp_pdf2)

-    fix_pdf(Path(pdf1_path), Path(pdf1_path))
-    fix_pdf(Path(pdf2_path), Path(pdf2_path))
+    try:
+        images1 = extract_images_from_pdf(str(temp_pdf1))
+        images2 = extract_images_from_pdf(str(temp_pdf2))
+    except Exception as e:
+        logger.error(f"Error extracting images from PDFs: {e}")
+        shutil.rmtree(temp_dir)
+        return 0.
+    finally:
+        shutil.rmtree(temp_dir)

-    images1 = extract_images_from_pdf(pdf1_path)
-    images2 = extract_images_from_pdf(pdf2_path)

    if len(images1) != len(images2):
+        logger.info(f"Different number of images found. Gold: {len(images1)}, Pred: {len(images2)}")
        return 0.

-    for img1, img2 in zip(images1, images2):
-        if img1.tobytes() != img2.tobytes():
-            return 0.
+    if not images1:
+        logger.info("No images found in either PDF. Considering it a match.")
+        return 1.0

-    return 1.
+    hash_threshold = 5 
+    total_score = 0
+    for i, (img1, img2) in enumerate(zip(images1, images2)):
+        hash1 = imagehash.phash(img1)
+        hash2 = imagehash.phash(img2)
+        hash_diff = hash1 - hash2
+        
+        logger.info(f"Image {i+1}: Gold hash: {hash1}, Pred hash: {hash2}, Hash difference: {hash_diff}")
+
+        if hash_diff <= hash_threshold:
+            total_score +=1
+    
+    return total_score / len(images1)


 def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -86,6 +86,7 @@ def compare_docx_files(file1, file2, **options):
    ignore_case = options.get('ignore_case', False)
    ignore_order = options.get('ignore_order', False)
    content_only = options.get('content_only', False)
+    fuzzy_match = options.get('fuzzy_match', False)
    delete_empty_lines = options.get('delete_empty_lines', False)

    if not file1 or not file2:
@@ -158,29 +159,48 @@ def compare_docx_files(file1, file2, **options):
        text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
        if ignore_case:
            text1, text2 = text1.lower(), text2.lower()
-        if text1 != text2:
-            return 0
+
+        if fuzzy_match:
+            similarity = fuzz.ratio(text1, text2) / 100.0
+            return similarity
+        else:
+            if text1 != text2:
+                return 0
    else:
-        print("ignore_blanks=false")
        if len(doc1_paragraphs) != len(doc2_paragraphs):
            print(doc1_paragraphs)
            print(doc2_paragraphs)
            print(len(doc1_paragraphs))
            print(len(doc2_paragraphs))
            return 0
-        print("in compare")
-        # Compare each paragraph
-        for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
-            if ignore_case:
-                p1, p2 = p1.lower(), p2.lower()
-            if p1 != p2:
-                # show the difference
-                print("=== First Paragraph ===")
-                print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
-                print("=== Second Paragraph ===") 
-                print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
-                print("=" * 50)  # Clear boundary
-                return 0
+        
+        if fuzzy_match:
+            total_similarity = 0
+            if not doc1_paragraphs:
+                return 1.0
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                total_similarity += fuzz.ratio(p1, p2) / 100.0
+            
+            if len(doc1_paragraphs) == 0:
+                return 1.0 if len(doc2_paragraphs) == 0 else 0.0
+
+            avg_similarity = total_similarity / len(doc1_paragraphs)
+            return avg_similarity
+        else:
+            # Compare each paragraph
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                if p1 != p2:
+                    # show the difference
+                    print("=== First Paragraph ===")
+                    print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
+                    print("=== Second Paragraph ===")
+                    print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
+                    print("=" * 50)  # Clear boundary
+                    return 0

    return 1

--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -178,18 +178,43 @@ def check_list(result: str, rules: Dict[str, List[str]]) -> float:
    return float(all(expect_metrics) and unexpect_metric)


-_accessibility_ns_map = {"st": "uri:deskat:state.at-spi.gnome.org"
-    , "attr": "uri:deskat:attributes.at-spi.gnome.org"
-    , "cp": "uri:deskat:component.at-spi.gnome.org"
-    , "doc": "uri:deskat:document.at-spi.gnome.org"
-    , "docattr": "uri:deskat:attributes.document.at-spi.gnome.org"
-    , "txt": "uri:deskat:text.at-spi.gnome.org"
-    , "val": "uri:deskat:value.at-spi.gnome.org"
-    , "act": "uri:deskat:action.at-spi.gnome.org"
-                         }
+_accessibility_ns_map = {
+    "ubuntu": {
+        "st": "https://accessibility.ubuntu.example.org/ns/state",
+        "attr": "https://accessibility.ubuntu.example.org/ns/attributes",
+        "cp": "https://accessibility.ubuntu.example.org/ns/component",
+        "doc": "https://accessibility.ubuntu.example.org/ns/document",
+        "docattr": "https://accessibility.ubuntu.example.org/ns/document/attributes",
+        "txt": "https://accessibility.ubuntu.example.org/ns/text",
+        "val": "https://accessibility.ubuntu.example.org/ns/value",
+        "act": "https://accessibility.ubuntu.example.org/ns/action",
+    },
+    "windows": {
+        "st": "https://accessibility.windows.example.org/ns/state",
+        "attr": "https://accessibility.windows.example.org/ns/attributes",
+        "cp": "https://accessibility.windows.example.org/ns/component",
+        "doc": "https://accessibility.windows.example.org/ns/document",
+        "docattr": "https://accessibility.windows.example.org/ns/document/attributes",
+        "txt": "https://accessibility.windows.example.org/ns/text",
+        "val": "https://accessibility.windows.example.org/ns/value",
+        "act": "https://accessibility.windows.example.org/ns/action",
+        "class": "https://accessibility.windows.example.org/ns/class"
+    },
+    "macos": {
+        "st": "https://accessibility.macos.example.org/ns/state",
+        "attr": "https://accessibility.macos.example.org/ns/attributes",
+        "cp": "https://accessibility.macos.example.org/ns/component",
+        "doc": "https://accessibility.macos.example.org/ns/document",
+        "txt": "https://accessibility.macos.example.org/ns/text",
+        "val": "https://accessibility.macos.example.org/ns/value",
+        "act": "https://accessibility.macos.example.org/ns/action",
+        "role": "https://accessibility.macos.example.org/ns/role",
+    }
+
+}


-def check_accessibility_tree(result: str, rules: List[Dict[str, Any]]) -> float:
+def check_accessibility_tree(result: str, rules: List[Dict[str, Any]], osname: str = "ubuntu") -> float:
    """
    Args:
        result (str): XML of GNOME Accessibility Tree
@@ -205,18 +230,21 @@ def check_accessibility_tree(result: str, rules: List[Dict[str, Any]]) -> float:
            "exact": bool specifying whether exact match or fuzzy match should
              be performed. defaults to True.
          }
+        osname (str): "ubuntu" | "windows" | "macos". "ubuntu" by default.

    Returns:
        float
    """

+    a11y_ns_map = _accessibility_ns_map[osname]
+
    at: _Element = lxml.etree.fromstring(result)
    total_match_score = 1.
    for r in rules:
        if "xpath" in r:
-            elements: List[_Element] = at.xpath(r["xpath"], namespaces=_accessibility_ns_map)
+            elements: List[_Element] = at.xpath(r["xpath"], namespaces=a11y_ns_map)
        elif "selectors" in r:
-            selector = CSSSelector(", ".join(r["selectors"]), namespaces=_accessibility_ns_map)
+            selector = CSSSelector(", ".join(r["selectors"]), namespaces=a11y_ns_map)
            elements: List[_Element] = selector(at)
        else:
            raise ValueError("At least one of xpath and selectors is required")
@@ -307,6 +335,9 @@ def check_direct_json_object(result, rules) -> float:
    One of the most commonly used functions for evaluation.
    Compare two json objects directly.
    """
+    logger.info(f"[DEBUG] check_direct_json_object called with result: {result}")
+    logger.info(f"[DEBUG] check_direct_json_object called with rules: {rules}")
+    
    if isinstance(result, str):
        # remove blanks before and after result
        result = result.strip()
@@ -314,45 +345,84 @@ def check_direct_json_object(result, rules) -> float:
        result = result.replace("'", '"')
        # load json object
        result = json.loads(result)
+        
+    logger.info(f"[DEBUG] Processed result: {result}")
+    
    if result is None:
+        logger.info("[DEBUG] Result is None, returning 0.0")
+        return 0.
+    
+    # Check if expected value contains evaluation failure indicator
+    try:
+        expected_json = rules.get("expected", {})
+        if expected_json:
+            for key, value in expected_json.items():
+                if value == "__EVALUATION_FAILED__":
+                    logger.error(f"[DEBUG] Expected value for key '{key}' indicates evaluation failure, returning 0.0")
+                    return 0.
+    except Exception as e:
+        logger.error(f"[DEBUG] Error checking for evaluation failure indicator: {e}")
        return 0.
    try:
        expect_in_result = rules.get("expect_in_result", False)
+        logger.info(f"[DEBUG] expect_in_result: {expect_in_result}")
+        
        if not expect_in_result:
            expected_json = rules["expected"]
+            logger.info(f"[DEBUG] Expected JSON: {expected_json}")
+            
            for key in expected_json.keys():
                expected_value = expected_json.get(key)
+                actual_value = result.get(key)
+                logger.info(f"[DEBUG] Checking key '{key}': expected='{expected_value}', actual='{actual_value}'")
+                
                if expected_json.get("ignore_list_order", False):
                    expected_value = sorted(expected_value)
                    result_value = sorted(result.get(key))
+                    logger.info(f"[DEBUG] Comparing lists (sorted): expected={expected_value}, actual={result_value}")
                    if expected_value != result_value:
+                        logger.info(f"[DEBUG] List comparison failed for key '{key}', returning 0.0")
                        return 0.
                else:
-                    if expected_value != result.get(key):
+                    if expected_value != actual_value:
+                        logger.info(f"[DEBUG] Value comparison failed for key '{key}': expected='{expected_value}', actual='{actual_value}', returning 0.0")
                        return 0.
+                    else:
+                        logger.info(f"[DEBUG] Value comparison passed for key '{key}'")
+                        
+            logger.info("[DEBUG] All comparisons passed, returning 1.0")
            return 1.0
        else:
            expected_json = rules["expected"]
+            logger.info(f"[DEBUG] Expected JSON (expect_in_result mode): {expected_json}")

            for key in expected_json.keys():
                if isinstance(expected_json.get(key), list):
                    flag = 0
                    expected_value_list = expected_json.get(key)
+                    logger.info(f"[DEBUG] Checking list key '{key}': expected_list={expected_value_list}, actual='{result.get(key)}'")
                    for each_expected_value in expected_value_list:
                        if isinstance(result.get(key), list) and each_expected_value in result.get(key):
                            flag = 1
+                            logger.info(f"[DEBUG] Found expected value '{each_expected_value}' in result list for key '{key}'")
                            break
                    if flag == 0:
+                        logger.info(f"[DEBUG] No expected values found in result list for key '{key}', returning 0.0")
                        return 0.
                elif isinstance(expected_json.get(key), str):
-                    if expected_json.get(key) not in result.get(key):
+                    expected_str = expected_json.get(key)
+                    actual_str = result.get(key)
+                    logger.info(f"[DEBUG] Checking string key '{key}': expected='{expected_str}', actual='{actual_str}'")
+                    if expected_str not in actual_str:
+                        logger.info(f"[DEBUG] Expected string '{expected_str}' not found in actual string '{actual_str}' for key '{key}', returning 0.0")
                        return 0.
                else:
                    logger.debug("check_direct_json_object: expected value type not supported")
                    return 0.
+            logger.info("[DEBUG] All expect_in_result comparisons passed, returning 1.0")
            return 1.0
-    except:
-        logger.debug("check_direct_json_object: result is not a valid json object")
+    except Exception as e:
+        logger.debug(f"check_direct_json_object: result is not a valid json object, error: {e}")
        return 0.


@@ -361,7 +431,7 @@ def compare_time_in_speedtest_results(speedtest_result_path, time_diff):
        return 0

    # open the speedtest results file(csv)
-    date_col = None
+    #date_col = None
    try:
        with open(speedtest_result_path, 'r') as f:
            for i, line in enumerate(f):
@@ -476,37 +546,66 @@ def compare_terminal_and_txt(txt_file_path, terminal_output):

 def compare_python_pure_text(py_file_path, gold_file_path):
    if not py_file_path or not gold_file_path:
-        return 0
+        return 0.0

-    # first, change the suffix of gold_file from .txt to .py
-    print("py_file_path: ")
-    print(py_file_path)
-    print("gold_file_path: ")
-    print(gold_file_path)
+    def _normalize(text):
+        """
+        Minimal normalization - only handle basic formatting:
+        - Skip leading shebang / encoding-declaration lines (first 3 lines only)
+        - Expand tabs to 4 spaces and strip trailing whitespace
+        - Remove empty (whitespace-only) lines
+        
+        This preserves any content that shouldn't be there (like markdown)
+        so it can be detected as an error.
+        """
+        lines = text.splitlines()
+        result_lines = []
+        i = 0
+        
+        # Only skip obvious metadata at the very beginning
+        while i < len(lines) and i < 3:  # Check only first 3 lines
+            stripped = lines[i].strip()
+            
+            if (stripped.startswith('#!') or
+                stripped.startswith('# -*- coding:') or
+                stripped.startswith('# coding:') or
+                stripped.startswith('# coding=')):
+                i += 1
+                continue
+            
+            break
+        
+        # Process all remaining lines with minimal filtering
+        while i < len(lines):
+            line = lines[i]
+            stripped = line.strip()
+            
+            if stripped:  # Keep all non-empty lines
+                normalized = line.expandtabs(4).rstrip()
+                result_lines.append(normalized)
+            
+            i += 1
+        
+        return '\n'.join(result_lines)

-    # gold_file_path = gold_file_path.replace('.txt', '.py')
-    def remove_whitespace(text):
-        return ''.join(text.split())
-
-    with open(py_file_path, 'r') as file1:
-        content1 = file1.read()
-    with open(gold_file_path, 'r') as file2:
-        content2 = file2.read()
-    content1_no_whitespace = remove_whitespace(content1)
-    content2_no_whitespace = remove_whitespace(content2)
-    if content1_no_whitespace == content2_no_whitespace:
-        return 1
-    else:
-        return 0
-
-if __name__ == '__main__':
-    print(check_direct_json_object([], rules={
-                "relativeTime": {
-                  "from": "5th next month"
-                },
-                "expected": {
-                    "start": "SEA",
-                    "end": "NYC",
-                    "time": "{DoW}, {Month} {DayD}, {Year}",
-                    "category": "Miles"
-                }}))
+    try:
+        with open(py_file_path, 'r', encoding='utf-8') as file1:
+            user_content = file1.read()
+        with open(gold_file_path, 'r', encoding='utf-8') as file2:
+            gold_content = file2.read()
+        
+        # Apply the same normalization to both files before comparing
+        user_normalized = _normalize(user_content)
+        gold_normalized = _normalize(gold_content)
+        
+        if user_normalized == gold_normalized:
+            return 1.0
+        else:
+            return 0.0
+            
+    except (FileNotFoundError, IOError, UnicodeDecodeError) as e:
+        logger.debug(f"compare_python_pure_text: Error reading files - {e}")
+        return 0.0
+    except Exception as e:
+        logger.debug(f"compare_python_pure_text: Unexpected error - {e}")
+        return 0.0
--- a/desktop_env/evaluators/metrics/gimp.py
+++ b/desktop_env/evaluators/metrics/gimp.py
@@ -1,4 +1,5 @@
 import os
+import logging
 from typing import List, Union
 from skimage.metrics import structural_similarity as ssim
 from PIL import Image, ImageChops, ImageStat
@@ -39,7 +40,7 @@ def get_gimp_export_path():
                    return current_path
    except FileNotFoundError:
        # Handle the case where the configuration file is not found
-        print("GIMP configuration file not found")
+        logging.debug("GIMP configuration file not found")
        return False


@@ -193,18 +194,18 @@ def structure_check_by_mse(img1, img2, threshold=0.03):
        (np.array(img1, dtype=np.float32) / 255
         - np.array(img2, dtype=np.float32) / 255) ** 2)
    structure_same = True if mse < threshold else False
-    print("MSE: ", mse)
+    logging.debug(f"MSE: {mse}, threshold: {threshold}")
    return structure_same


 def structure_check_by_ssim(img1, img2, threshold=0.9):
    """Check if two images are approximately the same by SSIM"""
    similarity = ssim(np.array(img1), np.array(img2), multichannel=True, channel_axis=-1)
-    print("SSIM: ", similarity)
+    logging.debug("SSIM: %s", similarity)
    return similarity >= threshold


-def check_brightness_decrease_and_structure_sim(src_path, tgt_path):
+def check_brightness_decrease_and_structure_sim(src_path, tgt_path, threshold=0.03):
    """
    Check that the brightness of src is lower than that of tgt and that the structures are similar
    gimp:7a4deb26-d57d-4ea9-9a73-630f66a7b568
@@ -219,13 +220,15 @@ def check_brightness_decrease_and_structure_sim(src_path, tgt_path):
    brightness_src = calculate_brightness(img_src)
    brightness_tgt = calculate_brightness(img_tgt)
    brightness_reduced = brightness_tgt > brightness_src
+    
+    # print(f"Brightness src: {brightness_src}, tgt: {brightness_tgt}, reduced: {brightness_reduced}")

    # Normalize and compare images
    target_brightness = 128
    img_src_normalized = normalize_brightness(img_src, target_brightness)
    img_tgt_normalized = normalize_brightness(img_tgt, target_brightness)

-    structure_same = structure_check_by_mse(img_src_normalized, img_tgt_normalized)
+    structure_same = structure_check_by_mse(img_src_normalized, img_tgt_normalized, threshold=threshold)
    if brightness_reduced and structure_same:
        return 1.
    else:
@@ -362,11 +365,37 @@ def check_structure_sim_resized(src_path, tgt_path):
    img_src = Image.open(src_path)
    img_tgt = Image.open(tgt_path)

-    # Resize the images to the same size
-    img_src = img_src.resize(img_tgt.size)
+    # Check if source image has transparency and extract content area
+    if img_src.mode in ('RGBA', 'LA') or 'transparency' in img_src.info:
+        if img_src.mode != 'RGBA':
+            img_src = img_src.convert('RGBA')
+        
+        # Get alpha channel and find bounding box of non-transparent pixels
+        alpha = img_src.split()[-1]
+        bbox = alpha.getbbox()
+        
+        if bbox is None:
+            # Image is completely transparent
+            logging.debug("Source image is completely transparent")
+            return 0.
+        
+        # Crop to content area only
+        img_src_content = img_src.crop(bbox)
+        logging.debug(f"Source image cropped from {img_src.size} to {img_src_content.size}")
+        
+        # Convert to RGB for comparison
+        img_src_content = img_src_content.convert('RGB')
+        img_src_resized = img_src_content.resize(img_tgt.size)
+    else:
+        # No transparency, resize normally
+        img_src_resized = img_src.resize(img_tgt.size)
+    
+    # Ensure target image is RGB for comparison
+    if img_tgt.mode != 'RGB':
+        img_tgt = img_tgt.convert('RGB')

    # Check if the structure is similar
-    structure_same = structure_check_by_ssim(img_src, img_tgt)
+    structure_same = structure_check_by_ssim(img_src_resized, img_tgt)
    return structure_same


@@ -431,20 +460,52 @@ def check_image_size(src_path, rule):

    # Load the image
    img = Image.open(src_path)
+    
+    # Check if we should ignore transparent parts
+    ignore_transparent = rule.get("ignore_transparent", False)
+    
+    if ignore_transparent and img.mode in ('RGBA', 'LA') or 'transparency' in img.info:
+        # Calculate bounding box of non-transparent pixels
+        if img.mode != 'RGBA':
+            img = img.convert('RGBA')
+        
+        # Get alpha channel
+        alpha = img.split()[-1]
+        
+        # Find bounding box of non-transparent pixels
+        bbox = alpha.getbbox()
+        
+        if bbox is None:
+            # Image is completely transparent
+            actual_width = 0
+            actual_height = 0
+        else:
+            # Calculate actual content size
+            actual_width = bbox[2] - bbox[0]
+            actual_height = bbox[3] - bbox[1]
+        
+        logging.debug(f"Original size: {img.size}, Content size: {actual_width}x{actual_height}")
+    else:
+        # Use original image size
+        actual_width = img.size[0]
+        actual_height = img.size[1]
+        logging.debug(f"Image size: {img.size}")

    # Check the size
    if rule.get("height", None) is not None:
-        height_same = img.size[1] == rule["height"]
+        height_same = actual_height == rule["height"]
    else:
        height_same = True
    if rule.get("width", None) is not None:
-        width_same = img.size[0] == rule["width"]
+        width_same = actual_width == rule["width"]
    else:
        width_same = True

    if height_same and width_same:
+        logging.debug(f"height_same: {height_same}, width_same: {width_same}")
        return 1.
    else:
+        logging.debug(f"height_same: {height_same}, width_same: {width_same}")
        return 0.


--- a/desktop_env/evaluators/metrics/others.py
+++ b/desktop_env/evaluators/metrics/others.py
@@ -63,11 +63,13 @@ def compare_epub(result: str, expected: str) -> float:
    result_files: List[str] = process_epub(result)
    expected_files: List[str] = process_epub(expected)

-    metric: float = 1.
+    metric: float = 0.
    for f1, f2 in zip(result_files, expected_files):
        current_metric: float = diff_text_file(f1, f2)
        logger.debug("%s vs %s: %f", f1, f2, current_metric)
-        metric *= current_metric
+        metric += current_metric
+    if len(result_files) > 0:
+        metric /= len(result_files)
    return metric


--- a/desktop_env/evaluators/metrics/slides.py
+++ b/desktop_env/evaluators/metrics/slides.py
@@ -5,6 +5,7 @@ from math import sqrt

 from pptx import Presentation
 from pptx.util import Inches
+from pptx.enum.shapes import MSO_SHAPE_TYPE

 logger = logging.getLogger("desktopenv.metric.slides")

@@ -139,6 +140,17 @@ def compare_pptx_files(file1_path, file2_path, **options):
    prs1 = Presentation(file1_path)
    prs2 = Presentation(file2_path)

+    approximately_tolerance = options.get("approximately_tolerance", 0.005)
+    def is_approximately_equal(val1, val2, tolerance=approximately_tolerance):
+        """Compare two values within a relative tolerance (default 0.005, i.e. 0.5%)."""
+        if val1 == val2:
+            return True
+        if val1 == 0 and val2 == 0:
+            return True
+        if val1 == 0 or val2 == 0:
+            return False
+        return abs(val1 - val2) / max(abs(val1), abs(val2)) <= tolerance
+
    examine_number_of_slides = options.get("examine_number_of_slides", True)
    examine_shape = options.get("examine_shape", True)
    examine_text = options.get("examine_text", True)
@@ -212,14 +224,20 @@ def compare_pptx_files(file1_path, file2_path, **options):
                if hasattr(shape1, "text") and hasattr(shape2, "text") and shape1.text == shape2.text:
                    if shape1.text == "Product Comparison" and (shape1.top <= shape2.top or shape1.top < 3600000):
                        return 0
-                elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
+                elif (not is_approximately_equal(shape1.left, shape2.left) or 
+                      not is_approximately_equal(shape1.top, shape2.top) or 
+                      not is_approximately_equal(shape1.width, shape2.width) or 
+                      not is_approximately_equal(shape1.height, shape2.height)):
                    return 0

            if examine_table_bottom_position:
                if slide_idx == 3 and shape1.shape_type == 19 and shape2.shape_type == 19:
                    if shape1.top <= shape2.top or shape1.top < 3600000:
                        return 0
-                elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
+                elif (not is_approximately_equal(shape1.left, shape2.left) or 
+                      not is_approximately_equal(shape1.top, shape2.top) or 
+                      not is_approximately_equal(shape1.width, shape2.width) or 
+                      not is_approximately_equal(shape1.height, shape2.height)):
                    return 0

            if examine_right_position:
@@ -231,34 +249,62 @@ def compare_pptx_files(file1_path, file2_path, **options):
                if slide_idx == 2 and shape1.shape_type == 13 and shape2.shape_type == 13:
                    if shape1.top >= shape2.top or shape1.top > 1980000:
                        return 0
-                elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
+                elif (not is_approximately_equal(shape1.left, shape2.left) or 
+                      not is_approximately_equal(shape1.top, shape2.top) or 
+                      not is_approximately_equal(shape1.width, shape2.width) or 
+                      not is_approximately_equal(shape1.height, shape2.height)):
                    return 0

            if examine_shape_for_shift_size:
-                if shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
+                if (not is_approximately_equal(shape1.left, shape2.left) or 
+                    not is_approximately_equal(shape1.top, shape2.top) or 
+                    not is_approximately_equal(shape1.width, shape2.width) or 
+                    not is_approximately_equal(shape1.height, shape2.height)):
                    if not (hasattr(shape1, "text") and hasattr(shape2,
                                                                "text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."):
                        return 0

            if (
-                    shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height) and examine_shape:
+                    not is_approximately_equal(shape1.left, shape2.left) or 
+                    not is_approximately_equal(shape1.top, shape2.top) or 
+                    not is_approximately_equal(shape1.width, shape2.width) or 
+                    not is_approximately_equal(shape1.height, shape2.height)) and examine_shape:
                return 0

            if examine_image_size:
                if shape1.shape_type == 13 and shape2.shape_type == 13:
-                    if shape1.width != shape2.width or shape1.height != shape2.height:
+                    if not is_approximately_equal(shape1.width, shape2.width) or not is_approximately_equal(shape1.height, shape2.height):
                        return 0
-                elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
+                elif (not is_approximately_equal(shape1.left, shape2.left) or 
+                      not is_approximately_equal(shape1.top, shape2.top) or 
+                      not is_approximately_equal(shape1.width, shape2.width) or 
+                      not is_approximately_equal(shape1.height, shape2.height)):
                    return 0

            if examine_modify_height:
                if not hasattr(shape1, "text") and not hasattr(shape2,
                                                               "text") or shape1.shape_type == 5 and shape2.shape_type == 5:
-                    if shape1.height != shape2.height:
+                    if not is_approximately_equal(shape1.height, shape2.height):
                        return 0
-                elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
+                elif (not is_approximately_equal(shape1.left, shape2.left) or 
+                      not is_approximately_equal(shape1.top, shape2.top) or 
+                      not is_approximately_equal(shape1.width, shape2.width) or 
+                      not is_approximately_equal(shape1.height, shape2.height)):
                    return 0

+            if shape1.shape_type == MSO_SHAPE_TYPE.TABLE:
+                table1 = shape1.table
+                table2 = shape2.table
+                for row_idx in range(len(table1.rows)):
+                    for col_idx in range(len(table1.columns)):
+                        cell1 = table1.cell(row_idx, col_idx)
+                        cell2 = table2.cell(row_idx, col_idx)
+
+                        for para1, para2 in zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs):
+                            for run1, run2 in zip(para1.runs, para2.runs):
+                                if run1.font.color.rgb != run2.font.color.rgb:
+                                    return 0
+
            if hasattr(shape1, "text") and hasattr(shape2, "text"):
                if shape1.text.strip() != shape2.text.strip() and examine_text:
                    return 0
@@ -288,15 +334,19 @@ def compare_pptx_files(file1_path, file2_path, **options):
                            return 0

                        if run1.font.italic != run2.font.italic and examine_font_italic:
-                            return 0
+                            if run1.font.italic is not None and run2.font.italic is not None:
+                                return 0

                        if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
                            if run1.font.color.rgb != run2.font.color.rgb and examine_color_rgb:
                                return 0

                        if run1.font.underline != run2.font.underline and examine_font_underline:
-                            return 0
-
+                            if run1.font.underline is not None and run2.font.underline is not None:
+                                return 0
+                            if (run1.font.underline is None and run2.font.underline is True) or (run1.font.underline is True and run2.font.underline is None):
+                                return 0
+                                
                        if run1.font._element.attrib.get('strike', 'noStrike') != run2.font._element.attrib.get(
                                'strike', 'noStrike') and examine_strike_through:
                            return 0
@@ -325,14 +375,20 @@ def compare_pptx_files(file1_path, file2_path, **options):
                                    color = "No Color"

                                text = "".join(t.text for t in paragraph.findall('.//a:t', namespaces))
-
-                                bullets.append((lvl, char, text, color))
+                                
+                                # Only add non-empty paragraphs to bullets list
+                                if text.strip():
+                                    bullets.append((lvl, char, text, color))

                            return bullets

-                        if examine_bullets and _extract_bullets(run1.part.blob.decode('utf-8')) != _extract_bullets(
-                                run2.part.blob.decode('utf-8')):
-                            return 0
+                        if examine_bullets:
+                            bullets1 = _extract_bullets(run1.part.blob.decode('utf-8'))
+                            bullets2 = _extract_bullets(run2.part.blob.decode('utf-8'))
+                            
+                            # Compare only non-empty bullets
+                            if bullets1 != bullets2:
+                                return 0

                    # fixme: Actually there are more properties to be compared, we can add them later via parsing the xml data

@@ -446,17 +502,13 @@ def check_left_panel(accessibility_tree):

    root = ET.fromstring(accessibility_tree)

-    for root_pane in root.iter('root-pane'):
-        for panel in root_pane.iter('panel'):
-            for split_pane in panel.iter('split-pane'):
-                # Get the left panel
-                if split_pane.attrib.get("{{{}}}parentcoord".format(namespaces['cp'])) == "(0, 0)":
-                    # Get the visible attribute
-                    visible = split_pane.attrib.get("{{{}}}visible".format(namespaces['st']))
-                    if visible:
-                        # decide if it is left panel
-                        return 1.
+    # Iterate over every document-frame node
+    for doc_frame in root.iter('document-frame'):
+        if doc_frame.attrib.get("name") == "Slides View":
+            # A "Slides View" frame exists, which means the left panel is open
+            return 1.

+    # No "Slides View" frame was found, so treat the left panel as closed
    return 0.


--- a/desktop_env/evaluators/metrics/vlc.py
+++ b/desktop_env/evaluators/metrics/vlc.py
@@ -308,7 +308,7 @@ def compare_videos(video_path1, video_path2, max_frames_to_check=100, threshold=

        # If a video ends, then check if both ended to confirm they are of the same length
        if not ret1 or not ret2:
-            return ret1 == ret2
+            return 1. if ret1 == ret2 else 0. # return float only 

        # Convert frames to PIL Images
        frame1 = Image.fromarray(cv2.cvtColor(frame1, cv2.COLOR_BGR2RGB))