From 0651495d8801ec3ae5b0bcf32d362a1c5a440b40 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Mon, 14 Jul 2025 05:43:17 +0000 Subject: [PATCH] fix: Enhance error handling and logging across multiple evaluators - Added logging for file retrieval and error handling in file.py, improving robustness during file operations. - Implemented checks for file existence and parsing errors in general.py, enhancing reliability in JSON/YAML processing. - Improved table comparison logic in table.py with detailed error logging for sheet loading and cell value reading. - Enhanced metrics evaluation in slides.py with additional checks for paragraph and run counts, ensuring thorough comparison. - Updated utils.py to include file existence checks and detailed error logging during cell value reading. --- desktop_env/evaluators/getters/file.py | 45 ++++++++++-- desktop_env/evaluators/metrics/general.py | 90 ++++++++++++++++++----- desktop_env/evaluators/metrics/slides.py | 51 +++++++++++++ desktop_env/evaluators/metrics/table.py | 61 +++++++++++---- desktop_env/evaluators/metrics/utils.py | 27 +++++-- 5 files changed, 226 insertions(+), 48 deletions(-) diff --git a/desktop_env/evaluators/getters/file.py b/desktop_env/evaluators/getters/file.py index f4ab03a..f329d10 100644 --- a/desktop_env/evaluators/getters/file.py +++ b/desktop_env/evaluators/getters/file.py @@ -1,10 +1,13 @@ import os +import logging from typing import Dict, List, Set from typing import Optional, Any, Union from datetime import datetime import requests import pandas as pd +logger = logging.getLogger("desktopenv.getter.file") + def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any: """ @@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option for i, (p, d) in enumerate(zip(paths, dests)): _path = os.path.join(env.cache_dir, d) - file = env.controller.get_file(p) - if file is None: + + try: + # Try to get file from VM + file = env.controller.get_file(p) + if file is None: + logger.warning(f"Failed to get file from VM: {p}") + if i in gives: + cache_paths.append(None) + continue + + if i in gives: + cache_paths.append(_path) + + # Write file with robust error handling + try: + # Ensure cache directory exists + os.makedirs(env.cache_dir, exist_ok=True) + + with open(_path, "wb") as f: + f.write(file) + logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)") + + except IOError as e: + logger.error(f"IO error writing file {_path}: {e}") + if i in gives: + cache_paths[-1] = None # Replace the path we just added with None + except Exception as e: + logger.error(f"Unexpected error writing file {_path}: {e}") + if i in gives: + cache_paths[-1] = None + + except Exception as e: + logger.error(f"Error processing file {p}: {e}") if i in gives: cache_paths.append(None) - continue - - if i in gives: - cache_paths.append(_path) - with open(_path, "wb") as f: - f.write(file) + return cache_paths[0] if len(cache_paths)==1 else cache_paths diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py index d0f6195..27356a2 100644 --- a/desktop_env/evaluators/metrics/general.py +++ b/desktop_env/evaluators/metrics/general.py @@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str """ if result is None: + logger.warning("Result file path is None, returning 0.0") + return 0. 
+
+    # Check if file exists
+    if not os.path.exists(result):
+        logger.warning(f"Result file does not exist: {result}, returning 0.0")
+        return 0.
+
+    try:
+        with open(result, 'r', encoding='utf-8') as f:
+            if is_yaml:
+                try:
+                    # Use SafeLoader instead of Loader for better security and error handling
+                    result_data: Dict[str, Any] = yaml.safe_load(f)
+                    if result_data is None:
+                        logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
+                        return 0.
+                except yaml.YAMLError as e:
+                    logger.error(f"YAML parsing error in file {result}: {e}")
+                    logger.error("File content might be corrupted or have invalid YAML syntax")
+                    return 0.
+                except Exception as e:
+                    logger.error(f"Unexpected error parsing YAML file {result}: {e}")
+                    return 0.
+            else:
+                try:
+                    result_data: Dict[str, Any] = json.load(f)
+                except json.JSONDecodeError as e:
+                    logger.error(f"JSON parsing error in file {result}: {e}")
+                    return 0.
+                except Exception as e:
+                    logger.error(f"Unexpected error parsing JSON file {result}: {e}")
+                    return 0.
+    except IOError as e:
+        logger.error(f"IO error reading file {result}: {e}")
+        return 0.
+    except Exception as e:
+        logger.error(f"Unexpected error reading file {result}: {e}")
         return 0.
-    with open(result) as f:
-        if is_yaml:
-            result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
-        else:
-            result: Dict[str, Any] = json.load(f)
 
     expect_rules = rules.get("expect", {})
     unexpect_rules = rules.get("unexpect", {})
     metric = True
     for r in expect_rules:
-        value = result
-        for k in r["key"]:
-            try:
-                value = value[k]
-            except KeyError:
-                return 0.
-        metric = metric and _match_value_to_rule(value, r)
+        value = result_data
+        try:
+            for k in r["key"]:
+                try:
+                    value = value[k]
+                except KeyError:
+                    logger.debug(f"Key '{k}' not found in result data, returning 0.0")
+                    return 0.
+                except TypeError:
+                    logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
+                    return 0.
+            metric = metric and _match_value_to_rule(value, r)
+        except Exception as e:
+            logger.error(f"Error processing expect rule {r}: {e}")
+            return 0.
+
     for r in unexpect_rules:
-        value = result
-        for k in r["key"]:
-            try:
-                value = value[k]
-            except KeyError:
-                value = None
-                break
-        metric = metric and not _match_value_to_rule(value, r)
+        value = result_data
+        try:
+            for k in r["key"]:
+                try:
+                    value = value[k]
+                except KeyError:
+                    value = None
+                    break
+                except TypeError:
+                    value = None
+                    break
+            metric = metric and not _match_value_to_rule(value, r)
+        except Exception as e:
+            logger.error(f"Error processing unexpect rule {r}: {e}")
+            return 0.
+
     return float(metric)
diff --git a/desktop_env/evaluators/metrics/slides.py b/desktop_env/evaluators/metrics/slides.py
index bee4f1a..81c4af9 100644
--- a/desktop_env/evaluators/metrics/slides.py
+++ b/desktop_env/evaluators/metrics/slides.py
@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
     original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
     modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]
 
+    if not original_slide_images:
+        return 0.
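+    # (without this guard, original_slide_images[0] below would raise IndexError)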
+
     the_image = original_slide_images[0]
     the_modified_image = None
 
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
                     table2 = shape2.table
                     if enable_debug:
                         debug_logger.debug(f"  Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
+                        debug_logger.debug(f"  Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")
+
+                    # Check if tables have the same dimensions
+                    if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
+                        if enable_debug:
+                            debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
+                            debug_logger.debug(f"    Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
+                            debug_logger.debug(f"    Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
+                        return 0
+
                     for row_idx in range(len(table1.rows)):
                         for col_idx in range(len(table1.columns)):
                             cell1 = table1.cell(row_idx, col_idx)
                             cell2 = table2.cell(row_idx, col_idx)
 
+                            # Check if cells have the same number of paragraphs
+                            if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
+                                if enable_debug:
+                                    debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
+                                    debug_logger.debug(f"    Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
+                                    debug_logger.debug(f"    Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
+                                return 0
+
                             for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
+                                # Check if paragraphs have the same number of runs
+                                if len(para1.runs) != len(para2.runs):
+                                    if enable_debug:
+                                        debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
+                                        debug_logger.debug(f"    Para1 runs: {len(para1.runs)}")
+                                        debug_logger.debug(f"    Para2 runs: {len(para2.runs)}")
+                                    return 0
+
                                 for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
                                     # Check font color
                                     if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
             if shape1.text.strip() != shape2.text.strip() and examine_text:
                 return 0
 
+            # check if the number of paragraphs is the same
+            if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
+                if enable_debug:
+                    debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
+                    debug_logger.debug(f"    Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
+                    debug_logger.debug(f"    Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
+                return 0
+
             # check if the paragraphs are the same
             para_idx = 0
             for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
                 if para1.level != para2.level and examine_indent:
                     return 0
 
+                # check if the number of runs is the same
+                if len(para1.runs) != len(para2.runs):
+                    if enable_debug:
+                        debug_logger.debug(f"  MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
+                        debug_logger.debug(f"    Para1 runs: {len(para1.runs)}")
+                        debug_logger.debug(f"    Para2 runs: {len(para2.runs)}")
+                    return 0
+
                 for run1, run2 in zip(para1.runs, para2.runs):
 
                     # check if the font properties are the same
@@ -634,6 +679,12 @@
debug_logger.debug(f" MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'") return 0 + # Check if text shapes have the same number of paragraphs + if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs): + if enable_debug: + debug_logger.debug(f" MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}") + return 0 + # Compare alignment of each paragraph for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)): from pptx.enum.text import PP_ALIGN diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py index db51850..5a0f79d 100644 --- a/desktop_env/evaluators/metrics/table.py +++ b/desktop_env/evaluators/metrics/table.py @@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str] # function _parse_sheet_idx {{{ # if isinstance(sheet_idx, int): try: - index: str = result_sheet_names[sheet_idx] - except: + if not result_sheet_names or sheet_idx >= len(result_sheet_names): + logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}") + index = "" + else: + index: str = result_sheet_names[sheet_idx] + logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}") + except Exception as e: + logger.error(f"Error resolving sheet index {sheet_idx}: {e}") index = "" book: BOOK = result elif sheet_idx.startswith("RI"): @@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float: """ if result is None: + logger.error("Result file path is None") + return 0. + + # Check if result file exists + if not os.path.exists(result): + logger.error(f"Result file not found: {result}") return 0. try: + logger.info(f"Loading result file: {result}") xlworkbookr: Workbook = openpyxl.load_workbook(filename=result) pdworkbookr = pd.ExcelFile(result) - except: + logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}") + except Exception as e: + logger.error(f"Failed to load result file {result}: {e}") return 0. worksheetr_names: List[str] = pdworkbookr.sheet_names @@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float: # props: dict like {attribute: {"method": str, "ref": anything}} # supported attributes: value & those supported by utils._read_cell_style - sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke)) - if sheet is None: - return 0. - # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke)) - cell: Cell = sheet[r["coordinate"]] - metric: bool = True - for prpt, rule in r["props"].items(): - if prpt == "value": - val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"]) - else: - val = _read_cell_style(prpt, cell) + try: + sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke)) + if sheet is None: + logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}") + return 0. 
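+                # The property reads below fail soft: on any error, val falls back
+                # to None and the rule comparison proceeds instead of raising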
+ # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke)) + cell: Cell = sheet[r["coordinate"]] + metric: bool = True + for prpt, rule in r["props"].items(): + if prpt == "value": + try: + parsed_result = parse_idx(r["sheet_idx"], result, expected) + logger.debug(f"parse_idx result: {parsed_result}") + val = read_cell_value(*parsed_result, r["coordinate"]) + logger.debug(f"Cell {r['coordinate']} value: {val}") + except Exception as e: + logger.error(f"Failed to read cell value at {r['coordinate']}: {e}") + val = None + else: + try: + val = _read_cell_style(prpt, cell) + except Exception as e: + logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}") + val = None - metric = metric and _match_value_to_rule(val, rule) + metric = metric and _match_value_to_rule(val, rule) + except Exception as e: + logger.error(f"Error in check_cell processing: {e}") + return 0. logger.debug("Assertion: %s[%s] :%s - %s" , r["sheet_idx"], r["coordinate"] diff --git a/desktop_env/evaluators/metrics/utils.py b/desktop_env/evaluators/metrics/utils.py index e512a26..1136655 100644 --- a/desktop_env/evaluators/metrics/utils.py +++ b/desktop_env/evaluators/metrics/utils.py @@ -4,6 +4,7 @@ import functools import itertools import logging import operator +import os import re import zipfile #import pandas as pd @@ -33,10 +34,11 @@ V = TypeVar("Value") logger = logging.getLogger("desktopenv.metrics.utils") -_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main") - , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main") - , ("xm", "http://schemas.microsoft.com/office/excel/2006/main") - ] +_xlsx_namespaces = [ + ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"), + ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"), + ("xm", "http://schemas.microsoft.com/office/excel/2006/main") +] _xlsx_ns_mapping = dict(_xlsx_namespaces) _xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces)) _xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None @@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: # read_cell_value {{{ # + logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}") + + # Check if file exists + if not os.path.exists(xlsx_file): + logger.error(f"Excel file not found: {xlsx_file}") + return None + try: with zipfile.ZipFile(xlsx_file, "r") as z_f: try: @@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: , namespaces=_xlsx_ns_mapping )(sheet) if len(cells) == 0: + logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}") return None cell: _Element = cells[0] - except zipfile.BadZipFile: + except zipfile.BadZipFile as e: + logger.error(f"Bad zip file {xlsx_file}: {e}") + return None + except KeyError as e: + logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}") + return None + except Exception as e: + logger.error(f"Error reading {xlsx_file}: {e}") return None cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
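
--
Illustration (appended note, not part of the diff): the intent across these hunks
is that evaluators fail soft on broken input - a missing or unparsable file now
logs and scores 0.0 (or yields None for a single cell) instead of raising. A
minimal sketch of that behaviour for check_json, assuming is_yaml defaults to
False and that the "method"/"ref" rule shape below matches _match_value_to_rule:

    from desktop_env.evaluators.metrics.general import check_json

    # Hypothetical rule set: expect a top-level "status" key equal to "ok"
    rules = {
        "expect": [{"key": ["status"], "method": "eq", "ref": "ok"}],
        "unexpect": [],
    }

    # Before this patch, a missing result file raised FileNotFoundError from
    # open(); now check_json logs a warning and returns 0.0
    assert check_json("/tmp/nonexistent_result.json", rules) == 0.0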