fix: Enhance error handling and logging across multiple evaluators

- Added logging for file retrieval and error handling in file.py, improving robustness during file operations. - Implemented checks for file existence and parsing errors in general.py, enhancing reliability in JSON/YAML processing. - Improved table comparison logic in table.py with detailed error logging for sheet loading and cell value reading. - Enhanced metrics evaluation in slides.py with additional checks for paragraph and run counts, ensuring thorough comparison. - Updated utils.py to include file existence checks and detailed error logging during cell value reading.
2025-07-14 05:43:17 +00:00
parent 349f2fd9fe
commit 0651495d88
5 changed files with 226 additions and 48 deletions
--- a/desktop_env/evaluators/getters/file.py
+++ b/desktop_env/evaluators/getters/file.py
@@ -1,10 +1,13 @@
 import os
 import logging
 from typing import Dict, List, Set
 from typing import Optional, Any, Union
 from datetime import datetime
 import requests
 import pandas as pd
 logger = logging.getLogger("desktopenv.getter.file")
 def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
    """
@@ -101,16 +104,42 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
    for i, (p, d) in enumerate(zip(paths, dests)):
        _path = os.path.join(env.cache_dir, d)
-        file = env.controller.get_file(p)
+        
-        if file is None:
+        try:
            # Try to get file from VM
            file = env.controller.get_file(p)
            if file is None:
                logger.warning(f"Failed to get file from VM: {p}")
                if i in gives:
                    cache_paths.append(None)
                continue
            if i in gives:
                cache_paths.append(_path)
            # Write file with robust error handling
            try:
                # Ensure cache directory exists
                os.makedirs(env.cache_dir, exist_ok=True)
                with open(_path, "wb") as f:
                    f.write(file)
                logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")
            except IOError as e:
                logger.error(f"IO error writing file {_path}: {e}")
                if i in gives:
                    cache_paths[-1] = None  # Replace the path we just added with None
            except Exception as e:
                logger.error(f"Unexpected error writing file {_path}: {e}")
                if i in gives:
                    cache_paths[-1] = None
        except Exception as e:
            logger.error(f"Error processing file {p}: {e}")
            if i in gives:
                cache_paths.append(None)
-            continue
+                
        if i in gives:
            cache_paths.append(_path)
        with open(_path, "wb") as f:
            f.write(file)
    return cache_paths[0] if len(cache_paths)==1 else cache_paths
--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -298,34 +298,84 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
    """
    if result is None:
        logger.warning("Result file path is None, returning 0.0")
        return 0.
    # Check if file exists
    if not os.path.exists(result):
        logger.warning(f"Result file does not exist: {result}, returning 0.0")
        return 0.
    try:
        with open(result, 'r', encoding='utf-8') as f:
            if is_yaml:
                try:
                    # Use SafeLoader instead of Loader for better security and error handling
                    result_data: Dict[str, Any] = yaml.safe_load(f)
                    if result_data is None:
                        logger.warning(f"YAML file {result} is empty or contains only null values, returning 0.0")
                        return 0.
                except yaml.YAMLError as e:
                    logger.error(f"YAML parsing error in file {result}: {e}")
                    logger.error(f"File content might be corrupted or have invalid YAML syntax")
                    return 0.
                except Exception as e:
                    logger.error(f"Unexpected error parsing YAML file {result}: {e}")
                    return 0.
            else:
                try:
                    result_data: Dict[str, Any] = json.load(f)
                except json.JSONDecodeError as e:
                    logger.error(f"JSON parsing error in file {result}: {e}")
                    return 0.
                except Exception as e:
                    logger.error(f"Unexpected error parsing JSON file {result}: {e}")
                    return 0.
    except IOError as e:
        logger.error(f"IO error reading file {result}: {e}")
        return 0.
    except Exception as e:
        logger.error(f"Unexpected error reading file {result}: {e}")
        return 0.
    with open(result) as f:
        if is_yaml:
            result: Dict[str, Any] = yaml.load(f, Loader=yaml.Loader)
        else:
            result: Dict[str, Any] = json.load(f)
    expect_rules = rules.get("expect", {})
    unexpect_rules = rules.get("unexpect", {})
    metric = True
    for r in expect_rules:
-        value = result
+        value = result_data
-        for k in r["key"]:
+        try:
-            try:
+            for k in r["key"]:
-                value = value[k]
+                try:
-            except KeyError:
+                    value = value[k]
-                return 0.
+                except KeyError:
-        metric = metric and _match_value_to_rule(value, r)
+                    logger.debug(f"Key '{k}' not found in result data, returning 0.0")
                    return 0.
                except TypeError:
                    logger.debug(f"Cannot access key '{k}' - value is not a dictionary, returning 0.0")
                    return 0.
            metric = metric and _match_value_to_rule(value, r)
        except Exception as e:
            logger.error(f"Error processing expect rule {r}: {e}")
            return 0.
    for r in unexpect_rules:
-        value = result
+        value = result_data
-        for k in r["key"]:
+        try:
-            try:
+            for k in r["key"]:
-                value = value[k]
+                try:
-            except KeyError:
+                    value = value[k]
-                value = None
+                except KeyError:
-                break
+                    value = None
-        metric = metric and not _match_value_to_rule(value, r)
+                    break
                except TypeError:
                    value = None
                    break
            metric = metric and not _match_value_to_rule(value, r)
        except Exception as e:
            logger.error(f"Error processing unexpect rule {r}: {e}")
            return 0.
    return float(metric)
--- a/desktop_env/evaluators/metrics/slides.py
+++ b/desktop_env/evaluators/metrics/slides.py
@@ -73,6 +73,9 @@ def check_image_stretch_and_center(modified_ppt, original_ppt):
    original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
    modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]
    if not original_slide_images:
        return 0.
    the_image = original_slide_images[0]
    the_modified_image = None
@@ -395,12 +398,38 @@ def compare_pptx_files(file1_path, file2_path, **options):
                table2 = shape2.table
                if enable_debug:
                    debug_logger.debug(f"  Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns")
                    debug_logger.debug(f"  Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns")
                # Check if tables have the same dimensions
                if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
                    if enable_debug:
                        debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:")
                        debug_logger.debug(f"      Table1: {len(table1.rows)} rows x {len(table1.columns)} columns")
                        debug_logger.debug(f"      Table2: {len(table2.rows)} rows x {len(table2.columns)} columns")
                    return 0
                for row_idx in range(len(table1.rows)):
                    for col_idx in range(len(table1.columns)):
                        cell1 = table1.cell(row_idx, col_idx)
                        cell2 = table2.cell(row_idx, col_idx)
                        # Check if cells have the same number of paragraphs
                        if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs):
                            if enable_debug:
                                debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:")
                                debug_logger.debug(f"      Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}")
                                debug_logger.debug(f"      Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}")
                            return 0
                        for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)):
                            # Check if paragraphs have the same number of runs
                            if len(para1.runs) != len(para2.runs):
                                if enable_debug:
                                    debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:")
                                    debug_logger.debug(f"      Para1 runs: {len(para1.runs)}")
                                    debug_logger.debug(f"      Para2 runs: {len(para2.runs)}")
                                return 0
                            for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)):
                                # Check font color
                                if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"):
@@ -451,6 +480,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
                if shape1.text.strip() != shape2.text.strip() and examine_text:
                    return 0
                # check if the number of paragraphs are the same
                if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs):
                    if enable_debug:
                        debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:")
                        debug_logger.debug(f"      Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}")
                        debug_logger.debug(f"      Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}")
                    return 0
                    # check if the paragraphs are the same
                para_idx = 0
                for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
@@ -487,6 +524,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
                    if para1.level != para2.level and examine_indent:
                        return 0
                    # check if the number of runs are the same
                    if len(para1.runs) != len(para2.runs):
                        if enable_debug:
                            debug_logger.debug(f"    MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:")
                            debug_logger.debug(f"      Para1 runs: {len(para1.runs)}")
                            debug_logger.debug(f"      Para2 runs: {len(para2.runs)}")
                        return 0
                    for run1, run2 in zip(para1.runs, para2.runs):
                        # check if the font properties are the same                        
@@ -634,6 +679,12 @@ def compare_pptx_files(file1_path, file2_path, **options):
                        debug_logger.debug(f"    MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'")
                    return 0
                # Check if text shapes have the same number of paragraphs
                if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs):
                    if enable_debug:
                        debug_logger.debug(f"    MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}")
                    return 0
                # Compare alignment of each paragraph
                for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)):
                    from pptx.enum.text import PP_ALIGN
--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -36,8 +36,14 @@ def _parse_sheet_idx(sheet_idx: Union[int, str]
    #  function _parse_sheet_idx {{{ # 
    if isinstance(sheet_idx, int):
        try:
-            index: str = result_sheet_names[sheet_idx]
+            if not result_sheet_names or sheet_idx >= len(result_sheet_names):
-        except:
+                logger.error(f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}")
                index = ""
            else:
                index: str = result_sheet_names[sheet_idx]
                logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}")
        except Exception as e:
            logger.error(f"Error resolving sheet index {sheet_idx}: {e}")
            index = ""
        book: BOOK = result
    elif sheet_idx.startswith("RI"):
@@ -114,12 +120,21 @@ def compare_table(result: str, expected: str = None, **options) -> float:
    """
    if result is None:
        logger.error("Result file path is None")
        return 0.
    # Check if result file exists
    if not os.path.exists(result):
        logger.error(f"Result file not found: {result}")
        return 0.
    try:
        logger.info(f"Loading result file: {result}")
        xlworkbookr: Workbook = openpyxl.load_workbook(filename=result)
        pdworkbookr = pd.ExcelFile(result)
-    except:
+        logger.info(f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}")
    except Exception as e:
        logger.error(f"Failed to load result file {result}: {e}")
        return 0.
    worksheetr_names: List[str] = pdworkbookr.sheet_names
@@ -432,19 +447,35 @@ def compare_table(result: str, expected: str = None, **options) -> float:
            # props: dict like {attribute: {"method": str, "ref": anything}}
            #   supported attributes: value & those supported by utils._read_cell_style
-            sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
+            try:
-            if sheet is None:
+                sheet: Worksheet = _load_sheet(*parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke))
-                return 0.
+                if sheet is None:
-            # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
+                    logger.error(f"Failed to load sheet for sheet_idx: {r['sheet_idx']}")
-            cell: Cell = sheet[r["coordinate"]]
+                    return 0.
-            metric: bool = True
+                # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke))
-            for prpt, rule in r["props"].items():
+                cell: Cell = sheet[r["coordinate"]]
-                if prpt == "value":
+                metric: bool = True
-                    val = read_cell_value(*parse_idx(r["sheet_idx"], result, expected), r["coordinate"])
+                for prpt, rule in r["props"].items():
-                else:
+                    if prpt == "value":
-                    val = _read_cell_style(prpt, cell)
+                        try:
                            parsed_result = parse_idx(r["sheet_idx"], result, expected)
                            logger.debug(f"parse_idx result: {parsed_result}")
                            val = read_cell_value(*parsed_result, r["coordinate"])
                            logger.debug(f"Cell {r['coordinate']} value: {val}")
                        except Exception as e:
                            logger.error(f"Failed to read cell value at {r['coordinate']}: {e}")
                            val = None
                    else:
                        try:
                            val = _read_cell_style(prpt, cell)
                        except Exception as e:
                            logger.error(f"Failed to read cell style {prpt} at {r['coordinate']}: {e}")
                            val = None
-                metric = metric and _match_value_to_rule(val, rule)
+                    metric = metric and _match_value_to_rule(val, rule)
            except Exception as e:
                logger.error(f"Error in check_cell processing: {e}")
                return 0.
            logger.debug("Assertion: %s[%s] :%s - %s"
                         , r["sheet_idx"], r["coordinate"]
--- a/desktop_env/evaluators/metrics/utils.py
+++ b/desktop_env/evaluators/metrics/utils.py
@@ -4,6 +4,7 @@ import functools
 import itertools
 import logging
 import operator
 import os
 import re
 import zipfile
 #import pandas as pd
@@ -33,10 +34,11 @@ V = TypeVar("Value")
 logger = logging.getLogger("desktopenv.metrics.utils")
-_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
+_xlsx_namespaces = [
-    , ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
+    ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
-    , ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
+    ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
-                    ]
+    ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
 ]
 _xlsx_ns_mapping = dict(_xlsx_namespaces)
 _xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
 _xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
@@ -282,6 +284,13 @@ _shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx
 def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
    #  read_cell_value {{{ # 
    logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")
    # Check if file exists
    if not os.path.exists(xlsx_file):
        logger.error(f"Excel file not found: {xlsx_file}")
        return None
    try:
        with zipfile.ZipFile(xlsx_file, "r") as z_f:
            try:
@@ -308,9 +317,17 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
                                               , namespaces=_xlsx_ns_mapping
                                               )(sheet)
                if len(cells) == 0:
                    logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
                    return None
                cell: _Element = cells[0]
-    except zipfile.BadZipFile:
+    except zipfile.BadZipFile as e:
        logger.error(f"Bad zip file {xlsx_file}: {e}")
        return None
    except KeyError as e:
        logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error reading {xlsx_file}: {e}")
        return None
    cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")