From 6897e5320da04de35c259652c095431ca1952646 Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Thu, 10 Jul 2025 22:32:53 +0000
Subject: [PATCH] Enhance image text comparison functionality with detailed
 logging

- Added logging for OCR results and text matching outcomes in compare_image_text function.
- Updated the 02ce9a50 JSON example to run compare_image_text three times combined with "conj": "or", so that OCR misreadings of "ls" (such as "1s") are also accepted as expected results.
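- Extended check_direct_json_object so that a string result matching any value in an expected list is accepted, and expanded the expected name lists in the ce2b64a2 JSON example with lowercase, ".jpg", and alternative-name variations for better matching accuracy.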
---
 desktop_env/evaluators/metrics/docs.py        | 25 +++++-
 desktop_env/evaluators/metrics/general.py     |  7 +-
 .../02ce9a50-7af2-47ed-8596-af0c230501f8.json | 57 +++++++++---
 .../ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json | 90 ++++++++++++++++++-
 4 files changed, 161 insertions(+), 18 deletions(-)

diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py
index 26ec624..52f5e10 100644
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -297,8 +297,24 @@ def compare_image_text(image_path, rule):
     reader = easyocr.Reader(['en'])
     result = reader.readtext(image_path)
     extracted_text = ' '.join([entry[1] for entry in result])
+    
+    # Log OCR results
+    logger.info(f"OCR extracted texts: {[entry[1] for entry in result]}")
+    logger.info(f"Combined extracted text: {extracted_text}")
+    
     if rule['type'] == 'text':
-        return 1 if rule['text'] in extracted_text else 0
+        target_text = rule['text']
+        match_found = target_text in extracted_text
+        
+        # Log matching results
+        logger.info(f"Target text: '{target_text}'")
+        logger.info(f"Match found: {match_found}")
+        if match_found:
+            logger.info("✅ Text matching successful!")
+        else:
+            logger.info("❌ Text matching failed!")
+        
+        return 1 if match_found else 0
     else:
         raise ValueError("Unsupported rule type")
 
@@ -986,3 +1002,10 @@ def compare_unique_train_records(processed_file, expected_files, **kwargs):
         return 0
 
     return 1
+
+if __name__ == "__main__":
+    image_path = "/home/ubuntu/OSWorld/cache/02ce9a50-7af2-47ed-8596-af0c230501f8/ls.png"
+    print(compare_image_text(image_path, {
+        "type": "text",
+        "text": "ls"
+      }))
\ No newline at end of file
diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py
index 2380b35..03e66a4 100644
--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -402,12 +402,17 @@ def check_direct_json_object(result, rules) -> float:
                     expected_value_list = expected_json.get(key)
                     logger.info(f"[DEBUG] Checking list key '{key}': expected_list={expected_value_list}, actual='{result.get(key)}'")
                     for each_expected_value in expected_value_list:
+                        # Handle both list and string cases
                         if isinstance(result.get(key), list) and each_expected_value in result.get(key):
                             flag = 1
                             logger.info(f"[DEBUG] Found expected value '{each_expected_value}' in result list for key '{key}'")
                             break
+                        elif isinstance(result.get(key), str) and each_expected_value == result.get(key):
+                            flag = 1
+                            logger.info(f"[DEBUG] Found expected value '{each_expected_value}' matches result string for key '{key}'")
+                            break
                     if flag == 0:
-                        logger.info(f"[DEBUG] No expected values found in result list for key '{key}', returning 0.0")
+                        logger.info(f"[DEBUG] No expected values found in result for key '{key}', returning 0.0")
                         return 0.
                 elif isinstance(expected_json.get(key), str):
                     expected_str = expected_json.get(key)
diff --git a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json
index 8658658..329f223 100644
--- a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json
+++ b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json
@@ -44,19 +44,52 @@
     "os"
   ],
   "evaluator": {
-    "func": "compare_image_text",
-    "result": {
-      "type": "vm_file",
-      "path": "/home/user/Desktop/ls.png",
-      "dest": "ls.png"
-    },
-    "expected": {
-      "type": "rule",
-      "rules": {
-        "type": "text",
-        "text": "ls"
+    "func": [
+      "compare_image_text",
+      "compare_image_text",
+      "compare_image_text"
+    ],
+    "conj": "or",
+    "result": [
+      {
+        "type": "vm_file",
+        "path": "/home/user/Desktop/ls.png",
+        "dest": "ls.png"
+      },
+      {
+        "type": "vm_file",
+        "path": "/home/user/Desktop/ls.png",
+        "dest": "ls.png"
+      },
+      {
+        "type": "vm_file",
+        "path": "/home/user/Desktop/ls.png",
+        "dest": "ls.png"
       }
-    }
+    ],
+    "expected": [
+      {
+        "type": "rule",
+        "rules": {
+          "type": "text",
+          "text": "ls"
+        }
+      },
+      {
+        "type": "rule",
+        "rules": {
+          "type": "text",
+          "text": "1s"
+        }
+      },
+      {
+        "type": "rule",
+        "rules": {
+          "type": "text",
+          "text": "1s"
+        }
+      }
+    ]
   },
   "proxy": false
 }
\ No newline at end of file
diff --git a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json
index a5ffa57..21e6887 100644
--- a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json
+++ b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json
@@ -69,16 +69,98 @@
       "rules": {
         "expected": {
           "ec076282f61ba74642e94b5a6a1250c6988204d59d9b02936606b6b8ef1e4433": [
-            "Kilimanjaro"
+            "Kili",
+            "kili",
+            "Kili.jpg",
+            "kili.jpg",
+            "Kilimanjaro",
+            "kilimanjaro",
+            "Kilimanjaro.jpg",
+            "kilimanjaro.jpg",
+            "Mount Kilimanjaro",
+            "mount kilimanjaro",
+            "Mount Kilimanjaro.jpg",
+            "mount kilimanjaro.jpg",
+            "Kilimanjaro Mountain",
+            "kilimanjaro mountain",
+            "Kilimanjaro Mountain.jpg",
+            "kilimanjaro mountain.jpg"
           ],
           "6ed4239ecc2be3ec15ad65a78c5c823b9004d640b8cc83a6a7af5930f354de91": [
-            "Himalayas",
             "Everest",
-            "Sagarmatha"
+            "everest",
+            "Everest.jpg",
+            "everest.jpg",
+            "Mount Everest",
+            "mount everest",
+            "Mount Everest.jpg",
+            "mount everest.jpg",
+            "Everest Mountain",
+            "everest mountain",
+            "Everest Mountain.jpg",
+            "everest mountain.jpg",
+            "Sagarmatha",
+            "sagarmatha",
+            "Sagarmatha.jpg",
+            "sagarmatha.jpg",
+            "Sagarmatha Mountain",
+            "sagarmatha mountain",
+            "Sagarmatha Mountain.jpg",
+            "sagarmatha mountain.jpg",
+            "Chomolungma",
+            "chomolungma",
+            "Chomolungma.jpg",
+            "chomolungma.jpg",
+            "Qomolangma",
+            "qomolangma",
+            "Qomolangma.jpg",
+            "qomolangma.jpg",
+            "Himalayas",
+            "himalayas",
+            "Himalayas.jpg",
+            "himalayas.jpg",
+            "Himalayas Mountain",
+            "himalayas mountain",
+            "Himalayas Mountain.jpg",
+            "himalayas mountain.jpg",
+            "Himalaya",
+            "himalaya",
+            "Himalaya.jpg",
+            "himalaya.jpg",
+            "Himalaya Mountain",
+            "himalaya mountain",
+            "Himalaya Mountain.jpg",
+            "himalaya mountain.jpg"
           ],
           "79f45d40d8413d4e81f1b9734ea39e58622cafd79e12bab32959643fc245147c": [
             "Hua",
-            "hua"
+            "hua",
+            "Hua.jpg",
+            "hua.jpg",
+            "Mount Hua",
+            "mount hua",
+            "Mount Hua.jpg",
+            "mount hua.jpg",
+            "Hua Mountain",
+            "hua mountain",
+            "Hua Mountain.jpg",
+            "hua mountain.jpg",
+            "Huashan",
+            "huashan",
+            "Huashan.jpg",
+            "huashan.jpg",
+            "Hua Shan",
+            "hua shan",
+            "Hua Shan.jpg",
+            "hua shan.jpg",
+            "Huashan Mountain",
+            "huashan mountain",
+            "Huashan Mountain.jpg",
+            "huashan mountain.jpg",
+            "Hua Shan Mountain",
+            "hua shan mountain",
+            "Hua Shan Mountain.jpg",
+            "hua shan mountain.jpg"
           ]
         },
         "expect_in_result": true