From 6897e5320da04de35c259652c095431ca1952646 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Thu, 10 Jul 2025 22:32:53 +0000 Subject: [PATCH] Enhance image text comparison functionality with detailed logging - Added logging for OCR results and text matching outcomes in compare_image_text function. - Updated JSON examples to support multiple expected results and improved structure for evaluator functions. - Enhanced handling of expected text rules to include multiple variations for better matching accuracy. --- desktop_env/evaluators/metrics/docs.py | 25 +++++- desktop_env/evaluators/metrics/general.py | 7 +- .../02ce9a50-7af2-47ed-8596-af0c230501f8.json | 57 +++++++++--- .../ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json | 90 ++++++++++++++++++- 4 files changed, 161 insertions(+), 18 deletions(-) diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py index 26ec624..52f5e10 100644 --- a/desktop_env/evaluators/metrics/docs.py +++ b/desktop_env/evaluators/metrics/docs.py @@ -297,8 +297,24 @@ def compare_image_text(image_path, rule): reader = easyocr.Reader(['en']) result = reader.readtext(image_path) extracted_text = ' '.join([entry[1] for entry in result]) + + # Log OCR results + logger.info(f"OCR extracted texts: {[entry[1] for entry in result]}") + logger.info(f"Combined extracted text: {extracted_text}") + if rule['type'] == 'text': - return 1 if rule['text'] in extracted_text else 0 + target_text = rule['text'] + match_found = target_text in extracted_text + + # Log matching results + logger.info(f"Target text: '{target_text}'") + logger.info(f"Match found: {match_found}") + if match_found: + logger.info("✅ Text matching successful!") + else: + logger.info("❌ Text matching failed!") + + return 1 if match_found else 0 else: raise ValueError("Unsupported rule type") @@ -986,3 +1002,10 @@ def compare_unique_train_records(processed_file, expected_files, **kwargs): return 0 return 1 + +if __name__ == "__main__": + image_path = "/home/ubuntu/OSWorld/cache/02ce9a50-7af2-47ed-8596-af0c230501f8/ls.png" + print(compare_image_text(image_path, { + "type": "text", + "text": "ls" + })) \ No newline at end of file diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py index 2380b35..03e66a4 100644 --- a/desktop_env/evaluators/metrics/general.py +++ b/desktop_env/evaluators/metrics/general.py @@ -402,12 +402,17 @@ def check_direct_json_object(result, rules) -> float: expected_value_list = expected_json.get(key) logger.info(f"[DEBUG] Checking list key '{key}': expected_list={expected_value_list}, actual='{result.get(key)}'") for each_expected_value in expected_value_list: + # Handle both list and string cases if isinstance(result.get(key), list) and each_expected_value in result.get(key): flag = 1 logger.info(f"[DEBUG] Found expected value '{each_expected_value}' in result list for key '{key}'") break + elif isinstance(result.get(key), str) and each_expected_value == result.get(key): + flag = 1 + logger.info(f"[DEBUG] Found expected value '{each_expected_value}' matches result string for key '{key}'") + break if flag == 0: - logger.info(f"[DEBUG] No expected values found in result list for key '{key}', returning 0.0") + logger.info(f"[DEBUG] No expected values found in result for key '{key}', returning 0.0") return 0. elif isinstance(expected_json.get(key), str): expected_str = expected_json.get(key) diff --git a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json index 8658658..329f223 100644 --- a/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json +++ b/evaluation_examples/examples/multi_apps/02ce9a50-7af2-47ed-8596-af0c230501f8.json @@ -44,19 +44,52 @@ "os" ], "evaluator": { - "func": "compare_image_text", - "result": { - "type": "vm_file", - "path": "/home/user/Desktop/ls.png", - "dest": "ls.png" - }, - "expected": { - "type": "rule", - "rules": { - "type": "text", - "text": "ls" + "func": [ + "compare_image_text", + "compare_image_text", + "compare_image_text" + ], + "conj": "or", + "result": [ + { + "type": "vm_file", + "path": "/home/user/Desktop/ls.png", + "dest": "ls.png" + }, + { + "type": "vm_file", + "path": "/home/user/Desktop/ls.png", + "dest": "ls.png" + }, + { + "type": "vm_file", + "path": "/home/user/Desktop/ls.png", + "dest": "ls.png" } - } + ], + "expected": [ + { + "type": "rule", + "rules": { + "type": "text", + "text": "ls" + } + }, + { + "type": "rule", + "rules": { + "type": "text", + "text": "1s" + } + }, + { + "type": "rule", + "rules": { + "type": "text", + "text": "1s" + } + } + ] }, "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json index a5ffa57..21e6887 100644 --- a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json +++ b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json @@ -69,16 +69,98 @@ "rules": { "expected": { "ec076282f61ba74642e94b5a6a1250c6988204d59d9b02936606b6b8ef1e4433": [ - "Kilimanjaro" + "Kili", + "kili", + "Kili.jpg", + "kili.jpg", + "Kilimanjaro", + "kilimanjaro", + "Kilimanjaro.jpg", + "kilimanjaro.jpg", + "Mount Kilimanjaro", + "mount kilimanjaro", + "Mount Kilimanjaro.jpg", + "mount kilimanjaro.jpg", + "Kilimanjaro Mountain", + "kilimanjaro mountain", + "Kilimanjaro Mountain.jpg", + "kilimanjaro mountain.jpg" ], "6ed4239ecc2be3ec15ad65a78c5c823b9004d640b8cc83a6a7af5930f354de91": [ - "Himalayas", "Everest", - "Sagarmatha" + "everest", + "Everest.jpg", + "everest.jpg", + "Mount Everest", + "mount everest", + "Mount Everest.jpg", + "mount everest.jpg", + "Everest Mountain", + "everest mountain", + "Everest Mountain.jpg", + "everest mountain.jpg", + "Sagarmatha", + "sagarmatha", + "Sagarmatha.jpg", + "sagarmatha.jpg", + "Sagarmatha Mountain", + "sagarmatha mountain", + "Sagarmatha Mountain.jpg", + "sagarmatha mountain.jpg", + "Chomolungma", + "chomolungma", + "Chomolungma.jpg", + "chomolungma.jpg", + "Qomolangma", + "qomolangma", + "Qomolangma.jpg", + "qomolangma.jpg", + "Himalayas", + "himalayas", + "Himalayas.jpg", + "himalayas.jpg", + "Himalayas Mountain", + "himalayas mountain", + "Himalayas Mountain.jpg", + "himalayas mountain.jpg", + "Himalaya", + "himalaya", + "Himalaya.jpg", + "himalaya.jpg", + "Himalaya Mountain", + "himalaya mountain", + "Himalaya Mountain.jpg", + "himalaya mountain.jpg" ], "79f45d40d8413d4e81f1b9734ea39e58622cafd79e12bab32959643fc245147c": [ "Hua", - "hua" + "hua", + "Hua.jpg", + "hua.jpg", + "Mount Hua", + "mount hua", + "Mount Hua.jpg", + "mount hua.jpg", + "Hua Mountain", + "hua mountain", + "Hua Mountain.jpg", + "hua mountain.jpg", + "Huashan", + "huashan", + "Huashan.jpg", + "huashan.jpg", + "Hua Shan", + "hua shan", + "Hua Shan.jpg", + "hua shan.jpg", + "Huashan Mountain", + "huashan mountain", + "Huashan Mountain.jpg", + "huashan mountain.jpg", + "Hua Shan Mountain", + "hua shan mountain", + "Hua Shan Mountain.jpg", + "hua shan mountain.jpg" ] }, "expect_in_result": true