Enhance image text comparison functionality with detailed logging

- Added logging of OCR results and text-matching outcomes to the compare_image_text function.
- Updated JSON examples to support multiple expected results and improved the structure for evaluator functions.
- Enhanced handling of expected-text rules to include multiple variations for better matching accuracy.
2025-07-10 22:32:53 +00:00
parent 4e3446d6fe
commit 6897e5320d
4 changed files with 161 additions and 18 deletions
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -297,8 +297,24 @@ def compare_image_text(image_path, rule):
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)
    extracted_text = ' '.join([entry[1] for entry in result])
+    
+    # Log OCR results
+    logger.info(f"OCR extracted texts: {[entry[1] for entry in result]}")
+    logger.info(f"Combined extracted text: {extracted_text}")
+    
    if rule['type'] == 'text':
-        return 1 if rule['text'] in extracted_text else 0
+        target_text = rule['text']
+        match_found = target_text in extracted_text
+        
+        # Log matching results
+        logger.info(f"Target text: '{target_text}'")
+        logger.info(f"Match found: {match_found}")
+        if match_found:
+            logger.info("✅ Text matching successful!")
+        else:
+            logger.info("❌ Text matching failed!")
+        
+        return 1 if match_found else 0
    else:
        raise ValueError("Unsupported rule type")

@@ -986,3 +1002,10 @@ def compare_unique_train_records(processed_file, expected_files, **kwargs):
        return 0

    return 1
+
+if __name__ == "__main__":
+    image_path = "/home/ubuntu/OSWorld/cache/02ce9a50-7af2-47ed-8596-af0c230501f8/ls.png"
+    print(compare_image_text(image_path, {
+        "type": "text",
+        "text": "ls"
+      }))
--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -402,12 +402,17 @@ def check_direct_json_object(result, rules) -> float:
                    expected_value_list = expected_json.get(key)
                    logger.info(f"[DEBUG] Checking list key '{key}': expected_list={expected_value_list}, actual='{result.get(key)}'")
                    for each_expected_value in expected_value_list:
+                        # Handle both list and string cases
                        if isinstance(result.get(key), list) and each_expected_value in result.get(key):
                            flag = 1
                            logger.info(f"[DEBUG] Found expected value '{each_expected_value}' in result list for key '{key}'")
                            break
+                        elif isinstance(result.get(key), str) and each_expected_value == result.get(key):
+                            flag = 1
+                            logger.info(f"[DEBUG] Found expected value '{each_expected_value}' matches result string for key '{key}'")
+                            break
                    if flag == 0:
-                        logger.info(f"[DEBUG] No expected values found in result list for key '{key}', returning 0.0")
+                        logger.info(f"[DEBUG] No expected values found in result for key '{key}', returning 0.0")
                        return 0.
                elif isinstance(expected_json.get(key), str):
                    expected_str = expected_json.get(key)