feat: add X11 image handling and enhanced OCR processing

- Introduced a new function `read_x11_image` to read and convert X11 (XWD) format images to PIL Image, supporting both 24-bit and 32-bit formats.
- Enhanced the `compare_image_text` function to include checks for X11 image formats, with multiple conversion attempts using PIL, a custom reader, and netpbm tools.
- Improved error handling and logging for OCR processing, providing detailed feedback on conversion attempts and potential issues with X11 images.
- Maintained existing logic while expanding functionality for better image processing reliability.
2025-07-18 19:26:29 +00:00
parent d6f2190a9f
commit c6c62c52d7
1 changed file with 201 additions and 19 deletions
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -3,6 +3,10 @@ import os
 import re
 import xml.etree.ElementTree as ET
 import zipfile
 import tempfile
 import subprocess
 import struct
 import numpy as np
 from io import BytesIO
 from typing import List, Dict, Any
@@ -21,6 +25,77 @@ from skimage.color import rgb2lab
 logger = logging.getLogger("desktopenv.metric.docs")
 def read_x11_image(filepath):
    """
    Read an X11 window dump (XWD, file version 7) and return it as a PIL image.

    The reader follows the layout declared in the file's own header instead
    of assuming one: the pixel data is located after the header *and* the
    colormap records, rows are read with the stored scanline stride, pixels
    are assembled using the stored byte order, and the red/green/blue
    samples are extracted with the stored channel masks.

    Only ZPixmap dumps with a depth of 24 or 32 that are stored as 24 or 32
    bits per pixel are supported.

    Args:
        filepath: Path to the X11/XWD format image file
    Returns:
        PIL.Image: Converted image in RGB format
    Raises:
        ValueError: If the file is truncated, is not an XWD version 7 dump,
            or uses a pixel layout this reader does not support
        IOError: If file cannot be read
    """
    # The fixed part of an XWD v7 header is 25 big-endian 32-bit fields.
    fixed_header_len = 100
    # Every colormap record stored between the header and the pixels is
    # 12 bytes (pixel, red, green, blue, flags, pad).
    color_entry_len = 12
    xwd_file_version = 7
    z_pixmap = 2

    with open(filepath, 'rb') as f:
        header_data = f.read(fixed_header_len)
        if len(header_data) < fixed_header_len:
            raise ValueError('File is too short to contain an X11 (XWD) header')

        # Only the first 20 fields are needed; the last five describe the
        # window geometry.
        (header_size, version, pixmap_format, pixmap_depth,
         pixmap_width, pixmap_height, _xoffset, byte_order,
         _bitmap_unit, _bitmap_bit_order, _bitmap_pad, bits_per_pixel,
         bytes_per_line, _visual_class, red_mask, green_mask,
         blue_mask, _bits_per_rgb, _colormap_entries, ncolors) = struct.unpack(
            '>20I', header_data[:80])

        logger.debug(
            "X11 image info: %sx%s, depth=%s, bits_per_pixel=%s",
            pixmap_width, pixmap_height, pixmap_depth, bits_per_pixel)

        if version != xwd_file_version or header_size < fixed_header_len:
            raise ValueError(
                f'Not an X11 (XWD) version {xwd_file_version} dump '
                f'(version={version}, header_size={header_size})')
        if pixmap_depth not in (24, 32):
            raise ValueError(f'Unsupported X11 pixel depth: {pixmap_depth}. Only 24-bit and 32-bit formats are supported.')
        if pixmap_format != z_pixmap:
            raise ValueError(f'Unsupported X11 pixmap format: {pixmap_format}. Only ZPixmap dumps are supported.')
        if bits_per_pixel not in (24, 32):
            raise ValueError(f'Unsupported X11 bits per pixel: {bits_per_pixel}. Only 24 and 32 are supported.')
        if byte_order not in (0, 1):
            raise ValueError(f'Invalid X11 byte order: {byte_order}')
        if pixmap_width == 0 or pixmap_height == 0:
            raise ValueError(f'Invalid X11 image size: {pixmap_width}x{pixmap_height}')

        bytes_per_pixel = bits_per_pixel // 8
        row_bytes = pixmap_width * bytes_per_pixel
        if bytes_per_line < row_bytes:
            raise ValueError(
                f'Invalid X11 scanline length: {bytes_per_line} bytes for '
                f'{pixmap_width} pixels at {bits_per_pixel} bits per pixel')

        # header_size covers the fixed header plus the window name; the
        # colormap records come next and the pixels start after them.
        data_offset = header_size + ncolors * color_entry_len
        data_length = bytes_per_line * pixmap_height

        # Check against the real file length before reading so that a
        # corrupt header cannot trigger an enormous allocation.
        f.seek(0, 2)
        if data_offset + data_length > f.tell():
            raise ValueError('X11 (XWD) file is truncated: pixel data is incomplete')
        f.seek(data_offset)
        pixel_data = f.read(data_length)

    # One row per scanline; slicing to row_bytes drops any scanline padding.
    scanlines = np.frombuffer(pixel_data, dtype=np.uint8).reshape(
        (pixmap_height, bytes_per_line))
    samples = scanlines[:, :row_bytes].reshape(
        (pixmap_height, pixmap_width, bytes_per_pixel))

    # Rebuild each pixel's integer value. byte_order 1 is MSBFirst (first
    # byte is the most significant), 0 is LSBFirst.
    if byte_order == 1:
        shifts = range(8 * (bytes_per_pixel - 1), -1, -8)
    else:
        shifts = range(0, 8 * bytes_per_pixel, 8)
    pixel_values = np.zeros((pixmap_height, pixmap_width), dtype=np.uint32)
    for byte_index, shift in enumerate(shifts):
        pixel_values |= samples[:, :, byte_index].astype(np.uint32) << np.uint32(shift)

    channels = []
    for channel_name, mask in (('red', red_mask), ('green', green_mask), ('blue', blue_mask)):
        if mask == 0:
            raise ValueError(f'X11 dump has no {channel_name} mask; only TrueColor/DirectColor dumps are supported')
        low_bit = (mask & -mask).bit_length() - 1
        field = mask >> low_bit
        # field + 1 is a power of two exactly when the mask bits are contiguous.
        if field & (field + 1):
            raise ValueError(f'X11 dump has a non-contiguous {channel_name} mask: {mask:#x}')
        channel = (pixel_values >> np.uint32(low_bit)) & np.uint32(field)
        if field != 0xFF:
            # Rescale channels that are not exactly 8 bits wide to 0..255.
            channel = channel.astype(np.uint64) * 255 // field
        channels.append(channel.astype(np.uint8))

    # A (height, width, 3) uint8 array is interpreted by PIL as RGB.
    return Image.fromarray(np.stack(channels, axis=-1))
 def find_default_font(config_file_path, rules):
    """Find the default font in LibreOffice Writer."""
    default_font = None
@@ -294,29 +369,136 @@ def compare_docx_images(docx_file1, docx_file2):
 def compare_image_text(image_path, rule):
    # Purpose: OCR the image at image_path with easyocr (English) and return 1
    # when rule['type'] == 'text' and rule['text'] is a substring of the
    # space-joined OCR output, otherwise 0.
    #
    # Returns 0 when image_path is falsy, when the file does not exist, or
    # when any step raises. A rule type other than 'text' raises ValueError
    # inside the OCR try block, which the surrounding handler turns into 0.
    #
    # When `file` reports an X11 screen dump, the image is first converted to
    # a temporary PNG, trying in order: PIL, read_x11_image, then the netpbm
    # tools (xwdtopnm | pnmtopng). If every conversion fails, OCR is attempted
    # on the original file. The temporary PNG is removed in the finally block.
    #
    # NOTE(review): this span is a rendered diff hunk. Lines starting with '-'
    # appear to be the previous implementation and '+' lines additions;
    # unmarked lines cannot be attributed from this view — confirm against the
    # repository before relying on the exact statement order.
    if not image_path:
        return 0
    # NOTE(review): if the next three lines are retained in the new version,
    # they run OCR on the unconverted image_path before the existence check
    # and outside any try block; their results are reassigned further down —
    # confirm whether they should remain.
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)
    extracted_text = ' '.join([entry[1] for entry in result])
-    # Log OCR results
+    # Check if the image file exists
-    logger.info(f"OCR extracted texts: {[entry[1] for entry in result]}")
+    if not os.path.exists(image_path):
-    logger.info(f"Combined extracted text: {extracted_text}")
+        logger.error(f"Image file not found: {image_path}")
        return 0
-    if rule['type'] == 'text':
+    # Check image format and convert if necessary
-        target_text = rule['text']
+    temp_image_path = None
-        match_found = target_text in extracted_text
+    actual_image_path = image_path
    try:
        # First, try to identify the file format
        # NOTE(review): this depends on the external `file` executable; if it
        # is missing, subprocess.run raises and the outer handler below
        # returns 0 without attempting OCR.
        result = subprocess.run(['file', image_path], capture_output=True, text=True)
        file_info = result.stdout.lower()
-        # Log matching results
+        # If it's an X11 screen dump, we need to convert it
-        logger.info(f"Target text: '{target_text}'")
+        if 'x-window screen dump' in file_info or 'xwd' in file_info:
-        logger.info(f"Match found: {match_found}")
+            logger.info(f"Detected X11 screen dump format in {image_path}, attempting conversion...")
-        if match_found:
+            
-            logger.info("✅ Text matching successful!")
+            # Create a temporary file for the converted image
-        else:
+            temp_fd, temp_image_path = tempfile.mkstemp(suffix='.png')
-            logger.info("❌ Text matching failed!")
+            os.close(temp_fd)
            # Try to convert using PIL with xwd support or other methods
            try:
                # First try with PIL directly (sometimes it can handle xwd)
                img = Image.open(image_path)
                img.save(temp_image_path, 'PNG')
                actual_image_path = temp_image_path
                logger.info(f"Successfully converted X11 image using PIL")
            except Exception as pil_error:
                logger.warning(f"PIL conversion failed: {pil_error}")
                # Try our custom X11 reader (pure Python solution)
                try:
                    logger.info("Attempting conversion using custom X11 reader...")
                    x11_image = read_x11_image(image_path)
                    x11_image.save(temp_image_path, 'PNG')
                    actual_image_path = temp_image_path
                    logger.info(f"✅ Successfully converted X11 image using custom reader")
                except Exception as custom_error:
                    logger.warning(f"Custom X11 conversion failed: {custom_error}")
                    # Try with netpbm tools if available (fallback)
                    try:
                        result = subprocess.run(['which', 'xwdtopnm'], capture_output=True)
                        if result.returncode == 0:
                            # Use netpbm tools chain: xwdtopnm -> pnmtopng
                            # NOTE(review): the output of this first xwdtopnm
                            # run is discarded; the same command is run again
                            # inside the `with` block below.
                            subprocess.run(['xwdtopnm', image_path], 
                                         stdout=subprocess.PIPE, check=True)
                            with open(temp_image_path, 'wb') as f:
                                result = subprocess.run(['xwdtopnm', image_path], 
                                                      stdout=subprocess.PIPE, check=True)
                                result2 = subprocess.run(['pnmtopng'], 
                                                       input=result.stdout, 
                                                       stdout=f, check=True)
                            actual_image_path = temp_image_path
                            logger.info(f"Successfully converted X11 image using netpbm tools")
                        else:
                            raise Exception("netpbm tools not available")
                    except Exception as netpbm_error:
                        logger.warning(f"netpbm conversion failed: {netpbm_error}")
                        # All conversions failed
                        logger.error(
                            f"❌ All X11 conversion methods failed.\n"
                            f"Attempted: PIL → Custom Python reader → netpbm tools\n"
                            f"💡 The image might be corrupted or in an unsupported X11 variant"
                        )
                        # If all conversions fail, try to use the original file anyway
                        # Sometimes easyocr might handle it better than PIL
                        if temp_image_path:
                            os.unlink(temp_image_path)
                            temp_image_path = None
                        actual_image_path = image_path
                        logger.info(f"Will attempt OCR on original file format (likely to fail)")
-        return 1 if match_found else 0
+        # Now attempt OCR with error handling
-    else:
+        try:
-        raise ValueError("Unsupported rule type")
+            reader = easyocr.Reader(['en'])
            result = reader.readtext(actual_image_path)
            extracted_text = ' '.join([entry[1] for entry in result])
            # Log OCR results
            logger.info(f"OCR extracted texts: {[entry[1] for entry in result]}")
            logger.info(f"Combined extracted text: {extracted_text}")
            if rule['type'] == 'text':
                target_text = rule['text']
                match_found = target_text in extracted_text
                # Log matching results
                logger.info(f"Target text: '{target_text}'")
                logger.info(f"Match found: {match_found}")
                if match_found:
                    logger.info("✅ Text matching successful!")
                else:
                    logger.info("❌ Text matching failed!")
                return 1 if match_found else 0
            else:
                raise ValueError("Unsupported rule type")
        except Exception as ocr_error:
            logger.error(f"OCR processing failed for {actual_image_path}: {ocr_error}")
            # Check if this is specifically an X11 format issue
            if 'x-window screen dump' in file_info or 'xwd' in file_info:
                logger.error(
                    f"🚨 OCR failed on X11 screen dump after all conversion attempts.\n"
                    f"This might indicate:\n"
                    f"   1. The X11 file is corrupted or in an unsupported variant\n"
                    f"   2. Missing dependencies (numpy, PIL)\n"
                    f"   3. Insufficient memory for large images"
                )
            return 0
    except Exception as e:
        logger.error(f"Error processing image {image_path}: {e}")
        return 0
    finally:
        # Clean up temporary file if created
        if temp_image_path and os.path.exists(temp_image_path):
            try:
                os.unlink(temp_image_path)
            # NOTE(review): the bare except also swallows KeyboardInterrupt
            # and SystemExit; cleanup here is best-effort only.
            except:
                pass
 def compare_line_spacing(docx_file1, docx_file2):