feat: add X11 image handling and enhanced OCR processing

- Introduced a new function `read_x11_image` that reads X11 (XWD) format images and converts them to a PIL Image, supporting both 24-bit and 32-bit formats.
- Enhanced the `compare_image_text` function to include checks for X11 image formats, with multiple conversion attempts using PIL, a custom reader, and netpbm tools.
- Improved error handling and logging for OCR processing, providing detailed feedback on conversion attempts and potential issues with X11 images.
- Maintained existing logic while expanding functionality for better image processing reliability.
This commit is contained in:
yuanmengqi
2025-07-18 19:26:29 +00:00
parent d6f2190a9f
commit c6c62c52d7

View File

@@ -3,6 +3,10 @@ import os
import re import re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import zipfile import zipfile
import tempfile
import subprocess
import struct
import numpy as np
from io import BytesIO from io import BytesIO
from typing import List, Dict, Any from typing import List, Dict, Any
@@ -21,6 +25,77 @@ from skimage.color import rgb2lab
logger = logging.getLogger("desktopenv.metric.docs") logger = logging.getLogger("desktopenv.metric.docs")
def read_x11_image(filepath):
    """
    Read an X11 window dump (XWD, file version 7) and convert it to a PIL Image.

    Only packed-pixel (ZPixmap) dumps with 24 or 32 bits per pixel are
    decoded. The pixel data is located after the header *and* the colormap,
    rows are read using the stride stored in the header, and the colour
    channels are extracted with the header's byte order and RGB masks
    (falling back to the common 0xFF0000 / 0x00FF00 / 0x0000FF layout when
    the masks are absent).

    Args:
        filepath: Path to the X11/XWD format image file

    Returns:
        PIL.Image: Converted image in RGB format

    Raises:
        ValueError: If the header is truncated or inconsistent, or the
            format is not supported
        IOError: If file cannot be read
    """
    header_bytes = 100       # fixed part of the header: 25 big-endian 32-bit fields
    color_entry_bytes = 12   # size of one colormap entry stored after the header
    xwd_file_version = 7
    z_pixmap = 2             # packed-pixel layout, the only one decoded here
    lsb_first = 0

    with open(filepath, 'rb') as f:
        header_data = f.read(header_bytes)
        if len(header_data) < header_bytes:
            raise ValueError(
                f'Truncated X11 header: expected {header_bytes} bytes, got {len(header_data)}.'
            )

        # The header is always stored big endian, regardless of the pixel byte order.
        fields = struct.unpack('>25I', header_data)
        header_size, version, pixmap_format, pixmap_depth = fields[0:4]
        pixmap_width, pixmap_height = fields[4:6]
        byte_order = fields[7]
        bits_per_pixel, bytes_per_line = fields[11:13]
        red_mask, green_mask, blue_mask = fields[14:17]
        ncolors = fields[19]

        if version != xwd_file_version:
            raise ValueError(f'Unsupported X11 dump file version: {version}.')
        if header_size < header_bytes:
            raise ValueError(f'Invalid X11 header size: {header_size}.')
        if pixmap_format != z_pixmap:
            raise ValueError(
                f'Unsupported X11 pixmap format: {pixmap_format}. Only ZPixmap is supported.'
            )
        if bits_per_pixel not in (24, 32):
            raise ValueError(
                f'Unsupported X11 bits per pixel: {bits_per_pixel} (depth={pixmap_depth}). '
                f'Only 24-bit and 32-bit formats are supported.'
            )
        if pixmap_width == 0 or pixmap_height == 0:
            raise ValueError(f'Invalid X11 image size: {pixmap_width}x{pixmap_height}.')

        bytes_per_pixel = bits_per_pixel // 8
        min_row_bytes = pixmap_width * bytes_per_pixel
        # Rows may be padded; trust the stored stride when there is one.
        row_stride = bytes_per_line or min_row_bytes
        if row_stride < min_row_bytes:
            raise ValueError(
                f'Invalid X11 bytes per line: {bytes_per_line} for width {pixmap_width}.'
            )

        # Pixel data follows the header (which includes the window name)
        # and the colormap entries.
        data_offset = header_size + ncolors * color_entry_bytes
        expected_bytes = row_stride * pixmap_height

        # Compare against the real file size before reading so that a corrupt
        # header cannot trigger a huge allocation.
        f.seek(0, 2)  # 2 == seek relative to end of file
        file_size = f.tell()
        if data_offset + expected_bytes > file_size:
            raise ValueError(
                f'Truncated X11 pixel data: need {expected_bytes} bytes at offset '
                f'{data_offset}, file has {file_size} bytes.'
            )

        logger.debug(f"X11 image info: {pixmap_width}x{pixmap_height}, depth={pixmap_depth}")

        f.seek(data_offset)
        pixel_data = f.read(expected_bytes)

    # Drop the per-row padding and view the data as (height, width, bytes).
    rows = np.frombuffer(pixel_data, dtype=np.uint8).reshape(pixmap_height, row_stride)
    packed = rows[:, :min_row_bytes].reshape(pixmap_height, pixmap_width, bytes_per_pixel)

    # Assemble each pixel value from its bytes according to the stored byte order.
    if byte_order == lsb_first:
        shifts = range(0, bits_per_pixel, 8)
    else:
        shifts = range(bits_per_pixel - 8, -1, -8)
    pixel_values = np.zeros((pixmap_height, pixmap_width), dtype=np.uint32)
    for byte_index, shift in enumerate(shifts):
        pixel_values |= packed[:, :, byte_index].astype(np.uint32) << np.uint32(shift)

    if not (red_mask and green_mask and blue_mask):
        red_mask, green_mask, blue_mask = 0xFF0000, 0x00FF00, 0x0000FF

    channels = []
    for mask in (red_mask, green_mask, blue_mask):
        shift = (mask & -mask).bit_length() - 1  # index of the lowest set bit
        max_value = mask >> shift
        channel = (pixel_values & np.uint32(mask)) >> np.uint32(shift)
        if max_value != 255:
            # Rescale channels that are not exactly 8 bits wide to 0..255.
            channel = channel.astype(np.uint64) * 255 // max_value
        channels.append(channel.astype(np.uint8))

    # A uint8 array of shape (height, width, 3) is interpreted as RGB.
    rgb_pixels = np.stack(channels, axis=-1)
    return Image.fromarray(rgb_pixels)
def find_default_font(config_file_path, rules): def find_default_font(config_file_path, rules):
"""Find the default font in LibreOffice Writer.""" """Find the default font in LibreOffice Writer."""
default_font = None default_font = None
@@ -294,29 +369,136 @@ def compare_docx_images(docx_file1, docx_file2):
def _is_x11_screen_dump(image_path):
    """Return True when the `file` utility reports image_path as an X11 screen dump."""
    try:
        # -b omits the file name from the output, so a path that merely
        # contains "xwd" is not mistaken for an X11 dump.
        probe = subprocess.run(['file', '-b', image_path], capture_output=True, text=True)
    except (OSError, ValueError) as probe_error:
        # `file` is missing or its output could not be decoded: treat the
        # image as a regular one instead of giving up on OCR altogether.
        logger.warning(f"Could not identify file format of {image_path}: {probe_error}")
        return False
    file_info = probe.stdout.lower()
    return 'x-window screen dump' in file_info or 'xwd' in file_info


def _convert_x11_to_png(image_path, png_path):
    """Convert an X11 dump to PNG at png_path; return True if any method succeeded."""
    # First try with PIL directly (sometimes it can handle xwd)
    try:
        with Image.open(image_path) as img:
            img.save(png_path, 'PNG')
        logger.info("Successfully converted X11 image using PIL")
        return True
    except Exception as pil_error:
        logger.warning(f"PIL conversion failed: {pil_error}")

    # Try our custom X11 reader (pure Python solution)
    try:
        logger.info("Attempting conversion using custom X11 reader...")
        read_x11_image(image_path).save(png_path, 'PNG')
        logger.info("✅ Successfully converted X11 image using custom reader")
        return True
    except Exception as custom_error:
        logger.warning(f"Custom X11 conversion failed: {custom_error}")

    # Try with netpbm tools if available (fallback)
    try:
        import shutil

        if shutil.which('xwdtopnm') is None:
            raise Exception("netpbm tools not available")
        # Use netpbm tools chain: xwdtopnm -> pnmtopng
        pnm = subprocess.run(['xwdtopnm', image_path], stdout=subprocess.PIPE, check=True)
        with open(png_path, 'wb') as png_file:
            subprocess.run(['pnmtopng'], input=pnm.stdout, stdout=png_file, check=True)
        logger.info("Successfully converted X11 image using netpbm tools")
        return True
    except Exception as netpbm_error:
        logger.warning(f"netpbm conversion failed: {netpbm_error}")

    return False


def compare_image_text(image_path, rule):
    """
    Run OCR on an image and check whether it contains the text given by rule.

    X11 screen dumps are first converted to a temporary PNG (PIL, then the
    custom reader, then netpbm tools); if every conversion fails, OCR is
    still attempted on the original file.

    Args:
        image_path: Path to the image file; a falsy value yields 0.
        rule: Dict with 'type' (only 'text' is supported) and 'text', the
            substring to look for in the OCR output.

    Returns:
        int: 1 if the target text is found in the extracted text, otherwise
        0. 0 is also returned when the file is missing, OCR fails, or the
        rule type is unsupported (the error is logged, not raised).
    """
    if not image_path:
        return 0

    # Check if the image file exists
    if not os.path.exists(image_path):
        logger.error(f"Image file not found: {image_path}")
        return 0

    temp_image_path = None
    actual_image_path = image_path
    is_x11 = False

    try:
        is_x11 = _is_x11_screen_dump(image_path)
        if is_x11:
            logger.info(f"Detected X11 screen dump format in {image_path}, attempting conversion...")

            # Create a temporary file for the converted image
            temp_fd, temp_image_path = tempfile.mkstemp(suffix='.png')
            os.close(temp_fd)

            if _convert_x11_to_png(image_path, temp_image_path):
                actual_image_path = temp_image_path
            else:
                logger.error(
                    "❌ All X11 conversion methods failed.\n"
                    "Attempted: PIL → Custom Python reader → netpbm tools\n"
                    "💡 The image might be corrupted or in an unsupported X11 variant"
                )
                # Fall back to the original file; the temporary file is
                # removed in the finally clause below.
                logger.info("Will attempt OCR on original file format (likely to fail)")

        # Now attempt OCR with error handling
        try:
            reader = easyocr.Reader(['en'])
            result = reader.readtext(actual_image_path)
            extracted_text = ' '.join([entry[1] for entry in result])

            # Log OCR results
            logger.info(f"OCR extracted texts: {[entry[1] for entry in result]}")
            logger.info(f"Combined extracted text: {extracted_text}")

            if rule['type'] == 'text':
                target_text = rule['text']
                match_found = target_text in extracted_text

                # Log matching results
                logger.info(f"Target text: '{target_text}'")
                logger.info(f"Match found: {match_found}")
                if match_found:
                    logger.info("✅ Text matching successful!")
                else:
                    logger.info("❌ Text matching failed!")

                return 1 if match_found else 0
            else:
                # Caught just below: unsupported rule types are logged and score 0.
                raise ValueError("Unsupported rule type")

        except Exception as ocr_error:
            logger.error(f"OCR processing failed for {actual_image_path}: {ocr_error}")

            # Check if this is specifically an X11 format issue
            if is_x11:
                logger.error(
                    "🚨 OCR failed on X11 screen dump after all conversion attempts.\n"
                    "This might indicate:\n"
                    "   1. The X11 file is corrupted or in an unsupported variant\n"
                    "   2. Missing dependencies (numpy, PIL)\n"
                    "   3. Insufficient memory for large images"
                )

            return 0

    except Exception as e:
        logger.error(f"Error processing image {image_path}: {e}")
        return 0

    finally:
        # Clean up temporary file if created
        if temp_image_path and os.path.exists(temp_image_path):
            try:
                os.unlink(temp_image_path)
            except OSError as cleanup_error:
                # Best effort: a leftover temp file must not change the result.
                logger.debug(f"Could not remove temporary file {temp_image_path}: {cleanup_error}")
def compare_line_spacing(docx_file1, docx_file2): def compare_line_spacing(docx_file1, docx_file2):