init commit

This commit is contained in:
zyhe
2026-03-16 11:44:10 +00:00
commit 94384a93c9
552 changed files with 363038 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
"""
Data comparators module for integration tests.
Provides functions to compare generated data with reference data.
"""
from .sequence_comparator import compare_navigation_results
__all__ = ['compare_navigation_results']

View File

@@ -0,0 +1,551 @@
"""
Sequence data comparator for navigation pipeline tests.
Provides functions to compare generated navigation sequences with reference data.
"""
import json
import os
import math
import numpy as np
import cv2 # OpenCV is available per requirements
from pathlib import Path
from typing import Tuple, Any, Dict, Optional
def compare_navigation_results(generated_dir: str, reference_dir: str) -> Tuple[bool, str]:
    """Original JSON trajectory sequence comparison (unchanged logic).

    Locates one data.json under each directory (root, one level down, or two
    levels down -- see `_locate_first_data_json`) and delegates the actual
    comparison to `compare_trajectory_sequences`. Supports both the legacy
    "generated=root, reference=leaf" call form and passing a root on both sides.

    NOTE: Image comparison is handled by the separate wrapper
    `compare_navigation_and_images` to avoid side effects on existing tests.
    """
    # Both inputs must be existing directories before any discovery is attempted.
    for label, directory in (("Generated", generated_dir), ("Reference", reference_dir)):
        if not os.path.isdir(directory):
            return False, f"{label} directory does not exist or is not a directory: {directory}"

    try:
        gen_json = _locate_first_data_json(generated_dir)
    except Exception as exc:  # pylint: disable=broad-except
        return False, f"Error locating generated data file: {exc}"
    if gen_json is None:
        return False, f"Could not locate data.json under generated directory: {generated_dir}"

    try:
        ref_json = _locate_first_data_json(reference_dir)
        if ref_json is None:
            # Legacy-compat fallback: accept reference_dir/data.json directly
            # (the locator already covers this case, so this is belt-and-braces).
            fallback = os.path.join(reference_dir, "data.json")
            if os.path.isfile(fallback):
                ref_json = fallback
            else:
                return False, f"Could not locate data.json under reference directory: {reference_dir}"
    except Exception as exc:  # pylint: disable=broad-except
        return False, f"Error locating reference data file: {exc}"

    return compare_trajectory_sequences(gen_json, ref_json)
def compare_navigation_and_images(
    generated_seq_dir: str,
    reference_seq_dir: str,
    generated_root_for_images: Optional[str] = None,
    reference_root_for_images: Optional[str] = None,
    rgb_abs_tolerance: int = 0,
    depth_abs_tolerance: float = 0.0,
    allowed_rgb_diff_ratio: float = 0.0,
    allowed_depth_diff_ratio: float = 0.5,
    depth_scale_auto: bool = False,
    fail_if_images_missing: bool = False,
) -> Tuple[bool, str]:
    """Run the original JSON comparison, optionally followed by a first-frame image check.

    Args:
        generated_seq_dir: Path to generated seq_path root used by original comparator.
        reference_seq_dir: Path to reference seq_path root.
        generated_root_for_images: Root (parent of obs_path) or the obs_path itself for generated images.
        reference_root_for_images: Same as above for reference. If None, image comparison may be skipped.
        rgb_abs_tolerance: RGB absolute per-channel tolerance.
        depth_abs_tolerance: Depth absolute tolerance.
        allowed_rgb_diff_ratio: Allowed differing RGB pixel ratio.
        allowed_depth_diff_ratio: Allowed differing depth pixel ratio.
        depth_scale_auto: Auto scale depth if uint16 millimeters.
        fail_if_images_missing: If True, treat missing obs_path as failure; otherwise skip.

    Returns:
        (success, message) combined result.
    """
    traj_ok, traj_msg = compare_navigation_results(generated_seq_dir, reference_seq_dir)

    # Image roots default to the parent of the seq dir when not supplied explicitly.
    gen_img_root = generated_root_for_images or os.path.dirname(generated_seq_dir.rstrip(os.sep))
    ref_img_root = reference_root_for_images or os.path.dirname(reference_seq_dir.rstrip(os.sep))

    def _run_image_compare() -> Tuple[bool, str]:
        # Single call site so the explicit and implicit branches cannot drift apart.
        return compare_first_frame_images(
            generated_root=gen_img_root,
            reference_root=ref_img_root,
            rgb_abs_tolerance=rgb_abs_tolerance,
            depth_abs_tolerance=depth_abs_tolerance,
            allowed_rgb_diff_ratio=allowed_rgb_diff_ratio,
            allowed_depth_diff_ratio=allowed_depth_diff_ratio,
            depth_scale_auto=depth_scale_auto,
        )

    img_ok, img_msg = True, "image comparison skipped"
    if generated_root_for_images is not None or reference_root_for_images is not None:
        # Caller explicitly provided at least one image root -> always attempt compare.
        img_ok, img_msg = _run_image_compare()
    else:
        # Implicit mode: compare only when both sides actually expose an obs_path.
        gen_obs_candidate = os.path.join(gen_img_root, "obs_path")
        ref_obs_candidate = os.path.join(ref_img_root, "obs_path")
        if os.path.isdir(gen_obs_candidate) and os.path.isdir(ref_obs_candidate):
            img_ok, img_msg = _run_image_compare()
        elif fail_if_images_missing:
            missing = [
                f"{side}:{candidate}"
                for side, candidate in (("generated", gen_obs_candidate), ("reference", ref_obs_candidate))
                if not os.path.isdir(candidate)
            ]
            img_ok = False
            img_msg = "obs_path missing -> " + ", ".join(missing)
        else:
            img_msg = "obs_path not found in one or both roots; skipped"

    overall = traj_ok and img_ok
    message = f"trajectory: {traj_msg}; images: {img_msg}"
    return overall, message if overall else f"Mismatch - {message}"
def compare_trajectory_sequences(generated_file: str, reference_file: str, tolerance: float = 1e-6) -> Tuple[bool, str]:
    """
    Compare trajectory sequence files with numerical tolerance.

    Args:
        generated_file: Path to generated trajectory file
        reference_file: Path to reference trajectory file
        tolerance: Numerical tolerance for floating point comparisons

    Returns:
        Tuple[bool, str]: (success, message)
    """
    try:
        # Both files must exist before any parsing is attempted.
        for label, file_path in (("Generated", generated_file), ("Reference", reference_file)):
            if not os.path.exists(file_path):
                return False, f"{label} file does not exist: {file_path}"

        parsed = []
        for file_path in (generated_file, reference_file):
            with open(file_path, 'r') as handle:
                parsed.append(json.load(handle))

        matched, detail = _compare_data_structures(parsed[0], parsed[1], tolerance)
        if matched:
            return True, "Trajectory sequences match within tolerance"
        return False, f"Trajectory sequences differ: {detail}"
    except json.JSONDecodeError as exc:
        return False, f"JSON decode error: {exc}"
    except Exception as exc:
        return False, f"Error comparing trajectory sequences: {exc}"
def _compare_data_structures(data1: Any, data2: Any, tolerance: float, path: str = "") -> Tuple[bool, str]:
"""
Recursively compare two data structures with numerical tolerance.
Args:
data1: First data structure
data2: Second data structure
tolerance: Numerical tolerance for floating point comparisons
path: Current path in the data structure for error reporting
Returns:
Tuple[bool, str]: (success, error_message)
"""
# Check if types are the same
if type(data1) != type(data2):
return False, f"Type mismatch at {path}: {type(data1)} vs {type(data2)}"
# Handle dictionaries
if isinstance(data1, dict):
if set(data1.keys()) != set(data2.keys()):
return False, f"Key mismatch at {path}: {set(data1.keys())} vs {set(data2.keys())}"
for key in data1.keys():
new_path = f"{path}.{key}" if path else key
success, message = _compare_data_structures(data1[key], data2[key], tolerance, new_path)
if not success:
return False, message
# Handle lists
elif isinstance(data1, list):
if len(data1) != len(data2):
return False, f"List length mismatch at {path}: {len(data1)} vs {len(data2)}"
for i, (item1, item2) in enumerate(zip(data1, data2)):
new_path = f"{path}[{i}]" if path else f"[{i}]"
success, message = _compare_data_structures(item1, item2, tolerance, new_path)
if not success:
return False, message
# Handle numerical values
elif isinstance(data1, (int, float)):
if isinstance(data2, (int, float)):
if abs(data1 - data2) > tolerance:
return (
False,
f"Numerical difference at {path}: {data1} vs {data2} (diff: {abs(data1 - data2)}, tolerance: {tolerance})",
)
else:
return False, f"Type mismatch at {path}: number vs {type(data2)}"
# Handle strings and other exact comparison types
elif isinstance(data1, (str, bool, type(None))):
if data1 != data2:
return False, f"Value mismatch at {path}: {data1} vs {data2}"
# Handle unknown types
else:
if data1 != data2:
return False, f"Value mismatch at {path}: {data1} vs {data2}"
return True, ""
def _locate_first_data_json(root: str) -> Optional[str]:
"""Locate a data.json file under root with a shallow, deterministic strategy.
Strategy (stop at first match to keep behavior predictable & lightweight):
1. If root/data.json exists -> return it.
2. Enumerate immediate subdirectories (sorted). For each d:
- if d/data.json exists -> return it.
3. Enumerate immediate subdirectories again; for each d enumerate its subdirectories (sorted) and
look for d/sub/data.json -> return first match.
4. If none found -> return None.
"""
# 1. root/data.json
candidate = os.path.join(root, "data.json")
if os.path.isfile(candidate):
return candidate
try:
first_level = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
except FileNotFoundError:
return None
first_level.sort()
# 2. d/data.json
for d in first_level:
c = os.path.join(root, d, "data.json")
if os.path.isfile(c):
return c
# 3. d/sub/data.json
for d in first_level:
d_path = os.path.join(root, d)
try:
second_level = [s for s in os.listdir(d_path) if os.path.isdir(os.path.join(d_path, s))]
except FileNotFoundError:
continue
second_level.sort()
for s in second_level:
c = os.path.join(d_path, s, "data.json")
if os.path.isfile(c):
return c
return None
def compare_first_frame_images(
    generated_root: str,
    reference_root: str,
    rgb_dir_name: str = "rgb",
    depth_dir_name: str = "depth",
    scene_dir: Optional[str] = None,
    traj_dir: Optional[str] = None,
    rgb_abs_tolerance: int = 0,
    depth_abs_tolerance: float = 0.0,
    allowed_rgb_diff_ratio: float = 0.0,
    allowed_depth_diff_ratio: float = 0.0,
    compute_psnr: bool = True,
    compute_mse: bool = True,
    depth_scale_auto: bool = False,
) -> Tuple[bool, str]:
    """Compare only the first frame (index 0) of RGB & depth images between generated and reference.

    Lightweight pipeline sanity check: resolves obs_path/scene/trajectory on both
    sides (auto-picking the first subdirectory when not given or when the
    generated name is absent on the reference side), then compares one RGB and
    one depth frame.

    Args:
        generated_root: Path to generated run root (may contain `obs_path` or be the `obs_path`).
        reference_root: Path to reference run root (same structure as generated_root).
        rgb_dir_name: Subdirectory name for RGB frames under a trajectory directory.
        depth_dir_name: Subdirectory name for depth frames under a trajectory directory.
        scene_dir: Optional explicit scene directory name (e.g. "6f"); if None will auto-pick first.
        traj_dir: Optional explicit trajectory directory (e.g. "0"); if None will auto-pick first.
        rgb_abs_tolerance: Per-channel absolute pixel tolerance (0 requires exact match).
        depth_abs_tolerance: Absolute tolerance for depth value differences (after optional scaling).
        allowed_rgb_diff_ratio: Max allowed ratio of differing RGB pixels (0.01 -> 1%).
        allowed_depth_diff_ratio: Max allowed ratio of differing depth pixels beyond tolerance.
        compute_psnr: Whether to compute PSNR metric for reporting.
        compute_mse: Whether to compute MSE metric for reporting.
        depth_scale_auto: If True, attempt simple heuristic scaling for uint16 depth (divide by 1000 if max > 10000).

    Returns:
        (success, message) summary of comparison.
    """
    try:
        gen_obs = _resolve_obs_path(generated_root)
        if gen_obs is None:
            return False, f"Cannot locate obs_path under generated root: {generated_root}"
        ref_obs = _resolve_obs_path(reference_root)
        if ref_obs is None:
            return False, f"Cannot locate obs_path under reference root: {reference_root}"

        # Scene: honor the explicit name, otherwise take the first one generated.
        scene_dir = scene_dir or _pick_first_subdir(gen_obs)
        if scene_dir is None:
            return False, f"No scene directory found in {gen_obs}"
        # Reference may use a different first scene when the generated one is absent.
        ref_scene_dir = scene_dir
        if not os.path.isdir(os.path.join(ref_obs, scene_dir)):
            ref_scene_dir = _pick_first_subdir(ref_obs)
        if ref_scene_dir is None:
            return False, f"No matching scene directory in reference: {ref_obs}"
        gen_scene_path = os.path.join(gen_obs, scene_dir)
        ref_scene_path = os.path.join(ref_obs, ref_scene_dir)

        # Trajectory: same resolution rules as the scene level.
        traj_dir = traj_dir or _pick_first_subdir(gen_scene_path)
        if traj_dir is None:
            return False, f"No trajectory directory in {gen_scene_path}"
        ref_traj_dir = traj_dir
        if not os.path.isdir(os.path.join(ref_scene_path, traj_dir)):
            ref_traj_dir = _pick_first_subdir(ref_scene_path)
        if ref_traj_dir is None:
            return False, f"No trajectory directory in reference scene path {ref_scene_path}"
        gen_traj_path = os.path.join(gen_scene_path, traj_dir)
        ref_traj_path = os.path.join(ref_scene_path, ref_traj_dir)

        rgb_ok, rgb_msg = _compare_single_frame_rgb(
            gen_traj_path,
            ref_traj_path,
            rgb_dir_name,
            rgb_abs_tolerance,
            allowed_rgb_diff_ratio,
            compute_psnr,
            compute_mse,
        )
        depth_ok, depth_msg = _compare_single_frame_depth(
            gen_traj_path,
            ref_traj_path,
            depth_dir_name,
            depth_abs_tolerance,
            allowed_depth_diff_ratio,
            compute_psnr,
            compute_mse,
            depth_scale_auto,
        )
        verdict = rgb_ok and depth_ok
        summary = f"RGB: {rgb_msg}; Depth: {depth_msg}"
        prefix = "Images match - " if verdict else "Image mismatch - "
        return verdict, prefix + summary
    except Exception as exc:  # pylint: disable=broad-except
        return False, f"Error during first-frame image comparison: {exc}"
def _resolve_obs_path(root: str) -> Optional[str]:
"""Return the obs_path directory. Accept either the root itself or its child."""
if not os.path.isdir(root):
return None
if os.path.basename(root) == "obs_path":
return root
candidate = os.path.join(root, "obs_path")
return candidate if os.path.isdir(candidate) else None
def _pick_first_subdir(parent: str) -> Optional[str]:
"""Pick the first alphanumerically sorted subdirectory name under parent."""
try:
subs = [d for d in os.listdir(parent) if os.path.isdir(os.path.join(parent, d))]
if not subs:
return None
subs.sort()
return subs[0]
except FileNotFoundError:
return None
def _find_first_frame_file(folder: str, exts: Tuple[str, ...]) -> Optional[str]:
"""Find the smallest numeral file with one of extensions; returns absolute path."""
if not os.path.isdir(folder):
return None
candidates = []
for f in os.listdir(folder):
lower = f.lower()
for e in exts:
if lower.endswith(e):
num_part = os.path.splitext(f)[0]
if num_part.isdigit():
candidates.append((int(num_part), f))
elif f.startswith("0"): # fallback for names like 0.jpg
candidates.append((0, f))
break
if not candidates:
return None
candidates.sort(key=lambda x: x[0])
return os.path.join(folder, candidates[0][1])
def _compare_single_frame_rgb(
    gen_traj_path: str,
    ref_traj_path: str,
    rgb_dir_name: str,
    abs_tol: int,
    allowed_ratio: float,
    compute_psnr: bool,
    compute_mse: bool,
) -> Tuple[bool, str]:
    """Compare the first RGB frame of two trajectory directories.

    Passes when EITHER the differing-pixel ratio is within `allowed_ratio`, OR
    the PSNR path flags the images as near-identical (identical pixels / mse==0,
    or PSNR >= 40 dB when compute_psnr is enabled).

    Args:
        gen_traj_path: Generated trajectory directory containing `rgb_dir_name`.
        ref_traj_path: Reference trajectory directory containing `rgb_dir_name`.
        rgb_dir_name: Name of the RGB subdirectory on both sides.
        abs_tol: Per-channel absolute tolerance; a pixel counts as different when
            ANY channel deviates by more than this.
        allowed_ratio: Maximum allowed fraction of differing pixels.
        compute_psnr: Whether to compute/report PSNR (also enables the PSNR-based pass).
        compute_mse: Whether to report MSE.

    Returns:
        (passed, message) where message encodes status, tolerances and metrics.
    """
    rgb_gen_dir = os.path.join(gen_traj_path, rgb_dir_name)
    rgb_ref_dir = os.path.join(ref_traj_path, rgb_dir_name)
    if not os.path.isdir(rgb_gen_dir) or not os.path.isdir(rgb_ref_dir):
        return (
            False,
            f"RGB directory missing (generated: {os.path.isdir(rgb_gen_dir)}, reference: {os.path.isdir(rgb_ref_dir)})",
        )
    gen_file = _find_first_frame_file(rgb_gen_dir, (".jpg", ".png", ".jpeg"))
    ref_file = _find_first_frame_file(rgb_ref_dir, (".jpg", ".png", ".jpeg"))
    if not gen_file or not ref_file:
        return False, "First RGB frame file not found in one of the directories"
    gen_img = cv2.imread(gen_file, cv2.IMREAD_COLOR)
    ref_img = cv2.imread(ref_file, cv2.IMREAD_COLOR)
    if gen_img is None or ref_img is None:
        return False, "Failed to read RGB images"
    if gen_img.shape != ref_img.shape:
        return False, f"RGB shape mismatch {gen_img.shape} vs {ref_img.shape}"
    # int16 subtraction avoids uint8 wraparound on negative differences.
    diff = np.abs(gen_img.astype(np.int16) - ref_img.astype(np.int16))
    # A pixel differs when ANY of its channels exceeds abs_tol.
    diff_mask = np.any(diff > abs_tol, axis=2)
    diff_ratio = float(diff_mask.sum()) / diff_mask.size
    metrics_parts = [f"diff_pixels_ratio={diff_ratio:.4f}"]
    # `flag` records a PSNR-based pass that overrides the pixel-ratio criterion.
    flag = False
    if compute_mse or compute_psnr:
        mse = float((diff**2).mean())
        if compute_mse:
            metrics_parts.append(f"mse={mse:.2f}")
        if compute_psnr:
            if mse == 0.0:
                # Identical images: infinite PSNR, unconditional pass.
                psnr = float('inf')
                flag = True
            else:
                psnr = 10.0 * math.log10((255.0**2) / mse)
            if math.isinf(psnr):
                metrics_parts.append("psnr=inf")
                flag = True
            else:
                metrics_parts.append(f"psnr={psnr:.2f}dB")
                # Heuristic: >= 40 dB PSNR is accepted regardless of pixel ratio.
                if psnr >= 40.0:
                    flag = True
    passed = diff_ratio <= allowed_ratio or flag
    status = "OK" if passed else "FAIL"
    return passed, f"{status} (abs_tol={abs_tol}, allowed_ratio={allowed_ratio}, {' '.join(metrics_parts)})"
def _compare_single_frame_depth(
    gen_traj_path: str,
    ref_traj_path: str,
    depth_dir_name: str,
    abs_tol: float,
    allowed_ratio: float,
    compute_psnr: bool,
    compute_mse: bool,
    auto_scale: bool,
) -> Tuple[bool, str]:
    """Compare the first depth frame of two trajectory directories.

    Unlike the RGB comparison, the pass criterion here is purely the
    differing-pixel ratio; PSNR/MSE are computed for reporting only.

    Args:
        gen_traj_path: Generated trajectory directory containing `depth_dir_name`.
        ref_traj_path: Reference trajectory directory containing `depth_dir_name`.
        depth_dir_name: Name of the depth subdirectory on both sides.
        abs_tol: Absolute depth-value tolerance (applied after optional scaling).
        allowed_ratio: Maximum allowed fraction of pixels differing beyond abs_tol.
        compute_psnr: Whether to report PSNR (relative to the reference dynamic range).
        compute_mse: Whether to report MSE.
        auto_scale: Forwarded to `_prepare_depth_array` (uint16 millimeter heuristic).

    Returns:
        (passed, message) where message encodes status, tolerances and metrics.
    """
    depth_gen_dir = os.path.join(gen_traj_path, depth_dir_name)
    depth_ref_dir = os.path.join(ref_traj_path, depth_dir_name)
    if not os.path.isdir(depth_gen_dir) or not os.path.isdir(depth_ref_dir):
        return (
            False,
            f"Depth directory missing (generated: {os.path.isdir(depth_gen_dir)}, reference: {os.path.isdir(depth_ref_dir)})",
        )
    gen_file = _find_first_frame_file(depth_gen_dir, (".png", ".exr"))
    ref_file = _find_first_frame_file(depth_ref_dir, (".png", ".exr"))
    if not gen_file or not ref_file:
        return False, "First depth frame file not found in one of the directories"
    # IMREAD_UNCHANGED preserves the raw bit depth (uint16 PNG / float EXR).
    gen_img = cv2.imread(gen_file, cv2.IMREAD_UNCHANGED)
    ref_img = cv2.imread(ref_file, cv2.IMREAD_UNCHANGED)
    if gen_img is None or ref_img is None:
        return False, "Failed to read depth images"
    if gen_img.shape != ref_img.shape:
        return False, f"Depth shape mismatch {gen_img.shape} vs {ref_img.shape}"
    gen_depth = _prepare_depth_array(gen_img, auto_scale)
    ref_depth = _prepare_depth_array(ref_img, auto_scale)
    if gen_depth.shape != ref_depth.shape:
        return False, f"Depth array shape mismatch {gen_depth.shape} vs {ref_depth.shape}"
    diff = np.abs(gen_depth - ref_depth)
    diff_mask = diff > abs_tol
    diff_ratio = float(diff_mask.sum()) / diff_mask.size
    metrics_parts = [f"diff_pixels_ratio={diff_ratio:.4f}"]
    if compute_mse or compute_psnr:
        mse = float((diff**2).mean())
        if compute_mse:
            metrics_parts.append(f"mse={mse:.6f}")
        if compute_psnr:
            # Estimate dynamic range from reference depth
            # `or 1.0` guards against a constant reference image (range 0).
            dr = float(ref_depth.max() - ref_depth.min()) or 1.0
            if mse == 0.0:
                psnr = float('inf')
            else:
                psnr = 10.0 * math.log10((dr**2) / mse)
            metrics_parts.append("psnr=inf" if math.isinf(psnr) else f"psnr={psnr:.2f}dB")
    passed = diff_ratio <= allowed_ratio
    status = "OK" if passed else "FAIL"
    return passed, f"{status} (abs_tol={abs_tol}, allowed_ratio={allowed_ratio}, {' '.join(metrics_parts)})"
def _prepare_depth_array(arr: np.ndarray, auto_scale: bool) -> np.ndarray:
"""Convert raw depth image to float32 array; apply simple heuristic scaling if requested."""
if arr.dtype == np.uint16:
depth = arr.astype(np.float32)
if auto_scale and depth.max() > 10000: # likely millimeters
depth /= 1000.0
return depth
if arr.dtype == np.float32:
return arr
# Fallback: convert to float
return arr.astype(np.float32)

View File

@@ -0,0 +1,398 @@
"""
Simbox Output Comparator
This module provides functionality to compare two Simbox task output directories.
It compares both meta_info.pkl and LMDB database contents, handling different data types:
- JSON data (dict/list)
- Scalar data (numerical arrays/lists)
- Image data (encoded images)
- Proprioception data (joint states, gripper states)
- Object data (object poses and properties)
- Action data (robot actions)
"""
import argparse
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import cv2
import lmdb
import numpy as np
class SimboxComparator:
    """Comparator for Simbox task output directories.

    Compares two task output directories by examining their `meta_info.pkl`
    metadata and the per-key contents of their LMDB databases. Mismatches and
    warnings are accumulated on the instance and summarized by `compare()`.
    """

    def __init__(self, dir1: str, dir2: str, tolerance: float = 1e-6, image_psnr_threshold: float = 30.0):
        """
        Initialize the comparator.

        Args:
            dir1: Path to the first output directory
            dir2: Path to the second output directory
            tolerance: Numerical tolerance for floating point comparisons
            image_psnr_threshold: PSNR threshold (dB) for considering images as acceptable match
        """
        self.dir1 = Path(dir1)
        self.dir2 = Path(dir2)
        self.tolerance = tolerance
        self.image_psnr_threshold = image_psnr_threshold
        # Human-readable findings accumulated while comparing.
        self.mismatches = []
        self.warnings = []
        # Per-image PSNR values; averaged against the threshold at the end of compare().
        self.image_psnr_values: List[float] = []

    def load_directory(self, directory: Path) -> Tuple[Optional[Dict], Optional[Any], Optional[Any]]:
        """Load meta_info.pkl and LMDB database from directory.

        Returns:
            (meta_info, read_transaction, lmdb_environment), or (None, None, None)
            on any failure. The caller owns the returned environment and must
            close it.
        """
        meta_path = directory / "meta_info.pkl"
        lmdb_path = directory / "lmdb"
        if not directory.is_dir() or not meta_path.exists() or not lmdb_path.is_dir():
            print(f"Error: '{directory}' is not a valid output directory.")
            print("It must contain 'meta_info.pkl' and an 'lmdb' subdirectory.")
            return None, None, None
        try:
            with open(meta_path, "rb") as f:
                meta_info = pickle.load(f)
        except Exception as e:
            print(f"Error loading meta_info.pkl from {directory}: {e}")
            return None, None, None
        try:
            # Read-only, no locking: safe for concurrent readers of a finished run.
            env = lmdb.open(str(lmdb_path), readonly=True, lock=False, readahead=False, meminit=False)
            txn = env.begin(write=False)
        except Exception as e:
            print(f"Error opening LMDB database at {lmdb_path}: {e}")
            return None, None, None
        return meta_info, txn, env

    def compare_metadata(self, meta1: Dict, meta2: Dict) -> bool:
        """Compare high-level metadata; records a mismatch when num_steps differ."""
        identical = True
        if meta1.get("num_steps") != meta2.get("num_steps"):
            self.mismatches.append(f"num_steps differ: {meta1.get('num_steps')} vs {meta2.get('num_steps')}")
            identical = False
        return identical

    def get_key_categories(self, meta: Dict) -> Dict[bytes, str]:
        """Map each LMDB key (as bytes) to its category name from metadata.

        Note: the return annotation was previously `Dict[str, set]`, which did
        not match the actual bytes-key -> category-name mapping built here.
        """
        key_to_category = {}
        for category, keys in meta.get("keys", {}).items():
            for key in keys:
                # Metadata may store keys as str or bytes; LMDB lookups use bytes.
                key_bytes = key if isinstance(key, bytes) else key.encode()
                key_to_category[key_bytes] = category
        return key_to_category

    def compare_json_data(self, key: bytes, data1: Any, data2: Any) -> bool:
        """Compare JSON-like data (dict/list/scalars) recursively and exactly.

        Long lists (> 10 elements) are spot-checked at the first, middle and
        last positions to keep output and runtime bounded.
        """
        if type(data1) != type(data2):
            self.mismatches.append(f"[{key.decode()}] Type mismatch: {type(data1).__name__} vs {type(data2).__name__}")
            return False
        if isinstance(data1, dict):
            if set(data1.keys()) != set(data2.keys()):
                self.mismatches.append(f"[{key.decode()}] Dict keys differ")
                return False
            for k in data1.keys():
                if not self.compare_json_data(key, data1[k], data2[k]):
                    return False
        elif isinstance(data1, list):
            if len(data1) != len(data2):
                self.mismatches.append(f"[{key.decode()}] List length differ: {len(data1)} vs {len(data2)}")
                return False
            # For lists, compare sample elements to avoid excessive output
            if len(data1) > 10:
                sample_indices = [0, len(data1) // 2, -1]
                for idx in sample_indices:
                    if not self.compare_json_data(key, data1[idx], data2[idx]):
                        return False
            else:
                for i, (v1, v2) in enumerate(zip(data1, data2)):
                    if not self.compare_json_data(key, v1, v2):
                        return False
        else:
            if data1 != data2:
                self.mismatches.append(f"[{key.decode()}] Value mismatch: {data1} vs {data2}")
                return False
        return True

    def compare_numerical_data(self, key: bytes, data1: Any, data2: Any) -> bool:
        """Compare numerical data (arrays or lists of numbers) within tolerance."""
        try:
            # Lists are promoted to arrays so shape and allclose apply uniformly.
            if isinstance(data1, list):
                arr1 = np.array(data1)
                arr2 = np.array(data2)
            else:
                arr1 = data1
                arr2 = data2
            if arr1.shape != arr2.shape:
                self.mismatches.append(f"[{key.decode()}] Shape mismatch: {arr1.shape} vs {arr2.shape}")
                return False
            if not np.allclose(arr1, arr2, rtol=self.tolerance, atol=self.tolerance):
                diff = np.abs(arr1 - arr2)
                max_diff = np.max(diff)
                mean_diff = np.mean(diff)
                self.mismatches.append(
                    f"[{key.decode()}] Numerical difference: max={max_diff:.6e}, mean={mean_diff:.6e}"
                )
                return False
        except Exception as e:
            self.warnings.append(f"[{key.decode()}] Error comparing numerical data: {e}")
            return False
        return True

    def compare_image_data(self, key: bytes, data1: np.ndarray, data2: np.ndarray) -> bool:
        """Compare encoded image data; records PSNR rather than failing per image.

        Identical images are recorded with a 100.0 dB sentinel PSNR. The overall
        pass/fail decision is made in compare() from the PSNR average.
        """
        try:
            # Decode images
            img1 = cv2.imdecode(data1, cv2.IMREAD_UNCHANGED)
            img2 = cv2.imdecode(data2, cv2.IMREAD_UNCHANGED)
            if img1 is None or img2 is None:
                self.warnings.append(f"[{key.decode()}] Could not decode image, using binary comparison")
                return np.array_equal(data1, data2)
            # Compare shapes
            if img1.shape != img2.shape:
                self.mismatches.append(f"[{key.decode()}] Image shape mismatch: {img1.shape} vs {img2.shape}")
                return False
            # Calculate PSNR for tracking average quality
            if np.array_equal(img1, img2):
                psnr = 100.0
            else:
                diff_float = img1.astype(np.float32) - img2.astype(np.float32)
                mse = np.mean(diff_float ** 2)
                if mse == 0:
                    psnr = 100.0
                else:
                    max_pixel = 255.0
                    psnr = 20 * np.log10(max_pixel / np.sqrt(mse))
            self.image_psnr_values.append(psnr)
            try:
                print(f"[{key.decode()}] PSNR: {psnr:.2f} dB")
            except Exception:
                # Key bytes may not be valid UTF-8; fall back to a placeholder.
                print(f"[<binary key>] PSNR: {psnr:.2f} dB")
            return True
        except Exception as e:
            self.warnings.append(f"[{key.decode()}] Error comparing image: {e}")
            return False

    def _save_comparison_image(self, key: bytes, img1: np.ndarray, img2: np.ndarray, diff: np.ndarray):
        """Save a side-by-side (img1 | img2 | amplified diff) visualization to disk."""
        try:
            output_dir = Path("image_comparisons")
            output_dir.mkdir(exist_ok=True)
            # Amplify differences (x5) so small deviations are visible.
            # (The previous gray/color branches here were byte-identical; collapsed.)
            diff_vis = np.clip(diff * 5, 0, 255).astype(np.uint8)

            # Ensure RGB format for concatenation
            def ensure_rgb(img):
                if len(img.shape) == 2:
                    return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
                elif len(img.shape) == 3 and img.shape[2] == 4:
                    return cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
                return img

            img1_rgb = ensure_rgb(img1)
            img2_rgb = ensure_rgb(img2)
            diff_rgb = ensure_rgb(diff_vis)
            # Concatenate horizontally
            combined = np.hstack([img1_rgb, img2_rgb, diff_rgb])
            # Sanitize the key so it is usable as a filename on all platforms.
            safe_key = key.decode().replace("/", "_").replace("\\", "_").replace(":", "_")
            output_path = output_dir / f"diff_{safe_key}.png"
            cv2.imwrite(str(output_path), cv2.cvtColor(combined, cv2.COLOR_RGB2BGR))
        except Exception as e:
            self.warnings.append(f"Failed to save comparison image for {key.decode()}: {e}")

    def compare_value(self, key: bytes, category: str, val1: bytes, val2: bytes) -> bool:
        """Compare a single key-value pair, dispatching on its metadata category."""
        # Output the category for the current key being compared
        try:
            print(f"[{key.decode()}] category: {category}")
        except Exception:
            print(f"[<binary key>] category: {category}")
        if val1 is None and val2 is None:
            return True
        if val1 is None or val2 is None:
            self.mismatches.append(f"[{key.decode()}] Key exists in one dataset but not the other")
            return False
        try:
            # NOTE: unpickling trusts the run outputs; do not point this tool at
            # untrusted directories.
            data1 = pickle.loads(val1)
            data2 = pickle.loads(val2)
        except Exception as e:
            self.warnings.append(f"[{key.decode()}] Error unpickling data: {e}")
            return val1 == val2
        # Route to appropriate comparison based on category
        if category == "json_data":
            return self.compare_json_data(key, data1, data2)
        elif category in ["scalar_data", "proprio_data", "object_data", "action_data"]:
            return self.compare_numerical_data(key, data1, data2)
        elif category.startswith("images."):
            # Image data is stored as numpy uint8 array
            if isinstance(data1, np.ndarray) and isinstance(data2, np.ndarray):
                return self.compare_image_data(key, data1, data2)
            else:
                self.warnings.append(f"[{key.decode()}] Expected numpy array for image data")
                return False
        else:
            # Unknown category, try generic comparison
            self.warnings.append(f"[{key.decode()}] Unknown category '{category}', using binary comparison")
            return val1 == val2

    def compare(self) -> bool:
        """Execute full comparison.

        Returns:
            True when no mismatches were recorded, False otherwise. Both LMDB
            environments are closed before returning, on every path.
        """
        print("Comparing directories:")
        print(f" Dir1: {self.dir1}")
        print(f" Dir2: {self.dir2}\n")
        # Load data
        meta1, txn1, env1 = self.load_directory(self.dir1)
        meta2, txn2, env2 = self.load_directory(self.dir2)
        if meta1 is None or meta2 is None:
            # Bug fix: an environment that did open was previously leaked when
            # the other directory failed to load; close whatever opened.
            for env in (env1, env2):
                if env is not None:
                    env.close()
            print("Aborting due to loading errors.")
            return False
        print("Successfully loaded data from both directories.\n")
        try:
            # Compare metadata
            print("Comparing metadata...")
            self.compare_metadata(meta1, meta2)
            # Get key categories
            key_cat1 = self.get_key_categories(meta1)
            key_cat2 = self.get_key_categories(meta2)
            keys1 = set(key_cat1.keys())
            keys2 = set(key_cat2.keys())
            # Check key sets
            if keys1 != keys2:
                missing_in_2 = sorted([k.decode() for k in keys1 - keys2])
                missing_in_1 = sorted([k.decode() for k in keys2 - keys1])
                if missing_in_2:
                    self.mismatches.append(f"Keys missing in dir2: {missing_in_2[:10]}")
                if missing_in_1:
                    self.mismatches.append(f"Keys missing in dir1: {missing_in_1[:10]}")
            # Compare common keys
            common_keys = sorted(keys1.intersection(keys2))
            print(f"Comparing {len(common_keys)} common keys...\n")
            for i, key in enumerate(common_keys):
                if i % 100 == 0 and i > 0:
                    print(f"Progress: {i}/{len(common_keys)} keys compared...")
                category = key_cat1.get(key, "unknown")
                val1 = txn1.get(key)
                val2 = txn2.get(key)
                self.compare_value(key, category, val1, val2)
            if self.image_psnr_values:
                avg_psnr = sum(self.image_psnr_values) / len(self.image_psnr_values)
                print(
                    f"\nImage PSNR average over {len(self.image_psnr_values)} images: "
                    f"{avg_psnr:.2f} dB (threshold {self.image_psnr_threshold:.2f} dB)"
                )
                if avg_psnr < self.image_psnr_threshold:
                    self.mismatches.append(
                        f"Average image PSNR {avg_psnr:.2f} dB below threshold "
                        f"{self.image_psnr_threshold:.2f} dB"
                    )
            else:
                print("\nNo image entries found for PSNR calculation.")
        finally:
            # Bug fix: close environments even if a comparison step raises.
            env1.close()
            env2.close()
        # Print results
        print("\n" + "=" * 80)
        print("COMPARISON RESULTS")
        print("=" * 80)
        if self.warnings:
            print(f"\nWarnings ({len(self.warnings)}):")
            for warning in self.warnings[:20]:
                print(f" - {warning}")
            if len(self.warnings) > 20:
                print(f" ... and {len(self.warnings) - 20} more warnings")
        if self.mismatches:
            print(f"\nMismatches found ({len(self.mismatches)}):")
            for mismatch in self.mismatches[:30]:
                print(f" - {mismatch}")
            if len(self.mismatches) > 30:
                print(f" ... and {len(self.mismatches) - 30} more mismatches")
            print("\n❌ RESULT: Directories are DIFFERENT")
            return False
        else:
            print("\n✅ RESULT: Directories are IDENTICAL")
            return True
def main():
    """CLI entry point: parse arguments, run the comparison, exit non-zero on mismatch."""
    parser = argparse.ArgumentParser(
        description="Compare two Simbox task output directories.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
 %(prog)s --dir1 output/run1 --dir2 output/run2
 %(prog)s --dir1 output/run1 --dir2 output/run2 --tolerance 1e-5
 %(prog)s --dir1 output/run1 --dir2 output/run2 --image-psnr 40.0
""",
    )
    parser.add_argument("--dir1", type=str, required=True, help="Path to the first output directory")
    parser.add_argument("--dir2", type=str, required=True, help="Path to the second output directory")
    parser.add_argument(
        "--tolerance",
        type=float,
        default=1e-6,
        help="Numerical tolerance for floating point comparisons (default: 1e-6)",
    )
    parser.add_argument(
        "--image-psnr",
        type=float,
        default=37.0,
        help="PSNR threshold (dB) for considering images as matching (default: 37.0)",
    )
    args = parser.parse_args()
    comparator = SimboxComparator(args.dir1, args.dir2, tolerance=args.tolerance, image_psnr_threshold=args.image_psnr)
    success = comparator.compare()
    # Bug fix: use SystemExit instead of the site-module `exit()` helper, which
    # is not guaranteed to exist outside interactive sessions.
    raise SystemExit(0 if success else 1)
# Allow the module to be executed directly as a CLI comparison tool.
if __name__ == "__main__":
    main()