# issacdataengine/nimbus/utils/utils.py

import functools
import os
import re
import sys
import time
from typing import Tuple, Type, Union

from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence


def init_env():
    """Append the project's relative search directories to ``sys.path``.

    The entries are relative paths, so they are resolved against the working
    directory of the process at import time. Each call appends the entries
    again; no de-duplication is performed.
    """
    for search_dir in ("./", "./data_engine", "workflows/simbox"):
        sys.path.append(search_dir)


def unpack_iter_data(data: tuple):
    """Sort the items of a pipeline tuple into ``(scene, seq, obs)`` by type.

    Args:
        data (tuple): At most three items. Each item is matched, in order,
            against ``Scene``, ``Sequence`` and ``Observations``.
    Returns:
        tuple: ``(scene, seq, obs)``. A slot is ``None`` when no item of that
        type was present. Items matching none of the three types are ignored,
        and a later item of the same type replaces an earlier one.
    Raises:
        AssertionError: If ``data`` holds more than three items.
    """
    assert len(data) <= 3, "not support yet"
    scene = seq = obs = None
    for entry in data:
        if isinstance(entry, Scene):
            scene = entry
            continue
        if isinstance(entry, Sequence):
            seq = entry
            continue
        if isinstance(entry, Observations):
            obs = entry
    return scene, seq, obs


def consume_stage(stage_input):
    """Drain every iterator held by a stage input, then finalize its values.

    The positional payload (``Args``) is drained as a whole and the keyword
    payload (``Kwargs``) is drained value by value via ``consume_iterators``.
    Afterwards ``__del__`` is invoked explicitly on every value that defines
    it. A payload that is missing or ``None`` is skipped.

    Args:
        stage_input: Object that may expose ``Args`` (iterated directly, so
            presumably a sequence of positional values) and ``Kwargs``
            (``.values()`` is called on it, so presumably a mapping).
    """
    args = getattr(stage_input, "Args", None)
    # Guard against None exactly like Kwargs below; iterating None would raise TypeError.
    if args is not None:
        consume_iterators(args)
        # NOTE: if Args is itself a one-shot iterator it has just been exhausted,
        # so this loop will see no values.
        for value in args:
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801

    kwargs = getattr(stage_input, "Kwargs", None)
    if kwargs is not None:
        for value in kwargs.values():
            consume_iterators(value)
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801


# Unlike consume_stage, this variant never calls __del__ on the inputs, which
# prevents Isaac Sim from closing the pipe worker prematurely.
def pipe_consume_stage(stage_input):
    """Drain every iterator held by a stage input without finalizing anything.

    ``Args`` (when present) is passed to ``consume_iterators`` as a whole;
    each value of ``Kwargs`` (when present and not ``None``) is passed to it
    individually. No ``__del__`` is called on any value.

    Args:
        stage_input: Object that may expose ``Args`` and ``Kwargs`` attributes.
    """
    if hasattr(stage_input, "Args"):
        consume_iterators(stage_input.Args)

    if not hasattr(stage_input, "Kwargs"):
        return
    keyword_payload = stage_input.Kwargs
    if keyword_payload is None:
        return
    for payload_value in keyword_payload.values():
        consume_iterators(payload_value)


def consume_iterators(obj):
    """Recursively walk ``obj`` and exhaust every iterator found inside it.

    Args:
        obj: Any object.
    Returns:
        - ``str`` / ``bytes``: returned unchanged and never iterated.
        - ``dict``: a new plain ``dict`` with the same keys and recursively
          processed values (keys themselves are not traversed).
        - ``list`` / ``tuple``: a new plain ``list`` / ``tuple`` of the
          recursively processed items.
        - any other object with ``__iter__``: iterated to the end (each item
          processed recursively, results discarded) and then returned as-is.
        - everything else: returned unchanged.
    """
    if isinstance(obj, (str, bytes)):
        # Text is iterable but must be treated as a leaf value.
        return obj

    if isinstance(obj, dict):
        drained = {}
        for key, value in obj.items():
            drained[key] = consume_iterators(value)
        return drained

    if isinstance(obj, (list, tuple)):
        drained_items = [consume_iterators(element) for element in obj]
        return drained_items if isinstance(obj, list) else tuple(drained_items)

    if hasattr(obj, "__iter__"):
        # Generic iterable (generator, iterator, set, ...): run it to exhaustion.
        for element in obj:
            consume_iterators(element)

    return obj


def scene_names_postprocess(scene_names: list) -> list:
    """
    Select the subset of scene names that the current worker should process.
    Worker identity is derived from Deep Learning Container (DLC) environment variables: the rank is
    parsed from POD_NAME and the number of workers is read from WORLD_SIZE. With several workers the
    names are sorted and handed out round-robin (worker i takes indices i, i + world_size, ...).
    When POD_NAME is unset or does not match the DLC pattern, a single worker (rank 0 of 1) is
    assumed and every name is returned in its original order.
    The input list is never modified.
    Args:
        scene_names (list): List of scene names (typically folder names) to be distributed.
    Returns:
        list: A new list with the scene names assigned to the current worker. Empty when
        scene_names is empty.
    Raises:
        PermissionError: Re-raised with a descriptive message if one occurs during processing.
        RuntimeError: For any other error encountered during processing, e.g. a WORLD_SIZE
            value that is not an integer or is 0.
    Notes:
        - POD_NAME must look like ``dlc...-worker-<N>`` or ``dlc...-master-<N>`` to be recognised.
        - A worker pod uses <N> as its rank; a master pod is given the last rank (WORLD_SIZE - 1).
        - Sorting is only applied with multiple workers, so that all of them agree on the order.
    """

    def _get_dlc_worker_info():
        """Extract worker rank and world size from DLC environment variables."""
        pod_name = os.environ.get("POD_NAME")

        if pod_name:
            # Match worker-N or master-N patterns
            match = re.search(r"dlc.*?-(worker|master)-(\d+)$", pod_name)
            if match:
                node_type, node_id = match.groups()
                world_size = int(os.environ.get("WORLD_SIZE", "1"))

                if node_type == "worker":
                    rank = int(node_id)
                else:  # master node
                    rank = world_size - 1

                return rank, world_size

        # Default for non-DLC environment
        return 0, 1

    def _distribute_folders(all_folders, rank, world_size):
        """Distribute folders among workers using round-robin strategy."""
        if not all_folders:
            return []

        # Only sort when there are multiple workers to ensure consistency.
        # sorted() builds a new list, so the caller's list is not reordered in place.
        ordered = sorted(all_folders) if world_size > 1 else all_folders

        # Distribute using slicing: worker i gets folders at indices i, i+world_size, ...
        return ordered[rank::world_size]

    try:
        if not scene_names:
            print(f"Warning: No scene found in {scene_names}")
            return []

        # Get worker identity and distribute folders
        rank, world_size = _get_dlc_worker_info()
        assigned_folders = _distribute_folders(scene_names, rank, world_size)

        print(
            f"DLC Worker {rank}/{world_size}: Assigned {len(assigned_folders)} out of "
            f"{len(scene_names)} total folders"
        )

        return assigned_folders

    except PermissionError as e:
        # Chain the cause so the original traceback is preserved.
        raise PermissionError(f"No permission to access directory: {scene_names}") from e
    except Exception as e:
        raise RuntimeError(f"Error reading input directory {scene_names}: {e}") from e


def retry_on_exception(
    max_retries: int = 3,
    retry_exceptions: Union[bool, Type[Exception], Tuple[Type[Exception], ...]] = True,
    delay: float = 1.0,
):
    """
    Build a decorator that re-invokes a callable when it raises.
    The decorated callable is tried once and then up to max_retries more times. Between attempts
    the wrapper prints a message and sleeps for delay seconds. Only subclasses of Exception are
    ever intercepted; when the exception is not retryable, or the retries are used up, the original
    exception propagates unchanged. Works for plain functions as well as methods.
    Args:
        max_retries (int): Number of additional attempts after the first call. Negative values
            are treated as 0, i.e. the callable is still invoked exactly once.
        retry_exceptions: True to retry on any Exception, False to never retry, or an exception
            class / tuple / list of exception classes to retry only on those.
        delay (float): Seconds to sleep before each retry.
    Returns:
        A decorator that wraps the target callable with the retry logic.
    """
    # Normalise once into a tuple: isinstance() rejects a list as its second argument.
    if retry_exceptions is True:
        retryable = (Exception,)
    elif isinstance(retry_exceptions, (tuple, list)):
        retryable = tuple(retry_exceptions)
    elif isinstance(retry_exceptions, type) and issubclass(retry_exceptions, BaseException):
        retryable = (retry_exceptions,)
    else:
        retryable = ()

    # A negative count would make the attempt loop empty and silently skip the call.
    retries = max(0, max_retries)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(retries + 1):
                if attempt > 0:
                    print(f"Retry attempt {attempt}/{retries} for {func.__name__}")
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if not isinstance(e, retryable) or attempt >= retries:
                        raise
                    print(f"Error in {func.__name__}: {e}. Retrying in {delay} seconds...")
                    time.sleep(delay)
            # Not reachable: the final attempt always returns or re-raises.

        return wrapper

    return decorator