init commit

This commit is contained in:
zyhe
2026-03-16 11:44:10 +00:00
commit 94384a93c9
552 changed files with 363038 additions and 0 deletions

16
nimbus/__init__.py Normal file
View File

@@ -0,0 +1,16 @@
import ray
from nimbus.utils.types import STAGE_PIPE
from .data_engine import DataEngine, DistPipeDataEngine
def run_data_engine(config, master_seed=None):
    """Build and run the data engine described by *config*.

    When the config contains a STAGE_PIPE section, ray is initialized and the
    distributed pipeline engine is used; otherwise the single-process engine
    runs the whole pipeline.
    """
    # Imported for its side effects (component registration); not used directly.
    import nimbus_extension  # noqa: F401 pylint: disable=unused-import

    if STAGE_PIPE not in config:
        engine = DataEngine(config, master_seed=master_seed)
    else:
        ray.init(num_gpus=1)
        engine = DistPipeDataEngine(config, master_seed=master_seed)
    engine.run()

View File

View File

@@ -0,0 +1,71 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import numpy as np
@dataclass
class C2W:
    """
    Represents a camera-to-world transformation matrix.

    Attributes:
        matrix (List[float]): A list of 16 floats representing the 4x4 transformation matrix in row-major order.
    """

    matrix: List[float]


@dataclass
class Camera:
    """
    Represents a single camera pose in the trajectory.

    Attributes:
        trajectory (List[C2W]): List of C2W transformations for this camera pose.
        intrinsic (Optional[List[float]]): 3x3 camera intrinsic matrix: [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
        extrinsic (Optional[List[float]]): 4x4 tobase_extrinsic matrix representing the camera mounting offset
            relative to the robot base (height + pitch).
        length (Optional[int]): Length of the trajectory in number of frames.
        depths (Optional[List[np.ndarray]]): List of depth images captured by this camera.
        rgbs (Optional[List[np.ndarray]]): List of RGB images captured by this camera.
        uv_tracks (Optional[Dict[str, Any]]): UV tracking data in the format
            {mesh_name: {"per_frame": list, "width": W, "height": H}}.
        uv_mesh_names (Optional[List[str]]): List of mesh names being tracked in the UV tracking data.
    """

    trajectory: List[C2W]
    # Fields that default to None are annotated Optional so the declared types
    # match the defaults (they previously claimed to be non-optional), and the
    # lowercase `list[...]` forms are unified with the `typing.List` style used
    # elsewhere in this module.
    intrinsic: Optional[List[float]] = None
    extrinsic: Optional[List[float]] = None
    length: Optional[int] = None
    depths: Optional[List[np.ndarray]] = None
    rgbs: Optional[List[np.ndarray]] = None
    uv_tracks: Optional[Dict[str, Any]] = None
    uv_mesh_names: Optional[List[str]] = None

    def __len__(self):
        """Return the trajectory length, validating consistency and caching on first use."""
        if self.length is not None:
            return self.length
        self._check_length()
        self.length = len(self.trajectory)
        return self.length

    def _check_length(self):
        """Raise ValueError if any per-frame payload disagrees with the trajectory length."""
        if self.depths is not None and len(self.depths) != len(self.trajectory):
            raise ValueError("Length of depths does not match length of trajectory")
        if self.rgbs is not None and len(self.rgbs) != len(self.trajectory):
            raise ValueError("Length of rgbs does not match length of trajectory")
        if self.uv_tracks is not None:
            for mesh_name, track_data in self.uv_tracks.items():
                if len(track_data["per_frame"]) != len(self.trajectory):
                    raise ValueError(f"Length of uv_tracks for mesh {mesh_name} does not match length of trajectory")

    def append_rgb(self, rgb_image: np.ndarray):
        """Append an RGB frame, creating the buffer on first use."""
        if self.rgbs is None:
            self.rgbs = []
        self.rgbs.append(rgb_image)

    def append_depth(self, depth_image: np.ndarray):
        """Append a depth frame, creating the buffer on first use."""
        if self.depths is None:
            self.depths = []
        self.depths.append(depth_image)

View File

@@ -0,0 +1,95 @@
import logging
import time
from abc import abstractmethod
from collections.abc import Iterator
from typing import Generic, TypeVar
T = TypeVar("T")


# pylint: disable=E0102
class Iterator(Iterator, Generic[T]):
    """Instrumented iterator base class.

    Subclasses implement ``_next``; this base wires it into the Python
    iteration protocol while accumulating timing statistics (per-call latency,
    initialization cost, per-frame compute/IO time, queue wait time and
    per-sequence time). The collected statistics are logged once the wrapped
    source raises ``StopIteration``.
    """

    def __init__(self, max_retry=3):
        # Counters are kept as floats so the logged output format stays uniform.
        self._next_calls = 0.0
        self._next_total_time = 0.0
        self._init_time_costs = 0.0
        self._init_times = 0
        self._frame_compute_time = 0.0
        self._frame_compute_frames = 0.0
        self._frame_io_time = 0.0
        self._frame_io_frames = 0.0
        self._wait_time = 0.0
        self._seq_num = 0.0
        self._seq_time = 0.0
        self.logger = logging.getLogger("de_logger")
        self.max_retry = max_retry
        self.retry_num = 0

    def __iter__(self):
        return self

    def __next__(self):
        t0 = time.time()
        try:
            item = self._next()
        except StopIteration:
            # End of stream: emit the accumulated statistics before propagating.
            self._log_statistics()
            raise
        self._next_calls += 1
        self._next_total_time += time.time() - t0
        return item

    def record_init_time(self, time_costs):
        """Accumulate one (re-)initialization duration."""
        self._init_times += 1
        self._init_time_costs += time_costs

    def collect_compute_frame_info(self, length, time_costs):
        """Account *length* computed frames taking *time_costs* seconds."""
        self._frame_compute_frames += length
        self._frame_compute_time += time_costs

    def collect_io_frame_info(self, length, time_costs):
        """Account *length* frames of IO taking *time_costs* seconds."""
        self._frame_io_frames += length
        self._frame_io_time += time_costs

    def collect_wait_time_info(self, time_costs):
        """Accumulate time spent blocked (e.g. on a downstream queue)."""
        self._wait_time += time_costs

    def collect_seq_info(self, length, time_costs):
        """Account *length* sequences produced in *time_costs* seconds."""
        self._seq_num += length
        self._seq_time += time_costs

    @abstractmethod
    def _next(self):
        """Produce the next item or raise StopIteration."""
        raise NotImplementedError("Subclasses should implement this method.")

    def _log_statistics(self):
        # Emit one summary line per non-empty statistics bucket.
        name = self.__class__.__name__
        log = self.logger.info
        log(f"{name}: Next method called {self._next_calls} times, total time: {self._next_total_time:.6f} seconds")
        if self._init_time_costs > 0:
            log(f"{name}: Init time: {self._init_time_costs:.6f} seconds, init {self._init_times} times")
        if self._frame_compute_time > 0:
            avg = self._frame_compute_time / self._frame_compute_frames
            log(
                f"{name}: compute frame num: {self._frame_compute_frames}, total time:"
                f" {self._frame_compute_time:.6f} seconds, average time: {avg:.6f} seconds per frame"
            )
        if self._frame_io_frames > 0:
            avg = self._frame_io_time / self._frame_io_frames
            log(
                f"{name}: io frame num: {self._frame_io_frames}, total time: {self._frame_io_time:.6f} seconds,"
                f" average time: {avg:.6f} seconds per frame"
            )
        if self._wait_time > 0:
            log(f"{name}: wait time: {self._wait_time:.6f} seconds")
        if self._seq_time > 0:
            avg = self._seq_time / self._seq_num
            log(
                f"{name}: seq num: {self._seq_num:.6f}, total time: {self._seq_time:.6f} seconds, average time:"
                f" {avg:.6f} seconds per sequence"
            )

View File

@@ -0,0 +1,119 @@
import os
import cv2
import imageio
import numpy as np
from nimbus.components.data.camera import Camera
class Observations:
    """
    Represents a single observation of a scene, which may include multiple camera trajectories and associated data.
    Each observation is identified by a unique name and index, and can contain multiple Camera items that capture
    different viewpoints or modalities of the same scene.
    Args:
        scene_name (str): The name of the scene associated with this observation.
        index (str): The index or ID of this observation within the scene.
        length (int): Optional total length of the observation. Calculated from camera trajectories if not provided.
        data (dict): Optional dictionary for storing additional arbitrary data, such as metadata or annotations.
    """
    def __init__(self, scene_name: str, index: str, length: int = None, data: dict = None):
        self.scene_name = scene_name
        # Unique identifier: "<scene_name>_<index>".
        self.obs_name = scene_name + "_" + index
        self.index = index
        # Camera objects contributing frames to this observation (see append_cam).
        self.cam_items = []
        # Total frame count; lazily derived in __len__ when not supplied.
        self.length = length
        self.data = data
    def __getstate__(self):
        """Return a plain copy of the instance dict for pickling."""
        state = self.__dict__.copy()
        return state
    def __setstate__(self, state):
        """Restore the instance dict from *state*."""
        self.__dict__.update(state)
    def append_cam(self, item: Camera):
        """Add one Camera to this observation."""
        self.cam_items.append(item)
    def __len__(self):
        """Return the total frame count, summing per-camera lengths and caching on first call."""
        if self.length is not None:
            return self.length
        self.length = 0
        for cam in self.cam_items:
            self.length += len(cam)
        return self.length
    def get_length(self):
        """Alias for len(self)."""
        return len(self)
    def flush_to_disk(self, path, video_fps=10):
        """Write all camera visualization data under <path>/trajectory_<index>.

        A single camera is written into the trajectory root; with multiple
        cameras each one gets its own camera_<idx>/ subdirectory.
        """
        path_to_save = os.path.join(path, "trajectory_" + self.index)
        print(f"obs {self.obs_name} try to save path in {path_to_save}")
        os.makedirs(path_to_save, exist_ok=True)
        # Single camera: save in root directory
        if len(self.cam_items) == 1:
            cam = self.cam_items[0]
            self._save_camera_data(path_to_save, cam, video_fps)
        # Multiple cameras: save in camera_0/, camera_1/, etc.
        else:
            for idx, cam in enumerate(self.cam_items):
                camera_dir = os.path.join(path_to_save, f"camera_{idx}")
                os.makedirs(camera_dir, exist_ok=True)
                self._save_camera_data(camera_dir, cam, video_fps)
    def _save_camera_data(self, save_dir, cam: Camera, video_fps):
        """Helper method to save camera visualization data (rgbs, depths) to a directory."""
        # Save RGB and depth images if available
        if cam.rgbs is not None and len(cam.rgbs) > 0:
            rgb_images_path = os.path.join(save_dir, "rgb/")
            os.makedirs(rgb_images_path, exist_ok=True)
            fps_path = os.path.join(save_dir, "fps.mp4")
            for idx, rgb_item in enumerate(cam.rgbs):
                rgb_filename = os.path.join(rgb_images_path, f"{idx}.jpg")
                # NOTE(review): cv2.imwrite expects BGR; COLOR_BGR2RGB and
                # COLOR_RGB2BGR are the same channel swap, so this is correct
                # iff cam.rgbs holds RGB frames -- confirm the upstream format.
                cv2.imwrite(rgb_filename, cv2.cvtColor(rgb_item, cv2.COLOR_BGR2RGB))
            imageio.mimwrite(fps_path, cam.rgbs, fps=video_fps)
        if cam.depths is not None and len(cam.depths) > 0:
            depth_images_path = os.path.join(save_dir, "depth/")
            os.makedirs(depth_images_path, exist_ok=True)
            depth_path = os.path.join(save_dir, "depth.mp4")
            # Create a copy for video (8-bit version)
            depth_video_frames = []
            for idx, depth_item in enumerate(cam.depths):
                depth_filename = os.path.join(depth_images_path, f"{idx}.png")
                cv2.imwrite(depth_filename, depth_item)
                # Keep the high byte for the 8-bit preview video; assumes 16-bit
                # integer depth values -- TODO confirm dtype.
                depth_video_frames.append((depth_item >> 8).astype(np.uint8))
            imageio.mimwrite(depth_path, depth_video_frames, fps=video_fps)
        # Save UV tracking visualizations if available
        if cam.uv_tracks is not None and cam.uv_mesh_names is not None and cam.rgbs is not None:
            num_frames = len(cam.rgbs)
            try:
                # Imported lazily: only required when UV tracks are present.
                from nimbus_extension.components.render.brpc_utils.point_tracking import (
                    make_uv_overlays_and_video,
                )
            except ImportError as e:
                raise ImportError(
                    "UV tracking visualization requires nimbus_extension. "
                    "Please add `import nimbus_extension` before running the pipeline."
                ) from e
            make_uv_overlays_and_video(
                cam.rgbs,
                cam.uv_tracks,
                cam.uv_mesh_names,
                start_frame=0,
                end_frame=num_frames,
                fps=video_fps,
                path_to_save=save_dir,
            )

View File

@@ -0,0 +1,39 @@
import pickle
class Package:
    """
    Serializable envelope passed between pipeline stages.

    Args:
        data: The payload, which can be of any type.
        task_id (int): ID of the task this package belongs to.
        task_name (str): Name of the task this package belongs to.
        stop_sig (bool): When True, the package tells the pipeline to stop.
    """

    def __init__(self, data, task_id: int = -1, task_name: str = None, stop_sig: bool = False):
        self.data = data
        self.task_id = task_id
        self.task_name = task_name
        self.stop_sig = stop_sig
        # Tracks whether self.data currently holds pickled bytes.
        self.is_ser = False

    def serialize(self):
        """Pickle the payload in place; calling twice is a programming error."""
        assert self.is_ser is False, "data is already serialized"
        self.data = pickle.dumps(self.data)
        self.is_ser = True

    def deserialize(self):
        """Unpickle the payload in place.

        NOTE: pickle must only be used on trusted, in-process data.
        """
        assert self.is_ser is True, "data is already deserialized"
        self.data = pickle.loads(self.data)
        self.is_ser = False

    def is_serialized(self):
        """Return True while the payload is held as pickled bytes."""
        return self.is_ser

    def get_data(self):
        """Return the payload (pickled bytes while serialized)."""
        return self.data

    def should_stop(self):
        """Return True when this package is a stop signal."""
        return self.stop_sig is True

View File

@@ -0,0 +1,69 @@
class Scene:
    """
    Holds the workflow context and execution state of one loaded scene.

    Args:
        name (str): The name of the scene or task.
        pcd: Point cloud data associated with the scene.
        scale (float): Scale factor for the scene geometry.
        materials: Material data for the scene.
        textures: Texture data for the scene.
        floor_heights: Floor height information for the scene.
        wf: The task workflow instance managing this scene.
        task_id (int): The index of the current task within the workflow.
        task_exec_num (int): The execution count for the current task, used for task repetition tracking.
        simulation_app: The Isaac Sim SimulationApp instance.
    """

    def __init__(
        self,
        name: str = None,
        pcd=None,
        scale: float = 1.0,
        materials=None,
        textures=None,
        floor_heights=None,
        wf=None,
        task_id: int = None,
        task_exec_num: int = 1,
        simulation_app=None,
    ):
        # Identity and geometry.
        self.name = name
        self.pcd = pcd
        self.scale = scale
        # Appearance assets.
        self.materials = materials
        self.textures = textures
        self.floor_heights = floor_heights
        # Workflow / execution context.
        self.wf = wf
        self.task_id = task_id
        self.task_exec_num = task_exec_num
        self.simulation_app = simulation_app
        # Mutable planning state.
        self.plan_info = None
        self.generate_success = False

    def __getstate__(self):
        # The point cloud is excluded from pickling; receivers must reload it.
        state = dict(self.__dict__)
        state.pop("pcd")
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # pcd was stripped by __getstate__, so restore the attribute as empty.
        self.pcd = None

    def add_plan_info(self, plan_info):
        """Attach planner output for this scene."""
        self.plan_info = plan_info

    def flush_to_disk(self, path):
        """Persistence hook; Scene currently keeps no on-disk representation."""

    def load_from_disk(self, path):
        """Persistence hook; Scene currently keeps no on-disk representation."""

    def update_generate_status(self, success):
        """Record whether the latest generation attempt succeeded."""
        self.generate_success = success

    def get_generate_status(self):
        """Return the success flag of the latest generation attempt."""
        return self.generate_success

    def update_task_exec_num(self, num):
        """Set the execution counter for the current task."""
        self.task_exec_num = num

View File

@@ -0,0 +1,145 @@
import json
import os
import numpy as np
import open3d as o3d
from nimbus.components.data.camera import C2W, Camera
class Sequence:
    """
    Represents a camera trajectory sequence with associated metadata.

    Args:
        scene_name (str): The name of the scene (e.g., room identifier).
        index (str): The index or ID of this sequence within the scene.
        length (int): Optional explicit sequence length. Calculated from camera trajectories if not provided.
        data (dict): Optional additional arbitrary data associated with the sequence.
    """

    def __init__(self, scene_name: str, index: str, length: int = None, data: dict = None):
        self.scene_name = scene_name
        self.seq_name = scene_name + "_" + index
        self.index = index
        self.cam_items: list[Camera] = []
        # Optional open3d point cloud describing the planned path; set via update_pcd.
        self.path_pcd = None
        self.length = length
        self.data = data

    def __getstate__(self):
        state = self.__dict__.copy()
        # open3d point clouds are not picklable: serialize the geometry to bytes
        # and stash the colors separately ("mem::xyz" keeps coordinates only).
        # BUGFIX: guard against path_pcd being unset -- it stays None until
        # update_pcd() is called, and the previous unconditional attribute
        # access made pickling crash for any sequence without a path cloud.
        if state["path_pcd"] is not None:
            state["path_pcd_color"] = np.asarray(state["path_pcd"].colors)
            state["path_pcd"] = o3d.io.write_point_cloud_to_bytes(state["path_pcd"], "mem::xyz")
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Mirror __getstate__: only rebuild the point cloud when one was shipped.
        if state.get("path_pcd") is not None:
            self.path_pcd = o3d.io.read_point_cloud_from_bytes(state["path_pcd"], "mem::xyz")
            self.path_pcd.colors = o3d.utility.Vector3dVector(state["path_pcd_color"])

    def __len__(self):
        """Return the total frame count, summing per-camera lengths and caching on first call."""
        if self.length is not None:
            return self.length
        self.length = 0
        for cam in self.cam_items:
            self.length += len(cam)
        return self.length

    def append_cam(self, item: Camera):
        """Add one Camera to this sequence."""
        self.cam_items.append(item)

    def update_pcd(self, path_pcd):
        """Attach the path point cloud for this sequence."""
        self.path_pcd = path_pcd

    def get_length(self):
        """Alias for len(self)."""
        return len(self)

    @staticmethod
    def _camera_save_dict(cam: Camera) -> dict:
        """Build the JSON-serializable description of one camera."""
        return {
            "camera_intrinsic": cam.intrinsic,
            "camera_extrinsic": cam.extrinsic,
            "camera_trajectory": [t.matrix for t in cam.trajectory],
        }

    def _write_camera_json(self, cam: Camera, out_dir):
        """Write one camera's data.json into *out_dir*."""
        traj_path = os.path.join(out_dir, "data.json")
        json_object = json.dumps(self._camera_save_dict(cam), indent=4)
        with open(traj_path, "w", encoding="utf-8") as outfile:
            outfile.write(json_object)

    def flush_to_disk(self, path):
        """Persist the sequence under <path>/trajectory_<index>.

        Single-camera sequences store data.json in the trajectory root;
        multi-camera sequences use camera_0/, camera_1/, ... subdirectories.
        """
        path_to_save = os.path.join(path, "trajectory_" + self.index)
        print(f"seq {self.seq_name} try to save path in {path_to_save}")
        os.makedirs(path_to_save, exist_ok=True)
        if self.path_pcd is not None:
            pcd_path = os.path.join(path_to_save, "path.ply")
            o3d.io.write_point_cloud(pcd_path, self.path_pcd)
        # Single camera: save in root directory
        if len(self.cam_items) == 1:
            self._write_camera_json(self.cam_items[0], path_to_save)
        # Multiple cameras: save in camera_0/, camera_1/, etc.
        else:
            for idx, cam in enumerate(self.cam_items):
                camera_dir = os.path.join(path_to_save, f"camera_{idx}")
                os.makedirs(camera_dir, exist_ok=True)
                self._write_camera_json(cam, camera_dir)

    @staticmethod
    def _camera_from_json(json_path) -> Camera:
        """Load one Camera from a data.json file written by flush_to_disk."""
        with open(json_path, "r", encoding="utf-8") as infile:
            data = json.load(infile)
        trajectory = [C2W(matrix=m) for m in data["camera_trajectory"]]
        return Camera(
            trajectory=trajectory,
            intrinsic=data.get("camera_intrinsic"),
            extrinsic=data.get("camera_extrinsic"),
        )

    def load_from_disk(self, path):
        """Restore the sequence saved by flush_to_disk from *path*.

        Raises:
            AssertionError: if no camera data is found under *path*.
        """
        print(f"seq {self.seq_name} try to load path from {path}")
        pcd_path = os.path.join(path, "path.ply")
        if os.path.exists(pcd_path):
            self.path_pcd = o3d.io.read_point_cloud(pcd_path)
        # Clear existing camera items
        self.cam_items = []
        # Check if single camera format (data.json in root)
        traj_path = os.path.join(path, "data.json")
        if os.path.exists(traj_path):
            self.cam_items.append(self._camera_from_json(traj_path))
        else:
            # Multiple camera format (camera_0/, camera_1/, etc.)
            idx = 0
            while True:
                camera_json = os.path.join(path, f"camera_{idx}", "data.json")
                if not os.path.exists(camera_json):
                    break
                self.cam_items.append(self._camera_from_json(camera_json))
                idx += 1
        assert len(self.cam_items) > 0, f"No camera data found in {path}"

View File

@@ -0,0 +1,7 @@
from nimbus.components.data.iterator import Iterator
# Global registry mapping de-dumper type names to their Iterator classes.
dedumper_dict = {}
def register(type_name: str, cls: Iterator):
    """Register *cls* as the de-dumper implementation for *type_name*."""
    dedumper_dict[type_name] = cls

View File

@@ -0,0 +1,7 @@
from .base_dumper import BaseDumper
# Global registry mapping dumper type names to their BaseDumper classes.
dumper_dict = {}
def register(type_name: str, cls: BaseDumper):
    """Register *cls* as the dumper implementation for *type_name*."""
    dumper_dict[type_name] = cls

View File

@@ -0,0 +1,82 @@
import time
from abc import abstractmethod
from pympler import asizeof
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.package import Package
from nimbus.utils.utils import unpack_iter_data
class BaseDumper(Iterator):
    """Pipeline stage that dumps generated (scene, seq, obs) items and, in
    pipeline mode, forwards the dumped object downstream as a serialized Package.

    Args:
        data_iter: Upstream iterator yielding items unpackable via unpack_iter_data.
        output_queue: Queue for pipeline mode; when None, dump() results stay local.
        max_queue_num (int): Back-pressure limit; this stage sleeps while the
            queue holds at least this many packages.
    """
    def __init__(self, data_iter, output_queue, max_queue_num=1):
        super().__init__()
        self.data_iter = data_iter
        # Scene currently being processed; used to detect scene changes.
        self.scene = None
        self.output_queue = output_queue
        # Per-scene bookkeeping, reset whenever the scene changes.
        self.total_case = 0
        self.success_case = 0
        self.max_queue_num = max_queue_num
    def __iter__(self):
        return self
    def _next(self):
        # NOTE(review): the success path falls through without a return, so this
        # stage always yields None -- consumers appear to drive it for side effects.
        try:
            data = next(self.data_iter)
            scene, seq, obs = unpack_iter_data(data)
            self.total_case += 1
            if scene is not None:
                # A new (task_id, name, task_exec_num) combination marks a scene
                # change: log the finished scene's success rate, reset counters.
                if self.scene is not None and (
                    scene.task_id != self.scene.task_id
                    or scene.name != self.scene.name
                    or scene.task_exec_num != self.scene.task_exec_num
                ):
                    self.logger.info(
                        f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                    )
                    # Starts at 1 because the current item was already counted above.
                    self.total_case = 1
                    self.success_case = 0
                self.scene = scene
            if obs is None and seq is None:
                # Upstream produced nothing for this item: count a failure and skip.
                self.logger.info(f"generate failed, skip once! success rate: {self.success_case}/{self.total_case}")
                if self.scene is not None:
                    self.scene.update_generate_status(success=False)
                return None
            io_start_time = time.time()
            if self.output_queue is not None:
                obj = self.dump(seq, obs)
                pack = Package(obj, task_id=scene.task_id, task_name=scene.name)
                pack.serialize()
                # Back-pressure: block while the downstream queue is full.
                wait_time = time.time()
                while self.output_queue.qsize() >= self.max_queue_num:
                    time.sleep(1)
                end_time = time.time()
                self.collect_wait_time_info(end_time - wait_time)
                st = time.time()
                self.output_queue.put(pack)
                ed = time.time()
                # NOTE(review): asizeof walks the whole object graph -- this log line is expensive.
                self.logger.info(f"put time: {ed - st}, data size: {asizeof.asizeof(obj)}")
            else:
                obj = self.dump(seq, obs)
            self.success_case += 1
            self.scene.update_generate_status(success=True)
            self.collect_seq_info(1, time.time() - io_start_time)
        except StopIteration:
            # Upstream exhausted: send the stop signal downstream and log final stats.
            if self.output_queue is not None:
                pack = Package(None, stop_sig=True)
                self.output_queue.put(pack)
            if self.scene is not None:
                self.logger.info(
                    f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                )
            raise StopIteration("no data")
        except Exception as e:
            self.logger.exception(f"Error during data dumping: {e}")
            raise e
    @abstractmethod
    def dump(self, seq, obs):
        """Persist/convert (seq, obs); return the object to forward downstream."""
        raise NotImplementedError("This method should be overridden by subclasses")

View File

@@ -0,0 +1,16 @@
# flake8: noqa: F401
# pylint: disable=C0413
from .base_randomizer import LayoutRandomizer
from .base_scene_loader import SceneLoader
# Global registries mapping config type names to scene-loader and
# layout-randomizer implementations.
scene_loader_dict = {}
layout_randomizer_dict = {}
def register_loader(type_name: str, cls: SceneLoader):
    """Register *cls* as the scene-loader implementation for *type_name*."""
    scene_loader_dict[type_name] = cls
def register_randomizer(type_name: str, cls: LayoutRandomizer):
    """Register *cls* as the layout-randomizer implementation for *type_name*."""
    layout_randomizer_dict[type_name] = cls

View File

@@ -0,0 +1,72 @@
import sys
import time
from abc import abstractmethod
from typing import Optional
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.scene import Scene
from nimbus.daemon.decorators import status_monitor
class LayoutRandomizer(Iterator):
    """
    Base class for layout randomization in a scene. This class defines the structure for randomizing scenes and
    tracking the randomization process. It manages the current scene, randomization count, and provides hooks for
    subclasses to implement specific randomization logic.

    Args:
        scene_iter (Iterator): An iterator that provides scenes to be randomized.
        random_num (int): The number of randomizations to perform for each scene before moving to the next one.
        strict_mode (bool): If True, the randomizer will check the generation status of the current scene and retry
                            randomization if it was not successful. This ensures that only successfully generated
                            scenes are counted towards the randomization limit.

    Raises:
        ValueError: If random_num is not a positive integer.
    """

    def __init__(self, scene_iter: Iterator, random_num: int, strict_mode: bool = False):
        super().__init__()
        if random_num <= 0:
            # BUGFIX: a non-positive budget previously made _next() fall
            # through both branches and silently yield None forever; fail fast.
            raise ValueError("random_num must be a positive integer")
        self.scene_iter = scene_iter
        self.random_num = random_num
        self.strict_mode = strict_mode
        # Start past the budget so the first _next() fetches a scene.
        self.cur_index = sys.maxsize
        self.scene: Optional[Scene] = None

    def reset(self, scene):
        """Make *scene* current and restart its randomization counter."""
        self.cur_index = 0
        self.scene = scene

    def _fetch_next_scene(self):
        # Propagates StopIteration when the upstream iterator is exhausted.
        scene = next(self.scene_iter)
        self.reset(scene)

    @status_monitor()
    def _randomize_with_status(self, scene) -> Scene:
        # FIX: previously ignored its argument and always read self.scene;
        # use the argument so the wrapper behaves as its signature advertises
        # (all call sites pass self.scene, so behavior is unchanged).
        return self.randomize_scene(scene)

    def _next(self) -> Scene:
        try:
            if self.strict_mode and self.scene is not None:
                # Retry the current scene (without consuming budget) until it
                # has produced at least one successful generation.
                if not self.scene.get_generate_status():
                    self.logger.info("strict_mode is open, retry the randomization to generate sequence.")
                    st = time.time()
                    scene = self._randomize_with_status(self.scene)
                    self.collect_seq_info(1, time.time() - st)
                    return scene
            if self.cur_index >= self.random_num:
                self._fetch_next_scene()
            if self.cur_index < self.random_num:
                st = time.time()
                scene = self._randomize_with_status(self.scene)
                self.collect_seq_info(1, time.time() - st)
                self.cur_index += 1
                return scene
        except StopIteration:
            raise StopIteration("No more scenes to randomize.")
        except Exception as e:
            self.logger.exception(f"Error during scene idx {self.cur_index} randomization: {e}")
            self.cur_index += 1
            raise e

    @abstractmethod
    def randomize_scene(self, scene) -> Scene:
        """Return a randomized variant of *scene*; implemented by subclasses."""
        raise NotImplementedError("This method should be overridden by subclasses")

View File

@@ -0,0 +1,41 @@
from abc import abstractmethod
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.scene import Scene
class SceneLoader(Iterator):
    """
    Base class for scene loading in a simulation environment. Subclasses
    implement load_asset(); this base wires it into the instrumented iterator
    protocol and standardizes error logging while tracking the loading process.

    Args:
        pack_iter (Iterator): An iterator that provides packages containing scene information to be loaded.
    """

    def __init__(self, pack_iter):
        super().__init__()
        self.pack_iter = pack_iter

    def _next(self) -> Scene:
        try:
            return self.load_asset()
        except StopIteration:
            raise StopIteration("No more scenes to load.")
        except Exception as exc:
            self.logger.exception(f"Error during scene loading: {exc}")
            raise exc

    @abstractmethod
    def load_asset(self) -> Scene:
        """
        Abstract method to load and initialize a scene.

        Subclasses must implement this method to define the specific logic for
        creating and configuring a scene object based on the current state of
        the iterator.

        Returns:
            Scene: A fully initialized Scene object.
        """
        raise NotImplementedError("This method must be implemented by subclasses")

View File

@@ -0,0 +1,7 @@
from nimbus.components.data.iterator import Iterator
# Global registry mapping plan-with-render type names to Iterator classes.
plan_with_render_dict = {}
def register(type_name: str, cls: Iterator):
    """Register *cls* as the plan-with-render implementation for *type_name*."""
    plan_with_render_dict[type_name] = cls

View File

@@ -0,0 +1,7 @@
from .base_seq_planner import SequencePlanner
# Global registry mapping sequence-planner type names to SequencePlanner classes.
seq_planner_dict = {}
def register(type_name: str, cls: SequencePlanner):
    """Register *cls* as the sequence-planner implementation for *type_name*."""
    seq_planner_dict[type_name] = cls

View File

@@ -0,0 +1,102 @@
import sys
import time
from abc import abstractmethod
from typing import Optional
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
from nimbus.daemon.decorators import status_monitor
from nimbus.utils.flags import is_debug_mode
from nimbus.utils.types import ARGS, TYPE
from .planner import path_planner_dict
class SequencePlanner(Iterator):
    """
    A base class for sequence planning in a simulation environment. This class defines the structure for generating
    sequences based on scenes and tracking the planning process. It manages the current scene, episode count
    and provides hooks for subclasses to implement specific sequence generation logic.
    Args:
        scene_iter (Iterator): An iterator that provides scenes to be processed for sequence planning.
        planner_cfg (dict): A dictionary containing configuration parameters for the planner,
                            such as the type of planner to use and its arguments.
        episodes (int): The number of episodes to generate for each scene before moving to the next one. Default is 1.
    """
    def __init__(self, scene_iter: Iterator[Scene], planner_cfg: dict, episodes: int = 1):
        super().__init__()
        self.scene_iter = scene_iter
        self.planner_cfg = planner_cfg
        self.episodes = episodes
        # Start beyond the budget so the first _next() pulls a fresh scene.
        self.current_episode = sys.maxsize
        self.scene: Optional[Scene] = None
        # NOTE(review): self.planner is first assigned in _initialize(), which runs
        # via initialize() only after a scene has been fetched -- not here.
    @status_monitor()
    def _plan_with_status(self) -> Optional[Sequence]:
        # Thin wrapper so the daemon status monitor can observe planning.
        seq = self.generate_sequence()
        return seq
    def _next(self) -> tuple[Scene, Sequence]:
        try:
            # Fetch a new scene once the per-scene episode budget is spent.
            if self.scene is None or self.current_episode >= self.episodes:
                try:
                    self.scene = next(self.scene_iter)
                    self.current_episode = 0
                    if self.scene is None:
                        return None, None
                    self.initialize(self.scene)
                except StopIteration:
                    raise StopIteration("No more scene to process.")
                except Exception as e:
                    self.logger.exception(f"Error loading next scene: {e}")
                    if is_debug_mode():
                        raise e
                    # Force a fresh scene fetch on the next call.
                    self.current_episode = sys.maxsize
                    return None, None
            # Retry generation until a sequence is produced or the budget runs out.
            while True:
                compute_start_time = time.time()
                seq = self._plan_with_status()
                compute_end_time = time.time()
                self.current_episode += 1
                if seq is not None:
                    self.collect_compute_frame_info(seq.get_length(), compute_end_time - compute_start_time)
                    return self.scene, seq
                if self.current_episode >= self.episodes:
                    return self.scene, None
                self.logger.info(f"Generate seq failed and retry. Current episode id is {self.current_episode}")
        except StopIteration:
            raise StopIteration("No more scene to process.")
        except Exception as e:
            scene_name = getattr(self.scene, "name", "<unknown>")
            self.logger.exception(
                f"Error during idx {self.current_episode} sequence generation for scene {scene_name}: {e}"
            )
            if is_debug_mode():
                raise e
            # Outside debug mode a failed episode is consumed and reported as empty.
            self.current_episode += 1
            return self.scene, None
    @abstractmethod
    def generate_sequence(self) -> Optional[Sequence]:
        """Produce one Sequence for the current scene, or None on failure."""
        raise NotImplementedError("This method should be overridden by subclasses")
    def _initialize(self, scene):
        # Instantiate the configured path planner for this scene, if any.
        if self.planner_cfg is not None:
            self.logger.info(f"init {self.planner_cfg[TYPE]} planner in seq_planner")
            self.planner = path_planner_dict[self.planner_cfg[TYPE]](scene, **self.planner_cfg.get(ARGS, {}))
        else:
            self.planner = None
            self.logger.info("planner config is None in seq_planner and skip initialize")
    def initialize(self, scene):
        """Initialize per-scene planner state and record the time spent."""
        init_start_time = time.time()
        self._initialize(scene)
        self.record_init_time(time.time() - init_start_time)

View File

@@ -0,0 +1,5 @@
# Global registry mapping path-planner type names to their classes.
path_planner_dict = {}


def register(type_name: str, cls):
    """Register *cls* as the path-planner implementation for *type_name*."""
    path_planner_dict[type_name] = cls

View File

@@ -0,0 +1,7 @@
from .base_renderer import BaseRenderer
# Global registry mapping renderer type names to BaseRenderer classes.
renderer_dict = {}
def register(type_name: str, cls: BaseRenderer):
    """Register *cls* as the renderer implementation for *type_name*."""
    renderer_dict[type_name] = cls

View File

@@ -0,0 +1,80 @@
import time
from abc import abstractmethod
from typing import Optional
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
from nimbus.daemon.decorators import status_monitor
class BaseRenderer(Iterator):
    """
    Base class for renderers. Consumes (scene, sequence) pairs, re-initializes
    its rendering resources whenever the scene changes, and delegates
    observation generation to generate_obs() implemented by subclasses.

    Args:
        scene_seq_iter (Iterator): An iterator that provides pairs of scenes and sequences to be rendered. Each item
                                   from the iterator should be a tuple containing a scene and its corresponding sequence.
    """

    def __init__(self, scene_seq_iter: Iterator[tuple[Scene, Sequence]]):
        super().__init__()
        self.scene_seq_iter = scene_seq_iter
        self.scene: Optional[Scene] = None

    @status_monitor()
    def _generate_obs_with_status(self, seq) -> Optional[Observations]:
        t0 = time.time()
        obs = self.generate_obs(seq)
        t1 = time.time()
        if obs is not None:
            self.collect_compute_frame_info(len(obs), t1 - t0)
        return obs

    def _scene_changed(self, scene) -> bool:
        # A scene counts as changed when either its task id or its name differs.
        return scene.task_id != self.scene.task_id or scene.name != self.scene.name

    def _next(self):
        try:
            scene, seq = next(self.scene_seq_iter)
            if scene is not None:
                if self.scene is None:
                    self.reset(scene)
                elif self._scene_changed(scene):
                    self.logger.info(f"Scene changed: {self.scene.name} -> {scene.name}")
                    self.reset(scene)
            if seq is None:
                return scene, None, None
            obs = self._generate_obs_with_status(seq)
            return (scene, None, None) if obs is None else (scene, seq, obs)
        except StopIteration:
            raise StopIteration("No more sequences to process.")
        except Exception as exc:
            self.logger.exception(f"Error during rendering: {exc}")
            raise exc

    @abstractmethod
    def generate_obs(self, seq) -> Optional[Observations]:
        """Render *seq* into Observations, or return None on failure."""
        raise NotImplementedError("This method should be overridden by subclasses")

    @abstractmethod
    def _lazy_init(self):
        """Acquire rendering resources for the current scene."""
        raise NotImplementedError("This method should be overridden by subclasses")

    @abstractmethod
    def _close_resource(self):
        """Release rendering resources held for the previous scene."""
        raise NotImplementedError("This method should be overridden by subclasses")

    def reset(self, scene):
        """Swap in *scene*: tear down old resources and lazily re-initialize."""
        try:
            self.scene = scene
            self._close_resource()
            t0 = time.time()
            self._lazy_init()
            self.record_init_time(time.time() - t0)
        except Exception as exc:
            self.logger.exception(f"Error initializing renderer: {exc}")
            self.scene = None
            raise exc

View File

@@ -0,0 +1,7 @@
from .base_writer import BaseWriter
# Global registry mapping writer type names to BaseWriter classes.
writer_dict = {}
def register(type_name: str, cls: BaseWriter):
    """Register *cls* as the writer implementation for *type_name*."""
    writer_dict[type_name] = cls

View File

@@ -0,0 +1,163 @@
import time
from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor
from copy import copy
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
from nimbus.daemon import ComponentStatus, StatusReporter
from nimbus.utils.flags import is_debug_mode
from nimbus.utils.utils import unpack_iter_data
def run_batch(func, args):
    """Invoke *func* once per argument tuple in *args*, unpacking each tuple."""
    for packed in args:
        func(*packed)
class BaseWriter(Iterator):
    """
    A base class for writing generated sequences and observations to disk. This class defines the structure for
    writing data and tracking the writing process. It manages the current scene, success and total case counts,
    and provides hooks for subclasses to implement specific data writing logic. The writer supports both synchronous
    and asynchronous batch writing modes, allowing for efficient data handling in various scenarios.

    Args:
        data_iter (Iterator): An iterator that provides data to be written, typically containing scenes,
            sequences, and observations.
        seq_output_dir (str): The directory where generated sequences will be saved. Can be None
            if sequence output is not needed.
        obs_output_dir (str): The directory where generated observations will be saved. Can be None
            if observation output is not needed.
        batch_async (bool): If True, the writer will use asynchronous batch writing to improve performance
            when handling large amounts of data. Default is True.
        async_threshold (int): The maximum number of asynchronous write operations that can be in progress
            at the same time. If the threshold is reached, the writer will wait for the oldest operation
            to complete before starting a new one. Default is 1.
        batch_size (int): The number of data items to write in each batch when using asynchronous writing.
            Default is 2, and it will be capped at 8 to prevent potential issues with too many concurrent operations.
    """

    def __init__(
        self,
        data_iter: Iterator[tuple[Scene, Sequence, Observations]],
        seq_output_dir: str,
        obs_output_dir: str,
        batch_async: bool = True,
        async_threshold: int = 1,
        batch_size: int = 2,
    ):
        super().__init__()
        assert (
            seq_output_dir is not None or obs_output_dir is not None
        ), "At least one output directory must be provided"
        self.data_iter = data_iter
        self.seq_output_dir = seq_output_dir
        self.obs_output_dir = obs_output_dir
        self.scene = None  # scene currently being written; updated on every consumed item
        self.async_mode = batch_async
        # Cap the batch size at 8; the log below warns that larger batches risk hanging.
        self.batch_size = batch_size if batch_size <= 8 else 8
        # NOTE(review): self.logger is presumably provided by the Iterator base — confirm.
        if batch_async and batch_size > self.batch_size:
            self.logger.info("Batch size is larger than 8(probably cause program hang), batch size will be set to 8")
        self.async_threshold = async_threshold
        # Pool sized so that roughly 64 items can be in flight across batches.
        self.flush_executor = ThreadPoolExecutor(max_workers=max(1, 64 // self.batch_size))
        self.flush_threads = []  # outstanding Futures for in-flight batch flushes
        self.data_buffer = []  # pending (workflow_copy, scene_name, seq, obs) tuples
        self.logger.info(
            f"Batch Async Write Mode: {self.async_mode}, async threshold: {self.async_threshold}, batch size:"
            f" {self.batch_size}"
        )
        self.total_case = 0  # cases attempted for the current scene
        self.success_case = 0  # cases written successfully for the current scene
        self.last_scene_key = None  # (task_id, name, task_exec_num) of the previous item's scene
        self.status_reporter = StatusReporter(self.__class__.__name__)

    def _next(self):
        """Consume one (scene, seq, obs) item and write it (sync) or buffer it (async).

        Returns None for every regular item; raises StopIteration (after
        flushing any buffered data) once the upstream iterator is exhausted.
        """
        try:
            data = next(self.data_iter)
            scene, seq, obs = unpack_iter_data(data)
            # A scene is identified by (task_id, name, task_exec_num); a key
            # change means the previous scene finished generating.
            new_key = (scene.task_id, scene.name, scene.task_exec_num) if scene is not None else None
            self.scene = scene
            if new_key != self.last_scene_key:
                if self.scene is not None and self.last_scene_key is not None:
                    self.logger.info(
                        f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                    )
                # Reset per-scene counters for the new scene.
                self.success_case = 0
                self.total_case = 0
                self.last_scene_key = new_key
            if self.scene is None:
                return None
            self.total_case += 1
            self.status_reporter.update_status(ComponentStatus.RUNNING)
            if seq is None and obs is None:
                # Upstream failed to produce this case: record the failure and move on.
                self.logger.info(f"generate failed, skip once! success rate: {self.success_case}/{self.total_case}")
                self.scene.update_generate_status(success=False)
                return None
            scene_name = self.scene.name
            io_start_time = time.time()
            if self.async_mode:
                # Copy the workflow now: the scene object may mutate before the
                # background flush actually runs.
                cp_start_time = time.time()
                cp = copy(self.scene.wf)
                cp_end_time = time.time()
                if self.scene.wf is not None:
                    self.logger.info(f"Scene {scene_name} workflow copy time: {cp_end_time - cp_start_time:.2f}s")
                self.data_buffer.append((cp, scene_name, seq, obs))
                if len(self.data_buffer) >= self.batch_size:
                    # Drop finished futures, then enforce the in-flight limit by
                    # blocking on the oldest outstanding flush.
                    self.flush_threads = [t for t in self.flush_threads if not t.done()]
                    if len(self.flush_threads) >= self.async_threshold:
                        self.logger.info("Max async workers reached, waiting for the oldest thread to finish")
                        self.flush_threads[0].result()
                        self.flush_threads = self.flush_threads[1:]
                    to_flush_buffer = self.data_buffer.copy()
                    async_flush = self.flush_executor.submit(run_batch, self.flush_to_disk, to_flush_buffer)
                    if is_debug_mode():
                        async_flush.result()  # surface exceptions immediately in debug mode
                    self.flush_threads.append(async_flush)
                    self.data_buffer = []
                flush_length = len(obs) if obs is not None else len(seq)
            else:
                flush_length = self.flush_to_disk(self.scene.wf, scene_name, seq, obs)
            self.success_case += 1
            self.scene.update_generate_status(success=True)
            # NOTE(review): collect_io_frame_info is presumably inherited from Iterator — confirm.
            self.collect_io_frame_info(flush_length, time.time() - io_start_time)
            self.status_reporter.update_status(ComponentStatus.COMPLETED)
            return None
        except StopIteration:
            if self.async_mode:
                # Flush whatever is still buffered, then wait for every
                # outstanding background flush before finishing.
                if len(self.data_buffer) > 0:
                    async_flush = self.flush_executor.submit(run_batch, self.flush_to_disk, self.data_buffer)
                    self.flush_threads.append(async_flush)
                for thread in self.flush_threads:
                    thread.result()
            if self.scene is not None:
                self.logger.info(
                    f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                )
            raise StopIteration("no data")
        except Exception as e:
            self.logger.exception(f"Error during data writing: {e}")
            raise e

    def __del__(self):
        # Wait for in-flight flushes so buffered data is not lost at teardown.
        for thread in self.flush_threads:
            thread.result()
        self.logger.info(f"Writer {len(self.flush_threads)} threads closed")
        # Close the simulation app if it exists
        if self.scene is not None and self.scene.simulation_app is not None:
            self.logger.info("Closing simulation app")
            self.scene.simulation_app.close()

    @abstractmethod
    def flush_to_disk(self, task, scene_name, seq, obs):
        """Write one case to disk; subclasses must implement.

        In sync mode the return value is used as the flushed item count for IO
        accounting (see _next).
        """
        raise NotImplementedError("This method should be overridden by subclasses")

View File

@@ -0,0 +1,4 @@
# flake8: noqa: E401
from .status import ComponentStatus, StatusInfo
from .status_monitor import StatusMonitor
from .status_reporter import StatusReporter

View File

@@ -0,0 +1,24 @@
from functools import wraps
from nimbus.daemon import ComponentStatus, StatusReporter
def status_monitor(running_status=ComponentStatus.RUNNING, completed_status=ComponentStatus.COMPLETED):
    """Decorator that reports component status around a method call.

    Marks the owning object's ``status_reporter`` as *running_status* just
    before the wrapped method executes and as *completed_status* after it
    returns normally. If the method raises, the exception propagates unchanged
    and the status stays at *running_status* (the supervisor's timeout check
    will eventually notice a stuck component).

    Args:
        running_status: Status reported before the call.
        completed_status: Status reported after a successful call.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            # Lazily attach a reporter so undecorated classes need no extra setup.
            if not hasattr(self, "status_reporter"):
                self.status_reporter = StatusReporter(self.__class__.__name__)
            self.status_reporter.update_status(running_status)
            # Fix: the original wrapped this in ``except Exception as e: raise e``,
            # which only rewrote the traceback origin. Let exceptions propagate.
            result = func(self, *args, **kwargs)
            self.status_reporter.update_status(completed_status)
            return result

        return wrapper

    return decorator

21
nimbus/daemon/status.py Normal file
View File

@@ -0,0 +1,21 @@
import time
from dataclasses import dataclass, field
from enum import Enum
class ComponentStatus(Enum):
    """Lifecycle states a pipeline component reports to the StatusMonitor."""

    IDLE = "idle"  # created but not yet doing work
    READY = "ready"  # initialized and waiting for input
    RUNNING = "running"  # actively processing
    COMPLETED = "completed"  # finished the current unit of work
    TIMEOUT = "timeout"  # flagged by the monitor as stuck in one state too long
@dataclass
class StatusInfo:
    """Snapshot of a component's reported status and when it was reported."""

    component_id: str
    status: ComponentStatus
    last_update: float = field(default_factory=time.time)

    def get_status_duration(self) -> float:
        """Return the number of seconds elapsed since this status was reported."""
        now = time.time()
        return now - self.last_update

View File

@@ -0,0 +1,160 @@
import threading
import time
from typing import Dict, Optional

from .status import ComponentStatus, StatusInfo
class StatusMonitor:
    """Process-wide singleton that tracks component status and detects timeouts.

    Components (via StatusReporter) register StatusInfo snapshots; a supervisor
    periodically calls :meth:`check_and_update_timeouts` to flag components
    that have sat in one status longer than that status's threshold.
    """

    _instance = None
    _lock = threading.Lock()

    # Per-status timeout thresholds in seconds; ``inf`` means "never times out".
    DEFAULT_TIMEOUTS = {
        ComponentStatus.IDLE: 100,
        ComponentStatus.READY: float("inf"),
        ComponentStatus.RUNNING: 360,
        ComponentStatus.COMPLETED: float("inf"),
        ComponentStatus.TIMEOUT: float("inf"),
    }

    def __new__(cls):
        # Double-checked locking: concurrent first calls build one instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every construction of the singleton; guard so that
        # state is only initialized once.
        if not hasattr(self, "initialized"):
            self.components: Dict[str, StatusInfo] = {}
            self.status_timeouts = self.DEFAULT_TIMEOUTS.copy()
            self.initialized = True

    @classmethod
    def get_instance(cls):
        """Return the singleton instance (creating it on first use)."""
        return cls()

    def set_logger(self, logger):
        """Attach a logger; without one, _record falls back to print()."""
        self.logger = logger

    def set_status_timeout(self, status: ComponentStatus, timeout_seconds: float):
        """Override the timeout threshold for a single status."""
        self.status_timeouts[status] = timeout_seconds

    def set_component_timeouts(self, timeouts: Dict[str, float]):
        """Bulk-set timeout thresholds from a config mapping.

        Keys may be status names (case-insensitive strings) or ComponentStatus
        members; values are seconds (negative values mean "never time out").
        Invalid entries are logged and skipped rather than raising.
        """
        converted_timeouts = {}
        for status_name, timeout_value in timeouts.items():
            try:
                if isinstance(status_name, str):
                    status = ComponentStatus[status_name.upper()]
                elif isinstance(status_name, ComponentStatus):
                    status = status_name
                else:
                    self._record(
                        f"Warning: Invalid status type '{type(status_name)}' for status '{status_name}', skipping"
                    )
                    continue
                try:
                    timeout_value = float(timeout_value)
                    if timeout_value < 0:
                        timeout_value = float("inf")
                    converted_timeouts[status] = timeout_value
                    self._record(f"Set timeout for {status.value}: {timeout_value}s")
                except (ValueError, TypeError) as e:
                    self._record(
                        f"Warning: Invalid timeout value '{timeout_value}' for status '{status_name}': {e}, skipping"
                    )
                    continue
            except KeyError:
                self._record(
                    f"Warning: Unknown status '{status_name}', skipping. Available statuses:"
                    f" {[s.name for s in ComponentStatus]}"
                )
                continue
            except Exception as e:
                self._record(f"Error processing status '{status_name}': {e}, skipping")
                continue
        self.status_timeouts.update(converted_timeouts)

    def register_update(self, status_info: StatusInfo):
        """Record the latest status snapshot for a component."""
        self.components[status_info.component_id] = status_info

    def get_all_status(self) -> Dict[str, StatusInfo]:
        """Return a shallow copy of the component-id -> StatusInfo map."""
        return self.components.copy()

    def get_status(self, component_id: str) -> Optional[StatusInfo]:
        """Return the latest StatusInfo for *component_id*, or None if unknown."""
        return self.components.get(component_id)

    def get_timeout_components(self) -> Dict[str, StatusInfo]:
        """Return components currently marked TIMEOUT (without re-checking)."""
        return {
            component_id: status_info
            for component_id, status_info in self.components.items()
            if status_info.status == ComponentStatus.TIMEOUT
        }

    def get_components_length(self):
        """Return the number of registered components."""
        return len(self.components)

    def check_and_update_timeouts(self) -> Dict[str, StatusInfo]:
        """Flag components stuck past their status threshold.

        Components already in TIMEOUT are included in the result unchanged.

        Returns:
            Dict[str, StatusInfo]: every component currently considered timed out.
        """
        newly_timeout_components = {}
        components = self.get_all_status()
        for component_id, status_info in components.items():
            if status_info.status == ComponentStatus.TIMEOUT:
                newly_timeout_components[component_id] = status_info
                continue
            time_since_update = status_info.get_status_duration()
            timeout_threshold = self.status_timeouts.get(status_info.status, 300)
            self._record(
                f"[COMPONENT DETAIL] {component_id}: "
                f"Status={status_info.status}, "
                f"Duration={status_info.get_status_duration():.1f}s, "
                f"Threshold={timeout_threshold}s"
            )
            if time_since_update > timeout_threshold:
                self._record(
                    f"Component {component_id} timeout: {status_info.status.value} for {time_since_update:.1f}s"
                    f" (threshold: {timeout_threshold}s)"
                )
                status_info.status = ComponentStatus.TIMEOUT
                # Bug fix: the original assigned the *duration* to last_update,
                # corrupting the timestamp field; stamp the transition time.
                status_info.last_update = time.time()
                newly_timeout_components[component_id] = status_info
        return newly_timeout_components

    def clear(self):
        """Forget all registered components."""
        self.components.clear()
        self._record("Cleared all registered components.")

    def get_component_status_duration(self, component_id: str) -> Optional[float]:
        """Seconds since *component_id* last reported, or None if unknown."""
        status_info = self.components.get(component_id)
        if status_info:
            return status_info.get_status_duration()
        return None

    def get_all_status_with_duration(self) -> Dict[str, Dict]:
        """Return a diagnostic dict per component: status, duration, threshold, timestamp."""
        result = {}
        for comp_id, status_info in self.components.items():
            result[comp_id] = {
                "status": status_info.status,
                "duration": status_info.get_status_duration(),
                "timeout_threshold": self.status_timeouts.get(status_info.status, 300),
                "last_update": status_info.last_update,
            }
        return result

    def set_check_interval(self, interval_seconds: float):
        """Record the daemon polling interval (informational)."""
        self.check_interval = interval_seconds
        self._record(f"Set daemon check interval to {interval_seconds}s")

    def _record(self, info):
        # Log through the attached logger when available, otherwise stdout.
        if hasattr(self, "logger") and self.logger is not None:
            self.logger.info(f"[STATUS MONITOR]: {info}")
        else:
            print(f"[STATUS MONITOR]: {info}")

View File

@@ -0,0 +1,21 @@
import threading
import time
from .status import ComponentStatus, StatusInfo
from .status_monitor import StatusMonitor
class StatusReporter:
    """Thread-safe publisher of one component's status to the StatusMonitor."""

    def __init__(self, component_id: str):
        self.component_id = component_id
        self._status_info = StatusInfo(component_id, ComponentStatus.IDLE)
        self._lock = threading.Lock()

    def update_status(self, status: ComponentStatus):
        """Record *status* with a fresh timestamp and push it to the monitor."""
        with self._lock:
            info = StatusInfo(component_id=self.component_id, status=status, last_update=time.time())
            self._status_info = info
            StatusMonitor.get_instance().register_update(info)

    def get_status(self) -> StatusInfo:
        """Return the most recently recorded StatusInfo."""
        with self._lock:
            return self._status_info

66
nimbus/data_engine.py Normal file
View File

@@ -0,0 +1,66 @@
from time import time
from nimbus.dist_sim.head_node import HeadNode
from nimbus.scheduler.sches import gen_pipe, gen_scheduler
from nimbus.utils.logging import configure_logging
from nimbus.utils.random import set_all_seeds
from nimbus.utils.types import (
NAME,
SAFE_THRESHOLD,
STAGE_PIPE,
WORKER_SCHEDULE,
StageInput,
)
from nimbus.utils.utils import consume_stage
class DataEngine:
    """Single-process engine that runs all configured scheduler stages in order."""

    def __init__(self, config, master_seed=None):
        if master_seed is not None:
            set_all_seeds(int(master_seed))
        configure_logging(config[NAME], config=config)
        self._sche_list = gen_scheduler(config)
        self._stage_input = StageInput()

    def run(self):
        """Feed each stage's output into the next, then drain the final iterator."""
        result = self._stage_input
        for stage in self._sche_list:
            result = stage.run(result)
        self._stage_input = result
        consume_stage(result)
class DistPipeDataEngine:
    """Distributed engine that chains pipe stages through Ray-backed head nodes."""

    def __init__(self, config, master_seed=None):
        self._sche_list = gen_scheduler(config)
        self.config = config
        self._stage_input = StageInput()
        exp_name = config[NAME]
        self.logger = configure_logging(exp_name, config=config)
        seed = None if master_seed is None else int(master_seed)
        self.pipe_list = gen_pipe(config, self._sche_list, exp_name, master_seed=seed)
        self.head_nodes = {}

    def run(self):
        """Start one HeadNode per pipe, wiring each node's output queue into the next."""
        self.logger.info("[DistPipeDataEngine]: %s", self.pipe_list)
        started_at = time()
        pipe_cfg = self.config[STAGE_PIPE]
        worker_schedule = pipe_cfg.get(WORKER_SCHEDULE, False)
        upstream_queue = None
        upstream_workers = 0
        for idx, pipe in enumerate(self.pipe_list):
            node = HeadNode(
                upstream_queue,
                pipe,
                upstream_workers,
                pipe_cfg[SAFE_THRESHOLD],
                worker_schedule,
                self.logger,
                idx,
            )
            self.head_nodes[idx] = node
            node.run()
            # The next pipe consumes this pipe's results.
            upstream_queue = node.result_queue()
            upstream_workers = len(pipe)
        for node in self.head_nodes.values():
            node.wait_stop()
        self.logger.info("execution duration: %s", time() - started_at)

View File

View File

@@ -0,0 +1,201 @@
import traceback
from threading import Thread
from time import sleep, time
import ray
from ray.util.queue import Queue
from nimbus.components.data.package import Package
from nimbus.dist_sim.task_board import TaskBoard
from nimbus.scheduler.inner_pipe import PipeWorkerGroup
class HeadNode:
    """Drives one pipe's worker group: feeds tasks in, collects results, manages shutdown.

    A HeadNode bridges two adjacent pipes. It pulls Package objects from the
    upstream pipe's result queue (``data_queue``), registers them on a
    TaskBoard, and a dispatcher thread moves them into ``task_queue`` for this
    pipe's Ray workers. Stop signals from upstream workers are counted; once
    every upstream worker has stopped, one stop Package per local worker is
    queued so the pipe drains and shuts down cleanly.

    Args:
        data_queue: Upstream result queue, or None for the first pipe (whose
            workers generate their own input).
        workers: PipeWorkerGroup holding this pipe's Ray actors.
        pre_worker_num: Number of upstream workers (== expected stop signals).
        safe_threshold: Max task_queue backlog before the feeder throttles.
        worker_schedule: When True, spawn one extra local worker each time an
            upstream worker finishes (reusing the released resources).
        logger: Logger for progress/diagnostics.
        idx: Index of this pipe in the engine's pipe list (used in log prefixes).
    """

    def __init__(
        self, data_queue, workers: PipeWorkerGroup, pre_worker_num, safe_threshold, worker_schedule, logger, idx
    ):
        self.idx = idx
        self.data_queue = data_queue
        self.logger = logger
        self.worker_group = workers
        logger.info(f"workers: {list(workers.keys())}")
        self.pre_worker_num = pre_worker_num
        self.safe_threshold = safe_threshold
        self.worker_schedule = worker_schedule
        logger.info(f"safe_threshold: {self.safe_threshold}")
        logger.info(f"worker_schedule: {self.worker_schedule}")
        # The first pipe has no upstream queue, so its workers need no task queue.
        self.task_queue = Queue() if data_queue is not None else None
        self.output_queue = Queue()
        self.GEN_STOP_SIG = False
        self.task_board = TaskBoard()
        self.gen_thread = Thread(target=self.gen_tasks, args=())
        self.gen_thread.start()
        self.should_stop = False
        self.run_thread = None
        # Map runner ObjectRef to worker name for proper cleanup
        self.runner_to_worker = {}
        self.all_workers_spawned = False

    def gen_tasks(self):
        """Producer loop (runs in ``gen_thread``): move upstream packages onto the board.

        Counts stop signals from upstream workers; when all of them have
        stopped, queues one stop Package per local worker and exits.
        """
        self.logger.info(f"headnode: {self.idx}: =============start gen task=============")
        pre_worker_stop_num = 0
        while not self.GEN_STOP_SIG:
            if self.data_queue is None:
                # First pipe: nothing to forward.
                self.logger.info(f"headnode: {self.idx}: =============Gen Tasks stop==============")
                self.all_workers_spawned = True
                return
            if self.data_queue.empty():
                sleep(0)  # yield without delaying
                continue
            # Throttle while the local backlog is at the safety limit.
            if self.task_queue is not None and self.task_queue.size() >= self.safe_threshold:
                sleep(1)
                continue
            task = self.data_queue.get()
            assert isinstance(
                task, Package
            ), f"the transfered type of data should be Package type, but it is {type(task)}"
            if task.should_stop():
                pre_worker_stop_num += 1
                self.logger.info(
                    f"headnode: {self.idx}: Received stop signal from upstream worker"
                    f" ({pre_worker_stop_num}/{self.pre_worker_num})"
                )
                # Dynamic worker scheduling: spawn new worker when upstream worker finishes
                if self.worker_schedule:
                    self.logger.info(
                        f"headnode: {self.idx}: Worker schedule enabled, will spawn 1 new worker after resource release"
                    )
                    # Wait for upstream resources to be released by upstream HeadNode's wait_stop()
                    # Retry mechanism to handle resource release timing
                    max_retries = 30  # 30 * 2s = 60s max wait
                    retry_interval = 2
                    for retry in range(max_retries):
                        try:
                            self.logger.info(
                                f"headnode: {self.idx}: Attempting to spawn new worker (attempt"
                                f" {retry + 1}/{max_retries})..."
                            )
                            created_workers = self.worker_group.spawn(1)
                            if created_workers:
                                for worker_name, worker_bundle in created_workers:
                                    # Start the new worker
                                    runner = worker_bundle["worker"].run.remote(self.task_queue, self.output_queue)
                                    self.runner_to_worker[runner] = worker_name
                                    self.logger.info(
                                        f"headnode: {self.idx}: Successfully spawned and started new worker:"
                                        f" {worker_name}"
                                    )
                                sleep(5)
                            break  # Success, exit retry loop
                        except Exception as e:
                            if retry < max_retries - 1:
                                self.logger.warning(
                                    f"headnode: {self.idx}: Failed to spawn worker (attempt {retry + 1}), will retry in"
                                    f" {retry_interval}s: {e}"
                                )
                                sleep(retry_interval)
                            else:
                                self.logger.error(
                                    f"headnode: {self.idx}: Failed to spawn new worker after"
                                    f" {max_retries} attempts: {e}"
                                )
                                self.logger.error(traceback.format_exc())
                if pre_worker_stop_num == self.pre_worker_num:
                    # All upstream workers finished: send one stop Package per
                    # local worker so each drains and exits.
                    for _ in range(len(self.worker_group)):
                        self.logger.info(f"headnode: {self.idx}: get stop signal")
                        stop_pack = Package(None, stop_sig=True)
                        self.task_board.reg_task(stop_pack)
                    self.all_workers_spawned = True
                    return
            else:
                self.task_board.reg_task(task)
                # Bug fix: the original eagerly pulled a second package here via
                # data_queue.get_nowait() and registered it directly, bypassing
                # the should_stop() accounting above. A stop signal fetched on
                # that path was forwarded to workers as a normal task and never
                # counted, which could leave this head node waiting forever for
                # the final stop signal. The outer loop re-reads the queue
                # immediately, so the prefetch bought nothing; it was removed.
        self.logger.info("=============Gen Tasks stop==============")
        self.all_workers_spawned = True

    def result_queue(self):
        """Return the queue this pipe's workers write results into."""
        return self.output_queue

    def run(self):
        """Start every worker actor and the dispatcher thread feeding them."""
        self.logger.info(f"headnode: {self.idx}: ==============Running Head Node================")
        for worker_name, worker_bundle in self.worker_group.items():
            runner = worker_bundle["worker"].run.remote(self.task_queue, self.output_queue)
            self.runner_to_worker[runner] = worker_name
            sleep(5)  # stagger actor start-up

        def inner_run():
            # Dispatcher: drain the task board into the bounded task queue.
            while not self.should_stop:
                tasks = self.task_board.get_tasks(timeout=0.05)
                if len(tasks) == 0:
                    sleep(0)
                    continue
                while self.task_queue.size() >= self.safe_threshold and not self.should_stop:
                    sleep(1)
                for _, task in enumerate(tasks):
                    self.task_queue.put(task)

        self.run_thread = Thread(target=inner_run)
        self.run_thread.start()

    def sig_stop(self):
        """Signal the producer thread to exit and join it."""
        self.logger.info(f"headnode: {self.idx}: ============Gen Stop===============")
        self.GEN_STOP_SIG = True
        self.gen_thread.join()

    def wait_stop(self):
        """Block until every worker actor finishes, then tear down local threads."""
        if self.worker_schedule and self.idx != 0:
            # Dynamically spawned workers register their runners asynchronously;
            # wait (bounded) so none are missed in the drain loop below.
            self.logger.info(f"headnode: {self.idx}: Waiting for all worker spawning to complete...")
            timeout = 600  # 600 seconds timeout
            start_time = time()
            while not self.all_workers_spawned:
                if time() - start_time > timeout:
                    self.logger.warning(
                        f"headnode: {self.idx}: Timeout waiting for worker spawning completion after {timeout}s"
                    )
                    break
                sleep(0.1)
            if self.all_workers_spawned:
                self.logger.info(f"headnode: {self.idx}: All worker spawning completed, proceeding to wait for runners")
        remaining_runners = list(self.runner_to_worker.keys())
        for runner in remaining_runners:
            self.logger.info(f"headnode: {self.idx}: remaining runner include: {self.runner_to_worker[runner]}")
        while remaining_runners:
            ready, _ = ray.wait(remaining_runners, num_returns=len(remaining_runners), timeout=1.0)
            for finished_runner in ready:
                worker_name = self.runner_to_worker.get(finished_runner, "unknown")
                self.logger.info(f"headnode: {self.idx}: Worker {worker_name} finished")
                try:
                    ray.get(finished_runner)
                    self.logger.info(f"headnode: {self.idx}: Worker {worker_name} completed successfully")
                    self.worker_group.remove(worker_name, self.logger)
                except Exception as e:
                    self.logger.error(f"Worker {worker_name} failed, error stack:")
                    self.logger.error(e)
                    if worker_name in self.worker_group.keys():
                        self.worker_group.remove(worker_name, self.logger)
                remaining_runners.remove(finished_runner)
                self.runner_to_worker.pop(finished_runner, None)
            if not ready:
                sleep(1)
        self.logger.info(f"headnode: {self.idx}: ==============stop head================")
        self.should_stop = True
        if self.run_thread is not None:
            self.run_thread.join()
        self.sig_stop()

    def __del__(self):
        # Best-effort queue teardown; Ray queues hold actor resources.
        if self.task_queue is not None:
            self.task_queue.shutdown()
        self.output_queue.shutdown()

View File

@@ -0,0 +1,42 @@
import time
from threading import Lock
class Task:
    """Placeholder task wrapper; currently stateless.

    NOTE(review): ``update_state`` is a no-op — presumably a hook for future
    task-state tracking; confirm before relying on it.
    """

    def __init__(self):
        pass

    def update_state(self, state):
        # Intentionally a no-op for now.
        pass
class TaskBoard:
    """Thread-safe staging area for tasks handed from a producer to a dispatcher.

    Producers call :meth:`reg_task`; the dispatcher drains everything queued
    so far with :meth:`get_tasks`.
    """

    # Interval between polls while waiting for tasks in get_tasks().
    _POLL_INTERVAL = 0.001

    def __init__(self):
        self.tasks = []  # pending tasks awaiting dispatch
        self.flying_tasks = []  # reserved for in-flight tracking (unused so far)
        self.finished_tasks = []  # reserved for completion tracking (unused so far)
        self.task_cnt = 0  # total tasks ever registered
        self.task_lock = Lock()
        self.flying_task_lock = Lock()

    def reg_task(self, task):
        """Register a task for dispatch."""
        with self.task_lock:
            self.tasks.append(task)
            self.task_cnt += 1

    def get_tasks(self, timeout=0):
        """Drain and return all pending tasks, waiting up to *timeout* seconds.

        Returns an empty list if no task arrives before the deadline.
        """
        st_time = time.time()
        while True:
            # Bug fix: the original read self.tasks without the lock and
            # busy-waited with a bare ``pass``, pinning a CPU core while the
            # board was empty; check under the lock and sleep between polls.
            with self.task_lock:
                if self.tasks:
                    tasks = self.tasks
                    self.tasks = []
                    return tasks
            if time.time() - st_time > timeout:
                return []
            time.sleep(self._POLL_INTERVAL)

    def commit_task(self, tasks):
        raise NotImplementedError("commit_task not implemented")

    def finished(self):
        raise NotImplementedError("finished not implemented")

View File

View File

@@ -0,0 +1,277 @@
import math
import os
import threading
import time
import ray
from nimbus.daemon.status_monitor import StatusMonitor
from nimbus.scheduler.stages import DedumpStage, DumpStage
from nimbus.utils.logging import configure_logging
from nimbus.utils.random import set_all_seeds
from nimbus.utils.types import MONITOR_CHECK_INTERVAL, STATUS_TIMEOUTS, StageInput
from nimbus.utils.utils import init_env, pipe_consume_stage
def iter_to_obj(iter_obj):
    # Run the stage iterator via pipe_consume_stage and pair the result with a
    # constant ``True`` finish flag (this path always consumes to completion).
    # NOTE(review): pipe_consume_stage's exact return value is defined
    # elsewhere — confirm it drains the iterator fully.
    return pipe_consume_stage(iter_obj), True
def _consume_N(iter_obj, N=1):
print("consume: ", iter_obj)
results = []
finish = False
for _ in range(N):
try:
obj = next(iter_obj)
results.append(obj)
except StopIteration:
finish = True
return results, finish
def consume_N(stage_input):
    """Advance the iterators held by *stage_input* by one batch.

    NOTE(review): this helper looks inconsistent — verify call sites before use:
      * ``stage_input.Args`` is replaced by the consumed *list* returned by
        ``_consume_N`` (which yields ``(results, finish)``), not by a tuple of
        iterators matching the input shape.
      * Each ``Kwargs`` value becomes the full ``(results, finish)`` tuple and
        its finish flag is discarded from the returned ``finish``.
      * ``hasattr(stage_input, "Args")`` does not guard against
        ``stage_input.Args`` being None before indexing ``[0]``.
    """
    finish = False
    if hasattr(stage_input, "Args"):
        stage_input.Args, finish = _consume_N(stage_input.Args[0])
    if hasattr(stage_input, "Kwargs"):
        if stage_input.Kwargs is not None:
            stage_input.Kwargs = {key: _consume_N(value) for key, value in stage_input.Kwargs.items()}
    return stage_input, finish
class PipeWorkerGroup:
    """
    Manages a group of pipe workers and their supervisors.
    Supports dynamic worker spawning for worker_schedule feature.
    """

    def __init__(
        self,
        pipe_name,
        exp_name,
        pipe_num,
        stage_list,
        master_seed,
        supervisor_class,
        inner_pipe_class,
        initial_instances=0,
    ):
        # name -> {"worker": InnerPipe actor, "supervisor": Supervisor actor}
        self.workers = {}
        self._next_worker_idx = 0  # monotonically increasing; indices are never reused
        self.pipe_name = pipe_name
        self.exp_name = exp_name
        self.pipe_num = pipe_num
        self.stage_list = stage_list
        self.master_seed = master_seed
        self.supervisor_class = supervisor_class
        self.inner_pipe_class = inner_pipe_class
        if initial_instances > 0:
            self.spawn(initial_instances)

    def spawn(self, count):
        """
        Spawn new workers dynamically.
        Returns list of (name, bundle) tuples for created workers.
        """
        created = []
        for _ in range(count):
            name = f"p{self.pipe_num}_w{self._next_worker_idx}"
            # Derive a distinct, reproducible per-worker seed from the master seed.
            worker_seed = self.master_seed + self._next_worker_idx if self.master_seed is not None else None
            supervisor = self.supervisor_class.remote(name)
            pipe_actor = self.inner_pipe_class.remote(self.stage_list, name, supervisor, seed=worker_seed)
            # Block until the supervisor knows its worker before starting the watchdog.
            ray.get(supervisor.set_pipe.remote(pipe_actor))
            supervisor.run.remote()
            bundle = {"worker": pipe_actor, "supervisor": supervisor}
            self.workers[name] = bundle
            created.append((name, bundle))
            self._next_worker_idx += 1
            time.sleep(3)  # stagger actor creation
        if created:
            print(f"{self.pipe_name}: spawned {len(created)} workers - {[name for name, _ in created]}")
        return created

    def items(self):
        """Return items view of workers dictionary."""
        return self.workers.items()

    def values(self):
        """Return values view of workers dictionary."""
        return self.workers.values()

    def keys(self):
        """Return keys view of workers dictionary."""
        return self.workers.keys()

    def __len__(self):
        """Return number of workers in the group."""
        return len(self.workers)

    def __repr__(self):
        worker_names = list(self.workers.keys())
        return f"PipeWorkerGroup({worker_names})"

    def __getitem__(self, key):
        """Support dictionary-style access."""
        return self.workers[key]

    def remove(self, name, logger):
        """Remove a worker from the group, killing its actor and supervisor.

        Bug fix: the original indexed ``self.workers[name]`` before its
        membership check, so removing an unknown (or already-removed) name
        raised KeyError; unknown names are now logged and ignored.
        """
        bundle = self.workers.pop(name, None)
        if bundle is None:
            logger.info(f"worker {name} not found in group, skip remove")
            return
        ray.kill(bundle["worker"])
        logger.info(f"killed worker actor {name} to release GPU resouces")
        ray.kill(bundle["supervisor"])
        logger.info(f"Supervisor {name} killed successfully")
def make_pipe(pipe_name, exp_name, pipe_num, stage_list, dev, instance_num, total_processes, config, master_seed=None):
    """Create a PipeWorkerGroup of Ray actors (InnerPipe workers + Supervisors) for one pipe.

    Args:
        pipe_name: Human-readable name of this pipe (used in logs).
        exp_name: Experiment name used to configure per-actor logging.
        pipe_num: Index of this pipe within the engine's pipe list.
        stage_list: Scheduler stages this pipe's workers will run.
        dev: "gpu" to reserve a fractional GPU per worker, anything else for CPU-only.
        instance_num: Number of workers to spawn immediately.
        total_processes: Total worker count across all pipes; used to split GPUs.
        config: The STAGE_PIPE config section (monitor intervals, timeouts, ...).
        master_seed: Optional base seed; each worker derives its own seed from it.

    Returns:
        PipeWorkerGroup: the group managing the spawned workers and supervisors.
    """
    gpu_num = 0
    if dev == "gpu":
        # Share the cluster's GPUs evenly across all pipeline processes by
        # requesting a fractional GPU per actor.
        resources = ray.cluster_resources()
        total_gpus = resources.get("GPU", 0)
        assert total_gpus > 0, "not enough gpu resources"
        processes_per_gpu = math.ceil(total_processes / total_gpus)
        gpu_num = 1.0 / processes_per_gpu

    @ray.remote
    class Supervisor:
        """Watchdog actor: restarts its InnerPipe worker when components stall."""

        def __init__(self, name):
            self.name = "supervisor_" + name
            self.pipe_worker = None
            self.logger = configure_logging(exp_name, self.name)
            self.logger.info("Supervisor started")
            self.monitor = StatusMonitor.get_instance()
            self.monitor.set_logger(self.logger)
            self._last_status_check = 0.0
            self.check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)
            self.logger.info(f"Monitor check interval: {self.check_interval} seconds")
            if config.get(STATUS_TIMEOUTS, None) is not None:
                self.monitor.set_component_timeouts(config[STATUS_TIMEOUTS])

        def set_pipe(self, pipe_worker):
            # Handle to the InnerPipe actor this supervisor watches.
            self.logger.info("set pipe worker")
            self.pipe_worker = pipe_worker

        def set_queue(self, input_queue, output_queue):
            # NOTE(review): stored but never read in this file — presumably
            # for external wiring; confirm before removing.
            self.input_queue = input_queue
            self.output_queue = output_queue

        def _restart_worker(self):
            # no_restart=False lets Ray recreate the actor (InnerPipe is
            # declared with max_restarts=3).
            try:
                ray.kill(self.pipe_worker, no_restart=False)
                self.logger.info("trigger restart of the actor")
            except Exception as ke:
                self.logger.error(f"restart actor error: {ke}")

        def update_component_state(self, components_state):
            # Called remotely by InnerPipe to mirror its component statuses here.
            for _, state in components_state.items():
                self.monitor.register_update(state)

        def _start_daemon(self):
            # Watchdog loop (daemon thread): every check_interval seconds,
            # restart the worker if any component timed out, or if no component
            # has reported for 5 consecutive checks.
            miss_cnt = 0
            while True:
                now = time.time()
                if now - self._last_status_check >= self.check_interval:
                    try:
                        timeout_components = self.monitor.check_and_update_timeouts()
                        if len(timeout_components) > 0:
                            self.logger.warning(f"Components timeout: {timeout_components}, restart the pipe worker")
                            self._restart_worker()
                            self.monitor.clear()
                        else:
                            if self.monitor.get_components_length() == 0:
                                miss_cnt += 1
                                self.logger.info(f"No components timeout detected, miss count: {miss_cnt}")
                            if miss_cnt >= 5:
                                self.logger.info("No components detected for 5 consecutive checks, restart pipe worker")
                                self._restart_worker()
                                self.monitor.clear()
                                miss_cnt = 0
                    except Exception as e:
                        self.logger.error(f"Get components status failed: {e}")
                        self._restart_worker()
                        self.monitor.clear()
                    self._last_status_check = now
                time.sleep(1)

        def run(self):
            """Start the watchdog daemon thread; requires set_pipe() first."""
            assert self.pipe_worker is not None, "pipe worker is not set"
            thread = threading.Thread(target=self._start_daemon, daemon=True)
            thread.start()

    @ray.remote(num_gpus=gpu_num, max_restarts=3, max_task_retries=3)
    class InnerPipe:
        """Worker actor that runs this pipe's stage chain end-to-end."""

        def __init__(self, stage_list, name, supervisor, seed=None):
            if seed is not None:
                set_all_seeds(seed)
            self.stages = stage_list
            self.name = name
            self.supervisor = supervisor
            init_env()
            self.logger = configure_logging(exp_name, self.name)
            self.logger.info(f"Working on gpu {os.environ.get('CUDA_VISIBLE_DEVICES')}")
            # Loud banner when Ray restarted this actor after a crash.
            if ray.get_runtime_context().was_current_actor_reconstructed is True:
                msg = (
                    f"{'='*80}\n"
                    "!!! ATTENTION !!!\n"
                    f"!!! InnerPipe {name} WAS RECONSTRUCTED due to SYSTEM ERROR !!!\n"
                    "!!! Please CHECK LOGS in /tmp/ray/session_latest/logs/ for details !!!\n"
                    f"{'='*80}\n"
                )
                self.logger.info(msg)
            self.monitor = StatusMonitor.get_instance()
            self.monitor.set_logger(self.logger)
            self.monitor_check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)

        def _update_supervisor(self):
            # Reporter loop (daemon thread): periodically push local component
            # statuses to the supervisor actor.
            while True:
                for _ in range(self.monitor_check_interval):
                    time.sleep(1)
                components_status = self.monitor.get_all_status()
                ray.get(self.supervisor.update_component_state.remote(components_status))

        def run(self, input_queue, output_queue):
            """Build the stage chain and drain it; returns the finish flag."""
            self.logger.info(f"[InnerPipe stages]: {self.stages}")
            thread = threading.Thread(target=self._update_supervisor, daemon=True)
            thread.start()
            self.logger.info("Reporter started, start running pipe")
            mid_results = StageInput()
            # if input_queue is None:
            #     mid_results = StageInput()
            # else:
            #     mid_results = StageInput((input_queue,), {})
            for _, stage in enumerate(self.stages):
                # Dump/Dedump stages bridge to the inter-pipe Ray queues.
                if isinstance(stage, DumpStage):
                    mid_results = stage.run(mid_results, output_queue)
                elif isinstance(stage, DedumpStage):
                    mid_results = stage.run(mid_results, input_queue)
                else:
                    mid_results = stage.run(mid_results)
            result, finish = iter_to_obj(mid_results)
            self.logger.info("====================================")
            self.logger.info(f"result: {result}, finish: {finish}")
            self.logger.info("====================================")
            # The job is done: take the supervisor down so it cannot restart us.
            ray.kill(self.supervisor)
            self.logger.info("actor finished")
            return finish

    group = PipeWorkerGroup(
        pipe_name=pipe_name,
        exp_name=exp_name,
        pipe_num=pipe_num,
        stage_list=stage_list,
        master_seed=master_seed,
        supervisor_class=Supervisor,
        inner_pipe_class=InnerPipe,
        initial_instances=instance_num,
    )
    print(pipe_name, group)
    return group

View File

@@ -0,0 +1,115 @@
from abc import abstractmethod
from nimbus.components.dedump import dedumper_dict
from nimbus.components.dump import dumper_dict
from nimbus.components.load import layout_randomizer_dict, scene_loader_dict
from nimbus.components.plan_with_render import plan_with_render_dict
from nimbus.components.planner import seq_planner_dict
from nimbus.components.render import renderer_dict
from nimbus.components.store import writer_dict
from nimbus.utils.types import ARGS, PLANNER, TYPE
class Instruction:
    """Base class for one configured pipeline step.

    Subclasses select a concrete component factory from their registry dict
    using ``config[TYPE]`` and wire it up in :meth:`run`.
    """

    def __init__(self, config):
        self.config = config

    @abstractmethod
    def run(self, stage_input):
        raise NotImplementedError()
class LoadSceneInstruction(Instruction):
    """Builds the scene-loading iterator selected by ``config[TYPE]``."""

    def __init__(self, config):
        super().__init__(config)
        self.scene_iter = scene_loader_dict[self.config[TYPE]]

    def run(self, stage_input):
        # Bug fix: the original had a duplicated assignment
        # (``pack_iter = pack_iter = ...``); harmless but clearly a typo.
        pack_iter = stage_input.Args[0] if stage_input.Args is not None else None
        return self.scene_iter(pack_iter=pack_iter, **self.config.get(ARGS, {}))
class RandomizeLayoutInstruction(Instruction):
    """Wraps a layout randomizer that expands the incoming scene iterator."""

    def __init__(self, config):
        super().__init__(config)
        self.layout_randomlizer = layout_randomizer_dict[self.config[TYPE]]

    def run(self, stage_input):
        extra_kwargs = self.config.get(ARGS, {})
        return self.layout_randomlizer(stage_input.Args[0], **extra_kwargs)
class PlanPathInstruction(Instruction):
    """Wraps a sequence planner that turns scenes into planned sequences."""

    def __init__(self, config):
        super().__init__(config)
        self.seq_planner = seq_planner_dict[self.config[TYPE]]

    def run(self, stage_input):
        scene_iter = stage_input.Args[0]
        # Idiom: dict.get replaces the ``x if k in d else None`` conditional.
        planner_cfg = self.config.get(PLANNER)
        return self.seq_planner(scene_iter, planner_cfg, **self.config.get(ARGS, {}))
class RenderInstruction(Instruction):
    """Wraps a renderer that produces observations from planned scene sequences."""

    def __init__(self, config):
        super().__init__(config)
        self.renderer = renderer_dict[self.config[TYPE]]

    def run(self, stage_input):
        render_kwargs = self.config.get(ARGS, {})
        return self.renderer(stage_input.Args[0], **render_kwargs)
class PlanWithRenderInstruction(Instruction):
    """Wraps a combined plan-and-render component."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``
        # (same value — ``config`` is stored by the base class).
        self.plan_with_render = plan_with_render_dict[self.config[TYPE]]

    def run(self, stage_input):
        scene_iter = stage_input.Args[0]
        return self.plan_with_render(scene_iter, **self.config.get(ARGS, {}))
class StoreInstruction(Instruction):
    """Wraps a writer that persists sequences/observations to storage."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``.
        self.writer = writer_dict[self.config[TYPE]]

    def run(self, stage_input):
        seqs_obs_iter = stage_input.Args[0]
        return self.writer(seqs_obs_iter, **self.config.get(ARGS, {}))
class DumpInstruction(Instruction):
    """Wraps a dumper that forwards results onto an inter-pipe output queue."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``.
        self.dumper = dumper_dict[self.config[TYPE]]

    def run(self, stage_input, output_queue=None):
        seqs_obs_iter = stage_input.Args[0]
        return self.dumper(seqs_obs_iter, output_queue=output_queue, **self.config.get(ARGS, {}))
class DeDumpInstruction(Instruction):
    """Wraps a dedumper that pulls packages from an inter-pipe input queue."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``.
        self.dedumper = dedumper_dict[self.config[TYPE]]

    def run(self, stage_input, input_queue=None):
        # ``stage_input`` is unused: the dedumper sources its data from the queue.
        return self.dedumper(input_queue=input_queue, **self.config.get(ARGS, {}))
class ComposeInstruction(Instruction):
    # Stub: no run() override yet, so invoking run() raises NotImplementedError
    # from the Instruction base class.
    def __init__(self, config):
        super().__init__(config)
class AnnotateDataInstruction(Instruction):
    # Stub: no run() override yet, so invoking run() raises NotImplementedError
    # from the Instruction base class.
    def __init__(self, config):
        super().__init__(config)

80
nimbus/scheduler/sches.py Normal file
View File

@@ -0,0 +1,80 @@
from nimbus.scheduler.inner_pipe import make_pipe
from nimbus.scheduler.stages import (
DedumpStage,
DumpStage,
LoadStage,
PlanStage,
PlanWithRenderStage,
RenderStage,
StoreStage,
)
from nimbus.utils.types import (
DEDUMP_STAGE,
DUMP_STAGE,
LOAD_STAGE,
PLAN_STAGE,
PLAN_WITH_RENDER_STAGE,
RENDER_STAGE,
STAGE_DEV,
STAGE_NUM,
STAGE_PIPE,
STORE_STAGE,
WORKER_NUM,
)
def gen_scheduler(config):
    """Build the ordered list of pipeline stages present in *config*.

    Stage order is fixed: load -> plan-with-render -> plan -> dump ->
    dedump -> render -> store; a stage is included only when its config
    section exists.
    """
    stage_specs = (
        (LOAD_STAGE, LoadStage),
        (PLAN_WITH_RENDER_STAGE, PlanWithRenderStage),
        (PLAN_STAGE, PlanStage),
        (DUMP_STAGE, DumpStage),
        (DEDUMP_STAGE, DedumpStage),
        (RENDER_STAGE, RenderStage),
        (STORE_STAGE, StoreStage),
    )
    return [stage_cls(config[key]) for key, stage_cls in stage_specs if key in config]
def gen_pipe(config, stage_list, exp_name, master_seed=None):
    """Split *stage_list* into inner pipes according to the stage_pipe config.

    Args:
        config: Full engine config; the STAGE_PIPE section drives the split.
        stage_list: Ordered stages from gen_scheduler; consumed front-to-back.
        exp_name: Experiment name used for naming/logging inside each pipe.
        master_seed: Optional seed forwarded to every pipe worker.

    Returns:
        list: One entry of pipe workers per inner pipe when STAGE_PIPE is
        configured; otherwise a single in-process pipe over all stages.
    """
    if STAGE_PIPE in config:
        pipe_stages_num = config[STAGE_PIPE][STAGE_NUM]
        pipe_stages_dev = config[STAGE_PIPE][STAGE_DEV]
        pipe_worker_num = config[STAGE_PIPE][WORKER_NUM]
        # Total worker processes across all pipes (needed by each pipe for
        # global scheduling); the original re-looped the config to add these up.
        total_processes = sum(pipe_worker_num)
        inner_pipes = []
        for pipe_num, (num, dev, worker_num) in enumerate(
            zip(pipe_stages_num, pipe_stages_dev, pipe_worker_num)
        ):
            # Take the next `num` stages for this pipe; the rest go to later pipes.
            stages, stage_list = stage_list[:num], stage_list[num:]
            print("===========================")
            print(f"inner stage num: {num}, device type: {dev}")
            print(f"stages: {stages}")
            print("===========================")
            pipe_name = "_".join(["pipe"] + [stage.__class__.__name__ for stage in stages])
            pipe_workers = make_pipe(
                pipe_name,
                exp_name,
                pipe_num,
                stages,
                dev,
                worker_num,
                total_processes,
                config[STAGE_PIPE],
                master_seed=master_seed,
            )
            inner_pipes.append(pipe_workers)
        return inner_pipes
    else:
        # NOTE(review): `make_pipe` is imported as a function from inner_pipe, so
        # `make_pipe.InnerPipe` looks wrong -- presumably InnerPipe should be
        # imported from nimbus.scheduler.inner_pipe directly. Confirm before use.
        return [make_pipe.InnerPipe(stage_list)]

137
nimbus/scheduler/stages.py Normal file
View File

@@ -0,0 +1,137 @@
from abc import abstractmethod
from nimbus.scheduler.instructions import (
DeDumpInstruction,
DumpInstruction,
Instruction,
LoadSceneInstruction,
PlanPathInstruction,
PlanWithRenderInstruction,
RandomizeLayoutInstruction,
RenderInstruction,
StoreInstruction,
)
from nimbus.utils.types import (
DEDUMPER,
DUMPER,
LAYOUT_RANDOM_GENERATOR,
PLAN_WITH_RENDER,
RENDERER,
SCENE_LOADER,
SEQ_PLANNER,
WRITER,
StageInput,
)
class Stage:
    """Base class for pipeline stages: holds config and an ordered instruction list."""

    def __init__(self, config):
        # Stage-level config section (one key per instruction the stage may build).
        self.config = config
        # Instructions are executed in order by the concrete stage's run().
        self.instructions: list[Instruction] = []
        # Queue used by dump-style stages to hand results to the next pipe.
        self.output_queue = None

    @abstractmethod
    def run(self, stage_input):
        """Execute this stage's instructions on *stage_input*; subclasses implement."""
        raise NotImplementedError()
class LoadStage(Stage):
    """Stage that loads scenes and optionally randomizes their layout."""

    def __init__(self, config):
        super().__init__(config)
        if SCENE_LOADER in config:
            loader = LoadSceneInstruction(config[SCENE_LOADER])
            self.instructions.append(loader)
        if LAYOUT_RANDOM_GENERATOR in config:
            randomizer = RandomizeLayoutInstruction(config[LAYOUT_RANDOM_GENERATOR])
            self.instructions.append(randomizer)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class PlanStage(Stage):
    """Stage that plans sequences for loaded scenes."""

    def __init__(self, config):
        super().__init__(config)
        if SEQ_PLANNER in config:
            planner = PlanPathInstruction(config[SEQ_PLANNER])
            self.instructions.append(planner)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class RenderStage(Stage):
    """Stage that renders observations from planned scene sequences."""

    def __init__(self, config):
        super().__init__(config)
        renderer = RenderInstruction(config[RENDERER])
        self.instructions.append(renderer)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class PlanWithRenderStage(Stage):
    """Stage that performs planning and rendering as one fused step."""

    def __init__(self, config):
        super().__init__(config)
        fused = PlanWithRenderInstruction(config[PLAN_WITH_RENDER])
        self.instructions.append(fused)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class StoreStage(Stage):
    """Stage that writes rendered results via a configured writer."""

    def __init__(self, config):
        super().__init__(config)
        if WRITER in config:
            writer = StoreInstruction(config[WRITER])
            self.instructions.append(writer)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class DumpStage(Stage):
    """Stage that serializes results into an inter-pipe queue."""

    def __init__(self, config):
        super().__init__(config)
        dumper = DumpInstruction(config[DUMPER])
        self.instructions.append(dumper)

    def run(self, stage_input: StageInput, output_queue=None):
        """Chain each instruction, forwarding the optional output queue."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current, output_queue),), {})
        return current
class DedumpStage(Stage):
    """Stage that reads dumped items from a queue and re-runs load/plan steps.

    Mirrors the LoadStage/PlanStage instruction setup so a downstream pipe can
    reconstruct its input iterator from serialized items.
    """

    def __init__(self, config):
        super().__init__(config)
        # Queue supplying dumped items; set by run() when the pipe provides one.
        # Initialized here so the attribute always exists (was previously
        # assigned only inside run(), i.e. defined outside __init__).
        self.input_queue = None
        if DEDUMPER in config:
            self.instructions.append(DeDumpInstruction(config[DEDUMPER]))
        if SCENE_LOADER in config:
            self.instructions.append(LoadSceneInstruction(config[SCENE_LOADER]))
        if LAYOUT_RANDOM_GENERATOR in config:
            self.instructions.append(RandomizeLayoutInstruction(config[LAYOUT_RANDOM_GENERATOR]))
        if SEQ_PLANNER in config:
            self.instructions.append(PlanPathInstruction(config[SEQ_PLANNER]))

    def run(self, stage_input: StageInput, input_queue=None):
        """Run all instructions; only DeDumpInstruction receives the queue.

        Args:
            stage_input: Upstream StageInput (often empty for dedump pipes).
            input_queue: Optional queue of dumped items for DeDumpInstruction.

        Returns:
            StageInput wrapping the final instruction's iterator.
        """
        if input_queue is not None:
            self.input_queue = input_queue
        for instruction in self.instructions:
            if isinstance(instruction, DeDumpInstruction):
                result = instruction.run(stage_input, input_queue)
            else:
                result = instruction.run(stage_input)
            stage_input = StageInput((result,), {})
        return stage_input

20
nimbus/utils/config.py Normal file
View File

@@ -0,0 +1,20 @@
from omegaconf import OmegaConf
def load_config(*yaml_files, cli_args=None):
    """Merge the given YAML files (left to right) with CLI overrides.

    Later files and CLI args take precedence; interpolations are resolved
    in place before the merged config is returned.
    """
    file_confs = [OmegaConf.load(path) for path in yaml_files]
    overrides = OmegaConf.from_cli([] if cli_args is None else cli_args)
    merged = OmegaConf.merge(*file_confs, overrides)
    OmegaConf.resolve(merged)
    return merged
def config_to_primitive(config, resolve=True):
    """Convert an OmegaConf object to plain Python containers (dicts/lists)."""
    return OmegaConf.to_container(config, resolve=resolve)
def save_config(config, path):
    """Write *config* to *path* as YAML (UTF-8)."""
    with open(path, "w", encoding="utf-8") as fp:
        OmegaConf.save(config=config, f=fp)

View File

@@ -0,0 +1,138 @@
"""
Config Processor: Responsible for identifying, converting, and loading configuration files.
"""
from omegaconf import DictConfig, OmegaConf
from nimbus.utils.config import load_config
class ConfigProcessor:
    """Identifies, validates, and loads configuration files with CLI overrides."""

    def __init__(self):
        pass

    def _check_config_path_exists(self, config, path):
        """
        Check if a configuration path exists in the config object

        Args:
            config: OmegaConf config object
            path: String path like 'stage_pipe.worker_num' or 'load_stage.scene_loader.args.random_num'

        Returns:
            bool: Whether the path exists in the config
        """
        try:
            keys = path.split(".")
            current = config
            for key in keys:
                if isinstance(current, DictConfig):
                    if key not in current:
                        return False
                    current = current[key]
                else:
                    # Hit a leaf (or list) before consuming the whole path.
                    return False
            return True
        except Exception:
            # Any lookup failure (e.g. interpolation error) counts as "absent".
            return False

    def _validate_cli_args(self, config, cli_args):
        """
        Validate that all CLI arguments correspond to existing paths in the config

        Args:
            config: OmegaConf config object
            cli_args: List of command line arguments

        Raises:
            ValueError: If any CLI argument path doesn't exist in the config
        """
        if not cli_args:
            return
        # Clean up CLI args to remove -- prefix if present
        cleaned_cli_args = []
        for arg in cli_args:
            if arg.startswith("--"):
                cleaned_cli_args.append(arg[2:])  # Remove the -- prefix
            else:
                cleaned_cli_args.append(arg)
        # Parse CLI args to get the override paths
        try:
            cli_conf = OmegaConf.from_cli(cleaned_cli_args)
        except Exception as e:
            # Chain the parse error so the root cause stays visible (B904).
            raise ValueError(
                f"Invalid CLI argument format: {e}. Please use format like: stage_pipe.worker_num='[2,4]'"
            ) from e

        def check_nested_paths(conf, prefix=""):
            """Recursively check all paths in the CLI config"""
            for key, value in conf.items():
                current_path = f"{prefix}.{key}" if prefix else key
                if isinstance(value, DictConfig):
                    # Check if this intermediate path exists
                    if not self._check_config_path_exists(config, current_path):
                        raise ValueError(f"Configuration path '{current_path}' does not exist in the config file")
                    # Recursively check nested paths
                    check_nested_paths(value, current_path)
                else:
                    # Check if this leaf path exists
                    if not self._check_config_path_exists(config, current_path):
                        raise ValueError(f"Configuration path '{current_path}' does not exist in the config file")

        try:
            check_nested_paths(cli_conf)
        except ValueError:
            raise
        except Exception as e:
            # If there's an issue parsing CLI args, provide helpful error message
            raise ValueError(
                "Invalid CLI argument format. Please use format like: --key=value or --nested.key=value"
            ) from e

    def process_config(self, config_path, cli_args=None):
        """
        Process the config file

        Args:
            config_path: Path to the config file
            cli_args: List of command line arguments

        Returns:
            OmegaConf: Processed config object
        """
        # Clean up CLI args to remove -- prefix if present
        cleaned_cli_args = []
        if cli_args:
            for arg in cli_args:
                if arg.startswith("--"):
                    cleaned_cli_args.append(arg[2:])  # Remove the -- prefix
                else:
                    cleaned_cli_args.append(arg)
        # Load config first without CLI args to validate paths
        try:
            base_config = load_config(config_path)
        except Exception as e:
            raise ValueError(f"Error loading config: {e}") from e
        # Validate that CLI arguments correspond to existing paths
        if cli_args:
            self._validate_cli_args(base_config, cli_args)
        # Now load config with CLI args (validation passed)
        config = load_config(config_path, cli_args=cleaned_cli_args)
        return config

    def print_final_config(self, config):
        """
        Print the final running config

        Args:
            config: OmegaConf config object
        """
        print("=" * 50)
        print("final config:")
        print("=" * 50)
        print(OmegaConf.to_yaml(config))

23
nimbus/utils/flags.py Normal file
View File

@@ -0,0 +1,23 @@
import os
_DEBUG_KEY = "NIMBUS_DEBUG"
_RANDOM_SEED_KEY = "NIMBUS_RANDOM_SEED"
def set_debug_mode(enabled: bool) -> None:
"""Set debug mode. Must be called before ray.init() to propagate to Ray workers."""
os.environ[_DEBUG_KEY] = "1" if enabled else "0"
def is_debug_mode() -> bool:
return os.environ.get(_DEBUG_KEY, "0") == "1"
def set_random_seed(seed: int) -> None:
"""Set global random seed. Must be called before ray.init() to propagate to Ray workers."""
os.environ[_RANDOM_SEED_KEY] = str(seed)
def get_random_seed() -> int | None:
val = os.environ.get(_RANDOM_SEED_KEY)
return int(val) if val is not None else None

48
nimbus/utils/logging.py Normal file
View File

@@ -0,0 +1,48 @@
import logging
import os
import time
from datetime import datetime
from nimbus.utils.config import save_config
def configure_logging(exp_name, name=None, config=None):
    """Set up the shared "de_logger" file logger under ./output/<exp_name>.

    Creates the log directory (retrying for flaky network filesystems),
    optionally saves *config* next to the log, and attaches exactly one file
    handler. Fix: previous handlers on "de_logger" are removed first, so
    repeated calls no longer stack duplicate handlers (which duplicated
    every log record).

    Args:
        exp_name: Experiment name; POD_NAME is appended when running in a pod.
        name: Optional tag inserted into the log file name.
        config: Optional config object saved as de_config.yaml.

    Returns:
        logging.Logger: The configured "de_logger" logger.

    Raises:
        RuntimeError: If the log directory cannot be created after retries.
    """
    pod_name = os.environ.get("POD_NAME", None)
    if pod_name is not None:
        exp_name = f"{exp_name}/{pod_name}"
    log_dir = os.path.join("./output", exp_name)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if name is None:
        log_name = f"de_time_profile_{timestamp}.log"
    else:
        log_name = f"de_{name}_time_profile_{timestamp}.log"
    log_file = os.path.join(log_dir, log_name)
    max_retries = 3
    for attempt in range(max_retries):
        try:
            os.makedirs(log_dir, exist_ok=True)
            break
        except Exception as e:
            # Network filesystems occasionally fail transiently (e.g. stale
            # file handles); report the actual error instead of a fixed guess.
            print(f"Warning: failed to create {log_dir} ({e}), attempt {attempt + 1}/{max_retries}")
            if attempt < max_retries - 1:
                time.sleep(3)
                continue
            raise RuntimeError(f"Failed to create log directory {log_dir} after {max_retries} attempts") from e
    if config is not None:
        config_log_file = os.path.join(log_dir, "de_config.yaml")
        save_config(config, config_log_file)
    logger = logging.getLogger("de_logger")
    logger.setLevel(logging.INFO)
    # Drop handlers from earlier configure_logging calls so each call logs to
    # exactly one (fresh) file instead of appending to every previous one.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()
    fh = logging.FileHandler(log_file, mode="a")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info("Start Data Engine")
    return logger

33
nimbus/utils/random.py Normal file
View File

@@ -0,0 +1,33 @@
import os
import random
import numpy as np
import torch
# Try to import open3d, but don't fail if it's not installed
try:
import open3d as o3d
except ImportError:
o3d = None
def set_all_seeds(seed):
    """
    Sets seeds for all relevant random number generators to ensure reproducibility.
    """
    # NOTE(review): PYTHONHASHSEED only affects interpreters started after it is
    # set (e.g. spawned workers), not the current process -- presumably that is
    # the intent here; confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"set seed {seed} for all libraries")
    seed = int(seed)
    np.random.seed(seed)
    random.seed(seed)
    # open3d is optional; seed it only when its random utility is available.
    if o3d and hasattr(o3d, "utility") and hasattr(o3d.utility, "random"):
        o3d.utility.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # These settings are crucial for deterministic results with CuDNN
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

65
nimbus/utils/types.py Normal file
View File

@@ -0,0 +1,65 @@
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
# String keys used throughout the engine config; centralizing them avoids
# scattered literals and typo-induced KeyErrors.
NAME = "name"
# stage name
LOAD_STAGE = "load_stage"
PLAN_STAGE = "plan_stage"
RENDER_STAGE = "render_stage"
PLAN_WITH_RENDER_STAGE = "plan_with_render_stage"
STORE_STAGE = "store_stage"
STAGE_PIPE = "stage_pipe"
DUMP_STAGE = "dump_stage"
DEDUMP_STAGE = "dedump_stage"
# instruction name
# LOAD_STAGE
SCENE_LOADER = "scene_loader"
LAYOUT_RANDOM_GENERATOR = "layout_random_generator"
INDEX_GENERATOR = "index_generator"
DEDUMPER = "dedumper"
# PLAN_STAGE
SEQ_PLANNER = "seq_planner"
PLANNER = "planner"
SIMULATOR = "simulator"
# RENDER_STAGE
RENDERER = "renderer"
# PLAN_WITH_RENDER_STAGE
PLAN_WITH_RENDER = "plan_with_render"
# PIPE_STAGE (keys inside the stage_pipe section)
STAGE_NUM = "stage_num"
STAGE_DEV = "stage_dev"
WORKER_NUM = "worker_num"
WORKER_SCHEDULE = "worker_schedule"
SAFE_THRESHOLD = "safe_threshold"
STATUS_TIMEOUTS = "status_timeouts"
MONITOR_CHECK_INTERVAL = "monitor_check_interval"
# STORE_STAGE
WRITER = "writer"
DUMPER = "dumper"
OUTPUT_PATH = "output_path"
INPUT_PATH = "input_path"
# Generic sub-keys shared by every instruction config block.
TYPE = "type"
ARGS = "args"
@dataclass
class StageInput:
    """
    A data class that encapsulates the input for a stage in the processing pipeline.

    Attributes:
        Args (Optional[Tuple]): Positional arguments passed to the stage's processing function.
        Kwargs (Optional[Dict]): Keyword arguments passed to the stage's processing function.
    """

    # Capitalized field names are non-PEP8 but part of the public interface.
    Args: Optional[Tuple] = None
    Kwargs: Optional[Dict] = None

182
nimbus/utils/utils.py Normal file
View File

@@ -0,0 +1,182 @@
import functools
import os
import re
import sys
import time
from typing import Tuple, Type, Union
from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
def init_env():
    """Append project-relative paths to sys.path so local packages import."""
    for extra_path in ("./", "./data_engine", "workflows/simbox"):
        sys.path.append(extra_path)
def unpack_iter_data(data: tuple):
    """Split a pipeline tuple into (scene, seq, obs) slots by element type.

    Any slot whose type is absent from *data* stays None.
    """
    assert len(data) <= 3, "not support yet"
    scene = seq = obs = None
    for element in data:
        if isinstance(element, Scene):
            scene = element
        elif isinstance(element, Sequence):
            seq = element
        elif isinstance(element, Observations):
            obs = element
    return scene, seq, obs
def consume_stage(stage_input):
    """Drain all iterators held by *stage_input* and release their resources.

    Called at the end of a non-pipe run so lazy generators actually execute;
    the explicit __del__ calls force immediate cleanup instead of waiting for
    garbage collection (see pipe_consume_stage for the variant that skips this).
    """
    if hasattr(stage_input, "Args"):
        consume_iterators(stage_input.Args)
        for value in stage_input.Args:
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801
    if hasattr(stage_input, "Kwargs"):
        if stage_input.Kwargs is not None:
            for value in stage_input.Kwargs.values():
                consume_iterators(value)
                if hasattr(value, "__del__"):
                    value.__del__()  # pylint: disable=C2801
# prevent isaac sim close pipe worker in advance
def pipe_consume_stage(stage_input):
    """Drain iterators without invoking __del__ (keeps pipe workers alive)."""
    if hasattr(stage_input, "Args"):
        consume_iterators(stage_input.Args)
    kwargs = getattr(stage_input, "Kwargs", None)
    if kwargs is not None:
        for value in kwargs.values():
            consume_iterators(value)
def consume_iterators(obj):
    """Recursively walk *obj*, exhausting any iterators it contains.

    Strings/bytes are returned untouched; dicts/lists/tuples are rebuilt with
    recursively-consumed elements; any other iterable is drained in place and
    returned as-is; non-iterables pass through unchanged.
    """
    if isinstance(obj, (str, bytes)):
        return obj
    if isinstance(obj, dict):
        return {key: consume_iterators(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        consumed = [consume_iterators(element) for element in obj]
        return consumed if isinstance(obj, list) else tuple(consumed)
    if hasattr(obj, "__iter__"):
        for element in obj:
            consume_iterators(element)
    return obj
def scene_names_postprocess(scene_names: list) -> list:
    """
    Distributes a list of scene names (folders) among multiple workers in a distributed environment.

    This function is designed to work with Deep Learning Container (DLC) environments, where worker
    information is extracted from environment variables. It assigns a subset of the input scene names
    to the current worker based on its rank and the total number of workers, using a round-robin strategy.
    If not running in a DLC environment, all scene names are assigned to a single worker.

    Args:
        scene_names (list): List of scene names (typically folder names) to be distributed.

    Returns:
        list: The subset of scene names assigned to the current worker.

    Raises:
        PermissionError: If there is a permission issue accessing the input directory.
        RuntimeError: For any other errors encountered during processing.

    Notes:
        - The function expects certain environment variables (e.g., POD_NAME, WORLD_SIZE) to be set
          in DLC environments.
        - If multiple workers are present, the input list is sorted before distribution to ensure
          consistent assignment across workers.
    """

    def _get_dlc_worker_info():
        """Extract worker rank and world size from DLC environment variables."""
        pod_name = os.environ.get("POD_NAME")
        if pod_name:
            # Match worker-N or master-N patterns
            match = re.search(r"dlc.*?-(worker|master)-(\d+)$", pod_name)
            if match:
                node_type, node_id = match.groups()
                world_size = int(os.environ.get("WORLD_SIZE", "1"))
                if node_type == "worker":
                    rank = int(node_id)
                else:  # master node
                    rank = world_size - 1
                return rank, world_size
        # Default for non-DLC environment
        return 0, 1

    def _distribute_folders(all_folders, rank, world_size):
        """Distribute folders among workers using round-robin strategy."""
        if not all_folders:
            return []
        # Only sort when there are multiple workers to ensure consistency
        if world_size > 1:
            all_folders.sort()
        # Distribute using slicing: worker i gets folders at indices i, i+world_size, ...
        return all_folders[rank::world_size]

    try:
        # Get all subfolders
        all_subfolders = scene_names
        if not all_subfolders:
            # Fixed message: the old one interpolated the (empty) input list.
            print("Warning: empty scene name list; nothing to distribute")
            return []
        # Get worker identity and distribute folders
        rank, world_size = _get_dlc_worker_info()
        assigned_folders = _distribute_folders(all_subfolders, rank, world_size)
        print(
            f"DLC Worker {rank}/{world_size}: Assigned {len(assigned_folders)} out of "
            f"{len(all_subfolders)} total folders"
        )
        return assigned_folders
    except PermissionError as e:
        # Chain the original error so the underlying OS failure stays visible.
        raise PermissionError(f"No permission to access directory: {scene_names}") from e
    except Exception as e:
        raise RuntimeError(f"Error reading input directory {scene_names}: {e}") from e
def retry_on_exception(
    max_retries: int = 3, retry_exceptions: Union[bool, Tuple[Type[Exception], ...]] = True, delay: float = 1.0
):
    """Decorator factory that retries the wrapped callable on exceptions.

    Args:
        max_retries: Number of retries after the first failed attempt.
        retry_exceptions: True to retry on any Exception, or a tuple/list of
            exception types that should trigger a retry; anything else re-raises
            immediately.
        delay: Seconds to sleep between attempts.

    Returns:
        A decorator that wraps the target callable (plain function or method).

    Fixes:
        - Passing a *list* of exception types previously crashed with TypeError
          because ``isinstance(e, retry_exceptions)`` requires a tuple; the
          list is now converted.
        - The wrapper no longer assumes a ``self`` positional argument, so the
          decorator also works on plain functions (call sites are unchanged).
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    if attempt > 0:
                        print(f"Retry attempt {attempt}/{max_retries} for {func.__name__}")
                    return func(*args, **kwargs)
                except Exception as e:  # pylint: disable=broad-except
                    if retry_exceptions is True:
                        should_retry = True
                    elif isinstance(retry_exceptions, (tuple, list)):
                        # isinstance() requires a tuple of types; lists crashed here.
                        should_retry = isinstance(e, tuple(retry_exceptions))
                    else:
                        should_retry = False
                    if should_retry and attempt < max_retries:
                        print(f"Error in {func.__name__}: {e}. Retrying in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        raise
            # Unreachable: the loop always returns or re-raises.
            raise RuntimeError(f"{func.__name__}: retry loop exited unexpectedly")

        return wrapper

    return decorator