init commit

This commit is contained in:
zyhe
2026-03-16 11:44:10 +00:00
commit 94384a93c9
552 changed files with 363038 additions and 0 deletions

16
nimbus/__init__.py Normal file
View File

@@ -0,0 +1,16 @@
import ray
from nimbus.utils.types import STAGE_PIPE
from .data_engine import DataEngine, DistPipeDataEngine
def run_data_engine(config, master_seed=None):
    """Build and run the data engine described by *config*.

    When the config contains a STAGE_PIPE section, ray is initialized and the
    distributed pipeline engine is used; otherwise the single-process engine
    runs the whole pipeline.
    """
    # Imported for its side effects (component registration); not used directly.
    import nimbus_extension  # noqa: F401 pylint: disable=unused-import

    if STAGE_PIPE not in config:
        engine = DataEngine(config, master_seed=master_seed)
    else:
        ray.init(num_gpus=1)
        engine = DistPipeDataEngine(config, master_seed=master_seed)
    engine.run()

View File

View File

@@ -0,0 +1,71 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import numpy as np
@dataclass
class C2W:
    """
    Represents a camera-to-world transformation matrix.

    Attributes:
        matrix (List[float]): A list of 16 floats representing the 4x4 transformation matrix in row-major order.
    """

    matrix: List[float]


@dataclass
class Camera:
    """
    Represents a single camera pose in the trajectory.

    Attributes:
        trajectory (List[C2W]): List of C2W transformations for this camera pose.
        intrinsic (Optional[List[float]]): 3x3 camera intrinsic matrix: [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
        extrinsic (Optional[List[float]]): 4x4 tobase_extrinsic matrix representing the camera mounting offset
            relative to the robot base (height + pitch).
        length (Optional[int]): Length of the trajectory in number of frames.
        depths (Optional[List[np.ndarray]]): List of depth images captured by this camera.
        rgbs (Optional[List[np.ndarray]]): List of RGB images captured by this camera.
        uv_tracks (Optional[Dict[str, Any]]): UV tracking data in the format
            {mesh_name: {"per_frame": list, "width": W, "height": H}}.
        uv_mesh_names (Optional[List[str]]): List of mesh names being tracked in the UV tracking data.
    """

    trajectory: List[C2W]
    # Fields that default to None are annotated Optional so the declared types
    # match the defaults (they previously claimed to be non-optional), and the
    # lowercase `list[...]` forms are unified with the `typing.List` style used
    # elsewhere in this module.
    intrinsic: Optional[List[float]] = None
    extrinsic: Optional[List[float]] = None
    length: Optional[int] = None
    depths: Optional[List[np.ndarray]] = None
    rgbs: Optional[List[np.ndarray]] = None
    uv_tracks: Optional[Dict[str, Any]] = None
    uv_mesh_names: Optional[List[str]] = None

    def __len__(self):
        """Return the trajectory length, validating consistency and caching on first use."""
        if self.length is not None:
            return self.length
        self._check_length()
        self.length = len(self.trajectory)
        return self.length

    def _check_length(self):
        """Raise ValueError if any per-frame payload disagrees with the trajectory length."""
        if self.depths is not None and len(self.depths) != len(self.trajectory):
            raise ValueError("Length of depths does not match length of trajectory")
        if self.rgbs is not None and len(self.rgbs) != len(self.trajectory):
            raise ValueError("Length of rgbs does not match length of trajectory")
        if self.uv_tracks is not None:
            for mesh_name, track_data in self.uv_tracks.items():
                if len(track_data["per_frame"]) != len(self.trajectory):
                    raise ValueError(f"Length of uv_tracks for mesh {mesh_name} does not match length of trajectory")

    def append_rgb(self, rgb_image: np.ndarray):
        """Append an RGB frame, creating the buffer on first use."""
        if self.rgbs is None:
            self.rgbs = []
        self.rgbs.append(rgb_image)

    def append_depth(self, depth_image: np.ndarray):
        """Append a depth frame, creating the buffer on first use."""
        if self.depths is None:
            self.depths = []
        self.depths.append(depth_image)

View File

@@ -0,0 +1,95 @@
import logging
import time
from abc import abstractmethod
from collections.abc import Iterator
from typing import Generic, TypeVar
T = TypeVar("T")


# pylint: disable=E0102
class Iterator(Iterator, Generic[T]):
    """Instrumented iterator base class.

    Subclasses implement ``_next``; this base wires it into the Python
    iteration protocol while accumulating timing statistics (per-call latency,
    initialization cost, per-frame compute/IO time, queue wait time and
    per-sequence time). The collected statistics are logged once the wrapped
    source raises ``StopIteration``.
    """

    def __init__(self, max_retry=3):
        # Counters are kept as floats so the logged output format stays uniform.
        self._next_calls = 0.0
        self._next_total_time = 0.0
        self._init_time_costs = 0.0
        self._init_times = 0
        self._frame_compute_time = 0.0
        self._frame_compute_frames = 0.0
        self._frame_io_time = 0.0
        self._frame_io_frames = 0.0
        self._wait_time = 0.0
        self._seq_num = 0.0
        self._seq_time = 0.0
        self.logger = logging.getLogger("de_logger")
        self.max_retry = max_retry
        self.retry_num = 0

    def __iter__(self):
        return self

    def __next__(self):
        t0 = time.time()
        try:
            item = self._next()
        except StopIteration:
            # End of stream: emit the accumulated statistics before propagating.
            self._log_statistics()
            raise
        self._next_calls += 1
        self._next_total_time += time.time() - t0
        return item

    def record_init_time(self, time_costs):
        """Accumulate one (re-)initialization duration."""
        self._init_times += 1
        self._init_time_costs += time_costs

    def collect_compute_frame_info(self, length, time_costs):
        """Account *length* computed frames taking *time_costs* seconds."""
        self._frame_compute_frames += length
        self._frame_compute_time += time_costs

    def collect_io_frame_info(self, length, time_costs):
        """Account *length* frames of IO taking *time_costs* seconds."""
        self._frame_io_frames += length
        self._frame_io_time += time_costs

    def collect_wait_time_info(self, time_costs):
        """Accumulate time spent blocked (e.g. on a downstream queue)."""
        self._wait_time += time_costs

    def collect_seq_info(self, length, time_costs):
        """Account *length* sequences produced in *time_costs* seconds."""
        self._seq_num += length
        self._seq_time += time_costs

    @abstractmethod
    def _next(self):
        """Produce the next item or raise StopIteration."""
        raise NotImplementedError("Subclasses should implement this method.")

    def _log_statistics(self):
        # Emit one summary line per non-empty statistics bucket.
        name = self.__class__.__name__
        log = self.logger.info
        log(f"{name}: Next method called {self._next_calls} times, total time: {self._next_total_time:.6f} seconds")
        if self._init_time_costs > 0:
            log(f"{name}: Init time: {self._init_time_costs:.6f} seconds, init {self._init_times} times")
        if self._frame_compute_time > 0:
            avg = self._frame_compute_time / self._frame_compute_frames
            log(
                f"{name}: compute frame num: {self._frame_compute_frames}, total time:"
                f" {self._frame_compute_time:.6f} seconds, average time: {avg:.6f} seconds per frame"
            )
        if self._frame_io_frames > 0:
            avg = self._frame_io_time / self._frame_io_frames
            log(
                f"{name}: io frame num: {self._frame_io_frames}, total time: {self._frame_io_time:.6f} seconds,"
                f" average time: {avg:.6f} seconds per frame"
            )
        if self._wait_time > 0:
            log(f"{name}: wait time: {self._wait_time:.6f} seconds")
        if self._seq_time > 0:
            avg = self._seq_time / self._seq_num
            log(
                f"{name}: seq num: {self._seq_num:.6f}, total time: {self._seq_time:.6f} seconds, average time:"
                f" {avg:.6f} seconds per sequence"
            )

View File

@@ -0,0 +1,119 @@
import os
import cv2
import imageio
import numpy as np
from nimbus.components.data.camera import Camera
class Observations:
    """
    Represents a single observation of a scene, which may include multiple camera trajectories and associated data.
    Each observation is identified by a unique name and index, and can contain multiple Camera items that capture
    different viewpoints or modalities of the same scene.
    Args:
        scene_name (str): The name of the scene associated with this observation.
        index (str): The index or ID of this observation within the scene.
        length (int): Optional total length of the observation. Calculated from camera trajectories if not provided.
        data (dict): Optional dictionary for storing additional arbitrary data, such as metadata or annotations.
    """
    def __init__(self, scene_name: str, index: str, length: int = None, data: dict = None):
        self.scene_name = scene_name
        # Unique identifier: "<scene_name>_<index>".
        self.obs_name = scene_name + "_" + index
        self.index = index
        # Camera objects contributing frames to this observation (see append_cam).
        self.cam_items = []
        # Total frame count; lazily derived in __len__ when not supplied.
        self.length = length
        self.data = data
    def __getstate__(self):
        """Return a plain copy of the instance dict for pickling."""
        state = self.__dict__.copy()
        return state
    def __setstate__(self, state):
        """Restore the instance dict from *state*."""
        self.__dict__.update(state)
    def append_cam(self, item: Camera):
        """Add one Camera to this observation."""
        self.cam_items.append(item)
    def __len__(self):
        """Return the total frame count, summing per-camera lengths and caching on first call."""
        if self.length is not None:
            return self.length
        self.length = 0
        for cam in self.cam_items:
            self.length += len(cam)
        return self.length
    def get_length(self):
        """Alias for len(self)."""
        return len(self)
    def flush_to_disk(self, path, video_fps=10):
        """Write all camera visualization data under <path>/trajectory_<index>.

        A single camera is written into the trajectory root; with multiple
        cameras each one gets its own camera_<idx>/ subdirectory.
        """
        path_to_save = os.path.join(path, "trajectory_" + self.index)
        print(f"obs {self.obs_name} try to save path in {path_to_save}")
        os.makedirs(path_to_save, exist_ok=True)
        # Single camera: save in root directory
        if len(self.cam_items) == 1:
            cam = self.cam_items[0]
            self._save_camera_data(path_to_save, cam, video_fps)
        # Multiple cameras: save in camera_0/, camera_1/, etc.
        else:
            for idx, cam in enumerate(self.cam_items):
                camera_dir = os.path.join(path_to_save, f"camera_{idx}")
                os.makedirs(camera_dir, exist_ok=True)
                self._save_camera_data(camera_dir, cam, video_fps)
    def _save_camera_data(self, save_dir, cam: Camera, video_fps):
        """Helper method to save camera visualization data (rgbs, depths) to a directory."""
        # Save RGB and depth images if available
        if cam.rgbs is not None and len(cam.rgbs) > 0:
            rgb_images_path = os.path.join(save_dir, "rgb/")
            os.makedirs(rgb_images_path, exist_ok=True)
            fps_path = os.path.join(save_dir, "fps.mp4")
            for idx, rgb_item in enumerate(cam.rgbs):
                rgb_filename = os.path.join(rgb_images_path, f"{idx}.jpg")
                # NOTE(review): cv2.imwrite expects BGR; COLOR_BGR2RGB and
                # COLOR_RGB2BGR are the same channel swap, so this is correct
                # iff cam.rgbs holds RGB frames -- confirm the upstream format.
                cv2.imwrite(rgb_filename, cv2.cvtColor(rgb_item, cv2.COLOR_BGR2RGB))
            imageio.mimwrite(fps_path, cam.rgbs, fps=video_fps)
        if cam.depths is not None and len(cam.depths) > 0:
            depth_images_path = os.path.join(save_dir, "depth/")
            os.makedirs(depth_images_path, exist_ok=True)
            depth_path = os.path.join(save_dir, "depth.mp4")
            # Create a copy for video (8-bit version)
            depth_video_frames = []
            for idx, depth_item in enumerate(cam.depths):
                depth_filename = os.path.join(depth_images_path, f"{idx}.png")
                cv2.imwrite(depth_filename, depth_item)
                # Keep the high byte for the 8-bit preview video; assumes 16-bit
                # integer depth values -- TODO confirm dtype.
                depth_video_frames.append((depth_item >> 8).astype(np.uint8))
            imageio.mimwrite(depth_path, depth_video_frames, fps=video_fps)
        # Save UV tracking visualizations if available
        if cam.uv_tracks is not None and cam.uv_mesh_names is not None and cam.rgbs is not None:
            num_frames = len(cam.rgbs)
            try:
                # Imported lazily: only required when UV tracks are present.
                from nimbus_extension.components.render.brpc_utils.point_tracking import (
                    make_uv_overlays_and_video,
                )
            except ImportError as e:
                raise ImportError(
                    "UV tracking visualization requires nimbus_extension. "
                    "Please add `import nimbus_extension` before running the pipeline."
                ) from e
            make_uv_overlays_and_video(
                cam.rgbs,
                cam.uv_tracks,
                cam.uv_mesh_names,
                start_frame=0,
                end_frame=num_frames,
                fps=video_fps,
                path_to_save=save_dir,
            )

View File

@@ -0,0 +1,39 @@
import pickle
class Package:
    """
    Serializable envelope passed between pipeline stages.

    Args:
        data: The payload, which can be of any type.
        task_id (int): ID of the task this package belongs to.
        task_name (str): Name of the task this package belongs to.
        stop_sig (bool): When True, the package tells the pipeline to stop.
    """

    def __init__(self, data, task_id: int = -1, task_name: str = None, stop_sig: bool = False):
        self.data = data
        self.task_id = task_id
        self.task_name = task_name
        self.stop_sig = stop_sig
        # Tracks whether self.data currently holds pickled bytes.
        self.is_ser = False

    def serialize(self):
        """Pickle the payload in place; calling twice is a programming error."""
        assert self.is_ser is False, "data is already serialized"
        self.data = pickle.dumps(self.data)
        self.is_ser = True

    def deserialize(self):
        """Unpickle the payload in place.

        NOTE: pickle must only be used on trusted, in-process data.
        """
        assert self.is_ser is True, "data is already deserialized"
        self.data = pickle.loads(self.data)
        self.is_ser = False

    def is_serialized(self):
        """Return True while the payload is held as pickled bytes."""
        return self.is_ser

    def get_data(self):
        """Return the payload (pickled bytes while serialized)."""
        return self.data

    def should_stop(self):
        """Return True when this package is a stop signal."""
        return self.stop_sig is True

View File

@@ -0,0 +1,69 @@
class Scene:
    """
    Holds the workflow context and execution state of one loaded scene.

    Args:
        name (str): The name of the scene or task.
        pcd: Point cloud data associated with the scene.
        scale (float): Scale factor for the scene geometry.
        materials: Material data for the scene.
        textures: Texture data for the scene.
        floor_heights: Floor height information for the scene.
        wf: The task workflow instance managing this scene.
        task_id (int): The index of the current task within the workflow.
        task_exec_num (int): The execution count for the current task, used for task repetition tracking.
        simulation_app: The Isaac Sim SimulationApp instance.
    """

    def __init__(
        self,
        name: str = None,
        pcd=None,
        scale: float = 1.0,
        materials=None,
        textures=None,
        floor_heights=None,
        wf=None,
        task_id: int = None,
        task_exec_num: int = 1,
        simulation_app=None,
    ):
        # Identity and geometry.
        self.name = name
        self.pcd = pcd
        self.scale = scale
        # Appearance assets.
        self.materials = materials
        self.textures = textures
        self.floor_heights = floor_heights
        # Workflow / execution context.
        self.wf = wf
        self.task_id = task_id
        self.task_exec_num = task_exec_num
        self.simulation_app = simulation_app
        # Mutable planning state.
        self.plan_info = None
        self.generate_success = False

    def __getstate__(self):
        # The point cloud is excluded from pickling; receivers must reload it.
        state = dict(self.__dict__)
        state.pop("pcd")
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # pcd was stripped by __getstate__, so restore the attribute as empty.
        self.pcd = None

    def add_plan_info(self, plan_info):
        """Attach planner output for this scene."""
        self.plan_info = plan_info

    def flush_to_disk(self, path):
        """Persistence hook; Scene currently keeps no on-disk representation."""

    def load_from_disk(self, path):
        """Persistence hook; Scene currently keeps no on-disk representation."""

    def update_generate_status(self, success):
        """Record whether the latest generation attempt succeeded."""
        self.generate_success = success

    def get_generate_status(self):
        """Return the success flag of the latest generation attempt."""
        return self.generate_success

    def update_task_exec_num(self, num):
        """Set the execution counter for the current task."""
        self.task_exec_num = num

View File

@@ -0,0 +1,145 @@
import json
import os
import numpy as np
import open3d as o3d
from nimbus.components.data.camera import C2W, Camera
class Sequence:
    """
    Represents a camera trajectory sequence with associated metadata.

    Args:
        scene_name (str): The name of the scene (e.g., room identifier).
        index (str): The index or ID of this sequence within the scene.
        length (int): Optional explicit sequence length. Calculated from camera trajectories if not provided.
        data (dict): Optional additional arbitrary data associated with the sequence.
    """

    def __init__(self, scene_name: str, index: str, length: int = None, data: dict = None):
        self.scene_name = scene_name
        self.seq_name = scene_name + "_" + index
        self.index = index
        self.cam_items: list[Camera] = []
        # Optional open3d point cloud describing the planned path; set via update_pcd.
        self.path_pcd = None
        self.length = length
        self.data = data

    def __getstate__(self):
        state = self.__dict__.copy()
        # open3d point clouds are not picklable: serialize the geometry to bytes
        # and stash the colors separately ("mem::xyz" keeps coordinates only).
        # BUGFIX: guard against path_pcd being unset -- it stays None until
        # update_pcd() is called, and the previous unconditional attribute
        # access made pickling crash for any sequence without a path cloud.
        if state["path_pcd"] is not None:
            state["path_pcd_color"] = np.asarray(state["path_pcd"].colors)
            state["path_pcd"] = o3d.io.write_point_cloud_to_bytes(state["path_pcd"], "mem::xyz")
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Mirror __getstate__: only rebuild the point cloud when one was shipped.
        if state.get("path_pcd") is not None:
            self.path_pcd = o3d.io.read_point_cloud_from_bytes(state["path_pcd"], "mem::xyz")
            self.path_pcd.colors = o3d.utility.Vector3dVector(state["path_pcd_color"])

    def __len__(self):
        """Return the total frame count, summing per-camera lengths and caching on first call."""
        if self.length is not None:
            return self.length
        self.length = 0
        for cam in self.cam_items:
            self.length += len(cam)
        return self.length

    def append_cam(self, item: Camera):
        """Add one Camera to this sequence."""
        self.cam_items.append(item)

    def update_pcd(self, path_pcd):
        """Attach the path point cloud for this sequence."""
        self.path_pcd = path_pcd

    def get_length(self):
        """Alias for len(self)."""
        return len(self)

    @staticmethod
    def _camera_save_dict(cam: Camera) -> dict:
        """Build the JSON-serializable description of one camera."""
        return {
            "camera_intrinsic": cam.intrinsic,
            "camera_extrinsic": cam.extrinsic,
            "camera_trajectory": [t.matrix for t in cam.trajectory],
        }

    def _write_camera_json(self, cam: Camera, out_dir):
        """Write one camera's data.json into *out_dir*."""
        traj_path = os.path.join(out_dir, "data.json")
        json_object = json.dumps(self._camera_save_dict(cam), indent=4)
        with open(traj_path, "w", encoding="utf-8") as outfile:
            outfile.write(json_object)

    def flush_to_disk(self, path):
        """Persist the sequence under <path>/trajectory_<index>.

        Single-camera sequences store data.json in the trajectory root;
        multi-camera sequences use camera_0/, camera_1/, ... subdirectories.
        """
        path_to_save = os.path.join(path, "trajectory_" + self.index)
        print(f"seq {self.seq_name} try to save path in {path_to_save}")
        os.makedirs(path_to_save, exist_ok=True)
        if self.path_pcd is not None:
            pcd_path = os.path.join(path_to_save, "path.ply")
            o3d.io.write_point_cloud(pcd_path, self.path_pcd)
        # Single camera: save in root directory
        if len(self.cam_items) == 1:
            self._write_camera_json(self.cam_items[0], path_to_save)
        # Multiple cameras: save in camera_0/, camera_1/, etc.
        else:
            for idx, cam in enumerate(self.cam_items):
                camera_dir = os.path.join(path_to_save, f"camera_{idx}")
                os.makedirs(camera_dir, exist_ok=True)
                self._write_camera_json(cam, camera_dir)

    @staticmethod
    def _camera_from_json(json_path) -> Camera:
        """Load one Camera from a data.json file written by flush_to_disk."""
        with open(json_path, "r", encoding="utf-8") as infile:
            data = json.load(infile)
        trajectory = [C2W(matrix=m) for m in data["camera_trajectory"]]
        return Camera(
            trajectory=trajectory,
            intrinsic=data.get("camera_intrinsic"),
            extrinsic=data.get("camera_extrinsic"),
        )

    def load_from_disk(self, path):
        """Restore the sequence saved by flush_to_disk from *path*.

        Raises:
            AssertionError: if no camera data is found under *path*.
        """
        print(f"seq {self.seq_name} try to load path from {path}")
        pcd_path = os.path.join(path, "path.ply")
        if os.path.exists(pcd_path):
            self.path_pcd = o3d.io.read_point_cloud(pcd_path)
        # Clear existing camera items
        self.cam_items = []
        # Check if single camera format (data.json in root)
        traj_path = os.path.join(path, "data.json")
        if os.path.exists(traj_path):
            self.cam_items.append(self._camera_from_json(traj_path))
        else:
            # Multiple camera format (camera_0/, camera_1/, etc.)
            idx = 0
            while True:
                camera_json = os.path.join(path, f"camera_{idx}", "data.json")
                if not os.path.exists(camera_json):
                    break
                self.cam_items.append(self._camera_from_json(camera_json))
                idx += 1
        assert len(self.cam_items) > 0, f"No camera data found in {path}"

View File

@@ -0,0 +1,7 @@
from nimbus.components.data.iterator import Iterator
# Global registry mapping de-dumper type names to their Iterator classes.
dedumper_dict = {}
def register(type_name: str, cls: Iterator):
    """Register *cls* as the de-dumper implementation for *type_name*."""
    dedumper_dict[type_name] = cls

View File

@@ -0,0 +1,7 @@
from .base_dumper import BaseDumper
# Global registry mapping dumper type names to their BaseDumper classes.
dumper_dict = {}
def register(type_name: str, cls: BaseDumper):
    """Register *cls* as the dumper implementation for *type_name*."""
    dumper_dict[type_name] = cls

View File

@@ -0,0 +1,82 @@
import time
from abc import abstractmethod
from pympler import asizeof
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.package import Package
from nimbus.utils.utils import unpack_iter_data
class BaseDumper(Iterator):
    """Pipeline stage that dumps generated (scene, seq, obs) items and, in
    pipeline mode, forwards the dumped object downstream as a serialized Package.

    Args:
        data_iter: Upstream iterator yielding items unpackable via unpack_iter_data.
        output_queue: Queue for pipeline mode; when None, dump() results stay local.
        max_queue_num (int): Back-pressure limit; this stage sleeps while the
            queue holds at least this many packages.
    """
    def __init__(self, data_iter, output_queue, max_queue_num=1):
        super().__init__()
        self.data_iter = data_iter
        # Scene currently being processed; used to detect scene changes.
        self.scene = None
        self.output_queue = output_queue
        # Per-scene bookkeeping, reset whenever the scene changes.
        self.total_case = 0
        self.success_case = 0
        self.max_queue_num = max_queue_num
    def __iter__(self):
        return self
    def _next(self):
        # NOTE(review): the success path falls through without a return, so this
        # stage always yields None -- consumers appear to drive it for side effects.
        try:
            data = next(self.data_iter)
            scene, seq, obs = unpack_iter_data(data)
            self.total_case += 1
            if scene is not None:
                # A new (task_id, name, task_exec_num) combination marks a scene
                # change: log the finished scene's success rate, reset counters.
                if self.scene is not None and (
                    scene.task_id != self.scene.task_id
                    or scene.name != self.scene.name
                    or scene.task_exec_num != self.scene.task_exec_num
                ):
                    self.logger.info(
                        f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                    )
                    # Starts at 1 because the current item was already counted above.
                    self.total_case = 1
                    self.success_case = 0
                self.scene = scene
            if obs is None and seq is None:
                # Upstream produced nothing for this item: count a failure and skip.
                self.logger.info(f"generate failed, skip once! success rate: {self.success_case}/{self.total_case}")
                if self.scene is not None:
                    self.scene.update_generate_status(success=False)
                return None
            io_start_time = time.time()
            if self.output_queue is not None:
                obj = self.dump(seq, obs)
                pack = Package(obj, task_id=scene.task_id, task_name=scene.name)
                pack.serialize()
                # Back-pressure: block while the downstream queue is full.
                wait_time = time.time()
                while self.output_queue.qsize() >= self.max_queue_num:
                    time.sleep(1)
                end_time = time.time()
                self.collect_wait_time_info(end_time - wait_time)
                st = time.time()
                self.output_queue.put(pack)
                ed = time.time()
                # NOTE(review): asizeof walks the whole object graph -- this log line is expensive.
                self.logger.info(f"put time: {ed - st}, data size: {asizeof.asizeof(obj)}")
            else:
                obj = self.dump(seq, obs)
            self.success_case += 1
            self.scene.update_generate_status(success=True)
            self.collect_seq_info(1, time.time() - io_start_time)
        except StopIteration:
            # Upstream exhausted: send the stop signal downstream and log final stats.
            if self.output_queue is not None:
                pack = Package(None, stop_sig=True)
                self.output_queue.put(pack)
            if self.scene is not None:
                self.logger.info(
                    f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                )
            raise StopIteration("no data")
        except Exception as e:
            self.logger.exception(f"Error during data dumping: {e}")
            raise e
    @abstractmethod
    def dump(self, seq, obs):
        """Persist/convert (seq, obs); return the object to forward downstream."""
        raise NotImplementedError("This method should be overridden by subclasses")

View File

@@ -0,0 +1,16 @@
# flake8: noqa: F401
# pylint: disable=C0413
from .base_randomizer import LayoutRandomizer
from .base_scene_loader import SceneLoader
# Global registries mapping config type names to scene-loader and
# layout-randomizer implementations.
scene_loader_dict = {}
layout_randomizer_dict = {}
def register_loader(type_name: str, cls: SceneLoader):
    """Register *cls* as the scene-loader implementation for *type_name*."""
    scene_loader_dict[type_name] = cls
def register_randomizer(type_name: str, cls: LayoutRandomizer):
    """Register *cls* as the layout-randomizer implementation for *type_name*."""
    layout_randomizer_dict[type_name] = cls

View File

@@ -0,0 +1,72 @@
import sys
import time
from abc import abstractmethod
from typing import Optional
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.scene import Scene
from nimbus.daemon.decorators import status_monitor
class LayoutRandomizer(Iterator):
    """
    Base class for layout randomization in a scene. This class defines the structure for randomizing scenes and
    tracking the randomization process. It manages the current scene, randomization count, and provides hooks for
    subclasses to implement specific randomization logic.

    Args:
        scene_iter (Iterator): An iterator that provides scenes to be randomized.
        random_num (int): The number of randomizations to perform for each scene before moving to the next one.
        strict_mode (bool): If True, the randomizer will check the generation status of the current scene and retry
                            randomization if it was not successful. This ensures that only successfully generated
                            scenes are counted towards the randomization limit.

    Raises:
        ValueError: If random_num is not a positive integer.
    """

    def __init__(self, scene_iter: Iterator, random_num: int, strict_mode: bool = False):
        super().__init__()
        if random_num <= 0:
            # BUGFIX: a non-positive budget previously made _next() fall
            # through both branches and silently yield None forever; fail fast.
            raise ValueError("random_num must be a positive integer")
        self.scene_iter = scene_iter
        self.random_num = random_num
        self.strict_mode = strict_mode
        # Start past the budget so the first _next() fetches a scene.
        self.cur_index = sys.maxsize
        self.scene: Optional[Scene] = None

    def reset(self, scene):
        """Make *scene* current and restart its randomization counter."""
        self.cur_index = 0
        self.scene = scene

    def _fetch_next_scene(self):
        # Propagates StopIteration when the upstream iterator is exhausted.
        scene = next(self.scene_iter)
        self.reset(scene)

    @status_monitor()
    def _randomize_with_status(self, scene) -> Scene:
        # FIX: previously ignored its argument and always read self.scene;
        # use the argument so the wrapper behaves as its signature advertises
        # (all call sites pass self.scene, so behavior is unchanged).
        return self.randomize_scene(scene)

    def _next(self) -> Scene:
        try:
            if self.strict_mode and self.scene is not None:
                # Retry the current scene (without consuming budget) until it
                # has produced at least one successful generation.
                if not self.scene.get_generate_status():
                    self.logger.info("strict_mode is open, retry the randomization to generate sequence.")
                    st = time.time()
                    scene = self._randomize_with_status(self.scene)
                    self.collect_seq_info(1, time.time() - st)
                    return scene
            if self.cur_index >= self.random_num:
                self._fetch_next_scene()
            if self.cur_index < self.random_num:
                st = time.time()
                scene = self._randomize_with_status(self.scene)
                self.collect_seq_info(1, time.time() - st)
                self.cur_index += 1
                return scene
        except StopIteration:
            raise StopIteration("No more scenes to randomize.")
        except Exception as e:
            self.logger.exception(f"Error during scene idx {self.cur_index} randomization: {e}")
            self.cur_index += 1
            raise e

    @abstractmethod
    def randomize_scene(self, scene) -> Scene:
        """Return a randomized variant of *scene*; implemented by subclasses."""
        raise NotImplementedError("This method should be overridden by subclasses")

View File

@@ -0,0 +1,41 @@
from abc import abstractmethod
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.scene import Scene
class SceneLoader(Iterator):
    """
    Base class for scene loading in a simulation environment. Subclasses
    implement load_asset(); this base wires it into the instrumented iterator
    protocol and standardizes error logging while tracking the loading process.

    Args:
        pack_iter (Iterator): An iterator that provides packages containing scene information to be loaded.
    """

    def __init__(self, pack_iter):
        super().__init__()
        self.pack_iter = pack_iter

    def _next(self) -> Scene:
        try:
            return self.load_asset()
        except StopIteration:
            raise StopIteration("No more scenes to load.")
        except Exception as exc:
            self.logger.exception(f"Error during scene loading: {exc}")
            raise exc

    @abstractmethod
    def load_asset(self) -> Scene:
        """
        Abstract method to load and initialize a scene.

        Subclasses must implement this method to define the specific logic for
        creating and configuring a scene object based on the current state of
        the iterator.

        Returns:
            Scene: A fully initialized Scene object.
        """
        raise NotImplementedError("This method must be implemented by subclasses")

View File

@@ -0,0 +1,7 @@
from nimbus.components.data.iterator import Iterator
# Global registry mapping plan-with-render type names to Iterator classes.
plan_with_render_dict = {}
def register(type_name: str, cls: Iterator):
    """Register *cls* as the plan-with-render implementation for *type_name*."""
    plan_with_render_dict[type_name] = cls

View File

@@ -0,0 +1,7 @@
from .base_seq_planner import SequencePlanner
# Global registry mapping sequence-planner type names to SequencePlanner classes.
seq_planner_dict = {}
def register(type_name: str, cls: SequencePlanner):
    """Register *cls* as the sequence-planner implementation for *type_name*."""
    seq_planner_dict[type_name] = cls

View File

@@ -0,0 +1,102 @@
import sys
import time
from abc import abstractmethod
from typing import Optional
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
from nimbus.daemon.decorators import status_monitor
from nimbus.utils.flags import is_debug_mode
from nimbus.utils.types import ARGS, TYPE
from .planner import path_planner_dict
class SequencePlanner(Iterator):
    """
    A base class for sequence planning in a simulation environment. This class defines the structure for generating
    sequences based on scenes and tracking the planning process. It manages the current scene, episode count
    and provides hooks for subclasses to implement specific sequence generation logic.
    Args:
        scene_iter (Iterator): An iterator that provides scenes to be processed for sequence planning.
        planner_cfg (dict): A dictionary containing configuration parameters for the planner,
                            such as the type of planner to use and its arguments.
        episodes (int): The number of episodes to generate for each scene before moving to the next one. Default is 1.
    """
    def __init__(self, scene_iter: Iterator[Scene], planner_cfg: dict, episodes: int = 1):
        super().__init__()
        self.scene_iter = scene_iter
        self.planner_cfg = planner_cfg
        self.episodes = episodes
        # Start beyond the budget so the first _next() pulls a fresh scene.
        self.current_episode = sys.maxsize
        self.scene: Optional[Scene] = None
        # NOTE(review): self.planner is first assigned in _initialize(), which runs
        # via initialize() only after a scene has been fetched -- not here.
    @status_monitor()
    def _plan_with_status(self) -> Optional[Sequence]:
        # Thin wrapper so the daemon status monitor can observe planning.
        seq = self.generate_sequence()
        return seq
    def _next(self) -> tuple[Scene, Sequence]:
        try:
            # Fetch a new scene once the per-scene episode budget is spent.
            if self.scene is None or self.current_episode >= self.episodes:
                try:
                    self.scene = next(self.scene_iter)
                    self.current_episode = 0
                    if self.scene is None:
                        return None, None
                    self.initialize(self.scene)
                except StopIteration:
                    raise StopIteration("No more scene to process.")
                except Exception as e:
                    self.logger.exception(f"Error loading next scene: {e}")
                    if is_debug_mode():
                        raise e
                    # Force a fresh scene fetch on the next call.
                    self.current_episode = sys.maxsize
                    return None, None
            # Retry generation until a sequence is produced or the budget runs out.
            while True:
                compute_start_time = time.time()
                seq = self._plan_with_status()
                compute_end_time = time.time()
                self.current_episode += 1
                if seq is not None:
                    self.collect_compute_frame_info(seq.get_length(), compute_end_time - compute_start_time)
                    return self.scene, seq
                if self.current_episode >= self.episodes:
                    return self.scene, None
                self.logger.info(f"Generate seq failed and retry. Current episode id is {self.current_episode}")
        except StopIteration:
            raise StopIteration("No more scene to process.")
        except Exception as e:
            scene_name = getattr(self.scene, "name", "<unknown>")
            self.logger.exception(
                f"Error during idx {self.current_episode} sequence generation for scene {scene_name}: {e}"
            )
            if is_debug_mode():
                raise e
            # Outside debug mode a failed episode is consumed and reported as empty.
            self.current_episode += 1
            return self.scene, None
    @abstractmethod
    def generate_sequence(self) -> Optional[Sequence]:
        """Produce one Sequence for the current scene, or None on failure."""
        raise NotImplementedError("This method should be overridden by subclasses")
    def _initialize(self, scene):
        # Instantiate the configured path planner for this scene, if any.
        if self.planner_cfg is not None:
            self.logger.info(f"init {self.planner_cfg[TYPE]} planner in seq_planner")
            self.planner = path_planner_dict[self.planner_cfg[TYPE]](scene, **self.planner_cfg.get(ARGS, {}))
        else:
            self.planner = None
            self.logger.info("planner config is None in seq_planner and skip initialize")
    def initialize(self, scene):
        """Initialize per-scene planner state and record the time spent."""
        init_start_time = time.time()
        self._initialize(scene)
        self.record_init_time(time.time() - init_start_time)

View File

@@ -0,0 +1,5 @@
# Global registry mapping path-planner type names to their classes.
path_planner_dict = {}


def register(type_name: str, cls):
    """Register *cls* as the path-planner implementation for *type_name*."""
    path_planner_dict[type_name] = cls

View File

@@ -0,0 +1,7 @@
from .base_renderer import BaseRenderer
# Global registry mapping renderer type names to BaseRenderer classes.
renderer_dict = {}
def register(type_name: str, cls: BaseRenderer):
    """Register *cls* as the renderer implementation for *type_name*."""
    renderer_dict[type_name] = cls

View File

@@ -0,0 +1,80 @@
import time
from abc import abstractmethod
from typing import Optional
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
from nimbus.daemon.decorators import status_monitor
class BaseRenderer(Iterator):
    """
    Base class for renderers. Consumes (scene, sequence) pairs, re-initializes
    its rendering resources whenever the scene changes, and delegates
    observation generation to generate_obs() implemented by subclasses.

    Args:
        scene_seq_iter (Iterator): An iterator that provides pairs of scenes and sequences to be rendered. Each item
                                   from the iterator should be a tuple containing a scene and its corresponding sequence.
    """

    def __init__(self, scene_seq_iter: Iterator[tuple[Scene, Sequence]]):
        super().__init__()
        self.scene_seq_iter = scene_seq_iter
        self.scene: Optional[Scene] = None

    @status_monitor()
    def _generate_obs_with_status(self, seq) -> Optional[Observations]:
        t0 = time.time()
        obs = self.generate_obs(seq)
        t1 = time.time()
        if obs is not None:
            self.collect_compute_frame_info(len(obs), t1 - t0)
        return obs

    def _scene_changed(self, scene) -> bool:
        # A scene counts as changed when either its task id or its name differs.
        return scene.task_id != self.scene.task_id or scene.name != self.scene.name

    def _next(self):
        try:
            scene, seq = next(self.scene_seq_iter)
            if scene is not None:
                if self.scene is None:
                    self.reset(scene)
                elif self._scene_changed(scene):
                    self.logger.info(f"Scene changed: {self.scene.name} -> {scene.name}")
                    self.reset(scene)
            if seq is None:
                return scene, None, None
            obs = self._generate_obs_with_status(seq)
            return (scene, None, None) if obs is None else (scene, seq, obs)
        except StopIteration:
            raise StopIteration("No more sequences to process.")
        except Exception as exc:
            self.logger.exception(f"Error during rendering: {exc}")
            raise exc

    @abstractmethod
    def generate_obs(self, seq) -> Optional[Observations]:
        """Render *seq* into Observations, or return None on failure."""
        raise NotImplementedError("This method should be overridden by subclasses")

    @abstractmethod
    def _lazy_init(self):
        """Acquire rendering resources for the current scene."""
        raise NotImplementedError("This method should be overridden by subclasses")

    @abstractmethod
    def _close_resource(self):
        """Release rendering resources held for the previous scene."""
        raise NotImplementedError("This method should be overridden by subclasses")

    def reset(self, scene):
        """Swap in *scene*: tear down old resources and lazily re-initialize."""
        try:
            self.scene = scene
            self._close_resource()
            t0 = time.time()
            self._lazy_init()
            self.record_init_time(time.time() - t0)
        except Exception as exc:
            self.logger.exception(f"Error initializing renderer: {exc}")
            self.scene = None
            raise exc

View File

@@ -0,0 +1,7 @@
from .base_writer import BaseWriter
# Global registry mapping writer type names to BaseWriter classes.
writer_dict = {}
def register(type_name: str, cls: BaseWriter):
    """Register *cls* as the writer implementation for *type_name*."""
    writer_dict[type_name] = cls

View File

@@ -0,0 +1,163 @@
import time
from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor
from copy import copy
from nimbus.components.data.iterator import Iterator
from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
from nimbus.daemon import ComponentStatus, StatusReporter
from nimbus.utils.flags import is_debug_mode
from nimbus.utils.utils import unpack_iter_data
def run_batch(func, args):
    """Invoke *func* once per argument tuple in *args*, unpacking each tuple."""
    for packed in args:
        func(*packed)
class BaseWriter(Iterator):
    """
    A base class for writing generated sequences and observations to disk. This class defines the structure for
    writing data and tracking the writing process. It manages the current scene, success and total case counts,
    and provides hooks for subclasses to implement specific data writing logic. The writer supports both synchronous
    and asynchronous batch writing modes, allowing for efficient data handling in various scenarios.

    Args:
        data_iter (Iterator): An iterator that provides data to be written, typically containing scenes,
            sequences, and observations.
        seq_output_dir (str): The directory where generated sequences will be saved. Can be None
            if sequence output is not needed.
        obs_output_dir (str): The directory where generated observations will be saved. Can be None
            if observation output is not needed.
        batch_async (bool): If True, the writer will use asynchronous batch writing to improve performance
            when handling large amounts of data. Default is True.
        async_threshold (int): The maximum number of asynchronous write operations that can be in progress
            at the same time. If the threshold is reached, the writer will wait for the oldest operation
            to complete before starting a new one. Default is 1.
        batch_size (int): The number of data items to write in each batch when using asynchronous writing.
            Default is 2, and it will be capped at 8 to prevent potential issues with too many concurrent operations.
    """

    def __init__(
        self,
        data_iter: Iterator[tuple[Scene, Sequence, Observations]],
        seq_output_dir: str,
        obs_output_dir: str,
        batch_async: bool = True,
        async_threshold: int = 1,
        batch_size: int = 2,
    ):
        super().__init__()
        assert (
            seq_output_dir is not None or obs_output_dir is not None
        ), "At least one output directory must be provided"
        self.data_iter = data_iter
        self.seq_output_dir = seq_output_dir
        self.obs_output_dir = obs_output_dir
        self.scene = None  # scene currently being written; updated on every consumed item
        self.async_mode = batch_async
        # Cap the batch size at 8; the log below warns that larger batches risk hanging.
        self.batch_size = batch_size if batch_size <= 8 else 8
        # NOTE(review): self.logger is presumably provided by the Iterator base — confirm.
        if batch_async and batch_size > self.batch_size:
            self.logger.info("Batch size is larger than 8(probably cause program hang), batch size will be set to 8")
        self.async_threshold = async_threshold
        # Pool sized so that roughly 64 items can be in flight across batches.
        self.flush_executor = ThreadPoolExecutor(max_workers=max(1, 64 // self.batch_size))
        self.flush_threads = []  # outstanding Futures for in-flight batch flushes
        self.data_buffer = []  # pending (workflow_copy, scene_name, seq, obs) tuples
        self.logger.info(
            f"Batch Async Write Mode: {self.async_mode}, async threshold: {self.async_threshold}, batch size:"
            f" {self.batch_size}"
        )
        self.total_case = 0  # cases attempted for the current scene
        self.success_case = 0  # cases written successfully for the current scene
        self.last_scene_key = None  # (task_id, name, task_exec_num) of the previous item's scene
        self.status_reporter = StatusReporter(self.__class__.__name__)

    def _next(self):
        """Consume one (scene, seq, obs) item and write it (sync) or buffer it (async).

        Returns None for every regular item; raises StopIteration (after
        flushing any buffered data) once the upstream iterator is exhausted.
        """
        try:
            data = next(self.data_iter)
            scene, seq, obs = unpack_iter_data(data)
            # A scene is identified by (task_id, name, task_exec_num); a key
            # change means the previous scene finished generating.
            new_key = (scene.task_id, scene.name, scene.task_exec_num) if scene is not None else None
            self.scene = scene
            if new_key != self.last_scene_key:
                if self.scene is not None and self.last_scene_key is not None:
                    self.logger.info(
                        f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                    )
                # Reset per-scene counters for the new scene.
                self.success_case = 0
                self.total_case = 0
                self.last_scene_key = new_key
            if self.scene is None:
                return None
            self.total_case += 1
            self.status_reporter.update_status(ComponentStatus.RUNNING)
            if seq is None and obs is None:
                # Upstream failed to produce this case: record the failure and move on.
                self.logger.info(f"generate failed, skip once! success rate: {self.success_case}/{self.total_case}")
                self.scene.update_generate_status(success=False)
                return None
            scene_name = self.scene.name
            io_start_time = time.time()
            if self.async_mode:
                # Copy the workflow now: the scene object may mutate before the
                # background flush actually runs.
                cp_start_time = time.time()
                cp = copy(self.scene.wf)
                cp_end_time = time.time()
                if self.scene.wf is not None:
                    self.logger.info(f"Scene {scene_name} workflow copy time: {cp_end_time - cp_start_time:.2f}s")
                self.data_buffer.append((cp, scene_name, seq, obs))
                if len(self.data_buffer) >= self.batch_size:
                    # Drop finished futures, then enforce the in-flight limit by
                    # blocking on the oldest outstanding flush.
                    self.flush_threads = [t for t in self.flush_threads if not t.done()]
                    if len(self.flush_threads) >= self.async_threshold:
                        self.logger.info("Max async workers reached, waiting for the oldest thread to finish")
                        self.flush_threads[0].result()
                        self.flush_threads = self.flush_threads[1:]
                    to_flush_buffer = self.data_buffer.copy()
                    async_flush = self.flush_executor.submit(run_batch, self.flush_to_disk, to_flush_buffer)
                    if is_debug_mode():
                        async_flush.result()  # surface exceptions immediately in debug mode
                    self.flush_threads.append(async_flush)
                    self.data_buffer = []
                flush_length = len(obs) if obs is not None else len(seq)
            else:
                flush_length = self.flush_to_disk(self.scene.wf, scene_name, seq, obs)
            self.success_case += 1
            self.scene.update_generate_status(success=True)
            # NOTE(review): collect_io_frame_info is presumably inherited from Iterator — confirm.
            self.collect_io_frame_info(flush_length, time.time() - io_start_time)
            self.status_reporter.update_status(ComponentStatus.COMPLETED)
            return None
        except StopIteration:
            if self.async_mode:
                # Flush whatever is still buffered, then wait for every
                # outstanding background flush before finishing.
                if len(self.data_buffer) > 0:
                    async_flush = self.flush_executor.submit(run_batch, self.flush_to_disk, self.data_buffer)
                    self.flush_threads.append(async_flush)
                for thread in self.flush_threads:
                    thread.result()
            if self.scene is not None:
                self.logger.info(
                    f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                )
            raise StopIteration("no data")
        except Exception as e:
            self.logger.exception(f"Error during data writing: {e}")
            raise e

    def __del__(self):
        # Wait for in-flight flushes so buffered data is not lost at teardown.
        for thread in self.flush_threads:
            thread.result()
        self.logger.info(f"Writer {len(self.flush_threads)} threads closed")
        # Close the simulation app if it exists
        if self.scene is not None and self.scene.simulation_app is not None:
            self.logger.info("Closing simulation app")
            self.scene.simulation_app.close()

    @abstractmethod
    def flush_to_disk(self, task, scene_name, seq, obs):
        """Write one case to disk; subclasses must implement.

        In sync mode the return value is used as the flushed item count for IO
        accounting (see _next).
        """
        raise NotImplementedError("This method should be overridden by subclasses")

View File

@@ -0,0 +1,4 @@
# flake8: noqa: E401
from .status import ComponentStatus, StatusInfo
from .status_monitor import StatusMonitor
from .status_reporter import StatusReporter

View File

@@ -0,0 +1,24 @@
from functools import wraps
from nimbus.daemon import ComponentStatus, StatusReporter
def status_monitor(running_status=ComponentStatus.RUNNING, completed_status=ComponentStatus.COMPLETED):
    """Decorator that reports component status around a method call.

    Marks the owning object's ``status_reporter`` as *running_status* just
    before the wrapped method executes and as *completed_status* after it
    returns normally. If the method raises, the exception propagates unchanged
    and the status stays at *running_status* (the supervisor's timeout check
    will eventually notice a stuck component).

    Args:
        running_status: Status reported before the call.
        completed_status: Status reported after a successful call.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            # Lazily attach a reporter so undecorated classes need no extra setup.
            if not hasattr(self, "status_reporter"):
                self.status_reporter = StatusReporter(self.__class__.__name__)
            self.status_reporter.update_status(running_status)
            # Fix: the original wrapped this in ``except Exception as e: raise e``,
            # which only rewrote the traceback origin. Let exceptions propagate.
            result = func(self, *args, **kwargs)
            self.status_reporter.update_status(completed_status)
            return result

        return wrapper

    return decorator

21
nimbus/daemon/status.py Normal file
View File

@@ -0,0 +1,21 @@
import time
from dataclasses import dataclass, field
from enum import Enum
class ComponentStatus(Enum):
    """Lifecycle states a pipeline component reports to the StatusMonitor."""

    IDLE = "idle"  # created but not yet doing work
    READY = "ready"  # initialized and waiting for input
    RUNNING = "running"  # actively processing
    COMPLETED = "completed"  # finished the current unit of work
    TIMEOUT = "timeout"  # flagged by the monitor as stuck in one state too long
@dataclass
class StatusInfo:
    """Snapshot of a component's reported status and when it was reported."""

    component_id: str
    status: ComponentStatus
    last_update: float = field(default_factory=time.time)

    def get_status_duration(self) -> float:
        """Return the number of seconds elapsed since this status was reported."""
        now = time.time()
        return now - self.last_update

View File

@@ -0,0 +1,160 @@
import threading
import time
from typing import Dict, Optional

from .status import ComponentStatus, StatusInfo
class StatusMonitor:
    """Process-wide singleton that tracks component status and detects timeouts.

    Components (via StatusReporter) register StatusInfo snapshots; a supervisor
    periodically calls :meth:`check_and_update_timeouts` to flag components
    that have sat in one status longer than that status's threshold.
    """

    _instance = None
    _lock = threading.Lock()

    # Per-status timeout thresholds in seconds; ``inf`` means "never times out".
    DEFAULT_TIMEOUTS = {
        ComponentStatus.IDLE: 100,
        ComponentStatus.READY: float("inf"),
        ComponentStatus.RUNNING: 360,
        ComponentStatus.COMPLETED: float("inf"),
        ComponentStatus.TIMEOUT: float("inf"),
    }

    def __new__(cls):
        # Double-checked locking: concurrent first calls build one instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every construction of the singleton; guard so that
        # state is only initialized once.
        if not hasattr(self, "initialized"):
            self.components: Dict[str, StatusInfo] = {}
            self.status_timeouts = self.DEFAULT_TIMEOUTS.copy()
            self.initialized = True

    @classmethod
    def get_instance(cls):
        """Return the singleton instance (creating it on first use)."""
        return cls()

    def set_logger(self, logger):
        """Attach a logger; without one, _record falls back to print()."""
        self.logger = logger

    def set_status_timeout(self, status: ComponentStatus, timeout_seconds: float):
        """Override the timeout threshold for a single status."""
        self.status_timeouts[status] = timeout_seconds

    def set_component_timeouts(self, timeouts: Dict[str, float]):
        """Bulk-set timeout thresholds from a config mapping.

        Keys may be status names (case-insensitive strings) or ComponentStatus
        members; values are seconds (negative values mean "never time out").
        Invalid entries are logged and skipped rather than raising.
        """
        converted_timeouts = {}
        for status_name, timeout_value in timeouts.items():
            try:
                if isinstance(status_name, str):
                    status = ComponentStatus[status_name.upper()]
                elif isinstance(status_name, ComponentStatus):
                    status = status_name
                else:
                    self._record(
                        f"Warning: Invalid status type '{type(status_name)}' for status '{status_name}', skipping"
                    )
                    continue
                try:
                    timeout_value = float(timeout_value)
                    if timeout_value < 0:
                        timeout_value = float("inf")
                    converted_timeouts[status] = timeout_value
                    self._record(f"Set timeout for {status.value}: {timeout_value}s")
                except (ValueError, TypeError) as e:
                    self._record(
                        f"Warning: Invalid timeout value '{timeout_value}' for status '{status_name}': {e}, skipping"
                    )
                    continue
            except KeyError:
                self._record(
                    f"Warning: Unknown status '{status_name}', skipping. Available statuses:"
                    f" {[s.name for s in ComponentStatus]}"
                )
                continue
            except Exception as e:
                self._record(f"Error processing status '{status_name}': {e}, skipping")
                continue
        self.status_timeouts.update(converted_timeouts)

    def register_update(self, status_info: StatusInfo):
        """Record the latest status snapshot for a component."""
        self.components[status_info.component_id] = status_info

    def get_all_status(self) -> Dict[str, StatusInfo]:
        """Return a shallow copy of the component-id -> StatusInfo map."""
        return self.components.copy()

    def get_status(self, component_id: str) -> Optional[StatusInfo]:
        """Return the latest StatusInfo for *component_id*, or None if unknown."""
        return self.components.get(component_id)

    def get_timeout_components(self) -> Dict[str, StatusInfo]:
        """Return components currently marked TIMEOUT (without re-checking)."""
        return {
            component_id: status_info
            for component_id, status_info in self.components.items()
            if status_info.status == ComponentStatus.TIMEOUT
        }

    def get_components_length(self):
        """Return the number of registered components."""
        return len(self.components)

    def check_and_update_timeouts(self) -> Dict[str, StatusInfo]:
        """Flag components stuck past their status threshold.

        Components already in TIMEOUT are included in the result unchanged.

        Returns:
            Dict[str, StatusInfo]: every component currently considered timed out.
        """
        newly_timeout_components = {}
        components = self.get_all_status()
        for component_id, status_info in components.items():
            if status_info.status == ComponentStatus.TIMEOUT:
                newly_timeout_components[component_id] = status_info
                continue
            time_since_update = status_info.get_status_duration()
            timeout_threshold = self.status_timeouts.get(status_info.status, 300)
            self._record(
                f"[COMPONENT DETAIL] {component_id}: "
                f"Status={status_info.status}, "
                f"Duration={status_info.get_status_duration():.1f}s, "
                f"Threshold={timeout_threshold}s"
            )
            if time_since_update > timeout_threshold:
                self._record(
                    f"Component {component_id} timeout: {status_info.status.value} for {time_since_update:.1f}s"
                    f" (threshold: {timeout_threshold}s)"
                )
                status_info.status = ComponentStatus.TIMEOUT
                # Bug fix: the original assigned the *duration* to last_update,
                # corrupting the timestamp field; stamp the transition time.
                status_info.last_update = time.time()
                newly_timeout_components[component_id] = status_info
        return newly_timeout_components

    def clear(self):
        """Forget all registered components."""
        self.components.clear()
        self._record("Cleared all registered components.")

    def get_component_status_duration(self, component_id: str) -> Optional[float]:
        """Seconds since *component_id* last reported, or None if unknown."""
        status_info = self.components.get(component_id)
        if status_info:
            return status_info.get_status_duration()
        return None

    def get_all_status_with_duration(self) -> Dict[str, Dict]:
        """Return a diagnostic dict per component: status, duration, threshold, timestamp."""
        result = {}
        for comp_id, status_info in self.components.items():
            result[comp_id] = {
                "status": status_info.status,
                "duration": status_info.get_status_duration(),
                "timeout_threshold": self.status_timeouts.get(status_info.status, 300),
                "last_update": status_info.last_update,
            }
        return result

    def set_check_interval(self, interval_seconds: float):
        """Record the daemon polling interval (informational)."""
        self.check_interval = interval_seconds
        self._record(f"Set daemon check interval to {interval_seconds}s")

    def _record(self, info):
        # Log through the attached logger when available, otherwise stdout.
        if hasattr(self, "logger") and self.logger is not None:
            self.logger.info(f"[STATUS MONITOR]: {info}")
        else:
            print(f"[STATUS MONITOR]: {info}")

View File

@@ -0,0 +1,21 @@
import threading
import time
from .status import ComponentStatus, StatusInfo
from .status_monitor import StatusMonitor
class StatusReporter:
    """Thread-safe publisher of one component's status to the StatusMonitor."""

    def __init__(self, component_id: str):
        self.component_id = component_id
        self._status_info = StatusInfo(component_id, ComponentStatus.IDLE)
        self._lock = threading.Lock()

    def update_status(self, status: ComponentStatus):
        """Record *status* with a fresh timestamp and push it to the monitor."""
        with self._lock:
            info = StatusInfo(component_id=self.component_id, status=status, last_update=time.time())
            self._status_info = info
            StatusMonitor.get_instance().register_update(info)

    def get_status(self) -> StatusInfo:
        """Return the most recently recorded StatusInfo."""
        with self._lock:
            return self._status_info

66
nimbus/data_engine.py Normal file
View File

@@ -0,0 +1,66 @@
from time import time
from nimbus.dist_sim.head_node import HeadNode
from nimbus.scheduler.sches import gen_pipe, gen_scheduler
from nimbus.utils.logging import configure_logging
from nimbus.utils.random import set_all_seeds
from nimbus.utils.types import (
NAME,
SAFE_THRESHOLD,
STAGE_PIPE,
WORKER_SCHEDULE,
StageInput,
)
from nimbus.utils.utils import consume_stage
class DataEngine:
    """Single-process engine that runs all configured scheduler stages in order."""

    def __init__(self, config, master_seed=None):
        if master_seed is not None:
            set_all_seeds(int(master_seed))
        configure_logging(config[NAME], config=config)
        self._sche_list = gen_scheduler(config)
        self._stage_input = StageInput()

    def run(self):
        """Feed each stage's output into the next, then drain the final iterator."""
        result = self._stage_input
        for stage in self._sche_list:
            result = stage.run(result)
        self._stage_input = result
        consume_stage(result)
class DistPipeDataEngine:
    """Distributed engine that chains pipe stages through Ray-backed head nodes."""

    def __init__(self, config, master_seed=None):
        self._sche_list = gen_scheduler(config)
        self.config = config
        self._stage_input = StageInput()
        exp_name = config[NAME]
        self.logger = configure_logging(exp_name, config=config)
        seed = None if master_seed is None else int(master_seed)
        self.pipe_list = gen_pipe(config, self._sche_list, exp_name, master_seed=seed)
        self.head_nodes = {}

    def run(self):
        """Start one HeadNode per pipe, wiring each node's output queue into the next."""
        self.logger.info("[DistPipeDataEngine]: %s", self.pipe_list)
        started_at = time()
        pipe_cfg = self.config[STAGE_PIPE]
        worker_schedule = pipe_cfg.get(WORKER_SCHEDULE, False)
        upstream_queue = None
        upstream_workers = 0
        for idx, pipe in enumerate(self.pipe_list):
            node = HeadNode(
                upstream_queue,
                pipe,
                upstream_workers,
                pipe_cfg[SAFE_THRESHOLD],
                worker_schedule,
                self.logger,
                idx,
            )
            self.head_nodes[idx] = node
            node.run()
            # The next pipe consumes this pipe's results.
            upstream_queue = node.result_queue()
            upstream_workers = len(pipe)
        for node in self.head_nodes.values():
            node.wait_stop()
        self.logger.info("execution duration: %s", time() - started_at)

View File

View File

@@ -0,0 +1,201 @@
import traceback
from threading import Thread
from time import sleep, time
import ray
from ray.util.queue import Queue
from nimbus.components.data.package import Package
from nimbus.dist_sim.task_board import TaskBoard
from nimbus.scheduler.inner_pipe import PipeWorkerGroup
class HeadNode:
    """Drives one pipe's worker group: feeds tasks in, collects results, manages shutdown.

    A HeadNode bridges two adjacent pipes. It pulls Package objects from the
    upstream pipe's result queue (``data_queue``), registers them on a
    TaskBoard, and a dispatcher thread moves them into ``task_queue`` for this
    pipe's Ray workers. Stop signals from upstream workers are counted; once
    every upstream worker has stopped, one stop Package per local worker is
    queued so the pipe drains and shuts down cleanly.

    Args:
        data_queue: Upstream result queue, or None for the first pipe (whose
            workers generate their own input).
        workers: PipeWorkerGroup holding this pipe's Ray actors.
        pre_worker_num: Number of upstream workers (== expected stop signals).
        safe_threshold: Max task_queue backlog before the feeder throttles.
        worker_schedule: When True, spawn one extra local worker each time an
            upstream worker finishes (reusing the released resources).
        logger: Logger for progress/diagnostics.
        idx: Index of this pipe in the engine's pipe list (used in log prefixes).
    """

    def __init__(
        self, data_queue, workers: PipeWorkerGroup, pre_worker_num, safe_threshold, worker_schedule, logger, idx
    ):
        self.idx = idx
        self.data_queue = data_queue
        self.logger = logger
        self.worker_group = workers
        logger.info(f"workers: {list(workers.keys())}")
        self.pre_worker_num = pre_worker_num
        self.safe_threshold = safe_threshold
        self.worker_schedule = worker_schedule
        logger.info(f"safe_threshold: {self.safe_threshold}")
        logger.info(f"worker_schedule: {self.worker_schedule}")
        # The first pipe has no upstream queue, so its workers need no task queue.
        self.task_queue = Queue() if data_queue is not None else None
        self.output_queue = Queue()
        self.GEN_STOP_SIG = False
        self.task_board = TaskBoard()
        self.gen_thread = Thread(target=self.gen_tasks, args=())
        self.gen_thread.start()
        self.should_stop = False
        self.run_thread = None
        # Map runner ObjectRef to worker name for proper cleanup
        self.runner_to_worker = {}
        self.all_workers_spawned = False

    def gen_tasks(self):
        """Producer loop (runs in ``gen_thread``): move upstream packages onto the board.

        Counts stop signals from upstream workers; when all of them have
        stopped, queues one stop Package per local worker and exits.
        """
        self.logger.info(f"headnode: {self.idx}: =============start gen task=============")
        pre_worker_stop_num = 0
        while not self.GEN_STOP_SIG:
            if self.data_queue is None:
                # First pipe: nothing to forward.
                self.logger.info(f"headnode: {self.idx}: =============Gen Tasks stop==============")
                self.all_workers_spawned = True
                return
            if self.data_queue.empty():
                sleep(0)  # yield without delaying
                continue
            # Throttle while the local backlog is at the safety limit.
            if self.task_queue is not None and self.task_queue.size() >= self.safe_threshold:
                sleep(1)
                continue
            task = self.data_queue.get()
            assert isinstance(
                task, Package
            ), f"the transfered type of data should be Package type, but it is {type(task)}"
            if task.should_stop():
                pre_worker_stop_num += 1
                self.logger.info(
                    f"headnode: {self.idx}: Received stop signal from upstream worker"
                    f" ({pre_worker_stop_num}/{self.pre_worker_num})"
                )
                # Dynamic worker scheduling: spawn new worker when upstream worker finishes
                if self.worker_schedule:
                    self.logger.info(
                        f"headnode: {self.idx}: Worker schedule enabled, will spawn 1 new worker after resource release"
                    )
                    # Wait for upstream resources to be released by upstream HeadNode's wait_stop()
                    # Retry mechanism to handle resource release timing
                    max_retries = 30  # 30 * 2s = 60s max wait
                    retry_interval = 2
                    for retry in range(max_retries):
                        try:
                            self.logger.info(
                                f"headnode: {self.idx}: Attempting to spawn new worker (attempt"
                                f" {retry + 1}/{max_retries})..."
                            )
                            created_workers = self.worker_group.spawn(1)
                            if created_workers:
                                for worker_name, worker_bundle in created_workers:
                                    # Start the new worker
                                    runner = worker_bundle["worker"].run.remote(self.task_queue, self.output_queue)
                                    self.runner_to_worker[runner] = worker_name
                                    self.logger.info(
                                        f"headnode: {self.idx}: Successfully spawned and started new worker:"
                                        f" {worker_name}"
                                    )
                                sleep(5)
                            break  # Success, exit retry loop
                        except Exception as e:
                            if retry < max_retries - 1:
                                self.logger.warning(
                                    f"headnode: {self.idx}: Failed to spawn worker (attempt {retry + 1}), will retry in"
                                    f" {retry_interval}s: {e}"
                                )
                                sleep(retry_interval)
                            else:
                                self.logger.error(
                                    f"headnode: {self.idx}: Failed to spawn new worker after"
                                    f" {max_retries} attempts: {e}"
                                )
                                self.logger.error(traceback.format_exc())
                if pre_worker_stop_num == self.pre_worker_num:
                    # All upstream workers finished: send one stop Package per
                    # local worker so each drains and exits.
                    for _ in range(len(self.worker_group)):
                        self.logger.info(f"headnode: {self.idx}: get stop signal")
                        stop_pack = Package(None, stop_sig=True)
                        self.task_board.reg_task(stop_pack)
                    self.all_workers_spawned = True
                    return
            else:
                self.task_board.reg_task(task)
                # Bug fix: the original eagerly pulled a second package here via
                # data_queue.get_nowait() and registered it directly, bypassing
                # the should_stop() accounting above. A stop signal fetched on
                # that path was forwarded to workers as a normal task and never
                # counted, which could leave this head node waiting forever for
                # the final stop signal. The outer loop re-reads the queue
                # immediately, so the prefetch bought nothing; it was removed.
        self.logger.info("=============Gen Tasks stop==============")
        self.all_workers_spawned = True

    def result_queue(self):
        """Return the queue this pipe's workers write results into."""
        return self.output_queue

    def run(self):
        """Start every worker actor and the dispatcher thread feeding them."""
        self.logger.info(f"headnode: {self.idx}: ==============Running Head Node================")
        for worker_name, worker_bundle in self.worker_group.items():
            runner = worker_bundle["worker"].run.remote(self.task_queue, self.output_queue)
            self.runner_to_worker[runner] = worker_name
            sleep(5)  # stagger actor start-up

        def inner_run():
            # Dispatcher: drain the task board into the bounded task queue.
            while not self.should_stop:
                tasks = self.task_board.get_tasks(timeout=0.05)
                if len(tasks) == 0:
                    sleep(0)
                    continue
                while self.task_queue.size() >= self.safe_threshold and not self.should_stop:
                    sleep(1)
                for _, task in enumerate(tasks):
                    self.task_queue.put(task)

        self.run_thread = Thread(target=inner_run)
        self.run_thread.start()

    def sig_stop(self):
        """Signal the producer thread to exit and join it."""
        self.logger.info(f"headnode: {self.idx}: ============Gen Stop===============")
        self.GEN_STOP_SIG = True
        self.gen_thread.join()

    def wait_stop(self):
        """Block until every worker actor finishes, then tear down local threads."""
        if self.worker_schedule and self.idx != 0:
            # Dynamically spawned workers register their runners asynchronously;
            # wait (bounded) so none are missed in the drain loop below.
            self.logger.info(f"headnode: {self.idx}: Waiting for all worker spawning to complete...")
            timeout = 600  # 600 seconds timeout
            start_time = time()
            while not self.all_workers_spawned:
                if time() - start_time > timeout:
                    self.logger.warning(
                        f"headnode: {self.idx}: Timeout waiting for worker spawning completion after {timeout}s"
                    )
                    break
                sleep(0.1)
            if self.all_workers_spawned:
                self.logger.info(f"headnode: {self.idx}: All worker spawning completed, proceeding to wait for runners")
        remaining_runners = list(self.runner_to_worker.keys())
        for runner in remaining_runners:
            self.logger.info(f"headnode: {self.idx}: remaining runner include: {self.runner_to_worker[runner]}")
        while remaining_runners:
            ready, _ = ray.wait(remaining_runners, num_returns=len(remaining_runners), timeout=1.0)
            for finished_runner in ready:
                worker_name = self.runner_to_worker.get(finished_runner, "unknown")
                self.logger.info(f"headnode: {self.idx}: Worker {worker_name} finished")
                try:
                    ray.get(finished_runner)
                    self.logger.info(f"headnode: {self.idx}: Worker {worker_name} completed successfully")
                    self.worker_group.remove(worker_name, self.logger)
                except Exception as e:
                    self.logger.error(f"Worker {worker_name} failed, error stack:")
                    self.logger.error(e)
                    if worker_name in self.worker_group.keys():
                        self.worker_group.remove(worker_name, self.logger)
                remaining_runners.remove(finished_runner)
                self.runner_to_worker.pop(finished_runner, None)
            if not ready:
                sleep(1)
        self.logger.info(f"headnode: {self.idx}: ==============stop head================")
        self.should_stop = True
        if self.run_thread is not None:
            self.run_thread.join()
        self.sig_stop()

    def __del__(self):
        # Best-effort queue teardown; Ray queues hold actor resources.
        if self.task_queue is not None:
            self.task_queue.shutdown()
        self.output_queue.shutdown()

View File

@@ -0,0 +1,42 @@
import time
from threading import Lock
class Task:
    """Placeholder task wrapper; currently stateless.

    NOTE(review): ``update_state`` is a no-op — presumably a hook for future
    task-state tracking; confirm before relying on it.
    """

    def __init__(self):
        pass

    def update_state(self, state):
        # Intentionally a no-op for now.
        pass
class TaskBoard:
    """Thread-safe staging area for tasks handed from a producer to a dispatcher.

    Producers call :meth:`reg_task`; the dispatcher drains everything queued
    so far with :meth:`get_tasks`.
    """

    # Interval between polls while waiting for tasks in get_tasks().
    _POLL_INTERVAL = 0.001

    def __init__(self):
        self.tasks = []  # pending tasks awaiting dispatch
        self.flying_tasks = []  # reserved for in-flight tracking (unused so far)
        self.finished_tasks = []  # reserved for completion tracking (unused so far)
        self.task_cnt = 0  # total tasks ever registered
        self.task_lock = Lock()
        self.flying_task_lock = Lock()

    def reg_task(self, task):
        """Register a task for dispatch."""
        with self.task_lock:
            self.tasks.append(task)
            self.task_cnt += 1

    def get_tasks(self, timeout=0):
        """Drain and return all pending tasks, waiting up to *timeout* seconds.

        Returns an empty list if no task arrives before the deadline.
        """
        st_time = time.time()
        while True:
            # Bug fix: the original read self.tasks without the lock and
            # busy-waited with a bare ``pass``, pinning a CPU core while the
            # board was empty; check under the lock and sleep between polls.
            with self.task_lock:
                if self.tasks:
                    tasks = self.tasks
                    self.tasks = []
                    return tasks
            if time.time() - st_time > timeout:
                return []
            time.sleep(self._POLL_INTERVAL)

    def commit_task(self, tasks):
        raise NotImplementedError("commit_task not implemented")

    def finished(self):
        raise NotImplementedError("finished not implemented")

View File

View File

@@ -0,0 +1,277 @@
import math
import os
import threading
import time
import ray
from nimbus.daemon.status_monitor import StatusMonitor
from nimbus.scheduler.stages import DedumpStage, DumpStage
from nimbus.utils.logging import configure_logging
from nimbus.utils.random import set_all_seeds
from nimbus.utils.types import MONITOR_CHECK_INTERVAL, STATUS_TIMEOUTS, StageInput
from nimbus.utils.utils import init_env, pipe_consume_stage
def iter_to_obj(iter_obj):
    # Run the stage iterator via pipe_consume_stage and pair the result with a
    # constant ``True`` finish flag (this path always consumes to completion).
    # NOTE(review): pipe_consume_stage's exact return value is defined
    # elsewhere — confirm it drains the iterator fully.
    return pipe_consume_stage(iter_obj), True
def _consume_N(iter_obj, N=1):
print("consume: ", iter_obj)
results = []
finish = False
for _ in range(N):
try:
obj = next(iter_obj)
results.append(obj)
except StopIteration:
finish = True
return results, finish
def consume_N(stage_input):
    """Advance the iterators held by *stage_input* by one batch.

    NOTE(review): this helper looks inconsistent — verify call sites before use:
      * ``stage_input.Args`` is replaced by the consumed *list* returned by
        ``_consume_N`` (which yields ``(results, finish)``), not by a tuple of
        iterators matching the input shape.
      * Each ``Kwargs`` value becomes the full ``(results, finish)`` tuple and
        its finish flag is discarded from the returned ``finish``.
      * ``hasattr(stage_input, "Args")`` does not guard against
        ``stage_input.Args`` being None before indexing ``[0]``.
    """
    finish = False
    if hasattr(stage_input, "Args"):
        stage_input.Args, finish = _consume_N(stage_input.Args[0])
    if hasattr(stage_input, "Kwargs"):
        if stage_input.Kwargs is not None:
            stage_input.Kwargs = {key: _consume_N(value) for key, value in stage_input.Kwargs.items()}
    return stage_input, finish
class PipeWorkerGroup:
    """
    Manages a group of pipe workers and their supervisors.
    Supports dynamic worker spawning for worker_schedule feature.
    """

    def __init__(
        self,
        pipe_name,
        exp_name,
        pipe_num,
        stage_list,
        master_seed,
        supervisor_class,
        inner_pipe_class,
        initial_instances=0,
    ):
        # name -> {"worker": InnerPipe actor, "supervisor": Supervisor actor}
        self.workers = {}
        self._next_worker_idx = 0  # monotonically increasing; indices are never reused
        self.pipe_name = pipe_name
        self.exp_name = exp_name
        self.pipe_num = pipe_num
        self.stage_list = stage_list
        self.master_seed = master_seed
        self.supervisor_class = supervisor_class
        self.inner_pipe_class = inner_pipe_class
        if initial_instances > 0:
            self.spawn(initial_instances)

    def spawn(self, count):
        """
        Spawn new workers dynamically.
        Returns list of (name, bundle) tuples for created workers.
        """
        created = []
        for _ in range(count):
            name = f"p{self.pipe_num}_w{self._next_worker_idx}"
            # Derive a distinct, reproducible per-worker seed from the master seed.
            worker_seed = self.master_seed + self._next_worker_idx if self.master_seed is not None else None
            supervisor = self.supervisor_class.remote(name)
            pipe_actor = self.inner_pipe_class.remote(self.stage_list, name, supervisor, seed=worker_seed)
            # Block until the supervisor knows its worker before starting the watchdog.
            ray.get(supervisor.set_pipe.remote(pipe_actor))
            supervisor.run.remote()
            bundle = {"worker": pipe_actor, "supervisor": supervisor}
            self.workers[name] = bundle
            created.append((name, bundle))
            self._next_worker_idx += 1
            time.sleep(3)  # stagger actor creation
        if created:
            print(f"{self.pipe_name}: spawned {len(created)} workers - {[name for name, _ in created]}")
        return created

    def items(self):
        """Return items view of workers dictionary."""
        return self.workers.items()

    def values(self):
        """Return values view of workers dictionary."""
        return self.workers.values()

    def keys(self):
        """Return keys view of workers dictionary."""
        return self.workers.keys()

    def __len__(self):
        """Return number of workers in the group."""
        return len(self.workers)

    def __repr__(self):
        worker_names = list(self.workers.keys())
        return f"PipeWorkerGroup({worker_names})"

    def __getitem__(self, key):
        """Support dictionary-style access."""
        return self.workers[key]

    def remove(self, name, logger):
        """Remove a worker from the group, killing its actor and supervisor.

        Bug fix: the original indexed ``self.workers[name]`` before its
        membership check, so removing an unknown (or already-removed) name
        raised KeyError; unknown names are now logged and ignored.
        """
        bundle = self.workers.pop(name, None)
        if bundle is None:
            logger.info(f"worker {name} not found in group, skip remove")
            return
        ray.kill(bundle["worker"])
        logger.info(f"killed worker actor {name} to release GPU resouces")
        ray.kill(bundle["supervisor"])
        logger.info(f"Supervisor {name} killed successfully")
def make_pipe(pipe_name, exp_name, pipe_num, stage_list, dev, instance_num, total_processes, config, master_seed=None):
    """Create a PipeWorkerGroup of Ray actors (InnerPipe workers + Supervisors) for one pipe.

    Args:
        pipe_name: Human-readable name of this pipe (used in logs).
        exp_name: Experiment name used to configure per-actor logging.
        pipe_num: Index of this pipe within the engine's pipe list.
        stage_list: Scheduler stages this pipe's workers will run.
        dev: "gpu" to reserve a fractional GPU per worker, anything else for CPU-only.
        instance_num: Number of workers to spawn immediately.
        total_processes: Total worker count across all pipes; used to split GPUs.
        config: The STAGE_PIPE config section (monitor intervals, timeouts, ...).
        master_seed: Optional base seed; each worker derives its own seed from it.

    Returns:
        PipeWorkerGroup: the group managing the spawned workers and supervisors.
    """
    gpu_num = 0
    if dev == "gpu":
        # Share the cluster's GPUs evenly across all pipeline processes by
        # requesting a fractional GPU per actor.
        resources = ray.cluster_resources()
        total_gpus = resources.get("GPU", 0)
        assert total_gpus > 0, "not enough gpu resources"
        processes_per_gpu = math.ceil(total_processes / total_gpus)
        gpu_num = 1.0 / processes_per_gpu

    @ray.remote
    class Supervisor:
        """Watchdog actor: restarts its InnerPipe worker when components stall."""

        def __init__(self, name):
            self.name = "supervisor_" + name
            self.pipe_worker = None
            self.logger = configure_logging(exp_name, self.name)
            self.logger.info("Supervisor started")
            self.monitor = StatusMonitor.get_instance()
            self.monitor.set_logger(self.logger)
            self._last_status_check = 0.0
            self.check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)
            self.logger.info(f"Monitor check interval: {self.check_interval} seconds")
            if config.get(STATUS_TIMEOUTS, None) is not None:
                self.monitor.set_component_timeouts(config[STATUS_TIMEOUTS])

        def set_pipe(self, pipe_worker):
            # Handle to the InnerPipe actor this supervisor watches.
            self.logger.info("set pipe worker")
            self.pipe_worker = pipe_worker

        def set_queue(self, input_queue, output_queue):
            # NOTE(review): stored but never read in this file — presumably
            # for external wiring; confirm before removing.
            self.input_queue = input_queue
            self.output_queue = output_queue

        def _restart_worker(self):
            # no_restart=False lets Ray recreate the actor (InnerPipe is
            # declared with max_restarts=3).
            try:
                ray.kill(self.pipe_worker, no_restart=False)
                self.logger.info("trigger restart of the actor")
            except Exception as ke:
                self.logger.error(f"restart actor error: {ke}")

        def update_component_state(self, components_state):
            # Called remotely by InnerPipe to mirror its component statuses here.
            for _, state in components_state.items():
                self.monitor.register_update(state)

        def _start_daemon(self):
            # Watchdog loop (daemon thread): every check_interval seconds,
            # restart the worker if any component timed out, or if no component
            # has reported for 5 consecutive checks.
            miss_cnt = 0
            while True:
                now = time.time()
                if now - self._last_status_check >= self.check_interval:
                    try:
                        timeout_components = self.monitor.check_and_update_timeouts()
                        if len(timeout_components) > 0:
                            self.logger.warning(f"Components timeout: {timeout_components}, restart the pipe worker")
                            self._restart_worker()
                            self.monitor.clear()
                        else:
                            if self.monitor.get_components_length() == 0:
                                miss_cnt += 1
                                self.logger.info(f"No components timeout detected, miss count: {miss_cnt}")
                            if miss_cnt >= 5:
                                self.logger.info("No components detected for 5 consecutive checks, restart pipe worker")
                                self._restart_worker()
                                self.monitor.clear()
                                miss_cnt = 0
                    except Exception as e:
                        self.logger.error(f"Get components status failed: {e}")
                        self._restart_worker()
                        self.monitor.clear()
                    self._last_status_check = now
                time.sleep(1)

        def run(self):
            """Start the watchdog daemon thread; requires set_pipe() first."""
            assert self.pipe_worker is not None, "pipe worker is not set"
            thread = threading.Thread(target=self._start_daemon, daemon=True)
            thread.start()

    @ray.remote(num_gpus=gpu_num, max_restarts=3, max_task_retries=3)
    class InnerPipe:
        """Worker actor that runs this pipe's stage chain end-to-end."""

        def __init__(self, stage_list, name, supervisor, seed=None):
            if seed is not None:
                set_all_seeds(seed)
            self.stages = stage_list
            self.name = name
            self.supervisor = supervisor
            init_env()
            self.logger = configure_logging(exp_name, self.name)
            self.logger.info(f"Working on gpu {os.environ.get('CUDA_VISIBLE_DEVICES')}")
            # Loud banner when Ray restarted this actor after a crash.
            if ray.get_runtime_context().was_current_actor_reconstructed is True:
                msg = (
                    f"{'='*80}\n"
                    "!!! ATTENTION !!!\n"
                    f"!!! InnerPipe {name} WAS RECONSTRUCTED due to SYSTEM ERROR !!!\n"
                    "!!! Please CHECK LOGS in /tmp/ray/session_latest/logs/ for details !!!\n"
                    f"{'='*80}\n"
                )
                self.logger.info(msg)
            self.monitor = StatusMonitor.get_instance()
            self.monitor.set_logger(self.logger)
            self.monitor_check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)

        def _update_supervisor(self):
            # Reporter loop (daemon thread): periodically push local component
            # statuses to the supervisor actor.
            while True:
                for _ in range(self.monitor_check_interval):
                    time.sleep(1)
                components_status = self.monitor.get_all_status()
                ray.get(self.supervisor.update_component_state.remote(components_status))

        def run(self, input_queue, output_queue):
            """Build the stage chain and drain it; returns the finish flag."""
            self.logger.info(f"[InnerPipe stages]: {self.stages}")
            thread = threading.Thread(target=self._update_supervisor, daemon=True)
            thread.start()
            self.logger.info("Reporter started, start running pipe")
            mid_results = StageInput()
            # if input_queue is None:
            #     mid_results = StageInput()
            # else:
            #     mid_results = StageInput((input_queue,), {})
            for _, stage in enumerate(self.stages):
                # Dump/Dedump stages bridge to the inter-pipe Ray queues.
                if isinstance(stage, DumpStage):
                    mid_results = stage.run(mid_results, output_queue)
                elif isinstance(stage, DedumpStage):
                    mid_results = stage.run(mid_results, input_queue)
                else:
                    mid_results = stage.run(mid_results)
            result, finish = iter_to_obj(mid_results)
            self.logger.info("====================================")
            self.logger.info(f"result: {result}, finish: {finish}")
            self.logger.info("====================================")
            # The job is done: take the supervisor down so it cannot restart us.
            ray.kill(self.supervisor)
            self.logger.info("actor finished")
            return finish

    group = PipeWorkerGroup(
        pipe_name=pipe_name,
        exp_name=exp_name,
        pipe_num=pipe_num,
        stage_list=stage_list,
        master_seed=master_seed,
        supervisor_class=Supervisor,
        inner_pipe_class=InnerPipe,
        initial_instances=instance_num,
    )
    print(pipe_name, group)
    return group

View File

@@ -0,0 +1,115 @@
from abc import abstractmethod
from nimbus.components.dedump import dedumper_dict
from nimbus.components.dump import dumper_dict
from nimbus.components.load import layout_randomizer_dict, scene_loader_dict
from nimbus.components.plan_with_render import plan_with_render_dict
from nimbus.components.planner import seq_planner_dict
from nimbus.components.render import renderer_dict
from nimbus.components.store import writer_dict
from nimbus.utils.types import ARGS, PLANNER, TYPE
class Instruction:
    """Base class for one configured pipeline step.

    Subclasses select a concrete component factory from their registry dict
    using ``config[TYPE]`` and wire it up in :meth:`run`.
    """

    def __init__(self, config):
        self.config = config

    @abstractmethod
    def run(self, stage_input):
        raise NotImplementedError()
class LoadSceneInstruction(Instruction):
    """Builds the scene-loading iterator selected by ``config[TYPE]``."""

    def __init__(self, config):
        super().__init__(config)
        self.scene_iter = scene_loader_dict[self.config[TYPE]]

    def run(self, stage_input):
        # Bug fix: the original had a duplicated assignment
        # (``pack_iter = pack_iter = ...``); harmless but clearly a typo.
        pack_iter = stage_input.Args[0] if stage_input.Args is not None else None
        return self.scene_iter(pack_iter=pack_iter, **self.config.get(ARGS, {}))
class RandomizeLayoutInstruction(Instruction):
    """Wraps a layout randomizer that expands the incoming scene iterator."""

    def __init__(self, config):
        super().__init__(config)
        self.layout_randomlizer = layout_randomizer_dict[self.config[TYPE]]

    def run(self, stage_input):
        extra_kwargs = self.config.get(ARGS, {})
        return self.layout_randomlizer(stage_input.Args[0], **extra_kwargs)
class PlanPathInstruction(Instruction):
    """Wraps a sequence planner that turns scenes into planned sequences."""

    def __init__(self, config):
        super().__init__(config)
        self.seq_planner = seq_planner_dict[self.config[TYPE]]

    def run(self, stage_input):
        scene_iter = stage_input.Args[0]
        # Idiom: dict.get replaces the ``x if k in d else None`` conditional.
        planner_cfg = self.config.get(PLANNER)
        return self.seq_planner(scene_iter, planner_cfg, **self.config.get(ARGS, {}))
class RenderInstruction(Instruction):
    """Wraps a renderer that produces observations from planned scene sequences."""

    def __init__(self, config):
        super().__init__(config)
        self.renderer = renderer_dict[self.config[TYPE]]

    def run(self, stage_input):
        render_kwargs = self.config.get(ARGS, {})
        return self.renderer(stage_input.Args[0], **render_kwargs)
class PlanWithRenderInstruction(Instruction):
    """Wraps a combined plan-and-render component."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``
        # (same value — ``config`` is stored by the base class).
        self.plan_with_render = plan_with_render_dict[self.config[TYPE]]

    def run(self, stage_input):
        scene_iter = stage_input.Args[0]
        return self.plan_with_render(scene_iter, **self.config.get(ARGS, {}))
class StoreInstruction(Instruction):
    """Wraps a writer that persists sequences/observations to storage."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``.
        self.writer = writer_dict[self.config[TYPE]]

    def run(self, stage_input):
        seqs_obs_iter = stage_input.Args[0]
        return self.writer(seqs_obs_iter, **self.config.get(ARGS, {}))
class DumpInstruction(Instruction):
    """Wraps a dumper that forwards results onto an inter-pipe output queue."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``.
        self.dumper = dumper_dict[self.config[TYPE]]

    def run(self, stage_input, output_queue=None):
        seqs_obs_iter = stage_input.Args[0]
        return self.dumper(seqs_obs_iter, output_queue=output_queue, **self.config.get(ARGS, {}))
class DeDumpInstruction(Instruction):
    """Wraps a dedumper that pulls packages from an inter-pipe input queue."""

    def __init__(self, config):
        super().__init__(config)
        # Consistency: sibling instructions read ``self.config[TYPE]``.
        self.dedumper = dedumper_dict[self.config[TYPE]]

    def run(self, stage_input, input_queue=None):
        # ``stage_input`` is unused: the dedumper sources its data from the queue.
        return self.dedumper(input_queue=input_queue, **self.config.get(ARGS, {}))
class ComposeInstruction(Instruction):
    # Stub: no run() override yet, so invoking run() raises NotImplementedError
    # from the Instruction base class.
    def __init__(self, config):
        super().__init__(config)
class AnnotateDataInstruction(Instruction):
    # Stub: no run() override yet, so invoking run() raises NotImplementedError
    # from the Instruction base class.
    def __init__(self, config):
        super().__init__(config)

80
nimbus/scheduler/sches.py Normal file
View File

@@ -0,0 +1,80 @@
from nimbus.scheduler.inner_pipe import make_pipe
from nimbus.scheduler.stages import (
DedumpStage,
DumpStage,
LoadStage,
PlanStage,
PlanWithRenderStage,
RenderStage,
StoreStage,
)
from nimbus.utils.types import (
DEDUMP_STAGE,
DUMP_STAGE,
LOAD_STAGE,
PLAN_STAGE,
PLAN_WITH_RENDER_STAGE,
RENDER_STAGE,
STAGE_DEV,
STAGE_NUM,
STAGE_PIPE,
STORE_STAGE,
WORKER_NUM,
)
def gen_scheduler(config):
    """Build the ordered list of pipeline stages present in *config*.

    Stage order is fixed: load -> plan-with-render -> plan -> dump ->
    dedump -> render -> store; a stage is included only when its config
    section exists.
    """
    stage_specs = (
        (LOAD_STAGE, LoadStage),
        (PLAN_WITH_RENDER_STAGE, PlanWithRenderStage),
        (PLAN_STAGE, PlanStage),
        (DUMP_STAGE, DumpStage),
        (DEDUMP_STAGE, DedumpStage),
        (RENDER_STAGE, RenderStage),
        (STORE_STAGE, StoreStage),
    )
    return [stage_cls(config[key]) for key, stage_cls in stage_specs if key in config]
def gen_pipe(config, stage_list, exp_name, master_seed=None):
    """Split *stage_list* into inner pipes according to the stage_pipe config.

    Args:
        config: Full engine config; the STAGE_PIPE section drives the split.
        stage_list: Ordered stages from gen_scheduler; consumed front-to-back.
        exp_name: Experiment name used for naming/logging inside each pipe.
        master_seed: Optional seed forwarded to every pipe worker.

    Returns:
        list: One entry of pipe workers per inner pipe when STAGE_PIPE is
        configured; otherwise a single in-process pipe over all stages.
    """
    if STAGE_PIPE in config:
        pipe_stages_num = config[STAGE_PIPE][STAGE_NUM]
        pipe_stages_dev = config[STAGE_PIPE][STAGE_DEV]
        pipe_worker_num = config[STAGE_PIPE][WORKER_NUM]
        # Total worker processes across all pipes (needed by each pipe for
        # global scheduling); the original re-looped the config to add these up.
        total_processes = sum(pipe_worker_num)
        inner_pipes = []
        for pipe_num, (num, dev, worker_num) in enumerate(
            zip(pipe_stages_num, pipe_stages_dev, pipe_worker_num)
        ):
            # Take the next `num` stages for this pipe; the rest go to later pipes.
            stages, stage_list = stage_list[:num], stage_list[num:]
            print("===========================")
            print(f"inner stage num: {num}, device type: {dev}")
            print(f"stages: {stages}")
            print("===========================")
            pipe_name = "_".join(["pipe"] + [stage.__class__.__name__ for stage in stages])
            pipe_workers = make_pipe(
                pipe_name,
                exp_name,
                pipe_num,
                stages,
                dev,
                worker_num,
                total_processes,
                config[STAGE_PIPE],
                master_seed=master_seed,
            )
            inner_pipes.append(pipe_workers)
        return inner_pipes
    else:
        # NOTE(review): `make_pipe` is imported as a function from inner_pipe, so
        # `make_pipe.InnerPipe` looks wrong -- presumably InnerPipe should be
        # imported from nimbus.scheduler.inner_pipe directly. Confirm before use.
        return [make_pipe.InnerPipe(stage_list)]

137
nimbus/scheduler/stages.py Normal file
View File

@@ -0,0 +1,137 @@
from abc import abstractmethod
from nimbus.scheduler.instructions import (
DeDumpInstruction,
DumpInstruction,
Instruction,
LoadSceneInstruction,
PlanPathInstruction,
PlanWithRenderInstruction,
RandomizeLayoutInstruction,
RenderInstruction,
StoreInstruction,
)
from nimbus.utils.types import (
DEDUMPER,
DUMPER,
LAYOUT_RANDOM_GENERATOR,
PLAN_WITH_RENDER,
RENDERER,
SCENE_LOADER,
SEQ_PLANNER,
WRITER,
StageInput,
)
class Stage:
    """Base class for pipeline stages: holds config and an ordered instruction list."""

    def __init__(self, config):
        # Stage-level config section (one key per instruction the stage may build).
        self.config = config
        # Instructions are executed in order by the concrete stage's run().
        self.instructions: list[Instruction] = []
        # Queue used by dump-style stages to hand results to the next pipe.
        self.output_queue = None

    @abstractmethod
    def run(self, stage_input):
        """Execute this stage's instructions on *stage_input*; subclasses implement."""
        raise NotImplementedError()
class LoadStage(Stage):
    """Stage that loads scenes and optionally randomizes their layout."""

    def __init__(self, config):
        super().__init__(config)
        if SCENE_LOADER in config:
            loader = LoadSceneInstruction(config[SCENE_LOADER])
            self.instructions.append(loader)
        if LAYOUT_RANDOM_GENERATOR in config:
            randomizer = RandomizeLayoutInstruction(config[LAYOUT_RANDOM_GENERATOR])
            self.instructions.append(randomizer)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class PlanStage(Stage):
    """Stage that plans sequences for loaded scenes."""

    def __init__(self, config):
        super().__init__(config)
        if SEQ_PLANNER in config:
            planner = PlanPathInstruction(config[SEQ_PLANNER])
            self.instructions.append(planner)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class RenderStage(Stage):
    """Stage that renders observations from planned scene sequences."""

    def __init__(self, config):
        super().__init__(config)
        renderer = RenderInstruction(config[RENDERER])
        self.instructions.append(renderer)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class PlanWithRenderStage(Stage):
    """Stage that performs planning and rendering as one fused step."""

    def __init__(self, config):
        super().__init__(config)
        fused = PlanWithRenderInstruction(config[PLAN_WITH_RENDER])
        self.instructions.append(fused)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class StoreStage(Stage):
    """Stage that writes rendered results via a configured writer."""

    def __init__(self, config):
        super().__init__(config)
        if WRITER in config:
            writer = StoreInstruction(config[WRITER])
            self.instructions.append(writer)

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current),), {})
        return current
class DumpStage(Stage):
    """Stage that serializes results into an inter-pipe queue."""

    def __init__(self, config):
        super().__init__(config)
        dumper = DumpInstruction(config[DUMPER])
        self.instructions.append(dumper)

    def run(self, stage_input: StageInput, output_queue=None):
        """Chain each instruction, forwarding the optional output queue."""
        current = stage_input
        for instruction in self.instructions:
            current = StageInput((instruction.run(current, output_queue),), {})
        return current
class DedumpStage(Stage):
    """Stage that reads dumped items from a queue and re-runs load/plan steps.

    Mirrors the LoadStage/PlanStage instruction setup so a downstream pipe can
    reconstruct its input iterator from serialized items.
    """

    def __init__(self, config):
        super().__init__(config)
        # Queue supplying dumped items; set by run() when the pipe provides one.
        # Initialized here so the attribute always exists (was previously
        # assigned only inside run(), i.e. defined outside __init__).
        self.input_queue = None
        if DEDUMPER in config:
            self.instructions.append(DeDumpInstruction(config[DEDUMPER]))
        if SCENE_LOADER in config:
            self.instructions.append(LoadSceneInstruction(config[SCENE_LOADER]))
        if LAYOUT_RANDOM_GENERATOR in config:
            self.instructions.append(RandomizeLayoutInstruction(config[LAYOUT_RANDOM_GENERATOR]))
        if SEQ_PLANNER in config:
            self.instructions.append(PlanPathInstruction(config[SEQ_PLANNER]))

    def run(self, stage_input: StageInput, input_queue=None):
        """Run all instructions; only DeDumpInstruction receives the queue.

        Args:
            stage_input: Upstream StageInput (often empty for dedump pipes).
            input_queue: Optional queue of dumped items for DeDumpInstruction.

        Returns:
            StageInput wrapping the final instruction's iterator.
        """
        if input_queue is not None:
            self.input_queue = input_queue
        for instruction in self.instructions:
            if isinstance(instruction, DeDumpInstruction):
                result = instruction.run(stage_input, input_queue)
            else:
                result = instruction.run(stage_input)
            stage_input = StageInput((result,), {})
        return stage_input

20
nimbus/utils/config.py Normal file
View File

@@ -0,0 +1,20 @@
from omegaconf import OmegaConf
def load_config(*yaml_files, cli_args=None):
    """Merge the given YAML files (left to right) with CLI overrides.

    Later files and CLI args take precedence; interpolations are resolved
    in place before the merged config is returned.
    """
    file_confs = [OmegaConf.load(path) for path in yaml_files]
    overrides = OmegaConf.from_cli([] if cli_args is None else cli_args)
    merged = OmegaConf.merge(*file_confs, overrides)
    OmegaConf.resolve(merged)
    return merged
def config_to_primitive(config, resolve=True):
    """Convert an OmegaConf object to plain Python containers (dicts/lists)."""
    return OmegaConf.to_container(config, resolve=resolve)
def save_config(config, path):
    """Write *config* to *path* as YAML (UTF-8)."""
    with open(path, "w", encoding="utf-8") as fp:
        OmegaConf.save(config=config, f=fp)

View File

@@ -0,0 +1,138 @@
"""
Config Processor: Responsible for identifying, converting, and loading configuration files.
"""
from omegaconf import DictConfig, OmegaConf
from nimbus.utils.config import load_config
class ConfigProcessor:
    """Identifies, validates, and loads configuration files with CLI overrides."""

    def __init__(self):
        pass

    def _check_config_path_exists(self, config, path):
        """
        Check if a configuration path exists in the config object

        Args:
            config: OmegaConf config object
            path: String path like 'stage_pipe.worker_num' or 'load_stage.scene_loader.args.random_num'

        Returns:
            bool: Whether the path exists in the config
        """
        try:
            keys = path.split(".")
            current = config
            for key in keys:
                if isinstance(current, DictConfig):
                    if key not in current:
                        return False
                    current = current[key]
                else:
                    # Hit a leaf (or list) before consuming the whole path.
                    return False
            return True
        except Exception:
            # Any lookup failure (e.g. interpolation error) counts as "absent".
            return False

    def _validate_cli_args(self, config, cli_args):
        """
        Validate that all CLI arguments correspond to existing paths in the config

        Args:
            config: OmegaConf config object
            cli_args: List of command line arguments

        Raises:
            ValueError: If any CLI argument path doesn't exist in the config
        """
        if not cli_args:
            return
        # Clean up CLI args to remove -- prefix if present
        cleaned_cli_args = []
        for arg in cli_args:
            if arg.startswith("--"):
                cleaned_cli_args.append(arg[2:])  # Remove the -- prefix
            else:
                cleaned_cli_args.append(arg)
        # Parse CLI args to get the override paths
        try:
            cli_conf = OmegaConf.from_cli(cleaned_cli_args)
        except Exception as e:
            # Chain the parse error so the root cause stays visible (B904).
            raise ValueError(
                f"Invalid CLI argument format: {e}. Please use format like: stage_pipe.worker_num='[2,4]'"
            ) from e

        def check_nested_paths(conf, prefix=""):
            """Recursively check all paths in the CLI config"""
            for key, value in conf.items():
                current_path = f"{prefix}.{key}" if prefix else key
                if isinstance(value, DictConfig):
                    # Check if this intermediate path exists
                    if not self._check_config_path_exists(config, current_path):
                        raise ValueError(f"Configuration path '{current_path}' does not exist in the config file")
                    # Recursively check nested paths
                    check_nested_paths(value, current_path)
                else:
                    # Check if this leaf path exists
                    if not self._check_config_path_exists(config, current_path):
                        raise ValueError(f"Configuration path '{current_path}' does not exist in the config file")

        try:
            check_nested_paths(cli_conf)
        except ValueError:
            raise
        except Exception as e:
            # If there's an issue parsing CLI args, provide helpful error message
            raise ValueError(
                "Invalid CLI argument format. Please use format like: --key=value or --nested.key=value"
            ) from e

    def process_config(self, config_path, cli_args=None):
        """
        Process the config file

        Args:
            config_path: Path to the config file
            cli_args: List of command line arguments

        Returns:
            OmegaConf: Processed config object
        """
        # Clean up CLI args to remove -- prefix if present
        cleaned_cli_args = []
        if cli_args:
            for arg in cli_args:
                if arg.startswith("--"):
                    cleaned_cli_args.append(arg[2:])  # Remove the -- prefix
                else:
                    cleaned_cli_args.append(arg)
        # Load config first without CLI args to validate paths
        try:
            base_config = load_config(config_path)
        except Exception as e:
            raise ValueError(f"Error loading config: {e}") from e
        # Validate that CLI arguments correspond to existing paths
        if cli_args:
            self._validate_cli_args(base_config, cli_args)
        # Now load config with CLI args (validation passed)
        config = load_config(config_path, cli_args=cleaned_cli_args)
        return config

    def print_final_config(self, config):
        """
        Print the final running config

        Args:
            config: OmegaConf config object
        """
        print("=" * 50)
        print("final config:")
        print("=" * 50)
        print(OmegaConf.to_yaml(config))

23
nimbus/utils/flags.py Normal file
View File

@@ -0,0 +1,23 @@
import os
_DEBUG_KEY = "NIMBUS_DEBUG"
_RANDOM_SEED_KEY = "NIMBUS_RANDOM_SEED"
def set_debug_mode(enabled: bool) -> None:
"""Set debug mode. Must be called before ray.init() to propagate to Ray workers."""
os.environ[_DEBUG_KEY] = "1" if enabled else "0"
def is_debug_mode() -> bool:
return os.environ.get(_DEBUG_KEY, "0") == "1"
def set_random_seed(seed: int) -> None:
"""Set global random seed. Must be called before ray.init() to propagate to Ray workers."""
os.environ[_RANDOM_SEED_KEY] = str(seed)
def get_random_seed() -> int | None:
val = os.environ.get(_RANDOM_SEED_KEY)
return int(val) if val is not None else None

48
nimbus/utils/logging.py Normal file
View File

@@ -0,0 +1,48 @@
import logging
import os
import time
from datetime import datetime
from nimbus.utils.config import save_config
def configure_logging(exp_name, name=None, config=None):
    """Set up the shared "de_logger" file logger under ./output/<exp_name>.

    Creates the log directory (retrying for flaky network filesystems),
    optionally saves *config* next to the log, and attaches exactly one file
    handler. Fix: previous handlers on "de_logger" are removed first, so
    repeated calls no longer stack duplicate handlers (which duplicated
    every log record).

    Args:
        exp_name: Experiment name; POD_NAME is appended when running in a pod.
        name: Optional tag inserted into the log file name.
        config: Optional config object saved as de_config.yaml.

    Returns:
        logging.Logger: The configured "de_logger" logger.

    Raises:
        RuntimeError: If the log directory cannot be created after retries.
    """
    pod_name = os.environ.get("POD_NAME", None)
    if pod_name is not None:
        exp_name = f"{exp_name}/{pod_name}"
    log_dir = os.path.join("./output", exp_name)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if name is None:
        log_name = f"de_time_profile_{timestamp}.log"
    else:
        log_name = f"de_{name}_time_profile_{timestamp}.log"
    log_file = os.path.join(log_dir, log_name)
    max_retries = 3
    for attempt in range(max_retries):
        try:
            os.makedirs(log_dir, exist_ok=True)
            break
        except Exception as e:
            # Network filesystems occasionally fail transiently (e.g. stale
            # file handles); report the actual error instead of a fixed guess.
            print(f"Warning: failed to create {log_dir} ({e}), attempt {attempt + 1}/{max_retries}")
            if attempt < max_retries - 1:
                time.sleep(3)
                continue
            raise RuntimeError(f"Failed to create log directory {log_dir} after {max_retries} attempts") from e
    if config is not None:
        config_log_file = os.path.join(log_dir, "de_config.yaml")
        save_config(config, config_log_file)
    logger = logging.getLogger("de_logger")
    logger.setLevel(logging.INFO)
    # Drop handlers from earlier configure_logging calls so each call logs to
    # exactly one (fresh) file instead of appending to every previous one.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()
    fh = logging.FileHandler(log_file, mode="a")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info("Start Data Engine")
    return logger

33
nimbus/utils/random.py Normal file
View File

@@ -0,0 +1,33 @@
import os
import random
import numpy as np
import torch
# Try to import open3d, but don't fail if it's not installed
try:
import open3d as o3d
except ImportError:
o3d = None
def set_all_seeds(seed):
    """
    Sets seeds for all relevant random number generators to ensure reproducibility.
    """
    # NOTE(review): PYTHONHASHSEED only affects interpreters started after it is
    # set (e.g. spawned workers), not the current process -- presumably that is
    # the intent here; confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"set seed {seed} for all libraries")
    seed = int(seed)
    np.random.seed(seed)
    random.seed(seed)
    # open3d is optional; seed it only when its random utility is available.
    if o3d and hasattr(o3d, "utility") and hasattr(o3d.utility, "random"):
        o3d.utility.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # These settings are crucial for deterministic results with CuDNN
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

65
nimbus/utils/types.py Normal file
View File

@@ -0,0 +1,65 @@
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
# String keys used throughout the engine config; centralizing them avoids
# scattered literals and typo-induced KeyErrors.
NAME = "name"
# stage name
LOAD_STAGE = "load_stage"
PLAN_STAGE = "plan_stage"
RENDER_STAGE = "render_stage"
PLAN_WITH_RENDER_STAGE = "plan_with_render_stage"
STORE_STAGE = "store_stage"
STAGE_PIPE = "stage_pipe"
DUMP_STAGE = "dump_stage"
DEDUMP_STAGE = "dedump_stage"
# instruction name
# LOAD_STAGE
SCENE_LOADER = "scene_loader"
LAYOUT_RANDOM_GENERATOR = "layout_random_generator"
INDEX_GENERATOR = "index_generator"
DEDUMPER = "dedumper"
# PLAN_STAGE
SEQ_PLANNER = "seq_planner"
PLANNER = "planner"
SIMULATOR = "simulator"
# RENDER_STAGE
RENDERER = "renderer"
# PLAN_WITH_RENDER_STAGE
PLAN_WITH_RENDER = "plan_with_render"
# PIPE_STAGE (keys inside the stage_pipe section)
STAGE_NUM = "stage_num"
STAGE_DEV = "stage_dev"
WORKER_NUM = "worker_num"
WORKER_SCHEDULE = "worker_schedule"
SAFE_THRESHOLD = "safe_threshold"
STATUS_TIMEOUTS = "status_timeouts"
MONITOR_CHECK_INTERVAL = "monitor_check_interval"
# STORE_STAGE
WRITER = "writer"
DUMPER = "dumper"
OUTPUT_PATH = "output_path"
INPUT_PATH = "input_path"
# Generic sub-keys shared by every instruction config block.
TYPE = "type"
ARGS = "args"
@dataclass
class StageInput:
    """
    A data class that encapsulates the input for a stage in the processing pipeline.

    Attributes:
        Args (Optional[Tuple]): Positional arguments passed to the stage's processing function.
        Kwargs (Optional[Dict]): Keyword arguments passed to the stage's processing function.
    """

    # Capitalized field names are non-PEP8 but part of the public interface.
    Args: Optional[Tuple] = None
    Kwargs: Optional[Dict] = None

182
nimbus/utils/utils.py Normal file
View File

@@ -0,0 +1,182 @@
import functools
import os
import re
import sys
import time
from typing import Tuple, Type, Union
from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
def init_env():
    """Append project-relative paths to sys.path so local packages import."""
    for extra_path in ("./", "./data_engine", "workflows/simbox"):
        sys.path.append(extra_path)
def unpack_iter_data(data: tuple):
    """Split a pipeline tuple into (scene, seq, obs) slots by element type.

    Any slot whose type is absent from *data* stays None.
    """
    assert len(data) <= 3, "not support yet"
    scene = seq = obs = None
    for element in data:
        if isinstance(element, Scene):
            scene = element
        elif isinstance(element, Sequence):
            seq = element
        elif isinstance(element, Observations):
            obs = element
    return scene, seq, obs
def consume_stage(stage_input):
    """Drain all iterators held by *stage_input* and release their resources.

    Called at the end of a non-pipe run so lazy generators actually execute;
    the explicit __del__ calls force immediate cleanup instead of waiting for
    garbage collection (see pipe_consume_stage for the variant that skips this).
    """
    if hasattr(stage_input, "Args"):
        consume_iterators(stage_input.Args)
        for value in stage_input.Args:
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801
    if hasattr(stage_input, "Kwargs"):
        if stage_input.Kwargs is not None:
            for value in stage_input.Kwargs.values():
                consume_iterators(value)
                if hasattr(value, "__del__"):
                    value.__del__()  # pylint: disable=C2801
# prevent isaac sim close pipe worker in advance
def pipe_consume_stage(stage_input):
    """Drain iterators without invoking __del__ (keeps pipe workers alive)."""
    if hasattr(stage_input, "Args"):
        consume_iterators(stage_input.Args)
    kwargs = getattr(stage_input, "Kwargs", None)
    if kwargs is not None:
        for value in kwargs.values():
            consume_iterators(value)
def consume_iterators(obj):
    """Recursively walk *obj*, exhausting any iterators it contains.

    Strings/bytes are returned untouched; dicts/lists/tuples are rebuilt with
    recursively-consumed elements; any other iterable is drained in place and
    returned as-is; non-iterables pass through unchanged.
    """
    if isinstance(obj, (str, bytes)):
        return obj
    if isinstance(obj, dict):
        return {key: consume_iterators(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        consumed = [consume_iterators(element) for element in obj]
        return consumed if isinstance(obj, list) else tuple(consumed)
    if hasattr(obj, "__iter__"):
        for element in obj:
            consume_iterators(element)
    return obj
def scene_names_postprocess(scene_names: list) -> list:
    """
    Distributes a list of scene names (folders) among multiple workers in a distributed environment.

    This function is designed to work with Deep Learning Container (DLC) environments, where worker
    information is extracted from environment variables. It assigns a subset of the input scene names
    to the current worker based on its rank and the total number of workers, using a round-robin strategy.
    If not running in a DLC environment, all scene names are assigned to a single worker.

    Args:
        scene_names (list): List of scene names (typically folder names) to be distributed.

    Returns:
        list: The subset of scene names assigned to the current worker.

    Raises:
        PermissionError: If there is a permission issue accessing the input directory.
        RuntimeError: For any other errors encountered during processing.

    Notes:
        - The function expects certain environment variables (e.g., POD_NAME, WORLD_SIZE) to be set
          in DLC environments.
        - If multiple workers are present, the input list is sorted before distribution to ensure
          consistent assignment across workers.
    """

    def _get_dlc_worker_info():
        """Extract worker rank and world size from DLC environment variables."""
        pod_name = os.environ.get("POD_NAME")
        if pod_name:
            # Match worker-N or master-N patterns
            match = re.search(r"dlc.*?-(worker|master)-(\d+)$", pod_name)
            if match:
                node_type, node_id = match.groups()
                world_size = int(os.environ.get("WORLD_SIZE", "1"))
                if node_type == "worker":
                    rank = int(node_id)
                else:  # master node
                    rank = world_size - 1
                return rank, world_size
        # Default for non-DLC environment
        return 0, 1

    def _distribute_folders(all_folders, rank, world_size):
        """Distribute folders among workers using round-robin strategy."""
        if not all_folders:
            return []
        # Only sort when there are multiple workers to ensure consistency
        if world_size > 1:
            all_folders.sort()
        # Distribute using slicing: worker i gets folders at indices i, i+world_size, ...
        return all_folders[rank::world_size]

    try:
        # Get all subfolders
        all_subfolders = scene_names
        if not all_subfolders:
            # Fixed message: the old one interpolated the (empty) input list.
            print("Warning: empty scene name list; nothing to distribute")
            return []
        # Get worker identity and distribute folders
        rank, world_size = _get_dlc_worker_info()
        assigned_folders = _distribute_folders(all_subfolders, rank, world_size)
        print(
            f"DLC Worker {rank}/{world_size}: Assigned {len(assigned_folders)} out of "
            f"{len(all_subfolders)} total folders"
        )
        return assigned_folders
    except PermissionError as e:
        # Chain the original error so the underlying OS failure stays visible.
        raise PermissionError(f"No permission to access directory: {scene_names}") from e
    except Exception as e:
        raise RuntimeError(f"Error reading input directory {scene_names}: {e}") from e
def retry_on_exception(
    max_retries: int = 3, retry_exceptions: Union[bool, Tuple[Type[Exception], ...]] = True, delay: float = 1.0
):
    """Decorator factory that retries the wrapped callable on exceptions.

    Args:
        max_retries: Number of retries after the first failed attempt.
        retry_exceptions: True to retry on any Exception, or a tuple/list of
            exception types that should trigger a retry; anything else re-raises
            immediately.
        delay: Seconds to sleep between attempts.

    Returns:
        A decorator that wraps the target callable (plain function or method).

    Fixes:
        - Passing a *list* of exception types previously crashed with TypeError
          because ``isinstance(e, retry_exceptions)`` requires a tuple; the
          list is now converted.
        - The wrapper no longer assumes a ``self`` positional argument, so the
          decorator also works on plain functions (call sites are unchanged).
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    if attempt > 0:
                        print(f"Retry attempt {attempt}/{max_retries} for {func.__name__}")
                    return func(*args, **kwargs)
                except Exception as e:  # pylint: disable=broad-except
                    if retry_exceptions is True:
                        should_retry = True
                    elif isinstance(retry_exceptions, (tuple, list)):
                        # isinstance() requires a tuple of types; lists crashed here.
                        should_retry = isinstance(e, tuple(retry_exceptions))
                    else:
                        should_retry = False
                    if should_retry and attempt < max_retries:
                        print(f"Error in {func.__name__}: {e}. Retrying in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        raise
            # Unreachable: the loop always returns or re-raises.
            raise RuntimeError(f"{func.__name__}: retry loop exited unexpectedly")

        return wrapper

    return decorator