init commit
This commit is contained in:
16
nimbus/__init__.py
Normal file
16
nimbus/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import ray
|
||||
|
||||
from nimbus.utils.types import STAGE_PIPE
|
||||
|
||||
from .data_engine import DataEngine, DistPipeDataEngine
|
||||
|
||||
|
||||
def run_data_engine(config, master_seed=None):
    """Build the appropriate data engine for *config* and run it.

    A distributed, ray-backed engine is used when the config contains a
    pipeline stage (STAGE_PIPE); otherwise a plain DataEngine is created.

    Args:
        config: Engine configuration mapping; presence of STAGE_PIPE selects
            the distributed pipeline engine.
        master_seed: Optional seed forwarded to the engine for reproducibility.
    """
    # Imported for its registration side effects only.
    import nimbus_extension  # noqa: F401 pylint: disable=unused-import

    use_pipeline = STAGE_PIPE in config
    if use_pipeline:
        ray.init(num_gpus=1)
    engine_cls = DistPipeDataEngine if use_pipeline else DataEngine
    engine = engine_cls(config, master_seed=master_seed)
    engine.run()
|
||||
0
nimbus/components/data/__init__.py
Normal file
0
nimbus/components/data/__init__.py
Normal file
71
nimbus/components/data/camera.py
Normal file
71
nimbus/components/data/camera.py
Normal file
@@ -0,0 +1,71 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass
class C2W:
    """A single camera-to-world transform.

    Attributes:
        matrix (List[float]): The 4x4 transformation matrix flattened to
            16 floats in row-major order.
    """

    matrix: List[float]
|
||||
|
||||
|
||||
@dataclass
class Camera:
    """
    Represents a single camera pose in the trajectory.

    Attributes:
        trajectory (List[C2W]): List of C2W transformations for this camera pose.
        intrinsic (Optional[List[float]]): 3x3 camera intrinsic matrix: [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
        extrinsic (Optional[List[float]]): 4x4 tobase_extrinsic matrix representing the camera mounting offset
            relative to the robot base (height + pitch).
        length (Optional[int]): Length of the trajectory in number of frames.
        depths (Optional[List[np.ndarray]]): List of depth images captured by this camera.
        rgbs (Optional[List[np.ndarray]]): List of RGB images captured by this camera.
        uv_tracks (Optional[Dict[str, Any]]): UV tracking data in the format
            {mesh_name: {"per_frame": list, "width": W, "height": H}}.
        uv_mesh_names (Optional[List[str]]): List of mesh names being tracked in the UV tracking data.
    """

    # Fix: the annotation uses a lazy string reference so the class does not
    # depend on evaluation order, and every field defaulting to None is now
    # declared Optional[...] (previously e.g. `intrinsic: List[float] = None`,
    # which contradicts the actual default). Defaults and names are unchanged.
    trajectory: List["C2W"]
    intrinsic: Optional[List[float]] = None
    extrinsic: Optional[List[float]] = None
    length: Optional[int] = None
    depths: Optional[List[np.ndarray]] = None
    rgbs: Optional[List[np.ndarray]] = None
    uv_tracks: Optional[Dict[str, Any]] = None
    uv_mesh_names: Optional[List[str]] = None

    def __len__(self):
        """Return the number of frames, validating per-frame data and caching the result."""
        if self.length is not None:
            return self.length
        self._check_length()
        self.length = len(self.trajectory)
        return len(self.trajectory)

    def _check_length(self):
        """Raise ValueError when any per-frame container disagrees with the trajectory length."""
        if self.depths is not None and len(self.depths) != len(self.trajectory):
            raise ValueError("Length of depths does not match length of trajectory")
        if self.rgbs is not None and len(self.rgbs) != len(self.trajectory):
            raise ValueError("Length of rgbs does not match length of trajectory")
        if self.uv_tracks is not None:
            for mesh_name, track_data in self.uv_tracks.items():
                if len(track_data["per_frame"]) != len(self.trajectory):
                    raise ValueError(f"Length of uv_tracks for mesh {mesh_name} does not match length of trajectory")

    def append_rgb(self, rgb_image: np.ndarray):
        """Append one RGB frame, creating the list on first use."""
        if self.rgbs is None:
            self.rgbs = []
        self.rgbs.append(rgb_image)

    def append_depth(self, depth_image: np.ndarray):
        """Append one depth frame, creating the list on first use."""
        if self.depths is None:
            self.depths = []
        self.depths.append(depth_image)
|
||||
95
nimbus/components/data/iterator.py
Normal file
95
nimbus/components/data/iterator.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import logging
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterator
|
||||
from typing import Generic, TypeVar
|
||||
|
||||
T = TypeVar("T")


# pylint: disable=E0102
class Iterator(Iterator, Generic[T]):
    """Instrumented iterator base class for pipeline stages.

    Subclasses implement `_next()`; `__next__` wraps each call with wall-clock
    timing and accumulates counters (init cost, per-frame compute/io time,
    queue wait time, per-sequence time) that are logged once the stream raises
    StopIteration.

    NOTE: the class intentionally shadows `collections.abc.Iterator` (hence
    the E0102 disable) so project stages can subclass this instrumented base.
    """

    def __init__(self, max_retry=3):
        # __next__ call count and cumulative duration.
        self._next_calls = 0.0
        self._next_total_time = 0.0
        # Cumulative initialization cost, fed via record_init_time().
        self._init_time_costs = 0.0
        self._init_times = 0
        # Per-frame compute accounting, fed via collect_compute_frame_info().
        self._frame_compute_time = 0.0
        self._frame_compute_frames = 0.0
        # Per-frame I/O accounting, fed via collect_io_frame_info().
        self._frame_io_time = 0.0
        self._frame_io_frames = 0.0
        # Time spent blocked (e.g. on full queues), fed via collect_wait_time_info().
        self._wait_time = 0.0
        # Per-sequence accounting, fed via collect_seq_info().
        self._seq_num = 0.0
        self._seq_time = 0.0
        self.logger = logging.getLogger("de_logger")
        # Retry budget for subclasses; retry_num is their running count.
        self.max_retry = max_retry
        self.retry_num = 0

    def record_init_time(self, time_costs):
        """Accumulate one initialization duration."""
        self._init_times += 1
        self._init_time_costs += time_costs

    def __iter__(self):
        return self

    def __next__(self):
        # Time the subclass hook; flush accumulated statistics when the
        # stream ends, then re-raise StopIteration unchanged.
        start_time = time.time()
        try:
            result = self._next()
        except StopIteration:
            self._log_statistics()
            raise
        end_time = time.time()
        self._next_calls += 1
        self._next_total_time += end_time - start_time
        return result

    def collect_compute_frame_info(self, length, time_costs):
        """Accumulate compute time spent producing `length` frames."""
        self._frame_compute_frames += length
        self._frame_compute_time += time_costs

    def collect_io_frame_info(self, length, time_costs):
        """Accumulate I/O time spent on `length` frames."""
        self._frame_io_frames += length
        self._frame_io_time += time_costs

    def collect_wait_time_info(self, time_costs):
        """Accumulate time spent waiting (e.g. for queue capacity)."""
        self._wait_time += time_costs

    def collect_seq_info(self, length, time_costs):
        """Accumulate processing time for `length` sequences."""
        self._seq_num += length
        self._seq_time += time_costs

    @abstractmethod
    def _next(self):
        """Produce the next item; subclasses raise StopIteration when exhausted."""
        raise NotImplementedError("Subclasses should implement this method.")

    def _log_statistics(self):
        # Emit the accumulated counters; sections with no recorded data are skipped.
        class_name = self.__class__.__name__
        self.logger.info(
            f"{class_name}: Next method called {self._next_calls} times, total time:"
            f" {self._next_total_time:.6f} seconds"
        )
        if self._init_time_costs > 0:
            self.logger.info(
                f"{class_name}: Init time: {self._init_time_costs:.6f} seconds, init {self._init_times} times"
            )
        if self._frame_compute_time > 0:
            avg_compute_time = self._frame_compute_time / self._frame_compute_frames
            self.logger.info(
                f"{class_name}: compute frame num: {self._frame_compute_frames}, total time:"
                f" {self._frame_compute_time:.6f} seconds, average time: {avg_compute_time:.6f} seconds per frame"
            )
        if self._frame_io_frames > 0:
            avg_io_time = self._frame_io_time / self._frame_io_frames
            self.logger.info(
                f"{class_name}: io frame num: {self._frame_io_frames}, total time: {self._frame_io_time:.6f} seconds,"
                f" average time: {avg_io_time:.6f} seconds per frame"
            )
        if self._wait_time > 0:
            self.logger.info(f"{class_name}: wait time: {self._wait_time:.6f} seconds")
        if self._seq_time > 0:
            avg_seq_time = self._seq_time / self._seq_num
            self.logger.info(
                f"{class_name}: seq num: {self._seq_num:.6f}, total time: {self._seq_time:.6f} seconds, average time:"
                f" {avg_seq_time:.6f} seconds per sequence"
            )
|
||||
119
nimbus/components/data/observation.py
Normal file
119
nimbus/components/data/observation.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import imageio
|
||||
import numpy as np
|
||||
|
||||
from nimbus.components.data.camera import Camera
|
||||
|
||||
|
||||
class Observations:
    """
    Represents a single observation of a scene, which may include multiple camera trajectories and associated data.
    Each observation is identified by a unique name and index, and can contain multiple Camera items that capture
    different viewpoints or modalities of the same scene.

    Args:
        scene_name (str): The name of the scene associated with this observation.
        index (str): The index or ID of this observation within the scene.
        length (int): Optional total length of the observation. Calculated from camera trajectories if not provided.
        data (dict): Optional dictionary for storing additional arbitrary data, such as metadata or annotations.
    """

    def __init__(self, scene_name: str, index: str, length: int = None, data: dict = None):
        self.scene_name = scene_name
        # Unique identifier combining scene name and observation index.
        self.obs_name = scene_name + "_" + index
        self.index = index
        # Camera objects, filled via append_cam().
        self.cam_items = []
        # Cached total frame count; computed lazily in __len__ when None.
        self.length = length
        self.data = data

    def __getstate__(self):
        # Default pickling of all attributes; kept explicit for symmetry with
        # the other data classes in this package.
        state = self.__dict__.copy()
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

    def append_cam(self, item: Camera):
        # Add one camera's data to this observation.
        self.cam_items.append(item)

    def __len__(self):
        # Total frames across all cameras, cached after the first computation.
        if self.length is not None:
            return self.length
        self.length = 0
        for cam in self.cam_items:
            self.length += len(cam)
        return self.length

    def get_length(self):
        # Alias for len(self).
        return len(self)

    def flush_to_disk(self, path, video_fps=10):
        # Persist all camera visualizations under <path>/trajectory_<index>.
        path_to_save = os.path.join(path, "trajectory_" + self.index)
        print(f"obs {self.obs_name} try to save path in {path_to_save}")
        os.makedirs(path_to_save, exist_ok=True)

        # Single camera: save in root directory
        if len(self.cam_items) == 1:
            cam = self.cam_items[0]
            self._save_camera_data(path_to_save, cam, video_fps)
        # Multiple cameras: save in camera_0/, camera_1/, etc.
        else:
            for idx, cam in enumerate(self.cam_items):
                camera_dir = os.path.join(path_to_save, f"camera_{idx}")
                os.makedirs(camera_dir, exist_ok=True)
                self._save_camera_data(camera_dir, cam, video_fps)

    def _save_camera_data(self, save_dir, cam: Camera, video_fps):
        """Helper method to save camera visualization data (rgbs, depths) to a directory."""
        # Save RGB and depth images if available
        if cam.rgbs is not None and len(cam.rgbs) > 0:
            rgb_images_path = os.path.join(save_dir, "rgb/")
            os.makedirs(rgb_images_path, exist_ok=True)

            fps_path = os.path.join(save_dir, "fps.mp4")

            for idx, rgb_item in enumerate(cam.rgbs):
                rgb_filename = os.path.join(rgb_images_path, f"{idx}.jpg")
                # cv2.imwrite expects BGR channel order; swap before writing.
                cv2.imwrite(rgb_filename, cv2.cvtColor(rgb_item, cv2.COLOR_BGR2RGB))

            imageio.mimwrite(fps_path, cam.rgbs, fps=video_fps)

        if cam.depths is not None and len(cam.depths) > 0:
            depth_images_path = os.path.join(save_dir, "depth/")
            os.makedirs(depth_images_path, exist_ok=True)

            depth_path = os.path.join(save_dir, "depth.mp4")

            # Create a copy for video (8-bit version)
            depth_video_frames = []
            for idx, depth_item in enumerate(cam.depths):
                depth_filename = os.path.join(depth_images_path, f"{idx}.png")
                cv2.imwrite(depth_filename, depth_item)
                # NOTE(review): `>> 8` assumes 16-bit integer depth frames —
                # confirm the upstream dtype; a float depth would fail here.
                depth_video_frames.append((depth_item >> 8).astype(np.uint8))

            imageio.mimwrite(depth_path, depth_video_frames, fps=video_fps)

        # Save UV tracking visualizations if available
        if cam.uv_tracks is not None and cam.uv_mesh_names is not None and cam.rgbs is not None:
            num_frames = len(cam.rgbs)
            # Deferred import: the extension is optional and only needed here.
            try:
                from nimbus_extension.components.render.brpc_utils.point_tracking import (
                    make_uv_overlays_and_video,
                )
            except ImportError as e:
                raise ImportError(
                    "UV tracking visualization requires nimbus_extension. "
                    "Please add `import nimbus_extension` before running the pipeline."
                ) from e

            make_uv_overlays_and_video(
                cam.rgbs,
                cam.uv_tracks,
                cam.uv_mesh_names,
                start_frame=0,
                end_frame=num_frames,
                fps=video_fps,
                path_to_save=save_dir,
            )
|
||||
39
nimbus/components/data/package.py
Normal file
39
nimbus/components/data/package.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import pickle
|
||||
|
||||
|
||||
class Package:
    """A pipeline payload wrapper supporting in-place pickle round-trips.

    Args:
        data: Arbitrary payload carried by the package.
        task_id (int): Identifier of the task this package belongs to.
        task_name (str): Human-readable name of that task.
        stop_sig (bool): When True, the package tells the pipeline to shut down.
    """

    def __init__(self, data, task_id: int = -1, task_name: str = None, stop_sig: bool = False):
        # Tracks whether `data` currently holds pickled bytes.
        self.is_ser = False
        self.task_id = task_id
        self.task_name = task_name
        self.stop_sig = stop_sig
        self.data = data

    def serialize(self):
        """Pickle the payload in place; must not already be serialized."""
        assert self.is_ser is False, "data is already serialized"
        self.data = pickle.dumps(self.data)
        self.is_ser = True

    def deserialize(self):
        """Unpickle the payload in place; must currently be serialized."""
        assert self.is_ser is True, "data is already deserialized"
        self.data = pickle.loads(self.data)
        self.is_ser = False

    def is_serialized(self):
        """Return True while the payload is held as pickled bytes."""
        return self.is_ser

    def get_data(self):
        """Return the payload in whatever form it is currently stored."""
        return self.data

    def should_stop(self):
        """Return True when this package is an explicit stop signal."""
        return self.stop_sig is True
|
||||
69
nimbus/components/data/scene.py
Normal file
69
nimbus/components/data/scene.py
Normal file
@@ -0,0 +1,69 @@
|
||||
class Scene:
    """A loaded simulation scene plus its workflow and task-execution context.

    Args:
        name (str): The name of the scene or task.
        pcd: Point cloud data associated with the scene.
        scale (float): Scale factor for the scene geometry.
        materials: Material data for the scene.
        textures: Texture data for the scene.
        floor_heights: Floor height information for the scene.
        wf: The task workflow instance managing this scene.
        task_id (int): The index of the current task within the workflow.
        task_exec_num (int): The execution count for the current task, used for task repetition tracking.
        simulation_app: The Isaac Sim SimulationApp instance.
    """

    def __init__(
        self,
        name: str = None,
        pcd=None,
        scale: float = 1.0,
        materials=None,
        textures=None,
        floor_heights=None,
        wf=None,
        task_id: int = None,
        task_exec_num: int = 1,
        simulation_app=None,
    ):
        # Geometry / appearance.
        self.name = name
        self.pcd = pcd
        self.scale = scale
        self.materials = materials
        self.textures = textures
        self.floor_heights = floor_heights
        # Workflow context.
        self.wf = wf
        self.task_id = task_id
        self.task_exec_num = task_exec_num
        self.simulation_app = simulation_app
        # Mutable execution state.
        self.plan_info = None
        self.generate_success = False

    def __getstate__(self):
        # The point cloud is deliberately dropped from the pickled state.
        state = self.__dict__.copy()
        del state["pcd"]
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Restored scenes come back without geometry.
        self.pcd = None

    def add_plan_info(self, plan_info):
        """Attach planning metadata to this scene."""
        self.plan_info = plan_info

    def flush_to_disk(self, path):
        """Persistence hook; scenes are currently not written to disk."""

    def load_from_disk(self, path):
        """Persistence hook; scenes are currently not read from disk."""

    def update_generate_status(self, success):
        """Record whether the latest generation attempt succeeded."""
        self.generate_success = success

    def get_generate_status(self):
        """Return the result of the latest generation attempt."""
        return self.generate_success

    def update_task_exec_num(self, num):
        """Set the repetition counter for the current task."""
        self.task_exec_num = num
|
||||
145
nimbus/components/data/sequence.py
Normal file
145
nimbus/components/data/sequence.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import open3d as o3d
|
||||
|
||||
from nimbus.components.data.camera import C2W, Camera
|
||||
|
||||
|
||||
class Sequence:
    """
    Represents a camera trajectory sequence with associated metadata.

    Args:
        scene_name (str): The name of the scene (e.g., room identifier).
        index (str): The index or ID of this sequence within the scene.
        length (int): Optional explicit sequence length. Calculated from camera trajectories if not provided.
        data (dict): Optional additional arbitrary data associated with the sequence.
    """

    def __init__(self, scene_name: str, index: str, length: int = None, data: dict = None):
        self.scene_name = scene_name
        # Unique identifier combining scene name and sequence index.
        self.seq_name = scene_name + "_" + index
        self.index = index
        self.cam_items: list["Camera"] = []
        self.path_pcd = None
        # Cached total frame count; computed lazily in __len__ when None.
        self.length = length
        self.data = data

    def __getstate__(self):
        # Pickle support: the open3d point cloud is converted to raw bytes
        # plus a separate color array (colors are not kept by "mem::xyz").
        state = self.__dict__.copy()
        # Fix: guard against a missing point cloud; the original code
        # unconditionally dereferenced state["path_pcd"].colors and raised
        # AttributeError whenever path_pcd was still None.
        if state["path_pcd"] is not None:
            state["path_pcd_color"] = np.asarray(state["path_pcd"].colors)
            state["path_pcd"] = o3d.io.write_point_cloud_to_bytes(state["path_pcd"], "mem::xyz")
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Mirror of __getstate__: only rebuild the cloud when one was stored.
        if state.get("path_pcd") is not None:
            self.path_pcd = o3d.io.read_point_cloud_from_bytes(state["path_pcd"], "mem::xyz")
            self.path_pcd.colors = o3d.utility.Vector3dVector(state["path_pcd_color"])

    def __len__(self):
        if self.length is not None:
            return self.length
        self.length = 0
        for cam in self.cam_items:
            self.length += len(cam)
        return self.length

    def append_cam(self, item: "Camera"):
        """Add one camera to this sequence."""
        self.cam_items.append(item)

    def update_pcd(self, path_pcd):
        """Attach the path point cloud used for persistence/visualization."""
        self.path_pcd = path_pcd

    def get_length(self):
        """Alias for len(self)."""
        return len(self)

    @staticmethod
    def _write_camera_json(save_dir, cam: "Camera"):
        """Serialize one camera's intrinsic/extrinsic/trajectory to <save_dir>/data.json."""
        save_dict = {
            "camera_intrinsic": cam.intrinsic,
            "camera_extrinsic": cam.extrinsic,
            "camera_trajectory": [t.matrix for t in cam.trajectory],
        }
        traj_path = os.path.join(save_dir, "data.json")
        json_object = json.dumps(save_dict, indent=4)
        with open(traj_path, "w", encoding="utf-8") as outfile:
            outfile.write(json_object)

    def flush_to_disk(self, path):
        """Write the point cloud and per-camera trajectory JSON under <path>/trajectory_<index>."""
        path_to_save = os.path.join(path, "trajectory_" + self.index)
        print(f"seq {self.seq_name} try to save path in {path_to_save}")
        os.makedirs(path_to_save, exist_ok=True)
        if self.path_pcd is not None:
            pcd_path = os.path.join(path_to_save, "path.ply")
            o3d.io.write_point_cloud(pcd_path, self.path_pcd)

        # Single camera: save in root directory
        if len(self.cam_items) == 1:
            self._write_camera_json(path_to_save, self.cam_items[0])
        # Multiple cameras: save in camera_0/, camera_1/, etc.
        else:
            for idx, cam in enumerate(self.cam_items):
                camera_dir = os.path.join(path_to_save, f"camera_{idx}")
                os.makedirs(camera_dir, exist_ok=True)
                self._write_camera_json(camera_dir, cam)

    @staticmethod
    def _read_camera_json(json_path) -> "Camera":
        """Deserialize one camera from a data.json file written by _write_camera_json."""
        with open(json_path, "r", encoding="utf-8") as infile:
            data = json.load(infile)
        trajectory = [C2W(matrix=m) for m in data["camera_trajectory"]]
        return Camera(
            trajectory=trajectory,
            intrinsic=data.get("camera_intrinsic"),
            extrinsic=data.get("camera_extrinsic"),
        )

    def load_from_disk(self, path):
        """Load the point cloud and cameras previously written by flush_to_disk."""
        print(f"seq {self.seq_name} try to load path from {path}")

        pcd_path = os.path.join(path, "path.ply")
        if os.path.exists(pcd_path):
            self.path_pcd = o3d.io.read_point_cloud(pcd_path)

        # Clear existing camera items
        self.cam_items = []

        # Check if single camera format (data.json in root)
        traj_path = os.path.join(path, "data.json")
        if os.path.exists(traj_path):
            self.cam_items.append(self._read_camera_json(traj_path))
        else:
            # Multiple camera format (camera_0/, camera_1/, etc.)
            idx = 0
            while True:
                camera_json = os.path.join(path, f"camera_{idx}", "data.json")
                if not os.path.exists(camera_json):
                    break
                self.cam_items.append(self._read_camera_json(camera_json))
                idx += 1

        assert len(self.cam_items) > 0, f"No camera data found in {path}"
|
||||
7
nimbus/components/dedump/__init__.py
Normal file
7
nimbus/components/dedump/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
|
||||
# Global registry mapping type names to dedumper iterator classes.
dedumper_dict = {}


def register(type_name: str, cls: Iterator):
    """Register a dedumper iterator class under *type_name*."""
    dedumper_dict.update({type_name: cls})
|
||||
7
nimbus/components/dump/__init__.py
Normal file
7
nimbus/components/dump/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from .base_dumper import BaseDumper
|
||||
|
||||
# Global registry mapping type names to dumper classes.
dumper_dict = {}


def register(type_name: str, cls: BaseDumper):
    """Register a dumper class under *type_name*."""
    dumper_dict.update({type_name: cls})
|
||||
82
nimbus/components/dump/base_dumper.py
Normal file
82
nimbus/components/dump/base_dumper.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
|
||||
from pympler import asizeof
|
||||
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
from nimbus.components.data.package import Package
|
||||
from nimbus.utils.utils import unpack_iter_data
|
||||
|
||||
|
||||
class BaseDumper(Iterator):
    """Base stage that consumes (scene, seq, obs) tuples and dumps results.

    Pulls items from `data_iter`, converts each via the abstract `dump()`,
    and either forwards the serialized result to `output_queue` (pipeline
    mode, with back-pressure) or keeps it local. Tracks a per-scene success
    rate which is logged whenever the scene changes or the stream ends.

    Args:
        data_iter: Upstream iterator yielding packed (scene, seq, obs) data.
        output_queue: Optional queue for serialized Packages; None disables pipelining.
        max_queue_num (int): Back-pressure threshold — block while the queue holds this many items.
    """

    def __init__(self, data_iter, output_queue, max_queue_num=1):
        super().__init__()
        self.data_iter = data_iter
        # Most recently seen Scene; used to detect scene boundaries.
        self.scene = None
        self.output_queue = output_queue
        # Per-scene generation counters for the logged success rate.
        self.total_case = 0
        self.success_case = 0
        self.max_queue_num = max_queue_num

    def __iter__(self):
        return self

    def _next(self):
        try:
            data = next(self.data_iter)
            scene, seq, obs = unpack_iter_data(data)
            self.total_case += 1
            if scene is not None:
                # A change of task id / name / exec count means a new scene:
                # log the finished scene's success rate and reset counters.
                if self.scene is not None and (
                    scene.task_id != self.scene.task_id
                    or scene.name != self.scene.name
                    or scene.task_exec_num != self.scene.task_exec_num
                ):
                    self.logger.info(
                        f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                    )
                    # total_case starts at 1 because the current item already counts.
                    self.total_case = 1
                    self.success_case = 0
                self.scene = scene
            if obs is None and seq is None:
                # Upstream produced nothing for this item: record the failure
                # and yield None so the pipeline keeps moving.
                self.logger.info(f"generate failed, skip once! success rate: {self.success_case}/{self.total_case}")
                if self.scene is not None:
                    self.scene.update_generate_status(success=False)
                return None
            io_start_time = time.time()
            if self.output_queue is not None:
                # Pipeline mode: dump, wrap, serialize, and hand off downstream.
                obj = self.dump(seq, obs)
                pack = Package(obj, task_id=scene.task_id, task_name=scene.name)
                pack.serialize()

                # Back-pressure: poll until the queue drops below the limit.
                wait_time = time.time()
                while self.output_queue.qsize() >= self.max_queue_num:
                    time.sleep(1)
                end_time = time.time()
                self.collect_wait_time_info(end_time - wait_time)

                st = time.time()
                self.output_queue.put(pack)
                ed = time.time()
                self.logger.info(f"put time: {ed - st}, data size: {asizeof.asizeof(obj)}")
            else:
                # Local mode: just perform the dump for its side effects.
                obj = self.dump(seq, obs)
            self.success_case += 1
            self.scene.update_generate_status(success=True)
            self.collect_seq_info(1, time.time() - io_start_time)
        except StopIteration:
            # Stream exhausted: propagate a stop package downstream (if any)
            # and log the final scene's success rate before re-raising.
            if self.output_queue is not None:
                pack = Package(None, stop_sig=True)
                self.output_queue.put(pack)
            if self.scene is not None:
                self.logger.info(
                    f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                )
            raise StopIteration("no data")
        except Exception as e:
            self.logger.exception(f"Error during data dumping: {e}")
            raise e

    @abstractmethod
    def dump(self, seq, obs):
        """Convert a (seq, obs) pair into the dumpable object; subclass hook."""
        raise NotImplementedError("This method should be overridden by subclasses")
|
||||
16
nimbus/components/load/__init__.py
Normal file
16
nimbus/components/load/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# flake8: noqa: F401
|
||||
# pylint: disable=C0413
|
||||
|
||||
from .base_randomizer import LayoutRandomizer
|
||||
from .base_scene_loader import SceneLoader
|
||||
|
||||
# Global registries mapping type names to loader / randomizer classes.
scene_loader_dict = {}
layout_randomizer_dict = {}


def register_loader(type_name: str, cls: SceneLoader):
    """Register a scene-loader class under *type_name*."""
    scene_loader_dict.update({type_name: cls})


def register_randomizer(type_name: str, cls: LayoutRandomizer):
    """Register a layout-randomizer class under *type_name*."""
    layout_randomizer_dict.update({type_name: cls})
|
||||
72
nimbus/components/load/base_randomizer.py
Normal file
72
nimbus/components/load/base_randomizer.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import sys
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
from nimbus.components.data.scene import Scene
|
||||
from nimbus.daemon.decorators import status_monitor
|
||||
|
||||
|
||||
class LayoutRandomizer(Iterator):
    """
    Base class for layout randomization in a scene. This class defines the structure for randomizing scenes and
    tracking the randomization process. It manages the current scene, randomization count, and provides hooks for
    subclasses to implement specific randomization logic.

    Args:
        scene_iter (Iterator): An iterator that provides scenes to be randomized.
        random_num (int): The number of randomizations to perform for each scene before moving to the next one.
        strict_mode (bool): If True, the randomizer will check the generation status of the current scene and retry
                            randomization if it was not successful. This ensures that only successfully generated
                            scenes are counted towards the randomization limit.
    """

    def __init__(self, scene_iter: Iterator, random_num: int, strict_mode: bool = False):
        super().__init__()
        self.scene_iter = scene_iter
        self.random_num = random_num
        self.strict_mode = strict_mode
        # Start "exhausted" so the first _next() immediately pulls a scene.
        self.cur_index = sys.maxsize
        self.scene: Optional[Scene] = None

    def reset(self, scene):
        """Begin a fresh randomization cycle for *scene*."""
        self.cur_index = 0
        self.scene = scene

    def _fetch_next_scene(self):
        """Advance to the next scene from the upstream iterator."""
        scene = next(self.scene_iter)
        self.reset(scene)

    @status_monitor()
    def _randomize_with_status(self, scene) -> Scene:
        # Fix: the original ignored its `scene` parameter and always read
        # self.scene. Both call sites pass self.scene, so behavior is
        # unchanged, but the parameter is now actually honored.
        return self.randomize_scene(scene)

    def _next(self) -> Scene:
        try:
            # In strict mode, a failed generation means we redo the current
            # scene without consuming a randomization slot.
            if self.strict_mode and self.scene is not None:
                if not self.scene.get_generate_status():
                    self.logger.info("strict_mode is open, retry the randomization to generate sequence.")
                    st = time.time()
                    scene = self._randomize_with_status(self.scene)
                    self.collect_seq_info(1, time.time() - st)
                    return scene
            # Current scene exhausted its randomization budget: fetch the next.
            if self.cur_index >= self.random_num:
                self._fetch_next_scene()
            if self.cur_index < self.random_num:
                st = time.time()
                scene = self._randomize_with_status(self.scene)
                self.collect_seq_info(1, time.time() - st)
                self.cur_index += 1
                return scene
            # NOTE: with random_num <= 0 this falls through and yields None,
            # matching the original behavior.
        except StopIteration:
            raise StopIteration("No more scenes to randomize.")
        except Exception as e:
            self.logger.exception(f"Error during scene idx {self.cur_index} randomization: {e}")
            self.cur_index += 1
            raise e

    @abstractmethod
    def randomize_scene(self, scene) -> Scene:
        """Return a randomized copy/variant of *scene*; subclass hook."""
        raise NotImplementedError("This method should be overridden by subclasses")
|
||||
41
nimbus/components/load/base_scene_loader.py
Normal file
41
nimbus/components/load/base_scene_loader.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from abc import abstractmethod
|
||||
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
from nimbus.components.data.scene import Scene
|
||||
|
||||
|
||||
class SceneLoader(Iterator):
    """
    Base class for scene loading in a simulation environment. This class defines the structure for loading scenes
    and tracking the loading process. It manages the current package iterator and provides hooks for subclasses
    to implement specific scene loading logic.

    Args:
        pack_iter (Iterator): An iterator that provides packages containing scene information to be loaded.
    """

    def __init__(self, pack_iter):
        super().__init__()
        # Upstream package iterator consumed by subclass load_asset() implementations.
        self.pack_iter = pack_iter

    @abstractmethod
    def load_asset(self) -> Scene:
        """
        Abstract method to load and initialize a scene.

        Subclasses must implement this method to define the specific logic for creating and configuring
        a scene object based on the current state of the iterator.

        Returns:
            Scene: A fully initialized Scene object.
        """
        raise NotImplementedError("This method must be implemented by subclasses")

    def _next(self) -> Scene:
        # Delegate to the subclass hook; re-tag exhaustion with a message and
        # log-and-rethrow anything unexpected.
        try:
            return self.load_asset()
        except StopIteration:
            raise StopIteration("No more scenes to load.")
        except Exception as e:
            self.logger.exception(f"Error during scene loading: {e}")
            raise e
|
||||
7
nimbus/components/plan_with_render/__init__.py
Normal file
7
nimbus/components/plan_with_render/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
|
||||
# Global registry mapping type names to plan-with-render iterator classes.
plan_with_render_dict = {}


def register(type_name: str, cls: Iterator):
    """Register a plan-with-render iterator class under *type_name*."""
    plan_with_render_dict.update({type_name: cls})
|
||||
7
nimbus/components/planner/__init__.py
Normal file
7
nimbus/components/planner/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from .base_seq_planner import SequencePlanner
|
||||
|
||||
# Global registry mapping type names to sequence-planner classes.
seq_planner_dict = {}


def register(type_name: str, cls: SequencePlanner):
    """Register a sequence-planner class under *type_name*."""
    seq_planner_dict.update({type_name: cls})
|
||||
102
nimbus/components/planner/base_seq_planner.py
Normal file
102
nimbus/components/planner/base_seq_planner.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import sys
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
from nimbus.components.data.scene import Scene
|
||||
from nimbus.components.data.sequence import Sequence
|
||||
from nimbus.daemon.decorators import status_monitor
|
||||
from nimbus.utils.flags import is_debug_mode
|
||||
from nimbus.utils.types import ARGS, TYPE
|
||||
|
||||
from .planner import path_planner_dict
|
||||
|
||||
|
||||
class SequencePlanner(Iterator):
    """
    A base class for sequence planning in a simulation environment. This class defines the structure for generating
    sequences based on scenes and tracking the planning process. It manages the current scene, episode count
    and provides hooks for subclasses to implement specific sequence generation logic.

    Args:
        scene_iter (Iterator): An iterator that provides scenes to be processed for sequence planning.
        planner_cfg (dict): A dictionary containing configuration parameters for the planner,
            such as the type of planner to use and its arguments.
        episodes (int): The number of episodes to generate for each scene before moving to the next one. Default is 1.
    """

    def __init__(self, scene_iter: Iterator[Scene], planner_cfg: dict, episodes: int = 1):
        super().__init__()
        self.scene_iter = scene_iter
        self.planner_cfg = planner_cfg
        self.episodes = episodes
        # sys.maxsize guarantees the first _next() call fetches a fresh scene.
        self.current_episode = sys.maxsize
        self.scene: Optional[Scene] = None

    @status_monitor()
    def _plan_with_status(self) -> Optional[Sequence]:
        # Thin wrapper so the status_monitor decorator reports RUNNING/COMPLETED
        # around each planning attempt.
        seq = self.generate_sequence()
        return seq

    def _next(self) -> tuple[Scene, Sequence]:
        """Advance the iterator: load a new scene when the episode budget is spent,
        then attempt to plan one sequence for it.

        Returns:
            (scene, sequence) — either element may be None when scene loading or
            planning failed (outside debug mode, errors are logged and swallowed).

        Raises:
            StopIteration: when the upstream scene iterator is exhausted.
        """
        try:
            # Fetch the next scene once the per-scene episode budget is exhausted.
            if self.scene is None or self.current_episode >= self.episodes:
                try:
                    self.scene = next(self.scene_iter)
                    self.current_episode = 0
                    if self.scene is None:
                        return None, None
                    self.initialize(self.scene)
                except StopIteration:
                    raise StopIteration("No more scene to process.")
                except Exception as e:
                    self.logger.exception(f"Error loading next scene: {e}")
                    if is_debug_mode():
                        raise e
                    # Force a fresh scene fetch on the next call instead of retrying this one.
                    self.current_episode = sys.maxsize
                    return None, None

            # Retry planning until a sequence is produced or the episode budget runs out.
            while True:
                compute_start_time = time.time()
                seq = self._plan_with_status()
                compute_end_time = time.time()
                self.current_episode += 1

                if seq is not None:
                    self.collect_compute_frame_info(seq.get_length(), compute_end_time - compute_start_time)
                    return self.scene, seq

                if self.current_episode >= self.episodes:
                    return self.scene, None

                self.logger.info(f"Generate seq failed and retry. Current episode id is {self.current_episode}")

        except StopIteration:
            raise StopIteration("No more scene to process.")
        except Exception as e:
            scene_name = getattr(self.scene, "name", "<unknown>")
            self.logger.exception(
                f"Error during idx {self.current_episode} sequence generation for scene {scene_name}: {e}"
            )
            if is_debug_mode():
                raise e
            # Count the failed attempt and let the caller decide what to do with (scene, None).
            self.current_episode += 1
            return self.scene, None

    @abstractmethod
    def generate_sequence(self) -> Optional[Sequence]:
        """Produce one Sequence for the current scene, or None on failure (subclass hook)."""
        raise NotImplementedError("This method should be overridden by subclasses")

    def _initialize(self, scene):
        # Instantiate the configured path planner for this scene, looked up from
        # path_planner_dict by the config TYPE key; skipped when no config is given.
        if self.planner_cfg is not None:
            self.logger.info(f"init {self.planner_cfg[TYPE]} planner in seq_planner")
            self.planner = path_planner_dict[self.planner_cfg[TYPE]](scene, **self.planner_cfg.get(ARGS, {}))
        else:
            self.planner = None
            self.logger.info("planner config is None in seq_planner and skip initialize")

    def initialize(self, scene):
        """Initialize per-scene planner state and record how long setup took."""
        init_start_time = time.time()
        self._initialize(scene)
        self.record_init_time(time.time() - init_start_time)
|
||||
5
nimbus/components/planner/planner/__init__.py
Normal file
5
nimbus/components/planner/planner/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# Registry mapping a planner type name to its path-planner class.
path_planner_dict = {}


def register(type_name: str, cls):
    """Add *cls* to the path-planner registry under *type_name* (overwrites on repeat)."""
    path_planner_dict.update({type_name: cls})
|
||||
7
nimbus/components/render/__init__.py
Normal file
7
nimbus/components/render/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from .base_renderer import BaseRenderer
|
||||
|
||||
# Registry mapping a config type string to a BaseRenderer subclass.
renderer_dict = {}


def register(type_name: str, cls: "type[BaseRenderer]") -> None:
    """Register *cls* in the renderer registry under *type_name*.

    The annotation is ``type[BaseRenderer]`` (quoted to avoid runtime
    evaluation): callers pass the class object, not an instance.
    Re-registering a name overwrites the previous entry.
    """
    renderer_dict[type_name] = cls
|
||||
80
nimbus/components/render/base_renderer.py
Normal file
80
nimbus/components/render/base_renderer.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
from nimbus.components.data.observation import Observations
|
||||
from nimbus.components.data.scene import Scene
|
||||
from nimbus.components.data.sequence import Sequence
|
||||
from nimbus.daemon.decorators import status_monitor
|
||||
|
||||
|
||||
class BaseRenderer(Iterator):
    """
    Base class for rendering in a simulation environment. This class defines the structure for rendering scenes and
    tracking the rendering process. It manages the current scene and provides hooks for subclasses to implement
    specific rendering logic.

    Args:
        scene_seq_iter (Iterator): An iterator that provides pairs of scenes and sequences to be rendered. Each item
            from the iterator should be a tuple containing a scene and its corresponding sequence.
    """

    def __init__(self, scene_seq_iter: Iterator[tuple[Scene, Sequence]]):
        super().__init__()
        self.scene_seq_iter = scene_seq_iter
        # Scene currently loaded into the renderer; None until the first reset().
        self.scene: Optional[Scene] = None

    @status_monitor()
    def _generate_obs_with_status(self, seq) -> Optional[Observations]:
        # Time the subclass render call; per-frame stats are recorded only on success.
        compute_start_time = time.time()
        obs = self.generate_obs(seq)
        end_start_time = time.time()  # timestamp taken right after rendering finishes
        if obs is not None:
            self.collect_compute_frame_info(len(obs), end_start_time - compute_start_time)
        return obs

    def _next(self):
        """Pull (scene, seq) from upstream, re-initialize on scene change, then render.

        Returns:
            (scene, seq, obs) — seq/obs are None when upstream delivered no
            sequence or rendering produced nothing.

        Raises:
            StopIteration: when the upstream iterator is exhausted.
        """
        try:
            scene, seq = next(self.scene_seq_iter)
            if scene is not None:
                if self.scene is None:
                    self.reset(scene)
                # A different (task_id, name) pair means a new scene must be loaded.
                elif scene.task_id != self.scene.task_id or scene.name != self.scene.name:
                    self.logger.info(f"Scene changed: {self.scene.name} -> {scene.name}")
                    self.reset(scene)
            if seq is None:
                return scene, None, None
            obs = self._generate_obs_with_status(seq)
            if obs is None:
                return scene, None, None
            return scene, seq, obs
        except StopIteration:
            raise StopIteration("No more sequences to process.")
        except Exception as e:
            self.logger.exception(f"Error during rendering: {e}")
            raise e

    @abstractmethod
    def generate_obs(self, seq) -> Optional[Observations]:
        """Render observations for *seq*; return None on failure (subclass hook)."""
        raise NotImplementedError("This method should be overridden by subclasses")

    @abstractmethod
    def _lazy_init(self):
        """Acquire rendering resources for the current scene (subclass hook)."""
        raise NotImplementedError("This method should be overridden by subclasses")

    @abstractmethod
    def _close_resource(self):
        """Release rendering resources held for the previous scene (subclass hook)."""
        raise NotImplementedError("This method should be overridden by subclasses")

    def reset(self, scene):
        """Switch to *scene*: close old resources, lazily init new ones, record init time."""
        try:
            self.scene = scene
            self._close_resource()
            init_start_time = time.time()
            self._lazy_init()
            self.record_init_time(time.time() - init_start_time)
        except Exception as e:
            self.logger.exception(f"Error initializing renderer: {e}")
            # Drop the partially-initialized scene so the next call triggers a full reset.
            self.scene = None
            raise e
|
||||
7
nimbus/components/store/__init__.py
Normal file
7
nimbus/components/store/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from .base_writer import BaseWriter
|
||||
|
||||
# Registry mapping a config type string to a BaseWriter subclass.
writer_dict = {}


def register(type_name: str, cls: "type[BaseWriter]") -> None:
    """Register *cls* in the writer registry under *type_name*.

    The annotation is ``type[BaseWriter]`` (quoted to avoid runtime evaluation):
    callers pass the class object, not an instance.
    Re-registering a name overwrites the previous entry.
    """
    writer_dict[type_name] = cls
|
||||
163
nimbus/components/store/base_writer.py
Normal file
163
nimbus/components/store/base_writer.py
Normal file
@@ -0,0 +1,163 @@
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from copy import copy
|
||||
|
||||
from nimbus.components.data.iterator import Iterator
|
||||
from nimbus.components.data.observation import Observations
|
||||
from nimbus.components.data.scene import Scene
|
||||
from nimbus.components.data.sequence import Sequence
|
||||
from nimbus.daemon import ComponentStatus, StatusReporter
|
||||
from nimbus.utils.flags import is_debug_mode
|
||||
from nimbus.utils.utils import unpack_iter_data
|
||||
|
||||
|
||||
def run_batch(func, args):
    """Invoke *func* once per tuple in *args*, unpacking each tuple as positional arguments."""
    for packed in args:
        func(*packed)
|
||||
|
||||
|
||||
class BaseWriter(Iterator):
    """
    A base class for writing generated sequences and observations to disk. This class defines the structure for
    writing data and tracking the writing process. It manages the current scene, success and total case counts,
    and provides hooks for subclasses to implement specific data writing logic. The writer supports both synchronous
    and asynchronous batch writing modes, allowing for efficient data handling in various scenarios.

    Args:
        data_iter (Iterator): An iterator that provides data to be written, typically containing scenes,
            sequences, and observations.
        seq_output_dir (str): The directory where generated sequences will be saved. Can be None
            if sequence output is not needed.
        obs_output_dir (str): The directory where generated observations will be saved. Can be None
            if observation output is not needed.
        batch_async (bool): If True, the writer will use asynchronous batch writing to improve performance
            when handling large amounts of data. Default is True.
        async_threshold (int): The maximum number of asynchronous write operations that can be in progress
            at the same time. If the threshold is reached, the writer will wait for the oldest operation
            to complete before starting a new one. Default is 1.
        batch_size (int): The number of data items to write in each batch when using asynchronous writing.
            Default is 2, and it will be capped at 8 to prevent potential issues with too many concurrent operations.
    """

    def __init__(
        self,
        data_iter: Iterator[tuple[Scene, Sequence, Observations]],
        seq_output_dir: str,
        obs_output_dir: str,
        batch_async: bool = True,
        async_threshold: int = 1,
        batch_size: int = 2,
    ):
        super().__init__()
        assert (
            seq_output_dir is not None or obs_output_dir is not None
        ), "At least one output directory must be provided"
        self.data_iter = data_iter
        self.seq_output_dir = seq_output_dir
        self.obs_output_dir = obs_output_dir
        self.scene = None
        self.async_mode = batch_async
        # Cap the batch size at 8; larger values risk hanging the executor.
        self.batch_size = batch_size if batch_size <= 8 else 8
        if batch_async and batch_size > self.batch_size:
            self.logger.info("Batch size is larger than 8(probably cause program hang), batch size will be set to 8")
        self.async_threshold = async_threshold
        self.flush_executor = ThreadPoolExecutor(max_workers=max(1, 64 // self.batch_size))
        self.flush_threads = []  # in-flight flush Futures, oldest first
        self.data_buffer = []  # (workflow copy, scene name, seq, obs) tuples awaiting flush
        self.logger.info(
            f"Batch Async Write Mode: {self.async_mode}, async threshold: {self.async_threshold}, batch size:"
            f" {self.batch_size}"
        )
        self.total_case = 0  # cases attempted for the current scene
        self.success_case = 0  # cases written successfully for the current scene
        self.last_scene_key = None  # (task_id, name, task_exec_num) of the previous item
        self.status_reporter = StatusReporter(self.__class__.__name__)

    def _next(self):
        """Consume one item from the upstream iterator and write it out.

        Always returns None (this is a sink stage); raises StopIteration after
        flushing any buffered data once the upstream is exhausted.
        """
        try:
            data = next(self.data_iter)
            scene, seq, obs = unpack_iter_data(data)

            new_key = (scene.task_id, scene.name, scene.task_exec_num) if scene is not None else None

            self.scene = scene

            # Reset per-scene success counters whenever the scene identity changes.
            if new_key != self.last_scene_key:
                if self.scene is not None and self.last_scene_key is not None:
                    self.logger.info(
                        f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                    )
                self.success_case = 0
                self.total_case = 0
                self.last_scene_key = new_key

            if self.scene is None:
                return None

            self.total_case += 1

            self.status_reporter.update_status(ComponentStatus.RUNNING)
            if seq is None and obs is None:
                self.logger.info(f"generate failed, skip once! success rate: {self.success_case}/{self.total_case}")
                self.scene.update_generate_status(success=False)
                return None
            scene_name = self.scene.name
            io_start_time = time.time()
            if self.async_mode:
                # Copy the workflow so the async flush sees a stable snapshot.
                cp_start_time = time.time()
                cp = copy(self.scene.wf)
                cp_end_time = time.time()
                if self.scene.wf is not None:
                    self.logger.info(f"Scene {scene_name} workflow copy time: {cp_end_time - cp_start_time:.2f}s")
                self.data_buffer.append((cp, scene_name, seq, obs))
                if len(self.data_buffer) >= self.batch_size:
                    # Drop completed futures, then throttle if too many are still in flight.
                    self.flush_threads = [t for t in self.flush_threads if not t.done()]

                    if len(self.flush_threads) >= self.async_threshold:
                        self.logger.info("Max async workers reached, waiting for the oldest thread to finish")
                        self.flush_threads[0].result()
                        self.flush_threads = self.flush_threads[1:]

                    to_flush_buffer = self.data_buffer.copy()
                    async_flush = self.flush_executor.submit(run_batch, self.flush_to_disk, to_flush_buffer)
                    if is_debug_mode():
                        async_flush.result()  # surface exceptions immediately in debug mode
                    self.flush_threads.append(async_flush)
                    self.data_buffer = []
                flush_length = len(obs) if obs is not None else len(seq)
            else:
                flush_length = self.flush_to_disk(self.scene.wf, scene_name, seq, obs)
            self.success_case += 1
            self.scene.update_generate_status(success=True)
            self.collect_io_frame_info(flush_length, time.time() - io_start_time)
            self.status_reporter.update_status(ComponentStatus.COMPLETED)
            return None
        except StopIteration:
            # Upstream exhausted: flush remaining buffered data and wait for all
            # in-flight writes before propagating StopIteration.
            if self.async_mode:
                if len(self.data_buffer) > 0:
                    async_flush = self.flush_executor.submit(run_batch, self.flush_to_disk, self.data_buffer)
                    self.flush_threads.append(async_flush)
                for thread in self.flush_threads:
                    thread.result()
            if self.scene is not None:
                self.logger.info(
                    f"Scene {self.scene.name} generate finish, success rate: {self.success_case}/{self.total_case}"
                )
            raise StopIteration("no data")
        except Exception as e:
            self.logger.exception(f"Error during data writing: {e}")
            raise e

    def __del__(self):
        # NOTE(review): __del__ ordering at interpreter shutdown is not guaranteed;
        # blocking on futures here is best-effort cleanup.
        for thread in self.flush_threads:
            thread.result()
        self.logger.info(f"Writer {len(self.flush_threads)} threads closed")
        # Close the simulation app if it exists
        if self.scene is not None and self.scene.simulation_app is not None:
            self.logger.info("Closing simulation app")
            self.scene.simulation_app.close()

    @abstractmethod
    def flush_to_disk(self, task, scene_name, seq, obs):
        """Write one (task, scene_name, seq, obs) item; return its frame count (subclass hook)."""
        raise NotImplementedError("This method should be overridden by subclasses")
|
||||
4
nimbus/daemon/__init__.py
Normal file
4
nimbus/daemon/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# flake8: noqa: E401
|
||||
from .status import ComponentStatus, StatusInfo
|
||||
from .status_monitor import StatusMonitor
|
||||
from .status_reporter import StatusReporter
|
||||
24
nimbus/daemon/decorators.py
Normal file
24
nimbus/daemon/decorators.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from functools import wraps
|
||||
|
||||
from nimbus.daemon import ComponentStatus, StatusReporter
|
||||
|
||||
|
||||
def status_monitor(running_status=ComponentStatus.RUNNING, completed_status=ComponentStatus.COMPLETED):
    """Decorator factory: report *running_status* before the wrapped method runs and
    *completed_status* after it returns successfully.

    Lazily attaches a StatusReporter named after the instance's class to ``self``
    on first use. If the wrapped method raises, the exception propagates unchanged
    and no completed update is sent — the reporter is left in *running_status*.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if not hasattr(self, "status_reporter"):
                self.status_reporter = StatusReporter(self.__class__.__name__)

            self.status_reporter.update_status(running_status)

            # The previous version wrapped this call in `except Exception as e:
            # raise e`, a no-op re-raise; exceptions propagate identically without it.
            result = func(self, *args, **kwargs)
            self.status_reporter.update_status(completed_status)
            return result

        return wrapper

    return decorator
|
||||
21
nimbus/daemon/status.py
Normal file
21
nimbus/daemon/status.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ComponentStatus(Enum):
    """Lifecycle states a pipeline component can report to the status monitor."""

    IDLE = "idle"  # constructed, no work reported yet
    READY = "ready"
    RUNNING = "running"
    COMPLETED = "completed"
    TIMEOUT = "timeout"  # assigned by StatusMonitor when a state exceeds its threshold
|
||||
|
||||
|
||||
@dataclass
class StatusInfo:
    """Snapshot of one component's status and when that status was last set."""

    component_id: str
    status: ComponentStatus
    # Wall-clock timestamp (time.time()) of the most recent status change.
    last_update: float = field(default_factory=time.time)

    def get_status_duration(self) -> float:
        """Return seconds elapsed since `last_update`."""
        return time.time() - self.last_update
|
||||
160
nimbus/daemon/status_monitor.py
Normal file
160
nimbus/daemon/status_monitor.py
Normal file
@@ -0,0 +1,160 @@
|
||||
import threading
|
||||
from typing import Dict, Optional
|
||||
|
||||
from .status import ComponentStatus, StatusInfo
|
||||
|
||||
|
||||
class StatusMonitor:
    """Process-wide singleton collecting StatusInfo snapshots from all components
    and flagging those stuck in one state past a configurable per-status timeout.
    """

    _instance = None
    _lock = threading.Lock()

    # Default seconds a component may stay in each status before being marked TIMEOUT.
    # inf means that status never times out.
    DEFAULT_TIMEOUTS = {
        ComponentStatus.IDLE: 100,
        ComponentStatus.READY: float("inf"),
        ComponentStatus.RUNNING: 360,
        ComponentStatus.COMPLETED: float("inf"),
        ComponentStatus.TIMEOUT: float("inf"),
    }

    def __new__(cls):
        # Double-checked locking so concurrent first calls share one instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every StatusMonitor() call; guard so state is built once.
        if not hasattr(self, "initialized"):
            self.components: Dict[str, StatusInfo] = {}
            self.status_timeouts = self.DEFAULT_TIMEOUTS.copy()
            self.initialized = True

    @classmethod
    def get_instance(cls):
        """Return the singleton instance (constructing it on first use)."""
        return cls()

    def set_logger(self, logger):
        """Attach a logger; until set, _record() falls back to print()."""
        self.logger = logger

    def set_status_timeout(self, status: ComponentStatus, timeout_seconds: float):
        """Override the timeout threshold for a single status."""
        self.status_timeouts[status] = timeout_seconds

    def set_component_timeouts(self, timeouts: Dict[str, float]):
        """Bulk-update timeout thresholds from a {status name or enum: seconds} mapping.

        Invalid keys or values are logged and skipped; negative timeouts are
        treated as "never time out" (inf).
        """
        converted_timeouts = {}

        for status_name, timeout_value in timeouts.items():
            try:
                if isinstance(status_name, str):
                    status = ComponentStatus[status_name.upper()]
                elif isinstance(status_name, ComponentStatus):
                    status = status_name
                else:
                    self._record(
                        f"Warning: Invalid status type '{type(status_name)}' for status '{status_name}', skipping"
                    )
                    continue

                try:
                    timeout_value = float(timeout_value)
                    if timeout_value < 0:
                        timeout_value = float("inf")

                    converted_timeouts[status] = timeout_value
                    self._record(f"Set timeout for {status.value}: {timeout_value}s")

                except (ValueError, TypeError) as e:
                    self._record(
                        f"Warning: Invalid timeout value '{timeout_value}' for status '{status_name}': {e}, skipping"
                    )
                    continue

            except KeyError:
                self._record(
                    f"Warning: Unknown status '{status_name}', skipping. Available statuses:"
                    f" {[s.name for s in ComponentStatus]}"
                )
                continue
            except Exception as e:
                self._record(f"Error processing status '{status_name}': {e}, skipping")
                continue

        self.status_timeouts.update(converted_timeouts)

    def register_update(self, status_info: StatusInfo):
        """Store the latest StatusInfo for its component (keyed by component_id)."""
        self.components[status_info.component_id] = status_info

    def get_all_status(self) -> Dict[str, StatusInfo]:
        """Return a shallow copy of all tracked component statuses."""
        return self.components.copy()

    def get_status(self, component_id: str) -> Optional[StatusInfo]:
        """Return the StatusInfo for *component_id*, or None if unknown."""
        return self.components.get(component_id)

    def get_timeout_components(self) -> Dict[str, StatusInfo]:
        """Return all components currently marked TIMEOUT."""
        timeout_components = {}
        for component_id, status_info in self.components.items():
            if status_info.status == ComponentStatus.TIMEOUT:
                timeout_components[component_id] = status_info
        return timeout_components

    def get_components_length(self):
        """Return the number of tracked components."""
        return len(self.components)

    def check_and_update_timeouts(self) -> Dict[str, StatusInfo]:
        """Scan all components, flip overdue ones to TIMEOUT, and return those
        that were found (or are already) timed out."""
        newly_timeout_components = {}
        components = self.get_all_status()
        for component_id, status_info in components.items():
            if status_info.status == ComponentStatus.TIMEOUT:
                newly_timeout_components[component_id] = status_info
                continue

            time_since_update = status_info.get_status_duration()
            # Statuses without an explicit threshold fall back to 300s.
            timeout_threshold = self.status_timeouts.get(status_info.status, 300)
            self._record(
                f"[COMPONENT DETAIL] {component_id}: "
                f"Status={status_info.status}, "
                f"Duration={status_info.get_status_duration():.1f}s, "
                f"Threshold={timeout_threshold}s"
            )

            if time_since_update > timeout_threshold:
                self._record(
                    f"Component {component_id} timeout: {status_info.status.value} for {time_since_update:.1f}s"
                    f" (threshold: {timeout_threshold}s)"
                )

                status_info.status = ComponentStatus.TIMEOUT
                # NOTE(review): this assigns a *duration* to the last_update
                # *timestamp* field — likely meant time.time(); verify intent.
                status_info.last_update = time_since_update
                newly_timeout_components[component_id] = status_info

        return newly_timeout_components

    def clear(self):
        """Forget all registered components."""
        self.components.clear()
        self._record("Cleared all registered components.")

    def get_component_status_duration(self, component_id: str) -> Optional[float]:
        """Return how long *component_id* has been in its current status, or None if unknown."""
        status_info = self.components.get(component_id)
        if status_info:
            return status_info.get_status_duration()
        return None

    def get_all_status_with_duration(self) -> Dict[str, Dict]:
        """Return per-component status, duration, applicable threshold and last update time."""
        result = {}
        for comp_id, status_info in self.components.items():
            result[comp_id] = {
                "status": status_info.status,
                "duration": status_info.get_status_duration(),
                "timeout_threshold": self.status_timeouts.get(status_info.status, 300),
                "last_update": status_info.last_update,
            }
        return result

    def set_check_interval(self, interval_seconds: float):
        """Record the daemon's polling interval (informational; not read in this class)."""
        self.check_interval = interval_seconds
        self._record(f"Set daemon check interval to {interval_seconds}s")

    def _record(self, info):
        # Log through the attached logger when available, else stdout.
        if hasattr(self, "logger") and self.logger is not None:
            self.logger.info(f"[STATUS MONITOR]: {info}")
        else:
            print(f"[STATUS MONITOR]: {info}")
|
||||
21
nimbus/daemon/status_reporter.py
Normal file
21
nimbus/daemon/status_reporter.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import threading
|
||||
import time
|
||||
|
||||
from .status import ComponentStatus, StatusInfo
|
||||
from .status_monitor import StatusMonitor
|
||||
|
||||
|
||||
class StatusReporter:
    """Thread-safe publisher of a single component's status to the global StatusMonitor."""

    def __init__(self, component_id: str):
        self.component_id = component_id
        # Components start in IDLE until they report otherwise.
        self._status_info = StatusInfo(component_id, ComponentStatus.IDLE)
        self._lock = threading.Lock()

    def update_status(self, status: ComponentStatus):
        """Record *status* with a fresh timestamp and push it to the monitor singleton."""
        with self._lock:
            self._status_info = StatusInfo(component_id=self.component_id, status=status, last_update=time.time())
            StatusMonitor.get_instance().register_update(self._status_info)

    def get_status(self) -> StatusInfo:
        """Return the most recently recorded StatusInfo snapshot."""
        with self._lock:
            return self._status_info
|
||||
66
nimbus/data_engine.py
Normal file
66
nimbus/data_engine.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from time import time
|
||||
|
||||
from nimbus.dist_sim.head_node import HeadNode
|
||||
from nimbus.scheduler.sches import gen_pipe, gen_scheduler
|
||||
from nimbus.utils.logging import configure_logging
|
||||
from nimbus.utils.random import set_all_seeds
|
||||
from nimbus.utils.types import (
|
||||
NAME,
|
||||
SAFE_THRESHOLD,
|
||||
STAGE_PIPE,
|
||||
WORKER_SCHEDULE,
|
||||
StageInput,
|
||||
)
|
||||
from nimbus.utils.utils import consume_stage
|
||||
|
||||
|
||||
class DataEngine:
    """Single-process engine: runs the scheduled stages sequentially,
    feeding each stage's output into the next.
    """

    def __init__(self, config, master_seed=None):
        # Seed every RNG only when an explicit master seed is supplied.
        if master_seed is not None:
            master_seed = int(master_seed)
            set_all_seeds(master_seed)
        exp_name = config[NAME]
        configure_logging(exp_name, config=config)
        self._sche_list = gen_scheduler(config)
        self._stage_input = StageInput()

    def run(self):
        """Run all stages in order, then drain the final stage's output."""
        for stage in self._sche_list:
            self._stage_input = stage.run(self._stage_input)
        consume_stage(self._stage_input)
|
||||
|
||||
|
||||
class DistPipeDataEngine:
    """Distributed pipelined engine: each stage pipe gets a HeadNode, and the
    nodes are chained through Ray result queues.
    """

    def __init__(self, config, master_seed=None):
        self._sche_list = gen_scheduler(config)
        self.config = config
        self._stage_input = StageInput()
        exp_name = config[NAME]
        self.logger = configure_logging(exp_name, config=config)
        master_seed = int(master_seed) if master_seed is not None else None
        self.pipe_list = gen_pipe(config, self._sche_list, exp_name, master_seed=master_seed)
        # pipe index -> HeadNode; populated in run().
        self.head_nodes = {}

    def run(self):
        """Start one HeadNode per pipe (wiring each node's result queue into the
        next node's input), then block until every node has drained and stopped.
        """
        self.logger.info("[DistPipeDataEngine]: %s", self.pipe_list)
        st_time = time()
        cur_pipe_queue = None  # the first pipe has no upstream queue
        pre_worker_num = 0  # worker count of the previous pipe (0 for the first)
        worker_schedule = self.config[STAGE_PIPE].get(WORKER_SCHEDULE, False)
        for idx, pipe in enumerate(self.pipe_list):
            self.head_nodes[idx] = HeadNode(
                cur_pipe_queue,
                pipe,
                pre_worker_num,
                self.config[STAGE_PIPE][SAFE_THRESHOLD],
                worker_schedule,
                self.logger,
                idx,
            )
            self.head_nodes[idx].run()
            cur_pipe_queue = self.head_nodes[idx].result_queue()
            pre_worker_num = len(pipe)
        for _, value in self.head_nodes.items():
            value.wait_stop()
        et_time = time()
        self.logger.info("execution duration: %s", et_time - st_time)
|
||||
0
nimbus/dist_sim/__init__.py
Normal file
0
nimbus/dist_sim/__init__.py
Normal file
201
nimbus/dist_sim/head_node.py
Normal file
201
nimbus/dist_sim/head_node.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import traceback
|
||||
from threading import Thread
|
||||
from time import sleep, time
|
||||
|
||||
import ray
|
||||
from ray.util.queue import Queue
|
||||
|
||||
from nimbus.components.data.package import Package
|
||||
from nimbus.dist_sim.task_board import TaskBoard
|
||||
from nimbus.scheduler.inner_pipe import PipeWorkerGroup
|
||||
|
||||
|
||||
class HeadNode:
|
||||
def __init__(
|
||||
self, data_queue, workers: PipeWorkerGroup, pre_worker_num, safe_threshold, worker_schedule, logger, idx
|
||||
):
|
||||
self.idx = idx
|
||||
self.data_queue = data_queue
|
||||
self.logger = logger
|
||||
self.worker_group = workers
|
||||
logger.info(f"workers: {list(workers.keys())}")
|
||||
self.pre_worker_num = pre_worker_num
|
||||
self.safe_threshold = safe_threshold
|
||||
self.worker_schedule = worker_schedule
|
||||
logger.info(f"safe_threshold: {self.safe_threshold}")
|
||||
logger.info(f"worker_schedule: {self.worker_schedule}")
|
||||
self.task_queue = Queue() if data_queue is not None else None
|
||||
self.output_queue = Queue()
|
||||
self.GEN_STOP_SIG = False
|
||||
self.task_board = TaskBoard()
|
||||
self.gen_thread = Thread(target=self.gen_tasks, args=())
|
||||
self.gen_thread.start()
|
||||
self.should_stop = False
|
||||
self.run_thread = None
|
||||
# Map runner ObjectRef to worker name for proper cleanup
|
||||
self.runner_to_worker = {}
|
||||
self.all_workers_spawned = False
|
||||
|
||||
def gen_tasks(self):
|
||||
self.logger.info(f"headnode: {self.idx}: =============start gen task=============")
|
||||
pre_worker_stop_num = 0
|
||||
while not self.GEN_STOP_SIG:
|
||||
if self.data_queue is None:
|
||||
self.logger.info(f"headnode: {self.idx}: =============Gen Tasks stop==============")
|
||||
self.all_workers_spawned = True
|
||||
return
|
||||
if self.data_queue.empty():
|
||||
sleep(0)
|
||||
continue
|
||||
if self.task_queue is not None and self.task_queue.size() >= self.safe_threshold:
|
||||
sleep(1)
|
||||
continue
|
||||
task = self.data_queue.get()
|
||||
assert isinstance(
|
||||
task, Package
|
||||
), f"the transfered type of data should be Package type, but it is {type(task)}"
|
||||
if task.should_stop():
|
||||
pre_worker_stop_num += 1
|
||||
self.logger.info(
|
||||
f"headnode: {self.idx}: Received stop signal from upstream worker"
|
||||
f" ({pre_worker_stop_num}/{self.pre_worker_num})"
|
||||
)
|
||||
|
||||
# Dynamic worker scheduling: spawn new worker when upstream worker finishes
|
||||
if self.worker_schedule:
|
||||
self.logger.info(
|
||||
f"headnode: {self.idx}: Worker schedule enabled, will spawn 1 new worker after resource release"
|
||||
)
|
||||
# Wait for upstream resources to be released by upstream HeadNode's wait_stop()
|
||||
# Retry mechanism to handle resource release timing
|
||||
max_retries = 30 # 30 * 2s = 60s max wait
|
||||
retry_interval = 2
|
||||
|
||||
for retry in range(max_retries):
|
||||
try:
|
||||
self.logger.info(
|
||||
f"headnode: {self.idx}: Attempting to spawn new worker (attempt"
|
||||
f" {retry + 1}/{max_retries})..."
|
||||
)
|
||||
created_workers = self.worker_group.spawn(1)
|
||||
if created_workers:
|
||||
for worker_name, worker_bundle in created_workers:
|
||||
# Start the new worker
|
||||
runner = worker_bundle["worker"].run.remote(self.task_queue, self.output_queue)
|
||||
self.runner_to_worker[runner] = worker_name
|
||||
self.logger.info(
|
||||
f"headnode: {self.idx}: Successfully spawned and started new worker:"
|
||||
f" {worker_name}"
|
||||
)
|
||||
sleep(5)
|
||||
break # Success, exit retry loop
|
||||
except Exception as e:
|
||||
if retry < max_retries - 1:
|
||||
self.logger.warning(
|
||||
f"headnode: {self.idx}: Failed to spawn worker (attempt {retry + 1}), will retry in"
|
||||
f" {retry_interval}s: {e}"
|
||||
)
|
||||
sleep(retry_interval)
|
||||
else:
|
||||
self.logger.error(
|
||||
f"headnode: {self.idx}: Failed to spawn new worker after"
|
||||
f" {max_retries} attempts: {e}"
|
||||
)
|
||||
self.logger.error(traceback.format_exc())
|
||||
|
||||
if pre_worker_stop_num == self.pre_worker_num:
|
||||
for _ in range(len(self.worker_group)):
|
||||
self.logger.info(f"headnode: {self.idx}: get stop signal")
|
||||
stop_pack = Package(None, stop_sig=True)
|
||||
self.task_board.reg_task(stop_pack)
|
||||
self.all_workers_spawned = True
|
||||
return
|
||||
else:
|
||||
self.task_board.reg_task(task)
|
||||
if self.data_queue and not self.data_queue.empty():
|
||||
task = self.data_queue.get_nowait()
|
||||
self.task_board.reg_task(task)
|
||||
self.logger.info("=============Gen Tasks stop==============")
|
||||
self.all_workers_spawned = True
|
||||
|
||||
def result_queue(self):
|
||||
return self.output_queue
|
||||
|
||||
def run(self):
|
||||
self.logger.info(f"headnode: {self.idx}: ==============Running Head Node================")
|
||||
for worker_name, worker_bundle in self.worker_group.items():
|
||||
runner = worker_bundle["worker"].run.remote(self.task_queue, self.output_queue)
|
||||
self.runner_to_worker[runner] = worker_name
|
||||
sleep(5)
|
||||
|
||||
def inner_run():
|
||||
while not self.should_stop:
|
||||
tasks = self.task_board.get_tasks(timeout=0.05)
|
||||
if len(tasks) == 0:
|
||||
sleep(0)
|
||||
continue
|
||||
while self.task_queue.size() >= self.safe_threshold and not self.should_stop:
|
||||
sleep(1)
|
||||
for _, task in enumerate(tasks):
|
||||
self.task_queue.put(task)
|
||||
|
||||
self.run_thread = Thread(target=inner_run)
|
||||
self.run_thread.start()
|
||||
|
||||
def sig_stop(self):
|
||||
self.logger.info(f"headnode: {self.idx}: ============Gen Stop===============")
|
||||
self.GEN_STOP_SIG = True
|
||||
self.gen_thread.join()
|
||||
|
||||
    def wait_stop(self):
        """Block until all workers finish, then shut down the feeder thread.

        With dynamic worker scheduling, non-zero head nodes first wait (up to
        10 minutes) for their workers to be spawned before waiting on runners.
        """
        if self.worker_schedule and self.idx != 0:
            self.logger.info(f"headnode: {self.idx}: Waiting for all worker spawning to complete...")
            timeout = 600  # 600 seconds timeout
            start_time = time()
            while not self.all_workers_spawned:
                if time() - start_time > timeout:
                    self.logger.warning(
                        f"headnode: {self.idx}: Timeout waiting for worker spawning completion after {timeout}s"
                    )
                    break
                sleep(0.1)

            if self.all_workers_spawned:
                self.logger.info(f"headnode: {self.idx}: All worker spawning completed, proceeding to wait for runners")

        remaining_runners = list(self.runner_to_worker.keys())
        for runner in remaining_runners:
            self.logger.info(f"headnode: {self.idx}: remaining runner include: {self.runner_to_worker[runner]}")

        while remaining_runners:
            # Poll with a short timeout instead of blocking forever so the
            # loop stays responsive.
            ready, _ = ray.wait(remaining_runners, num_returns=len(remaining_runners), timeout=1.0)

            for finished_runner in ready:
                worker_name = self.runner_to_worker.get(finished_runner, "unknown")
                self.logger.info(f"headnode: {self.idx}: Worker {worker_name} finished")
                try:
                    ray.get(finished_runner)  # re-raises any exception from the worker
                    self.logger.info(f"headnode: {self.idx}: Worker {worker_name} completed successfully")
                    # NOTE(review): the success path removes without a membership
                    # check, while the failure path below guards with `in` —
                    # confirm a worker cannot already be gone here.
                    self.worker_group.remove(worker_name, self.logger)
                except Exception as e:
                    self.logger.error(f"Worker {worker_name} failed, error stack:")
                    self.logger.error(e)
                    if worker_name in self.worker_group.keys():
                        self.worker_group.remove(worker_name, self.logger)

                remaining_runners.remove(finished_runner)
                self.runner_to_worker.pop(finished_runner, None)

            if not ready:
                sleep(1)

        self.logger.info(f"headnode: {self.idx}: ==============stop head================")
        self.should_stop = True
        if self.run_thread is not None:
            self.run_thread.join()
        self.sig_stop()
|
||||
def __del__(self):
|
||||
if self.task_queue is not None:
|
||||
self.task_queue.shutdown()
|
||||
self.output_queue.shutdown()
|
||||
42
nimbus/dist_sim/task_board.py
Normal file
42
nimbus/dist_sim/task_board.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import time
|
||||
from threading import Lock
|
||||
|
||||
|
||||
class Task:
    """Minimal unit of work tracked by a TaskBoard.

    Serves as a base/placeholder type; concrete task kinds may subclass it
    and override :meth:`update_state`.
    """

    def update_state(self, state):
        """State-transition hook; the base implementation does nothing."""
|
||||
|
||||
|
||||
class TaskBoard:
    """Thread-safe registry of pending tasks shared between a producer thread
    and consumer threads.

    Attributes:
        tasks: Pending tasks awaiting a consumer.
        flying_tasks / finished_tasks: Reserved for in-flight / completed
            bookkeeping (not yet used here).
        task_cnt: Total number of tasks ever registered.
    """

    def __init__(self):
        self.tasks = []
        self.flying_tasks = []
        self.finished_tasks = []
        self.task_cnt = 0
        self.task_lock = Lock()
        self.flying_task_lock = Lock()

    def reg_task(self, task):
        """Register a new pending task (thread-safe)."""
        with self.task_lock:
            self.tasks.append(task)
            self.task_cnt += 1

    def get_tasks(self, timeout=0):
        """Drain and return all pending tasks.

        Waits up to `timeout` seconds for at least one task to appear;
        returns [] on timeout.
        """
        st_time = time.time()
        while len(self.tasks) == 0:
            if time.time() - st_time > timeout:
                return []
            # Yield the CPU while waiting; the original spun with `pass`,
            # burning a full core for the whole timeout.
            time.sleep(0.001)
        with self.task_lock:
            tasks = self.tasks.copy()
            self.tasks = []
        return tasks

    def commit_task(self, tasks):
        raise NotImplementedError("commit_task not implemented")

    def finished(self):
        raise NotImplementedError("finished not implemented")
|
||||
0
nimbus/scheduler/__init__.py
Normal file
0
nimbus/scheduler/__init__.py
Normal file
277
nimbus/scheduler/inner_pipe.py
Normal file
277
nimbus/scheduler/inner_pipe.py
Normal file
@@ -0,0 +1,277 @@
|
||||
import math
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
from nimbus.daemon.status_monitor import StatusMonitor
|
||||
from nimbus.scheduler.stages import DedumpStage, DumpStage
|
||||
from nimbus.utils.logging import configure_logging
|
||||
from nimbus.utils.random import set_all_seeds
|
||||
from nimbus.utils.types import MONITOR_CHECK_INTERVAL, STATUS_TIMEOUTS, StageInput
|
||||
from nimbus.utils.utils import init_env, pipe_consume_stage
|
||||
|
||||
|
||||
def iter_to_obj(iter_obj):
    """Fully drain *iter_obj* via pipe_consume_stage and return (result, True).

    The constant True mirrors the (results, finish) shape of _consume_N for
    callers that treat "fully drained" as finished.
    """
    return pipe_consume_stage(iter_obj), True
|
||||
|
||||
|
||||
def _consume_N(iter_obj, N=1):
|
||||
print("consume: ", iter_obj)
|
||||
results = []
|
||||
finish = False
|
||||
for _ in range(N):
|
||||
try:
|
||||
obj = next(iter_obj)
|
||||
results.append(obj)
|
||||
except StopIteration:
|
||||
finish = True
|
||||
return results, finish
|
||||
|
||||
|
||||
def consume_N(stage_input):
    """Materialize the iterators held in *stage_input* in place.

    Returns the mutated stage_input and a flag indicating whether the
    positional iterator was exhausted.
    """
    finish = False
    if hasattr(stage_input, "Args"):
        # NOTE(review): _consume_N's N parameter is not forwarded here, so
        # only one item is pulled per call — confirm that is intended.
        stage_input.Args, finish = _consume_N(stage_input.Args[0])
    if hasattr(stage_input, "Kwargs"):
        if stage_input.Kwargs is not None:
            # NOTE(review): each value becomes an (items, finished) tuple and
            # the per-key finished flags are discarded — verify downstream
            # consumers expect this shape.
            stage_input.Kwargs = {key: _consume_N(value) for key, value in stage_input.Kwargs.items()}
    return stage_input, finish
|
||||
|
||||
|
||||
class PipeWorkerGroup:
    """
    Manages a group of pipe workers and their supervisors.
    Supports dynamic worker spawning for worker_schedule feature.

    Each worker is stored as a bundle {"worker": actor, "supervisor": actor}
    keyed by a unique name "p{pipe_num}_w{idx}".
    """

    def __init__(
        self,
        pipe_name,
        exp_name,
        pipe_num,
        stage_list,
        master_seed,
        supervisor_class,
        inner_pipe_class,
        initial_instances=0,
    ):
        self.workers = {}
        self._next_worker_idx = 0  # monotonically increasing; names are never reused
        self.pipe_name = pipe_name
        self.exp_name = exp_name
        self.pipe_num = pipe_num
        self.stage_list = stage_list
        self.master_seed = master_seed
        self.supervisor_class = supervisor_class
        self.inner_pipe_class = inner_pipe_class

        if initial_instances > 0:
            self.spawn(initial_instances)

    def spawn(self, count):
        """
        Spawn new workers dynamically.
        Returns list of (name, bundle) tuples for created workers.
        """
        created = []
        for _ in range(count):
            name = f"p{self.pipe_num}_w{self._next_worker_idx}"
            # Derive a distinct but reproducible seed per worker.
            worker_seed = self.master_seed + self._next_worker_idx if self.master_seed is not None else None
            supervisor = self.supervisor_class.remote(name)
            pipe_actor = self.inner_pipe_class.remote(self.stage_list, name, supervisor, seed=worker_seed)
            ray.get(supervisor.set_pipe.remote(pipe_actor))  # block until wired up
            supervisor.run.remote()
            bundle = {"worker": pipe_actor, "supervisor": supervisor}
            self.workers[name] = bundle
            created.append((name, bundle))
            self._next_worker_idx += 1
            time.sleep(3)  # stagger actor start-up

        if created:
            print(f"{self.pipe_name}: spawned {len(created)} workers - {[name for name, _ in created]}")
        return created

    def items(self):
        """Return items view of workers dictionary."""
        return self.workers.items()

    def values(self):
        """Return values view of workers dictionary."""
        return self.workers.values()

    def keys(self):
        """Return keys view of workers dictionary."""
        return self.workers.keys()

    def __len__(self):
        """Return number of workers in the group."""
        return len(self.workers)

    def __repr__(self):
        worker_names = list(self.workers.keys())
        return f"PipeWorkerGroup({worker_names})"

    def __getitem__(self, key):
        """Support dictionary-style access."""
        return self.workers[key]

    def remove(self, name, logger):
        """Remove a worker from the group, killing both of its actors.

        The original indexed self.workers[name] before its membership check,
        so the guard was dead code and an unknown name raised KeyError; pop
        first and bail out gracefully instead.
        """
        bundle = self.workers.pop(name, None)
        if bundle is None:
            logger.warning(f"worker {name} not found in group, nothing to remove")
            return
        ray.kill(bundle["worker"])
        logger.info(f"killed worker actor {name} to release GPU resouces")
        ray.kill(bundle["supervisor"])
        logger.info(f"Supervisor {name} killed successfully")
|
||||
|
||||
|
||||
def make_pipe(pipe_name, exp_name, pipe_num, stage_list, dev, instance_num, total_processes, config, master_seed=None):
    """Build a PipeWorkerGroup of Ray actor pairs (InnerPipe + Supervisor) for
    one pipeline segment.

    Args:
        pipe_name: Human-readable segment name (used in logs).
        exp_name: Experiment name for log routing.
        pipe_num: Segment index; part of each worker's unique name.
        stage_list: Stages this segment executes, in order.
        dev: "gpu" to allocate a fractional GPU per worker, anything else for CPU.
        instance_num: Number of worker/supervisor pairs to create up front.
        total_processes: Total workers across all segments; used to split GPUs.
        config: Segment config (monitor intervals, component timeouts).
        master_seed: Optional base seed for per-worker reproducibility.

    Returns:
        The populated PipeWorkerGroup.
    """
    # Fractional GPU share per worker when this segment runs on GPU.
    gpu_num = 0
    if dev == "gpu":
        resources = ray.cluster_resources()
        total_gpus = resources.get("GPU", 0)
        assert total_gpus > 0, "not enough gpu resources"
        processes_per_gpu = math.ceil(total_processes / total_gpus)
        gpu_num = 1.0 / processes_per_gpu

    @ray.remote
    class Supervisor:
        # Watchdog actor: receives component heartbeats from its InnerPipe and
        # restarts the pipe actor when components time out or go silent.
        def __init__(self, name):
            self.name = "supervisor_" + name
            self.pipe_worker = None  # bound later via set_pipe()
            self.logger = configure_logging(exp_name, self.name)
            self.logger.info("Supervisor started")
            self.monitor = StatusMonitor.get_instance()
            self.monitor.set_logger(self.logger)

            self._last_status_check = 0.0
            self.check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)
            self.logger.info(f"Monitor check interval: {self.check_interval} seconds")
            if config.get(STATUS_TIMEOUTS, None) is not None:
                self.monitor.set_component_timeouts(config[STATUS_TIMEOUTS])

        def set_pipe(self, pipe_worker):
            # Late-bind the supervised InnerPipe actor handle.
            self.logger.info("set pipe worker")
            self.pipe_worker = pipe_worker

        def set_queue(self, input_queue, output_queue):
            # NOTE(review): stored but never read within this function —
            # confirm external callers rely on these attributes.
            self.input_queue = input_queue
            self.output_queue = output_queue

        def _restart_worker(self):
            # no_restart=False lets Ray recreate the actor (max_restarts=3).
            try:
                ray.kill(self.pipe_worker, no_restart=False)
                self.logger.info("trigger restart of the actor")
            except Exception as ke:
                self.logger.error(f"restart actor error: {ke}")

        def update_component_state(self, components_state):
            # Called remotely by InnerPipe with its latest component heartbeats.
            for _, state in components_state.items():
                self.monitor.register_update(state)

        def _start_daemon(self):
            # Periodic health-check loop; runs forever in a daemon thread.
            miss_cnt = 0
            while True:
                now = time.time()
                if now - self._last_status_check >= self.check_interval:
                    try:
                        timeout_components = self.monitor.check_and_update_timeouts()
                        if len(timeout_components) > 0:
                            self.logger.warning(f"Components timeout: {timeout_components}, restart the pipe worker")
                            self._restart_worker()
                            self.monitor.clear()
                        else:
                            if self.monitor.get_components_length() == 0:
                                # No heartbeats at all: the worker may be hung
                                # before registering any component.
                                miss_cnt += 1
                                self.logger.info(f"No components timeout detected, miss count: {miss_cnt}")
                                if miss_cnt >= 5:
                                    self.logger.info("No components detected for 5 consecutive checks, restart pipe worker")
                                    self._restart_worker()
                                    self.monitor.clear()
                                    miss_cnt = 0
                    except Exception as e:
                        self.logger.error(f"Get components status failed: {e}")
                        self._restart_worker()
                        self.monitor.clear()
                    self._last_status_check = now
                time.sleep(1)

        def run(self):
            assert self.pipe_worker is not None, "pipe worker is not set"
            thread = threading.Thread(target=self._start_daemon, daemon=True)
            thread.start()

    @ray.remote(num_gpus=gpu_num, max_restarts=3, max_task_retries=3)
    class InnerPipe:
        # One pipeline worker: runs its stage list end-to-end while a
        # background thread reports component heartbeats to the Supervisor.
        def __init__(self, stage_list, name, supervisor, seed=None):
            if seed is not None:
                set_all_seeds(seed)  # per-worker reproducibility
            self.stages = stage_list
            self.name = name
            self.supervisor = supervisor
            init_env()
            self.logger = configure_logging(exp_name, self.name)
            self.logger.info(f"Working on gpu {os.environ.get('CUDA_VISIBLE_DEVICES')}")
            # Loudly flag actor restarts triggered by Ray fault tolerance.
            if ray.get_runtime_context().was_current_actor_reconstructed is True:
                msg = (
                    f"{'='*80}\n"
                    "!!! ATTENTION !!!\n"
                    f"!!! InnerPipe {name} WAS RECONSTRUCTED due to SYSTEM ERROR !!!\n"
                    "!!! Please CHECK LOGS in /tmp/ray/session_latest/logs/ for details !!!\n"
                    f"{'='*80}\n"
                )
                self.logger.info(msg)

            self.monitor = StatusMonitor.get_instance()
            self.monitor.set_logger(self.logger)

            self.monitor_check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)

        def _update_supervisor(self):
            # Heartbeat loop: push component status to the Supervisor once per
            # monitor interval.
            while True:
                for _ in range(self.monitor_check_interval):
                    time.sleep(1)
                components_status = self.monitor.get_all_status()
                ray.get(self.supervisor.update_component_state.remote(components_status))

        def run(self, input_queue, output_queue):
            """Execute all stages in order, wiring queues into Dump/Dedump stages."""
            self.logger.info(f"[InnerPipe stages]: {self.stages}")

            thread = threading.Thread(target=self._update_supervisor, daemon=True)
            thread.start()
            self.logger.info("Reporter started, start running pipe")

            mid_results = StageInput()
            # if input_queue is None:
            #     mid_results = StageInput()
            # else:
            #     mid_results = StageInput((input_queue,), {})
            for _, stage in enumerate(self.stages):
                if isinstance(stage, DumpStage):
                    mid_results = stage.run(mid_results, output_queue)
                elif isinstance(stage, DedumpStage):
                    mid_results = stage.run(mid_results, input_queue)
                else:
                    mid_results = stage.run(mid_results)
            # Drain the final lazy iterator so every stage actually executes.
            result, finish = iter_to_obj(mid_results)
            self.logger.info("====================================")
            self.logger.info(f"result: {result}, finish: {finish}")
            self.logger.info("====================================")
            # Work is done: kill the watchdog so it cannot restart this actor.
            ray.kill(self.supervisor)
            self.logger.info("actor finished")
            return finish

    group = PipeWorkerGroup(
        pipe_name=pipe_name,
        exp_name=exp_name,
        pipe_num=pipe_num,
        stage_list=stage_list,
        master_seed=master_seed,
        supervisor_class=Supervisor,
        inner_pipe_class=InnerPipe,
        initial_instances=instance_num,
    )
    print(pipe_name, group)
    return group
|
||||
115
nimbus/scheduler/instructions.py
Normal file
115
nimbus/scheduler/instructions.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from abc import abstractmethod
|
||||
|
||||
from nimbus.components.dedump import dedumper_dict
|
||||
from nimbus.components.dump import dumper_dict
|
||||
from nimbus.components.load import layout_randomizer_dict, scene_loader_dict
|
||||
from nimbus.components.plan_with_render import plan_with_render_dict
|
||||
from nimbus.components.planner import seq_planner_dict
|
||||
from nimbus.components.render import renderer_dict
|
||||
from nimbus.components.store import writer_dict
|
||||
from nimbus.utils.types import ARGS, PLANNER, TYPE
|
||||
|
||||
|
||||
class Instruction:
    """Abstract wrapper around one configured pipeline operation.

    Subclasses resolve their concrete implementation from a registry using
    the stored config and implement :meth:`run`.
    """

    def __init__(self, config):
        self.config = config

    @abstractmethod
    def run(self, stage_input):
        """Execute this instruction against *stage_input*."""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class LoadSceneInstruction(Instruction):
    """Loads scenes via a scene-loader iterator selected from the registry."""

    def __init__(self, config):
        super().__init__(config)
        self.scene_iter = scene_loader_dict[self.config[TYPE]]

    def run(self, stage_input):
        """Return a scene iterator, optionally fed by an upstream pack iterator.

        Fixed the duplicated assignment (`pack_iter = pack_iter = ...`) from
        the original.
        """
        pack_iter = stage_input.Args[0] if stage_input.Args is not None else None
        return self.scene_iter(pack_iter=pack_iter, **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class RandomizeLayoutInstruction(Instruction):
    """Wraps a scene iterator with layout randomization."""

    def __init__(self, config):
        super().__init__(config)
        # Resolve the randomizer implementation from the registry by type key.
        self.layout_randomlizer = layout_randomizer_dict[self.config[TYPE]]

    def run(self, stage_input):
        """Return an iterator of layout-randomized scenes."""
        return self.layout_randomlizer(stage_input.Args[0], **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class PlanPathInstruction(Instruction):
    """Plans paths over a scene iterator using a registered sequence planner."""

    def __init__(self, config):
        super().__init__(config)
        self.seq_planner = seq_planner_dict[self.config[TYPE]]

    def run(self, stage_input):
        """Return a planned-sequence iterator for the incoming scenes."""
        scene_iter = stage_input.Args[0]
        # .get() replaces the original `x if PLANNER in cfg else None`, matching
        # the idiom already used for ARGS below.
        planner_cfg = self.config.get(PLANNER)
        return self.seq_planner(scene_iter, planner_cfg, **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class RenderInstruction(Instruction):
    """Renders observations from a scene-sequence iterator."""

    def __init__(self, config):
        super().__init__(config)
        self.renderer = renderer_dict[self.config[TYPE]]

    def run(self, stage_input):
        """Return an observation iterator produced by the configured renderer."""
        return self.renderer(stage_input.Args[0], **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class PlanWithRenderInstruction(Instruction):
    """Runs combined plan-and-render processing over a scene iterator."""

    def __init__(self, config):
        super().__init__(config)
        # Equivalent to config[TYPE]; normalized to self.config like siblings.
        self.plan_with_render = plan_with_render_dict[self.config[TYPE]]

    def run(self, stage_input):
        """Return an iterator of plan-with-render results."""
        return self.plan_with_render(stage_input.Args[0], **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class StoreInstruction(Instruction):
    """Persists observations via a registered writer."""

    def __init__(self, config):
        super().__init__(config)
        # Equivalent to config[TYPE]; normalized to self.config like siblings.
        self.writer = writer_dict[self.config[TYPE]]

    def run(self, stage_input):
        """Return an iterator that writes items as they are consumed."""
        return self.writer(stage_input.Args[0], **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class DumpInstruction(Instruction):
    """Feeds items from an upstream iterator into an output queue via a dumper."""

    def __init__(self, config):
        super().__init__(config)
        # Equivalent to config[TYPE]; normalized to self.config like siblings.
        self.dumper = dumper_dict[self.config[TYPE]]

    def run(self, stage_input, output_queue=None):
        """Return the dump iterator wrapping the upstream iterator."""
        return self.dumper(stage_input.Args[0], output_queue=output_queue, **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class DeDumpInstruction(Instruction):
    """Produces items from an input queue via a registered dedumper.

    Note: *stage_input* is accepted for interface symmetry but not consumed.
    """

    def __init__(self, config):
        super().__init__(config)
        # Equivalent to config[TYPE]; normalized to self.config like siblings.
        self.dedumper = dedumper_dict[self.config[TYPE]]

    def run(self, stage_input, input_queue=None):
        """Return the dedump iterator reading from *input_queue*."""
        return self.dedumper(input_queue=input_queue, **self.config.get(ARGS, {}))
|
||||
|
||||
|
||||
class ComposeInstruction(Instruction):
    """Placeholder for a future composition step; no behavior implemented yet."""
|
||||
|
||||
|
||||
class AnnotateDataInstruction(Instruction):
    """Placeholder for a future data-annotation step; no behavior implemented yet."""
|
||||
80
nimbus/scheduler/sches.py
Normal file
80
nimbus/scheduler/sches.py
Normal file
@@ -0,0 +1,80 @@
|
||||
from nimbus.scheduler.inner_pipe import make_pipe
|
||||
from nimbus.scheduler.stages import (
|
||||
DedumpStage,
|
||||
DumpStage,
|
||||
LoadStage,
|
||||
PlanStage,
|
||||
PlanWithRenderStage,
|
||||
RenderStage,
|
||||
StoreStage,
|
||||
)
|
||||
from nimbus.utils.types import (
|
||||
DEDUMP_STAGE,
|
||||
DUMP_STAGE,
|
||||
LOAD_STAGE,
|
||||
PLAN_STAGE,
|
||||
PLAN_WITH_RENDER_STAGE,
|
||||
RENDER_STAGE,
|
||||
STAGE_DEV,
|
||||
STAGE_NUM,
|
||||
STAGE_PIPE,
|
||||
STORE_STAGE,
|
||||
WORKER_NUM,
|
||||
)
|
||||
|
||||
|
||||
def gen_scheduler(config):
    """Build the ordered stage list declared in *config*.

    Stage order is fixed: load -> plan_with_render -> plan -> dump -> dedump
    -> render -> store; a stage is included only when its key is present.
    """
    stage_specs = (
        (LOAD_STAGE, LoadStage),
        (PLAN_WITH_RENDER_STAGE, PlanWithRenderStage),
        (PLAN_STAGE, PlanStage),
        (DUMP_STAGE, DumpStage),
        (DEDUMP_STAGE, DedumpStage),
        (RENDER_STAGE, RenderStage),
        (STORE_STAGE, StoreStage),
    )
    return [stage_cls(config[key]) for key, stage_cls in stage_specs if key in config]
|
||||
|
||||
|
||||
def gen_pipe(config, stage_list, exp_name, master_seed=None):
    """Partition *stage_list* into pipeline segments and build a worker group
    for each, driven by the stage_pipe section of *config*.

    Returns:
        A list of PipeWorkerGroup objects, one per segment.
    """
    if STAGE_PIPE in config:
        pipe_stages_num = config[STAGE_PIPE][STAGE_NUM]   # stages per segment
        pipe_stages_dev = config[STAGE_PIPE][STAGE_DEV]   # "cpu"/"gpu" per segment
        pipe_worker_num = config[STAGE_PIPE][WORKER_NUM]  # worker count per segment
        inner_pipes = []
        pipe_num = 0
        total_processes = 0
        # Total worker count across all segments; used for fractional GPU split.
        for worker_num in config[STAGE_PIPE][WORKER_NUM]:
            total_processes += worker_num
        for num, dev, worker_num in zip(pipe_stages_num, pipe_stages_dev, pipe_worker_num):
            # Consume the next `num` stages for this segment.
            stages = stage_list[:num]
            print("===========================")
            print(f"inner stage num: {num}, device type: {dev}")
            print(f"stages: {stages}")
            print("===========================")
            stage_list = stage_list[num:]
            pipe_name = "pipe"
            for stage in stages:
                pipe_name += f"_{stage.__class__.__name__}"
            pipe_workers = make_pipe(
                pipe_name,
                exp_name,
                pipe_num,
                stages,
                dev,
                worker_num,
                total_processes,
                config[STAGE_PIPE],
                master_seed=master_seed,
            )
            inner_pipes.append(pipe_workers)
            pipe_num += 1
        return inner_pipes
    else:
        # NOTE(review): make_pipe is a function with no InnerPipe attribute,
        # so this branch raises AttributeError if reached — confirm whether
        # the non-pipelined path is ever exercised.
        return [make_pipe.InnerPipe(stage_list)]
|
||||
137
nimbus/scheduler/stages.py
Normal file
137
nimbus/scheduler/stages.py
Normal file
@@ -0,0 +1,137 @@
|
||||
from abc import abstractmethod
|
||||
|
||||
from nimbus.scheduler.instructions import (
|
||||
DeDumpInstruction,
|
||||
DumpInstruction,
|
||||
Instruction,
|
||||
LoadSceneInstruction,
|
||||
PlanPathInstruction,
|
||||
PlanWithRenderInstruction,
|
||||
RandomizeLayoutInstruction,
|
||||
RenderInstruction,
|
||||
StoreInstruction,
|
||||
)
|
||||
from nimbus.utils.types import (
|
||||
DEDUMPER,
|
||||
DUMPER,
|
||||
LAYOUT_RANDOM_GENERATOR,
|
||||
PLAN_WITH_RENDER,
|
||||
RENDERER,
|
||||
SCENE_LOADER,
|
||||
SEQ_PLANNER,
|
||||
WRITER,
|
||||
StageInput,
|
||||
)
|
||||
|
||||
|
||||
class Stage:
    """Base class for a pipeline stage: an ordered list of instructions that
    each wrap the previous one's iterator."""

    def __init__(self, config):
        self.config = config
        self.instructions: list[Instruction] = []
        self.output_queue = None

    @abstractmethod
    def run(self, stage_input):
        """Execute this stage's instructions on *stage_input*."""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class LoadStage(Stage):
    """Loads scenes and optionally randomizes their layouts."""

    def __init__(self, config):
        super().__init__(config)
        if SCENE_LOADER in config:
            self.instructions.append(LoadSceneInstruction(config[SCENE_LOADER]))
        if LAYOUT_RANDOM_GENERATOR in config:
            self.instructions.append(RandomizeLayoutInstruction(config[LAYOUT_RANDOM_GENERATOR]))

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        for instruction in self.instructions:
            stage_input = StageInput((instruction.run(stage_input),), {})
        return stage_input
|
||||
|
||||
|
||||
class PlanStage(Stage):
    """Plans action sequences for incoming scenes."""

    def __init__(self, config):
        super().__init__(config)
        if SEQ_PLANNER in config:
            self.instructions.append(PlanPathInstruction(config[SEQ_PLANNER]))

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        for instruction in self.instructions:
            stage_input = StageInput((instruction.run(stage_input),), {})
        return stage_input
|
||||
|
||||
|
||||
class RenderStage(Stage):
    """Renders observations for planned scene sequences."""

    def __init__(self, config):
        super().__init__(config)
        self.instructions.append(RenderInstruction(config[RENDERER]))

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        for instruction in self.instructions:
            stage_input = StageInput((instruction.run(stage_input),), {})
        return stage_input
|
||||
|
||||
|
||||
class PlanWithRenderStage(Stage):
    """Runs combined planning and rendering in one stage."""

    def __init__(self, config):
        super().__init__(config)
        self.instructions.append(PlanWithRenderInstruction(config[PLAN_WITH_RENDER]))

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        for instruction in self.instructions:
            stage_input = StageInput((instruction.run(stage_input),), {})
        return stage_input
|
||||
|
||||
|
||||
class StoreStage(Stage):
    """Persists observations through the configured writer."""

    def __init__(self, config):
        super().__init__(config)
        if WRITER in config:
            self.instructions.append(StoreInstruction(config[WRITER]))

    def run(self, stage_input: StageInput):
        """Chain each instruction, wrapping its iterator for the next one."""
        for instruction in self.instructions:
            stage_input = StageInput((instruction.run(stage_input),), {})
        return stage_input
|
||||
|
||||
|
||||
class DumpStage(Stage):
    """Pushes upstream items into an inter-segment output queue."""

    def __init__(self, config):
        super().__init__(config)
        self.instructions.append(DumpInstruction(config[DUMPER]))

    def run(self, stage_input: StageInput, output_queue=None):
        """Chain each instruction, forwarding *output_queue* to the dumpers."""
        for instruction in self.instructions:
            stage_input = StageInput((instruction.run(stage_input, output_queue),), {})
        return stage_input
|
||||
|
||||
|
||||
class DedumpStage(Stage):
    """Reads items from an inter-segment queue and re-enters the pipeline,
    optionally continuing with load/randomize/plan instructions."""

    def __init__(self, config):
        super().__init__(config)
        if DEDUMPER in config:
            self.instructions.append(DeDumpInstruction(config[DEDUMPER]))
        if SCENE_LOADER in config:
            self.instructions.append(LoadSceneInstruction(config[SCENE_LOADER]))
        if LAYOUT_RANDOM_GENERATOR in config:
            self.instructions.append(RandomizeLayoutInstruction(config[LAYOUT_RANDOM_GENERATOR]))
        if SEQ_PLANNER in config:
            self.instructions.append(PlanPathInstruction(config[SEQ_PLANNER]))

    def run(self, stage_input: StageInput, input_queue=None):
        """Chain instructions; only DeDumpInstruction receives the queue."""
        if input_queue is not None:
            self.input_queue = input_queue

        for instruction in self.instructions:
            if isinstance(instruction, DeDumpInstruction):
                run_args = (stage_input, input_queue)
            else:
                run_args = (stage_input,)
            stage_input = StageInput((instruction.run(*run_args),), {})
        return stage_input
|
||||
20
nimbus/utils/config.py
Normal file
20
nimbus/utils/config.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from omegaconf import OmegaConf
|
||||
|
||||
|
||||
def load_config(*yaml_files, cli_args=None):
    """Merge YAML configs (left to right) with CLI overrides and resolve
    interpolations. Later sources win over earlier ones."""
    merged = OmegaConf.merge(
        *(OmegaConf.load(path) for path in yaml_files),
        OmegaConf.from_cli(cli_args if cli_args is not None else []),
    )
    OmegaConf.resolve(merged)
    return merged
|
||||
|
||||
|
||||
def config_to_primitive(config, resolve=True):
    """Convert an OmegaConf config to plain Python containers (dict/list)."""
    return OmegaConf.to_container(config, resolve=resolve)
|
||||
|
||||
|
||||
def save_config(config, path):
    """Write *config* to *path* as YAML (UTF-8)."""
    with open(path, "w", encoding="utf-8") as fp:
        OmegaConf.save(config=config, f=fp)
|
||||
138
nimbus/utils/config_processor.py
Normal file
138
nimbus/utils/config_processor.py
Normal file
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
Config Processor: Responsible for identifying, converting, and loading configuration files.
|
||||
"""
|
||||
|
||||
from omegaconf import DictConfig, OmegaConf
|
||||
|
||||
from nimbus.utils.config import load_config
|
||||
|
||||
|
||||
class ConfigProcessor:
    """Config processor class.

    Identifies, validates, and loads configuration files. CLI overrides
    (``key=value`` or ``--key=value``) are validated against the base config
    before merging, so typos in override paths fail fast.
    """

    def __init__(self):
        pass

    @staticmethod
    def _strip_cli_prefix(cli_args):
        """Return *cli_args* with any leading ``--`` removed from each entry.

        Extracted helper: the original duplicated this loop in both
        _validate_cli_args and process_config.
        """
        return [arg[2:] if arg.startswith("--") else arg for arg in (cli_args or [])]

    def _check_config_path_exists(self, config, path):
        """
        Check if a configuration path exists in the config object.

        Args:
            config: OmegaConf config object
            path: String path like 'stage_pipe.worker_num' or 'load_stage.scene_loader.args.random_num'

        Returns:
            bool: Whether the path exists in the config
        """
        try:
            current = config
            for key in path.split("."):
                # Only DictConfig nodes can be descended into.
                if not isinstance(current, DictConfig) or key not in current:
                    return False
                current = current[key]
            return True
        except Exception:
            # Any resolution error (bad interpolation, etc.) counts as missing.
            return False

    def _validate_cli_args(self, config, cli_args):
        """
        Validate that all CLI arguments correspond to existing paths in the config.

        Args:
            config: OmegaConf config object
            cli_args: List of command line arguments

        Raises:
            ValueError: If any CLI argument path doesn't exist in the config
        """
        if not cli_args:
            return

        cleaned_cli_args = self._strip_cli_prefix(cli_args)

        try:
            cli_conf = OmegaConf.from_cli(cleaned_cli_args)
        except Exception as e:
            # `from e` added: preserve the original cause for debugging.
            raise ValueError(
                f"Invalid CLI argument format: {e}. Please use format like: stage_pipe.worker_num='[2,4]'"
            ) from e

        def check_nested_paths(conf, prefix=""):
            """Recursively check all paths in the CLI config."""
            for key, value in conf.items():
                current_path = f"{prefix}.{key}" if prefix else key
                # Intermediate and leaf paths share the same existence check.
                if not self._check_config_path_exists(config, current_path):
                    raise ValueError(f"Configuration path '{current_path}' does not exist in the config file")
                if isinstance(value, DictConfig):
                    check_nested_paths(value, current_path)

        try:
            check_nested_paths(cli_conf)
        except ValueError:
            raise
        except Exception as e:
            # If there's an issue parsing CLI args, provide helpful error message
            raise ValueError(
                "Invalid CLI argument format. Please use format like: --key=value or --nested.key=value"
            ) from e

    def process_config(self, config_path, cli_args=None):
        """
        Process the config file.

        Args:
            config_path: Path to the config file
            cli_args: List of command line arguments

        Returns:
            OmegaConf: Processed config object

        Raises:
            ValueError: If the config cannot be loaded or an override path is invalid.
        """
        cleaned_cli_args = self._strip_cli_prefix(cli_args)

        # Load config first without CLI args so override paths can be validated.
        try:
            base_config = load_config(config_path)
        except Exception as e:
            raise ValueError(f"Error loading config: {e}") from e

        # Validate that CLI arguments correspond to existing paths.
        if cli_args:
            self._validate_cli_args(base_config, cli_args)

        # Validation passed: load again with the overrides applied.
        return load_config(config_path, cli_args=cleaned_cli_args)

    def print_final_config(self, config):
        """
        Print the final running config.

        Args:
            config: OmegaConf config object
        """
        print("=" * 50)
        print("final config:")
        print("=" * 50)
        print(OmegaConf.to_yaml(config))
|
||||
23
nimbus/utils/flags.py
Normal file
23
nimbus/utils/flags.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import os
|
||||
|
||||
_DEBUG_KEY = "NIMBUS_DEBUG"
|
||||
_RANDOM_SEED_KEY = "NIMBUS_RANDOM_SEED"
|
||||
|
||||
|
||||
def set_debug_mode(enabled: bool) -> None:
|
||||
"""Set debug mode. Must be called before ray.init() to propagate to Ray workers."""
|
||||
os.environ[_DEBUG_KEY] = "1" if enabled else "0"
|
||||
|
||||
|
||||
def is_debug_mode() -> bool:
|
||||
return os.environ.get(_DEBUG_KEY, "0") == "1"
|
||||
|
||||
|
||||
def set_random_seed(seed: int) -> None:
|
||||
"""Set global random seed. Must be called before ray.init() to propagate to Ray workers."""
|
||||
os.environ[_RANDOM_SEED_KEY] = str(seed)
|
||||
|
||||
|
||||
def get_random_seed() -> int | None:
|
||||
val = os.environ.get(_RANDOM_SEED_KEY)
|
||||
return int(val) if val is not None else None
|
||||
48
nimbus/utils/logging.py
Normal file
48
nimbus/utils/logging.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
from nimbus.utils.config import save_config
|
||||
|
||||
|
||||
def configure_logging(exp_name, name=None, config=None):
    """Create the "de_logger" file logger for a data-engine run.

    Args:
        exp_name: Experiment name; logs go to ./output/<exp_name>. When the
            POD_NAME env var is set (e.g. on a k8s/DLC pod), it is appended as
            a per-pod subdirectory so pods don't clobber each other's logs.
        name: Optional tag inserted into the log file name.
        config: Optional config object; when given it is saved next to the log
            as de_config.yaml via save_config.

    Returns:
        logging.Logger: the "de_logger" logger with a file handler attached.

    Raises:
        RuntimeError: if the log directory cannot be created after several
            attempts (e.g. transient NFS errors).
    """
    pod_name = os.environ.get("POD_NAME", None)
    if pod_name is not None:
        exp_name = f"{exp_name}/{pod_name}"
    log_dir = os.path.join("./output", exp_name)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if name is None:
        log_name = f"de_time_profile_{timestamp}.log"
    else:
        log_name = f"de_{name}_time_profile_{timestamp}.log"

    log_file = os.path.join(log_dir, log_name)

    # Retry makedirs: on network-backed volumes the first attempt can fail
    # transiently (e.g. stale NFS file handles).
    max_retries = 3
    for attempt in range(max_retries):
        try:
            os.makedirs(log_dir, exist_ok=True)
            break
        except Exception as e:  # pylint: disable=broad-except
            # Fix: report the actual error — the previous message claimed
            # "Stale file handle" for every exception type.
            print(f"Warning: failed to create {log_dir} ({e}), attempt {attempt + 1}/{max_retries}")
            if attempt < max_retries - 1:
                time.sleep(3)
                continue
            raise RuntimeError(f"Failed to create log directory {log_dir} after {max_retries} attempts") from e

    if config is not None:
        config_log_file = os.path.join(log_dir, "de_config.yaml")
        save_config(config, config_log_file)

    logger = logging.getLogger("de_logger")
    logger.setLevel(logging.INFO)

    # NOTE(review): calling this more than once attaches an additional handler
    # each time (duplicated log lines); kept as-is to preserve multi-file
    # behavior for repeated calls — confirm whether dedup is wanted.
    fh = logging.FileHandler(log_file, mode="a")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info("Start Data Engine")

    return logger
|
||||
33
nimbus/utils/random.py
Normal file
33
nimbus/utils/random.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
# open3d is an optional dependency; degrade gracefully when it is missing.
try:
    import open3d as o3d
except ImportError:
    o3d = None


def set_all_seeds(seed):
    """Seed every random number generator in use so runs are reproducible.

    Covers Python's hash seed (env var), numpy, random, open3d (if present),
    and torch (CPU and all CUDA devices), and forces deterministic CuDNN.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"set seed {seed} for all libraries")
    seed = int(seed)
    np.random.seed(seed)
    random.seed(seed)

    # open3d only exposes a seeding hook in some versions; probe defensively.
    if o3d is not None and hasattr(o3d, "utility") and hasattr(o3d.utility, "random"):
        o3d.utility.random.seed(seed)

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

        # CuDNN must be pinned for deterministic results on GPU.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
|
||||
65
nimbus/utils/types.py
Normal file
65
nimbus/utils/types.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
# Config-dictionary key constants for the data-engine pipeline. Stages and
# instructions are looked up in the config by these string keys.
NAME = "name"

# stage names (top-level config sections)
LOAD_STAGE = "load_stage"
PLAN_STAGE = "plan_stage"
RENDER_STAGE = "render_stage"
PLAN_WITH_RENDER_STAGE = "plan_with_render_stage"
STORE_STAGE = "store_stage"
STAGE_PIPE = "stage_pipe"
DUMP_STAGE = "dump_stage"
DEDUMP_STAGE = "dedump_stage"

# instruction names, grouped by the stage they belong to
# LOAD_STAGE
SCENE_LOADER = "scene_loader"
LAYOUT_RANDOM_GENERATOR = "layout_random_generator"
INDEX_GENERATOR = "index_generator"
DEDUMPER = "dedumper"

# PLAN_STAGE
SEQ_PLANNER = "seq_planner"
PLANNER = "planner"
SIMULATOR = "simulator"

# RENDER_STAGE
RENDERER = "renderer"

# PLAN_WITH_RENDER_STAGE
PLAN_WITH_RENDER = "plan_with_render"

# PIPE_STAGE (pipeline/worker scheduling parameters)
STAGE_NUM = "stage_num"
STAGE_DEV = "stage_dev"
WORKER_NUM = "worker_num"
WORKER_SCHEDULE = "worker_schedule"
SAFE_THRESHOLD = "safe_threshold"
STATUS_TIMEOUTS = "status_timeouts"
MONITOR_CHECK_INTERVAL = "monitor_check_interval"

# STORE_STAGE
WRITER = "writer"
DUMPER = "dumper"

# generic I/O path keys
OUTPUT_PATH = "output_path"
INPUT_PATH = "input_path"

# generic instruction fields: implementation type and its constructor args
TYPE = "type"
ARGS = "args"
||||
|
||||
|
||||
@dataclass
class StageInput:
    """
    A data class that encapsulates the input for a stage in the processing pipeline.

    Attributes:
        Args (Optional[Tuple]): Positional arguments passed to the stage's processing function.
        Kwargs (Optional[Dict]): Keyword arguments passed to the stage's processing function.
    """

    # NOTE: field names are deliberately capitalized (non-PEP8) and are
    # accessed as .Args/.Kwargs by pipeline code (e.g. consume_stage);
    # renaming them would break callers.
    Args: Optional[Tuple] = None
    Kwargs: Optional[Dict] = None
|
||||
182
nimbus/utils/utils.py
Normal file
182
nimbus/utils/utils.py
Normal file
@@ -0,0 +1,182 @@
|
||||
import functools
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from typing import Tuple, Type, Union
|
||||
|
||||
from nimbus.components.data.observation import Observations
|
||||
from nimbus.components.data.scene import Scene
|
||||
from nimbus.components.data.sequence import Sequence
|
||||
|
||||
|
||||
def init_env():
    """Extend sys.path with the project directories needed at runtime."""
    for path in ("./", "./data_engine", "workflows/simbox"):
        sys.path.append(path)
|
||||
|
||||
|
||||
def unpack_iter_data(data: tuple):
    """Split a heterogeneous tuple into its (scene, seq, obs) components.

    Each element is slotted by type; missing components come back as None.
    If the same type appears more than once, the last occurrence wins.
    """
    assert len(data) <= 3, "not support yet"
    scene, seq, obs = None, None, None
    for element in data:
        if isinstance(element, Scene):
            scene = element
        elif isinstance(element, Sequence):
            seq = element
        elif isinstance(element, Observations):
            obs = element
    return scene, seq, obs
|
||||
|
||||
|
||||
def consume_stage(stage_input):
    """Drain all iterators inside a StageInput and release its values.

    Exhausts any generators held in Args/Kwargs via consume_iterators, then
    explicitly invokes __del__ on values that define it — presumably to force
    prompt resource release rather than waiting for GC (TODO confirm).

    Fix: the Args branch previously iterated stage_input.Args without a None
    guard, so a StageInput with the default Args=None raised TypeError; the
    Kwargs branch already had this guard.
    """
    if hasattr(stage_input, "Args") and stage_input.Args is not None:
        consume_iterators(stage_input.Args)
        for value in stage_input.Args:
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801
    if hasattr(stage_input, "Kwargs") and stage_input.Kwargs is not None:
        for value in stage_input.Kwargs.values():
            consume_iterators(value)
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801
|
||||
|
||||
|
||||
def pipe_consume_stage(stage_input):
    """Drain iterators in a StageInput without invoking __del__ on values.

    Unlike consume_stage, this skips the explicit __del__ calls so Isaac Sim
    does not close its pipe workers prematurely.
    """
    if hasattr(stage_input, "Args"):
        consume_iterators(stage_input.Args)
    kwargs = getattr(stage_input, "Kwargs", None)
    if kwargs is not None:
        for payload in kwargs.values():
            consume_iterators(payload)
|
||||
|
||||
|
||||
def consume_iterators(obj):
    """Recursively exhaust every iterator reachable from obj.

    Strings/bytes are returned untouched; dicts, lists and tuples are rebuilt
    with their values processed recursively; any other iterable is fully
    consumed in place. Non-container objects are returned as-is.
    """
    if isinstance(obj, (str, bytes)):
        return obj
    if isinstance(obj, dict):
        return {k: consume_iterators(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [consume_iterators(elem) for elem in obj]
    if isinstance(obj, tuple):
        return tuple(consume_iterators(elem) for elem in obj)
    if hasattr(obj, "__iter__"):
        for elem in obj:
            consume_iterators(elem)
    return obj
|
||||
|
||||
|
||||
def scene_names_postprocess(scene_names: list) -> list:
    """
    Distributes a list of scene names (folders) among multiple workers in a distributed environment.

    This function is designed to work with Deep Learning Container (DLC) environments, where worker
    information is extracted from environment variables. It assigns a subset of the input scene names
    to the current worker based on its rank and the total number of workers, using a round-robin strategy.
    If not running in a DLC environment, all scene names are assigned to a single worker.

    Args:
        scene_names (list): List of scene names (typically folder names) to be distributed.

    Returns:
        list: The subset of scene names assigned to the current worker. The input
            list is never mutated.

    Raises:
        PermissionError: If there is a permission issue accessing the input directory.
        RuntimeError: For any other errors encountered during processing.

    Notes:
        - The function expects certain environment variables (e.g., POD_NAME, WORLD_SIZE) to be set
          in DLC environments.
        - With multiple workers, a sorted copy of the input is distributed to ensure
          consistent assignment across workers.
    """

    def _get_dlc_worker_info():
        """Extract worker rank and world size from DLC environment variables."""
        pod_name = os.environ.get("POD_NAME")

        if pod_name:
            # Match "...-worker-N" / "...-master-N" pod-name patterns
            match = re.search(r"dlc.*?-(worker|master)-(\d+)$", pod_name)
            if match:
                node_type, node_id = match.groups()
                world_size = int(os.environ.get("WORLD_SIZE", "1"))

                if node_type == "worker":
                    rank = int(node_id)
                else:  # master node takes the last rank so worker ranks stay 0..N-1
                    rank = world_size - 1

                return rank, world_size

        # Default for non-DLC environment
        return 0, 1

    def _distribute_folders(all_folders, rank, world_size):
        """Distribute folders among workers using round-robin strategy."""
        if not all_folders:
            return []

        # Fix: sort a copy instead of sorting in place, so the caller's list is
        # not mutated as a side effect. Only sort when there are multiple
        # workers, to keep assignment consistent across them.
        if world_size > 1:
            all_folders = sorted(all_folders)

        # Distribute using slicing: worker i gets folders at indices i, i+world_size, ...
        return all_folders[rank::world_size]

    try:
        # Get all subfolders
        all_subfolders = scene_names
        if not all_subfolders:
            print(f"Warning: No scene found in {scene_names}")
            return []

        # Get worker identity and distribute folders
        rank, world_size = _get_dlc_worker_info()
        assigned_folders = _distribute_folders(all_subfolders, rank, world_size)

        print(
            f"DLC Worker {rank}/{world_size}: Assigned {len(assigned_folders)} out of "
            f"{len(all_subfolders)} total folders"
        )

        return assigned_folders

    except PermissionError as e:
        # Fix: chain the original exception so the traceback is preserved.
        raise PermissionError(f"No permission to access directory: {scene_names}") from e
    except Exception as e:
        raise RuntimeError(f"Error reading input directory {scene_names}: {e}") from e
|
||||
|
||||
|
||||
def retry_on_exception(
    max_retries: int = 3, retry_exceptions: Union[bool, Tuple[Type[Exception], ...]] = True, delay: float = 1.0
):
    """Decorator factory that retries the wrapped callable on failure.

    Args:
        max_retries: Number of retries after the first attempt (total calls =
            max_retries + 1).
        retry_exceptions: True to retry on any Exception, or a tuple/list of
            exception types to retry on; anything else disables retrying.
        delay: Seconds to sleep between attempts.

    Raises:
        Whatever the wrapped callable raises, once retries are exhausted or the
        exception is not retryable.

    Fixes:
        - The wrapper previously declared an explicit `self` parameter, which
          broke decorated zero-argument functions; `*args` captures `self`
          for methods just as well.
        - A list of exception types raised TypeError inside isinstance (it
          requires a type or tuple); lists are now normalized to a tuple.
        - Removed the unreachable trailing `raise last_exception` (every loop
          path either returns or raises).
    """
    # isinstance() requires a type or a tuple of types; normalize lists once.
    if isinstance(retry_exceptions, list):
        retry_exceptions = tuple(retry_exceptions)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    if attempt > 0:
                        print(f"Retry attempt {attempt}/{max_retries} for {func.__name__}")
                    return func(*args, **kwargs)
                except Exception as e:  # pylint: disable=broad-except
                    should_retry = retry_exceptions is True or (
                        isinstance(retry_exceptions, tuple) and isinstance(e, retry_exceptions)
                    )
                    if should_retry and attempt < max_retries:
                        print(f"Error in {func.__name__}: {e}. Retrying in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        raise

        return wrapper

    return decorator
|
||||
Reference in New Issue
Block a user