init commit

This commit is contained in:
zyhe
2026-03-16 11:44:10 +00:00
commit 94384a93c9
552 changed files with 363038 additions and 0 deletions

20
nimbus/utils/config.py Normal file
View File

@@ -0,0 +1,20 @@
from omegaconf import OmegaConf
def load_config(*yaml_files, cli_args=None):
    """Load and merge YAML config files plus optional CLI dotlist overrides.

    Args:
        *yaml_files: Paths of YAML files, merged left to right (later wins).
        cli_args: Optional list of ``key=value`` dotlist overrides; they take
            precedence over every YAML file.

    Returns:
        The merged OmegaConf config with all interpolations resolved.
    """
    sources = [OmegaConf.load(path) for path in yaml_files]
    sources.append(OmegaConf.from_cli(cli_args if cli_args is not None else []))
    merged = OmegaConf.merge(*sources)
    OmegaConf.resolve(merged)
    return merged
def config_to_primitive(config, resolve=True):
    """Convert an OmegaConf config into plain Python dicts/lists.

    Args:
        config: OmegaConf config object.
        resolve: When True, resolve interpolations before conversion.
    """
    return OmegaConf.to_container(config, resolve=resolve)
def save_config(config, path):
    """Serialize *config* as YAML to *path* using UTF-8 encoding."""
    with open(path, "w", encoding="utf-8") as handle:
        OmegaConf.save(config=config, f=handle)

View File

@@ -0,0 +1,138 @@
"""
Config Processor: Responsible for identifying, converting, and loading configuration files.
"""
from omegaconf import DictConfig, OmegaConf
from nimbus.utils.config import load_config
class ConfigProcessor:
    """Identifies, validates, and loads configuration files.

    CLI overrides are validated against the loaded config so that a typo in an
    override path fails fast instead of silently introducing a new key.
    """

    @staticmethod
    def _clean_cli_args(cli_args):
        """Strip an optional leading ``--`` from each CLI argument.

        Shared by validation and loading so both see identical override paths.
        """
        if not cli_args:
            return []
        return [arg[2:] if arg.startswith("--") else arg for arg in cli_args]

    def _check_config_path_exists(self, config, path):
        """
        Check if a configuration path exists in the config object

        Args:
            config: OmegaConf config object
            path: String path like 'stage_pipe.worker_num' or 'load_stage.scene_loader.args.random_num'

        Returns:
            bool: Whether the path exists in the config
        """
        try:
            current = config
            for key in path.split("."):
                # Only dict-like nodes can be descended into by key
                if not isinstance(current, DictConfig) or key not in current:
                    return False
                current = current[key]
            return True
        except Exception:
            # Any resolution error means the path is unusable
            return False

    def _validate_cli_args(self, config, cli_args):
        """
        Validate that all CLI arguments correspond to existing paths in the config

        Args:
            config: OmegaConf config object
            cli_args: List of command line arguments

        Raises:
            ValueError: If any CLI argument path doesn't exist in the config
        """
        if not cli_args:
            return
        cleaned_cli_args = self._clean_cli_args(cli_args)
        # Parse CLI args to get the override paths
        try:
            cli_conf = OmegaConf.from_cli(cleaned_cli_args)
        except Exception as e:
            raise ValueError(f"Invalid CLI argument format: {e}. Please use format like: stage_pipe.worker_num='[2,4]'")

        def check_nested_paths(conf, prefix=""):
            """Recursively check all paths in the CLI config"""
            for key, value in conf.items():
                current_path = f"{prefix}.{key}" if prefix else key
                # Both intermediate and leaf paths must already exist
                if not self._check_config_path_exists(config, current_path):
                    raise ValueError(f"Configuration path '{current_path}' does not exist in the config file")
                if isinstance(value, DictConfig):
                    check_nested_paths(value, current_path)

        try:
            check_nested_paths(cli_conf)
        except ValueError:
            raise
        except Exception:
            # If there's an issue parsing CLI args, provide helpful error message
            raise ValueError("Invalid CLI argument format. Please use format like: --key=value or --nested.key=value")

    def process_config(self, config_path, cli_args=None):
        """
        Process the config file

        Args:
            config_path: Path to the config file
            cli_args: List of command line arguments

        Returns:
            OmegaConf: Processed config object

        Raises:
            ValueError: If the config cannot be loaded or an override path is invalid.
        """
        # Load config first without CLI args to validate paths
        try:
            base_config = load_config(config_path)
        except Exception as e:
            raise ValueError(f"Error loading config: {e}")
        # Validate that CLI arguments correspond to existing paths
        if cli_args:
            self._validate_cli_args(base_config, cli_args)
        # Now load config with CLI args (validation passed)
        return load_config(config_path, cli_args=self._clean_cli_args(cli_args))

    def print_final_config(self, config):
        """
        Print the final running config

        Args:
            config: OmegaConf config object
        """
        print("=" * 50)
        print("final config:")
        print("=" * 50)
        print(OmegaConf.to_yaml(config))

23
nimbus/utils/flags.py Normal file
View File

@@ -0,0 +1,23 @@
import os
_DEBUG_KEY = "NIMBUS_DEBUG"
_RANDOM_SEED_KEY = "NIMBUS_RANDOM_SEED"
def set_debug_mode(enabled: bool) -> None:
"""Set debug mode. Must be called before ray.init() to propagate to Ray workers."""
os.environ[_DEBUG_KEY] = "1" if enabled else "0"
def is_debug_mode() -> bool:
return os.environ.get(_DEBUG_KEY, "0") == "1"
def set_random_seed(seed: int) -> None:
"""Set global random seed. Must be called before ray.init() to propagate to Ray workers."""
os.environ[_RANDOM_SEED_KEY] = str(seed)
def get_random_seed() -> int | None:
val = os.environ.get(_RANDOM_SEED_KEY)
return int(val) if val is not None else None

48
nimbus/utils/logging.py Normal file
View File

@@ -0,0 +1,48 @@
import logging
import os
import time
from datetime import datetime
from nimbus.utils.config import save_config
def configure_logging(exp_name, name=None, config=None):
    """Create the experiment log directory and return the shared "de_logger".

    Args:
        exp_name: Experiment name; becomes a sub-directory of ./output. When
            the POD_NAME environment variable is set (e.g. on a DLC cluster),
            it is appended so each pod logs to its own directory.
        name: Optional tag inserted into the log file name.
        config: Optional config object; when given it is saved as
            de_config.yaml next to the log file.

    Returns:
        logging.Logger: The "de_logger" logger with exactly one file handler.

    Raises:
        RuntimeError: If the log directory cannot be created after several
            attempts (e.g. stale NFS file handles).
    """
    pod_name = os.environ.get("POD_NAME", None)
    if pod_name is not None:
        exp_name = f"{exp_name}/{pod_name}"
    log_dir = os.path.join("./output", exp_name)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if name is None:
        log_name = f"de_time_profile_{timestamp}.log"
    else:
        log_name = f"de_{name}_time_profile_{timestamp}.log"
    log_file = os.path.join(log_dir, log_name)
    max_retries = 3
    for attempt in range(max_retries):
        try:
            os.makedirs(log_dir, exist_ok=True)
            break
        except Exception as e:
            # NFS mounts occasionally raise stale-file-handle errors here;
            # retry a few times before giving up. Include the actual error
            # instead of assuming it was a stale handle.
            print(f"Warning: failed to create {log_dir} ({e}), attempt {attempt + 1}/{max_retries}")
            if attempt < max_retries - 1:
                time.sleep(3)
                continue
            raise RuntimeError(f"Failed to create log directory {log_dir} after {max_retries} attempts") from e
    if config is not None:
        config_log_file = os.path.join(log_dir, "de_config.yaml")
        save_config(config, config_log_file)
    logger = logging.getLogger("de_logger")
    logger.setLevel(logging.INFO)
    # "de_logger" is a process-wide singleton: drop handlers left over from
    # earlier calls, otherwise every reconfiguration duplicates log lines.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    fh = logging.FileHandler(log_file, mode="a")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info("Start Data Engine")
    return logger

33
nimbus/utils/random.py Normal file
View File

@@ -0,0 +1,33 @@
import os
import random

import numpy as np
import torch

# open3d is an optional dependency; degrade gracefully when it is absent
try:
    import open3d as o3d
except ImportError:
    o3d = None


def set_all_seeds(seed):
    """Seed every random number generator used across the stack.

    Covers Python's hash seed, the ``random`` module, numpy, open3d (when
    installed) and torch (CPU and, if present, CUDA) so runs are reproducible.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"set seed {seed} for all libraries")
    seed = int(seed)
    np.random.seed(seed)
    random.seed(seed)
    if o3d and hasattr(o3d, "utility") and hasattr(o3d.utility, "random"):
        o3d.utility.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # CuDNN must run deterministically (and without autotuning) to give
        # bit-identical results across runs.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

65
nimbus/utils/types.py Normal file
View File

@@ -0,0 +1,65 @@
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
# String keys used in the pipeline configuration files, centralised here so
# stages and instructions reference config entries by constant, not literal.
NAME = "name"
# stage names
LOAD_STAGE = "load_stage"
PLAN_STAGE = "plan_stage"
RENDER_STAGE = "render_stage"
PLAN_WITH_RENDER_STAGE = "plan_with_render_stage"
STORE_STAGE = "store_stage"
STAGE_PIPE = "stage_pipe"
DUMP_STAGE = "dump_stage"
DEDUMP_STAGE = "dedump_stage"
# instruction names, grouped by the stage they belong to
# LOAD_STAGE instructions
SCENE_LOADER = "scene_loader"
LAYOUT_RANDOM_GENERATOR = "layout_random_generator"
INDEX_GENERATOR = "index_generator"
DEDUMPER = "dedumper"
# PLAN_STAGE instructions
SEQ_PLANNER = "seq_planner"
PLANNER = "planner"
SIMULATOR = "simulator"
# RENDER_STAGE instructions
RENDERER = "renderer"
# PLAN_WITH_RENDER_STAGE instructions
PLAN_WITH_RENDER = "plan_with_render"
# PIPE_STAGE settings
STAGE_NUM = "stage_num"
STAGE_DEV = "stage_dev"
WORKER_NUM = "worker_num"
WORKER_SCHEDULE = "worker_schedule"
SAFE_THRESHOLD = "safe_threshold"
STATUS_TIMEOUTS = "status_timeouts"
MONITOR_CHECK_INTERVAL = "monitor_check_interval"
# STORE_STAGE settings
WRITER = "writer"
DUMPER = "dumper"
# generic config keys shared across stages
OUTPUT_PATH = "output_path"
INPUT_PATH = "input_path"
TYPE = "type"
ARGS = "args"
@dataclass
class StageInput:
    """Bundle of arguments handed to a stage's processing function.

    The capitalised attribute names (``Args`` / ``Kwargs``) are part of the
    public interface used by callers and are therefore kept as-is.
    """

    # positional arguments forwarded to the stage's processing function
    Args: Optional[Tuple] = None
    # keyword arguments forwarded to the stage's processing function
    Kwargs: Optional[Dict] = None

182
nimbus/utils/utils.py Normal file
View File

@@ -0,0 +1,182 @@
import functools
import os
import re
import sys
import time
from typing import Tuple, Type, Union
from nimbus.components.data.observation import Observations
from nimbus.components.data.scene import Scene
from nimbus.components.data.sequence import Sequence
def init_env():
    """Extend sys.path so data-engine and simbox workflow modules import cleanly."""
    for extra in ("./", "./data_engine", "workflows/simbox"):
        sys.path.append(extra)
def unpack_iter_data(data: tuple):
    """Split a stage data tuple into its (scene, sequence, observations) parts.

    Each element of *data* is slotted by type; any slot whose type is absent
    comes back as None. At most three elements are supported.
    """
    assert len(data) <= 3, "not support yet"
    scene = seq = obs = None
    for entry in data:
        if isinstance(entry, Scene):
            scene = entry
        elif isinstance(entry, Sequence):
            seq = entry
        elif isinstance(entry, Observations):
            obs = entry
    return scene, seq, obs
def consume_stage(stage_input):
    """Exhaust all iterators held by a StageInput and release its values.

    Drains any iterators reachable from ``Args``/``Kwargs`` (via
    consume_iterators) and then explicitly finalizes each value that defines
    ``__del__``.

    Fix: ``Args``/``Kwargs`` default to None on StageInput, and the original
    code iterated ``Args`` unconditionally, raising TypeError for a default
    StageInput(); both attributes are now guarded against None.
    """
    args = getattr(stage_input, "Args", None)
    if args is not None:
        consume_iterators(args)
        for value in args:
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801
    kwargs = getattr(stage_input, "Kwargs", None)
    if kwargs is not None:
        for value in kwargs.values():
            consume_iterators(value)
            if hasattr(value, "__del__"):
                value.__del__()  # pylint: disable=C2801
# prevent isaac sim close pipe worker in advance
def pipe_consume_stage(stage_input):
    """Drain iterators held by a StageInput without finalizing the values.

    Unlike consume_stage, this never calls ``__del__`` on the values, so the
    Isaac Sim pipe worker is not torn down prematurely.
    """
    if hasattr(stage_input, "Args"):
        consume_iterators(stage_input.Args)
    keyword_values = getattr(stage_input, "Kwargs", None)
    if keyword_values is not None:
        for entry in keyword_values.values():
            consume_iterators(entry)
def consume_iterators(obj):
    """Recursively exhaust every iterator reachable from *obj*.

    Containers (dict/list/tuple) are rebuilt with their processed contents;
    strings and bytes pass through untouched; any other iterable (e.g. a
    generator) is fully drained for its side effects and returned as-is.
    """
    if isinstance(obj, (str, bytes)):
        # strings/bytes are iterable but must not be drained char by char
        return obj
    if isinstance(obj, dict):
        return {name: consume_iterators(entry) for name, entry in obj.items()}
    if isinstance(obj, list):
        return [consume_iterators(entry) for entry in obj]
    if isinstance(obj, tuple):
        return tuple(consume_iterators(entry) for entry in obj)
    if hasattr(obj, "__iter__"):
        # generic iterable: walk it so lazy producers actually run
        for entry in obj:
            consume_iterators(entry)
    return obj
def scene_names_postprocess(scene_names: list) -> list:
    """
    Distributes a list of scene names (folders) among multiple workers in a distributed environment.

    This function is designed to work with Deep Learning Container (DLC) environments, where worker
    information is extracted from environment variables. It assigns a subset of the input scene names
    to the current worker based on its rank and the total number of workers, using a round-robin strategy.
    If not running in a DLC environment, all scene names are assigned to a single worker.

    Args:
        scene_names (list): List of scene names (typically folder names) to be distributed.

    Returns:
        list: The subset of scene names assigned to the current worker.

    Raises:
        PermissionError: If there is a permission issue during processing.
        RuntimeError: For any other errors encountered during processing.

    Notes:
        - The function expects certain environment variables (e.g., POD_NAME, WORLD_SIZE) to be set
          in DLC environments.
        - With multiple workers the list is sorted (on a copy — the caller's
          list is no longer mutated) so all workers derive the same assignment.
    """

    def _get_dlc_worker_info():
        """Extract (rank, world_size) from DLC environment variables."""
        pod_name = os.environ.get("POD_NAME")
        if pod_name:
            # Pod names look like "dlc...-worker-3" or "dlc...-master-0"
            match = re.search(r"dlc.*?-(worker|master)-(\d+)$", pod_name)
            if match:
                node_type, node_id = match.groups()
                world_size = int(os.environ.get("WORLD_SIZE", "1"))
                if node_type == "worker":
                    rank = int(node_id)
                else:  # the master node takes the last rank
                    rank = world_size - 1
                return rank, world_size
        # Default for non-DLC environment
        return 0, 1

    def _distribute_folders(all_folders, rank, world_size):
        """Round-robin split: worker i keeps indices i, i+world_size, ..."""
        if not all_folders:
            return []
        # Only sort when there are multiple workers to ensure consistency;
        # sort a copy so the caller's list is not mutated as a side effect.
        if world_size > 1:
            all_folders = sorted(all_folders)
        return all_folders[rank::world_size]

    try:
        all_subfolders = scene_names
        if not all_subfolders:
            print(f"Warning: No scene found in {scene_names}")
            return []
        # Get worker identity and distribute folders
        rank, world_size = _get_dlc_worker_info()
        assigned_folders = _distribute_folders(all_subfolders, rank, world_size)
        print(
            f"DLC Worker {rank}/{world_size}: Assigned {len(assigned_folders)} out of "
            f"{len(all_subfolders)} total folders"
        )
        return assigned_folders
    except PermissionError as e:
        # Chain the original exception so the root cause is preserved
        raise PermissionError(f"No permission to access directory: {scene_names}") from e
    except Exception as e:
        raise RuntimeError(f"Error reading input directory {scene_names}: {e}") from e
def retry_on_exception(
    max_retries: int = 3, retry_exceptions: Union[bool, Tuple[Type[Exception], ...]] = True, delay: float = 1.0
):
    """Decorator that retries the wrapped callable when it raises.

    Args:
        max_retries: Number of retries after the first attempt (so the callable
            runs at most ``max_retries + 1`` times).
        retry_exceptions: ``True`` to retry on any exception, or a tuple/list of
            exception types to retry on; anything else disables retrying.
        delay: Seconds to sleep between attempts.

    Raises:
        Whatever the wrapped callable last raised, once retries are exhausted
        or the exception is not retryable.

    Fixes vs. the original:
        - A *list* of exception types crashed with TypeError, because
          ``isinstance`` only accepts a type or a tuple; lists are now
          normalized to a tuple.
        - The wrapper no longer hard-codes a ``self`` parameter, so the
          decorator works on plain functions as well as methods (methods keep
          working unchanged: ``self`` simply travels in ``*args``).
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    if attempt > 0:
                        print(f"Retry attempt {attempt}/{max_retries} for {func.__name__}")
                    return func(*args, **kwargs)
                except Exception as e:
                    if retry_exceptions is True:
                        should_retry = True
                    elif isinstance(retry_exceptions, (tuple, list)):
                        # isinstance() rejects lists — normalize to a tuple
                        should_retry = isinstance(e, tuple(retry_exceptions))
                    else:
                        should_retry = False
                    if should_retry and attempt < max_retries:
                        print(f"Error in {func.__name__}: {e}. Retrying in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        # Out of retries or non-retryable: propagate unchanged
                        raise

        return wrapper

    return decorator