From 4934c4794eff87232583cba413cf90ab8555d4e0 Mon Sep 17 00:00:00 2001 From: Leon998 <1158046368@qq.com> Date: Wed, 18 Mar 2026 13:59:52 +0800 Subject: [PATCH] rich annotations & update open-pi fsdp explanations --- .gitignore | 3 +- README.md | 2 +- policy/openpi-InternData-A1/docs/training.md | 2 +- .../scripts/training_scripts/multi_node.sh | 2 - .../src/openpi/training/config.py | 5 +- scripts/download_assets.sh | 2 +- .../simbox/core/cameras/custom_camera.py | 43 +++++- workflows/simbox/core/loggers/__init__.py | 61 +++++++-- workflows/simbox/core/loggers/lmdb_logger.py | 66 ++++++++- workflows/simbox/core/utils/camera_utils.py | 128 ++++++++++++++++++ workflows/simbox_dual_workflow.py | 67 ++++++++- 11 files changed, 349 insertions(+), 32 deletions(-) create mode 100644 workflows/simbox/core/utils/camera_utils.py diff --git a/.gitignore b/.gitignore index 61fc9fe..a79389d 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,5 @@ polygons.png _isaac_sim_410 InterDataEngine-docs debug.sh -debug.yaml \ No newline at end of file +debug.yaml +depre \ No newline at end of file diff --git a/README.md b/README.md index a9447e6..f3b62b6 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ InternDataEngine is a synthetic data generation engine for embodied AI that powers large-scale model training and iteration. Built on NVIDIA Isaac Sim, it unifies high-fidelity physical interaction from InternData-A1, semantic task and scene generation from InternData-M1, and high-throughput scheduling from the Nimbus framework to deliver realistic, task-aligned, and massively scalable robotic manipulation data. - **More realistic physical interaction**: Unified simulation of rigid, articulated, deformable, and fluid objects across single-arm, dual-arm, and humanoid robots, enabling long-horizon, skill-composed manipulation that better supports sim-to-real transfer. 
-- **More task-aligned data generation**: LLM-driven task and instruction generation with task-oriented scene graphs (ToSG), producing structured scenes and rich multi-modal annotations (boxes, keypoints, trajectories) for complex instruction-following and spatial reasoning. +- **More diverse data generation**: By leveraging the internal state of the simulation engine to extract high-quality ground truth, coupled with multi-dimensional domain randomization (e.g., layout, texture, structure, and lighting), the data distribution is significantly expanded. This approach produces precise and diverse operational data, while simultaneously exporting rich multimodal annotations such as bounding boxes, segmentation masks, and keypoints. - **More efficient large-scale production**: Nimbus-powered asynchronous pipelines that decouple planning, rendering, and storage, achieving 2–3× end-to-end throughput, cluster-level load balancing and fault tolerance for billion-scale data generation. ## 🔥 Latest News diff --git a/policy/openpi-InternData-A1/docs/training.md b/policy/openpi-InternData-A1/docs/training.md index e49f650..4491d6f 100644 --- a/policy/openpi-InternData-A1/docs/training.md +++ b/policy/openpi-InternData-A1/docs/training.md @@ -40,7 +40,7 @@ python scripts/download_paligemma.py You may adjust other training parameters based on your available GPUs and training budget: - `num_train_steps`: Total number of training steps - `num_workers`: Number of data loading workers -- `fsdp_devices`: Number of GPUs per node +- `fsdp_devices`: Number of GPUs used for FSDP per node to distribute model parameters, gradients, and optimizer states across devices for reduced memory usage - `batch_size`: Batch size per GPU - `save_interval`: Checkpoint saving interval (in steps) diff --git a/policy/openpi-InternData-A1/scripts/training_scripts/multi_node.sh b/policy/openpi-InternData-A1/scripts/training_scripts/multi_node.sh index 2c873a7..c824d57 100755 --- 
a/policy/openpi-InternData-A1/scripts/training_scripts/multi_node.sh +++ b/policy/openpi-InternData-A1/scripts/training_scripts/multi_node.sh @@ -202,8 +202,6 @@ python scripts/train_jax_multinode.py \ pretrain-interndata-a1 \ --exp-name=pretrain-interndata-a1 \ --num_workers=12 \ - --fsdp_devices=8 \ --batch_size=512 \ --num_train_steps=2000000 \ --save_interval=5000 - diff --git a/policy/openpi-InternData-A1/src/openpi/training/config.py b/policy/openpi-InternData-A1/src/openpi/training/config.py index 95ef6bf..7baeb37 100644 --- a/policy/openpi-InternData-A1/src/openpi/training/config.py +++ b/policy/openpi-InternData-A1/src/openpi/training/config.py @@ -1814,7 +1814,6 @@ _CONFIGS = [ pytorch_weight_path="", num_train_steps=2_000_000, num_workers=12, - fsdp_devices=8, batch_size=512, save_interval=5000, lr_schedule=_optimizer.WarmupConstantSchedule(), @@ -1844,7 +1843,6 @@ _CONFIGS = [ pytorch_weight_path="", num_train_steps=30_000, num_workers=32, - fsdp_devices=8, batch_size=128, save_interval=5000, ), @@ -1872,7 +1870,6 @@ _CONFIGS = [ pytorch_weight_path="", num_train_steps=30_000, num_workers=32, - fsdp_devices=8, batch_size=128, save_interval=5000, ), @@ -1901,4 +1898,4 @@ def check_lerobot_repo(repo_dir: str): if os.path.isdir(os.path.join(repo_dir, "data")) and os.path.isdir(os.path.join(repo_dir, "meta")) and os.path.isdir(os.path.join(repo_dir, "videos")): return True else: - return False + return False \ No newline at end of file diff --git a/scripts/download_assets.sh b/scripts/download_assets.sh index deba35a..9da7842 100644 --- a/scripts/download_assets.sh +++ b/scripts/download_assets.sh @@ -41,7 +41,7 @@ info() { echo -e "\033[32m[INFO]\033[0m $*"; } download() { info "Downloading $2 ..." 
- huggingface-cli download "$REPO_ID" --repo-type "$REPO_TYPE" --include "$1" --local-dir "$LOCAL_DIR" + hf download "$REPO_ID" --repo-type "$REPO_TYPE" --include "$1" --local-dir "$LOCAL_DIR" } # --- Scene assets: required (both modes) --- diff --git a/workflows/simbox/core/cameras/custom_camera.py b/workflows/simbox/core/cameras/custom_camera.py index 8ad367b..a007694 100644 --- a/workflows/simbox/core/cameras/custom_camera.py +++ b/workflows/simbox/core/cameras/custom_camera.py @@ -1,6 +1,7 @@ import numpy as np import omni.replicator.core as rep from core.cameras.base_camera import register_camera +from core.utils.camera_utils import get_src from omni.isaac.core.prims import XFormPrim from omni.isaac.core.utils.prims import get_prim_at_path from omni.isaac.core.utils.transformations import ( @@ -40,9 +41,25 @@ class CustomCamera(Camera): **kwargs, ) self.initialize() - self.add_motion_vectors_to_frame() - self.add_semantic_segmentation_to_frame() - self.add_distance_to_image_plane_to_frame() + self.with_distance = cfg["params"].get("with_distance", True) + self.with_semantic = cfg["params"].get("with_semantic", False) + self.with_bbox2d = cfg["params"].get("with_bbox2d", False) + self.with_bbox3d = cfg["params"].get("with_bbox3d", False) + # Motion vectors are high-volume outputs; keep default off unless explicitly enabled in config. 
+ self.with_motion_vector = cfg["params"].get("with_motion_vector", False) + self.with_depth = cfg["params"].get("depth", False) + + if self.with_distance: + self.add_distance_to_image_plane_to_frame() + if self.with_semantic: + self.add_semantic_segmentation_to_frame() + if self.with_bbox2d: + self.add_bounding_box_2d_tight_to_frame() + self.add_bounding_box_2d_loose_to_frame() + if self.with_bbox3d: + self.add_bounding_box_3d_to_frame() + if self.with_motion_vector: + self.add_motion_vectors_to_frame() # ===== From cfg ===== pixel_size = cfg["params"].get("pixel_size") @@ -155,9 +172,27 @@ class CustomCamera(Camera): obs = { "color_image": color_image, - "depth_image": self.get_depth(), "camera2env_pose": camera2env_pose, "camera_params": self.is_camera_matrix.tolist(), } + if self.with_depth: + obs["depth_image"] = get_src(self, "depth") + seg_data = get_src(self, "seg") + if seg_data is not None: + obs["semantic_mask"] = seg_data["mask"] + obs["semantic_mask_id2labels"] = seg_data["id2labels"] + + bbox2d_tight = get_src(self, "bbox2d_tight") + if bbox2d_tight is not None: + obs["bbox2d_tight"], obs["bbox2d_tight_id2labels"] = bbox2d_tight + bbox2d_loose = get_src(self, "bbox2d_loose") + if bbox2d_loose is not None: + obs["bbox2d_loose"], obs["bbox2d_loose_id2labels"] = bbox2d_loose + bbox3d = get_src(self, "bbox3d") + if bbox3d is not None: + obs["bbox3d"], obs["bbox3d_id2labels"] = bbox3d + motion_vectors = get_src(self, "motion_vectors") + if motion_vectors is not None: + obs["motion_vectors"] = motion_vectors return obs diff --git a/workflows/simbox/core/loggers/__init__.py b/workflows/simbox/core/loggers/__init__.py index dd9e101..7ce8d76 100644 --- a/workflows/simbox/core/loggers/__init__.py +++ b/workflows/simbox/core/loggers/__init__.py @@ -27,6 +27,10 @@ class BaseLogger(ABC): self.object_data_logger: Dict[str, List[Any]] = {} self.color_image_logger: Dict[str, List[Any]] = {} self.depth_image_logger: Dict[str, List[Any]] = {} + self.seg_image_logger:
Dict[str, List[Any]] = {} + self.color_image_step_logger: Dict[str, List[Any]] = {} + self.depth_image_step_logger: Dict[str, List[Any]] = {} + self.seg_image_step_logger: Dict[str, List[Any]] = {} def update_tpi_initial_info(self, tpi_initial_info): self.tpi_initial_info = tpi_initial_info @@ -67,17 +71,50 @@ class BaseLogger(ABC): self.scalar_data_logger[robot][key] = [] self.scalar_data_logger[robot][key].append(value) - def add_color_image(self, robot, key, value): - if robot not in self.color_image_logger: - self.color_image_logger[robot] = {} - if key not in self.color_image_logger[robot]: - self.color_image_logger[robot][key] = [] - self.color_image_logger[robot][key].append(value) + def _add_image_data(self, data_logger, step_logger, robot, key, value, step_idx=None): + if robot not in data_logger: + data_logger[robot] = {} + if key not in data_logger[robot]: + data_logger[robot][key] = [] + data_logger[robot][key].append(value) - # def add_depth_image(self, key, value): - # if key not in self.depth_image_logger: - # self.depth_image_logger[key] = [] - # self.depth_image_logger[key].append(value) + if robot not in step_logger: + step_logger[robot] = {} + if key not in step_logger[robot]: + step_logger[robot][key] = [] + if step_idx is None: + step_idx = len(data_logger[robot][key]) - 1 + step_logger[robot][key].append(int(step_idx)) + + def add_color_image(self, robot, key, value, step_idx=None): + self._add_image_data( + self.color_image_logger, + self.color_image_step_logger, + robot, + key, + value, + step_idx=step_idx, + ) + + def add_depth_image(self, robot, key, value, step_idx=None): + self._add_image_data( + self.depth_image_logger, + self.depth_image_step_logger, + robot, + key, + value, + step_idx=step_idx, + ) + + def add_seg_image(self, robot, key, value, step_idx=None): + self._add_image_data( + self.seg_image_logger, + self.seg_image_step_logger, + robot, + key, + value, + step_idx=step_idx, + ) def clear( self, @@ -97,6 +134,10 @@ class 
BaseLogger(ABC): self.scalar_data_logger = {} self.color_image_logger = {} self.depth_image_logger = {} + self.seg_image_logger = {} + self.color_image_step_logger = {} + self.depth_image_step_logger = {} + self.seg_image_step_logger = {} @abstractmethod def close(self): diff --git a/workflows/simbox/core/loggers/lmdb_logger.py b/workflows/simbox/core/loggers/lmdb_logger.py index c1d9bfe..1644b60 100644 --- a/workflows/simbox/core/loggers/lmdb_logger.py +++ b/workflows/simbox/core/loggers/lmdb_logger.py @@ -13,6 +13,16 @@ from tqdm import tqdm DEFAULT_RGB_SCALE_FACTOR = 256000.0 +def float_array_to_uint16_png(float_array): + array = np.nan_to_num(float_array, nan=0.0, posinf=0.0, neginf=0.0) + array = np.round(array * 10000.0) + array = np.clip(array, 0, 65535) + return array.astype(np.uint16) + +def seg_array_to_uint16_png(seg_array): + array = np.nan_to_num(seg_array, nan=0.0, posinf=0.0, neginf=0.0) + array = np.clip(array, 0, 65535) + return array.astype(np.uint16) # pylint: disable=line-too-long,unused-argument class LmdbLogger(BaseLogger): @@ -63,6 +73,7 @@ class LmdbLogger(BaseLogger): meta_info["tpi_initial_info"] = self.tpi_initial_info meta_info["collect_info"] = self.collect_info meta_info["version"] = self.version + meta_info["image_valid_step_ids"] = {} # Lmdb log_path_lmdb = save_dir / "lmdb" @@ -139,13 +150,20 @@ class LmdbLogger(BaseLogger): # Save color images if save_img: - for key, value in self.color_image_logger[robot_name].items(): + for key, value in self.color_image_logger.get(robot_name, {}).items(): root_img_path = save_dir / f"{key}" root_img_path.mkdir(parents=True, exist_ok=True) + step_ids = self.color_image_step_logger.get(robot_name, {}).get(key, []) + if len(step_ids) != len(value): + step_ids = list(range(len(value))) + else: + step_ids = [int(x) for x in step_ids] + meta_info["image_valid_step_ids"][key] = step_ids + meta_info["keys"][key] = [] for i, image in enumerate(tqdm(value)): - step_id = str(i).zfill(4) + step_id = 
str(step_ids[i]).zfill(4) txn.put( f"{key}/{step_id}".encode("utf-8"), pickle.dumps(cv2.imencode(".jpg", image.astype(np.uint8))[1]), @@ -154,7 +172,49 @@ class LmdbLogger(BaseLogger): imageio.mimsave(os.path.join(root_img_path, "demo.mp4"), value, fps=15) - meta_info["num_steps"] = len(value) + for key, value in self.depth_image_logger.get(robot_name, {}).items(): + root_img_path = save_dir / f"{key}" + root_img_path.mkdir(parents=True, exist_ok=True) + + step_ids = self.depth_image_step_logger.get(robot_name, {}).get(key, []) + if len(step_ids) != len(value): + step_ids = list(range(len(value))) + else: + step_ids = [int(x) for x in step_ids] + meta_info["image_valid_step_ids"][key] = step_ids + + meta_info["keys"][key] = [] + for i, image in enumerate(tqdm(value)): + step_id = str(step_ids[i]).zfill(4) + depth_image = float_array_to_uint16_png(np.asarray(image)) + txn.put( + f"{key}/{step_id}".encode('utf-8'), + pickle.dumps(cv2.imencode('.png', depth_image)[1]) + ) + meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8')) + + for key, value in self.seg_image_logger.get(robot_name, {}).items(): + root_img_path = save_dir / f"{key}" + root_img_path.mkdir(parents=True, exist_ok=True) + + step_ids = self.seg_image_step_logger.get(robot_name, {}).get(key, []) + if len(step_ids) != len(value): + step_ids = list(range(len(value))) + else: + step_ids = [int(x) for x in step_ids] + meta_info["image_valid_step_ids"][key] = step_ids + + meta_info["keys"][key] = [] + for i, image in enumerate(tqdm(value)): + step_id = str(step_ids[i]).zfill(4) + seg_image = seg_array_to_uint16_png(np.asarray(image)) + txn.put( + f"{key}/{step_id}".encode('utf-8'), + pickle.dumps(cv2.imencode('.png', seg_image)[1]) + ) + meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8')) + + meta_info["num_steps"] = self.log_num_steps txn.commit() lmdb_env.close() pickle.dump(meta_info, open(os.path.join(save_dir, "meta_info.pkl"), "wb")) diff --git 
a/workflows/simbox/core/utils/camera_utils.py b/workflows/simbox/core/utils/camera_utils.py new file mode 100644 index 0000000..b21a244 --- /dev/null +++ b/workflows/simbox/core/utils/camera_utils.py @@ -0,0 +1,128 @@ +import numpy as np +from omni.isaac.core.utils.prims import get_prim_at_path +from omni.isaac.core.utils.transformations import get_relative_transform +from omni.isaac.sensor import Camera + + +def _get_annotator(camera: Camera, annotator_name: str): + custom_annotators = getattr(camera, "_custom_annotators", None) + if not isinstance(custom_annotators, dict): + return None + return custom_annotators.get(annotator_name) + + +def _get_frame(frame): + if isinstance(frame, np.ndarray) and frame.size > 0: + return frame[:, :, :3] + return None + + +def _get_depth(depth): + if isinstance(depth, np.ndarray) and depth.size > 0: + return depth + return None + + +def _get_rgb_image(camera: Camera): + output_mode = getattr(camera, "output_mode", "rgb") + if output_mode == "rgb": + return _get_frame(camera.get_rgba()) + if output_mode == "diffuse_albedo": + annotator = _get_annotator(camera, "DiffuseAlbedo") + if annotator is None: + return None + return _get_frame(annotator.get_data()) + raise NotImplementedError(f"Unsupported output mode: {output_mode}") + + +def _get_depth_image(camera: Camera): + annotator = _get_annotator(camera, "distance_to_image_plane") + if annotator is None: + return None + return _get_depth(annotator.get_data()) + + +def _get_object_mask(camera: Camera): + annotator = _get_annotator(camera, "semantic_segmentation") + if annotator is None: + return None + annotation_data = annotator.get_data() + if ( + not isinstance(annotation_data, dict) + or "data" not in annotation_data + or "info" not in annotation_data + ): + return None + info = annotation_data["info"] + if not isinstance(info, dict) or "idToLabels" not in info: + return None + mask = annotation_data["data"] + if isinstance(mask, np.ndarray) and mask.size > 0: + return {"mask": 
mask, "id2labels": info["idToLabels"]} + return None + + +def _get_bbox(camera: Camera, bbox_type: str): + annotator = _get_annotator(camera, bbox_type) + if annotator is None: + return None + annotation_data = annotator.get_data() + if ( + not isinstance(annotation_data, dict) + or "data" not in annotation_data + or "info" not in annotation_data + ): + return None + info = annotation_data["info"] + if not isinstance(info, dict) or "idToLabels" not in info: + return None + return annotation_data["data"], info["idToLabels"] + + +def _get_motion_vectors(camera: Camera): + annotator = _get_annotator(camera, "motion_vectors") + if annotator is None: + return None + annotation_data = annotator.get_data() + if isinstance(annotation_data, np.ndarray) and annotation_data.size > 0: + return annotation_data + return None + + +def _get_camera2env_pose(camera: Camera): + prim_path = getattr(camera, "prim_path", None) + root_prim_path = getattr(camera, "root_prim_path", None) + if not prim_path or not root_prim_path: + return None + return get_relative_transform(get_prim_at_path(prim_path), get_prim_at_path(root_prim_path)) + + +def _get_camera_params(camera: Camera): + camera_matrix = getattr(camera, "is_camera_matrix", None) + if camera_matrix is None: + return None + if isinstance(camera_matrix, np.ndarray): + return camera_matrix.tolist() + return camera_matrix + + +def get_src(camera: Camera, data_type: str): + if data_type == "rgb": + return _get_rgb_image(camera) + if data_type == "depth": + return _get_depth_image(camera) + if data_type == "seg": + return _get_object_mask(camera) + if data_type == "bbox2d_tight": + return _get_bbox(camera, "bounding_box_2d_tight") + if data_type == "bbox2d_loose": + return _get_bbox(camera, "bounding_box_2d_loose") + if data_type == "bbox3d": + return _get_bbox(camera, "bounding_box_3d") + if data_type == "motion_vectors": + return _get_motion_vectors(camera) + if data_type == "camera2env_pose": + return _get_camera2env_pose(camera) + 
if data_type == "camera_params": + return _get_camera_params(camera) + raise NotImplementedError(f"Unsupported source type: {data_type}") diff --git a/workflows/simbox_dual_workflow.py b/workflows/simbox_dual_workflow.py index 9ec1a7a..485c9bd 100644 --- a/workflows/simbox_dual_workflow.py +++ b/workflows/simbox_dual_workflow.py @@ -168,6 +168,8 @@ class SimBoxDualWorkFlow(NimbusWorkFlow): collect_info=self.task_cfg["data"]["collect_info"], version=self.task_cfg["data"].get("version", "v1.0"), ) + # Motion vectors are large dense tensors; keep LMDB logging opt-in. + self.log_motion_vectors = bool(self.task_cfg["data"].get("log_motion_vectors", False)) if self.random_seed is not None: seed = self.random_seed @@ -530,16 +532,71 @@ class SimBoxDualWorkFlow(NimbusWorkFlow): for key, value in self.task.cameras.items(): for robot_name, _ in self.task.robots.items(): if robot_name in key: - rgb_img = value.get_observations()["color_image"] + camera_obs = value.get_observations() + rgb_img = camera_obs["color_image"] # Special processing if enabled - camera2env_pose = value.get_observations()["camera2env_pose"] + camera2env_pose = camera_obs["camera2env_pose"] save_camera_name = key.replace(f"{robot_name}_", "") - self.logger.add_color_image(robot_name, "images.rgb." + save_camera_name, rgb_img) - self.logger.add_scalar_data(robot_name, "camera2env_pose." + save_camera_name, camera2env_pose) + self.logger.add_color_image( + robot_name, "images.rgb." + save_camera_name, rgb_img, step_idx=step_idx + ) + if "depth_image" in camera_obs: + depth_image = camera_obs["depth_image"] + depth_img = np.nan_to_num(depth_image, nan=0.0, posinf=0.0, neginf=0.0) + self.logger.add_depth_image( + robot_name, "images.depth." + save_camera_name, depth_img, step_idx=step_idx + ) + if "semantic_mask" in camera_obs: + self.logger.add_seg_image( + robot_name, "images.seg." 
+ save_camera_name, camera_obs["semantic_mask"], step_idx=step_idx + ) + if "semantic_mask_id2labels" in camera_obs: + self.logger.add_scalar_data( + robot_name, + "labels.seg." + save_camera_name, + camera_obs["semantic_mask_id2labels"], + ) + if "bbox2d_tight" in camera_obs: + self.logger.add_scalar_data( + robot_name, "labels.bbox2d_tight." + save_camera_name, camera_obs["bbox2d_tight"] + ) + if "bbox2d_tight_id2labels" in camera_obs: + self.logger.add_scalar_data( + robot_name, + "labels.bbox2d_tight_id2labels." + save_camera_name, + camera_obs["bbox2d_tight_id2labels"], + ) + if "bbox2d_loose" in camera_obs: + self.logger.add_scalar_data( + robot_name, "labels.bbox2d_loose." + save_camera_name, camera_obs["bbox2d_loose"] + ) + if "bbox2d_loose_id2labels" in camera_obs: + self.logger.add_scalar_data( + robot_name, + "labels.bbox2d_loose_id2labels." + save_camera_name, + camera_obs["bbox2d_loose_id2labels"], + ) + if "bbox3d" in camera_obs: + self.logger.add_scalar_data( + robot_name, "labels.bbox3d." + save_camera_name, camera_obs["bbox3d"] + ) + if "bbox3d_id2labels" in camera_obs: + self.logger.add_scalar_data( + robot_name, + "labels.bbox3d_id2labels." + save_camera_name, + camera_obs["bbox3d_id2labels"], + ) + if self.log_motion_vectors and "motion_vectors" in camera_obs: + self.logger.add_scalar_data( + robot_name, "labels.motion_vectors." + save_camera_name, camera_obs["motion_vectors"] + ) + self.logger.add_scalar_data( + robot_name, "camera2env_pose." + save_camera_name, camera2env_pose + ) if step_idx == 0: save_camera_name = key.replace(f"{robot_name}_", "") self.logger.add_json_data( - robot_name, f"{save_camera_name}_camera_params", value.get_observations()["camera_params"] + robot_name, f"{save_camera_name}_camera_params", camera_obs["camera_params"] ) # depth_img = get_src(value, "depth")