rich annotations & update open-pi fsdp explanations

This commit is contained in:
Leon998
2026-03-18 13:59:52 +08:00
parent 814f3c3526
commit 4934c4794e
11 changed files with 349 additions and 32 deletions

3
.gitignore vendored
View File

@@ -17,4 +17,5 @@ polygons.png
_isaac_sim_410
InterDataEngine-docs
debug.sh
debug.yaml
debug.yaml
depre

View File

@@ -24,7 +24,7 @@
InternDataEngine is a synthetic data generation engine for embodied AI that powers large-scale model training and iteration. Built on NVIDIA Isaac Sim, it unifies high-fidelity physical interaction from InternData-A1, semantic task and scene generation from InternData-M1, and high-throughput scheduling from the Nimbus framework to deliver realistic, task-aligned, and massively scalable robotic manipulation data.
- **More realistic physical interaction**: Unified simulation of rigid, articulated, deformable, and fluid objects across single-arm, dual-arm, and humanoid robots, enabling long-horizon, skill-composed manipulation that better supports sim-to-real transfer.
- **More task-aligned data generation**: LLM-driven task and instruction generation with task-oriented scene graphs (ToSG), producing structured scenes and rich multi-modal annotations (boxes, keypoints, trajectories) for complex instruction-following and spatial reasoning.
- **More diverse data generation**: By leveraging the internal state of the simulation engine to extract high-quality ground truth, coupled with multi-dimensional domain randomization (e.g., layout, texture, structure, and lighting), the data distribution is significantly expanded. This approach produces precise and diverse operational data, while simultaneously exporting rich multimodal annotations such as bounding boxes, segmentation masks, and keypoints.
- **More efficient large-scale production**: Nimbus-powered asynchronous pipelines that decouple planning, rendering, and storage, achieving 23× end-to-end throughput, with cluster-level load balancing and fault tolerance for billion-scale data generation.
## 🔥 Latest News

View File

@@ -40,7 +40,7 @@ python scripts/download_paligemma.py
You may adjust other training parameters based on your available GPUs and training budget:
- `num_train_steps`: Total number of training steps
- `num_workers`: Number of data loading workers
- `fsdp_devices`: Number of GPUs per node
- `fsdp_devices`: Number of GPUs per node used for FSDP (Fully Sharded Data Parallel), which shards model parameters, gradients, and optimizer states across devices to reduce per-GPU memory usage
- `batch_size`: Batch size per GPU
- `save_interval`: Checkpoint saving interval (in steps)

View File

@@ -202,8 +202,6 @@ python scripts/train_jax_multinode.py \
pretrain-interndata-a1 \
--exp-name=pretrain-interndata-a1 \
--num_workers=12 \
--fsdp_devices=8 \
--batch_size=512 \
--num_train_steps=2000000 \
--save_interval=5000

View File

@@ -1814,7 +1814,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=2_000_000,
num_workers=12,
fsdp_devices=8,
batch_size=512,
save_interval=5000,
lr_schedule=_optimizer.WarmupConstantSchedule(),
@@ -1844,7 +1843,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=30_000,
num_workers=32,
fsdp_devices=8,
batch_size=128,
save_interval=5000,
),
@@ -1872,7 +1870,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=30_000,
num_workers=32,
fsdp_devices=8,
batch_size=128,
save_interval=5000,
),
@@ -1901,4 +1898,4 @@ def check_lerobot_repo(repo_dir: str):
if os.path.isdir(os.path.join(repo_dir, "data")) and os.path.isdir(os.path.join(repo_dir, "meta")) and os.path.isdir(os.path.join(repo_dir, "videos")):
return True
else:
return False
return False

View File

@@ -41,7 +41,7 @@ info() { echo -e "\033[32m[INFO]\033[0m $*"; }
download() {
info "Downloading $2 ..."
huggingface-cli download "$REPO_ID" --repo-type "$REPO_TYPE" --include "$1" --local-dir "$LOCAL_DIR"
hf download "$REPO_ID" --repo-type "$REPO_TYPE" --include "$1" --local-dir "$LOCAL_DIR"
}
# --- Scene assets: required (both modes) ---

View File

@@ -1,6 +1,7 @@
import numpy as np
import omni.replicator.core as rep
from core.cameras.base_camera import register_camera
from core.utils.camera_utils import get_src
from omni.isaac.core.prims import XFormPrim
from omni.isaac.core.utils.prims import get_prim_at_path
from omni.isaac.core.utils.transformations import (
@@ -40,9 +41,25 @@ class CustomCamera(Camera):
**kwargs,
)
self.initialize()
self.add_motion_vectors_to_frame()
self.add_semantic_segmentation_to_frame()
self.add_distance_to_image_plane_to_frame()
self.with_distance = cfg["params"].get("with_distance", True)
self.with_semantic = cfg["params"].get("with_semantic", False)
self.with_bbox2d = cfg["params"].get("with_bbox2d", False)
self.with_bbox3d = cfg["params"].get("with_bbox3d", False)
# Motion vectors are high-volume outputs; keep default off unless explicitly enabled in config.
self.with_motion_vector = cfg["params"].get("with_motion_vector", False)
self.with_depth = cfg["params"].get("depth", False)
if self.with_distance:
self.add_distance_to_image_plane_to_frame()
if self.with_semantic:
self.add_semantic_segmentation_to_frame()
if self.with_bbox2d:
self.add_bounding_box_2d_tight_to_frame()
self.add_bounding_box_2d_loose_to_frame()
if self.with_bbox3d:
self.add_bounding_box_3d_to_frame()
if self.with_motion_vector:
self.add_motion_vectors_to_frame()
# ===== From cfg =====
pixel_size = cfg["params"].get("pixel_size")
@@ -155,9 +172,27 @@ class CustomCamera(Camera):
obs = {
"color_image": color_image,
"depth_image": self.get_depth(),
"camera2env_pose": camera2env_pose,
"camera_params": self.is_camera_matrix.tolist(),
}
if self.with_depth:
obs["depth_image"] = get_src(self, "depth"),
seg_data = get_src(self, "seg")
if seg_data is not None:
obs["semantic_mask"] = seg_data["mask"]
obs["semantic_mask_id2labels"] = seg_data["id2labels"]
bbox2d_tight = get_src(self, "bbox2d_tight")
if bbox2d_tight is not None:
obs["bbox2d_tight"], obs["bbox2d_tight_id2labels"] = bbox2d_tight
bbox2d_loose = get_src(self, "bbox2d_loose")
if bbox2d_loose is not None:
obs["bbox2d_loose"], obs["bbox2d_loose_id2labels"] = bbox2d_loose
bbox3d = get_src(self, "bbox3d")
if bbox3d is not None:
obs["bbox3d"], obs["bbox3d_id2labels"] = bbox3d
motion_vectors = get_src(self, "motion_vectors")
if motion_vectors is not None:
obs["motion_vectors"] = motion_vectors
return obs

View File

@@ -27,6 +27,10 @@ class BaseLogger(ABC):
self.object_data_logger: Dict[str, List[Any]] = {}
self.color_image_logger: Dict[str, List[Any]] = {}
self.depth_image_logger: Dict[str, List[Any]] = {}
self.seg_image_logger: Dict[str, List[Any]] = {}
self.color_image_step_logger: Dict[str, List[Any]] = {}
self.depth_image_step_logger: Dict[str, List[Any]] = {}
self.seg_image_step_logger: Dict[str, List[Any]] = {}
def update_tpi_initial_info(self, tpi_initial_info):
self.tpi_initial_info = tpi_initial_info
@@ -67,17 +71,50 @@ class BaseLogger(ABC):
self.scalar_data_logger[robot][key] = []
self.scalar_data_logger[robot][key].append(value)
def add_color_image(self, robot, key, value):
if robot not in self.color_image_logger:
self.color_image_logger[robot] = {}
if key not in self.color_image_logger[robot]:
self.color_image_logger[robot][key] = []
self.color_image_logger[robot][key].append(value)
def _add_image_data(self, data_logger, step_logger, robot, key, value, step_idx=None):
if robot not in data_logger:
data_logger[robot] = {}
if key not in data_logger[robot]:
data_logger[robot][key] = []
data_logger[robot][key].append(value)
# def add_depth_image(self, key, value):
# if key not in self.depth_image_logger:
# self.depth_image_logger[key] = []
# self.depth_image_logger[key].append(value)
if robot not in step_logger:
step_logger[robot] = {}
if key not in step_logger[robot]:
step_logger[robot][key] = []
if step_idx is None:
step_idx = len(data_logger[robot][key]) - 1
step_logger[robot][key].append(int(step_idx))
def add_color_image(self, robot, key, value, step_idx=None):
self._add_image_data(
self.color_image_logger,
self.color_image_step_logger,
robot,
key,
value,
step_idx=step_idx,
)
def add_depth_image(self, robot, key, value, step_idx=None):
self._add_image_data(
self.depth_image_logger,
self.depth_image_step_logger,
robot,
key,
value,
step_idx=step_idx,
)
def add_seg_image(self, robot, key, value, step_idx=None):
self._add_image_data(
self.seg_image_logger,
self.seg_image_step_logger,
robot,
key,
value,
step_idx=step_idx,
)
def clear(
self,
@@ -97,6 +134,10 @@ class BaseLogger(ABC):
self.scalar_data_logger = {}
self.color_image_logger = {}
self.depth_image_logger = {}
self.seg_image_logger = {}
self.color_image_step_logger = {}
self.depth_image_step_logger = {}
self.seg_image_step_logger = {}
@abstractmethod
def close(self):

View File

@@ -13,6 +13,16 @@ from tqdm import tqdm
DEFAULT_RGB_SCALE_FACTOR = 256000.0
def float_array_to_uint16_png(float_array, scale=10000.0):
    """Quantize a float array (depth map) to uint16 for lossless PNG storage.

    Non-finite values (NaN / +-inf) are mapped to 0, values are multiplied by
    ``scale`` and rounded, then clipped into the uint16 range [0, 65535].

    With the default scale of 1e4 this gives 0.1 units of resolution and a
    representable maximum of 6.5535 units (presumably meters for metric
    depth — confirm against the depth source).

    Args:
        float_array: Array-like of floats to quantize.
        scale: Multiplier applied before rounding (default 10000.0, kept for
            backward compatibility with the previous hard-coded factor).

    Returns:
        np.ndarray of dtype uint16, same shape as the input.
    """
    array = np.nan_to_num(float_array, nan=0.0, posinf=0.0, neginf=0.0)
    # Round before clipping so values just above 65535/scale saturate cleanly.
    array = np.round(array * scale)
    array = np.clip(array, 0, 65535)
    return array.astype(np.uint16)
def seg_array_to_uint16_png(seg_array):
    """Sanitize a segmentation-id array for storage as a 16-bit PNG.

    Replaces NaN / +-inf entries with 0 and clamps all ids into the
    uint16 range [0, 65535] before casting.
    """
    cleaned = np.nan_to_num(seg_array, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(cleaned, 0, 65535).astype(np.uint16)
# pylint: disable=line-too-long,unused-argument
class LmdbLogger(BaseLogger):
@@ -63,6 +73,7 @@ class LmdbLogger(BaseLogger):
meta_info["tpi_initial_info"] = self.tpi_initial_info
meta_info["collect_info"] = self.collect_info
meta_info["version"] = self.version
meta_info["image_valid_step_ids"] = {}
# Lmdb
log_path_lmdb = save_dir / "lmdb"
@@ -139,13 +150,20 @@ class LmdbLogger(BaseLogger):
# Save color images
if save_img:
for key, value in self.color_image_logger[robot_name].items():
for key, value in self.color_image_logger.get(robot_name, {}).items():
root_img_path = save_dir / f"{key}"
root_img_path.mkdir(parents=True, exist_ok=True)
step_ids = self.color_image_step_logger.get(robot_name, {}).get(key, [])
if len(step_ids) != len(value):
step_ids = list(range(len(value)))
else:
step_ids = [int(x) for x in step_ids]
meta_info["image_valid_step_ids"][key] = step_ids
meta_info["keys"][key] = []
for i, image in enumerate(tqdm(value)):
step_id = str(i).zfill(4)
step_id = str(step_ids[i]).zfill(4)
txn.put(
f"{key}/{step_id}".encode("utf-8"),
pickle.dumps(cv2.imencode(".jpg", image.astype(np.uint8))[1]),
@@ -154,7 +172,49 @@ class LmdbLogger(BaseLogger):
imageio.mimsave(os.path.join(root_img_path, "demo.mp4"), value, fps=15)
meta_info["num_steps"] = len(value)
for key, value in self.depth_image_logger.get(robot_name, {}).items():
root_img_path = save_dir / f"{key}"
root_img_path.mkdir(parents=True, exist_ok=True)
step_ids = self.depth_image_step_logger.get(robot_name, {}).get(key, [])
if len(step_ids) != len(value):
step_ids = list(range(len(value)))
else:
step_ids = [int(x) for x in step_ids]
meta_info["image_valid_step_ids"][key] = step_ids
meta_info["keys"][key] = []
for i, image in enumerate(tqdm(value)):
step_id = str(step_ids[i]).zfill(4)
depth_image = float_array_to_uint16_png(np.asarray(image))
txn.put(
f"{key}/{step_id}".encode('utf-8'),
pickle.dumps(cv2.imencode('.png', depth_image)[1])
)
meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8'))
for key, value in self.seg_image_logger.get(robot_name, {}).items():
root_img_path = save_dir / f"{key}"
root_img_path.mkdir(parents=True, exist_ok=True)
step_ids = self.seg_image_step_logger.get(robot_name, {}).get(key, [])
if len(step_ids) != len(value):
step_ids = list(range(len(value)))
else:
step_ids = [int(x) for x in step_ids]
meta_info["image_valid_step_ids"][key] = step_ids
meta_info["keys"][key] = []
for i, image in enumerate(tqdm(value)):
step_id = str(step_ids[i]).zfill(4)
seg_image = seg_array_to_uint16_png(np.asarray(image))
txn.put(
f"{key}/{step_id}".encode('utf-8'),
pickle.dumps(cv2.imencode('.png', seg_image)[1])
)
meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8'))
meta_info["num_steps"] = self.log_num_steps
txn.commit()
lmdb_env.close()
pickle.dump(meta_info, open(os.path.join(save_dir, "meta_info.pkl"), "wb"))

View File

@@ -0,0 +1,128 @@
import numpy as np
from omni.isaac.core.utils.prims import get_prim_at_path
from omni.isaac.core.utils.transformations import get_relative_transform
from omni.isaac.sensor import Camera
def _get_annotator(camera: Camera, annotator_name: str):
    """Look up a named annotator registered on the camera.

    Returns None when the camera has no ``_custom_annotators`` dict or the
    requested annotator is not registered in it.
    """
    registry = getattr(camera, "_custom_annotators", None)
    if isinstance(registry, dict):
        return registry.get(annotator_name)
    return None
def _get_frame(frame):
if isinstance(frame, np.ndarray) and frame.size > 0:
return frame[:, :, :3]
return None
def _get_depth(depth):
if isinstance(depth, np.ndarray) and depth.size > 0:
return depth
return None
def _get_rgb_image(camera: Camera):
    """Fetch the camera's color frame according to its output mode.

    "rgb" reads the live RGBA buffer via ``get_rgba``; "diffuse_albedo"
    reads the DiffuseAlbedo annotator. Cameras without an ``output_mode``
    attribute default to "rgb". Any other mode raises NotImplementedError.
    """
    output_mode = getattr(camera, "output_mode", "rgb")
    if output_mode == "rgb":
        return _get_frame(camera.get_rgba())
    if output_mode == "diffuse_albedo":
        annotator = _get_annotator(camera, "DiffuseAlbedo")
        return None if annotator is None else _get_frame(annotator.get_data())
    raise NotImplementedError(f"Unsupported output mode: {output_mode}")
def _get_depth_image(camera: Camera):
    """Read the distance_to_image_plane annotator as a depth map, or None."""
    annotator = _get_annotator(camera, "distance_to_image_plane")
    return None if annotator is None else _get_depth(annotator.get_data())
def _get_object_mask(camera: Camera):
    """Return {"mask", "id2labels"} from the semantic segmentation annotator.

    Returns None when the annotator is missing, the payload is not a dict
    with "data" and "info" keys, "info" lacks "idToLabels", or the mask
    array is empty.
    """
    annotator = _get_annotator(camera, "semantic_segmentation")
    if annotator is None:
        return None
    payload = annotator.get_data()
    if not isinstance(payload, dict):
        return None
    if "data" not in payload or "info" not in payload:
        return None
    info = payload["info"]
    if not isinstance(info, dict) or "idToLabels" not in info:
        return None
    mask = payload["data"]
    if not (isinstance(mask, np.ndarray) and mask.size > 0):
        return None
    return {"mask": mask, "id2labels": info["idToLabels"]}
def _get_bbox(camera: Camera, bbox_type: str):
    """Return (bbox_data, id2labels) from the named bounding-box annotator.

    Returns None when the annotator is absent or the payload does not have
    the expected data/info/idToLabels structure.
    """
    annotator = _get_annotator(camera, bbox_type)
    if annotator is None:
        return None
    payload = annotator.get_data()
    if not isinstance(payload, dict):
        return None
    if "data" not in payload or "info" not in payload:
        return None
    info = payload["info"]
    if isinstance(info, dict) and "idToLabels" in info:
        return payload["data"], info["idToLabels"]
    return None
def _get_motion_vectors(camera: Camera):
    """Return the motion-vector buffer when present and non-empty, else None."""
    annotator = _get_annotator(camera, "motion_vectors")
    if annotator is None:
        return None
    data = annotator.get_data()
    if isinstance(data, np.ndarray) and data.size > 0:
        return data
    return None
def _get_camera2env_pose(camera: Camera):
    """Compute the camera prim's transform relative to its root prim.

    Returns None when either ``prim_path`` or ``root_prim_path`` is
    missing or empty on the camera.
    """
    prim_path = getattr(camera, "prim_path", None)
    root_path = getattr(camera, "root_prim_path", None)
    if prim_path and root_path:
        return get_relative_transform(
            get_prim_at_path(prim_path), get_prim_at_path(root_path)
        )
    return None
def _get_camera_params(camera: Camera):
    """Return the camera's intrinsics matrix, as a nested list if an ndarray.

    Non-ndarray values are returned unchanged; a missing attribute or None
    value yields None.
    """
    matrix = getattr(camera, "is_camera_matrix", None)
    if matrix is None:
        return None
    return matrix.tolist() if isinstance(matrix, np.ndarray) else matrix
def get_src(camera: Camera, data_type: str):
    """Fetch one observation modality from the camera.

    Supported data_type values: "rgb", "depth", "seg", "bbox2d_tight",
    "bbox2d_loose", "bbox3d", "motion_vectors", "camera2env_pose",
    "camera_params". Raises NotImplementedError for anything else.
    """
    # Dispatch table instead of an if-chain; each entry maps a modality
    # name to the helper that extracts it from the camera.
    dispatch = {
        "rgb": _get_rgb_image,
        "depth": _get_depth_image,
        "seg": _get_object_mask,
        "bbox2d_tight": lambda cam: _get_bbox(cam, "bounding_box_2d_tight"),
        "bbox2d_loose": lambda cam: _get_bbox(cam, "bounding_box_2d_loose"),
        "bbox3d": lambda cam: _get_bbox(cam, "bounding_box_3d"),
        "motion_vectors": _get_motion_vectors,
        "camera2env_pose": _get_camera2env_pose,
        "camera_params": _get_camera_params,
    }
    fetcher = dispatch.get(data_type)
    if fetcher is None:
        raise NotImplementedError(f"Unsupported source type: {data_type}")
    return fetcher(camera)

View File

@@ -168,6 +168,8 @@ class SimBoxDualWorkFlow(NimbusWorkFlow):
collect_info=self.task_cfg["data"]["collect_info"],
version=self.task_cfg["data"].get("version", "v1.0"),
)
# Motion vectors are large dense tensors; keep LMDB logging opt-in.
self.log_motion_vectors = bool(self.task_cfg["data"].get("log_motion_vectors", False))
if self.random_seed is not None:
seed = self.random_seed
@@ -530,16 +532,71 @@ class SimBoxDualWorkFlow(NimbusWorkFlow):
for key, value in self.task.cameras.items():
for robot_name, _ in self.task.robots.items():
if robot_name in key:
rgb_img = value.get_observations()["color_image"]
camera_obs = value.get_observations()
rgb_img = camera_obs["color_image"]
# Special processing if enabled
camera2env_pose = value.get_observations()["camera2env_pose"]
camera2env_pose = camera_obs["camera2env_pose"]
save_camera_name = key.replace(f"{robot_name}_", "")
self.logger.add_color_image(robot_name, "images.rgb." + save_camera_name, rgb_img)
self.logger.add_scalar_data(robot_name, "camera2env_pose." + save_camera_name, camera2env_pose)
self.logger.add_color_image(
robot_name, "images.rgb." + save_camera_name, rgb_img, step_idx=step_idx
)
if "depth_image" in camera_obs:
depth_image = camera_obs["depth_image"]
depth_img = np.nan_to_num(depth_img, nan=0.0, posinf=0.0, neginf=0.0)
self.logger.add_depth_image(
robot_name, "images.depth." + save_camera_name, depth_img, step_idx=step_idx
)
if "semantic_mask" in camera_obs:
self.logger.add_seg_image(
robot_name, "images.seg." + save_camera_name, seg_mask, step_idx=step_idx
)
if "semantic_mask_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.seg." + save_camera_name,
camera_obs["semantic_mask_id2labels"],
)
if "bbox2d_tight" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.bbox2d_tight." + save_camera_name, camera_obs["bbox2d_tight"]
)
if "bbox2d_tight_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.bbox2d_tight_id2labels." + save_camera_name,
camera_obs["bbox2d_tight_id2labels"],
)
if "bbox2d_loose" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.bbox2d_loose." + save_camera_name, camera_obs["bbox2d_loose"]
)
if "bbox2d_loose_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.bbox2d_loose_id2labels." + save_camera_name,
camera_obs["bbox2d_loose_id2labels"],
)
if "bbox3d" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.bbox3d." + save_camera_name, camera_obs["bbox3d"]
)
if "bbox3d_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.bbox3d_id2labels." + save_camera_name,
camera_obs["bbox3d_id2labels"],
)
if self.log_motion_vectors and "motion_vectors" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.motion_vectors." + save_camera_name, camera_obs["motion_vectors"]
)
self.logger.add_scalar_data(
robot_name, "camera2env_pose." + save_camera_name, camera2env_pose
)
if step_idx == 0:
save_camera_name = key.replace(f"{robot_name}_", "")
self.logger.add_json_data(
robot_name, f"{save_camera_name}_camera_params", value.get_observations()["camera_params"]
robot_name, f"{save_camera_name}_camera_params", camera_obs["camera_params"]
)
# depth_img = get_src(value, "depth")