rich annotations & update open-pi fsdp explanations
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import numpy as np
|
||||
import omni.replicator.core as rep
|
||||
from core.cameras.base_camera import register_camera
|
||||
from core.utils.camera_utils import get_src
|
||||
from omni.isaac.core.prims import XFormPrim
|
||||
from omni.isaac.core.utils.prims import get_prim_at_path
|
||||
from omni.isaac.core.utils.transformations import (
|
||||
@@ -40,9 +41,25 @@ class CustomCamera(Camera):
|
||||
**kwargs,
|
||||
)
|
||||
self.initialize()
|
||||
self.add_motion_vectors_to_frame()
|
||||
self.add_semantic_segmentation_to_frame()
|
||||
self.add_distance_to_image_plane_to_frame()
|
||||
self.with_distance = cfg["params"].get("with_distance", True)
|
||||
self.with_semantic = cfg["params"].get("with_semantic", False)
|
||||
self.with_bbox2d = cfg["params"].get("with_bbox2d", False)
|
||||
self.with_bbox3d = cfg["params"].get("with_bbox3d", False)
|
||||
# Motion vectors are high-volume outputs; keep default off unless explicitly enabled in config.
|
||||
self.with_motion_vector = cfg["params"].get("with_motion_vector", False)
|
||||
self.with_depth = cfg["params"].get("depth", False)
|
||||
|
||||
if self.with_distance:
|
||||
self.add_distance_to_image_plane_to_frame()
|
||||
if self.with_semantic:
|
||||
self.add_semantic_segmentation_to_frame()
|
||||
if self.with_bbox2d:
|
||||
self.add_bounding_box_2d_tight_to_frame()
|
||||
self.add_bounding_box_2d_loose_to_frame()
|
||||
if self.with_bbox3d:
|
||||
self.add_bounding_box_3d_to_frame()
|
||||
if self.with_motion_vector:
|
||||
self.add_motion_vectors_to_frame()
|
||||
|
||||
# ===== From cfg =====
|
||||
pixel_size = cfg["params"].get("pixel_size")
|
||||
@@ -155,9 +172,27 @@ class CustomCamera(Camera):
|
||||
|
||||
obs = {
|
||||
"color_image": color_image,
|
||||
"depth_image": self.get_depth(),
|
||||
"camera2env_pose": camera2env_pose,
|
||||
"camera_params": self.is_camera_matrix.tolist(),
|
||||
}
|
||||
if self.with_depth:
|
||||
obs["depth_image"] = get_src(self, "depth"),
|
||||
|
||||
seg_data = get_src(self, "seg")
|
||||
if seg_data is not None:
|
||||
obs["semantic_mask"] = seg_data["mask"]
|
||||
obs["semantic_mask_id2labels"] = seg_data["id2labels"]
|
||||
|
||||
bbox2d_tight = get_src(self, "bbox2d_tight")
|
||||
if bbox2d_tight is not None:
|
||||
obs["bbox2d_tight"], obs["bbox2d_tight_id2labels"] = bbox2d_tight
|
||||
bbox2d_loose = get_src(self, "bbox2d_loose")
|
||||
if bbox2d_loose is not None:
|
||||
obs["bbox2d_loose"], obs["bbox2d_loose_id2labels"] = bbox2d_loose
|
||||
bbox3d = get_src(self, "bbox3d")
|
||||
if bbox3d is not None:
|
||||
obs["bbox3d"], obs["bbox3d_id2labels"] = bbox3d
|
||||
motion_vectors = get_src(self, "motion_vectors")
|
||||
if motion_vectors is not None:
|
||||
obs["motion_vectors"] = motion_vectors
|
||||
return obs
|
||||
|
||||
@@ -27,6 +27,10 @@ class BaseLogger(ABC):
|
||||
self.object_data_logger: Dict[str, List[Any]] = {}
|
||||
self.color_image_logger: Dict[str, List[Any]] = {}
|
||||
self.depth_image_logger: Dict[str, List[Any]] = {}
|
||||
self.seg_image_logger: Dict[str, List[Any]] = {}
|
||||
self.color_image_step_logger: Dict[str, List[Any]] = {}
|
||||
self.depth_image_step_logger: Dict[str, List[Any]] = {}
|
||||
self.seg_image_step_logger: Dict[str, List[Any]] = {}
|
||||
|
||||
def update_tpi_initial_info(self, tpi_initial_info):
|
||||
self.tpi_initial_info = tpi_initial_info
|
||||
@@ -67,17 +71,50 @@ class BaseLogger(ABC):
|
||||
self.scalar_data_logger[robot][key] = []
|
||||
self.scalar_data_logger[robot][key].append(value)
|
||||
|
||||
def add_color_image(self, robot, key, value):
|
||||
if robot not in self.color_image_logger:
|
||||
self.color_image_logger[robot] = {}
|
||||
if key not in self.color_image_logger[robot]:
|
||||
self.color_image_logger[robot][key] = []
|
||||
self.color_image_logger[robot][key].append(value)
|
||||
def _add_image_data(self, data_logger, step_logger, robot, key, value, step_idx=None):
|
||||
if robot not in data_logger:
|
||||
data_logger[robot] = {}
|
||||
if key not in data_logger[robot]:
|
||||
data_logger[robot][key] = []
|
||||
data_logger[robot][key].append(value)
|
||||
|
||||
# def add_depth_image(self, key, value):
|
||||
# if key not in self.depth_image_logger:
|
||||
# self.depth_image_logger[key] = []
|
||||
# self.depth_image_logger[key].append(value)
|
||||
if robot not in step_logger:
|
||||
step_logger[robot] = {}
|
||||
if key not in step_logger[robot]:
|
||||
step_logger[robot][key] = []
|
||||
if step_idx is None:
|
||||
step_idx = len(data_logger[robot][key]) - 1
|
||||
step_logger[robot][key].append(int(step_idx))
|
||||
|
||||
def add_color_image(self, robot, key, value, step_idx=None):
    """Log one color frame for *robot*/*key*, tracking which step it belongs to."""
    self._add_image_data(self.color_image_logger, self.color_image_step_logger, robot, key, value, step_idx=step_idx)
|
||||
|
||||
def add_depth_image(self, robot, key, value, step_idx=None):
    """Log one depth frame for *robot*/*key*, tracking which step it belongs to."""
    self._add_image_data(self.depth_image_logger, self.depth_image_step_logger, robot, key, value, step_idx=step_idx)
|
||||
|
||||
def add_seg_image(self, robot, key, value, step_idx=None):
    """Log one segmentation frame for *robot*/*key*, tracking its step index."""
    self._add_image_data(self.seg_image_logger, self.seg_image_step_logger, robot, key, value, step_idx=step_idx)
|
||||
|
||||
def clear(
|
||||
self,
|
||||
@@ -97,6 +134,10 @@ class BaseLogger(ABC):
|
||||
self.scalar_data_logger = {}
|
||||
self.color_image_logger = {}
|
||||
self.depth_image_logger = {}
|
||||
self.seg_image_logger = {}
|
||||
self.color_image_step_logger = {}
|
||||
self.depth_image_step_logger = {}
|
||||
self.seg_image_step_logger = {}
|
||||
|
||||
@abstractmethod
|
||||
def close(self):
|
||||
|
||||
@@ -13,6 +13,16 @@ from tqdm import tqdm
|
||||
|
||||
DEFAULT_RGB_SCALE_FACTOR = 256000.0
|
||||
|
||||
def float_array_to_uint16_png(float_array, scale=10000.0):
    """Quantize a float array (e.g. metric depth) to uint16 for PNG storage.

    NaN and +/-inf are mapped to 0, values are multiplied by ``scale``
    (default 10000, i.e. 0.1 mm resolution if the input is in meters),
    rounded, and clipped to the uint16 range [0, 65535].

    Args:
        float_array: Array-like of floats to quantize.
        scale: Multiplicative quantization factor; kept as a parameter so
            other resolutions can reuse this helper (default preserves the
            original hard-coded behavior).

    Returns:
        ``np.ndarray`` of dtype ``np.uint16`` with the same shape as the input.
    """
    array = np.nan_to_num(float_array, nan=0.0, posinf=0.0, neginf=0.0)
    array = np.round(array * scale)
    array = np.clip(array, 0, 65535)
    return array.astype(np.uint16)
|
||||
|
||||
def seg_array_to_uint16_png(seg_array):
    """Sanitize a segmentation-id array into uint16 for PNG storage.

    NaN and +/-inf become 0; values are clipped to [0, 65535] before the
    dtype cast (which truncates any fractional part).
    """
    sanitized = np.nan_to_num(seg_array, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(sanitized, 0, 65535).astype(np.uint16)
|
||||
|
||||
# pylint: disable=line-too-long,unused-argument
|
||||
class LmdbLogger(BaseLogger):
|
||||
@@ -63,6 +73,7 @@ class LmdbLogger(BaseLogger):
|
||||
meta_info["tpi_initial_info"] = self.tpi_initial_info
|
||||
meta_info["collect_info"] = self.collect_info
|
||||
meta_info["version"] = self.version
|
||||
meta_info["image_valid_step_ids"] = {}
|
||||
|
||||
# Lmdb
|
||||
log_path_lmdb = save_dir / "lmdb"
|
||||
@@ -139,13 +150,20 @@ class LmdbLogger(BaseLogger):
|
||||
|
||||
# Save color images
|
||||
if save_img:
|
||||
for key, value in self.color_image_logger[robot_name].items():
|
||||
for key, value in self.color_image_logger.get(robot_name, {}).items():
|
||||
root_img_path = save_dir / f"{key}"
|
||||
root_img_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
step_ids = self.color_image_step_logger.get(robot_name, {}).get(key, [])
|
||||
if len(step_ids) != len(value):
|
||||
step_ids = list(range(len(value)))
|
||||
else:
|
||||
step_ids = [int(x) for x in step_ids]
|
||||
meta_info["image_valid_step_ids"][key] = step_ids
|
||||
|
||||
meta_info["keys"][key] = []
|
||||
for i, image in enumerate(tqdm(value)):
|
||||
step_id = str(i).zfill(4)
|
||||
step_id = str(step_ids[i]).zfill(4)
|
||||
txn.put(
|
||||
f"{key}/{step_id}".encode("utf-8"),
|
||||
pickle.dumps(cv2.imencode(".jpg", image.astype(np.uint8))[1]),
|
||||
@@ -154,7 +172,49 @@ class LmdbLogger(BaseLogger):
|
||||
|
||||
imageio.mimsave(os.path.join(root_img_path, "demo.mp4"), value, fps=15)
|
||||
|
||||
meta_info["num_steps"] = len(value)
|
||||
for key, value in self.depth_image_logger.get(robot_name, {}).items():
|
||||
root_img_path = save_dir / f"{key}"
|
||||
root_img_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
step_ids = self.depth_image_step_logger.get(robot_name, {}).get(key, [])
|
||||
if len(step_ids) != len(value):
|
||||
step_ids = list(range(len(value)))
|
||||
else:
|
||||
step_ids = [int(x) for x in step_ids]
|
||||
meta_info["image_valid_step_ids"][key] = step_ids
|
||||
|
||||
meta_info["keys"][key] = []
|
||||
for i, image in enumerate(tqdm(value)):
|
||||
step_id = str(step_ids[i]).zfill(4)
|
||||
depth_image = float_array_to_uint16_png(np.asarray(image))
|
||||
txn.put(
|
||||
f"{key}/{step_id}".encode('utf-8'),
|
||||
pickle.dumps(cv2.imencode('.png', depth_image)[1])
|
||||
)
|
||||
meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8'))
|
||||
|
||||
for key, value in self.seg_image_logger.get(robot_name, {}).items():
|
||||
root_img_path = save_dir / f"{key}"
|
||||
root_img_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
step_ids = self.seg_image_step_logger.get(robot_name, {}).get(key, [])
|
||||
if len(step_ids) != len(value):
|
||||
step_ids = list(range(len(value)))
|
||||
else:
|
||||
step_ids = [int(x) for x in step_ids]
|
||||
meta_info["image_valid_step_ids"][key] = step_ids
|
||||
|
||||
meta_info["keys"][key] = []
|
||||
for i, image in enumerate(tqdm(value)):
|
||||
step_id = str(step_ids[i]).zfill(4)
|
||||
seg_image = seg_array_to_uint16_png(np.asarray(image))
|
||||
txn.put(
|
||||
f"{key}/{step_id}".encode('utf-8'),
|
||||
pickle.dumps(cv2.imencode('.png', seg_image)[1])
|
||||
)
|
||||
meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8'))
|
||||
|
||||
meta_info["num_steps"] = self.log_num_steps
|
||||
txn.commit()
|
||||
lmdb_env.close()
|
||||
pickle.dump(meta_info, open(os.path.join(save_dir, "meta_info.pkl"), "wb"))
|
||||
|
||||
128
workflows/simbox/core/utils/camera_utils.py
Normal file
128
workflows/simbox/core/utils/camera_utils.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import numpy as np
|
||||
from omni.isaac.core.utils.prims import get_prim_at_path
|
||||
from omni.isaac.core.utils.transformations import get_relative_transform
|
||||
from omni.isaac.sensor import Camera
|
||||
|
||||
|
||||
def _get_annotator(camera: Camera, annotator_name: str):
    """Return the named entry from the camera's custom annotator registry, or None."""
    registry = getattr(camera, "_custom_annotators", None)
    if isinstance(registry, dict):
        return registry.get(annotator_name)
    return None
|
||||
|
||||
|
||||
def _get_frame(frame):
|
||||
if isinstance(frame, np.ndarray) and frame.size > 0:
|
||||
return frame[:, :, :3]
|
||||
return None
|
||||
|
||||
|
||||
def _get_depth(depth):
|
||||
if isinstance(depth, np.ndarray) and depth.size > 0:
|
||||
return depth
|
||||
return None
|
||||
|
||||
|
||||
def _get_rgb_image(camera: Camera):
    """Fetch the current color frame according to the camera's output mode.

    Cameras without an ``output_mode`` attribute default to plain RGB.
    """
    output_mode = getattr(camera, "output_mode", "rgb")
    if output_mode == "rgb":
        return _get_frame(camera.get_rgba())
    if output_mode == "diffuse_albedo":
        annotator = _get_annotator(camera, "DiffuseAlbedo")
        return None if annotator is None else _get_frame(annotator.get_data())
    raise NotImplementedError(f"Unsupported output mode: {output_mode}")
|
||||
|
||||
|
||||
def _get_depth_image(camera: Camera):
    """Distance-to-image-plane data from the depth annotator, or None if absent."""
    annotator = _get_annotator(camera, "distance_to_image_plane")
    return None if annotator is None else _get_depth(annotator.get_data())
|
||||
|
||||
|
||||
def _get_object_mask(camera: Camera):
    """Semantic-segmentation result as ``{"mask", "id2labels"}``, or None.

    Returns None whenever the annotator is missing or its payload does not
    carry a non-empty mask plus an ``idToLabels`` mapping.
    """
    annotator = _get_annotator(camera, "semantic_segmentation")
    if annotator is None:
        return None
    annotation_data = annotator.get_data()
    if not isinstance(annotation_data, dict):
        return None
    info = annotation_data.get("info")
    if not isinstance(info, dict) or "idToLabels" not in info:
        return None
    mask = annotation_data.get("data")
    if not isinstance(mask, np.ndarray) or mask.size == 0:
        return None
    return {"mask": mask, "id2labels": info["idToLabels"]}
|
||||
|
||||
|
||||
def _get_bbox(camera: Camera, bbox_type: str):
    """Bounding-box payload as ``(data, idToLabels)``, or None if unavailable."""
    annotator = _get_annotator(camera, bbox_type)
    if annotator is None:
        return None
    annotation_data = annotator.get_data()
    if not isinstance(annotation_data, dict):
        return None
    if "data" not in annotation_data or "info" not in annotation_data:
        return None
    info = annotation_data["info"]
    if not isinstance(info, dict) or "idToLabels" not in info:
        return None
    return annotation_data["data"], info["idToLabels"]
|
||||
|
||||
|
||||
def _get_motion_vectors(camera: Camera):
    """Non-empty motion-vector array from the annotator, or None."""
    annotator = _get_annotator(camera, "motion_vectors")
    if annotator is None:
        return None
    data = annotator.get_data()
    return data if isinstance(data, np.ndarray) and data.size > 0 else None
|
||||
|
||||
|
||||
def _get_camera2env_pose(camera: Camera):
    """Camera pose relative to its root prim, or None if either path is unset."""
    prim_path = getattr(camera, "prim_path", None)
    root_prim_path = getattr(camera, "root_prim_path", None)
    if prim_path and root_prim_path:
        return get_relative_transform(get_prim_at_path(prim_path), get_prim_at_path(root_prim_path))
    return None
|
||||
|
||||
|
||||
def _get_camera_params(camera: Camera):
    """Intrinsic matrix as a plain list (ndarray converted), or None if unset."""
    matrix = getattr(camera, "is_camera_matrix", None)
    if matrix is None:
        return None
    return matrix.tolist() if isinstance(matrix, np.ndarray) else matrix
|
||||
|
||||
|
||||
def get_src(camera: Camera, data_type: str):
    """Unified accessor for camera outputs keyed by a short data-type name.

    Supported keys: rgb, depth, seg, bbox2d_tight, bbox2d_loose, bbox3d,
    motion_vectors, camera2env_pose, camera_params. Raises
    NotImplementedError for anything else.
    """
    simple_sources = {
        "rgb": _get_rgb_image,
        "depth": _get_depth_image,
        "seg": _get_object_mask,
        "motion_vectors": _get_motion_vectors,
        "camera2env_pose": _get_camera2env_pose,
        "camera_params": _get_camera_params,
    }
    if data_type in simple_sources:
        return simple_sources[data_type](camera)
    bbox_sources = {
        "bbox2d_tight": "bounding_box_2d_tight",
        "bbox2d_loose": "bounding_box_2d_loose",
        "bbox3d": "bounding_box_3d",
    }
    if data_type in bbox_sources:
        return _get_bbox(camera, bbox_sources[data_type])
    raise NotImplementedError(f"Unsupported source type: {data_type}")
|
||||
@@ -168,6 +168,8 @@ class SimBoxDualWorkFlow(NimbusWorkFlow):
|
||||
collect_info=self.task_cfg["data"]["collect_info"],
|
||||
version=self.task_cfg["data"].get("version", "v1.0"),
|
||||
)
|
||||
# Motion vectors are large dense tensors; keep LMDB logging opt-in.
|
||||
self.log_motion_vectors = bool(self.task_cfg["data"].get("log_motion_vectors", False))
|
||||
|
||||
if self.random_seed is not None:
|
||||
seed = self.random_seed
|
||||
@@ -530,16 +532,71 @@ class SimBoxDualWorkFlow(NimbusWorkFlow):
|
||||
for key, value in self.task.cameras.items():
|
||||
for robot_name, _ in self.task.robots.items():
|
||||
if robot_name in key:
|
||||
rgb_img = value.get_observations()["color_image"]
|
||||
camera_obs = value.get_observations()
|
||||
rgb_img = camera_obs["color_image"]
|
||||
# Special processing if enabled
|
||||
camera2env_pose = value.get_observations()["camera2env_pose"]
|
||||
camera2env_pose = camera_obs["camera2env_pose"]
|
||||
save_camera_name = key.replace(f"{robot_name}_", "")
|
||||
self.logger.add_color_image(robot_name, "images.rgb." + save_camera_name, rgb_img)
|
||||
self.logger.add_scalar_data(robot_name, "camera2env_pose." + save_camera_name, camera2env_pose)
|
||||
self.logger.add_color_image(
|
||||
robot_name, "images.rgb." + save_camera_name, rgb_img, step_idx=step_idx
|
||||
)
|
||||
if "depth_image" in camera_obs:
|
||||
depth_image = camera_obs["depth_image"]
|
||||
depth_img = np.nan_to_num(depth_img, nan=0.0, posinf=0.0, neginf=0.0)
|
||||
self.logger.add_depth_image(
|
||||
robot_name, "images.depth." + save_camera_name, depth_img, step_idx=step_idx
|
||||
)
|
||||
if "semantic_mask" in camera_obs:
|
||||
self.logger.add_seg_image(
|
||||
robot_name, "images.seg." + save_camera_name, seg_mask, step_idx=step_idx
|
||||
)
|
||||
if "semantic_mask_id2labels" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name,
|
||||
"labels.seg." + save_camera_name,
|
||||
camera_obs["semantic_mask_id2labels"],
|
||||
)
|
||||
if "bbox2d_tight" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name, "labels.bbox2d_tight." + save_camera_name, camera_obs["bbox2d_tight"]
|
||||
)
|
||||
if "bbox2d_tight_id2labels" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name,
|
||||
"labels.bbox2d_tight_id2labels." + save_camera_name,
|
||||
camera_obs["bbox2d_tight_id2labels"],
|
||||
)
|
||||
if "bbox2d_loose" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name, "labels.bbox2d_loose." + save_camera_name, camera_obs["bbox2d_loose"]
|
||||
)
|
||||
if "bbox2d_loose_id2labels" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name,
|
||||
"labels.bbox2d_loose_id2labels." + save_camera_name,
|
||||
camera_obs["bbox2d_loose_id2labels"],
|
||||
)
|
||||
if "bbox3d" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name, "labels.bbox3d." + save_camera_name, camera_obs["bbox3d"]
|
||||
)
|
||||
if "bbox3d_id2labels" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name,
|
||||
"labels.bbox3d_id2labels." + save_camera_name,
|
||||
camera_obs["bbox3d_id2labels"],
|
||||
)
|
||||
if self.log_motion_vectors and "motion_vectors" in camera_obs:
|
||||
self.logger.add_scalar_data(
|
||||
robot_name, "labels.motion_vectors." + save_camera_name, camera_obs["motion_vectors"]
|
||||
)
|
||||
self.logger.add_scalar_data(
|
||||
robot_name, "camera2env_pose." + save_camera_name, camera2env_pose
|
||||
)
|
||||
if step_idx == 0:
|
||||
save_camera_name = key.replace(f"{robot_name}_", "")
|
||||
self.logger.add_json_data(
|
||||
robot_name, f"{save_camera_name}_camera_params", value.get_observations()["camera_params"]
|
||||
robot_name, f"{save_camera_name}_camera_params", camera_obs["camera_params"]
|
||||
)
|
||||
|
||||
# depth_img = get_src(value, "depth")
|
||||
|
||||
Reference in New Issue
Block a user