rich annotations & update open-pi fsdp explanations

This commit is contained in:
Leon998
2026-03-18 13:59:52 +08:00
parent 814f3c3526
commit 4934c4794e
11 changed files with 349 additions and 32 deletions

3
.gitignore vendored
View File

@@ -17,4 +17,5 @@ polygons.png
_isaac_sim_410
InterDataEngine-docs
debug.sh
debug.yaml
debug.yaml
depre

View File

@@ -24,7 +24,7 @@
InternDataEngine is a synthetic data generation engine for embodied AI that powers large-scale model training and iteration. Built on NVIDIA Isaac Sim, it unifies high-fidelity physical interaction from InternData-A1, semantic task and scene generation from InternData-M1, and high-throughput scheduling from the Nimbus framework to deliver realistic, task-aligned, and massively scalable robotic manipulation data.
- **More realistic physical interaction**: Unified simulation of rigid, articulated, deformable, and fluid objects across single-arm, dual-arm, and humanoid robots, enabling long-horizon, skill-composed manipulation that better supports sim-to-real transfer.
- **More task-aligned data generation**: LLM-driven task and instruction generation with task-oriented scene graphs (ToSG), producing structured scenes and rich multi-modal annotations (boxes, keypoints, trajectories) for complex instruction-following and spatial reasoning.
- **More diverse data generation**: By leveraging the internal state of the simulation engine to extract high-quality ground truth, coupled with multi-dimensional domain randomization (e.g., layout, texture, structure, and lighting), the data distribution is significantly expanded. This approach produces precise and diverse operational data, while simultaneously exporting rich multimodal annotations such as bounding boxes, segmentation masks, and keypoints.
- **More efficient large-scale production**: Nimbus-powered asynchronous pipelines that decouple planning, rendering, and storage, achieving 23× end-to-end throughput, with cluster-level load balancing and fault tolerance for billion-scale data generation.
## 🔥 Latest News

View File

@@ -40,7 +40,7 @@ python scripts/download_paligemma.py
You may adjust other training parameters based on your available GPUs and training budget:
- `num_train_steps`: Total number of training steps
- `num_workers`: Number of data loading workers
- `fsdp_devices`: Number of GPUs per node
- `fsdp_devices`: Number of GPUs per node used for FSDP (Fully Sharded Data Parallel), which shards model parameters, gradients, and optimizer states across devices to reduce per-GPU memory usage
- `batch_size`: Batch size per GPU
- `save_interval`: Checkpoint saving interval (in steps)

View File

@@ -202,8 +202,6 @@ python scripts/train_jax_multinode.py \
pretrain-interndata-a1 \
--exp-name=pretrain-interndata-a1 \
--num_workers=12 \
--fsdp_devices=8 \
--batch_size=512 \
--num_train_steps=2000000 \
--save_interval=5000

View File

@@ -1814,7 +1814,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=2_000_000,
num_workers=12,
fsdp_devices=8,
batch_size=512,
save_interval=5000,
lr_schedule=_optimizer.WarmupConstantSchedule(),
@@ -1844,7 +1843,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=30_000,
num_workers=32,
fsdp_devices=8,
batch_size=128,
save_interval=5000,
),
@@ -1872,7 +1870,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=30_000,
num_workers=32,
fsdp_devices=8,
batch_size=128,
save_interval=5000,
),
@@ -1901,4 +1898,4 @@ def check_lerobot_repo(repo_dir: str):
if os.path.isdir(os.path.join(repo_dir, "data")) and os.path.isdir(os.path.join(repo_dir, "meta")) and os.path.isdir(os.path.join(repo_dir, "videos")):
return True
else:
return False
return False

View File

@@ -41,7 +41,7 @@ info() { echo -e "\033[32m[INFO]\033[0m $*"; }
download() {
info "Downloading $2 ..."
huggingface-cli download "$REPO_ID" --repo-type "$REPO_TYPE" --include "$1" --local-dir "$LOCAL_DIR"
hf download "$REPO_ID" --repo-type "$REPO_TYPE" --include "$1" --local-dir "$LOCAL_DIR"
}
# --- Scene assets: required (both modes) ---

View File

@@ -1,6 +1,7 @@
import numpy as np
import omni.replicator.core as rep
from core.cameras.base_camera import register_camera
from core.utils.camera_utils import get_src
from omni.isaac.core.prims import XFormPrim
from omni.isaac.core.utils.prims import get_prim_at_path
from omni.isaac.core.utils.transformations import (
@@ -40,9 +41,25 @@ class CustomCamera(Camera):
**kwargs,
)
self.initialize()
self.add_motion_vectors_to_frame()
self.add_semantic_segmentation_to_frame()
self.add_distance_to_image_plane_to_frame()
self.with_distance = cfg["params"].get("with_distance", True)
self.with_semantic = cfg["params"].get("with_semantic", False)
self.with_bbox2d = cfg["params"].get("with_bbox2d", False)
self.with_bbox3d = cfg["params"].get("with_bbox3d", False)
# Motion vectors are high-volume outputs; keep default off unless explicitly enabled in config.
self.with_motion_vector = cfg["params"].get("with_motion_vector", False)
self.with_depth = cfg["params"].get("depth", False)
if self.with_distance:
self.add_distance_to_image_plane_to_frame()
if self.with_semantic:
self.add_semantic_segmentation_to_frame()
if self.with_bbox2d:
self.add_bounding_box_2d_tight_to_frame()
self.add_bounding_box_2d_loose_to_frame()
if self.with_bbox3d:
self.add_bounding_box_3d_to_frame()
if self.with_motion_vector:
self.add_motion_vectors_to_frame()
# ===== From cfg =====
pixel_size = cfg["params"].get("pixel_size")
@@ -155,9 +172,27 @@ class CustomCamera(Camera):
obs = {
"color_image": color_image,
"depth_image": self.get_depth(),
"camera2env_pose": camera2env_pose,
"camera_params": self.is_camera_matrix.tolist(),
}
if self.with_depth:
obs["depth_image"] = get_src(self, "depth"),
seg_data = get_src(self, "seg")
if seg_data is not None:
obs["semantic_mask"] = seg_data["mask"]
obs["semantic_mask_id2labels"] = seg_data["id2labels"]
bbox2d_tight = get_src(self, "bbox2d_tight")
if bbox2d_tight is not None:
obs["bbox2d_tight"], obs["bbox2d_tight_id2labels"] = bbox2d_tight
bbox2d_loose = get_src(self, "bbox2d_loose")
if bbox2d_loose is not None:
obs["bbox2d_loose"], obs["bbox2d_loose_id2labels"] = bbox2d_loose
bbox3d = get_src(self, "bbox3d")
if bbox3d is not None:
obs["bbox3d"], obs["bbox3d_id2labels"] = bbox3d
motion_vectors = get_src(self, "motion_vectors")
if motion_vectors is not None:
obs["motion_vectors"] = motion_vectors
return obs

View File

@@ -27,6 +27,10 @@ class BaseLogger(ABC):
self.object_data_logger: Dict[str, List[Any]] = {}
self.color_image_logger: Dict[str, List[Any]] = {}
self.depth_image_logger: Dict[str, List[Any]] = {}
self.seg_image_logger: Dict[str, List[Any]] = {}
self.color_image_step_logger: Dict[str, List[Any]] = {}
self.depth_image_step_logger: Dict[str, List[Any]] = {}
self.seg_image_step_logger: Dict[str, List[Any]] = {}
def update_tpi_initial_info(self, tpi_initial_info):
self.tpi_initial_info = tpi_initial_info
@@ -67,17 +71,50 @@ class BaseLogger(ABC):
self.scalar_data_logger[robot][key] = []
self.scalar_data_logger[robot][key].append(value)
def add_color_image(self, robot, key, value):
if robot not in self.color_image_logger:
self.color_image_logger[robot] = {}
if key not in self.color_image_logger[robot]:
self.color_image_logger[robot][key] = []
self.color_image_logger[robot][key].append(value)
def _add_image_data(self, data_logger, step_logger, robot, key, value, step_idx=None):
if robot not in data_logger:
data_logger[robot] = {}
if key not in data_logger[robot]:
data_logger[robot][key] = []
data_logger[robot][key].append(value)
# def add_depth_image(self, key, value):
# if key not in self.depth_image_logger:
# self.depth_image_logger[key] = []
# self.depth_image_logger[key].append(value)
if robot not in step_logger:
step_logger[robot] = {}
if key not in step_logger[robot]:
step_logger[robot][key] = []
if step_idx is None:
step_idx = len(data_logger[robot][key]) - 1
step_logger[robot][key].append(int(step_idx))
def add_color_image(self, robot, key, value, step_idx=None):
self._add_image_data(
self.color_image_logger,
self.color_image_step_logger,
robot,
key,
value,
step_idx=step_idx,
)
def add_depth_image(self, robot, key, value, step_idx=None):
self._add_image_data(
self.depth_image_logger,
self.depth_image_step_logger,
robot,
key,
value,
step_idx=step_idx,
)
def add_seg_image(self, robot, key, value, step_idx=None):
self._add_image_data(
self.seg_image_logger,
self.seg_image_step_logger,
robot,
key,
value,
step_idx=step_idx,
)
def clear(
self,
@@ -97,6 +134,10 @@ class BaseLogger(ABC):
self.scalar_data_logger = {}
self.color_image_logger = {}
self.depth_image_logger = {}
self.seg_image_logger = {}
self.color_image_step_logger = {}
self.depth_image_step_logger = {}
self.seg_image_step_logger = {}
@abstractmethod
def close(self):

View File

@@ -13,6 +13,16 @@ from tqdm import tqdm
DEFAULT_RGB_SCALE_FACTOR = 256000.0
def float_array_to_uint16_png(float_array, scale=10000.0):
    """Quantize a float array (depth map) to uint16 for lossless PNG storage.

    Non-finite values (NaN / +-inf) are mapped to 0, values are multiplied by
    ``scale`` and rounded, then clipped into the uint16 range [0, 65535].

    With the default scale of 1e4 this gives 0.1 units of resolution and a
    representable maximum of 6.5535 units (presumably meters for metric
    depth — confirm against the depth source).

    Args:
        float_array: Array-like of floats to quantize.
        scale: Multiplier applied before rounding (default 10000.0, kept for
            backward compatibility with the previous hard-coded factor).

    Returns:
        np.ndarray of dtype uint16, same shape as the input.
    """
    array = np.nan_to_num(float_array, nan=0.0, posinf=0.0, neginf=0.0)
    # Round before clipping so values just above 65535/scale saturate cleanly.
    array = np.round(array * scale)
    array = np.clip(array, 0, 65535)
    return array.astype(np.uint16)
def seg_array_to_uint16_png(seg_array):
    """Sanitize a segmentation-id array for storage as a 16-bit PNG.

    Replaces NaN / +-inf entries with 0 and clamps all ids into the
    uint16 range [0, 65535] before casting.
    """
    cleaned = np.nan_to_num(seg_array, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(cleaned, 0, 65535).astype(np.uint16)
# pylint: disable=line-too-long,unused-argument
class LmdbLogger(BaseLogger):
@@ -63,6 +73,7 @@ class LmdbLogger(BaseLogger):
meta_info["tpi_initial_info"] = self.tpi_initial_info
meta_info["collect_info"] = self.collect_info
meta_info["version"] = self.version
meta_info["image_valid_step_ids"] = {}
# Lmdb
log_path_lmdb = save_dir / "lmdb"
@@ -139,13 +150,20 @@ class LmdbLogger(BaseLogger):
# Save color images
if save_img:
for key, value in self.color_image_logger[robot_name].items():
for key, value in self.color_image_logger.get(robot_name, {}).items():
root_img_path = save_dir / f"{key}"
root_img_path.mkdir(parents=True, exist_ok=True)
step_ids = self.color_image_step_logger.get(robot_name, {}).get(key, [])
if len(step_ids) != len(value):
step_ids = list(range(len(value)))
else:
step_ids = [int(x) for x in step_ids]
meta_info["image_valid_step_ids"][key] = step_ids
meta_info["keys"][key] = []
for i, image in enumerate(tqdm(value)):
step_id = str(i).zfill(4)
step_id = str(step_ids[i]).zfill(4)
txn.put(
f"{key}/{step_id}".encode("utf-8"),
pickle.dumps(cv2.imencode(".jpg", image.astype(np.uint8))[1]),
@@ -154,7 +172,49 @@ class LmdbLogger(BaseLogger):
imageio.mimsave(os.path.join(root_img_path, "demo.mp4"), value, fps=15)
meta_info["num_steps"] = len(value)
for key, value in self.depth_image_logger.get(robot_name, {}).items():
root_img_path = save_dir / f"{key}"
root_img_path.mkdir(parents=True, exist_ok=True)
step_ids = self.depth_image_step_logger.get(robot_name, {}).get(key, [])
if len(step_ids) != len(value):
step_ids = list(range(len(value)))
else:
step_ids = [int(x) for x in step_ids]
meta_info["image_valid_step_ids"][key] = step_ids
meta_info["keys"][key] = []
for i, image in enumerate(tqdm(value)):
step_id = str(step_ids[i]).zfill(4)
depth_image = float_array_to_uint16_png(np.asarray(image))
txn.put(
f"{key}/{step_id}".encode('utf-8'),
pickle.dumps(cv2.imencode('.png', depth_image)[1])
)
meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8'))
for key, value in self.seg_image_logger.get(robot_name, {}).items():
root_img_path = save_dir / f"{key}"
root_img_path.mkdir(parents=True, exist_ok=True)
step_ids = self.seg_image_step_logger.get(robot_name, {}).get(key, [])
if len(step_ids) != len(value):
step_ids = list(range(len(value)))
else:
step_ids = [int(x) for x in step_ids]
meta_info["image_valid_step_ids"][key] = step_ids
meta_info["keys"][key] = []
for i, image in enumerate(tqdm(value)):
step_id = str(step_ids[i]).zfill(4)
seg_image = seg_array_to_uint16_png(np.asarray(image))
txn.put(
f"{key}/{step_id}".encode('utf-8'),
pickle.dumps(cv2.imencode('.png', seg_image)[1])
)
meta_info["keys"][key].append(f"{key}/{step_id}".encode('utf-8'))
meta_info["num_steps"] = self.log_num_steps
txn.commit()
lmdb_env.close()
pickle.dump(meta_info, open(os.path.join(save_dir, "meta_info.pkl"), "wb"))

View File

@@ -0,0 +1,128 @@
import numpy as np
from omni.isaac.core.utils.prims import get_prim_at_path
from omni.isaac.core.utils.transformations import get_relative_transform
from omni.isaac.sensor import Camera
def _get_annotator(camera: Camera, annotator_name: str):
    """Look up a named annotator registered on the camera.

    Returns None when the camera has no ``_custom_annotators`` dict or the
    requested annotator is not registered in it.
    """
    registry = getattr(camera, "_custom_annotators", None)
    if isinstance(registry, dict):
        return registry.get(annotator_name)
    return None
def _get_frame(frame):
if isinstance(frame, np.ndarray) and frame.size > 0:
return frame[:, :, :3]
return None
def _get_depth(depth):
if isinstance(depth, np.ndarray) and depth.size > 0:
return depth
return None
def _get_rgb_image(camera: Camera):
    """Fetch the camera's color frame according to its output mode.

    "rgb" reads the live RGBA buffer via ``get_rgba``; "diffuse_albedo"
    reads the DiffuseAlbedo annotator. Cameras without an ``output_mode``
    attribute default to "rgb". Any other mode raises NotImplementedError.
    """
    output_mode = getattr(camera, "output_mode", "rgb")
    if output_mode == "rgb":
        return _get_frame(camera.get_rgba())
    if output_mode == "diffuse_albedo":
        annotator = _get_annotator(camera, "DiffuseAlbedo")
        return None if annotator is None else _get_frame(annotator.get_data())
    raise NotImplementedError(f"Unsupported output mode: {output_mode}")
def _get_depth_image(camera: Camera):
    """Read the distance_to_image_plane annotator as a depth map, or None."""
    annotator = _get_annotator(camera, "distance_to_image_plane")
    return None if annotator is None else _get_depth(annotator.get_data())
def _get_object_mask(camera: Camera):
    """Return {"mask", "id2labels"} from the semantic segmentation annotator.

    Returns None when the annotator is missing, the payload is not a dict
    with "data" and "info" keys, "info" lacks "idToLabels", or the mask
    array is empty.
    """
    annotator = _get_annotator(camera, "semantic_segmentation")
    if annotator is None:
        return None
    payload = annotator.get_data()
    if not isinstance(payload, dict):
        return None
    if "data" not in payload or "info" not in payload:
        return None
    info = payload["info"]
    if not isinstance(info, dict) or "idToLabels" not in info:
        return None
    mask = payload["data"]
    if not (isinstance(mask, np.ndarray) and mask.size > 0):
        return None
    return {"mask": mask, "id2labels": info["idToLabels"]}
def _get_bbox(camera: Camera, bbox_type: str):
    """Return (bbox_data, id2labels) from the named bounding-box annotator.

    Returns None when the annotator is absent or the payload does not have
    the expected data/info/idToLabels structure.
    """
    annotator = _get_annotator(camera, bbox_type)
    if annotator is None:
        return None
    payload = annotator.get_data()
    if not isinstance(payload, dict):
        return None
    if "data" not in payload or "info" not in payload:
        return None
    info = payload["info"]
    if isinstance(info, dict) and "idToLabels" in info:
        return payload["data"], info["idToLabels"]
    return None
def _get_motion_vectors(camera: Camera):
    """Return the motion-vector buffer when present and non-empty, else None."""
    annotator = _get_annotator(camera, "motion_vectors")
    if annotator is None:
        return None
    data = annotator.get_data()
    if isinstance(data, np.ndarray) and data.size > 0:
        return data
    return None
def _get_camera2env_pose(camera: Camera):
    """Compute the camera prim's transform relative to its root prim.

    Returns None when either ``prim_path`` or ``root_prim_path`` is
    missing or empty on the camera.
    """
    prim_path = getattr(camera, "prim_path", None)
    root_path = getattr(camera, "root_prim_path", None)
    if prim_path and root_path:
        return get_relative_transform(
            get_prim_at_path(prim_path), get_prim_at_path(root_path)
        )
    return None
def _get_camera_params(camera: Camera):
    """Return the camera's intrinsics matrix, as a nested list if an ndarray.

    Non-ndarray values are returned unchanged; a missing attribute or None
    value yields None.
    """
    matrix = getattr(camera, "is_camera_matrix", None)
    if matrix is None:
        return None
    return matrix.tolist() if isinstance(matrix, np.ndarray) else matrix
def get_src(camera: Camera, data_type: str):
    """Fetch one observation modality from the camera.

    Supported data_type values: "rgb", "depth", "seg", "bbox2d_tight",
    "bbox2d_loose", "bbox3d", "motion_vectors", "camera2env_pose",
    "camera_params". Raises NotImplementedError for anything else.
    """
    # Dispatch table instead of an if-chain; each entry maps a modality
    # name to the helper that extracts it from the camera.
    dispatch = {
        "rgb": _get_rgb_image,
        "depth": _get_depth_image,
        "seg": _get_object_mask,
        "bbox2d_tight": lambda cam: _get_bbox(cam, "bounding_box_2d_tight"),
        "bbox2d_loose": lambda cam: _get_bbox(cam, "bounding_box_2d_loose"),
        "bbox3d": lambda cam: _get_bbox(cam, "bounding_box_3d"),
        "motion_vectors": _get_motion_vectors,
        "camera2env_pose": _get_camera2env_pose,
        "camera_params": _get_camera_params,
    }
    fetcher = dispatch.get(data_type)
    if fetcher is None:
        raise NotImplementedError(f"Unsupported source type: {data_type}")
    return fetcher(camera)

View File

@@ -168,6 +168,8 @@ class SimBoxDualWorkFlow(NimbusWorkFlow):
collect_info=self.task_cfg["data"]["collect_info"],
version=self.task_cfg["data"].get("version", "v1.0"),
)
# Motion vectors are large dense tensors; keep LMDB logging opt-in.
self.log_motion_vectors = bool(self.task_cfg["data"].get("log_motion_vectors", False))
if self.random_seed is not None:
seed = self.random_seed
@@ -530,16 +532,71 @@ class SimBoxDualWorkFlow(NimbusWorkFlow):
for key, value in self.task.cameras.items():
for robot_name, _ in self.task.robots.items():
if robot_name in key:
rgb_img = value.get_observations()["color_image"]
camera_obs = value.get_observations()
rgb_img = camera_obs["color_image"]
# Special processing if enabled
camera2env_pose = value.get_observations()["camera2env_pose"]
camera2env_pose = camera_obs["camera2env_pose"]
save_camera_name = key.replace(f"{robot_name}_", "")
self.logger.add_color_image(robot_name, "images.rgb." + save_camera_name, rgb_img)
self.logger.add_scalar_data(robot_name, "camera2env_pose." + save_camera_name, camera2env_pose)
self.logger.add_color_image(
robot_name, "images.rgb." + save_camera_name, rgb_img, step_idx=step_idx
)
if "depth_image" in camera_obs:
depth_image = camera_obs["depth_image"]
depth_img = np.nan_to_num(depth_img, nan=0.0, posinf=0.0, neginf=0.0)
self.logger.add_depth_image(
robot_name, "images.depth." + save_camera_name, depth_img, step_idx=step_idx
)
if "semantic_mask" in camera_obs:
self.logger.add_seg_image(
robot_name, "images.seg." + save_camera_name, seg_mask, step_idx=step_idx
)
if "semantic_mask_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.seg." + save_camera_name,
camera_obs["semantic_mask_id2labels"],
)
if "bbox2d_tight" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.bbox2d_tight." + save_camera_name, camera_obs["bbox2d_tight"]
)
if "bbox2d_tight_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.bbox2d_tight_id2labels." + save_camera_name,
camera_obs["bbox2d_tight_id2labels"],
)
if "bbox2d_loose" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.bbox2d_loose." + save_camera_name, camera_obs["bbox2d_loose"]
)
if "bbox2d_loose_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.bbox2d_loose_id2labels." + save_camera_name,
camera_obs["bbox2d_loose_id2labels"],
)
if "bbox3d" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.bbox3d." + save_camera_name, camera_obs["bbox3d"]
)
if "bbox3d_id2labels" in camera_obs:
self.logger.add_scalar_data(
robot_name,
"labels.bbox3d_id2labels." + save_camera_name,
camera_obs["bbox3d_id2labels"],
)
if self.log_motion_vectors and "motion_vectors" in camera_obs:
self.logger.add_scalar_data(
robot_name, "labels.motion_vectors." + save_camera_name, camera_obs["motion_vectors"]
)
self.logger.add_scalar_data(
robot_name, "camera2env_pose." + save_camera_name, camera2env_pose
)
if step_idx == 0:
save_camera_name = key.replace(f"{robot_name}_", "")
self.logger.add_json_data(
robot_name, f"{save_camera_name}_camera_params", value.get_observations()["camera_params"]
robot_name, f"{save_camera_name}_camera_params", camera_obs["camera_params"]
)
# depth_img = get_src(value, "depth")