feat(policies): add Nvidia Gr00t N1.5 model (#2292)

* feat(policies): add Nvidia Gr00t N1.5 model

Co-authored-by: lbenhorin <lbenhorin@nvidia.com>
Co-authored-by: Aravindh <aravindhs@nvidia.com>
Co-authored-by: nv-sachdevkartik <ksachdev@nvidia.com>
Co-authored-by: youliangt <youliangt@nvidia.com>
Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>
Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com>
Co-authored-by: Jade Choghari <chogharijade@gmail.com>

* fix(docs): add groot to index

Co-authored-by: sachdevkartik <sachdev.kartik25@gmail.com>

---------

Co-authored-by: lbenhorin <lbenhorin@nvidia.com>
Co-authored-by: Aravindh <aravindhs@nvidia.com>
Co-authored-by: nv-sachdevkartik <ksachdev@nvidia.com>
Co-authored-by: youliangt <youliangt@nvidia.com>
Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>
Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com>
Co-authored-by: Jade Choghari <chogharijade@gmail.com>
Co-authored-by: sachdevkartik <sachdev.kartik25@gmail.com>
This commit is contained in:
Steven Palma
2025-10-23 13:50:30 +02:00
committed by GitHub
parent 306429a85b
commit be46bdea8f
26 changed files with 4766 additions and 6 deletions

View File

@@ -0,0 +1,207 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test script for LeRobot's Groot policy forward and inference passes."""
import gc
import os
from copy import deepcopy
from typing import Any
import numpy as np
import pytest
import torch
from lerobot.policies.groot.configuration_groot import GrootConfig
from lerobot.policies.groot.modeling_groot import GrootPolicy
from lerobot.policies.groot.processor_groot import make_groot_pre_post_processors
from lerobot.processor import PolicyAction, PolicyProcessorPipeline
from lerobot.utils.utils import auto_select_torch_device
from tests.utils import require_cuda # noqa: E402
pytest.importorskip("transformers")
pytestmark = pytest.mark.skipif(
os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
reason="This test requires local Groot installation and is not meant for CI",
)
# Define constants for dummy data
DUMMY_STATE_DIM = 44
DUMMY_ACTION_DIM = 44
DUMMY_ACTION_HORIZON = 16
IMAGE_SIZE = 256
DEVICE = auto_select_torch_device()
MODEL_PATH = "aractingi/bimanual-handover-groot-10k"
def cleanup_memory():
"""Clean up GPU/MPS memory to prevent OOM errors between tests."""
print("\nCleaning up memory...")
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
print("Memory cleanup complete.")
def set_seed_all(seed: int):
"""Set random seed for all RNG sources to ensure reproducibility."""
import random
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Set deterministic behavior
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True, warn_only=True)
def instantiate_lerobot_groot(
from_pretrained: bool = False,
model_path: str = MODEL_PATH,
) -> tuple[
GrootPolicy,
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
PolicyProcessorPipeline[PolicyAction, PolicyAction],
]:
"""Instantiate LeRobot Groot policy with preprocessor and postprocessor."""
if from_pretrained:
policy = GrootPolicy.from_pretrained(
pretrained_name_or_path=model_path,
strict=False,
)
policy.config.embodiment_tag = "gr1"
else:
config = GrootConfig(
base_model_path=model_path,
n_action_steps=DUMMY_ACTION_HORIZON,
chunk_size=DUMMY_ACTION_HORIZON,
image_size=[IMAGE_SIZE, IMAGE_SIZE],
device=DEVICE,
embodiment_tag="gr1",
)
policy = GrootPolicy(config)
policy.to(DEVICE)
policy.config.device = DEVICE
preprocessor, postprocessor = make_groot_pre_post_processors(
config=policy.config,
dataset_stats=None, # Pass None for dataset_stats to disable normalization (original GR00T doesn't normalize)
)
return (policy, preprocessor, postprocessor)
def create_dummy_data(device=DEVICE):
"""Create a dummy data batch for testing."""
batch_size = 2
prompt = "Pick up the red cube and place it in the bin"
state = torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device)
batch = {
"observation.state": state,
"action": torch.randn(
batch_size,
DUMMY_ACTION_HORIZON,
DUMMY_ACTION_DIM,
dtype=torch.float32,
device=device, # Action ground truth (for training)
),
"observation.images.ego_view": torch.rand(
batch_size,
3,
IMAGE_SIZE,
IMAGE_SIZE,
dtype=torch.float32,
device=device, # Images in [0, 1] range as expected by LeRobot
),
"task": [prompt for _ in range(batch_size)],
}
return batch
@require_cuda
def test_lerobot_groot_inference():
"""Test the inference pass (select_action) of LeRobot's Groot policy."""
print("Test: LeRobot Groot Inference Pass")
set_seed_all(42)
# Instantiate policy and processors
lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_groot(
from_pretrained=True
)
batch = create_dummy_data()
print("\n[LeRobot] Running inference...")
lerobot_policy.eval()
batch_lerobot_processed = lerobot_preprocessor(deepcopy(batch))
# Ensure identical RNG state before inference
torch.manual_seed(42)
with torch.no_grad():
lerobot_action = lerobot_policy.select_action(batch_lerobot_processed)
print(f"\nInference successful. Output action shape: {lerobot_action.shape}")
print("Output actions (first 5 dims):")
print(lerobot_action[:, :5])
lerobot_action = lerobot_postprocessor(lerobot_action)
del lerobot_policy, lerobot_preprocessor, lerobot_postprocessor, batch
cleanup_memory()
@require_cuda
def test_lerobot_groot_forward_pass():
"""Test the forward pass of LeRobot's Groot policy."""
print("\n" + "=" * 50)
print("Test: LeRobot Groot Forward Pass (Training Mode)")
set_seed_all(42)
# Instantiate policy and processors
lerobot_policy, lerobot_preprocessor, _ = instantiate_lerobot_groot(from_pretrained=True)
batch = create_dummy_data()
lerobot_policy.eval()
print("\n[LeRobot] Running forward pass...")
batch_lerobot_processed = lerobot_preprocessor(deepcopy(batch))
set_seed_all(42)
with torch.no_grad():
lerobot_loss, lerobot_metrics = lerobot_policy.forward(batch_lerobot_processed)
print("\nForward pass successful.")
print(f" - Loss: {lerobot_loss.item():.6f}")
print(f" - Metrics: {lerobot_metrics}")
del lerobot_policy, lerobot_preprocessor, batch
cleanup_memory()

View File

@@ -0,0 +1,443 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test script to verify Groot policy integration with LeRobot vs the original implementation, only meant to be run locally!"""
import gc
import os
from copy import deepcopy
from typing import Any
import numpy as np
import pytest
import torch
from lerobot.policies.groot.configuration_groot import GrootConfig
from lerobot.policies.groot.modeling_groot import GrootPolicy
from lerobot.policies.groot.processor_groot import make_groot_pre_post_processors
from lerobot.processor import PolicyAction, PolicyProcessorPipeline
pytest.importorskip("gr00t")
pytest.importorskip("transformers")
pytestmark = pytest.mark.skipif(
os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
reason="This test requires local Groot installation and is not meant for CI",
)
from gr00t.data.dataset import ModalityConfig # noqa: E402
from gr00t.data.embodiment_tags import EmbodimentTag # noqa: E402
from gr00t.data.transform.base import ComposedModalityTransform # noqa: E402
from gr00t.model.policy import Gr00tPolicy # noqa: E402
# GR1 humanoid dimensions (from pretrained model metadata)
# The actual GR1 robot has 44 dimensions for both state and action
# GR00TTransform will pad state to 64 and truncate action to 32
DUMMY_STATE_DIM = 44
DUMMY_ACTION_DIM = 44
DUMMY_ACTION_HORIZON = 16
IMAGE_SIZE = 256
DEVICE = "cpu"
MODEL_PATH = "nvidia/GR00T-N1.5-3B"
GR1_BODY_PARTS = {
"left_arm": 7,
"left_hand": 6,
"left_leg": 6,
"neck": 3,
"right_arm": 7,
"right_hand": 6,
"right_leg": 6,
"waist": 3,
}
def cleanup_memory():
"""Clean up GPU/MPS memory to prevent OOM errors between tests."""
print("\nCleaning up memory...")
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
print("Memory cleanup complete.")
def set_seed_all(seed: int):
"""Set random seed for all RNG sources to ensure reproducibility."""
import random
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Set deterministic behavior
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True, warn_only=True)
def instantiate_lerobot_groot(
from_pretrained: bool = False,
model_path: str = MODEL_PATH,
) -> tuple[
GrootPolicy,
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
PolicyProcessorPipeline[PolicyAction, PolicyAction],
]:
"""Instantiate LeRobot Groot policy with preprocessor and postprocessor."""
if from_pretrained:
policy = GrootPolicy.from_pretrained(
pretrained_name_or_path=model_path,
strict=False,
)
policy.config.embodiment_tag = "gr1"
else:
config = GrootConfig(
base_model_path=model_path,
n_action_steps=DUMMY_ACTION_HORIZON,
chunk_size=DUMMY_ACTION_HORIZON,
image_size=[IMAGE_SIZE, IMAGE_SIZE],
device=DEVICE,
embodiment_tag="gr1",
)
policy = GrootPolicy(config)
policy.to(DEVICE)
policy.config.device = DEVICE
preprocessor, postprocessor = make_groot_pre_post_processors(
config=policy.config,
dataset_stats=None, # Pass None for dataset_stats to disable normalization (original GR00T doesn't normalize)
)
return (policy, preprocessor, postprocessor)
def instantiate_original_groot(
from_pretrained: bool = False,
model_path: str = MODEL_PATH,
):
"""Instantiate original Groot policy from NVIDIA's implementation."""
from gr00t.data.transform.concat import ConcatTransform
from gr00t.data.transform.state_action import StateActionToTensor
from gr00t.data.transform.video import VideoToNumpy, VideoToTensor
from gr00t.model.transforms import GR00TTransform
video_keys = ["video.ego_view"]
state_keys = [
"state"
] # Important: Use single concatenated "state" key (not split body parts) to match preprocessing
action_keys = [
"action.left_arm",
"action.right_arm",
"action.left_hand",
"action.right_hand",
"action.left_leg",
"action.right_leg",
"action.neck",
"action.waist",
]
language_keys = ["annotation.human.action.task_description"]
modality_config = {
"video": ModalityConfig(
delta_indices=[0], # Current frame only
modality_keys=video_keys,
),
"state": ModalityConfig(
delta_indices=[0],
modality_keys=state_keys,
),
"action": ModalityConfig(
delta_indices=list(range(DUMMY_ACTION_HORIZON)),
modality_keys=action_keys,
),
"language": ModalityConfig(
delta_indices=[0],
modality_keys=language_keys,
),
}
modality_transform = ComposedModalityTransform(
transforms=[
VideoToTensor(apply_to=video_keys),
VideoToNumpy(apply_to=video_keys), # Convert to numpy (GR00TTransform expects numpy arrays)
# State is already a single concatenated key, so no StateActionToTensor needed
# Convert action from numpy to tensor
StateActionToTensor(apply_to=action_keys),
# Concatenate only video and actions (state is already single key)
ConcatTransform(
video_concat_order=video_keys,
state_concat_order=[], # Empty:state is already single key
action_concat_order=action_keys,
),
GR00TTransform(
max_state_dim=64,
max_action_dim=32,
state_horizon=1,
action_horizon=DUMMY_ACTION_HORIZON,
training=False,
),
]
)
policy = Gr00tPolicy(
model_path=model_path,
embodiment_tag=EmbodimentTag.GR1,
modality_config=modality_config,
modality_transform=modality_transform,
device=DEVICE,
)
return policy, modality_config, modality_transform
def create_dummy_data(device=DEVICE):
"""Create dummy data for testing both implementations."""
batch_size = 2
prompt = "Pick up the red cube and place it in the bin"
state = torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device)
batch = {
"observation.state": state,
"action": torch.randn(
batch_size,
DUMMY_ACTION_HORIZON,
DUMMY_ACTION_DIM,
dtype=torch.float32,
device=device, # Action ground truth (for training)
),
"observation.images.ego_view": torch.rand(
batch_size,
3,
IMAGE_SIZE,
IMAGE_SIZE,
dtype=torch.float32,
device=device, # Images in [0, 1] range as expected by LeRobot
),
"task": [prompt for _ in range(batch_size)],
}
return batch
def convert_lerobot_to_original_format(batch, modality_config):
"""Convert LeRobot batch format to original Groot format.
The original Groot expects observations in this format:
{
"video.<camera_name>": np.ndarray (T, H, W, C) or (B, T, H, W, C)
"state.<state_component>": np.ndarray (T, D) or (B, T, D)
"action.<action_component>": np.ndarray (T, D) or (B, T, D)
"annotation.<annotation_type>": str or list[str]
}
"""
# Original Groot expects (T, H, W, C) format for images
# LeRobot has (B, C, H, W) format, so we need to convert
observation = {}
for img_key in ["ego_view"]:
lerobot_key = f"observation.images.{img_key}"
if lerobot_key in batch:
img = batch[lerobot_key]
# Convert from (B, C, H, W) to (B, T=1, H, W, C)
img_np = img.permute(0, 2, 3, 1).unsqueeze(1).cpu().numpy()
# Convert [0, 1] to [0, 255] uint8 as expected by original
img_np = (img_np * 255).astype(np.uint8)
observation[f"video.{img_key}"] = img_np
# Important: The Original's GR00TTransform expects "state" as (B, T, D), not split body parts
if "observation.state" in batch:
state = batch["observation.state"]
state_np = state.unsqueeze(1).cpu().numpy() # (B, 1, D)
observation["state"] = state_np
if "action" in batch:
action = batch["action"]
action_np = action.cpu().numpy()
start_idx = 0
for part_name, part_dim in GR1_BODY_PARTS.items():
end_idx = start_idx + part_dim
observation[f"action.{part_name}"] = action_np[:, :, start_idx:end_idx]
start_idx = end_idx
if "task" in batch:
task_list = batch["task"]
# GR00TTransform expects language with (B, T) shape for batched data
# Create a (B, T=1) array where each element is the string directly
bsz = len(task_list)
task_array = np.empty((bsz, 1), dtype=object)
for i in range(bsz):
task_array[i, 0] = task_list[i] # Assign string directly to each (i, 0) position
observation["annotation.human.action.task_description"] = task_array
return observation
def test_groot_original_vs_lerobot_pretrained():
"""Test Groot original implementation vs LeRobot implementation with pretrained weights."""
print("Test: Groot Original vs LeRobot with Pretrained Weights (Inference)")
set_seed_all(42)
lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_groot(
from_pretrained=True
)
original_policy, modality_config, modality_transform = instantiate_original_groot(from_pretrained=True)
batch = create_dummy_data()
batch_lerobot = deepcopy(batch)
print("\n[LeRobot] Running inference...")
lerobot_policy.eval()
batch_lerobot_processed = lerobot_preprocessor(batch_lerobot)
# Important: Reset seed immediately before inference to ensure identical RNG state
torch.manual_seed(42)
with torch.no_grad():
lerobot_actions = lerobot_policy.select_action(batch_lerobot_processed)
print("\n[Original] Running inference...")
original_policy.model.eval()
observation = convert_lerobot_to_original_format(batch, modality_config)
original_obs_transformed = modality_transform(deepcopy(observation))
# Important: Reset seed immediately before inference to ensure identical RNG state
torch.manual_seed(42)
with torch.no_grad():
original_model_output = original_policy.model.get_action(original_obs_transformed)
original_actions_raw = original_model_output["action_pred"] # [2, 16, 32]
# Take first timestep
original_actions = original_actions_raw[:, 0, :].to(lerobot_actions.device).to(lerobot_actions.dtype)
print("Action Comparison:")
diff = lerobot_actions - original_actions
abs_diff = torch.abs(diff)
for batch_idx in range(lerobot_actions.shape[0]):
print(f"\n{'=' * 60}")
print(f"Batch {batch_idx}")
print(f"{'=' * 60}")
print(f"{'Idx':<5} {'LeRobot':<14} {'Original':<14} {'Difference':<14}")
print("-" * 60)
for action_idx in range(lerobot_actions.shape[1]):
lr_val = lerobot_actions[batch_idx, action_idx].item()
orig_val = original_actions[batch_idx, action_idx].item()
diff_val = abs(lr_val - orig_val)
sign = "+" if (lr_val - orig_val) > 0 else "-"
print(f"{action_idx:<5} {lr_val:>13.6f} {orig_val:>13.6f} {sign}{diff_val:>12.6f}")
max_diff = abs_diff.max().item()
tolerance = 0.001
assert torch.allclose(lerobot_actions, original_actions, atol=tolerance), (
f"Actions differ by more than tolerance ({tolerance}): max diff = {max_diff:.6f}"
)
print(f"\nSuccess: Actions match within tolerance ({tolerance})!")
del lerobot_policy, lerobot_preprocessor, lerobot_postprocessor
del original_policy, modality_config, modality_transform
del batch, batch_lerobot, observation
cleanup_memory()
def test_groot_forward_pass_comparison():
"""Test forward pass comparison between LeRobot and Original Groot implementations."""
print("Test: Forward Pass Comparison (Training Mode)")
set_seed_all(42)
lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_groot(
from_pretrained=True
)
original_policy, modality_config, modality_transform = instantiate_original_groot(from_pretrained=True)
batch = create_dummy_data()
lerobot_policy.eval()
original_policy.model.eval()
print("\n[LeRobot] Running forward pass...")
batch_lerobot = deepcopy(batch)
batch_lerobot_processed = lerobot_preprocessor(batch_lerobot)
set_seed_all(42)
with torch.no_grad():
lerobot_loss, lerobot_metrics = lerobot_policy.forward(batch_lerobot_processed)
print(f" Loss: {lerobot_loss.item():.6f}")
print("\n[Original] Running forward pass...")
observation = convert_lerobot_to_original_format(batch, modality_config)
transformed_obs = modality_transform(observation)
if "action" not in transformed_obs:
action_for_forward = batch_lerobot_processed["action"]
action_mask_for_forward = batch_lerobot_processed["action_mask"]
# Match action horizon if needed
if action_for_forward.shape[1] != original_policy.model.action_horizon:
if action_for_forward.shape[1] < original_policy.model.action_horizon:
pad_size = original_policy.model.action_horizon - action_for_forward.shape[1]
last_action = action_for_forward[:, -1:, :]
padding = last_action.repeat(1, pad_size, 1)
action_for_forward = torch.cat([action_for_forward, padding], dim=1)
mask_padding = torch.zeros(
action_mask_for_forward.shape[0],
pad_size,
action_mask_for_forward.shape[2],
dtype=action_mask_for_forward.dtype,
device=action_mask_for_forward.device,
)
action_mask_for_forward = torch.cat([action_mask_for_forward, mask_padding], dim=1)
else:
action_for_forward = action_for_forward[:, : original_policy.model.action_horizon, :]
action_mask_for_forward = action_mask_for_forward[
:, : original_policy.model.action_horizon, :
]
transformed_obs["action"] = action_for_forward
transformed_obs["action_mask"] = action_mask_for_forward
set_seed_all(42)
with torch.no_grad():
original_outputs = original_policy.model.forward(transformed_obs)
original_loss = original_outputs["loss"]
print(f" Loss: {original_loss.item():.6f}")
loss_diff = abs(lerobot_loss.item() - original_loss.item())
loss_rel_diff = loss_diff / (abs(original_loss.item()) + 1e-8) * 100
print("\nLoss Values:")
print(f" LeRobot: {lerobot_loss.item():.6f}")
print(f" Original: {original_loss.item():.6f}")
print(f" Absolute difference: {loss_diff:.6f}")
print(f" Relative difference: {loss_rel_diff:.2f}%")
del lerobot_policy, lerobot_preprocessor, lerobot_postprocessor
del original_policy, modality_config, modality_transform
del batch, batch_lerobot, observation, transformed_obs
cleanup_memory()