[HIL-SERL]Remove overstrict pre-commit modifications (#1028)

Adil Zouitine
2025-04-24 13:48:52 +02:00
committed by GitHub
parent 671ac3411f
commit c58b504a9e
47 changed files with 163 additions and 757 deletions

View File

@@ -241,9 +241,7 @@ class ACTTemporalEnsembler:
# Note: The last dimension is unsqueeze to make sure we can broadcast properly for tensor
# operations later.
self.ensembled_actions_count = torch.ones(
- (self.chunk_size, 1),
- dtype=torch.long,
- device=self.ensembled_actions.device,
+ (self.chunk_size, 1), dtype=torch.long, device=self.ensembled_actions.device
)
else:
# self.ensembled_actions will have shape (batch_size, chunk_size - 1, action_dim). Compute
@@ -255,10 +253,7 @@ class ACTTemporalEnsembler:
# The last action, which has no prior online average, needs to get concatenated onto the end.
self.ensembled_actions = torch.cat([self.ensembled_actions, actions[:, -1:]], dim=1)
self.ensembled_actions_count = torch.cat(
- [
- self.ensembled_actions_count,
- torch.ones_like(self.ensembled_actions_count[-1:]),
- ]
+ [self.ensembled_actions_count, torch.ones_like(self.ensembled_actions_count[-1:])]
)
# "Consume" the first action.
action, self.ensembled_actions, self.ensembled_actions_count = (
@@ -338,11 +333,7 @@ class ACT(nn.Module):
# Backbone for image feature extraction.
if self.config.image_features:
backbone_model = getattr(torchvision.models, config.vision_backbone)(
- replace_stride_with_dilation=[
- False,
- False,
- config.replace_final_stride_with_dilation,
- ],
+ replace_stride_with_dilation=[False, False, config.replace_final_stride_with_dilation],
weights=config.pretrained_backbone_weights,
norm_layer=FrozenBatchNorm2d,
)
@@ -436,11 +427,7 @@ class ACT(nn.Module):
action_embed = self.vae_encoder_action_input_proj(batch["action"]) # (B, S, D)
if self.config.robot_state_feature:
- vae_encoder_input = [
- cls_embed,
- robot_state_embed,
- action_embed,
- ] # (B, S+2, D)
+ vae_encoder_input = [cls_embed, robot_state_embed, action_embed] # (B, S+2, D)
else:
vae_encoder_input = [cls_embed, action_embed]
vae_encoder_input = torch.cat(vae_encoder_input, axis=1)
@@ -553,10 +540,7 @@ class ACTEncoder(nn.Module):
self.norm = nn.LayerNorm(config.dim_model) if config.pre_norm else nn.Identity()
def forward(
- self,
- x: Tensor,
- pos_embed: Tensor | None = None,
- key_padding_mask: Tensor | None = None,
+ self, x: Tensor, pos_embed: Tensor | None = None, key_padding_mask: Tensor | None = None
) -> Tensor:
for layer in self.layers:
x = layer(x, pos_embed=pos_embed, key_padding_mask=key_padding_mask)
@@ -619,10 +603,7 @@ class ACTDecoder(nn.Module):
) -> Tensor:
for layer in self.layers:
x = layer(
- x,
- encoder_out,
- decoder_pos_embed=decoder_pos_embed,
- encoder_pos_embed=encoder_pos_embed,
+ x, encoder_out, decoder_pos_embed=decoder_pos_embed, encoder_pos_embed=encoder_pos_embed
)
if self.norm is not None:
x = self.norm(x)
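
For context on the ensembling bookkeeping touched above: a minimal, standalone sketch of the running online average with per-step counts (hypothetical and simplified; the real ACTTemporalEnsembler additionally applies exponential weights derived from temporal_ensemble_coeff):

```python
import torch

# Minimal sketch: plain running mean over overlapping action chunks, with per-step counts.
chunk_size, action_dim = 4, 2
ensembled, counts = None, None

for _ in range(3):
    actions = torch.randn(1, chunk_size, action_dim)  # newest chunk, (batch, chunk_size, action_dim)
    if ensembled is None:
        ensembled = actions.clone()
        counts = torch.ones((chunk_size, 1), dtype=torch.long)
    else:
        # Online-average the overlapping steps, then append the newest step with a count of 1.
        ensembled = (ensembled * counts + actions[:, :-1]) / (counts + 1)
        ensembled = torch.cat([ensembled, actions[:, -1:]], dim=1)
        counts = torch.cat([counts + 1, torch.ones_like(counts[-1:])])
    # "Consume" the first action; keep the rest for the next call.
    action, ensembled, counts = ensembled[:, 0], ensembled[:, 1:], counts[1:]
```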

View File

@@ -209,10 +209,7 @@ class DiffusionModel(nn.Module):
# ========= inference ============
def conditional_sample(
- self,
- batch_size: int,
- global_cond: Tensor | None = None,
- generator: torch.Generator | None = None,
+ self, batch_size: int, global_cond: Tensor | None = None, generator: torch.Generator | None = None
) -> Tensor:
device = get_device_from_parameters(self)
dtype = get_dtype_from_parameters(self)
@@ -257,10 +254,7 @@ class DiffusionModel(nn.Module):
# Separate batch and sequence dims back out. The camera index dim gets absorbed into the
# feature dim (effectively concatenating the camera features).
img_features = einops.rearrange(
- img_features_list,
- "(n b s) ... -> b s (n ...)",
- b=batch_size,
- s=n_obs_steps,
+ img_features_list, "(n b s) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
)
else:
# Combine batch, sequence, and "which camera" dims before passing to shared encoder.
@@ -270,10 +264,7 @@ class DiffusionModel(nn.Module):
# Separate batch dim and sequence dim back out. The camera index dim gets absorbed into the
# feature dim (effectively concatenating the camera features).
img_features = einops.rearrange(
- img_features,
- "(b s n) ... -> b s (n ...)",
- b=batch_size,
- s=n_obs_steps,
+ img_features, "(b s n) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
)
global_cond_feats.append(img_features)
@@ -524,9 +515,7 @@ class DiffusionRgbEncoder(nn.Module):
def _replace_submodules(
- root_module: nn.Module,
- predicate: Callable[[nn.Module], bool],
- func: Callable[[nn.Module], nn.Module],
+ root_module: nn.Module, predicate: Callable[[nn.Module], bool], func: Callable[[nn.Module], nn.Module]
) -> nn.Module:
"""
Args:
@@ -644,14 +633,10 @@ class DiffusionConditionalUnet1d(nn.Module):
self.mid_modules = nn.ModuleList(
[
DiffusionConditionalResidualBlock1d(
- config.down_dims[-1],
- config.down_dims[-1],
- **common_res_block_kwargs,
+ config.down_dims[-1], config.down_dims[-1], **common_res_block_kwargs
),
DiffusionConditionalResidualBlock1d(
- config.down_dims[-1],
- config.down_dims[-1],
- **common_res_block_kwargs,
+ config.down_dims[-1], config.down_dims[-1], **common_res_block_kwargs
),
]
)
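
The einops.rearrange calls in the hunks above fold the camera index into the feature dimension after running a shared encoder over a stacked batch. A standalone sketch with hypothetical shapes:

```python
import einops
import torch

# Hypothetical shapes: n cameras, batch b, n_obs_steps s, feature dim d.
n, b, s, d = 2, 4, 3, 64
img_features_list = torch.randn(n * b * s, d)  # shared encoder output over the stacked batch

# Fold the camera dim into the feature dim, effectively concatenating camera features per step.
img_features = einops.rearrange(img_features_list, "(n b s) d -> b s (n d)", b=b, s=s)
print(img_features.shape)  # torch.Size([4, 3, 128])
```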

View File

@@ -61,11 +61,7 @@ from lerobot.common.policies.pi0.conversion_scripts.conversion_utils import (
)
from lerobot.common.policies.pi0.modeling_pi0 import PI0Policy
- PRECISIONS = {
- "bfloat16": torch.bfloat16,
- "float32": torch.float32,
- "float16": torch.float16,
- }
+ PRECISIONS = {"bfloat16": torch.bfloat16, "float32": torch.float32, "float16": torch.float16}
def slice_paligemma_state_dict(state_dict, config):

View File

@@ -48,32 +48,18 @@ def flex_attention_forward(
key_states = key_states[:, :, :, None, :]
key_states = key_states.expand(
- batch_size,
- key_states.shape[1],
- num_key_value_heads,
- num_key_value_groups,
- head_dim,
+ batch_size, key_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim
)
key_states = key_states.reshape(
- batch_size,
- key_states.shape[1],
- num_key_value_heads * num_key_value_groups,
- head_dim,
+ batch_size, key_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim
)
value_states = value_states[:, :, :, None, :]
value_states = value_states.expand(
- batch_size,
- value_states.shape[1],
- num_key_value_heads,
- num_key_value_groups,
- head_dim,
+ batch_size, value_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim
)
value_states = value_states.reshape(
- batch_size,
- value_states.shape[1],
- num_key_value_heads * num_key_value_groups,
- head_dim,
+ batch_size, value_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim
)
query_states = query_states.transpose(1, 2)
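
The expand/reshape pairs above repeat each key/value head across its group of query heads (grouped-query attention). A small sketch with hypothetical sizes:

```python
import torch

# Hypothetical sizes: 8 query heads share 2 key/value heads, so each KV head serves a group of 4.
batch_size, seq_len, num_key_value_heads, num_key_value_groups, head_dim = 2, 5, 2, 4, 16
key_states = torch.randn(batch_size, seq_len, num_key_value_heads, head_dim)

# Insert a group axis, broadcast each KV head across its group, then flatten back to (B, S, H, D).
key_states = key_states[:, :, :, None, :].expand(
    batch_size, seq_len, num_key_value_heads, num_key_value_groups, head_dim
)
key_states = key_states.reshape(batch_size, seq_len, num_key_value_heads * num_key_value_groups, head_dim)
print(key_states.shape)  # torch.Size([2, 5, 8, 16])
```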

View File

@@ -69,11 +69,7 @@ from lerobot.common.utils.utils import get_safe_dtype
def create_sinusoidal_pos_embedding(
- time: torch.tensor,
- dimension: int,
- min_period: float,
- max_period: float,
- device="cpu",
+ time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
) -> Tensor:
"""Computes sine-cosine positional embedding vectors for scalar positions."""
if dimension % 2 != 0:
@@ -581,11 +577,7 @@ class PI0FlowMatching(nn.Module):
# Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
time_emb = create_sinusoidal_pos_embedding(
- timestep,
- self.config.proj_width,
- min_period=4e-3,
- max_period=4.0,
- device=device,
+ timestep, self.config.proj_width, min_period=4e-3, max_period=4.0, device=device
)
time_emb = time_emb.type(dtype=dtype)
@@ -617,15 +609,7 @@ class PI0FlowMatching(nn.Module):
return embs, pad_masks, att_masks
def forward(
- self,
- images,
- img_masks,
- lang_tokens,
- lang_masks,
- state,
- actions,
- noise=None,
- time=None,
+ self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
) -> Tensor:
"""Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
if noise is None:
@@ -671,11 +655,7 @@ class PI0FlowMatching(nn.Module):
device = state.device
if noise is None:
- actions_shape = (
- bsize,
- self.config.n_action_steps,
- self.config.max_action_dim,
- )
+ actions_shape = (bsize, self.config.n_action_steps, self.config.max_action_dim)
noise = self.sample_noise(actions_shape, device)
prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
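
create_sinusoidal_pos_embedding, reformatted above, embeds a scalar timestep with sine-cosine features over log-spaced periods. A minimal sketch of the idea (illustrative only; the actual function's details may differ):

```python
import math

import torch

def sincos_embedding(t: torch.Tensor, dim: int, min_period: float, max_period: float) -> torch.Tensor:
    # t: (batch,) scalar positions; dim must be even. Periods are log-spaced in [min_period, max_period].
    fraction = torch.linspace(0.0, 1.0, dim // 2, device=t.device)
    period = min_period * (max_period / min_period) ** fraction
    angle = 2 * math.pi / period * t[:, None]  # (batch, dim / 2)
    return torch.cat([torch.sin(angle), torch.cos(angle)], dim=-1)  # (batch, dim)

emb = sincos_embedding(torch.rand(8), dim=64, min_period=4e-3, max_period=4.0)
print(emb.shape)  # torch.Size([8, 64])
```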

View File

@@ -293,18 +293,12 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
# in `transformers`. (molbap)
key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
value_states = torch.cat(
- [past_key_values[layer_idx]["value_states"], value_states],
- dim=1,
+ [past_key_values[layer_idx]["value_states"], value_states], dim=1
)
attention_interface = self.get_attention_interface()
att_output = attention_interface(
- attention_mask,
- batch_size,
- head_dim,
- query_states,
- key_states,
- value_states,
+ attention_mask, batch_size, head_dim, query_states, key_states, value_states
)
att_output = att_output.to(dtype=torch.bfloat16)
@@ -364,24 +358,12 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
return attention_interface
def flash_attention_forward(
- self,
- attention_mask,
- batch_size,
- head_dim,
- query_states,
- key_states,
- value_states,
+ self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
):
raise NotImplementedError("FA2 is not implemented (yet)")
def eager_attention_forward(
- self,
- attention_mask,
- batch_size,
- head_dim,
- query_states,
- key_states,
- value_states,
+ self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
):
num_att_heads = self.config.paligemma_config.text_config.num_attention_heads
num_key_value_heads = self.config.paligemma_config.text_config.num_key_value_heads
@@ -393,31 +375,17 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
sequence_length = key_states.shape[1]
key_states = key_states[:, :, :, None, :].expand(
- batch_size,
- sequence_length,
- num_key_value_heads,
- num_key_value_groups,
- head_dim,
+ batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
)
key_states = key_states.reshape(
- batch_size,
- sequence_length,
- num_key_value_heads * num_key_value_groups,
- head_dim,
+ batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
)
value_states = value_states[:, :, :, None, :].expand(
- batch_size,
- sequence_length,
- num_key_value_heads,
- num_key_value_groups,
- head_dim,
+ batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
)
value_states = value_states.reshape(
- batch_size,
- sequence_length,
- num_key_value_heads * num_key_value_groups,
- head_dim,
+ batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
)
# Attention here is upcasted to float32 to match the original eager implementation.

View File

@@ -39,11 +39,7 @@ from lerobot.common.constants import OBS_ENV, OBS_ROBOT
from lerobot.common.policies.normalize import Normalize, Unnormalize
from lerobot.common.policies.pretrained import PreTrainedPolicy
from lerobot.common.policies.tdmpc.configuration_tdmpc import TDMPCConfig
- from lerobot.common.policies.utils import (
- get_device_from_parameters,
- get_output_shape,
- populate_queues,
- )
+ from lerobot.common.policies.utils import get_device_from_parameters, get_output_shape, populate_queues
class TDMPCPolicy(PreTrainedPolicy):
@@ -67,11 +63,7 @@ class TDMPCPolicy(PreTrainedPolicy):
config_class = TDMPCConfig
name = "tdmpc"
- def __init__(
- self,
- config: TDMPCConfig,
- dataset_stats: dict[str, dict[str, Tensor]] | None = None,
- ):
+ def __init__(self, config: TDMPCConfig, dataset_stats: dict[str, dict[str, Tensor]] | None = None):
"""
Args:
config: Policy configuration class instance or None, in which case the default instantiation of
@@ -197,20 +189,13 @@ class TDMPCPolicy(PreTrainedPolicy):
# In the CEM loop we will need this for a call to estimate_value with the gaussian sampled
# trajectories.
- z = einops.repeat(
- z,
- "b d -> n b d",
- n=self.config.n_gaussian_samples + self.config.n_pi_samples,
- )
+ z = einops.repeat(z, "b d -> n b d", n=self.config.n_gaussian_samples + self.config.n_pi_samples)
# Model Predictive Path Integral (MPPI) with the cross-entropy method (CEM) as the optimization
# algorithm.
# The initial mean and standard deviation for the cross-entropy method (CEM).
mean = torch.zeros(
- self.config.horizon,
- batch_size,
- self.config.action_feature.shape[0],
- device=device,
+ self.config.horizon, batch_size, self.config.action_feature.shape[0], device=device
)
# Maybe warm start CEM with the mean from the previous step.
if self._prev_mean is not None:
@@ -306,10 +291,9 @@ class TDMPCPolicy(PreTrainedPolicy):
if self.config.q_ensemble_size > 2:
G += (
running_discount
- * torch.min(
- terminal_values[torch.randint(0, self.config.q_ensemble_size, size=(2,))],
- dim=0,
- )[0]
+ * torch.min(terminal_values[torch.randint(0, self.config.q_ensemble_size, size=(2,))], dim=0)[
+ 0
+ ]
)
else:
G += running_discount * torch.min(terminal_values, dim=0)[0]
@@ -345,10 +329,7 @@ class TDMPCPolicy(PreTrainedPolicy):
# Apply random image augmentations.
if self.config.image_features and self.config.max_random_shift_ratio > 0:
observations["observation.image"] = flatten_forward_unflatten(
- partial(
- random_shifts_aug,
- max_random_shift_ratio=self.config.max_random_shift_ratio,
- ),
+ partial(random_shifts_aug, max_random_shift_ratio=self.config.max_random_shift_ratio),
observations["observation.image"],
)
@@ -572,10 +553,7 @@ class TDMPCTOLD(nn.Module):
self._Qs = nn.ModuleList(
[
nn.Sequential(
- nn.Linear(
- config.latent_dim + config.action_feature.shape[0],
- config.mlp_dim,
- ),
+ nn.Linear(config.latent_dim + config.action_feature.shape[0], config.mlp_dim),
nn.LayerNorm(config.mlp_dim),
nn.Tanh(),
nn.Linear(config.mlp_dim, config.mlp_dim),
@@ -724,26 +702,11 @@ class TDMPCObservationEncoder(nn.Module):
stride=2,
),
nn.ReLU(),
- nn.Conv2d(
- config.image_encoder_hidden_dim,
- config.image_encoder_hidden_dim,
- 5,
- stride=2,
- ),
+ nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 5, stride=2),
nn.ReLU(),
- nn.Conv2d(
- config.image_encoder_hidden_dim,
- config.image_encoder_hidden_dim,
- 3,
- stride=2,
- ),
+ nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 3, stride=2),
nn.ReLU(),
- nn.Conv2d(
- config.image_encoder_hidden_dim,
- config.image_encoder_hidden_dim,
- 3,
- stride=2,
- ),
+ nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 3, stride=2),
nn.ReLU(),
)
dummy_shape = (1, *next(iter(config.image_features.values())).shape)
@@ -786,8 +749,7 @@ class TDMPCObservationEncoder(nn.Module):
if self.config.image_features:
feat.append(
flatten_forward_unflatten(
- self.image_enc_layers,
- obs_dict[next(iter(self.config.image_features))],
+ self.image_enc_layers, obs_dict[next(iter(self.config.image_features))]
)
)
if self.config.env_state_feature:
@@ -834,9 +796,7 @@ def update_ema_parameters(ema_net: nn.Module, net: nn.Module, alpha: float):
"""Update EMA parameters in place with ema_param <- alpha * ema_param + (1 - alpha) * param."""
for ema_module, module in zip(ema_net.modules(), net.modules(), strict=True):
for (n_p_ema, p_ema), (n_p, p) in zip(
- ema_module.named_parameters(recurse=False),
- module.named_parameters(recurse=False),
- strict=True,
+ ema_module.named_parameters(recurse=False), module.named_parameters(recurse=False), strict=True
):
assert n_p_ema == n_p, "Parameter names don't match for EMA model update"
if isinstance(p, dict):
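
The EMA rule quoted in the docstring above, ema_param <- alpha * ema_param + (1 - alpha) * param, can be written as a compact in-place update (simplified sketch; update_ema_parameters itself walks modules and checks parameter names):

```python
import torch
from torch import nn

@torch.no_grad()
def update_ema(ema_net: nn.Module, net: nn.Module, alpha: float) -> None:
    # ema_param <- alpha * ema_param + (1 - alpha) * param, applied in place.
    for p_ema, p in zip(ema_net.parameters(), net.parameters(), strict=True):
        p_ema.mul_(alpha).add_(p, alpha=1 - alpha)
```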

View File

@@ -193,12 +193,7 @@ class VQBeTConfig(PreTrainedConfig):
@property
def action_delta_indices(self) -> list:
- return list(
- range(
- 1 - self.n_obs_steps,
- self.n_action_pred_token + self.action_chunk_size - 1,
- )
- )
+ return list(range(1 - self.n_obs_steps, self.n_action_pred_token + self.action_chunk_size - 1))
@property
def reward_delta_indices(self) -> None:
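
To make the collapsed range above concrete, with hypothetical config values n_obs_steps=2, n_action_pred_token=3 and action_chunk_size=5:

```python
n_obs_steps, n_action_pred_token, action_chunk_size = 2, 3, 5
print(list(range(1 - n_obs_steps, n_action_pred_token + action_chunk_size - 1)))
# [-1, 0, 1, 2, 3, 4, 5, 6]
```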

View File

@@ -29,11 +29,7 @@ from torch import Tensor, nn
from lerobot.common.policies.normalize import Normalize, Unnormalize
from lerobot.common.policies.pretrained import PreTrainedPolicy
from lerobot.common.policies.utils import (
get_device_from_parameters,
get_output_shape,
populate_queues,
)
from lerobot.common.policies.utils import get_device_from_parameters, get_output_shape, populate_queues
from lerobot.common.policies.vqbet.configuration_vqbet import VQBeTConfig
from lerobot.common.policies.vqbet.vqbet_utils import GPT, ResidualVQ
@@ -328,8 +324,7 @@ class VQBeTModel(nn.Module):
# To input state and observation features into GPT layers, we first project the features to fit the shape of input size of GPT.
self.state_projector = MLP(
- config.robot_state_feature.shape[0],
- hidden_channels=[self.config.gpt_input_dim],
+ config.robot_state_feature.shape[0], hidden_channels=[self.config.gpt_input_dim]
)
self.rgb_feature_projector = MLP(
self.rgb_encoder.feature_dim, hidden_channels=[self.config.gpt_input_dim]
@@ -359,11 +354,7 @@ class VQBeTModel(nn.Module):
)
# Separate batch and sequence dims.
img_features = einops.rearrange(
- img_features,
- "(b s n) ... -> b s n ...",
- b=batch_size,
- s=n_obs_steps,
- n=self.num_images,
+ img_features, "(b s n) ... -> b s n ...", b=batch_size, s=n_obs_steps, n=self.num_images
)
# Arrange prior and current observation step tokens as shown in the class docstring.
@@ -400,11 +391,7 @@ class VQBeTModel(nn.Module):
# Thus, it predicts a historical action sequence, in addition to current and future actions (predicting future actions : optional).
if len_additional_action_token > 0:
features = torch.cat(
- [
- features[:, historical_act_pred_index],
- features[:, -len_additional_action_token:],
- ],
- dim=1,
+ [features[:, historical_act_pred_index], features[:, -len_additional_action_token:]], dim=1
)
else:
features = features[:, historical_act_pred_index]
@@ -527,13 +514,7 @@ class VQBeTHead(nn.Module):
cbet_secondary_logits = self.map_to_cbet_preds_secondary_bin(
torch.cat(
- (
- x,
- F.one_hot(
- sampled_primary_centers,
- num_classes=self.config.vqvae_n_embed,
- ),
- ),
+ (x, F.one_hot(sampled_primary_centers, num_classes=self.config.vqvae_n_embed)),
axis=1,
)
)
@@ -551,9 +532,7 @@ class VQBeTHead(nn.Module):
else:
cbet_logits = self.map_to_cbet_preds_bin(x)
cbet_logits = einops.rearrange(
- cbet_logits,
- "(NT) (G C) -> (NT) G C",
- G=self.vqvae_model.vqvae_num_layers,
+ cbet_logits, "(NT) (G C) -> (NT) G C", G=self.vqvae_model.vqvae_num_layers
)
cbet_probs = torch.softmax(cbet_logits / self.config.bet_softmax_temperature, dim=-1)
NT, G, choices = cbet_probs.shape
@@ -751,9 +730,7 @@ class VQBeTRgbEncoder(nn.Module):
def _replace_submodules(
- root_module: nn.Module,
- predicate: Callable[[nn.Module], bool],
- func: Callable[[nn.Module], nn.Module],
+ root_module: nn.Module, predicate: Callable[[nn.Module], bool], func: Callable[[nn.Module], nn.Module]
) -> nn.Module:
"""
Args:

View File

@@ -377,10 +377,7 @@ class ResidualVQ(nn.Module):
self.layers = nn.ModuleList(
[
VectorQuantize(
- dim=codebook_dim,
- codebook_dim=codebook_dim,
- accept_image_fmap=accept_image_fmap,
- **kwargs,
+ dim=codebook_dim, codebook_dim=codebook_dim, accept_image_fmap=accept_image_fmap, **kwargs
)
for _ in range(num_quantizers)
]