[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
pre-commit-ci[bot]
2025-03-24 13:41:27 +00:00
committed by AdilZouitine
parent 2945bbb221
commit 7c05755823
123 changed files with 1161 additions and 3425 deletions

View File

@@ -171,9 +171,7 @@ class ACTConfig(PreTrainedConfig):
def validate_features(self) -> None:
if not self.image_features and not self.env_state_feature:
raise ValueError(
"You must provide at least one image or the environment state among the inputs."
)
raise ValueError("You must provide at least one image or the environment state among the inputs.")
@property
def observation_delta_indices(self) -> None:

View File

@@ -63,9 +63,7 @@ class ACTPolicy(PreTrainedPolicy):
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(
config.input_features, config.normalization_mapping, dataset_stats
)
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
@@ -76,9 +74,7 @@ class ACTPolicy(PreTrainedPolicy):
self.model = ACT(config)
if config.temporal_ensemble_coeff is not None:
self.temporal_ensembler = ACTTemporalEnsembler(
config.temporal_ensemble_coeff, config.chunk_size
)
self.temporal_ensembler = ACTTemporalEnsembler(config.temporal_ensemble_coeff, config.chunk_size)
self.reset()
@@ -122,12 +118,8 @@ class ACTPolicy(PreTrainedPolicy):
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = [
batch[key] for key in self.config.image_features
]
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = [batch[key] for key in self.config.image_features]
# If we are doing temporal ensembling, do online updates where we keep track of the number of actions
# we are ensembling over.
@@ -154,19 +146,14 @@ class ACTPolicy(PreTrainedPolicy):
"""Run the batch through the model and compute the loss for training or validation."""
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = [
batch[key] for key in self.config.image_features
]
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = [batch[key] for key in self.config.image_features]
batch = self.normalize_targets(batch)
actions_hat, (mu_hat, log_sigma_x2_hat) = self.model(batch)
l1_loss = (
F.l1_loss(batch["action"], actions_hat, reduction="none")
* ~batch["action_is_pad"].unsqueeze(-1)
F.l1_loss(batch["action"], actions_hat, reduction="none") * ~batch["action_is_pad"].unsqueeze(-1)
).mean()
loss_dict = {"l1_loss": l1_loss.item()}
@@ -176,12 +163,7 @@ class ACTPolicy(PreTrainedPolicy):
# KL-divergence per batch element, then take the mean over the batch.
# (See App. B of https://arxiv.org/abs/1312.6114 for more details).
mean_kld = (
(
-0.5
* (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())
)
.sum(-1)
.mean()
(-0.5 * (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())).sum(-1).mean()
)
loss_dict["kld_loss"] = mean_kld.item()
loss = l1_loss + mean_kld * self.config.kl_weight
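For reference, the reformatted expression above is the closed-form KL divergence between the approximate posterior N(mu, diag(sigma^2)) and a standard normal prior (App. B of the cited VAE paper), summed over the latent dimension and then averaged over the batch:

```latex
D_{\mathrm{KL}}\big(\mathcal{N}(\mu,\sigma^{2})\,\|\,\mathcal{N}(0,1)\big)
  = -\tfrac{1}{2}\sum_{j=1}^{d}\big(1 + \log\sigma_j^{2} - \mu_j^{2} - \sigma_j^{2}\big),
\qquad \log\sigma_j^{2} \equiv \texttt{log\_sigma\_x2\_hat}[\dots, j]
```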
@@ -235,9 +217,7 @@ class ACTTemporalEnsembler:
```
"""
self.chunk_size = chunk_size
self.ensemble_weights = torch.exp(
-temporal_ensemble_coeff * torch.arange(chunk_size)
)
self.ensemble_weights = torch.exp(-temporal_ensemble_coeff * torch.arange(chunk_size))
self.ensemble_weights_cumsum = torch.cumsum(self.ensemble_weights, dim=0)
self.reset()
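A minimal runnable sketch of what these weights implement (values are hypothetical, not from the config): index i in a chunk gets weight w_i = exp(-coeff * i), and dividing a running weighted sum by the cumulative weight sum gives an online weighted average over every prediction made for the same timestep, matching the update recurrence in the hunks below.

```python
import torch

coeff, chunk_size = 0.01, 4                       # hypothetical values
w = torch.exp(-coeff * torch.arange(chunk_size))  # w_i = exp(-coeff * i)
w_cumsum = torch.cumsum(w, dim=0)

preds = torch.tensor([1.0, 1.2, 0.9, 1.1])        # successive predictions for one timestep
avg = preds[0]
for i in range(1, chunk_size):
    # running weighted mean: same recurrence as ACTTemporalEnsembler.update
    avg = (avg * w_cumsum[i - 1] + preds[i] * w[i]) / w_cumsum[i]

assert torch.isclose(avg, (preds * w).sum() / w.sum())
```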
@@ -253,9 +233,7 @@ class ACTTemporalEnsembler:
time steps, and pop/return the next batch of actions in the sequence.
"""
self.ensemble_weights = self.ensemble_weights.to(device=actions.device)
self.ensemble_weights_cumsum = self.ensemble_weights_cumsum.to(
device=actions.device
)
self.ensemble_weights_cumsum = self.ensemble_weights_cumsum.to(device=actions.device)
if self.ensembled_actions is None:
# Initializes `self._ensembled_action` to the sequence of actions predicted during the first
# time step of the episode.
@@ -270,22 +248,12 @@ class ACTTemporalEnsembler:
else:
# self.ensembled_actions will have shape (batch_size, chunk_size - 1, action_dim). Compute
# the online update for those entries.
self.ensembled_actions *= self.ensemble_weights_cumsum[
self.ensembled_actions_count - 1
]
self.ensembled_actions += (
actions[:, :-1] * self.ensemble_weights[self.ensembled_actions_count]
)
self.ensembled_actions /= self.ensemble_weights_cumsum[
self.ensembled_actions_count
]
self.ensembled_actions_count = torch.clamp(
self.ensembled_actions_count + 1, max=self.chunk_size
)
self.ensembled_actions *= self.ensemble_weights_cumsum[self.ensembled_actions_count - 1]
self.ensembled_actions += actions[:, :-1] * self.ensemble_weights[self.ensembled_actions_count]
self.ensembled_actions /= self.ensemble_weights_cumsum[self.ensembled_actions_count]
self.ensembled_actions_count = torch.clamp(self.ensembled_actions_count + 1, max=self.chunk_size)
# The last action, which has no prior online average, needs to get concatenated onto the end.
self.ensembled_actions = torch.cat(
[self.ensembled_actions, actions[:, -1:]], dim=1
)
self.ensembled_actions = torch.cat([self.ensembled_actions, actions[:, -1:]], dim=1)
self.ensembled_actions_count = torch.cat(
[
self.ensembled_actions_count,
@@ -356,9 +324,7 @@ class ACT(nn.Module):
config.dim_model,
)
# Projection layer from the VAE encoder's output to the latent distribution's parameter space.
self.vae_encoder_latent_output_proj = nn.Linear(
config.dim_model, config.latent_dim * 2
)
self.vae_encoder_latent_output_proj = nn.Linear(config.dim_model, config.latent_dim * 2)
# Fixed sinusoidal positional embedding for the input to the VAE encoder. Unsqueeze for batch
# dimension.
num_input_token_encoder = 1 + config.chunk_size
@@ -366,9 +332,7 @@ class ACT(nn.Module):
num_input_token_encoder += 1
self.register_buffer(
"vae_encoder_pos_enc",
create_sinusoidal_pos_embedding(
num_input_token_encoder, config.dim_model
).unsqueeze(0),
create_sinusoidal_pos_embedding(num_input_token_encoder, config.dim_model).unsqueeze(0),
)
# Backbone for image feature extraction.
@@ -385,9 +349,7 @@ class ACT(nn.Module):
# Note: The assumption here is that we are using a ResNet model (and hence layer4 is the final
# feature map).
# Note: The forward method of this returns a dict: {"feature_map": output}.
self.backbone = IntermediateLayerGetter(
backbone_model, return_layers={"layer4": "feature_map"}
)
self.backbone = IntermediateLayerGetter(backbone_model, return_layers={"layer4": "feature_map"})
# Transformer (acts as VAE decoder when training with the variational objective).
self.encoder = ACTEncoder(config)
@@ -416,18 +378,14 @@ class ACT(nn.Module):
n_1d_tokens += 1
self.encoder_1d_feature_pos_embed = nn.Embedding(n_1d_tokens, config.dim_model)
if self.config.image_features:
self.encoder_cam_feat_pos_embed = ACTSinusoidalPositionEmbedding2d(
config.dim_model // 2
)
self.encoder_cam_feat_pos_embed = ACTSinusoidalPositionEmbedding2d(config.dim_model // 2)
# Transformer decoder.
# Learnable positional embedding for the transformer's decoder (in the style of DETR object queries).
self.decoder_pos_embed = nn.Embedding(config.chunk_size, config.dim_model)
# Final action regression head on the output of the transformer's decoder.
self.action_head = nn.Linear(
config.dim_model, self.config.action_feature.shape[0]
)
self.action_head = nn.Linear(config.dim_model, self.config.action_feature.shape[0])
self._reset_parameters()
@@ -437,9 +395,7 @@ class ACT(nn.Module):
if p.dim() > 1:
nn.init.xavier_uniform_(p)
def forward(
self, batch: dict[str, Tensor]
) -> tuple[Tensor, tuple[Tensor, Tensor] | tuple[None, None]]:
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, tuple[Tensor, Tensor] | tuple[None, None]]:
"""A forward pass through the Action Chunking Transformer (with optional VAE encoder).
`batch` should have the following structure:
@@ -475,13 +431,9 @@ class ACT(nn.Module):
self.vae_encoder_cls_embed.weight, "1 d -> b 1 d", b=batch_size
) # (B, 1, D)
if self.config.robot_state_feature:
robot_state_embed = self.vae_encoder_robot_state_input_proj(
batch["observation.state"]
)
robot_state_embed = self.vae_encoder_robot_state_input_proj(batch["observation.state"])
robot_state_embed = robot_state_embed.unsqueeze(1) # (B, 1, D)
action_embed = self.vae_encoder_action_input_proj(
batch["action"]
) # (B, S, D)
action_embed = self.vae_encoder_action_input_proj(batch["action"]) # (B, S, D)
if self.config.robot_state_feature:
vae_encoder_input = [
@@ -526,26 +478,20 @@ class ACT(nn.Module):
# When not using the VAE encoder, we set the latent to be all zeros.
mu = log_sigma_x2 = None
# TODO(rcadene, alexander-soare): remove call to `.to` to speedup forward ; precompute and use buffer
latent_sample = torch.zeros(
[batch_size, self.config.latent_dim], dtype=torch.float32
).to(batch["observation.state"].device)
latent_sample = torch.zeros([batch_size, self.config.latent_dim], dtype=torch.float32).to(
batch["observation.state"].device
)
# Prepare transformer encoder inputs.
encoder_in_tokens = [self.encoder_latent_input_proj(latent_sample)]
encoder_in_pos_embed = list(
self.encoder_1d_feature_pos_embed.weight.unsqueeze(1)
)
encoder_in_pos_embed = list(self.encoder_1d_feature_pos_embed.weight.unsqueeze(1))
# Robot state token.
if self.config.robot_state_feature:
encoder_in_tokens.append(
self.encoder_robot_state_input_proj(batch["observation.state"])
)
encoder_in_tokens.append(self.encoder_robot_state_input_proj(batch["observation.state"]))
# Environment state token.
if self.config.env_state_feature:
encoder_in_tokens.append(
self.encoder_env_state_input_proj(
batch["observation.environment_state"]
)
self.encoder_env_state_input_proj(batch["observation.environment_state"])
)
# Camera observation features and positional embeddings.
@@ -556,9 +502,7 @@ class ACT(nn.Module):
# For a list of images, the H and W may vary but H*W is constant.
for img in batch["observation.images"]:
cam_features = self.backbone(img)["feature_map"]
cam_pos_embed = self.encoder_cam_feat_pos_embed(cam_features).to(
dtype=cam_features.dtype
)
cam_pos_embed = self.encoder_cam_feat_pos_embed(cam_features).to(dtype=cam_features.dtype)
cam_features = self.encoder_img_feat_input_proj(cam_features)
# Rearrange features to (sequence, batch, dim).
@@ -604,14 +548,8 @@ class ACTEncoder(nn.Module):
def __init__(self, config: ACTConfig, is_vae_encoder: bool = False):
super().__init__()
self.is_vae_encoder = is_vae_encoder
num_layers = (
config.n_vae_encoder_layers
if self.is_vae_encoder
else config.n_encoder_layers
)
self.layers = nn.ModuleList(
[ACTEncoderLayer(config) for _ in range(num_layers)]
)
num_layers = config.n_vae_encoder_layers if self.is_vae_encoder else config.n_encoder_layers
self.layers = nn.ModuleList([ACTEncoderLayer(config) for _ in range(num_layers)])
self.norm = nn.LayerNorm(config.dim_model) if config.pre_norm else nn.Identity()
def forward(
@@ -629,9 +567,7 @@ class ACTEncoder(nn.Module):
class ACTEncoderLayer(nn.Module):
def __init__(self, config: ACTConfig):
super().__init__()
self.self_attn = nn.MultiheadAttention(
config.dim_model, config.n_heads, dropout=config.dropout
)
self.self_attn = nn.MultiheadAttention(config.dim_model, config.n_heads, dropout=config.dropout)
# Feed forward layers.
self.linear1 = nn.Linear(config.dim_model, config.dim_feedforward)
@@ -646,9 +582,7 @@ class ACTEncoderLayer(nn.Module):
self.activation = get_activation_fn(config.feedforward_activation)
self.pre_norm = config.pre_norm
def forward(
self, x, pos_embed: Tensor | None = None, key_padding_mask: Tensor | None = None
) -> Tensor:
def forward(self, x, pos_embed: Tensor | None = None, key_padding_mask: Tensor | None = None) -> Tensor:
skip = x
if self.pre_norm:
x = self.norm1(x)
@@ -673,9 +607,7 @@ class ACTDecoder(nn.Module):
def __init__(self, config: ACTConfig):
"""Convenience module for running multiple decoder layers followed by normalization."""
super().__init__()
self.layers = nn.ModuleList(
[ACTDecoderLayer(config) for _ in range(config.n_decoder_layers)]
)
self.layers = nn.ModuleList([ACTDecoderLayer(config) for _ in range(config.n_decoder_layers)])
self.norm = nn.LayerNorm(config.dim_model)
def forward(
@@ -700,12 +632,8 @@ class ACTDecoder(nn.Module):
class ACTDecoderLayer(nn.Module):
def __init__(self, config: ACTConfig):
super().__init__()
self.self_attn = nn.MultiheadAttention(
config.dim_model, config.n_heads, dropout=config.dropout
)
self.multihead_attn = nn.MultiheadAttention(
config.dim_model, config.n_heads, dropout=config.dropout
)
self.self_attn = nn.MultiheadAttention(config.dim_model, config.n_heads, dropout=config.dropout)
self.multihead_attn = nn.MultiheadAttention(config.dim_model, config.n_heads, dropout=config.dropout)
# Feed forward layers.
self.linear1 = nn.Linear(config.dim_model, config.dim_feedforward)
@@ -746,9 +674,7 @@ class ACTDecoderLayer(nn.Module):
if self.pre_norm:
x = self.norm1(x)
q = k = self.maybe_add_pos_embed(x, decoder_pos_embed)
x = self.self_attn(q, k, value=x)[
0
] # select just the output, not the attention weights
x = self.self_attn(q, k, value=x)[0] # select just the output, not the attention weights
x = skip + self.dropout1(x)
if self.pre_norm:
skip = x
@@ -785,14 +711,9 @@ def create_sinusoidal_pos_embedding(num_positions: int, dimension: int) -> Tenso
"""
def get_position_angle_vec(position):
return [
position / np.power(10000, 2 * (hid_j // 2) / dimension)
for hid_j in range(dimension)
]
return [position / np.power(10000, 2 * (hid_j // 2) / dimension) for hid_j in range(dimension)]
sinusoid_table = np.array(
[get_position_angle_vec(pos_i) for pos_i in range(num_positions)]
)
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(num_positions)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
return torch.from_numpy(sinusoid_table).float()
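Written out, the table built here is the standard fixed sinusoidal encoding for position pos and embedding dimension d:

```latex
\mathrm{PE}(\mathrm{pos},\,2i)   = \sin\!\left(\mathrm{pos} / 10000^{2i/d}\right), \qquad
\mathrm{PE}(\mathrm{pos},\,2i+1) = \cos\!\left(\mathrm{pos} / 10000^{2i/d}\right)
```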
@@ -837,9 +758,7 @@ class ACTSinusoidalPositionEmbedding2d(nn.Module):
x_range = x_range / (x_range[:, :, -1:] + self._eps) * self._two_pi
inverse_frequency = self._temperature ** (
2
* (torch.arange(self.dimension, dtype=torch.float32, device=x.device) // 2)
/ self.dimension
2 * (torch.arange(self.dimension, dtype=torch.float32, device=x.device) // 2) / self.dimension
)
x_range = x_range.unsqueeze(-1) / inverse_frequency # (1, H, W, 1)
@@ -847,15 +766,9 @@ class ACTSinusoidalPositionEmbedding2d(nn.Module):
# Note: this stack then flatten operation results in interleaved sine and cosine terms.
# pos_embed_x and pos_embed_y are (1, H, W, C // 2).
pos_embed_x = torch.stack(
(x_range[..., 0::2].sin(), x_range[..., 1::2].cos()), dim=-1
).flatten(3)
pos_embed_y = torch.stack(
(y_range[..., 0::2].sin(), y_range[..., 1::2].cos()), dim=-1
).flatten(3)
pos_embed = torch.cat((pos_embed_y, pos_embed_x), dim=3).permute(
0, 3, 1, 2
) # (1, C, H, W)
pos_embed_x = torch.stack((x_range[..., 0::2].sin(), x_range[..., 1::2].cos()), dim=-1).flatten(3)
pos_embed_y = torch.stack((y_range[..., 0::2].sin(), y_range[..., 1::2].cos()), dim=-1).flatten(3)
pos_embed = torch.cat((pos_embed_y, pos_embed_x), dim=3).permute(0, 3, 1, 2) # (1, C, H, W)
return pos_embed

View File

@@ -205,16 +205,11 @@ class DiffusionConfig(PreTrainedConfig):
def validate_features(self) -> None:
if len(self.image_features) == 0 and self.env_state_feature is None:
raise ValueError(
"You must provide at least one image or the environment state among the inputs."
)
raise ValueError("You must provide at least one image or the environment state among the inputs.")
if self.crop_shape is not None:
for key, image_ft in self.image_features.items():
if (
self.crop_shape[0] > image_ft.shape[1]
or self.crop_shape[1] > image_ft.shape[2]
):
if self.crop_shape[0] > image_ft.shape[1] or self.crop_shape[1] > image_ft.shape[2]:
raise ValueError(
f"`crop_shape` should fit within the images shapes. Got {self.crop_shape} "
f"for `crop_shape` and {image_ft.shape} for "

View File

@@ -70,9 +70,7 @@ class DiffusionPolicy(PreTrainedPolicy):
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(
config.input_features, config.normalization_mapping, dataset_stats
)
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
@@ -99,9 +97,7 @@ class DiffusionPolicy(PreTrainedPolicy):
if self.config.image_features:
self._queues["observation.images"] = deque(maxlen=self.config.n_obs_steps)
if self.config.env_state_feature:
self._queues["observation.environment_state"] = deque(
maxlen=self.config.n_obs_steps
)
self._queues["observation.environment_state"] = deque(maxlen=self.config.n_obs_steps)
@torch.no_grad
def select_action(self, batch: dict[str, Tensor]) -> Tensor:
@@ -127,9 +123,7 @@ class DiffusionPolicy(PreTrainedPolicy):
"""
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = torch.stack(
[batch[key] for key in self.config.image_features], dim=-4
)
@@ -138,11 +132,7 @@ class DiffusionPolicy(PreTrainedPolicy):
if len(self._queues["action"]) == 0:
# stack n latest observations from the queue
batch = {
k: torch.stack(list(self._queues[k]), dim=1)
for k in batch
if k in self._queues
}
batch = {k: torch.stack(list(self._queues[k]), dim=1) for k in batch if k in self._queues}
actions = self.diffusion.generate_actions(batch)
# TODO(rcadene): make above methods return output dictionary?
@@ -157,9 +147,7 @@ class DiffusionPolicy(PreTrainedPolicy):
"""Run the batch through the model and compute the loss for training or validation."""
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = torch.stack(
[batch[key] for key in self.config.image_features], dim=-4
)
@@ -201,9 +189,7 @@ class DiffusionModel(nn.Module):
if self.config.env_state_feature:
global_cond_dim += self.config.env_state_feature.shape[0]
self.unet = DiffusionConditionalUnet1d(
config, global_cond_dim=global_cond_dim * config.n_obs_steps
)
self.unet = DiffusionConditionalUnet1d(config, global_cond_dim=global_cond_dim * config.n_obs_steps)
self.noise_scheduler = _make_noise_scheduler(
config.noise_scheduler_type,
@@ -249,9 +235,7 @@ class DiffusionModel(nn.Module):
global_cond=global_cond,
)
# Compute previous image: x_t -> x_t-1
sample = self.noise_scheduler.step(
model_output, t, sample, generator=generator
).prev_sample
sample = self.noise_scheduler.step(model_output, t, sample, generator=generator).prev_sample
return sample
@@ -263,15 +247,11 @@ class DiffusionModel(nn.Module):
if self.config.image_features:
if self.config.use_separate_rgb_encoder_per_camera:
# Combine batch and sequence dims while rearranging to make the camera index dimension first.
images_per_camera = einops.rearrange(
batch["observation.images"], "b s n ... -> n (b s) ..."
)
images_per_camera = einops.rearrange(batch["observation.images"], "b s n ... -> n (b s) ...")
img_features_list = torch.cat(
[
encoder(images)
for encoder, images in zip(
self.rgb_encoder, images_per_camera, strict=True
)
for encoder, images in zip(self.rgb_encoder, images_per_camera, strict=True)
]
)
# Separate batch and sequence dims back out. The camera index dim gets absorbed into the
@@ -285,9 +265,7 @@ class DiffusionModel(nn.Module):
else:
# Combine batch, sequence, and "which camera" dims before passing to shared encoder.
img_features = self.rgb_encoder(
einops.rearrange(
batch["observation.images"], "b s n ... -> (b s n) ..."
)
einops.rearrange(batch["observation.images"], "b s n ... -> (b s n) ...")
)
# Separate batch dim and sequence dim back out. The camera index dim gets absorbed into the
# feature dim (effectively concatenating the camera features).
@@ -381,9 +359,7 @@ class DiffusionModel(nn.Module):
elif self.config.prediction_type == "sample":
target = batch["action"]
else:
raise ValueError(
f"Unsupported prediction type {self.config.prediction_type}"
)
raise ValueError(f"Unsupported prediction type {self.config.prediction_type}")
loss = F.mse_loss(pred, target, reduction="none")
@@ -443,9 +419,7 @@ class SpatialSoftmax(nn.Module):
# we could use torch.linspace directly but that seems to behave slightly differently than numpy
# and causes a small degradation in pc_success of pre-trained models.
pos_x, pos_y = np.meshgrid(
np.linspace(-1.0, 1.0, self._in_w), np.linspace(-1.0, 1.0, self._in_h)
)
pos_x, pos_y = np.meshgrid(np.linspace(-1.0, 1.0, self._in_w), np.linspace(-1.0, 1.0, self._in_h))
pos_x = torch.from_numpy(pos_x.reshape(self._in_h * self._in_w, 1)).float()
pos_y = torch.from_numpy(pos_y.reshape(self._in_h * self._in_w, 1)).float()
# register as buffer so it's moved to the correct device.
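A rough sketch of how coordinate grids like pos_x/pos_y are used by a spatial softmax (the module's actual forward pass is outside this hunk, so the helper name and shapes below are assumptions): each channel is softmaxed over the H*W locations and reduced to its expected (x, y) coordinate.

```python
import torch
import torch.nn.functional as F

def spatial_softmax_keypoints(features: torch.Tensor, pos_x: torch.Tensor, pos_y: torch.Tensor) -> torch.Tensor:
    # features: (B, K, H, W); pos_x/pos_y: (H*W, 1) grids in [-1, 1] as built above
    b, k, h, w = features.shape
    attention = F.softmax(features.view(b * k, h * w), dim=-1)  # per-channel spatial heatmap
    expected_x = attention @ pos_x                              # (B*K, 1)
    expected_y = attention @ pos_y
    return torch.cat([expected_x, expected_y], dim=1).view(b, k, 2)
```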
@@ -487,9 +461,7 @@ class DiffusionRgbEncoder(nn.Module):
# Always use center crop for eval
self.center_crop = torchvision.transforms.CenterCrop(config.crop_shape)
if config.crop_is_random:
self.maybe_random_crop = torchvision.transforms.RandomCrop(
config.crop_shape
)
self.maybe_random_crop = torchvision.transforms.RandomCrop(config.crop_shape)
else:
self.maybe_random_crop = self.center_crop
else:
@@ -510,9 +482,7 @@ class DiffusionRgbEncoder(nn.Module):
self.backbone = _replace_submodules(
root_module=self.backbone,
predicate=lambda x: isinstance(x, nn.BatchNorm2d),
func=lambda x: nn.GroupNorm(
num_groups=x.num_features // 16, num_channels=x.num_features
),
func=lambda x: nn.GroupNorm(num_groups=x.num_features // 16, num_channels=x.num_features),
)
# Set up pooling and final layers.
@@ -523,15 +493,11 @@ class DiffusionRgbEncoder(nn.Module):
# Note: we have a check in the config class to make sure all images have the same shape.
images_shape = next(iter(config.image_features.values())).shape
dummy_shape_h_w = (
config.crop_shape if config.crop_shape is not None else images_shape[1:]
)
dummy_shape_h_w = config.crop_shape if config.crop_shape is not None else images_shape[1:]
dummy_shape = (1, images_shape[0], *dummy_shape_h_w)
feature_map_shape = get_output_shape(self.backbone, dummy_shape)[1:]
self.pool = SpatialSoftmax(
feature_map_shape, num_kp=config.spatial_softmax_num_keypoints
)
self.pool = SpatialSoftmax(feature_map_shape, num_kp=config.spatial_softmax_num_keypoints)
self.feature_dim = config.spatial_softmax_num_keypoints * 2
self.out = nn.Linear(config.spatial_softmax_num_keypoints * 2, self.feature_dim)
self.relu = nn.ReLU()
@@ -573,11 +539,7 @@ def _replace_submodules(
if predicate(root_module):
return func(root_module)
replace_list = [
k.split(".")
for k, m in root_module.named_modules(remove_duplicate=True)
if predicate(m)
]
replace_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
for *parents, k in replace_list:
parent_module = root_module
if len(parents) > 0:
@@ -592,9 +554,7 @@ def _replace_submodules(
else:
setattr(parent_module, k, tgt_module)
# verify that all BN are replaced
assert not any(
predicate(m) for _, m in root_module.named_modules(remove_duplicate=True)
)
assert not any(predicate(m) for _, m in root_module.named_modules(remove_duplicate=True))
return root_module
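A short usage sketch for this helper, mirroring the GroupNorm swap shown in the DiffusionRgbEncoder hunk above (the ResNet choice here is only illustrative):

```python
import torchvision
from torch import nn

backbone = torchvision.models.resnet18(weights=None)
backbone = _replace_submodules(
    root_module=backbone,
    predicate=lambda m: isinstance(m, nn.BatchNorm2d),
    func=lambda m: nn.GroupNorm(num_groups=m.num_features // 16, num_channels=m.num_features),
)
# the assert above then guarantees that no BatchNorm2d module remains
```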
@@ -622,9 +582,7 @@ class DiffusionConv1dBlock(nn.Module):
super().__init__()
self.block = nn.Sequential(
nn.Conv1d(
inp_channels, out_channels, kernel_size, padding=kernel_size // 2
),
nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2),
nn.GroupNorm(n_groups, out_channels),
nn.Mish(),
)
@@ -647,13 +605,9 @@ class DiffusionConditionalUnet1d(nn.Module):
# Encoder for the diffusion timestep.
self.diffusion_step_encoder = nn.Sequential(
DiffusionSinusoidalPosEmb(config.diffusion_step_embed_dim),
nn.Linear(
config.diffusion_step_embed_dim, config.diffusion_step_embed_dim * 4
),
nn.Linear(config.diffusion_step_embed_dim, config.diffusion_step_embed_dim * 4),
nn.Mish(),
nn.Linear(
config.diffusion_step_embed_dim * 4, config.diffusion_step_embed_dim
),
nn.Linear(config.diffusion_step_embed_dim * 4, config.diffusion_step_embed_dim),
)
# The FiLM conditioning dimension.
@@ -678,16 +632,10 @@ class DiffusionConditionalUnet1d(nn.Module):
self.down_modules.append(
nn.ModuleList(
[
DiffusionConditionalResidualBlock1d(
dim_in, dim_out, **common_res_block_kwargs
),
DiffusionConditionalResidualBlock1d(
dim_out, dim_out, **common_res_block_kwargs
),
DiffusionConditionalResidualBlock1d(dim_in, dim_out, **common_res_block_kwargs),
DiffusionConditionalResidualBlock1d(dim_out, dim_out, **common_res_block_kwargs),
# Downsample as long as it is not the last block.
nn.Conv1d(dim_out, dim_out, 3, 2, 1)
if not is_last
else nn.Identity(),
nn.Conv1d(dim_out, dim_out, 3, 2, 1) if not is_last else nn.Identity(),
]
)
)
@@ -716,24 +664,16 @@ class DiffusionConditionalUnet1d(nn.Module):
nn.ModuleList(
[
# dim_in * 2, because it takes the encoder's skip connection as well
DiffusionConditionalResidualBlock1d(
dim_in * 2, dim_out, **common_res_block_kwargs
),
DiffusionConditionalResidualBlock1d(
dim_out, dim_out, **common_res_block_kwargs
),
DiffusionConditionalResidualBlock1d(dim_in * 2, dim_out, **common_res_block_kwargs),
DiffusionConditionalResidualBlock1d(dim_out, dim_out, **common_res_block_kwargs),
# Upsample as long as it is not the last block.
nn.ConvTranspose1d(dim_out, dim_out, 4, 2, 1)
if not is_last
else nn.Identity(),
nn.ConvTranspose1d(dim_out, dim_out, 4, 2, 1) if not is_last else nn.Identity(),
]
)
)
self.final_conv = nn.Sequential(
DiffusionConv1dBlock(
config.down_dims[0], config.down_dims[0], kernel_size=config.kernel_size
),
DiffusionConv1dBlock(config.down_dims[0], config.down_dims[0], kernel_size=config.kernel_size),
nn.Conv1d(config.down_dims[0], config.action_feature.shape[0], 1),
)
@@ -801,23 +741,17 @@ class DiffusionConditionalResidualBlock1d(nn.Module):
self.use_film_scale_modulation = use_film_scale_modulation
self.out_channels = out_channels
self.conv1 = DiffusionConv1dBlock(
in_channels, out_channels, kernel_size, n_groups=n_groups
)
self.conv1 = DiffusionConv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups)
# FiLM modulation (https://arxiv.org/abs/1709.07871) outputs per-channel bias and (maybe) scale.
cond_channels = out_channels * 2 if use_film_scale_modulation else out_channels
self.cond_encoder = nn.Sequential(nn.Mish(), nn.Linear(cond_dim, cond_channels))
self.conv2 = DiffusionConv1dBlock(
out_channels, out_channels, kernel_size, n_groups=n_groups
)
self.conv2 = DiffusionConv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups)
# A final convolution for dimension matching the residual (if needed).
self.residual_conv = (
nn.Conv1d(in_channels, out_channels, 1)
if in_channels != out_channels
else nn.Identity()
nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()
)
def forward(self, x: Tensor, cond: Tensor) -> Tensor:
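A minimal FiLM sketch with assumed shapes (the block's actual forward pass is truncated in this hunk): the conditioning embedding is projected to per-channel parameters and applied as a scale and/or bias over the 1D conv features.

```python
import torch

def film(features: torch.Tensor, cond: torch.Tensor, use_scale: bool = True) -> torch.Tensor:
    # features: (B, C, T); cond: (B, 2*C) if use_scale else (B, C)
    if use_scale:
        scale, bias = cond.chunk(2, dim=-1)
        return features * scale.unsqueeze(-1) + bias.unsqueeze(-1)
    return features + cond.unsqueeze(-1)
```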

View File

@@ -104,9 +104,7 @@ def make_policy(
PreTrainedPolicy: _description_
"""
if bool(ds_meta) == bool(env_cfg):
raise ValueError(
"Either one of a dataset metadata or a sim env must be provided."
)
raise ValueError("Either one of a dataset metadata or a sim env must be provided.")
# NOTE: Currently, if you try to run vqbet with mps backend, you'll get this error.
# TODO(aliberts, rcadene): Implement a check_backend_compatibility in policies?
@@ -136,12 +134,8 @@ def make_policy(
)
features = env_to_policy_features(env_cfg)
cfg.output_features = {
key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION
}
cfg.input_features = {
key: ft for key, ft in features.items() if key not in cfg.output_features
}
cfg.output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
cfg.input_features = {key: ft for key, ft in features.items() if key not in cfg.output_features}
kwargs["config"] = cfg
if cfg.pretrained_path:

View File

@@ -7,9 +7,7 @@ from torch import Tensor, nn
from .configuration_classifier import ClassifierConfig
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
@@ -53,9 +51,7 @@ class Classifier(
super().__init__()
self.config = config
# self.processor = AutoImageProcessor.from_pretrained(self.config.model_name, trust_remote_code=True)
encoder = AutoModel.from_pretrained(
self.config.model_name, trust_remote_code=True
)
encoder = AutoModel.from_pretrained(self.config.model_name, trust_remote_code=True)
# Extract vision model if we're given a multimodal model
if hasattr(encoder, "vision_model"):
logging.info("Multimodal model detected - using vision encoder only")
@@ -81,9 +77,7 @@ class Classifier(
self.feature_dim = self.encoder.fc.in_features
self.encoder = nn.Sequential(*list(self.encoder.children())[:-1])
elif hasattr(self.encoder.config, "hidden_sizes"):
self.feature_dim = self.encoder.config.hidden_sizes[
-1
] # Last channel dimension
self.feature_dim = self.encoder.config.hidden_sizes[-1] # Last channel dimension
else:
raise ValueError("Unsupported CNN architecture")
@@ -103,9 +97,7 @@ class Classifier(
if hasattr(self.encoder.config, "hidden_size"):
input_dim = self.encoder.config.hidden_size
else:
raise ValueError(
"Unsupported transformer architecture since hidden_size is not found"
)
raise ValueError("Unsupported transformer architecture since hidden_size is not found")
self.classifier_head = nn.Sequential(
nn.Linear(input_dim * self.config.num_cameras, self.config.hidden_dim),
@@ -141,10 +133,7 @@ class Classifier(
return features
else: # Transformer models
outputs = self.encoder(processed)
if (
hasattr(outputs, "pooler_output")
and outputs.pooler_output is not None
):
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
return outputs.pooler_output
return outputs.last_hidden_state[:, 0, :]
@@ -160,9 +149,7 @@ class Classifier(
else:
probabilities = torch.softmax(logits, dim=-1)
return ClassifierOutput(
logits=logits, probabilities=probabilities, hidden_states=encoder_outputs
)
return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_outputs)
def predict_reward(self, x, threshold=0.6):
if self.config.num_classes == 2:

View File

@@ -82,43 +82,25 @@ def create_stats_buffers(
if stats:
if isinstance(stats[key]["mean"], np.ndarray):
if norm_mode is NormalizationMode.MEAN_STD:
buffer["mean"].data = torch.from_numpy(stats[key]["mean"]).to(
dtype=torch.float32
)
buffer["std"].data = torch.from_numpy(stats[key]["std"]).to(
dtype=torch.float32
)
buffer["mean"].data = torch.from_numpy(stats[key]["mean"]).to(dtype=torch.float32)
buffer["std"].data = torch.from_numpy(stats[key]["std"]).to(dtype=torch.float32)
elif norm_mode is NormalizationMode.MIN_MAX:
buffer["min"].data = torch.from_numpy(stats[key]["min"]).to(
dtype=torch.float32
)
buffer["max"].data = torch.from_numpy(stats[key]["max"]).to(
dtype=torch.float32
)
buffer["min"].data = torch.from_numpy(stats[key]["min"]).to(dtype=torch.float32)
buffer["max"].data = torch.from_numpy(stats[key]["max"]).to(dtype=torch.float32)
elif isinstance(stats[key]["mean"], torch.Tensor):
# Note: The clone is needed to make sure that the logic in save_pretrained doesn't see duplicated
# tensors anywhere (for example, when we use the same stats for normalization and
# unnormalization). See the logic here
# https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/py_src/safetensors/torch.py#L97.
if norm_mode is NormalizationMode.MEAN_STD:
buffer["mean"].data = (
stats[key]["mean"].clone().to(dtype=torch.float32)
)
buffer["std"].data = (
stats[key]["std"].clone().to(dtype=torch.float32)
)
buffer["mean"].data = stats[key]["mean"].clone().to(dtype=torch.float32)
buffer["std"].data = stats[key]["std"].clone().to(dtype=torch.float32)
elif norm_mode is NormalizationMode.MIN_MAX:
buffer["min"].data = (
stats[key]["min"].clone().to(dtype=torch.float32)
)
buffer["max"].data = (
stats[key]["max"].clone().to(dtype=torch.float32)
)
buffer["min"].data = stats[key]["min"].clone().to(dtype=torch.float32)
buffer["max"].data = stats[key]["max"].clone().to(dtype=torch.float32)
else:
type_ = type(stats[key]["mean"])
raise ValueError(
f"np.ndarray or torch.Tensor expected, but type is '{type_}' instead."
)
raise ValueError(f"np.ndarray or torch.Tensor expected, but type is '{type_}' instead.")
stats_buffers[key] = buffer
return stats_buffers
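A hedged sketch of how buffers like these are typically consumed downstream (the Normalize forward pass is not part of this diff, so the exact output range is an assumption): MEAN_STD standardizes, MIN_MAX rescales to a fixed interval.

```python
def apply_normalization(x, buffer, mode):
    # x: tensor; buffer: dict of stats tensors; mode: "mean_std" or "min_max"
    eps = 1e-8
    if mode == "mean_std":
        return (x - buffer["mean"]) / (buffer["std"] + eps)
    if mode == "min_max":
        x = (x - buffer["min"]) / (buffer["max"] - buffer["min"] + eps)
        return x * 2 - 1  # assumed [-1, 1] target range
    raise ValueError(f"Unknown normalization mode: {mode}")
```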

View File

@@ -44,9 +44,7 @@ def main():
else:
dataset_repo_id = "lerobot/aloha_sim_transfer_cube_human"
ckpt_torch_dir = (
Path.home() / f".cache/openpi/openpi-assets/checkpoints/{model_name}_pytorch"
)
ckpt_torch_dir = Path.home() / f".cache/openpi/openpi-assets/checkpoints/{model_name}_pytorch"
ckpt_jax_dir = Path.home() / f".cache/openpi/openpi-assets/checkpoints/{model_name}"
save_dir = Path(f"../openpi/data/{model_name}/save")
@@ -72,9 +70,7 @@ def main():
# Create LeRobot batch from Jax
batch = {}
for cam_key, uint_chw_array in example["images"].items():
batch[f"observation.images.{cam_key}"] = (
torch.from_numpy(uint_chw_array) / 255.0
)
batch[f"observation.images.{cam_key}"] = torch.from_numpy(uint_chw_array) / 255.0
batch["observation.state"] = torch.from_numpy(example["state"])
batch["action"] = torch.from_numpy(outputs["actions"])
batch["task"] = example["prompt"]

View File

@@ -54,9 +54,7 @@ def get_paligemma_config(precision: str):
"projector_hidden_act": "gelu_fast",
"vision_use_head": False,
}
final_config = PaliGemmaConfig(
text_config=text_config, vision_config=vision_config, **config
)
final_config = PaliGemmaConfig(text_config=text_config, vision_config=vision_config, **config)
return final_config

View File

@@ -322,9 +322,7 @@ def update_keys_with_prefix(d: dict, prefix: str) -> dict:
return {f"{prefix}{key}": value for key, value in d.items()}
def convert_pi0_checkpoint(
checkpoint_dir: str, precision: str, tokenizer_id: str, output_path: str
):
def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, tokenizer_id: str, output_path: str):
# Break down orbax ckpts - they are in OCDBT
initial_params = slice_initial_orbax_checkpoint(checkpoint_dir=checkpoint_dir)
# process projection params
@@ -384,9 +382,7 @@ def convert_pi0_checkpoint(
# gemma_config=gemma_config, paligemma_config=paligemma_config)
pi0_model = PI0Policy(pi0_config)
paligemma_params = update_keys_with_prefix(
paligemma_params, "model.paligemma_with_expert."
)
paligemma_params = update_keys_with_prefix(paligemma_params, "model.paligemma_with_expert.")
gemma_params = update_keys_with_prefix(gemma_params, "model.paligemma_with_expert.")
projection_params = update_keys_with_prefix(projection_params, "model.")

View File

@@ -193,9 +193,7 @@ def aloha_gripper_to_angular(value):
# This is the inverse of the angular to linear transformation inside the Interbotix code.
def linear_to_radian(linear_position, arm_length, horn_radius):
value = (horn_radius**2 + linear_position**2 - arm_length**2) / (
2 * horn_radius * linear_position
)
value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position)
return safe_arcsin(value)
# The constants are taken from the Interbotix code.
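In equation form, the one-liner above computes the law-of-cosines ratio for the triangle formed by the servo horn (radius r), the linkage (length L), and the linear slide position (p), then passes it through safe_arcsin:

```latex
\theta = \arcsin\!\left(\frac{r^{2} + p^{2} - L^{2}}{2\,r\,p}\right),
\qquad r = \texttt{horn\_radius},\quad L = \texttt{arm\_length},\quad p = \texttt{linear\_position}
```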
@@ -246,9 +244,7 @@ class PI0Policy(PreTrainedPolicy):
super().__init__(config)
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(
config.input_features, config.normalization_mapping, dataset_stats
)
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
@@ -256,9 +252,7 @@ class PI0Policy(PreTrainedPolicy):
config.output_features, config.normalization_mapping, dataset_stats
)
self.language_tokenizer = AutoTokenizer.from_pretrained(
"google/paligemma-3b-pt-224"
)
self.language_tokenizer = AutoTokenizer.from_pretrained("google/paligemma-3b-pt-224")
self.model = PI0FlowMatching(config)
self.reset()
@@ -271,9 +265,7 @@ class PI0Policy(PreTrainedPolicy):
return self.parameters()
@torch.no_grad
def select_action(
self, batch: dict[str, Tensor], noise: Tensor | None = None
) -> Tensor:
def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
"""Select a single action given environment observations.
This method wraps `select_actions` in order to return one action at a time for execution in the
@@ -312,9 +304,7 @@ class PI0Policy(PreTrainedPolicy):
self._action_queue.extend(actions.transpose(0, 1))
return self._action_queue.popleft()
def forward(
self, batch: dict[str, Tensor], noise=None, time=None
) -> tuple[Tensor, dict[str, Tensor]]:
def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> tuple[Tensor, dict[str, Tensor]]:
"""Do a full training forward pass to compute the loss"""
if self.config.adapt_to_pi_aloha:
batch[OBS_ROBOT] = self._pi_aloha_decode_state(batch[OBS_ROBOT])
@@ -330,9 +320,7 @@ class PI0Policy(PreTrainedPolicy):
actions_is_pad = batch.get("action_is_pad")
loss_dict = {}
losses = self.model.forward(
images, img_masks, lang_tokens, lang_masks, state, actions, noise, time
)
losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
loss_dict["losses_after_forward"] = losses.clone()
if actions_is_pad is not None:
@@ -359,9 +347,7 @@ class PI0Policy(PreTrainedPolicy):
img_masks = []
present_img_keys = [key for key in self.config.image_features if key in batch]
missing_img_keys = [
key for key in self.config.image_features if key not in batch
]
missing_img_keys = [key for key in self.config.image_features if key not in batch]
if len(present_img_keys) == 0:
raise ValueError(
@@ -373,9 +359,7 @@ class PI0Policy(PreTrainedPolicy):
img = batch[key]
if self.config.resize_imgs_with_padding is not None:
img = resize_with_pad(
img, *self.config.resize_imgs_with_padding, pad_value=0
)
img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0)
# Normalize from range [0,1] to [-1,1] as expected by siglip
img = img * 2.0 - 1.0
@@ -414,9 +398,7 @@ class PI0Policy(PreTrainedPolicy):
return_tensors="pt",
)
lang_tokens = tokenized_prompt["input_ids"].to(device=device)
lang_masks = tokenized_prompt["attention_mask"].to(
device=device, dtype=torch.bool
)
lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool)
return lang_tokens, lang_masks
@@ -435,9 +417,7 @@ class PI0Policy(PreTrainedPolicy):
actions[:, :, motor_idx] *= -1
# Reverse the gripper transformation that is being applied by the Aloha runtime.
for motor_idx in [6, 13]:
actions[:, :, motor_idx] = aloha_gripper_from_angular(
actions[:, :, motor_idx]
)
actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx])
return actions
def _pi_aloha_encode_actions_inv(self, actions):
@@ -446,9 +426,7 @@ class PI0Policy(PreTrainedPolicy):
actions[:, :, motor_idx] *= -1
# Reverse the gripper transformation that is being applied by the Aloha runtime.
for motor_idx in [6, 13]:
actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(
actions[:, :, motor_idx]
)
actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx])
return actions
def prepare_state(self, batch):
@@ -498,25 +476,15 @@ class PI0FlowMatching(nn.Module):
train_expert_only=self.config.train_expert_only,
attention_implementation=self.config.attention_implementation,
)
self.paligemma_with_expert = PaliGemmaWithExpertModel(
paligemma_with_export_config
)
self.paligemma_with_expert = PaliGemmaWithExpertModel(paligemma_with_export_config)
# Projections are float32
self.state_proj = nn.Linear(self.config.max_state_dim, self.config.proj_width)
self.action_in_proj = nn.Linear(
self.config.max_action_dim, self.config.proj_width
)
self.action_out_proj = nn.Linear(
self.config.proj_width, self.config.max_action_dim
)
self.action_in_proj = nn.Linear(self.config.max_action_dim, self.config.proj_width)
self.action_out_proj = nn.Linear(self.config.proj_width, self.config.max_action_dim)
self.action_time_mlp_in = nn.Linear(
self.config.proj_width * 2, self.config.proj_width
)
self.action_time_mlp_out = nn.Linear(
self.config.proj_width, self.config.proj_width
)
self.action_time_mlp_in = nn.Linear(self.config.proj_width * 2, self.config.proj_width)
self.action_time_mlp_out = nn.Linear(self.config.proj_width, self.config.proj_width)
self.set_requires_grad()
@@ -560,9 +528,7 @@ class PI0FlowMatching(nn.Module):
# Normalize image embeddings
img_emb_dim = img_emb.shape[-1]
img_emb = img_emb * torch.tensor(
img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device
)
img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device)
bsize, num_img_embs = img_emb.shape[:2]
img_mask = img_mask[:, None].expand(bsize, num_img_embs)
@@ -637,9 +603,7 @@ class PI0FlowMatching(nn.Module):
embs.append(action_time_emb)
bsize, action_time_dim = action_time_emb.shape[:2]
action_time_mask = torch.ones(
bsize, action_time_dim, dtype=torch.bool, device=device
)
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device)
pad_masks.append(action_time_mask)
# Set attention masks so that image, language and state inputs do not attend to action tokens
@@ -677,9 +641,7 @@ class PI0FlowMatching(nn.Module):
prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
images, img_masks, lang_tokens, lang_masks
)
suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(
state, x_t, time
)
suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time)
pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
@@ -703,9 +665,7 @@ class PI0FlowMatching(nn.Module):
losses = F.mse_loss(u_t, v_t, reduction="none")
return losses
def sample_actions(
self, images, img_masks, lang_tokens, lang_masks, state, noise=None
) -> Tensor:
def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor:
"""Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
bsize = state.shape[0]
device = state.device
@@ -763,16 +723,12 @@ class PI0FlowMatching(nn.Module):
timestep,
):
"""Apply one denoising step of the noise `x_t` at a given timestep."""
suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(
state, x_t, timestep
)
suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, timestep)
suffix_len = suffix_pad_masks.shape[1]
batch_size = prefix_pad_masks.shape[0]
prefix_len = prefix_pad_masks.shape[1]
prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(
batch_size, suffix_len, prefix_len
)
prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)

View File

@@ -39,13 +39,9 @@ def apply_rope(x, positions, max_wavelength=10_000):
dtype = x.dtype
x = x.to(torch.float32)
freq_exponents = (2.0 / x.shape[-1]) * torch.arange(
d_half, dtype=torch.float32, device=device
)
freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device)
timescale = max_wavelength**freq_exponents
radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(
torch.float32
)
radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32)
radians = radians[..., None, :]
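These lines set up the rotary position embedding (RoPE) angles; in math form, with head dimension d and i = 0, ..., d/2 - 1:

```latex
\theta_i(\mathrm{pos}) = \frac{\mathrm{pos}}{\mathrm{max\_wavelength}^{\,2i/d}}
```

Each (even, odd) pair of feature dimensions is then rotated by its angle; the exact pairing is implementation-specific and lies outside this hunk.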
@@ -178,9 +174,7 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
def __init__(self, config: PaliGemmaWithExpertConfig):
super().__init__(config=config)
self.config = config
self.paligemma = PaliGemmaForConditionalGeneration(
config=config.paligemma_config
)
self.paligemma = PaliGemmaForConditionalGeneration(config=config.paligemma_config)
self.gemma_expert = GemmaForCausalLM(config=config.gemma_expert_config)
# Remove unused embed_tokens
self.gemma_expert.model.embed_tokens = None
@@ -297,9 +291,7 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
# so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
# the max len, then we (for instance) double the cache size. This implementation already exists
# in `transformers`. (molbap)
key_states = torch.cat(
[past_key_values[layer_idx]["key_states"], key_states], dim=1
)
key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
value_states = torch.cat(
[past_key_values[layer_idx]["value_states"], value_states],
dim=1,
@@ -392,9 +384,7 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
value_states,
):
num_att_heads = self.config.paligemma_config.text_config.num_attention_heads
num_key_value_heads = (
self.config.paligemma_config.text_config.num_key_value_heads
)
num_key_value_heads = self.config.paligemma_config.text_config.num_key_value_heads
num_key_value_groups = num_att_heads // num_key_value_heads
# query_states: batch_size, sequence_length, num_att_head, head_dim
@@ -442,9 +432,7 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
att_weights *= head_dim**-0.5
big_neg = -2.3819763e38 # See gemma/modules.py
masked_att_weights = torch.where(
attention_mask[:, None, :, :], att_weights, big_neg
)
masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg)
probs = nn.functional.softmax(masked_att_weights, dim=-1)
probs = probs.to(dtype=value_states.dtype)
@@ -456,8 +444,6 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
att_output = att_output.permute(0, 2, 1, 3)
# we use -1 because sequence length can change
att_output = att_output.reshape(
batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim
)
att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim)
return att_output

View File

@@ -71,9 +71,7 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
def _save_pretrained(self, save_directory: Path) -> None:
self.config._save_pretrained(save_directory)
model_to_save = self.module if hasattr(self, "module") else self
save_model_as_safetensor(
model_to_save, str(save_directory / SAFETENSORS_SINGLE_FILE)
)
save_model_as_safetensor(model_to_save, str(save_directory / SAFETENSORS_SINGLE_FILE))
@classmethod
def from_pretrained(
@@ -112,9 +110,7 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
if os.path.isdir(model_id):
print("Loading weights from local directory")
model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
policy = cls._load_as_safetensor(
instance, model_file, config.device, strict
)
policy = cls._load_as_safetensor(instance, model_file, config.device, strict)
else:
try:
model_file = hf_hub_download(
@@ -128,9 +124,7 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
token=token,
local_files_only=local_files_only,
)
policy = cls._load_as_safetensor(
instance, model_file, config.device, strict
)
policy = cls._load_as_safetensor(instance, model_file, config.device, strict)
except HfHubHTTPError as e:
raise FileNotFoundError(
f"{SAFETENSORS_SINGLE_FILE} not found on the HuggingFace Hub in {model_id}"
@@ -141,12 +135,8 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
return policy
@classmethod
def _load_as_safetensor(
cls, model: T, model_file: str, map_location: str, strict: bool
) -> T:
if packaging.version.parse(safetensors.__version__) < packaging.version.parse(
"0.4.3"
):
def _load_as_safetensor(cls, model: T, model_file: str, map_location: str, strict: bool) -> T:
if packaging.version.parse(safetensors.__version__) < packaging.version.parse("0.4.3"):
load_model_as_safetensor(model, model_file, strict=strict)
if map_location != "cpu":
logging.warning(
@@ -157,9 +147,7 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
)
model.to(map_location)
else:
safetensors.torch.load_model(
model, model_file, strict=strict, device=map_location
)
safetensors.torch.load_model(model, model_file, strict=strict, device=map_location)
return model
# def generate_model_card(self, *args, **kwargs) -> ModelCard:

View File

@@ -48,9 +48,7 @@ class SACConfig:
"observation.state": {"min": [-1, -1, -1, -1], "max": [1, 1, 1, 1]},
}
)
output_normalization_modes: dict[str, str] = field(
default_factory=lambda: {"action": "min_max"}
)
output_normalization_modes: dict[str, str] = field(default_factory=lambda: {"action": "min_max"})
output_normalization_params: dict[str, dict[str, list[float]]] = field(
default_factory=lambda: {
"action": {"min": [-1, -1], "max": [1, 1]},

View File

@@ -18,8 +18,8 @@
# TODO: (1) better device management
import math
from typing import Callable, Optional, Tuple, Union, Dict, List
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple, Union
import einops
import numpy as np
@@ -124,17 +124,13 @@ class SACPolicy(
self.actor = Policy(
encoder=encoder_actor,
network=MLP(
input_dim=encoder_actor.output_dim, **config.actor_network_kwargs
),
network=MLP(input_dim=encoder_actor.output_dim, **config.actor_network_kwargs),
action_dim=config.output_shapes["action"][0],
encoder_is_shared=config.shared_encoder,
**config.policy_kwargs,
)
if config.target_entropy is None:
config.target_entropy = (
-np.prod(config.output_shapes["action"][0]) / 2
) # (-dim(A)/2)
config.target_entropy = -np.prod(config.output_shapes["action"][0]) / 2 # (-dim(A)/2)
# TODO (azouitine): Handle the case where the temperature parameter is fixed
# TODO (michel-aractingi): Put the log_alpha in cuda by default because otherwise
@@ -146,10 +142,11 @@ class SACPolicy(
def _save_pretrained(self, save_directory):
"""Custom save method to handle TensorDict properly"""
import os
import json
import os
from dataclasses import asdict
from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE, CONFIG_NAME
from huggingface_hub.constants import CONFIG_NAME, SAFETENSORS_SINGLE_FILE
from safetensors.torch import save_model
save_model(self, os.path.join(save_directory, SAFETENSORS_SINGLE_FILE))
@@ -177,12 +174,14 @@ class SACPolicy(
**model_kwargs,
) -> "SACPolicy":
"""Custom load method to handle loading SAC policy from saved files"""
import os
import json
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE, CONFIG_NAME
from huggingface_hub.constants import CONFIG_NAME, SAFETENSORS_SINGLE_FILE
from safetensors.torch import load_model
from lerobot.common.policies.sac.configuration_sac import SACConfig
# Check if model_id is a local path or a hub model ID
@@ -302,14 +301,10 @@ class SACPolicy(
) -> Tensor:
self.temperature = self.log_alpha.exp().item()
with torch.no_grad():
next_action_preds, next_log_probs, _ = self.actor(
next_observations, next_observation_features
)
next_action_preds, next_log_probs, _ = self.actor(next_observations, next_observation_features)
# TODO: (maractingi, azouitine) This is too slow, we should find a way to do this more efficiently
next_action_preds = self.unnormalize_outputs({"action": next_action_preds})[
"action"
]
next_action_preds = self.unnormalize_outputs({"action": next_action_preds})["action"]
# 2- compute q targets
q_targets = self.critic_forward(
@@ -353,21 +348,15 @@ class SACPolicy(
).sum()
return critics_loss
def compute_loss_temperature(
self, observations, observation_features: Tensor | None = None
) -> Tensor:
def compute_loss_temperature(self, observations, observation_features: Tensor | None = None) -> Tensor:
"""Compute the temperature loss"""
# calculate temperature loss
with torch.no_grad():
_, log_probs, _ = self.actor(observations, observation_features)
temperature_loss = (
-self.log_alpha.exp() * (log_probs + self.config.target_entropy)
).mean()
temperature_loss = (-self.log_alpha.exp() * (log_probs + self.config.target_entropy)).mean()
return temperature_loss
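This matches the automatic entropy-temperature objective from SAC: with target entropy H_bar (config.target_entropy) and alpha = exp(log_alpha),

```latex
J(\alpha) = \mathbb{E}_{a \sim \pi}\Big[-\alpha\,\big(\log \pi(a \mid s) + \bar{\mathcal{H}}\big)\Big]
```

so descending this loss in log_alpha raises the temperature when the policy's entropy falls below the target and lowers it otherwise.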
def compute_loss_actor(
self, observations, observation_features: Tensor | None = None
) -> Tensor:
def compute_loss_actor(self, observations, observation_features: Tensor | None = None) -> Tensor:
self.temperature = self.log_alpha.exp().item()
actions_pi, log_probs, _ = self.actor(observations, observation_features)
@@ -408,11 +397,7 @@ class MLP(nn.Module):
if dropout_rate is not None and dropout_rate > 0:
layers.append(nn.Dropout(p=dropout_rate))
layers.append(nn.LayerNorm(hidden_dims[0]))
layers.append(
activations
if isinstance(activations, nn.Module)
else getattr(nn, activations)()
)
layers.append(activations if isinstance(activations, nn.Module) else getattr(nn, activations)())
# Rest of the layers
for i in range(1, len(hidden_dims)):
@@ -424,11 +409,7 @@ class MLP(nn.Module):
layers.append(nn.LayerNorm(hidden_dims[i]))
# If we're at the final layer and a final activation is specified, use it
if (
i + 1 == len(hidden_dims)
and activate_final
and final_activation is not None
):
if i + 1 == len(hidden_dims) and activate_final and final_activation is not None:
layers.append(
final_activation
if isinstance(final_activation, nn.Module)
@@ -436,9 +417,7 @@ class MLP(nn.Module):
)
else:
layers.append(
activations
if isinstance(activations, nn.Module)
else getattr(nn, activations)()
activations if isinstance(activations, nn.Module) else getattr(nn, activations)()
)
self.net = nn.Sequential(*layers)
@@ -639,15 +618,11 @@ class Policy(nn.Module):
# Compute standard deviations
if self.fixed_std is None:
log_std = self.std_layer(outputs)
assert not torch.isnan(log_std).any(), (
"[ERROR] log_std became NaN after std_layer!"
)
assert not torch.isnan(log_std).any(), "[ERROR] log_std became NaN after std_layer!"
if self.use_tanh_squash:
log_std = torch.tanh(log_std)
log_std = self.log_std_min + 0.5 * (
self.log_std_max - self.log_std_min
) * (log_std + 1.0)
log_std = self.log_std_min + 0.5 * (self.log_std_max - self.log_std_min) * (log_std + 1.0)
else:
log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
else:
@@ -660,9 +635,7 @@ class Policy(nn.Module):
if self.use_tanh_squash:
actions = torch.tanh(x_t)
log_probs -= torch.log(
(1 - actions.pow(2)) + 1e-6
) # Adjust log-probs for Tanh
log_probs -= torch.log((1 - actions.pow(2)) + 1e-6) # Adjust log-probs for Tanh
else:
actions = x_t # No Tanh; raw Gaussian sample
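The subtraction above is the change-of-variables correction for tanh squashing (with 1e-6 added for numerical stability): for a = tanh(u) with u the raw Gaussian sample,

```latex
\log \pi(a \mid s) = \log \mu(u \mid s) - \sum_{j}\log\!\big(1 - \tanh^{2}(u_j)\big)
```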
@@ -709,9 +682,7 @@ class SACObservationEncoder(nn.Module):
freeze_image_encoder(self.image_enc_layers)
else:
self.parameters_to_optimize += list(self.image_enc_layers.parameters())
self.all_image_keys = [
k for k in config.input_shapes if k.startswith("observation.image")
]
self.all_image_keys = [k for k in config.input_shapes if k.startswith("observation.image")]
if "observation.state" in config.input_shapes:
self.state_enc_layers = nn.Sequential(
@@ -738,9 +709,7 @@ class SACObservationEncoder(nn.Module):
self.aggregation_size += config.latent_dim
self.parameters_to_optimize += list(self.env_state_enc_layers.parameters())
self.aggregation_layer = nn.Linear(
in_features=self.aggregation_size, out_features=config.latent_dim
)
self.aggregation_layer = nn.Linear(in_features=self.aggregation_size, out_features=config.latent_dim)
self.parameters_to_optimize += list(self.aggregation_layer.parameters())
def forward(self, obs_dict: dict[str, Tensor]) -> Tensor:
@@ -753,19 +722,13 @@ class SACObservationEncoder(nn.Module):
obs_dict = self.input_normalization(obs_dict)
# Batch all images along the batch dimension, then encode them.
if len(self.all_image_keys) > 0:
images_batched = torch.cat(
[obs_dict[key] for key in self.all_image_keys], dim=0
)
images_batched = torch.cat([obs_dict[key] for key in self.all_image_keys], dim=0)
images_batched = self.image_enc_layers(images_batched)
embeddings_chunks = torch.chunk(
images_batched, dim=0, chunks=len(self.all_image_keys)
)
embeddings_chunks = torch.chunk(images_batched, dim=0, chunks=len(self.all_image_keys))
feat.extend(embeddings_chunks)
if "observation.environment_state" in self.config.input_shapes:
feat.append(
self.env_state_enc_layers(obs_dict["observation.environment_state"])
)
feat.append(self.env_state_enc_layers(obs_dict["observation.environment_state"]))
if "observation.state" in self.config.input_shapes:
feat.append(self.state_enc_layers(obs_dict["observation.state"]))
@@ -833,9 +796,7 @@ class PretrainedImageEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.image_enc_layers, self.image_enc_out_shape = (
self._load_pretrained_vision_encoder(config)
)
self.image_enc_layers, self.image_enc_out_shape = self._load_pretrained_vision_encoder(config)
self.image_enc_proj = nn.Sequential(
nn.Linear(np.prod(self.image_enc_out_shape), config.latent_dim),
nn.LayerNorm(config.latent_dim),
@@ -846,21 +807,15 @@ class PretrainedImageEncoder(nn.Module):
"""Set up CNN encoder"""
from transformers import AutoModel
self.image_enc_layers = AutoModel.from_pretrained(
config.vision_encoder_name, trust_remote_code=True
)
self.image_enc_layers = AutoModel.from_pretrained(config.vision_encoder_name, trust_remote_code=True)
# self.image_enc_layers.pooler = Identity()
if hasattr(self.image_enc_layers.config, "hidden_sizes"):
self.image_enc_out_shape = self.image_enc_layers.config.hidden_sizes[
-1
] # Last channel dimension
self.image_enc_out_shape = self.image_enc_layers.config.hidden_sizes[-1] # Last channel dimension
elif hasattr(self.image_enc_layers, "fc"):
self.image_enc_out_shape = self.image_enc_layers.fc.in_features
else:
raise ValueError(
"Unsupported vision encoder architecture, make sure you are using a CNN"
)
raise ValueError("Unsupported vision encoder architecture, make sure you are using a CNN")
return self.image_enc_layers, self.image_enc_out_shape
def forward(self, x):
@@ -896,9 +851,7 @@ def _convert_normalization_params_to_tensor(normalization_params: dict) -> dict:
for key, value in inner_dict.items():
converted_params[outer_key][key] = torch.tensor(value)
if "image" in outer_key:
converted_params[outer_key][key] = converted_params[outer_key][
key
].view(3, 1, 1)
converted_params[outer_key][key] = converted_params[outer_key][key].view(3, 1, 1)
return converted_params
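# A hedged usage sketch (assumes the helper above is in scope; stats values are illustrative):
# the (3, 1, 1) reshape lets per-channel image statistics broadcast over (C, H, W) images.
import torch

stats = _convert_normalization_params_to_tensor(
    {"observation.image": {"mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}}
)
img = torch.rand(3, 224, 224)
normalized = (img - stats["observation.image"]["mean"]) / stats["observation.image"]["std"]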

View File

@@ -183,13 +183,9 @@ class TDMPCConfig(PreTrainedConfig):
"If `n_action_steps > 1`, `n_action_repeats` must be left to its default value of 1."
)
if not self.use_mpc:
raise ValueError(
"If `n_action_steps > 1`, `use_mpc` must be set to `True`."
)
raise ValueError("If `n_action_steps > 1`, `use_mpc` must be set to `True`.")
if self.n_action_steps > self.horizon:
raise ValueError(
"`n_action_steps` must be less than or equal to `horizon`."
)
raise ValueError("`n_action_steps` must be less than or equal to `horizon`.")
def get_optimizer_preset(self) -> AdamConfig:
return AdamConfig(lr=self.optimizer_lr)
@@ -209,9 +205,7 @@ class TDMPCConfig(PreTrainedConfig):
if image_ft.shape[-2] != image_ft.shape[-1]:
# TODO(alexander-soare): This limitation is solely because of code in the random shift
# augmentation. It should be able to be removed.
raise ValueError(
f"Only square images are handled now. Got image shape {image_ft.shape}."
)
raise ValueError(f"Only square images are handled now. Got image shape {image_ft.shape}.")
@property
def observation_delta_indices(self) -> list:

View File

@@ -83,9 +83,7 @@ class TDMPCPolicy(PreTrainedPolicy):
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(
config.input_features, config.normalization_mapping, dataset_stats
)
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
@@ -110,9 +108,7 @@ class TDMPCPolicy(PreTrainedPolicy):
"""
self._queues = {
"observation.state": deque(maxlen=1),
"action": deque(
maxlen=max(self.config.n_action_steps, self.config.n_action_repeats)
),
"action": deque(maxlen=max(self.config.n_action_steps, self.config.n_action_repeats)),
}
if self.config.image_features:
self._queues["observation.image"] = deque(maxlen=1)
@@ -127,9 +123,7 @@ class TDMPCPolicy(PreTrainedPolicy):
"""Select a single action given environment observations."""
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.image"] = batch[next(iter(self.config.image_features))]
self._queues = populate_queues(self._queues, batch)
@@ -232,47 +226,35 @@ class TDMPCPolicy(PreTrainedPolicy):
self.config.action_feature.shape[0],
device=std.device,
)
gaussian_actions = torch.clamp(
mean.unsqueeze(1) + std.unsqueeze(1) * std_normal_noise, -1, 1
)
gaussian_actions = torch.clamp(mean.unsqueeze(1) + std.unsqueeze(1) * std_normal_noise, -1, 1)
# Compute elite actions.
actions = torch.cat([gaussian_actions, pi_actions], dim=1)
value = self.estimate_value(z, actions).nan_to_num_(0)
elite_idxs = torch.topk(
value, self.config.n_elites, dim=0
).indices # (n_elites, batch)
elite_idxs = torch.topk(value, self.config.n_elites, dim=0).indices # (n_elites, batch)
elite_value = value.take_along_dim(elite_idxs, dim=0) # (n_elites, batch)
# (horizon, n_elites, batch, action_dim)
elite_actions = actions.take_along_dim(
einops.rearrange(elite_idxs, "n b -> 1 n b 1"), dim=1
)
elite_actions = actions.take_along_dim(einops.rearrange(elite_idxs, "n b -> 1 n b 1"), dim=1)
# Update gaussian PDF parameters to be the (weighted) mean and standard deviation of the elites.
max_value = elite_value.max(0, keepdim=True)[0] # (1, batch)
# The weighting is a softmax over trajectory values. Note that this is not the same as the usage
# of Ω in eqn 4 of the TD-MPC paper. Instead it is the normalized version of it: s = Ω/ΣΩ. This
# makes the equations: μ = Σ(s⋅Γ), σ = Σ(s⋅(Γ-μ)²).
score = torch.exp(
self.config.elite_weighting_temperature * (elite_value - max_value)
)
score = torch.exp(self.config.elite_weighting_temperature * (elite_value - max_value))
score /= score.sum(axis=0, keepdim=True)
# (horizon, batch, action_dim)
_mean = torch.sum(
einops.rearrange(score, "n b -> n b 1") * elite_actions, dim=1
)
_mean = torch.sum(einops.rearrange(score, "n b -> n b 1") * elite_actions, dim=1)
_std = torch.sqrt(
torch.sum(
einops.rearrange(score, "n b -> n b 1")
* (elite_actions - einops.rearrange(_mean, "h b d -> h 1 b d"))
** 2,
* (elite_actions - einops.rearrange(_mean, "h b d -> h 1 b d")) ** 2,
dim=1,
)
)
# Update mean with an exponential moving average, and std with a direct replacement.
mean = (
self.config.gaussian_mean_momentum * mean
+ (1 - self.config.gaussian_mean_momentum) * _mean
self.config.gaussian_mean_momentum * mean + (1 - self.config.gaussian_mean_momentum) * _mean
)
std = _std.clamp_(self.config.min_std, self.config.max_std)
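# A hedged, self-contained sketch of the elite weighting above with toy shapes (illustration only):
# s = softmax(temperature * value) over elites, mu = sum(s * actions), sigma = sqrt(sum(s * (actions - mu)^2)).
import torch

temperature = 0.5
elite_value = torch.randn(6, 2)                          # (n_elites, batch)
elite_actions = torch.randn(5, 6, 2, 4)                  # (horizon, n_elites, batch, action_dim)
score = torch.softmax(temperature * elite_value, dim=0)  # (n_elites, batch), sums to 1 over elites
weights = score[None, :, :, None]                        # broadcast over horizon and action_dim
mu = (weights * elite_actions).sum(dim=1)                # (horizon, batch, action_dim)
sigma = ((weights * (elite_actions - mu.unsqueeze(1)) ** 2).sum(dim=1)).sqrt()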
@@ -281,9 +263,7 @@ class TDMPCPolicy(PreTrainedPolicy):
# Randomly select one of the elite actions from the last iteration of MPPI/CEM using the softmax
# scores from the last iteration.
actions = elite_actions[
:, torch.multinomial(score.T, 1).squeeze(), torch.arange(batch_size)
]
actions = elite_actions[:, torch.multinomial(score.T, 1).squeeze(), torch.arange(batch_size)]
return actions
@@ -306,8 +286,7 @@ class TDMPCPolicy(PreTrainedPolicy):
# of the FOWM paper.
if self.config.uncertainty_regularizer_coeff > 0:
regularization = -(
self.config.uncertainty_regularizer_coeff
* self.model.Qs(z, actions[t]).std(0)
self.config.uncertainty_regularizer_coeff * self.model.Qs(z, actions[t]).std(0)
)
else:
regularization = 0
@@ -328,9 +307,7 @@ class TDMPCPolicy(PreTrainedPolicy):
G += (
running_discount
* torch.min(
terminal_values[
torch.randint(0, self.config.q_ensemble_size, size=(2,))
],
terminal_values[torch.randint(0, self.config.q_ensemble_size, size=(2,))],
dim=0,
)[0]
)
@@ -338,11 +315,7 @@ class TDMPCPolicy(PreTrainedPolicy):
G += running_discount * torch.min(terminal_values, dim=0)[0]
# Finally, also regularize the terminal value.
if self.config.uncertainty_regularizer_coeff > 0:
G -= (
running_discount
* self.config.uncertainty_regularizer_coeff
* terminal_values.std(0)
)
G -= running_discount * self.config.uncertainty_regularizer_coeff * terminal_values.std(0)
return G
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
@@ -354,9 +327,7 @@ class TDMPCPolicy(PreTrainedPolicy):
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.image"] = batch[next(iter(self.config.image_features))]
batch = self.normalize_targets(batch)
@@ -388,29 +359,21 @@ class TDMPCPolicy(PreTrainedPolicy):
current_observation[k] = observations[k][0]
next_observations[k] = observations[k][1:]
horizon, batch_size = next_observations[
"observation.image"
if self.config.image_features
else "observation.environment_state"
"observation.image" if self.config.image_features else "observation.environment_state"
].shape[:2]
# Run latent rollout using the latent dynamics model and policy model.
# Note this has shape `horizon+1` because there are `horizon` actions and a current `z`. Each action
# gives us a next `z`.
batch_size = batch["index"].shape[0]
z_preds = torch.empty(
horizon + 1, batch_size, self.config.latent_dim, device=device
)
z_preds = torch.empty(horizon + 1, batch_size, self.config.latent_dim, device=device)
z_preds[0] = self.model.encode(current_observation)
reward_preds = torch.empty_like(reward, device=device)
for t in range(horizon):
z_preds[t + 1], reward_preds[t] = self.model.latent_dynamics_and_reward(
z_preds[t], action[t]
)
z_preds[t + 1], reward_preds[t] = self.model.latent_dynamics_and_reward(z_preds[t], action[t])
# Compute Q and V value predictions based on the latent rollout.
q_preds_ensemble = self.model.Qs(
z_preds[:-1], action
) # (ensemble, horizon, batch)
q_preds_ensemble = self.model.Qs(z_preds[:-1], action) # (ensemble, horizon, batch)
v_preds = self.model.V(z_preds[:-1])
info.update({"Q": q_preds_ensemble.mean().item(), "V": v_preds.mean().item()})
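# A hedged sketch of the latent rollout loop above, with toy stand-in modules (illustration only):
import torch
import torch.nn as nn

latent_dim, action_dim, horizon, batch_size = 8, 2, 5, 3
dynamics = nn.Linear(latent_dim + action_dim, latent_dim)  # stand-in for the latent dynamics model
reward_head = nn.Linear(latent_dim + action_dim, 1)        # stand-in for the reward model

z = torch.zeros(horizon + 1, batch_size, latent_dim)
rewards = torch.zeros(horizon, batch_size)
actions = torch.randn(horizon, batch_size, action_dim)
z[0] = torch.randn(batch_size, latent_dim)                 # would be encode(current_observation)
for t in range(horizon):
    za = torch.cat([z[t], actions[t]], dim=-1)
    z[t + 1] = dynamics(za)                                 # each action gives the next latent state
    rewards[t] = reward_head(za).squeeze(-1)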
@@ -424,14 +387,10 @@ class TDMPCPolicy(PreTrainedPolicy):
# actions (not actions estimated by π).
# Note: Here we do not use self.model_target, but self.model. This is to follow the original code
# and the FOWM paper.
q_targets = reward + self.config.discount * self.model.V(
self.model.encode(next_observations)
)
q_targets = reward + self.config.discount * self.model.V(self.model.encode(next_observations))
# From eqn 3 of FOWM. These appear as Q(z, a). Here we call them v_targets to emphasize that we
# are using them to compute loss for V.
v_targets = self.model_target.Qs(
z_preds[:-1].detach(), action, return_min=True
)
v_targets = self.model_target.Qs(z_preds[:-1].detach(), action, return_min=True)
# Compute losses.
# Exponentially decay the loss weight with respect to the timestep. Steps that are more distant in the
@@ -474,9 +433,7 @@ class TDMPCPolicy(PreTrainedPolicy):
temporal_loss_coeffs
* F.mse_loss(
q_preds_ensemble,
einops.repeat(
q_targets, "t b -> e t b", e=q_preds_ensemble.shape[0]
),
einops.repeat(q_targets, "t b -> e t b", e=q_preds_ensemble.shape[0]),
reduction="none",
).sum(0) # sum over ensemble
# `q_preds_ensemble` depends on the first observation and the actions.
@@ -514,14 +471,12 @@ class TDMPCPolicy(PreTrainedPolicy):
z_preds = z_preds.detach()
# Use stopgrad for the advantage calculation.
with torch.no_grad():
advantage = self.model_target.Qs(
z_preds[:-1], action, return_min=True
) - self.model.V(z_preds[:-1])
advantage = self.model_target.Qs(z_preds[:-1], action, return_min=True) - self.model.V(
z_preds[:-1]
)
info["advantage"] = advantage[0]
# (t, b)
exp_advantage = torch.clamp(
torch.exp(advantage * self.config.advantage_scaling), max=100.0
)
exp_advantage = torch.clamp(torch.exp(advantage * self.config.advantage_scaling), max=100.0)
action_preds = self.model.pi(z_preds[:-1]) # (t, b, a)
# Calculate the MSE between the actions and the action predictions.
# Note: FOWM's original code calculates the log probability (wrt to a unit standard deviation
@@ -575,9 +530,7 @@ class TDMPCPolicy(PreTrainedPolicy):
# Note a minor variation with respect to the original FOWM code. Here they do this based on an EMA
# update frequency parameter which is set to 2 (every 2 steps an update is done). To simplify the code
# we update every step and adjust the decay parameter `alpha` accordingly (0.99 -> 0.995)
update_ema_parameters(
self.model_target, self.model, self.config.target_model_momentum
)
update_ema_parameters(self.model_target, self.model, self.config.target_model_momentum)
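# A hedged sketch of the EMA update above (illustration only). Updating every step with
# alpha ~= 0.995 matches updating every 2 steps with alpha = 0.99, since 0.995 ** 2 ~= 0.99.
import torch
import torch.nn as nn

@torch.no_grad()
def ema_update(target: nn.Module, online: nn.Module, alpha: float) -> None:
    for p_target, p in zip(target.parameters(), online.parameters()):
        p_target.mul_(alpha).add_(p.data, alpha=1 - alpha)

online_net = nn.Linear(4, 4)
target_net = nn.Linear(4, 4)
target_net.load_state_dict(online_net.state_dict())
ema_update(target_net, online_net, alpha=0.995)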
class TDMPCTOLD(nn.Module):
@@ -588,9 +541,7 @@ class TDMPCTOLD(nn.Module):
self.config = config
self._encoder = TDMPCObservationEncoder(config)
self._dynamics = nn.Sequential(
nn.Linear(
config.latent_dim + config.action_feature.shape[0], config.mlp_dim
),
nn.Linear(config.latent_dim + config.action_feature.shape[0], config.mlp_dim),
nn.LayerNorm(config.mlp_dim),
nn.Mish(),
nn.Linear(config.mlp_dim, config.mlp_dim),
@@ -601,9 +552,7 @@ class TDMPCTOLD(nn.Module):
nn.Sigmoid(),
)
self._reward = nn.Sequential(
nn.Linear(
config.latent_dim + config.action_feature.shape[0], config.mlp_dim
),
nn.Linear(config.latent_dim + config.action_feature.shape[0], config.mlp_dim),
nn.LayerNorm(config.mlp_dim),
nn.Mish(),
nn.Linear(config.mlp_dim, config.mlp_dim),
@@ -671,9 +620,7 @@ class TDMPCTOLD(nn.Module):
"Sanity check. The last linear layer needs 0 initialization on weights."
)
nn.init.zeros_(m[-1].weight)
nn.init.zeros_(
m[-1].bias
) # this has already been done, but keep this line here for good measure
nn.init.zeros_(m[-1].bias) # this has already been done, but keep this line here for good measure
def encode(self, obs: dict[str, Tensor]) -> Tensor:
"""Encodes an observation into its latent representation."""
@@ -812,9 +759,7 @@ class TDMPCObservationEncoder(nn.Module):
if config.robot_state_feature:
self.state_enc_layers = nn.Sequential(
nn.Linear(
config.robot_state_feature.shape[0], config.state_encoder_hidden_dim
),
nn.Linear(config.robot_state_feature.shape[0], config.state_encoder_hidden_dim),
nn.ELU(),
nn.Linear(config.state_encoder_hidden_dim, config.latent_dim),
nn.LayerNorm(config.latent_dim),
@@ -823,9 +768,7 @@ class TDMPCObservationEncoder(nn.Module):
if config.env_state_feature:
self.env_state_enc_layers = nn.Sequential(
nn.Linear(
config.env_state_feature.shape[0], config.state_encoder_hidden_dim
),
nn.Linear(config.env_state_feature.shape[0], config.state_encoder_hidden_dim),
nn.ELU(),
nn.Linear(config.state_encoder_hidden_dim, config.latent_dim),
nn.LayerNorm(config.latent_dim),
@@ -898,10 +841,7 @@ def update_ema_parameters(ema_net: nn.Module, net: nn.Module, alpha: float):
assert n_p_ema == n_p, "Parameter names don't match for EMA model update"
if isinstance(p, dict):
raise RuntimeError("Dict parameter not supported")
if (
isinstance(module, nn.modules.batchnorm._BatchNorm)
or not p.requires_grad
):
if isinstance(module, nn.modules.batchnorm._BatchNorm) or not p.requires_grad:
# Copy BatchNorm parameters, and non-trainable parameters directly.
p_ema.copy_(p.to(dtype=p_ema.dtype).data)
with torch.no_grad():
@@ -909,9 +849,7 @@ def update_ema_parameters(ema_net: nn.Module, net: nn.Module, alpha: float):
p_ema.add_(p.to(dtype=p_ema.dtype).data, alpha=1 - alpha)
def flatten_forward_unflatten(
fn: Callable[[Tensor], Tensor], image_tensor: Tensor
) -> Tensor:
def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tensor) -> Tensor:
"""Helper to temporarily flatten extra dims at the start of the image tensor.
Args:

View File

@@ -172,10 +172,7 @@ class VQBeTConfig(PreTrainedConfig):
if self.crop_shape is not None:
for key, image_ft in self.image_features.items():
if (
self.crop_shape[0] > image_ft.shape[1]
or self.crop_shape[1] > image_ft.shape[2]
):
if self.crop_shape[0] > image_ft.shape[1] or self.crop_shape[1] > image_ft.shape[2]:
raise ValueError(
f"`crop_shape` should fit within the images shapes. Got {self.crop_shape} "
f"for `crop_shape` and {image_ft.shape} for "

View File

@@ -64,9 +64,7 @@ class VQBeTPolicy(PreTrainedPolicy):
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(
config.input_features, config.normalization_mapping, dataset_stats
)
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
@@ -97,17 +95,11 @@ class VQBeTPolicy(PreTrainedPolicy):
if self.config.sequentially_select:
decay_params = (
decay_params
+ list(
self.vqbet.action_head.map_to_cbet_preds_primary_bin.parameters()
)
+ list(
self.vqbet.action_head.map_to_cbet_preds_secondary_bin.parameters()
)
+ list(self.vqbet.action_head.map_to_cbet_preds_primary_bin.parameters())
+ list(self.vqbet.action_head.map_to_cbet_preds_secondary_bin.parameters())
)
else:
decay_params = decay_params + list(
self.vqbet.action_head.map_to_cbet_preds_bin.parameters()
)
decay_params = decay_params + list(self.vqbet.action_head.map_to_cbet_preds_bin.parameters())
return [
{
@@ -145,12 +137,8 @@ class VQBeTPolicy(PreTrainedPolicy):
"""
batch = self.normalize_inputs(batch)
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = torch.stack(
[batch[key] for key in self.config.image_features], dim=-4
)
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
# Note: It's important that this happens after stacking the images into a single key.
self._queues = populate_queues(self._queues, batch)
@@ -161,14 +149,8 @@ class VQBeTPolicy(PreTrainedPolicy):
)
if len(self._queues["action"]) == 0:
batch = {
k: torch.stack(list(self._queues[k]), dim=1)
for k in batch
if k in self._queues
}
actions = self.vqbet(batch, rollout=True)[
:, : self.config.action_chunk_size
]
batch = {k: torch.stack(list(self._queues[k]), dim=1) for k in batch if k in self._queues}
actions = self.vqbet(batch, rollout=True)[:, : self.config.action_chunk_size]
# the dimension of returned action is (batch_size, action_chunk_size, action_dim)
actions = self.unnormalize_outputs({"action": actions})["action"]
@@ -181,12 +163,8 @@ class VQBeTPolicy(PreTrainedPolicy):
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
"""Run the batch through the model and compute the loss for training or validation."""
batch = self.normalize_inputs(batch)
batch = dict(
batch
) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = torch.stack(
[batch[key] for key in self.config.image_features], dim=-4
)
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
batch = self.normalize_targets(batch)
        # VQ-BeT discretizes actions using a VQ-VAE before training BeT (please refer to section 3.2 of the VQ-BeT paper https://arxiv.org/pdf/2403.03181)
if not self.vqbet.action_head.vqvae_model.discretized.item():
@@ -194,9 +172,7 @@ class VQBeTPolicy(PreTrainedPolicy):
            # n_different_codes: how many of the total possible VQ codes are used in a single batch (how many of them have at least one encoder embedding as a nearest neighbor). This can be at most `vqvae_n_embed * number of layers of RVQ (=2)`.
            # n_different_combinations: how many different code combinations are used, out of all possible combinations, in a single batch. This can be at most `vqvae_n_embed ^ number of layers of RVQ (=2)` (hint: consider the RVQ as a decision tree).
loss, n_different_codes, n_different_combinations, recon_l1_error = (
self.vqbet.action_head.discretize(
self.config.n_vqvae_training_steps, batch["action"]
)
self.vqbet.action_head.discretize(self.config.n_vqvae_training_steps, batch["action"])
)
return loss, {
"n_different_codes": n_different_codes,
@@ -253,9 +229,7 @@ class SpatialSoftmax(nn.Module):
# we could use torch.linspace directly but that seems to behave slightly differently than numpy
# and causes a small degradation in pc_success of pre-trained models.
pos_x, pos_y = np.meshgrid(
np.linspace(-1.0, 1.0, self._in_w), np.linspace(-1.0, 1.0, self._in_h)
)
pos_x, pos_y = np.meshgrid(np.linspace(-1.0, 1.0, self._in_w), np.linspace(-1.0, 1.0, self._in_h))
pos_x = torch.from_numpy(pos_x.reshape(self._in_h * self._in_w, 1)).float()
pos_y = torch.from_numpy(pos_y.reshape(self._in_h * self._in_w, 1)).float()
# register as buffer so it's moved to the correct device.
@@ -370,12 +344,7 @@ class VQBeTModel(nn.Module):
num_tokens = self.config.n_action_pred_token + self.config.n_obs_steps - 1
self.register_buffer(
"select_target_actions_indices",
torch.row_stack(
[
torch.arange(i, i + self.config.action_chunk_size)
for i in range(num_tokens)
]
),
torch.row_stack([torch.arange(i, i + self.config.action_chunk_size) for i in range(num_tokens)]),
)
def forward(self, batch: dict[str, Tensor], rollout: bool) -> tuple[dict, dict]:
@@ -406,19 +375,13 @@ class VQBeTModel(nn.Module):
input_tokens.append(
self.state_projector(batch["observation.state"])
) # (batch, obs_step, projection dims)
input_tokens.append(
einops.repeat(
self.action_token, "1 1 d -> b n d", b=batch_size, n=n_obs_steps
)
)
input_tokens.append(einops.repeat(self.action_token, "1 1 d -> b n d", b=batch_size, n=n_obs_steps))
# Interleave tokens by stacking and rearranging.
input_tokens = torch.stack(input_tokens, dim=2)
input_tokens = einops.rearrange(input_tokens, "b n t d -> b (n t) d")
len_additional_action_token = self.config.n_action_pred_token - 1
future_action_tokens = self.action_token.repeat(
batch_size, len_additional_action_token, 1
)
future_action_tokens = self.action_token.repeat(batch_size, len_additional_action_token, 1)
# add additional action query tokens for predicting future action chunks
input_tokens = torch.cat([input_tokens, future_action_tokens], dim=1)
@@ -427,9 +390,9 @@ class VQBeTModel(nn.Module):
features = self.policy(input_tokens)
# len(self.config.input_features) is the number of different observation modes.
# this line gets the index of action prompt tokens.
historical_act_pred_index = np.arange(0, n_obs_steps) * (
len(self.config.input_features) + 1
) + len(self.config.input_features)
historical_act_pred_index = np.arange(0, n_obs_steps) * (len(self.config.input_features) + 1) + len(
self.config.input_features
)
        # only extract the output tokens at the positions of the action query tokens:
# Behavior Transformer (BeT), and VQ-BeT are both sequence-to-sequence prediction models,
@@ -449,15 +412,13 @@ class VQBeTModel(nn.Module):
action_head_output = self.action_head(features)
        # if rollout, VQ-BeT doesn't calculate the loss
if rollout:
return action_head_output["predicted_action"][
:, n_obs_steps - 1, :
].reshape(batch_size, self.config.action_chunk_size, -1)
return action_head_output["predicted_action"][:, n_obs_steps - 1, :].reshape(
batch_size, self.config.action_chunk_size, -1
)
        # else, it calculates the overall loss (bin prediction loss and offset loss)
else:
output = batch["action"][:, self.select_target_actions_indices]
loss = self.action_head.loss_fn(
action_head_output, output, reduction="mean"
)
loss = self.action_head.loss_fn(action_head_output, output, reduction="mean")
return action_head_output, loss
@@ -492,9 +453,7 @@ class VQBeTHead(nn.Module):
else:
self.map_to_cbet_preds_bin = MLP(
in_channels=config.gpt_output_dim,
hidden_channels=[
self.vqvae_model.vqvae_num_layers * self.config.vqvae_n_embed
],
hidden_channels=[self.vqvae_model.vqvae_num_layers * self.config.vqvae_n_embed],
)
self.map_to_cbet_preds_offset = MLP(
in_channels=config.gpt_output_dim,
@@ -521,10 +480,7 @@ class VQBeTHead(nn.Module):
loss, metric = self.vqvae_model.vqvae_forward(actions)
n_different_codes = sum(
[
len(torch.unique(metric[2][:, i]))
for i in range(self.vqvae_model.vqvae_num_layers)
]
[len(torch.unique(metric[2][:, i])) for i in range(self.vqvae_model.vqvae_num_layers)]
)
n_different_combinations = len(torch.unique(metric[2], dim=0))
recon_l1_error = metric[0].detach().cpu().item()
@@ -585,18 +541,12 @@ class VQBeTHead(nn.Module):
cbet_secondary_logits / self.config.bet_softmax_temperature, dim=-1
)
sampled_secondary_centers = einops.rearrange(
torch.multinomial(
cbet_secondary_probs.view(-1, choices), num_samples=1
),
torch.multinomial(cbet_secondary_probs.view(-1, choices), num_samples=1),
"(NT) 1 -> NT",
NT=NT,
)
sampled_centers = torch.stack(
(sampled_primary_centers, sampled_secondary_centers), axis=1
)
cbet_logits = torch.stack(
[cbet_primary_logits, cbet_secondary_logits], dim=1
)
sampled_centers = torch.stack((sampled_primary_centers, sampled_secondary_centers), axis=1)
cbet_logits = torch.stack([cbet_primary_logits, cbet_secondary_logits], dim=1)
# if self.config.sequentially_select is False, bin prediction head samples primary and secondary code at once.
else:
cbet_logits = self.map_to_cbet_preds_bin(x)
@@ -605,9 +555,7 @@ class VQBeTHead(nn.Module):
"(NT) (G C) -> (NT) G C",
G=self.vqvae_model.vqvae_num_layers,
)
cbet_probs = torch.softmax(
cbet_logits / self.config.bet_softmax_temperature, dim=-1
)
cbet_probs = torch.softmax(cbet_logits / self.config.bet_softmax_temperature, dim=-1)
NT, G, choices = cbet_probs.shape
sampled_centers = einops.rearrange(
torch.multinomial(cbet_probs.view(-1, choices), num_samples=1),
@@ -627,17 +575,9 @@ class VQBeTHead(nn.Module):
sampled_offsets = sampled_offsets.sum(dim=1)
with torch.no_grad():
# Get the centroids (= vectors corresponding to the codes) of each layer to pass it through RVQ decoder
return_decoder_input = (
self.vqvae_model.get_embeddings_from_code(sampled_centers)
.clone()
.detach()
)
return_decoder_input = self.vqvae_model.get_embeddings_from_code(sampled_centers).clone().detach()
# pass the centroids through decoder to get actions.
decoded_action = (
self.vqvae_model.get_action_from_latent(return_decoder_input)
.clone()
.detach()
)
decoded_action = self.vqvae_model.get_action_from_latent(return_decoder_input).clone().detach()
# reshaped extracted offset to match with decoded centroids
sampled_offsets = einops.rearrange(
sampled_offsets, "NT (W A) -> NT W A", W=self.config.action_chunk_size
@@ -686,9 +626,7 @@ class VQBeTHead(nn.Module):
# Figure out the loss for the actions.
# First, we need to find the closest cluster center for each ground truth action.
with torch.no_grad():
state_vq, action_bins = self.vqvae_model.get_code(
action_seq
) # action_bins: NT, G
state_vq, action_bins = self.vqvae_model.get_code(action_seq) # action_bins: NT, G
# Now we can compute the loss.
@@ -711,12 +649,8 @@ class VQBeTHead(nn.Module):
+ cbet_loss2 * self.config.secondary_code_loss_weight
)
equal_primary_code_rate = torch.sum(
(action_bins[:, 0] == sampled_centers[:, 0]).int()
) / (NT)
equal_secondary_code_rate = torch.sum(
(action_bins[:, 1] == sampled_centers[:, 1]).int()
) / (NT)
equal_primary_code_rate = torch.sum((action_bins[:, 0] == sampled_centers[:, 0]).int()) / (NT)
equal_secondary_code_rate = torch.sum((action_bins[:, 1] == sampled_centers[:, 1]).int()) / (NT)
action_mse_error = torch.mean((action_seq - predicted_action) ** 2)
vq_action_error = torch.mean(torch.abs(action_seq - decoded_action))
@@ -730,9 +664,7 @@ class VQBeTHead(nn.Module):
"classification_loss": cbet_loss.detach().cpu().item(),
"offset_loss": offset_loss.detach().cpu().item(),
"equal_primary_code_rate": equal_primary_code_rate.detach().cpu().item(),
"equal_secondary_code_rate": equal_secondary_code_rate.detach()
.cpu()
.item(),
"equal_secondary_code_rate": equal_secondary_code_rate.detach().cpu().item(),
"vq_action_error": vq_action_error.detach().cpu().item(),
"offset_action_error": offset_action_error.detach().cpu().item(),
"action_error_max": action_error_max.detach().cpu().item(),
@@ -757,9 +689,7 @@ class VQBeTRgbEncoder(nn.Module):
# Always use center crop for eval
self.center_crop = torchvision.transforms.CenterCrop(config.crop_shape)
if config.crop_is_random:
self.maybe_random_crop = torchvision.transforms.RandomCrop(
config.crop_shape
)
self.maybe_random_crop = torchvision.transforms.RandomCrop(config.crop_shape)
else:
self.maybe_random_crop = self.center_crop
else:
@@ -780,9 +710,7 @@ class VQBeTRgbEncoder(nn.Module):
self.backbone = _replace_submodules(
root_module=self.backbone,
predicate=lambda x: isinstance(x, nn.BatchNorm2d),
func=lambda x: nn.GroupNorm(
num_groups=x.num_features // 16, num_channels=x.num_features
),
func=lambda x: nn.GroupNorm(num_groups=x.num_features // 16, num_channels=x.num_features),
)
# Set up pooling and final layers.
@@ -792,15 +720,11 @@ class VQBeTRgbEncoder(nn.Module):
# height and width from `config.image_features`.
images_shape = next(iter(config.image_features.values())).shape
dummy_shape_h_w = (
config.crop_shape if config.crop_shape is not None else images_shape[1:]
)
dummy_shape_h_w = config.crop_shape if config.crop_shape is not None else images_shape[1:]
dummy_shape = (1, images_shape[0], *dummy_shape_h_w)
feature_map_shape = get_output_shape(self.backbone, dummy_shape)[1:]
self.pool = SpatialSoftmax(
feature_map_shape, num_kp=config.spatial_softmax_num_keypoints
)
self.pool = SpatialSoftmax(feature_map_shape, num_kp=config.spatial_softmax_num_keypoints)
self.feature_dim = config.spatial_softmax_num_keypoints * 2
self.out = nn.Linear(config.spatial_softmax_num_keypoints * 2, self.feature_dim)
self.relu = nn.ReLU()
@@ -842,11 +766,7 @@ def _replace_submodules(
if predicate(root_module):
return func(root_module)
replace_list = [
k.split(".")
for k, m in root_module.named_modules(remove_duplicate=True)
if predicate(m)
]
replace_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
for *parents, k in replace_list:
parent_module = root_module
if len(parents) > 0:
@@ -861,9 +781,7 @@ def _replace_submodules(
else:
setattr(parent_module, k, tgt_module)
# verify that all BN are replaced
assert not any(
predicate(m) for _, m in root_module.named_modules(remove_duplicate=True)
)
assert not any(predicate(m) for _, m in root_module.named_modules(remove_duplicate=True))
return root_module
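# A hedged usage sketch of the helper above (assumes torchvision is available; mirrors the
# GroupNorm swap used by the RGB encoder earlier in this file):
import torch.nn as nn
import torchvision

backbone = torchvision.models.resnet18(weights=None)
backbone = _replace_submodules(
    root_module=backbone,
    predicate=lambda m: isinstance(m, nn.BatchNorm2d),
    func=lambda m: nn.GroupNorm(num_groups=m.num_features // 16, num_channels=m.num_features),
)
assert not any(isinstance(m, nn.BatchNorm2d) for m in backbone.modules())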
@@ -896,8 +814,7 @@ class VqVae(nn.Module):
)
self.encoder = MLP(
in_channels=self.config.action_feature.shape[0]
* self.config.action_chunk_size,
in_channels=self.config.action_feature.shape[0] * self.config.action_chunk_size,
hidden_channels=[
config.vqvae_enc_hidden_dim,
config.vqvae_enc_hidden_dim,
@@ -925,13 +842,9 @@ class VqVae(nn.Module):
# given latent vector, this function outputs the decoded action.
output = self.decoder(latent)
if self.config.action_chunk_size == 1:
return einops.rearrange(
output, "N (T A) -> N T A", A=self.config.action_feature.shape[0]
)
return einops.rearrange(output, "N (T A) -> N T A", A=self.config.action_feature.shape[0])
else:
return einops.rearrange(
output, "N (T A) -> N T A", A=self.config.action_feature.shape[0]
)
return einops.rearrange(output, "N (T A) -> N T A", A=self.config.action_feature.shape[0])
def get_code(self, state):
# in phase 2 of VQ-BeT training, we need a `ground truth labels of action data` to calculate the Focal loss for code prediction head. (please refer to section 3.3 in the paper https://arxiv.org/pdf/2403.03181)

View File

@@ -123,15 +123,9 @@ class CausalSelfAttention(nn.Module):
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.gpt_hidden_dim, dim=2)
k = k.view(B, T, self.gpt_n_head, C // self.gpt_n_head).transpose(
1, 2
) # (B, nh, T, hs)
q = q.view(B, T, self.gpt_n_head, C // self.gpt_n_head).transpose(
1, 2
) # (B, nh, T, hs)
v = v.view(B, T, self.gpt_n_head, C // self.gpt_n_head).transpose(
1, 2
) # (B, nh, T, hs)
k = k.view(B, T, self.gpt_n_head, C // self.gpt_n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.gpt_n_head, C // self.gpt_n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.gpt_n_head, C // self.gpt_n_head).transpose(1, 2) # (B, nh, T, hs)
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
@@ -139,9 +133,7 @@ class CausalSelfAttention(nn.Module):
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = (
y.transpose(1, 2).contiguous().view(B, T, C)
) # re-assemble all head outputs side by side
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
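# A hedged, toy-shape sketch of the attention math above (illustration only). The causal mask
# applied between the score and softmax steps is shown explicitly here.
import math
import torch
import torch.nn.functional as F

B, n_head, T, head_size = 2, 4, 8, 16
q, k, v = (torch.randn(B, n_head, T, head_size) for _ in range(3))

att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))    # (B, nh, T, T)
causal_mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
att = att.masked_fill(~causal_mask, float("-inf"))                  # each position attends only to the past
att = F.softmax(att, dim=-1)
y = att @ v                                                          # (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, n_head * head_size)   # re-assemble heads side by side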
@@ -197,16 +189,12 @@ class GPT(nn.Module):
"ln_f": nn.LayerNorm(config.gpt_hidden_dim),
}
)
self.lm_head = nn.Linear(
config.gpt_hidden_dim, config.gpt_output_dim, bias=False
)
self.lm_head = nn.Linear(config.gpt_hidden_dim, config.gpt_output_dim, bias=False)
# init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
self.apply(self._init_weights)
for pn, p in self.named_parameters():
if pn.endswith("c_proj.weight"):
torch.nn.init.normal_(
p, mean=0.0, std=0.02 / math.sqrt(2 * config.gpt_n_layer)
)
torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.gpt_n_layer))
# report number of parameters
n_params = sum(p.numel() for p in self.parameters())
@@ -220,17 +208,11 @@ class GPT(nn.Module):
)
# positional encodings that are added to the input embeddings
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(
0
) # shape (1, t)
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
# forward the GPT model itself
tok_emb = self.transformer.wte(
input
) # token embeddings of shape (b, t, gpt_hidden_dim)
pos_emb = self.transformer.wpe(
pos
) # position embeddings of shape (1, t, gpt_hidden_dim)
tok_emb = self.transformer.wte(input) # token embeddings of shape (b, t, gpt_hidden_dim)
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, gpt_hidden_dim)
x = self.transformer.drop(tok_emb + pos_emb)
for block in self.transformer.h:
x = block(x)
@@ -255,9 +237,7 @@ class GPT(nn.Module):
# but want to use a smaller block size for some smaller, simpler model
assert gpt_block_size <= self.config.gpt_block_size
self.config.gpt_block_size = gpt_block_size
self.transformer.wpe.weight = nn.Parameter(
self.transformer.wpe.weight[:gpt_block_size]
)
self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:gpt_block_size])
for block in self.transformer.h:
block.attn.bias = block.attn.bias[:, :, :gpt_block_size, :gpt_block_size]
@@ -290,10 +270,8 @@ class GPT(nn.Module):
param_dict = dict(self.named_parameters())
inter_params = decay & no_decay
union_params = decay | no_decay
assert len(inter_params) == 0, (
"parameters {} made it into both decay/no_decay sets!".format(
str(inter_params)
)
assert len(inter_params) == 0, "parameters {} made it into both decay/no_decay sets!".format(
str(inter_params)
)
assert len(param_dict.keys() - union_params) == 0, (
"parameters {} were not separated into either decay/no_decay set!".format(
@@ -390,12 +368,8 @@ class ResidualVQ(nn.Module):
codebook_input_dim = codebook_dim * heads
requires_projection = codebook_input_dim != dim
self.project_in = (
nn.Linear(dim, codebook_input_dim) if requires_projection else nn.Identity()
)
self.project_out = (
nn.Linear(codebook_input_dim, dim) if requires_projection else nn.Identity()
)
self.project_in = nn.Linear(dim, codebook_input_dim) if requires_projection else nn.Identity()
self.project_out = nn.Linear(codebook_input_dim, dim) if requires_projection else nn.Identity()
self.num_quantizers = num_quantizers
@@ -477,9 +451,7 @@ class ResidualVQ(nn.Module):
return all_codes
def forward(
self, x, indices=None, return_all_codes=False, sample_codebook_temp=None
):
def forward(self, x, indices=None, return_all_codes=False, sample_codebook_temp=None):
"""
For given input tensor x, this function will return the quantized output, the indices of the quantized output, and the loss.
First, the input tensor x is projected to the codebook dimension. Then, the input tensor x is passed through Nq layers of VectorQuantize.
@@ -508,17 +480,13 @@ class ResidualVQ(nn.Module):
)
ce_losses = []
should_quantize_dropout = (
self.training and self.quantize_dropout and not return_loss
)
should_quantize_dropout = self.training and self.quantize_dropout and not return_loss
# sample a layer index at which to dropout further residual quantization
# also prepare null indices and loss
if should_quantize_dropout:
rand_quantize_dropout_index = randrange(
self.quantize_dropout_cutoff_index, num_quant
)
rand_quantize_dropout_index = randrange(self.quantize_dropout_cutoff_index, num_quant)
if quant_dropout_multiple_of != 1:
rand_quantize_dropout_index = (
@@ -527,23 +495,14 @@ class ResidualVQ(nn.Module):
- 1
)
null_indices_shape = (
(x.shape[0], *x.shape[-2:])
if self.accept_image_fmap
else tuple(x.shape[:2])
)
null_indices = torch.full(
null_indices_shape, -1.0, device=device, dtype=torch.long
)
null_indices_shape = (x.shape[0], *x.shape[-2:]) if self.accept_image_fmap else tuple(x.shape[:2])
null_indices = torch.full(null_indices_shape, -1.0, device=device, dtype=torch.long)
null_loss = torch.full((1,), 0.0, device=device, dtype=x.dtype)
# go through the layers
for quantizer_index, layer in enumerate(self.layers):
if (
should_quantize_dropout
and quantizer_index > rand_quantize_dropout_index
):
if should_quantize_dropout and quantizer_index > rand_quantize_dropout_index:
all_indices.append(null_indices)
all_losses.append(null_loss)
continue
@@ -583,9 +542,7 @@ class ResidualVQ(nn.Module):
# stack all losses and indices
all_losses, all_indices = map(
partial(torch.stack, dim=-1), (all_losses, all_indices)
)
all_losses, all_indices = map(partial(torch.stack, dim=-1), (all_losses, all_indices))
ret = (quantized_out, all_indices, all_losses)
@@ -645,12 +602,8 @@ class VectorQuantize(nn.Module):
codebook_input_dim = codebook_dim * heads
requires_projection = codebook_input_dim != dim
self.project_in = (
nn.Linear(dim, codebook_input_dim) if requires_projection else nn.Identity()
)
self.project_out = (
nn.Linear(codebook_input_dim, dim) if requires_projection else nn.Identity()
)
self.project_in = nn.Linear(dim, codebook_input_dim) if requires_projection else nn.Identity()
self.project_out = nn.Linear(codebook_input_dim, dim) if requires_projection else nn.Identity()
self.eps = eps
self.commitment_weight = commitment_weight
@@ -664,14 +617,10 @@ class VectorQuantize(nn.Module):
self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
assert not (ema_update and learnable_codebook), (
"learnable codebook not compatible with EMA update"
)
assert not (ema_update and learnable_codebook), "learnable codebook not compatible with EMA update"
assert 0 <= sync_update_v <= 1.0
assert not (sync_update_v > 0.0 and not learnable_codebook), (
"learnable codebook must be turned on"
)
assert not (sync_update_v > 0.0 and not learnable_codebook), "learnable codebook must be turned on"
self.sync_update_v = sync_update_v
@@ -683,9 +632,7 @@ class VectorQuantize(nn.Module):
)
if sync_codebook is None:
sync_codebook = (
distributed.is_initialized() and distributed.get_world_size() > 1
)
sync_codebook = distributed.is_initialized() and distributed.get_world_size() > 1
codebook_kwargs = {
"dim": codebook_dim,
@@ -850,17 +797,11 @@ class VectorQuantize(nn.Module):
# quantize again
quantize, embed_ind, distances = self._codebook(
x, **codebook_forward_kwargs
)
quantize, embed_ind, distances = self._codebook(x, **codebook_forward_kwargs)
if self.training:
# determine code to use for commitment loss
maybe_detach = (
torch.detach
if not self.learnable_codebook or freeze_codebook
else identity
)
maybe_detach = torch.detach if not self.learnable_codebook or freeze_codebook else identity
commit_quantize = maybe_detach(quantize)
@@ -870,9 +811,7 @@ class VectorQuantize(nn.Module):
if self.sync_update_v > 0.0:
# (21) in https://minyoungg.github.io/vqtorch/assets/draft_050523.pdf
quantize = quantize + self.sync_update_v * (
quantize - quantize.detach()
)
quantize = quantize + self.sync_update_v * (quantize - quantize.detach())
# function for calculating cross entropy loss to distance matrix
# used for (1) naturalspeech2 training residual vq latents to be close to the correct codes and (2) cross-entropy based commitment loss
@@ -905,9 +844,7 @@ class VectorQuantize(nn.Module):
embed_ind = rearrange(embed_ind, "1 (b h) n -> b n h", h=heads)
if self.accept_image_fmap:
embed_ind = rearrange(
embed_ind, "b (h w) ... -> b h w ...", h=height, w=width
)
embed_ind = rearrange(embed_ind, "b (h w) ... -> b h w ...", h=height, w=width)
if only_one:
embed_ind = rearrange(embed_ind, "b 1 -> b")
@@ -961,12 +898,8 @@ class VectorQuantize(nn.Module):
num_codes = codebook.shape[-2]
if (
self.orthogonal_reg_max_codes is not None
) and num_codes > self.orthogonal_reg_max_codes:
rand_ids = torch.randperm(num_codes, device=device)[
: self.orthogonal_reg_max_codes
]
if (self.orthogonal_reg_max_codes is not None) and num_codes > self.orthogonal_reg_max_codes:
rand_ids = torch.randperm(num_codes, device=device)[: self.orthogonal_reg_max_codes]
codebook = codebook[:, rand_ids]
orthogonal_reg_loss = orthogonal_loss_fn(codebook)
@@ -998,9 +931,7 @@ class VectorQuantize(nn.Module):
        # if masking, only return quantized values where the mask is True
if mask is not None:
quantize = torch.where(
rearrange(mask, "... -> ... 1"), quantize, orig_input
)
quantize = torch.where(rearrange(mask, "... -> ... 1"), quantize, orig_input)
return quantize, embed_ind, loss
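# A hedged, minimal sketch of the quantization pattern above (toy codebook, illustration only):
# nearest-code lookup, a commitment loss against the detached codes, and a straight-through
# estimator so gradients flow back to the encoder output.
import torch
import torch.nn.functional as F

def toy_vector_quantize(x: torch.Tensor, codebook: torch.Tensor, commitment_weight: float = 0.25):
    dist = torch.cdist(x, codebook)              # (batch, num_codes)
    embed_ind = dist.argmin(dim=-1)              # (batch,)
    quantized = codebook[embed_ind]              # (batch, dim)
    loss = commitment_weight * F.mse_loss(x, quantized.detach())
    quantized = x + (quantized - x).detach()     # straight-through gradient to x
    return quantized, embed_ind, loss

x = torch.randn(8, 16, requires_grad=True)
codebook = torch.randn(32, 16)
quantized, embed_ind, loss = toy_vector_quantize(x, codebook)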
@@ -1110,9 +1041,7 @@ def sample_vectors(samples, num):
def batched_sample_vectors(samples, num):
return torch.stack(
[sample_vectors(sample, num) for sample in samples.unbind(dim=0)], dim=0
)
return torch.stack([sample_vectors(sample, num) for sample in samples.unbind(dim=0)], dim=0)
def pad_shape(shape, size, dim=0):
@@ -1163,9 +1092,7 @@ def sample_vectors_distributed(local_samples, num):
all_num_samples = all_gather_sizes(local_samples, dim=0)
if rank == 0:
samples_per_rank = sample_multinomial(
num, all_num_samples / all_num_samples.sum()
)
samples_per_rank = sample_multinomial(num, all_num_samples / all_num_samples.sum())
else:
samples_per_rank = torch.empty_like(all_num_samples)
@@ -1278,9 +1205,7 @@ class EuclideanCodebook(nn.Module):
self.eps = eps
self.threshold_ema_dead_code = threshold_ema_dead_code
self.reset_cluster_size = (
reset_cluster_size
if (reset_cluster_size is not None)
else threshold_ema_dead_code
reset_cluster_size if (reset_cluster_size is not None) else threshold_ema_dead_code
)
assert callable(gumbel_sample)
@@ -1291,14 +1216,8 @@ class EuclideanCodebook(nn.Module):
"kmeans init is not compatible with multiple codebooks in distributed environment for now"
)
self.sample_fn = (
sample_vectors_distributed
if use_ddp and sync_kmeans
else batched_sample_vectors
)
self.kmeans_all_reduce_fn = (
distributed.all_reduce if use_ddp and sync_kmeans else noop
)
self.sample_fn = sample_vectors_distributed if use_ddp and sync_kmeans else batched_sample_vectors
self.kmeans_all_reduce_fn = distributed.all_reduce if use_ddp and sync_kmeans else noop
self.all_reduce_fn = distributed.all_reduce if use_ddp else noop
self.register_buffer("initted", torch.Tensor([not kmeans_init]))
@@ -1437,9 +1356,7 @@ class EuclideanCodebook(nn.Module):
distributed.all_reduce(variance_number)
batch_variance = variance_number / num_vectors
self.update_with_decay(
"batch_variance", batch_variance, self.affine_param_batch_decay
)
self.update_with_decay("batch_variance", batch_variance, self.affine_param_batch_decay)
def replace(self, batch_samples, batch_mask):
for ind, (samples, mask) in enumerate(
@@ -1448,9 +1365,7 @@ class EuclideanCodebook(nn.Module):
if not torch.any(mask):
continue
sampled = self.sample_fn(
rearrange(samples, "... -> 1 ..."), mask.sum().item()
)
sampled = self.sample_fn(rearrange(samples, "... -> 1 ..."), mask.sum().item())
sampled = rearrange(sampled, "1 ... -> ...")
self.embed.data[ind][mask] = sampled
@@ -1474,9 +1389,7 @@ class EuclideanCodebook(nn.Module):
def forward(self, x, sample_codebook_temp=None, mask=None, freeze_codebook=False):
needs_codebook_dim = x.ndim < 4
sample_codebook_temp = (
sample_codebook_temp
if (sample_codebook_temp is not None)
else self.sample_codebook_temp
sample_codebook_temp if (sample_codebook_temp is not None) else self.sample_codebook_temp
)
x = x.float()
@@ -1504,9 +1417,7 @@ class EuclideanCodebook(nn.Module):
if self.affine_param:
codebook_std = self.codebook_variance.clamp(min=1e-5).sqrt()
batch_std = self.batch_variance.clamp(min=1e-5).sqrt()
embed = (embed - self.codebook_mean) * (
batch_std / codebook_std
) + self.batch_mean
embed = (embed - self.codebook_mean) * (batch_std / codebook_std) + self.batch_mean
dist = -cdist(flatten, embed)
@@ -1524,9 +1435,7 @@ class EuclideanCodebook(nn.Module):
if self.training and self.ema_update and not freeze_codebook:
if self.affine_param:
flatten = (flatten - self.batch_mean) * (
codebook_std / batch_std
) + self.codebook_mean
flatten = (flatten - self.batch_mean) * (codebook_std / batch_std) + self.codebook_mean
if mask is not None:
embed_onehot[~mask] = 0.0
@@ -1549,9 +1458,7 @@ class EuclideanCodebook(nn.Module):
self.expire_codes_(x)
if needs_codebook_dim:
quantize, embed_ind = tuple(
rearrange(t, "1 ... -> ...") for t in (quantize, embed_ind)
)
quantize, embed_ind = tuple(rearrange(t, "1 ... -> ...") for t in (quantize, embed_ind))
dist = unpack_one(dist, ps, "h * d")