Use PyTorchModelHubMixin to save models as safetensors (#125)

Co-authored-by: Remi <re.cadene@gmail.com>
2024-05-01 16:17:18 +01:00
parent 01d5490d44
commit a4891095e4
18 changed files with 556 additions and 527 deletions
--- a/lerobot/common/logger.py
+++ b/lerobot/common/logger.py
@@ -2,9 +2,12 @@ import logging
 import os
 from pathlib import Path

+from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE
 from omegaconf import OmegaConf
 from termcolor import colored

+from lerobot.common.policies.policy_protocol import Policy
+

 def log_output_dir(out_dir):
    logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {out_dir}")
@@ -27,7 +30,7 @@ class Logger:
        self._log_dir = Path(log_dir)
        self._log_dir.mkdir(parents=True, exist_ok=True)
        self._job_name = job_name
-        self._model_dir = self._log_dir / "models"
+        self._model_dir = self._log_dir / "checkpoints"
        self._buffer_dir = self._log_dir / "buffers"
        self._save_model = cfg.training.save_model
        self._disable_wandb_artifact = cfg.wandb.disable_artifact
@@ -67,18 +70,20 @@ class Logger:
            logging.info(f"Track this run --> {colored(wandb.run.get_url(), 'yellow', attrs=['bold'])}")
            self._wandb = wandb

-    def save_model(self, policy, identifier):
+    def save_model(self, policy: Policy, identifier):
        if self._save_model:
            self._model_dir.mkdir(parents=True, exist_ok=True)
-            fp = self._model_dir / f"{str(identifier)}.pt"
-            policy.save(fp)
+            save_dir = self._model_dir / str(identifier)
+            policy.save_pretrained(save_dir)
+            # Also save the full Hydra config for the env configuration.
+            OmegaConf.save(self._cfg, save_dir / "config.yaml")
            if self._wandb and not self._disable_wandb_artifact:
                # note wandb artifact does not accept ":" in its name
                artifact = self._wandb.Artifact(
                    self._group.replace(":", "_") + "-" + str(self._seed) + "-" + str(identifier),
                    type="model",
                )
-                artifact.add_file(fp)
+                artifact.add_file(save_dir / SAFETENSORS_SINGLE_FILE)
                self._wandb.log_artifact(artifact)

    def save_buffer(self, buffer, identifier):
--- a/lerobot/common/policies/act/configuration_act.py
+++ b/lerobot/common/policies/act/configuration_act.py
@@ -38,7 +38,7 @@ class ACTConfig:
        replace_final_stride_with_dilation: Whether to replace the ResNet's final 2x2 stride with a dilated
            convolution.
        pre_norm: Whether to use "pre-norm" in the transformer blocks.
-        d_model: The transformer blocks' main hidden dimension.
+        dim_model: The transformer blocks' main hidden dimension.
        n_heads: The number of heads to use in the transformer blocks' multi-head attention.
        dim_feedforward: The dimension to expand the transformer's hidden dimension to in the feed-forward
            layers.
@@ -94,7 +94,7 @@ class ACTConfig:
    replace_final_stride_with_dilation: int = False
    # Transformer layers.
    pre_norm: bool = False
-    d_model: int = 512
+    dim_model: int = 512
    n_heads: int = 8
    dim_feedforward: int = 3200
    feedforward_activation: str = "relu"
--- a/lerobot/common/policies/act/modeling_act.py
+++ b/lerobot/common/policies/act/modeling_act.py
@@ -14,6 +14,7 @@ import numpy as np
 import torch
 import torch.nn.functional as F  # noqa: N812
 import torchvision
+from huggingface_hub import PyTorchModelHubMixin
 from torch import Tensor, nn
 from torchvision.models._utils import IntermediateLayerGetter
 from torchvision.ops.misc import FrozenBatchNorm2d
@@ -22,7 +23,7 @@ from lerobot.common.policies.act.configuration_act import ACTConfig
 from lerobot.common.policies.normalize import Normalize, Unnormalize


-class ACTPolicy(nn.Module):
+class ACTPolicy(nn.Module, PyTorchModelHubMixin):
    """
    Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost
    Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act)
@@ -30,27 +31,31 @@ class ACTPolicy(nn.Module):

    name = "act"

-    def __init__(self, cfg: ACTConfig | None = None, dataset_stats=None):
+    def __init__(self, config: ACTConfig | None = None, dataset_stats=None):
        """
        Args:
-            cfg: Policy configuration class instance or None, in which case the default instantiation of the
-                 configuration class is used.
+            config: Policy configuration class instance or None, in which case the default instantiation of
+                    the configuration class is used.
        """
        super().__init__()
-        if cfg is None:
-            cfg = ACTConfig()
-        self.cfg = cfg
-        self.normalize_inputs = Normalize(cfg.input_shapes, cfg.input_normalization_modes, dataset_stats)
-        self.normalize_targets = Normalize(cfg.output_shapes, cfg.output_normalization_modes, dataset_stats)
-        self.unnormalize_outputs = Unnormalize(
-            cfg.output_shapes, cfg.output_normalization_modes, dataset_stats
+        if config is None:
+            config = ACTConfig()
+        self.config = config
+        self.normalize_inputs = Normalize(
+            config.input_shapes, config.input_normalization_modes, dataset_stats
        )
-        self.model = ACT(cfg)
+        self.normalize_targets = Normalize(
+            config.output_shapes, config.output_normalization_modes, dataset_stats
+        )
+        self.unnormalize_outputs = Unnormalize(
+            config.output_shapes, config.output_normalization_modes, dataset_stats
+        )
+        self.model = ACT(config)

    def reset(self):
        """This should be called whenever the environment is reset."""
-        if self.cfg.n_action_steps is not None:
-            self._action_queue = deque([], maxlen=self.cfg.n_action_steps)
+        if self.config.n_action_steps is not None:
+            self._action_queue = deque([], maxlen=self.config.n_action_steps)

    @torch.no_grad
    def select_action(self, batch: dict[str, Tensor], **_) -> Tensor:
@@ -68,7 +73,7 @@ class ACTPolicy(nn.Module):
        if len(self._action_queue) == 0:
            # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue
            # effectively has shape (n_action_steps, batch_size, *), hence the transpose.
-            actions = self.model(batch)[0][: self.cfg.n_action_steps]
+            actions = self.model(batch)[0][: self.config.n_action_steps]

            # TODO(rcadene): make _forward return output dictionary?
            actions = self.unnormalize_outputs({"action": actions})["action"]
@@ -88,7 +93,7 @@ class ACTPolicy(nn.Module):
        ).mean()

        loss_dict = {"l1_loss": l1_loss}
-        if self.cfg.use_vae:
+        if self.config.use_vae:
            # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for
            # each dimension independently, we sum over the latent dimension to get the total
            # KL-divergence per batch element, then take the mean over the batch.
@@ -97,7 +102,7 @@ class ACTPolicy(nn.Module):
                (-0.5 * (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())).sum(-1).mean()
            )
            loss_dict["kld_loss"] = mean_kld
-            loss_dict["loss"] = l1_loss + mean_kld * self.cfg.kl_weight
+            loss_dict["loss"] = l1_loss + mean_kld * self.config.kl_weight
        else:
            loss_dict["loss"] = l1_loss

@@ -114,17 +119,10 @@ class ACTPolicy(nn.Module):
        """
        # Stack images in the order dictated by input_shapes.
        batch["observation.images"] = torch.stack(
-            [batch[k] for k in self.cfg.input_shapes if k.startswith("observation.images.")],
+            [batch[k] for k in self.config.input_shapes if k.startswith("observation.images.")],
            dim=-4,
        )

-    def save(self, fp):
-        torch.save(self.state_dict(), fp)
-
-    def load(self, fp):
-        d = torch.load(fp)
-        self.load_state_dict(d)
-

 class ACT(nn.Module):
    """Action Chunking Transformer: The underlying neural network for ACTPolicy.
@@ -161,36 +159,36 @@ class ACT(nn.Module):
                                └───────────────────────┘
    """

-    def __init__(self, cfg: ACTConfig):
+    def __init__(self, config: ACTConfig):
        super().__init__()
-        self.cfg = cfg
+        self.config = config
        # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence].
        # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]).
-        if self.cfg.use_vae:
-            self.vae_encoder = ACTEncoder(cfg)
-            self.vae_encoder_cls_embed = nn.Embedding(1, cfg.d_model)
+        if self.config.use_vae:
+            self.vae_encoder = ACTEncoder(config)
+            self.vae_encoder_cls_embed = nn.Embedding(1, config.dim_model)
            # Projection layer for joint-space configuration to hidden dimension.
            self.vae_encoder_robot_state_input_proj = nn.Linear(
-                cfg.input_shapes["observation.state"][0], cfg.d_model
+                config.input_shapes["observation.state"][0], config.dim_model
            )
            # Projection layer for action (joint-space target) to hidden dimension.
            self.vae_encoder_action_input_proj = nn.Linear(
-                cfg.input_shapes["observation.state"][0], cfg.d_model
+                config.input_shapes["observation.state"][0], config.dim_model
            )
-            self.latent_dim = cfg.latent_dim
+            self.latent_dim = config.latent_dim
            # Projection layer from the VAE encoder's output to the latent distribution's parameter space.
-            self.vae_encoder_latent_output_proj = nn.Linear(cfg.d_model, self.latent_dim * 2)
+            self.vae_encoder_latent_output_proj = nn.Linear(config.dim_model, self.latent_dim * 2)
            # Fixed sinusoidal positional embedding the whole input to the VAE encoder. Unsqueeze for batch
            # dimension.
            self.register_buffer(
                "vae_encoder_pos_enc",
-                create_sinusoidal_position_embedding(1 + 1 + cfg.chunk_size, cfg.d_model).unsqueeze(0),
+                create_sinusoidal_pos_embedding(1 + 1 + config.chunk_size, config.dim_model).unsqueeze(0),
            )

        # Backbone for image feature extraction.
-        backbone_model = getattr(torchvision.models, cfg.vision_backbone)(
-            replace_stride_with_dilation=[False, False, cfg.replace_final_stride_with_dilation],
-            weights=cfg.pretrained_backbone_weights,
+        backbone_model = getattr(torchvision.models, config.vision_backbone)(
+            replace_stride_with_dilation=[False, False, config.replace_final_stride_with_dilation],
+            weights=config.pretrained_backbone_weights,
            norm_layer=FrozenBatchNorm2d,
        )
        # Note: The assumption here is that we are using a ResNet model (and hence layer4 is the final feature
@@ -199,26 +197,28 @@ class ACT(nn.Module):
        self.backbone = IntermediateLayerGetter(backbone_model, return_layers={"layer4": "feature_map"})

        # Transformer (acts as VAE decoder when training with the variational objective).
-        self.encoder = ACTEncoder(cfg)
-        self.decoder = ACTDecoder(cfg)
+        self.encoder = ACTEncoder(config)
+        self.decoder = ACTDecoder(config)

        # Transformer encoder input projections. The tokens will be structured like
        # [latent, robot_state, image_feature_map_pixels].
-        self.encoder_robot_state_input_proj = nn.Linear(cfg.input_shapes["observation.state"][0], cfg.d_model)
-        self.encoder_latent_input_proj = nn.Linear(self.latent_dim, cfg.d_model)
+        self.encoder_robot_state_input_proj = nn.Linear(
+            config.input_shapes["observation.state"][0], config.dim_model
+        )
+        self.encoder_latent_input_proj = nn.Linear(self.latent_dim, config.dim_model)
        self.encoder_img_feat_input_proj = nn.Conv2d(
-            backbone_model.fc.in_features, cfg.d_model, kernel_size=1
+            backbone_model.fc.in_features, config.dim_model, kernel_size=1
        )
        # Transformer encoder positional embeddings.
-        self.encoder_robot_and_latent_pos_embed = nn.Embedding(2, cfg.d_model)
-        self.encoder_cam_feat_pos_embed = ACTSinusoidalPositionEmbedding2d(cfg.d_model // 2)
+        self.encoder_robot_and_latent_pos_embed = nn.Embedding(2, config.dim_model)
+        self.encoder_cam_feat_pos_embed = ACTSinusoidalPositionEmbedding2d(config.dim_model // 2)

        # Transformer decoder.
        # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries).
-        self.decoder_pos_embed = nn.Embedding(cfg.chunk_size, cfg.d_model)
+        self.decoder_pos_embed = nn.Embedding(config.chunk_size, config.dim_model)

        # Final action regression head on the output of the transformer's decoder.
-        self.action_head = nn.Linear(cfg.d_model, cfg.output_shapes["action"][0])
+        self.action_head = nn.Linear(config.dim_model, config.output_shapes["action"][0])

        self._reset_parameters()

@@ -244,7 +244,7 @@ class ACT(nn.Module):
            Tuple containing the latent PDF's parameters (mean, log(σ²)) both as (B, L) tensors where L is the
            latent dimension.
        """
-        if self.cfg.use_vae and self.training:
+        if self.config.use_vae and self.training:
            assert (
                "action" in batch
            ), "actions must be provided when using the variational objective in training mode."
@@ -252,7 +252,7 @@ class ACT(nn.Module):
        batch_size = batch["observation.state"].shape[0]

        # Prepare the latent for input to the transformer encoder.
-        if self.cfg.use_vae and "action" in batch:
+        if self.config.use_vae and "action" in batch:
            # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence].
            cls_embed = einops.repeat(
                self.vae_encoder_cls_embed.weight, "1 d -> b 1 d", b=batch_size
@@ -322,7 +322,7 @@ class ACT(nn.Module):
        # Forward pass through the transformer modules.
        encoder_out = self.encoder(encoder_in, pos_embed=pos_embed)
        decoder_in = torch.zeros(
-            (self.cfg.chunk_size, batch_size, self.cfg.d_model),
+            (self.config.chunk_size, batch_size, self.config.dim_model),
            dtype=pos_embed.dtype,
            device=pos_embed.device,
        )
@@ -344,10 +344,10 @@ class ACT(nn.Module):
 class ACTEncoder(nn.Module):
    """Convenience module for running multiple encoder layers, maybe followed by normalization."""

-    def __init__(self, cfg: ACTConfig):
+    def __init__(self, config: ACTConfig):
        super().__init__()
-        self.layers = nn.ModuleList([ACTEncoderLayer(cfg) for _ in range(cfg.n_encoder_layers)])
-        self.norm = nn.LayerNorm(cfg.d_model) if cfg.pre_norm else nn.Identity()
+        self.layers = nn.ModuleList([ACTEncoderLayer(config) for _ in range(config.n_encoder_layers)])
+        self.norm = nn.LayerNorm(config.dim_model) if config.pre_norm else nn.Identity()

    def forward(self, x: Tensor, pos_embed: Tensor | None = None) -> Tensor:
        for layer in self.layers:
@@ -357,22 +357,22 @@ class ACTEncoder(nn.Module):


 class ACTEncoderLayer(nn.Module):
-    def __init__(self, cfg: ACTConfig):
+    def __init__(self, config: ACTConfig):
        super().__init__()
-        self.self_attn = nn.MultiheadAttention(cfg.d_model, cfg.n_heads, dropout=cfg.dropout)
+        self.self_attn = nn.MultiheadAttention(config.dim_model, config.n_heads, dropout=config.dropout)

        # Feed forward layers.
-        self.linear1 = nn.Linear(cfg.d_model, cfg.dim_feedforward)
-        self.dropout = nn.Dropout(cfg.dropout)
-        self.linear2 = nn.Linear(cfg.dim_feedforward, cfg.d_model)
+        self.linear1 = nn.Linear(config.dim_model, config.dim_feedforward)
+        self.dropout = nn.Dropout(config.dropout)
+        self.linear2 = nn.Linear(config.dim_feedforward, config.dim_model)

-        self.norm1 = nn.LayerNorm(cfg.d_model)
-        self.norm2 = nn.LayerNorm(cfg.d_model)
-        self.dropout1 = nn.Dropout(cfg.dropout)
-        self.dropout2 = nn.Dropout(cfg.dropout)
+        self.norm1 = nn.LayerNorm(config.dim_model)
+        self.norm2 = nn.LayerNorm(config.dim_model)
+        self.dropout1 = nn.Dropout(config.dropout)
+        self.dropout2 = nn.Dropout(config.dropout)

-        self.activation = get_activation_fn(cfg.feedforward_activation)
-        self.pre_norm = cfg.pre_norm
+        self.activation = get_activation_fn(config.feedforward_activation)
+        self.pre_norm = config.pre_norm

    def forward(self, x, pos_embed: Tensor | None = None) -> Tensor:
        skip = x
@@ -395,11 +395,11 @@ class ACTEncoderLayer(nn.Module):


 class ACTDecoder(nn.Module):
-    def __init__(self, cfg: ACTConfig):
+    def __init__(self, config: ACTConfig):
        """Convenience module for running multiple decoder layers followed by normalization."""
        super().__init__()
-        self.layers = nn.ModuleList([ACTDecoderLayer(cfg) for _ in range(cfg.n_decoder_layers)])
-        self.norm = nn.LayerNorm(cfg.d_model)
+        self.layers = nn.ModuleList([ACTDecoderLayer(config) for _ in range(config.n_decoder_layers)])
+        self.norm = nn.LayerNorm(config.dim_model)

    def forward(
        self,
@@ -418,25 +418,25 @@ class ACTDecoder(nn.Module):


 class ACTDecoderLayer(nn.Module):
-    def __init__(self, cfg: ACTConfig):
+    def __init__(self, config: ACTConfig):
        super().__init__()
-        self.self_attn = nn.MultiheadAttention(cfg.d_model, cfg.n_heads, dropout=cfg.dropout)
-        self.multihead_attn = nn.MultiheadAttention(cfg.d_model, cfg.n_heads, dropout=cfg.dropout)
+        self.self_attn = nn.MultiheadAttention(config.dim_model, config.n_heads, dropout=config.dropout)
+        self.multihead_attn = nn.MultiheadAttention(config.dim_model, config.n_heads, dropout=config.dropout)

        # Feed forward layers.
-        self.linear1 = nn.Linear(cfg.d_model, cfg.dim_feedforward)
-        self.dropout = nn.Dropout(cfg.dropout)
-        self.linear2 = nn.Linear(cfg.dim_feedforward, cfg.d_model)
+        self.linear1 = nn.Linear(config.dim_model, config.dim_feedforward)
+        self.dropout = nn.Dropout(config.dropout)
+        self.linear2 = nn.Linear(config.dim_feedforward, config.dim_model)

-        self.norm1 = nn.LayerNorm(cfg.d_model)
-        self.norm2 = nn.LayerNorm(cfg.d_model)
-        self.norm3 = nn.LayerNorm(cfg.d_model)
-        self.dropout1 = nn.Dropout(cfg.dropout)
-        self.dropout2 = nn.Dropout(cfg.dropout)
-        self.dropout3 = nn.Dropout(cfg.dropout)
+        self.norm1 = nn.LayerNorm(config.dim_model)
+        self.norm2 = nn.LayerNorm(config.dim_model)
+        self.norm3 = nn.LayerNorm(config.dim_model)
+        self.dropout1 = nn.Dropout(config.dropout)
+        self.dropout2 = nn.Dropout(config.dropout)
+        self.dropout3 = nn.Dropout(config.dropout)

-        self.activation = get_activation_fn(cfg.feedforward_activation)
-        self.pre_norm = cfg.pre_norm
+        self.activation = get_activation_fn(config.feedforward_activation)
+        self.pre_norm = config.pre_norm

    def maybe_add_pos_embed(self, tensor: Tensor, pos_embed: Tensor | None) -> Tensor:
        return tensor if pos_embed is None else tensor + pos_embed
@@ -489,7 +489,7 @@ class ACTDecoderLayer(nn.Module):
        return x


-def create_sinusoidal_position_embedding(num_positions: int, dimension: int) -> Tensor:
+def create_sinusoidal_pos_embedding(num_positions: int, dimension: int) -> Tensor:
    """1D sinusoidal positional embeddings as in Attention is All You Need.

    Args:
--- a/lerobot/common/policies/diffusion/modeling_diffusion.py
+++ b/lerobot/common/policies/diffusion/modeling_diffusion.py
@@ -9,7 +9,6 @@ TODO(alexander-soare):
 """

 import copy
-import logging
 import math
 from collections import deque
 from typing import Callable
@@ -19,6 +18,7 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 import torchvision
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+from huggingface_hub import PyTorchModelHubMixin
 from robomimic.models.base_nets import SpatialSoftmax
 from torch import Tensor, nn
 from torch.nn.modules.batchnorm import _BatchNorm
@@ -32,7 +32,7 @@ from lerobot.common.policies.utils import (
 )


-class DiffusionPolicy(nn.Module):
+class DiffusionPolicy(nn.Module, PyTorchModelHubMixin):
    """
    Diffusion Policy as per "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion"
    (paper: https://arxiv.org/abs/2303.04137, code: https://github.com/real-stanford/diffusion_policy).
@@ -41,45 +41,50 @@ class DiffusionPolicy(nn.Module):
    name = "diffusion"

    def __init__(
-        self, cfg: DiffusionConfig | None = None, lr_scheduler_num_training_steps: int = 0, dataset_stats=None
+        self,
+        config: DiffusionConfig | None = None,
+        dataset_stats=None,
    ):
        """
        Args:
-            cfg: Policy configuration class instance or None, in which case the default instantiation of the
-                 configuration class is used.
+            config: Policy configuration class instance or None, in which case the default instantiation of
+                    the configuration class is used.
        """
        super().__init__()
        # TODO(alexander-soare): LR scheduler will be removed.
-        assert lr_scheduler_num_training_steps > 0
-        if cfg is None:
-            cfg = DiffusionConfig()
-        self.cfg = cfg
-        self.normalize_inputs = Normalize(cfg.input_shapes, cfg.input_normalization_modes, dataset_stats)
-        self.normalize_targets = Normalize(cfg.output_shapes, cfg.output_normalization_modes, dataset_stats)
+        if config is None:
+            config = DiffusionConfig()
+        self.config = config
+        self.normalize_inputs = Normalize(
+            config.input_shapes, config.input_normalization_modes, dataset_stats
+        )
+        self.normalize_targets = Normalize(
+            config.output_shapes, config.output_normalization_modes, dataset_stats
+        )
        self.unnormalize_outputs = Unnormalize(
-            cfg.output_shapes, cfg.output_normalization_modes, dataset_stats
+            config.output_shapes, config.output_normalization_modes, dataset_stats
        )

        # queues are populated during rollout of the policy, they contain the n latest observations and actions
        self._queues = None

-        self.diffusion = DiffusionModel(cfg)
+        self.diffusion = DiffusionModel(config)

        # TODO(alexander-soare): This should probably be managed outside of the policy class.
        self.ema_diffusion = None
        self.ema = None
-        if self.cfg.use_ema:
+        if self.config.use_ema:
            self.ema_diffusion = copy.deepcopy(self.diffusion)
-            self.ema = DiffusionEMA(cfg, model=self.ema_diffusion)
+            self.ema = DiffusionEMA(config, model=self.ema_diffusion)

    def reset(self):
        """
        Clear observation and action queues. Should be called on `env.reset()`
        """
        self._queues = {
-            "observation.image": deque(maxlen=self.cfg.n_obs_steps),
-            "observation.state": deque(maxlen=self.cfg.n_obs_steps),
-            "action": deque(maxlen=self.cfg.n_action_steps),
+            "observation.image": deque(maxlen=self.config.n_obs_steps),
+            "observation.state": deque(maxlen=self.config.n_obs_steps),
+            "action": deque(maxlen=self.config.n_action_steps),
        }

    @torch.no_grad
@@ -138,46 +143,34 @@ class DiffusionPolicy(nn.Module):
        loss = self.diffusion.compute_loss(batch)
        return {"loss": loss}

-    def save(self, fp):
-        torch.save(self.state_dict(), fp)
-
-    def load(self, fp):
-        d = torch.load(fp)
-        missing_keys, unexpected_keys = self.load_state_dict(d, strict=False)
-        if len(missing_keys) > 0:
-            assert all(k.startswith("ema_diffusion.") for k in missing_keys)
-            logging.warning(
-                "DiffusionPolicy.load expected ema parameters in loaded state dict but none were found."
-            )
-        assert len(unexpected_keys) == 0
-

 class DiffusionModel(nn.Module):
-    def __init__(self, cfg: DiffusionConfig):
+    def __init__(self, config: DiffusionConfig):
        super().__init__()
-        self.cfg = cfg
+        self.config = config

-        self.rgb_encoder = DiffusionRgbEncoder(cfg)
+        self.rgb_encoder = DiffusionRgbEncoder(config)
        self.unet = DiffusionConditionalUnet1d(
-            cfg,
-            global_cond_dim=(cfg.output_shapes["action"][0] + self.rgb_encoder.feature_dim) * cfg.n_obs_steps,
+            config,
+            global_cond_dim=(config.output_shapes["action"][0] + self.rgb_encoder.feature_dim)
+            * config.n_obs_steps,
        )

        self.noise_scheduler = DDPMScheduler(
-            num_train_timesteps=cfg.num_train_timesteps,
-            beta_start=cfg.beta_start,
-            beta_end=cfg.beta_end,
-            beta_schedule=cfg.beta_schedule,
+            num_train_timesteps=config.num_train_timesteps,
+            beta_start=config.beta_start,
+            beta_end=config.beta_end,
+            beta_schedule=config.beta_schedule,
            variance_type="fixed_small",
-            clip_sample=cfg.clip_sample,
-            clip_sample_range=cfg.clip_sample_range,
-            prediction_type=cfg.prediction_type,
+            clip_sample=config.clip_sample,
+            clip_sample_range=config.clip_sample_range,
+            prediction_type=config.prediction_type,
        )

-        if cfg.num_inference_steps is None:
+        if config.num_inference_steps is None:
            self.num_inference_steps = self.noise_scheduler.config.num_train_timesteps
        else:
-            self.num_inference_steps = cfg.num_inference_steps
+            self.num_inference_steps = config.num_inference_steps

    # ========= inference  ============
    def conditional_sample(
@@ -188,7 +181,7 @@ class DiffusionModel(nn.Module):

        # Sample prior.
        sample = torch.randn(
-            size=(batch_size, self.cfg.horizon, self.cfg.output_shapes["action"][0]),
+            size=(batch_size, self.config.horizon, self.config.output_shapes["action"][0]),
            dtype=dtype,
            device=device,
            generator=generator,
@@ -218,7 +211,7 @@ class DiffusionModel(nn.Module):
        """
        assert set(batch).issuperset({"observation.state", "observation.image"})
        batch_size, n_obs_steps = batch["observation.state"].shape[:2]
-        assert n_obs_steps == self.cfg.n_obs_steps
+        assert n_obs_steps == self.config.n_obs_steps

        # Extract image feature (first combine batch and sequence dims).
        img_features = self.rgb_encoder(einops.rearrange(batch["observation.image"], "b n ... -> (b n) ..."))
@@ -231,10 +224,10 @@ class DiffusionModel(nn.Module):
        sample = self.conditional_sample(batch_size, global_cond=global_cond)

        # `horizon` steps worth of actions (from the first observation).
-        actions = sample[..., : self.cfg.output_shapes["action"][0]]
+        actions = sample[..., : self.config.output_shapes["action"][0]]
        # Extract `n_action_steps` steps worth of actions (from the current observation).
        start = n_obs_steps - 1
-        end = start + self.cfg.n_action_steps
+        end = start + self.config.n_action_steps
        actions = actions[:, start:end]

        return actions
@@ -253,8 +246,8 @@ class DiffusionModel(nn.Module):
        assert set(batch).issuperset({"observation.state", "observation.image", "action", "action_is_pad"})
        batch_size, n_obs_steps = batch["observation.state"].shape[:2]
        horizon = batch["action"].shape[1]
-        assert horizon == self.cfg.horizon
-        assert n_obs_steps == self.cfg.n_obs_steps
+        assert horizon == self.config.horizon
+        assert n_obs_steps == self.config.n_obs_steps

        # Extract image feature (first combine batch and sequence dims).
        img_features = self.rgb_encoder(einops.rearrange(batch["observation.image"], "b n ... -> (b n) ..."))
@@ -283,12 +276,12 @@ class DiffusionModel(nn.Module):

        # Compute the loss.
        # The target is either the original trajectory, or the noise.
-        if self.cfg.prediction_type == "epsilon":
+        if self.config.prediction_type == "epsilon":
            target = eps
-        elif self.cfg.prediction_type == "sample":
+        elif self.config.prediction_type == "sample":
            target = batch["action"]
        else:
-            raise ValueError(f"Unsupported prediction type {self.cfg.prediction_type}")
+            raise ValueError(f"Unsupported prediction type {self.config.prediction_type}")

        loss = F.mse_loss(pred, target, reduction="none")

@@ -306,29 +299,29 @@ class DiffusionRgbEncoder(nn.Module):
    Includes the ability to normalize and crop the image first.
    """

-    def __init__(self, cfg: DiffusionConfig):
+    def __init__(self, config: DiffusionConfig):
        super().__init__()
        # Set up optional preprocessing.
-        if cfg.crop_shape is not None:
+        if config.crop_shape is not None:
            self.do_crop = True
            # Always use center crop for eval
-            self.center_crop = torchvision.transforms.CenterCrop(cfg.crop_shape)
-            if cfg.crop_is_random:
-                self.maybe_random_crop = torchvision.transforms.RandomCrop(cfg.crop_shape)
+            self.center_crop = torchvision.transforms.CenterCrop(config.crop_shape)
+            if config.crop_is_random:
+                self.maybe_random_crop = torchvision.transforms.RandomCrop(config.crop_shape)
            else:
                self.maybe_random_crop = self.center_crop
        else:
            self.do_crop = False

        # Set up backbone.
-        backbone_model = getattr(torchvision.models, cfg.vision_backbone)(
-            weights=cfg.pretrained_backbone_weights
+        backbone_model = getattr(torchvision.models, config.vision_backbone)(
+            weights=config.pretrained_backbone_weights
        )
        # Note: This assumes that the layer4 feature map is children()[-3]
        # TODO(alexander-soare): Use a safer alternative.
        self.backbone = nn.Sequential(*(list(backbone_model.children())[:-2]))
-        if cfg.use_group_norm:
-            if cfg.pretrained_backbone_weights:
+        if config.use_group_norm:
+            if config.pretrained_backbone_weights:
                raise ValueError(
                    "You can't replace BatchNorm in a pretrained model without ruining the weights!"
                )
@@ -342,11 +335,11 @@ class DiffusionRgbEncoder(nn.Module):
        # Use a dry run to get the feature map shape.
        with torch.inference_mode():
            feat_map_shape = tuple(
-                self.backbone(torch.zeros(size=(1, *cfg.input_shapes["observation.image"]))).shape[1:]
+                self.backbone(torch.zeros(size=(1, *config.input_shapes["observation.image"]))).shape[1:]
            )
-        self.pool = SpatialSoftmax(feat_map_shape, num_kp=cfg.spatial_softmax_num_keypoints)
-        self.feature_dim = cfg.spatial_softmax_num_keypoints * 2
-        self.out = nn.Linear(cfg.spatial_softmax_num_keypoints * 2, self.feature_dim)
+        self.pool = SpatialSoftmax(feat_map_shape, num_kp=config.spatial_softmax_num_keypoints)
+        self.feature_dim = config.spatial_softmax_num_keypoints * 2
+        self.out = nn.Linear(config.spatial_softmax_num_keypoints * 2, self.feature_dim)
        self.relu = nn.ReLU()

    def forward(self, x: Tensor) -> Tensor:
@@ -442,34 +435,34 @@ class DiffusionConditionalUnet1d(nn.Module):
    Note: this removes local conditioning as compared to the original diffusion policy code.
    """

-    def __init__(self, cfg: DiffusionConfig, global_cond_dim: int):
+    def __init__(self, config: DiffusionConfig, global_cond_dim: int):
        super().__init__()

-        self.cfg = cfg
+        self.config = config

        # Encoder for the diffusion timestep.
        self.diffusion_step_encoder = nn.Sequential(
-            DiffusionSinusoidalPosEmb(cfg.diffusion_step_embed_dim),
-            nn.Linear(cfg.diffusion_step_embed_dim, cfg.diffusion_step_embed_dim * 4),
+            DiffusionSinusoidalPosEmb(config.diffusion_step_embed_dim),
+            nn.Linear(config.diffusion_step_embed_dim, config.diffusion_step_embed_dim * 4),
            nn.Mish(),
-            nn.Linear(cfg.diffusion_step_embed_dim * 4, cfg.diffusion_step_embed_dim),
+            nn.Linear(config.diffusion_step_embed_dim * 4, config.diffusion_step_embed_dim),
        )

        # The FiLM conditioning dimension.
-        cond_dim = cfg.diffusion_step_embed_dim + global_cond_dim
+        cond_dim = config.diffusion_step_embed_dim + global_cond_dim

        # In channels / out channels for each downsampling block in the Unet's encoder. For the decoder, we
        # just reverse these.
-        in_out = [(cfg.output_shapes["action"][0], cfg.down_dims[0])] + list(
-            zip(cfg.down_dims[:-1], cfg.down_dims[1:], strict=True)
+        in_out = [(config.output_shapes["action"][0], config.down_dims[0])] + list(
+            zip(config.down_dims[:-1], config.down_dims[1:], strict=True)
        )

        # Unet encoder.
        common_res_block_kwargs = {
            "cond_dim": cond_dim,
-            "kernel_size": cfg.kernel_size,
-            "n_groups": cfg.n_groups,
-            "use_film_scale_modulation": cfg.use_film_scale_modulation,
+            "kernel_size": config.kernel_size,
+            "n_groups": config.n_groups,
+            "use_film_scale_modulation": config.use_film_scale_modulation,
        }
        self.down_modules = nn.ModuleList([])
        for ind, (dim_in, dim_out) in enumerate(in_out):
@@ -489,10 +482,10 @@ class DiffusionConditionalUnet1d(nn.Module):
        self.mid_modules = nn.ModuleList(
            [
                DiffusionConditionalResidualBlock1d(
-                    cfg.down_dims[-1], cfg.down_dims[-1], **common_res_block_kwargs
+                    config.down_dims[-1], config.down_dims[-1], **common_res_block_kwargs
                ),
                DiffusionConditionalResidualBlock1d(
-                    cfg.down_dims[-1], cfg.down_dims[-1], **common_res_block_kwargs
+                    config.down_dims[-1], config.down_dims[-1], **common_res_block_kwargs
                ),
            ]
        )
@@ -514,8 +507,8 @@ class DiffusionConditionalUnet1d(nn.Module):
            )

        self.final_conv = nn.Sequential(
-            DiffusionConv1dBlock(cfg.down_dims[0], cfg.down_dims[0], kernel_size=cfg.kernel_size),
-            nn.Conv1d(cfg.down_dims[0], cfg.output_shapes["action"][0], 1),
+            DiffusionConv1dBlock(config.down_dims[0], config.down_dims[0], kernel_size=config.kernel_size),
+            nn.Conv1d(config.down_dims[0], config.output_shapes["action"][0], 1),
        )

    def forward(self, x: Tensor, timestep: Tensor | int, global_cond=None) -> Tensor:
@@ -626,13 +619,13 @@ class DiffusionEMA:
    Exponential Moving Average of models weights
    """

-    def __init__(self, cfg: DiffusionConfig, model: nn.Module):
+    def __init__(self, config: DiffusionConfig, model: nn.Module):
        """
        @crowsonkb's notes on EMA Warmup:
-            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
-            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
-            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
-            at 215.4k steps).
+            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models
+            you plan to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999
+            at 1M steps), gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999
+            at 10K steps, 0.9999 at 215.4k steps).
        Args:
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.
@@ -643,11 +636,11 @@ class DiffusionEMA:
        self.averaged_model.eval()
        self.averaged_model.requires_grad_(False)

-        self.update_after_step = cfg.ema_update_after_step
-        self.inv_gamma = cfg.ema_inv_gamma
-        self.power = cfg.ema_power
-        self.min_alpha = cfg.ema_min_alpha
-        self.max_alpha = cfg.ema_max_alpha
+        self.update_after_step = config.ema_update_after_step
+        self.inv_gamma = config.ema_inv_gamma
+        self.power = config.ema_power
+        self.min_alpha = config.ema_min_alpha
+        self.max_alpha = config.ema_max_alpha

        self.alpha = 0.0
        self.optimization_step = 0
--- a/lerobot/common/policies/factory.py
+++ b/lerobot/common/policies/factory.py
@@ -2,6 +2,7 @@ import inspect

 from omegaconf import DictConfig, OmegaConf

+from lerobot.common.policies.policy_protocol import Policy
 from lerobot.common.utils.utils import get_safe_torch_device


@@ -20,42 +21,49 @@ def _policy_cfg_from_hydra_cfg(policy_cfg_class, hydra_cfg):
    return policy_cfg


-def make_policy(hydra_cfg: DictConfig, dataset_stats=None):
-    if hydra_cfg.policy.name == "tdmpc":
-        from lerobot.common.policies.tdmpc.policy import TDMPCPolicy
-
-        policy = TDMPCPolicy(
-            hydra_cfg.policy,
-            n_obs_steps=hydra_cfg.n_obs_steps,
-            n_action_steps=hydra_cfg.n_action_steps,
-            device=hydra_cfg.device,
-        )
-    elif hydra_cfg.policy.name == "diffusion":
+def get_policy_and_config_classes(name: str) -> tuple[type[Policy], type]:
+    """Get the policy's class and config class given a name (matching the policy class' `name` attribute)."""
+    if name == "tdmpc":
+        raise NotImplementedError("Coming soon!")
+    elif name == "diffusion":
        from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig
        from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy

-        policy_cfg = _policy_cfg_from_hydra_cfg(DiffusionConfig, hydra_cfg)
-        policy = DiffusionPolicy(policy_cfg, hydra_cfg.training.offline_steps, dataset_stats)
-        policy.to(get_safe_torch_device(hydra_cfg.device))
-    elif hydra_cfg.policy.name == "act":
+        return DiffusionPolicy, DiffusionConfig
+    elif name == "act":
        from lerobot.common.policies.act.configuration_act import ACTConfig
        from lerobot.common.policies.act.modeling_act import ACTPolicy

-        policy_cfg = _policy_cfg_from_hydra_cfg(ACTConfig, hydra_cfg)
-        policy = ACTPolicy(policy_cfg, dataset_stats)
+        return ACTPolicy, ACTConfig
+    else:
+        raise NotImplementedError(f"Policy with name {name} is not implemented.")
+
+
+def make_policy(
+    hydra_cfg: DictConfig, pretrained_policy_name_or_path: str | None = None, dataset_stats=None
+) -> Policy:
+    """Make an instance of a policy class.
+
+    Args:
+        hydra_cfg: A parsed Hydra configuration (see scripts). If `pretrained_policy_name_or_path` is
+            provided, only `hydra_cfg.policy.name` is used while everything else is ignored.
+        pretrained_policy_name_or_path: Either the repo ID of a model hosted on the Hub or a path to a
+            directory containing weights saved using `Policy.save_pretrained`. Note that providing this
+            argument overrides everything in `hydra_cfg.policy` apart from `hydra_cfg.policy.name`.
+        dataset_stats: Dataset statistics to use for (un)normalization of inputs/outputs in the policy. Must
+            be provided when initializing a new policy, and must not be provided when loading a pretrained
+            policy. Therefore, this argument is mutually exclusive with `pretrained_policy_name_or_path`.
+    """
+    if (pretrained_policy_name_or_path is None) == (dataset_stats is None):
+        raise ValueError("Exactly one of `pretrained_policy_name_or_path` and `dataset_stats` is required.")
+
+    policy_cls, policy_cfg_class = get_policy_and_config_classes(hydra_cfg.policy.name)
+
+    if pretrained_policy_name_or_path is None:
+        policy_cfg = _policy_cfg_from_hydra_cfg(policy_cfg_class, hydra_cfg)
+        policy = policy_cls(policy_cfg, dataset_stats)
        policy.to(get_safe_torch_device(hydra_cfg.device))
    else:
-        raise ValueError(hydra_cfg.policy.name)
-
-    if hydra_cfg.policy.pretrained_model_path:
-        # TODO(rcadene): hack for old pretrained models from fowm
-        if hydra_cfg.policy.name == "tdmpc" and "fowm" in hydra_cfg.policy.pretrained_model_path:
-            if "offline" in hydra_cfg.policy.pretrained_model_path:
-                policy.step[0] = 25000
-            elif "final" in hydra_cfg.policy.pretrained_model_path:
-                policy.step[0] = 100000
-            else:
-                raise NotImplementedError()
-        policy.load(hydra_cfg.policy.pretrained_model_path)
+        policy = policy_cls.from_pretrained(pretrained_policy_name_or_path)

    return policy
--- a/lerobot/common/policies/normalize.py
+++ b/lerobot/common/policies/normalize.py
@@ -57,17 +57,28 @@ def create_stats_buffers(
            )

        if stats is not None:
+            # Note: The clone is needed to make sure that the logic in save_pretrained doesn't see duplicated
+            # tensors anywhere (for example, when we use the same stats for normalization and
+            # unnormalization). See the logic here
+            # https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/py_src/safetensors/torch.py#L97.
            if mode == "mean_std":
-                buffer["mean"].data = stats[key]["mean"]
-                buffer["std"].data = stats[key]["std"]
+                buffer["mean"].data = stats[key]["mean"].clone()
+                buffer["std"].data = stats[key]["std"].clone()
            elif mode == "min_max":
-                buffer["min"].data = stats[key]["min"]
-                buffer["max"].data = stats[key]["max"]
+                buffer["min"].data = stats[key]["min"].clone()
+                buffer["max"].data = stats[key]["max"].clone()

        stats_buffers[key] = buffer
    return stats_buffers


+def _no_stats_error_str(name: str) -> str:
+    """Return the assertion message used when the `name` statistic contains infinity."""
+    problem = f"`{name}` is infinity."
+    remedy = "You should either initialize with `stats` as an argument, or use a pretrained model."
+    return f"{problem} {remedy}"
+
+
 class Normalize(nn.Module):
    """Normalizes data (e.g. "observation.image") for more stable and faster convergence during training."""

@@ -99,7 +110,6 @@ class Normalize(nn.Module):
        self.shapes = shapes
        self.modes = modes
        self.stats = stats
-        # `self.buffer_observation_state["mean"]` contains `torch.tensor(state_dim)`
        stats_buffers = create_stats_buffers(shapes, modes, stats)
        for key, buffer in stats_buffers.items():
            setattr(self, "buffer_" + key.replace(".", "_"), buffer)
@@ -113,26 +123,14 @@ class Normalize(nn.Module):
            if mode == "mean_std":
                mean = buffer["mean"]
                std = buffer["std"]
-                assert not torch.isinf(mean).any(), (
-                    "`mean` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
-                assert not torch.isinf(std).any(), (
-                    "`std` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
+                assert not torch.isinf(mean).any(), _no_stats_error_str("mean")
+                assert not torch.isinf(std).any(), _no_stats_error_str("std")
                batch[key] = (batch[key] - mean) / (std + 1e-8)
            elif mode == "min_max":
                min = buffer["min"]
                max = buffer["max"]
-                assert not torch.isinf(min).any(), (
-                    "`min` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
-                assert not torch.isinf(max).any(), (
-                    "`max` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
+                assert not torch.isinf(min).any(), _no_stats_error_str("min")
+                assert not torch.isinf(max).any(), _no_stats_error_str("max")
                # normalize to [0,1]
                batch[key] = (batch[key] - min) / (max - min)
                # normalize to [-1, 1]
@@ -190,26 +188,14 @@ class Unnormalize(nn.Module):
            if mode == "mean_std":
                mean = buffer["mean"]
                std = buffer["std"]
-                assert not torch.isinf(mean).any(), (
-                    "`mean` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
-                assert not torch.isinf(std).any(), (
-                    "`std` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
+                assert not torch.isinf(mean).any(), _no_stats_error_str("mean")
+                assert not torch.isinf(std).any(), _no_stats_error_str("std")
                batch[key] = batch[key] * std + mean
            elif mode == "min_max":
                min = buffer["min"]
                max = buffer["max"]
-                assert not torch.isinf(min).any(), (
-                    "`min` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
-                assert not torch.isinf(max).any(), (
-                    "`max` is infinity. You forgot to initialize with `stats` as argument, or called "
-                    "`policy.load_state_dict`."
-                )
+                assert not torch.isinf(min).any(), _no_stats_error_str("min")
+                assert not torch.isinf(max).any(), _no_stats_error_str("max")
                batch[key] = (batch[key] + 1) / 2
                batch[key] = batch[key] * (max - min) + min
            else:
--- a/lerobot/common/policies/policy_protocol.py
+++ b/lerobot/common/policies/policy_protocol.py
@@ -14,7 +14,10 @@ from torch import Tensor

@runtime_checkable
 class Policy(Protocol):
-    """The required interface for implementing a policy."""
+    """The required interface for implementing a policy.
+
+    We also expect all policies to subclass torch.nn.Module and PyTorchModelHubMixin.
+    """

    name: str