Simplify configs (#550)

Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com>
2025-01-31 13:57:37 +01:00
parent 1ee1acf8ad
commit 3c0a209f9f
119 changed files with 5761 additions and 5466 deletions
--- a/lerobot/configs/default.py
+++ b/lerobot/configs/default.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from lerobot.common import (
+    policies,  # noqa: F401
+)
+from lerobot.common.datasets.transforms import ImageTransformsConfig
+
+
+@dataclass
+class DatasetConfig:
+    # You may provide a list of datasets here. `train.py` creates them all and concatenates them. Note: only data
+    # keys common between the datasets are kept. Each dataset gets an additional transform that inserts the
+    # "dataset_index" into the returned item. The index mapping is made according to the order in which the
+    # datasets are provided.
+    # NOTE(review): `repo_id` is annotated as a single `str`, so the multi-dataset wording above may be
+    # stale -- confirm against `train.py`.
+    repo_id: str
+    # Optional list of episode indices; defaults to None (presumably "use every episode" -- TODO confirm).
+    episodes: list[int] | None = None
+    # A fresh `ImageTransformsConfig` is created for each instance (default_factory), never shared.
+    image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
+    local_files_only: bool = False
+    use_imagenet_stats: bool = True
+    video_backend: str = "pyav"
+
+
+@dataclass
+class WandBConfig:
+    # Logging to Weights & Biases is off unless this is set to true.
+    enable: bool = False
+    # Set to true to disable saving an artifact despite training.save_checkpoint=True
+    disable_artifact: bool = False
+    project: str = "lerobot"
+    # `entity` and `notes` are optional and default to None.
+    entity: str | None = None
+    notes: str | None = None
+
+
+@dataclass
+class EvalConfig:
+    n_episodes: int = 50
+    # Number of environments to use in a gym.vector.VectorEnv (`batch_size`).
+    batch_size: int = 50
+    # Whether to use asynchronous environments, i.e. multiprocessing (`use_async_envs`).
+    use_async_envs: bool = False
+
+    def __post_init__(self):
+        # A batch larger than the number of episodes is rejected: the extra environments would be
+        # instantiated without ever being used.
+        n_envs, n_eps = self.batch_size, self.n_episodes
+        if not n_envs > n_eps:
+            return
+
+        message = (
+            "The eval batch size is greater than the number of eval episodes "
+            f"({n_envs} > {n_eps}). As a result, {n_envs} "
+            f"eval environments will be instantiated, but only {n_eps} will be used. "
+            "This might significantly slow down evaluation. To fix this, you should update your command "
+            f"to increase the number of episodes to match the batch size (e.g. `eval.n_episodes={n_envs}`), "
+            f"or lower the batch size (e.g. `eval.batch_size={n_eps}`)."
+        )
+        raise ValueError(message)
--- a/lerobot/configs/default.yaml
+++ b/lerobot/configs/default.yaml
@@ -1,130 +0,0 @@
-defaults:
-  - _self_
-  - env: pusht
-  - policy: diffusion
-
-hydra:
-  run:
-    # Set `dir` to where you would like to save all of the run outputs. If you run another training session
-    # with the same value for `dir` its contents will be overwritten unless you set `resume` to true.
-    dir: outputs/train/${now:%Y-%m-%d}/${now:%H-%M-%S}_${env.name}_${policy.name}_${hydra.job.name}
-  job:
-    name: default
-
-# Set `resume` to true to resume a previous run. In order for this to work, you will need to make sure
-# `hydra.run.dir` is the directory of an existing run with at least one checkpoint in it.
-# Note that when resuming a run, the default behavior is to use the configuration from the checkpoint,
-# regardless of what's provided with the training command at the time of resumption.
-resume: false
-device: cuda  # cpu
-# `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
-# automatic gradient scaling is used.
-use_amp: false
-# `seed` is used for training (eg: model initialization, dataset shuffling)
-# AND for the evaluation environments.
-seed: ???
-# You may provide a list of datasets here. `train.py` creates them all and concatenates them. Note: only data
-# keys common between the datasets are kept. Each dataset gets and additional transform that inserts the
-# "dataset_index" into the returned item. The index mapping is made according to the order in which the
-# datsets are provided.
-dataset_repo_id: lerobot/pusht
-video_backend: pyav
-
-training:
-  offline_steps: ???
-
-  # Number of workers for the offline training dataloader.
-  num_workers: 4
-
-  batch_size: ???
-
-  eval_freq: ???
-  log_freq: 200
-  save_checkpoint: true
-  # Checkpoint is saved every `save_freq` training iterations and after the last training step.
-  save_freq: ???
-
-  # Online training. Note that the online training loop adopts most of the options above apart from the
-  # dataloader options. Unless otherwise specified.
-  # The online training look looks something like:
-  #
-  # for i in range(online_steps):
-  #     do_online_rollout_and_update_online_buffer()
-  #     for j in range(online_steps_between_rollouts):
-  #         batch = next(dataloader_with_offline_and_online_data)
-  #         loss = policy(batch)
-  #         loss.backward()
-  #         optimizer.step()
-  #
-  online_steps: ???
-  # How many episodes to collect at once when we reach the online rollout part of the training loop.
-  online_rollout_n_episodes: 1
-  # The number of environments to use in the gym.vector.VectorEnv. This ends up also being the batch size for
-  # the policy. Ideally you should set this to by an even divisor or online_rollout_n_episodes.
-  online_rollout_batch_size: 1
-  # How many optimization steps (forward, backward, optimizer step) to do between running rollouts.
-  online_steps_between_rollouts: null
-  # The proportion of online samples (vs offline samples) to include in the online training batches.
-  online_sampling_ratio: 0.5
-  # First seed to use for the online rollout environment. Seeds for subsequent rollouts are incremented by 1.
-  online_env_seed: null
-  # Sets the maximum number of frames that are stored in the online buffer for online training. The buffer is
-  # FIFO.
-  online_buffer_capacity: null
-  # The minimum number of frames to have in the online buffer before commencing online training.
-  # If online_buffer_seed_size > online_rollout_n_episodes, the rollout will be run multiple times until the
-  # seed size condition is satisfied.
-  online_buffer_seed_size: 0
-  # Whether to run the online rollouts asynchronously. This means we can run the online training steps in
-  # parallel with the rollouts. This might be advised if your GPU has the bandwidth to handle training
-  # + eval + environment rendering simultaneously.
-  do_online_rollout_async: false
-
-  image_transforms:
-  # These transforms are all using standard torchvision.transforms.v2
-  # You can find out how these transformations affect images here:
-  # https://pytorch.org/vision/0.18/auto_examples/transforms/plot_transforms_illustrations.html
-  # We use a custom RandomSubsetApply container to sample them.
-  # For each transform, the following parameters are available:
-  #   weight: This represents the multinomial probability (with no replacement)
-  #           used for sampling the transform. If the sum of the weights is not 1,
-  #           they will be normalized.
-  #   min_max: Lower & upper bound respectively used for sampling the transform's parameter
-  #           (following uniform distribution) when it's applied.
-    # Set this flag to `true` to enable transforms during training
-    enable: false
-    # This is the maximum number of transforms (sampled from these below) that will be applied to each frame.
-    # It's an integer in the interval [1, number of available transforms].
-    max_num_transforms: 3
-    # By default, transforms are applied in Torchvision's suggested order (shown below).
-    # Set this to True to apply them in a random order.
-    random_order: false
-    brightness:
-      weight: 1
-      min_max: [0.8, 1.2]
-    contrast:
-      weight: 1
-      min_max: [0.8, 1.2]
-    saturation:
-      weight: 1
-      min_max: [0.5, 1.5]
-    hue:
-      weight: 1
-      min_max: [-0.05, 0.05]
-    sharpness:
-      weight: 1
-      min_max: [0.8, 1.2]
-
-eval:
-  n_episodes: 1
-  # `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
-  batch_size: 1
-  # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
-  use_async_envs: false
-
-wandb:
-  enable: false
-  # Set to true to disable saving an artifact despite save_checkpoint == True
-  disable_artifact: false
-  project: lerobot
-  notes: ""
--- a/lerobot/configs/env/aloha.yaml
+++ b/lerobot/configs/env/aloha.yaml
@@ -1,14 +0,0 @@
-# @package _global_
-
-fps: 50
-
-env:
-  name: aloha
-  task: AlohaInsertion-v0
-  state_dim: 14
-  action_dim: 14
-  fps: ${fps}
-  episode_length: 400
-  gym:
-    obs_type: pixels_agent_pos
-    render_mode: rgb_array
--- a/lerobot/configs/env/aloha_real.yaml
+++ b/lerobot/configs/env/aloha_real.yaml
@@ -1,10 +0,0 @@
-# @package _global_
-
-fps: 30
-
-env:
-  name: real_world
-  task: null
-  state_dim: 18
-  action_dim: 18
-  fps: ${fps}
--- a/lerobot/configs/env/dora_aloha_real.yaml
+++ b/lerobot/configs/env/dora_aloha_real.yaml
@@ -1,13 +0,0 @@
-# @package _global_
-
-fps: 30
-
-env:
-  name: dora
-  task: DoraAloha-v0
-  state_dim: 14
-  action_dim: 14
-  fps: ${fps}
-  episode_length: 400
-  gym:
-    fps: ${fps}
--- a/lerobot/configs/env/koch_real.yaml
+++ b/lerobot/configs/env/koch_real.yaml
@@ -1,10 +0,0 @@
-# @package _global_
-
-fps: 30
-
-env:
-  name: real_world
-  task: null
-  state_dim: 6
-  action_dim: 6
-  fps: ${fps}
--- a/lerobot/configs/env/moss_real.yaml
+++ b/lerobot/configs/env/moss_real.yaml
@@ -1,10 +0,0 @@
-# @package _global_
-
-fps: 30
-
-env:
-  name: real_world
-  task: null
-  state_dim: 6
-  action_dim: 6
-  fps: ${fps}
--- a/lerobot/configs/env/pusht.yaml
+++ b/lerobot/configs/env/pusht.yaml
@@ -1,17 +0,0 @@
-# @package _global_
-
-fps: 10
-
-env:
-  name: pusht
-  task: PushT-v0
-  image_size: 96
-  state_dim: 2
-  action_dim: 2
-  fps: ${fps}
-  episode_length: 300
-  gym:
-    obs_type: pixels_agent_pos
-    render_mode: rgb_array
-    visualization_width: 384
-    visualization_height: 384
--- a/lerobot/configs/env/so100_real.yaml
+++ b/lerobot/configs/env/so100_real.yaml
@@ -1,10 +0,0 @@
-# @package _global_
-
-fps: 30
-
-env:
-  name: real_world
-  task: null
-  state_dim: 6
-  action_dim: 6
-  fps: ${fps}
--- a/lerobot/configs/env/xarm.yaml
+++ b/lerobot/configs/env/xarm.yaml
@@ -1,17 +0,0 @@
-# @package _global_
-
-fps: 15
-
-env:
-  name: xarm
-  task: XarmLift-v0
-  image_size: 84
-  state_dim: 4
-  action_dim: 4
-  fps: ${fps}
-  episode_length: 200
-  gym:
-    obs_type: pixels_agent_pos
-    render_mode: rgb_array
-    visualization_width: 384
-    visualization_height: 384
--- a/lerobot/configs/eval.py
+++ b/lerobot/configs/eval.py
@@ -0,0 +1,84 @@
+import datetime as dt
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from lerobot.common import envs, policies  # noqa: F401
+from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available, is_torch_device_available
+from lerobot.configs import parser
+from lerobot.configs.default import EvalConfig
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.train import TrainPipelineConfig
+
+
+@dataclass
+class EvalPipelineConfig:
+    env: envs.EnvConfig
+    eval: EvalConfig = field(default_factory=EvalConfig)
+    # Either the repo ID of a model hosted on the Hub or a path to a directory containing weights
+    # saved using `Policy.save_pretrained`. If not provided, the policy is initialized from scratch
+    # (useful for debugging). This argument is mutually exclusive with `--config`.
+    policy: PreTrainedConfig | None = None
+    # When left empty, `output_dir` and `job_name` are derived in `__post_init__`.
+    output_dir: Path | None = None
+    job_name: str | None = None
+    # TODO(rcadene, aliberts): By default, use device and use_amp values from policy checkpoint.
+    device: str | None = None  # cuda | cpu | mps
+    # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
+    # automatic gradient scaling is used.
+    # NOTE: the default is False, so the `use_amp is None` checks in `__post_init__` only trigger when
+    # None is passed explicitly.
+    use_amp: bool = False
+    seed: int | None = 1000
+
+    def __post_init__(self):
+        """Resolve the pretrained policy, device, AMP flag, job name and output directory.
+
+        Raises:
+            ValueError: If no device could be determined, or if the device is "cuda" and `use_amp`
+                is None.
+        """
+        # HACK: We parse again the cli args here to get the pretrained path if there was one.
+        policy_path = parser.get_path_arg("policy")
+        if policy_path:
+            cli_overrides = parser.get_cli_overrides("policy")
+            self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
+            self.policy.pretrained_path = policy_path
+
+            # When no device or use_amp are given, use the one from training config.
+            if self.device is None or self.use_amp is None:
+                train_cfg = TrainPipelineConfig.from_pretrained(policy_path)
+                if self.device is None:
+                    self.device = train_cfg.device
+                if self.use_amp is None:
+                    self.use_amp = train_cfg.use_amp
+
+            # Automatically switch to available device if necessary
+            if not is_torch_device_available(self.device):
+                auto_device = auto_select_torch_device()
+                logging.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.")
+                self.device = auto_device
+
+            # Automatically deactivate AMP if necessary
+            if self.use_amp and not is_amp_available(self.device):
+                logging.warning(
+                    f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP."
+                )
+                self.use_amp = False
+
+        else:
+            logging.warning(
+                "No pretrained path was provided, evaluated policy will be built from scratch (random weights)."
+            )
+
+        if not self.job_name:
+            # Name the job after whichever of env / policy is configured. Reading `self.policy.type`
+            # unconditionally raised an AttributeError when no policy was set.
+            name_parts = [cfg.type for cfg in (self.env, self.policy) if cfg is not None]
+            self.job_name = "_".join(name_parts)
+
+        if not self.output_dir:
+            now = dt.datetime.now()
+            eval_dir = f"{now:%Y-%m-%d}/{now:%H-%M-%S}_{self.job_name}"
+            self.output_dir = Path("outputs/eval") / eval_dir
+
+        if self.device is None:
+            raise ValueError("Set one of the following device: cuda, cpu or mps")
+        elif self.device == "cuda" and self.use_amp is None:
+            raise ValueError("Set 'use_amp' to True or False.")
+
+    @classmethod
+    def __get_path_fields__(cls) -> list[str]:
+        """This enables the parser to load config from the policy using `--policy.path=local/dir`"""
+        return ["policy"]
--- a/lerobot/configs/parser.py
+++ b/lerobot/configs/parser.py
@@ -0,0 +1,125 @@
+import inspect
+import sys
+from argparse import ArgumentError
+from functools import wraps
+from pathlib import Path
+from typing import Sequence
+
+import draccus
+
+from lerobot.common.utils.utils import has_method
+
+# Name of the nested CLI sub-argument (`--<field>.path=...`) looked up by `get_path_arg`.
+PATH_KEY = "path"
+# Module-level side effect: draccus is switched to the "json" config type as soon as this module is imported.
+draccus.set_config_type("json")
+
+
+def get_cli_overrides(field_name: str, args: Sequence[str] | None = None) -> list[str] | None:
+    """Collect the CLI arguments nested under `field_name`, with that prefix stripped.
+
+    For instance, if the main script was launched as:
+    python myscript.py --arg1=1 --arg2.subarg1=abc --arg2.subarg2=some/path
+
+    then, during execution of myscript.py, get_cli_overrides("arg2") returns:
+    ["--subarg1=abc", "--subarg2=some/path"]
+
+    The `--<field_name>.<draccus.CHOICE_TYPE_KEY>=` and `--<field_name>.<PATH_KEY>=` arguments are left
+    out of the result. When `args` is None, `sys.argv[1:]` is inspected instead.
+    """
+    cli_args = sys.argv[1:] if args is None else args
+    nested_prefix = f"--{field_name}."
+    skipped_prefixes = (
+        f"--{field_name}.{draccus.CHOICE_TYPE_KEY}=",
+        f"--{field_name}.{PATH_KEY}=",
+    )
+    return [
+        f"--{arg.removeprefix(nested_prefix)}"
+        for arg in cli_args
+        if arg.startswith(nested_prefix) and not arg.startswith(skipped_prefixes)
+    ]
+
+
+def parse_arg(arg_name: str, args: Sequence[str] | None = None) -> str | None:
+    """Return the value of the first `--<arg_name>=<value>` argument, or None if there is none.
+
+    When `args` is None, `sys.argv[1:]` is searched. Everything after the first `=` is returned
+    verbatim, so the value may be empty or contain further `=` characters.
+    """
+    flag = f"--{arg_name}="
+    candidates = sys.argv[1:] if args is None else args
+    values = (arg.removeprefix(flag) for arg in candidates if arg.startswith(flag))
+    return next(values, None)
+
+
+def get_path_arg(field_name: str, args: Sequence[str] | None = None) -> str | None:
+    """Return the value passed as `--<field_name>.<PATH_KEY>=...`, or None when it is absent."""
+    path_arg_name = f"{field_name}.{PATH_KEY}"
+    return parse_arg(path_arg_name, args)
+
+
+def get_type_arg(field_name: str, args: Sequence[str] | None = None) -> str | None:
+    """Return the value passed as `--<field_name>.<draccus.CHOICE_TYPE_KEY>=...`, or None when absent."""
+    type_arg_name = f"{field_name}.{draccus.CHOICE_TYPE_KEY}"
+    return parse_arg(type_arg_name, args)
+
+
+def filter_arg(field_to_filter: str, args: Sequence[str] | None = None) -> list[str]:
+    """Return `args` without any `--<field_to_filter>=...` entries.
+
+    Args:
+        field_to_filter: Name of the argument to drop (without the leading dashes).
+        args: Arguments to filter. Defaults to `sys.argv[1:]` when None, like the other helpers
+            in this module (the declared default of None used to raise a TypeError).
+
+    Returns:
+        A new list holding the remaining arguments in their original order.
+    """
+    if args is None:
+        args = sys.argv[1:]
+    prefix = f"--{field_to_filter}="
+    return [arg for arg in args if not arg.startswith(prefix)]
+
+
+def filter_path_args(fields_to_filter: str | list[str], args: Sequence[str] | None = None) -> list[str]:
+    """
+    Filters command-line arguments related to fields with specific path arguments.
+
+    For every field that has a non-empty `--<field>.<PATH_KEY>=...` argument, all arguments starting
+    with `--<field>.` are removed. Fields without a path argument are left untouched.
+
+    Args:
+        fields_to_filter (str | list[str]): A single str or a list of str whose arguments need to be filtered.
+        args (Sequence[str] | None): The sequence of command-line arguments to be filtered.
+            Defaults to None, in which case `sys.argv[1:]` is used.
+
+    Returns:
+        list[str]: A filtered list of arguments, with arguments related to the specified
+        fields removed. A new list is always returned, even when nothing was filtered.
+
+    Raises:
+        ArgumentError: If both a path argument (e.g., `--field_name.path`) and a type
+            argument (e.g., `--field_name.type`) are specified for the same field.
+    """
+    if isinstance(fields_to_filter, str):
+        fields_to_filter = [fields_to_filter]
+
+    # Resolve the default up front: the lookups below already fell back to sys.argv, but the
+    # filtering itself iterated over None (TypeError) or returned None unchanged.
+    if args is None:
+        args = sys.argv[1:]
+
+    filtered_args = list(args)
+    for field in fields_to_filter:
+        if get_path_arg(field, args):
+            if get_type_arg(field, args):
+                raise ArgumentError(
+                    argument=None,
+                    message=f"Cannot specify both --{field}.{PATH_KEY} and --{field}.{draccus.CHOICE_TYPE_KEY}",
+                )
+            filtered_args = [arg for arg in filtered_args if not arg.startswith(f"--{field}.")]
+
+    return filtered_args
+
+
+def wrap(config_path: Path | None = None):
+    """
+    HACK: Similar to draccus.wrap but does two additional things:
+        - Will remove '.path' arguments from CLI in order to process them later on.
+        - If a 'config_path' is passed and the main config class has a 'from_pretrained' method, will
+          initialize it from there to allow to fetch configs from the hub directly
+
+    The config class is taken from the type annotation of the decorated function's first positional
+    parameter. If the function is called with an instance of exactly that class as first argument,
+    that instance is used as-is and no CLI parsing happens.
+    """
+
+    def wrapper_outer(fn):
+        @wraps(fn)
+        def wrapper_inner(*args, **kwargs):
+            # The annotation of the first positional parameter tells us which config class to build.
+            argspec = inspect.getfullargspec(fn)
+            argtype = argspec.annotations[argspec.args[0]]
+            # Exact type match only: an instance of a subclass of `argtype` does not take this branch.
+            if len(args) > 0 and type(args[0]) is argtype:
+                cfg = args[0]
+                args = args[1:]
+            else:
+                cli_args = sys.argv[1:]
+                # Read `--config_path=` before any filtering of the CLI arguments.
+                config_path_cli = parse_arg("config_path", cli_args)
+                if has_method(argtype, "__get_path_fields__"):
+                    # Drop the arguments of fields given through `--<field>.path=`.
+                    path_fields = argtype.__get_path_fields__()
+                    cli_args = filter_path_args(path_fields, cli_args)
+                if has_method(argtype, "from_pretrained") and config_path_cli:
+                    # NOTE: in this branch the `config_path` given to `wrap` is not used; only the CLI value is.
+                    cli_args = filter_arg("config_path", cli_args)
+                    cfg = argtype.from_pretrained(config_path_cli, cli_args=cli_args)
+                else:
+                    cfg = draccus.parse(config_class=argtype, config_path=config_path, args=cli_args)
+            response = fn(cfg, *args, **kwargs)
+            return response
+
+        return wrapper_inner
+
+    return wrapper_outer
--- a/lerobot/configs/policies.py
+++ b/lerobot/configs/policies.py
@@ -0,0 +1,145 @@
+import abc
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Type, TypeVar
+
+import draccus
+from huggingface_hub import hf_hub_download
+from huggingface_hub.constants import CONFIG_NAME
+from huggingface_hub.errors import HfHubHTTPError
+
+from lerobot.common.optim.optimizers import OptimizerConfig
+from lerobot.common.optim.schedulers import LRSchedulerConfig
+from lerobot.common.utils.hub import HubMixin
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
+
+# Generic variable that is either PreTrainedConfig or a subclass thereof
+T = TypeVar("T", bound="PreTrainedConfig")
+
+
+@dataclass
+class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
+    """
+    Base configuration class for policy models.
+
+    Args:
+        n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the
+            current step and additional steps going back).
+        normalization_mapping: A dictionary mapping a string key to the `NormalizationMode` to apply
+            (presumably keyed by feature type -- confirm against the policy implementations).
+        input_features: A dictionary mapping each input name to its `PolicyFeature`.
+        output_features: A dictionary mapping each output name to its `PolicyFeature`.
+    """
+
+    n_obs_steps: int = 1
+    normalization_mapping: dict[str, NormalizationMode] = field(default_factory=dict)
+
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+
+    def __post_init__(self):
+        # Plain attribute (not a dataclass field); callers set it after loading a pretrained config.
+        self.pretrained_path = None
+
+    @property
+    def type(self) -> str:
+        """Name under which this config class is registered in the draccus choice registry."""
+        return self.get_choice_name(self.__class__)
+
+    # `abc.abstractproperty` is deprecated; stacking `@property` over `@abc.abstractmethod` is the
+    # documented equivalent.
+    @property
+    @abc.abstractmethod
+    def observation_delta_indices(self) -> list | None:
+        raise NotImplementedError
+
+    @property
+    @abc.abstractmethod
+    def action_delta_indices(self) -> list | None:
+        raise NotImplementedError
+
+    @property
+    @abc.abstractmethod
+    def reward_delta_indices(self) -> list | None:
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_optimizer_preset(self) -> OptimizerConfig:
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_scheduler_preset(self) -> LRSchedulerConfig | None:
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def validate_features(self) -> None:
+        raise NotImplementedError
+
+    @property
+    def robot_state_feature(self) -> PolicyFeature | None:
+        """First input feature whose type is `FeatureType.STATE`, or None if there is none."""
+        for _, ft in self.input_features.items():
+            if ft.type is FeatureType.STATE:
+                return ft
+        return None
+
+    @property
+    def env_state_feature(self) -> PolicyFeature | None:
+        """First input feature whose type is `FeatureType.ENV`, or None if there is none."""
+        for _, ft in self.input_features.items():
+            if ft.type is FeatureType.ENV:
+                return ft
+        return None
+
+    @property
+    def image_features(self) -> dict[str, PolicyFeature]:
+        """All input features whose type is `FeatureType.VISUAL`, keyed by their input name."""
+        return {key: ft for key, ft in self.input_features.items() if ft.type is FeatureType.VISUAL}
+
+    @property
+    def action_feature(self) -> PolicyFeature | None:
+        """First output feature whose type is `FeatureType.ACTION`, or None if there is none."""
+        for _, ft in self.output_features.items():
+            if ft.type is FeatureType.ACTION:
+                return ft
+        return None
+
+    def _save_pretrained(self, save_directory: Path) -> None:
+        """Dump this config as JSON to `save_directory / CONFIG_NAME`."""
+        with open(save_directory / CONFIG_NAME, "w") as f, draccus.config_type("json"):
+            draccus.dump(self, f, indent=4)
+
+    @classmethod
+    def from_pretrained(
+        cls: Type[T],
+        pretrained_name_or_path: str | Path,
+        *,
+        force_download: bool = False,
+        resume_download: bool | None = None,
+        proxies: dict | None = None,
+        token: str | bool | None = None,
+        cache_dir: str | Path | None = None,
+        local_files_only: bool = False,
+        revision: str | None = None,
+        **policy_kwargs,
+    ) -> T:
+        """Load a config from a local directory or from the Hugging Face Hub.
+
+        If `pretrained_name_or_path` is an existing directory, `CONFIG_NAME` is looked up inside it.
+        Otherwise it is treated as a Hub repo id and `CONFIG_NAME` is downloaded with
+        `hf_hub_download`, to which the download-related keyword arguments are forwarded.
+
+        An optional `cli_overrides` keyword argument (list of CLI-style arguments) is passed to
+        `draccus.parse` as `args`.
+
+        Raises:
+            FileNotFoundError: If the Hub download fails with an `HfHubHTTPError`.
+        """
+        model_id = str(pretrained_name_or_path)
+        config_file: str | None = None
+        if Path(model_id).is_dir():
+            if CONFIG_NAME in os.listdir(model_id):
+                config_file = os.path.join(model_id, CONFIG_NAME)
+            else:
+                # NOTE(review): unlike the Hub branch this only prints; `config_file` stays None and
+                # parsing proceeds without a config file. Confirm this is intentional.
+                print(f"{CONFIG_NAME} not found in {Path(model_id).resolve()}")
+        else:
+            try:
+                config_file = hf_hub_download(
+                    repo_id=model_id,
+                    filename=CONFIG_NAME,
+                    revision=revision,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    token=token,
+                    local_files_only=local_files_only,
+                )
+            except HfHubHTTPError as e:
+                raise FileNotFoundError(
+                    f"{CONFIG_NAME} not found on the HuggingFace Hub in {model_id}"
+                ) from e
+
+        # HACK: this is very ugly, ideally we'd like to be able to do that natively with draccus
+        # something like --policy.path (in addition to --policy.type)
+        cli_overrides = policy_kwargs.pop("cli_overrides", [])
+        return draccus.parse(cls, config_file, args=cli_overrides)
--- a/lerobot/configs/policy/act.yaml
+++ b/lerobot/configs/policy/act.yaml
@@ -1,82 +0,0 @@
-# @package _global_
-
-seed: 1000
-dataset_repo_id: lerobot/aloha_sim_insertion_human
-
-override_dataset_stats:
-  observation.images.top:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-training:
-  offline_steps: 100000
-  online_steps: 0
-  eval_freq: 20000
-  save_freq: 20000
-  save_checkpoint: true
-
-  batch_size: 8
-  lr: 1e-5
-  lr_backbone: 1e-5
-  weight_decay: 1e-4
-  grad_clip_norm: 10
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-# See `configuration_act.py` for more details.
-policy:
-  name: act
-
-  # Input / output structure.
-  n_obs_steps: 1
-  chunk_size: 100 # chunk_size
-  n_action_steps: 100
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.images.top: [3, 480, 640]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.images.top: mean_std
-    observation.state: mean_std
-  output_normalization_modes:
-    action: mean_std
-
-  # Architecture.
-  # Vision backbone.
-  vision_backbone: resnet18
-  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
-  replace_final_stride_with_dilation: false
-  # Transformer layers.
-  pre_norm: false
-  dim_model: 512
-  n_heads: 8
-  dim_feedforward: 3200
-  feedforward_activation: relu
-  n_encoder_layers: 4
-  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
-  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
-  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
-  n_decoder_layers: 1
-  # VAE.
-  use_vae: true
-  latent_dim: 32
-  n_vae_encoder_layers: 4
-
-  # Inference.
-  temporal_ensemble_coeff: null
-
-  # Training and loss computation.
-  dropout: 0.1
-  kl_weight: 10.0
--- a/lerobot/configs/policy/act_aloha_real.yaml
+++ b/lerobot/configs/policy/act_aloha_real.yaml
@@ -1,121 +0,0 @@
-# @package _global_
-
-# Use `act_aloha_real.yaml` to train on real-world datasets collected on Aloha or Aloha-2 robots.
-# Compared to `act.yaml`, it contains 4 cameras (i.e. cam_right_wrist, cam_left_wrist, cam_high, cam_low) instead of 1 camera (i.e. top).
-# Also, `training.eval_freq` is set to -1. This config is used to evaluate checkpoints at a certain frequency of training steps.
-# When it is set to -1, it deactivates evaluation. This is because real-world evaluation is done through our `control_robot.py` script.
-# Look at the documentation in header of `control_robot.py` for more information on how to collect data , train and evaluate a policy.
-#
-# Example of usage for training and inference with `control_robot.py`:
-# ```bash
-# python lerobot/scripts/train.py \
-#   policy=act_aloha_real \
-#   env=aloha_real
-# ```
-#
-# Example of usage for training and inference with [Dora-rs](https://github.com/dora-rs/dora-lerobot):
-# ```bash
-# python lerobot/scripts/train.py \
-#   policy=act_aloha_real \
-#   env=dora_aloha_real
-# ```
-
-seed: 1000
-dataset_repo_id: lerobot/aloha_static_vinh_cup
-
-override_dataset_stats:
-  observation.images.cam_right_wrist:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.cam_left_wrist:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.cam_high:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.cam_low:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-training:
-  offline_steps: 80000
-  online_steps: 0
-  eval_freq: -1
-  save_freq: 10000
-  log_freq: 100
-  save_checkpoint: true
-
-  batch_size: 8
-  lr: 1e-5
-  lr_backbone: 1e-5
-  weight_decay: 1e-4
-  grad_clip_norm: 10
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-# See `configuration_act.py` for more details.
-policy:
-  name: act
-
-  # Input / output structure.
-  n_obs_steps: 1
-  chunk_size: 100
-  n_action_steps: 100
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.images.cam_right_wrist: [3, 480, 640]
-    observation.images.cam_left_wrist: [3, 480, 640]
-    observation.images.cam_high: [3, 480, 640]
-    observation.images.cam_low: [3, 480, 640]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.images.cam_right_wrist: mean_std
-    observation.images.cam_left_wrist: mean_std
-    observation.images.cam_high: mean_std
-    observation.images.cam_low: mean_std
-    observation.state: mean_std
-  output_normalization_modes:
-    action: mean_std
-
-  # Architecture.
-  # Vision backbone.
-  vision_backbone: resnet18
-  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
-  replace_final_stride_with_dilation: false
-  # Transformer layers.
-  pre_norm: false
-  dim_model: 512
-  n_heads: 8
-  dim_feedforward: 3200
-  feedforward_activation: relu
-  n_encoder_layers: 4
-  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
-  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
-  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
-  n_decoder_layers: 1
-  # VAE.
-  use_vae: true
-  latent_dim: 32
-  n_vae_encoder_layers: 4
-
-  # Inference.
-  temporal_ensemble_coeff: null
-
-  # Training and loss computation.
-  dropout: 0.1
-  kl_weight: 10.0
--- a/lerobot/configs/policy/act_koch_real.yaml
+++ b/lerobot/configs/policy/act_koch_real.yaml
@@ -1,102 +0,0 @@
-# @package _global_
-
-# Use `act_koch_real.yaml` to train on real-world datasets collected on Alexander Koch's robots.
-# Compared to `act.yaml`, it contains 2 cameras (i.e. laptop, phone) instead of 1 camera (i.e. top).
-# Also, `training.eval_freq` is set to -1. This config is used to evaluate checkpoints at a certain frequency of training steps.
-# When it is set to -1, it deactivates evaluation. This is because real-world evaluation is done through our `control_robot.py` script.
-# Look at the documentation in the header of `control_robot.py` for more information on how to collect data, train and evaluate a policy.
-#
-# Example of usage for training:
-# ```bash
-# python lerobot/scripts/train.py \
-#   policy=act_koch_real \
-#   env=koch_real
-# ```
-
-seed: 1000
-dataset_repo_id: lerobot/koch_pick_place_lego
-
-override_dataset_stats:
-  observation.images.laptop:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.phone:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-training:
-  offline_steps: 80000
-  online_steps: 0
-  eval_freq: -1
-  save_freq: 10000
-  log_freq: 100
-  save_checkpoint: true
-
-  batch_size: 8
-  lr: 1e-5
-  lr_backbone: 1e-5
-  weight_decay: 1e-4
-  grad_clip_norm: 10
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-# See `configuration_act.py` for more details.
-policy:
-  name: act
-
-  # Input / output structure.
-  n_obs_steps: 1
-  chunk_size: 100
-  n_action_steps: 100
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.images.laptop: [3, 480, 640]
-    observation.images.phone: [3, 480, 640]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.images.laptop: mean_std
-    observation.images.phone: mean_std
-    observation.state: mean_std
-  output_normalization_modes:
-    action: mean_std
-
-  # Architecture.
-  # Vision backbone.
-  vision_backbone: resnet18
-  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
-  replace_final_stride_with_dilation: false
-  # Transformer layers.
-  pre_norm: false
-  dim_model: 512
-  n_heads: 8
-  dim_feedforward: 3200
-  feedforward_activation: relu
-  n_encoder_layers: 4
-  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
-  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
-  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
-  n_decoder_layers: 1
-  # VAE.
-  use_vae: true
-  latent_dim: 32
-  n_vae_encoder_layers: 4
-
-  # Inference.
-  temporal_ensemble_coeff: null
-
-  # Training and loss computation.
-  dropout: 0.1
-  kl_weight: 10.0
--- a/lerobot/configs/policy/act_moss_real.yaml
+++ b/lerobot/configs/policy/act_moss_real.yaml
@@ -1,102 +0,0 @@
-# @package _global_
-
-# Use `act_moss_real.yaml` to train on real-world datasets collected on Moss robots.
-# Compared to `act.yaml`, it contains 2 cameras (i.e. laptop, phone) instead of 1 camera (i.e. top).
-# Also, `training.eval_freq` is set to -1. This config is used to evaluate checkpoints at a certain frequency of training steps.
-# When it is set to -1, it deactivates evaluation. This is because real-world evaluation is done through our `control_robot.py` script.
-# Look at the documentation in the header of `control_robot.py` for more information on how to collect data, train and evaluate a policy.
-#
-# Example of usage for training:
-# ```bash
-# python lerobot/scripts/train.py \
-#   policy=act_moss_real \
-#   env=moss_real
-# ```
-
-seed: 1000
-dataset_repo_id: lerobot/moss_pick_place_lego
-
-override_dataset_stats:
-  observation.images.laptop:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.phone:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-training:
-  offline_steps: 80000
-  online_steps: 0
-  eval_freq: -1
-  save_freq: 10000
-  log_freq: 100
-  save_checkpoint: true
-
-  batch_size: 8
-  lr: 1e-5
-  lr_backbone: 1e-5
-  weight_decay: 1e-4
-  grad_clip_norm: 10
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-# See `configuration_act.py` for more details.
-policy:
-  name: act
-
-  # Input / output structure.
-  n_obs_steps: 1
-  chunk_size: 100
-  n_action_steps: 100
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.images.laptop: [3, 480, 640]
-    observation.images.phone: [3, 480, 640]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.images.laptop: mean_std
-    observation.images.phone: mean_std
-    observation.state: mean_std
-  output_normalization_modes:
-    action: mean_std
-
-  # Architecture.
-  # Vision backbone.
-  vision_backbone: resnet18
-  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
-  replace_final_stride_with_dilation: false
-  # Transformer layers.
-  pre_norm: false
-  dim_model: 512
-  n_heads: 8
-  dim_feedforward: 3200
-  feedforward_activation: relu
-  n_encoder_layers: 4
-  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
-  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
-  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
-  n_decoder_layers: 1
-  # VAE.
-  use_vae: true
-  latent_dim: 32
-  n_vae_encoder_layers: 4
-
-  # Inference.
-  temporal_ensemble_coeff: null
-
-  # Training and loss computation.
-  dropout: 0.1
-  kl_weight: 10.0
--- a/lerobot/configs/policy/act_so100_real.yaml
+++ b/lerobot/configs/policy/act_so100_real.yaml
@@ -1,102 +0,0 @@
-# @package _global_
-
-# Use `act_so100_real.yaml` to train on real-world datasets collected on SO-100 robots.
-# Compared to `act.yaml`, it contains 2 cameras (i.e. laptop, phone) instead of 1 camera (i.e. top).
-# Also, `training.eval_freq` is set to -1. This config is used to evaluate checkpoints at a certain frequency of training steps.
-# When it is set to -1, it deactivates evaluation. This is because real-world evaluation is done through our `control_robot.py` script.
-# Look at the documentation in the header of `control_robot.py` for more information on how to collect data, train and evaluate a policy.
-#
-# Example of usage for training:
-# ```bash
-# python lerobot/scripts/train.py \
-#   policy=act_so100_real \
-#   env=so100_real
-# ```
-
-seed: 1000
-dataset_repo_id: lerobot/so100_pick_place_lego
-
-override_dataset_stats:
-  observation.images.laptop:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.phone:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-training:
-  offline_steps: 80000
-  online_steps: 0
-  eval_freq: -1
-  save_freq: 10000
-  log_freq: 100
-  save_checkpoint: true
-
-  batch_size: 8
-  lr: 1e-5
-  lr_backbone: 1e-5
-  weight_decay: 1e-4
-  grad_clip_norm: 10
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-# See `configuration_act.py` for more details.
-policy:
-  name: act
-
-  # Input / output structure.
-  n_obs_steps: 1
-  chunk_size: 100
-  n_action_steps: 100
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.images.laptop: [3, 480, 640]
-    observation.images.phone: [3, 480, 640]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.images.laptop: mean_std
-    observation.images.phone: mean_std
-    observation.state: mean_std
-  output_normalization_modes:
-    action: mean_std
-
-  # Architecture.
-  # Vision backbone.
-  vision_backbone: resnet18
-  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
-  replace_final_stride_with_dilation: false
-  # Transformer layers.
-  pre_norm: false
-  dim_model: 512
-  n_heads: 8
-  dim_feedforward: 3200
-  feedforward_activation: relu
-  n_encoder_layers: 4
-  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
-  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
-  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
-  n_decoder_layers: 1
-  # VAE.
-  use_vae: true
-  latent_dim: 32
-  n_vae_encoder_layers: 4
-
-  # Inference.
-  temporal_ensemble_coeff: null
-
-  # Training and loss computation.
-  dropout: 0.1
-  kl_weight: 10.0
--- a/lerobot/configs/policy/diffusion.yaml
+++ b/lerobot/configs/policy/diffusion.yaml
@@ -1,104 +0,0 @@
-# @package _global_
-
-# Defaults for training for the PushT dataset as per https://github.com/real-stanford/diffusion_policy.
-# Note: We do not track EMA model weights as we discovered it does not improve the results. See
-#       https://github.com/huggingface/lerobot/pull/134 for more details.
-
-seed: 100000
-dataset_repo_id: lerobot/pusht
-
-override_dataset_stats:
-  # TODO(rcadene, alexander-soare): should we remove image stats as well? do we use a pretrained vision model?
-  observation.image:
-    mean: [[[0.5]], [[0.5]], [[0.5]]]  # (c,1,1)
-    std: [[[0.5]], [[0.5]], [[0.5]]]  # (c,1,1)
-  # TODO(rcadene, alexander-soare): we override state and action stats to use the same as the pretrained model
-  # from the original codebase, but we should remove these and train our own pretrained model
-  observation.state:
-    min: [13.456424, 32.938293]
-    max: [496.14618, 510.9579]
-  action:
-    min: [12.0, 25.0]
-    max: [511.0, 511.0]
-
-training:
-  offline_steps: 200000
-  online_steps: 0
-  eval_freq: 25000
-  save_freq: 25000
-  save_checkpoint: true
-
-  batch_size: 64
-  grad_clip_norm: 10
-  lr: 1.0e-4
-  lr_scheduler: cosine
-  lr_warmup_steps: 500
-  adam_betas: [0.95, 0.999]
-  adam_eps: 1.0e-8
-  adam_weight_decay: 1.0e-6
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    observation.image: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
-    observation.state: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
-    action: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1 - ${policy.n_obs_steps} + ${policy.horizon})]"
-
-  # The original implementation doesn't sample frames for the last 7 steps,
-  # which avoids excessive padding and leads to improved training results.
-  drop_n_last_frames: 7  # ${policy.horizon} - ${policy.n_action_steps} - ${policy.n_obs_steps} + 1
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-policy:
-  name: diffusion
-
-  # Input / output structure.
-  n_obs_steps: 2
-  horizon: 16
-  n_action_steps: 8
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.image: [3, 96, 96]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.image: mean_std
-    observation.state: min_max
-  output_normalization_modes:
-    action: min_max
-
-  # Architecture / modeling.
-  # Vision backbone.
-  vision_backbone: resnet18
-  crop_shape: [84, 84]
-  crop_is_random: True
-  pretrained_backbone_weights: null
-  use_group_norm: True
-  spatial_softmax_num_keypoints: 32
-  # Unet.
-  down_dims: [512, 1024, 2048]
-  kernel_size: 5
-  n_groups: 8
-  diffusion_step_embed_dim: 128
-  use_film_scale_modulation: True
-  # Noise scheduler.
-  noise_scheduler_type: DDPM
-  num_train_timesteps: 100
-  beta_schedule: squaredcos_cap_v2
-  beta_start: 0.0001
-  beta_end: 0.02
-  prediction_type: epsilon # epsilon / sample
-  clip_sample: True
-  clip_sample_range: 1.0
-
-  # Inference
-  num_inference_steps: null  # if not provided, defaults to `num_train_timesteps`
-
-  # Loss computation
-  do_mask_loss_for_padding: false
--- a/lerobot/configs/policy/diffusion_pusht_keypoints.yaml
+++ b/lerobot/configs/policy/diffusion_pusht_keypoints.yaml
@@ -1,110 +0,0 @@
-# @package _global_
-
-# Defaults for training for the pusht_keypoints dataset.
-
-# The keypoints are on the vertices of the rectangles that make up the PushT as documented in the PushT
-# environment:
-# https://github.com/huggingface/gym-pusht/blob/5e2489be9ff99ed9cd47b6c653dda3b7aa844d24/gym_pusht/envs/pusht.py#L522-L534
-# For completeness, the diagram is copied here:
-#        0───────────1
-#        │           │
-#        3───4───5───2
-#            │   │
-#            │   │
-#            │   │
-#            │   │
-#            7───6
-
-
-# Note: The original work trains keypoints-only with conditioning via inpainting. Here, we encode the
-# observation along with the agent position and use the encoding as global conditioning for the denoising
-# U-Net.
-
-# Note: We do not track EMA model weights as we discovered it does not improve the results. See
-#       https://github.com/huggingface/lerobot/pull/134 for more details.
-
-seed: 100000
-dataset_repo_id: lerobot/pusht_keypoints
-
-training:
-  offline_steps: 200000
-  online_steps: 0
-  eval_freq: 5000
-  save_freq: 5000
-  log_freq: 250
-  save_checkpoint: true
-
-  batch_size: 64
-  grad_clip_norm: 10
-  lr: 1.0e-4
-  lr_scheduler: cosine
-  lr_warmup_steps: 500
-  adam_betas: [0.95, 0.999]
-  adam_eps: 1.0e-8
-  adam_weight_decay: 1.0e-6
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    observation.environment_state: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
-    observation.state: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
-    action: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1 - ${policy.n_obs_steps} + ${policy.horizon})]"
-
-  # The original implementation doesn't sample frames for the last 7 steps,
-  # which avoids excessive padding and leads to improved training results.
-  drop_n_last_frames: 7  # ${policy.horizon} - ${policy.n_action_steps} - ${policy.n_obs_steps} + 1
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-policy:
-  name: diffusion
-
-  # Input / output structure.
-  n_obs_steps: 2
-  horizon: 16
-  n_action_steps: 8
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.environment_state: [16]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.environment_state: min_max
-    observation.state: min_max
-  output_normalization_modes:
-    action: min_max
-
-  # Architecture / modeling.
-  # Vision backbone.
-  vision_backbone: resnet18
-  crop_shape: [84, 84]
-  crop_is_random: True
-  pretrained_backbone_weights: null
-  use_group_norm: True
-  spatial_softmax_num_keypoints: 32
-  # Unet.
-  down_dims: [256, 512, 1024]
-  kernel_size: 5
-  n_groups: 8
-  diffusion_step_embed_dim: 128
-  use_film_scale_modulation: True
-  # Noise scheduler.
-  noise_scheduler_type: DDIM
-  num_train_timesteps: 100
-  beta_schedule: squaredcos_cap_v2
-  beta_start: 0.0001
-  beta_end: 0.02
-  prediction_type: epsilon # epsilon / sample
-  clip_sample: True
-  clip_sample_range: 1.0
-
-  # Inference
-  num_inference_steps: 10  # if not provided, defaults to `num_train_timesteps`
-
-  # Loss computation
-  do_mask_loss_for_padding: false
--- a/lerobot/configs/policy/tdmpc.yaml
+++ b/lerobot/configs/policy/tdmpc.yaml
@@ -1,93 +0,0 @@
-# @package _global_
-
-seed: 1
-dataset_repo_id: lerobot/xarm_lift_medium
-
-training:
-  offline_steps: 50000
-
-  num_workers: 4
-
-  batch_size: 256
-  grad_clip_norm: 10.0
-  lr: 3e-4
-
-  save_freq: 10000
-  eval_freq: 5000
-  log_freq: 100
-
-  online_steps: 50000
-  online_rollout_n_episodes: 1
-  online_rollout_batch_size: 1
-  # Note: in FOWM `online_steps_between_rollouts` is actually dynamically set to match exactly the length of
-  # the last sampled episode.
-  online_steps_between_rollouts: 50
-  online_sampling_ratio: 0.5
-  online_env_seed: 10000
-  # FOWM Push uses 10000 for `online_buffer_capacity`. Given that their maximum episode length for this task
-  # is 25, 10000 is approx 400 of their episodes worth. Since our episodes are about 8 times longer, we'll use
-  # 80000.
-  online_buffer_capacity: 80000
-
-  delta_timestamps:
-    observation.image: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
-    observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
-    action: "[i / ${fps} for i in range(${policy.horizon})]"
-    next.reward: "[i / ${fps} for i in range(${policy.horizon})]"
-
-policy:
-  name: tdmpc
-
-  pretrained_model_path:
-
-  # Input / output structure.
-  n_action_repeats: 2
-  horizon: 5
-  n_action_steps: 1
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.image: [3, 84, 84]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes: null
-  output_normalization_modes:
-    action: min_max
-
-  # Architecture / modeling.
-  # Neural networks.
-  image_encoder_hidden_dim: 32
-  state_encoder_hidden_dim: 256
-  latent_dim: 50
-  q_ensemble_size: 5
-  mlp_dim: 512
-  # Reinforcement learning.
-  discount: 0.9
-
-  # Inference.
-  use_mpc: true
-  cem_iterations: 6
-  max_std: 2.0
-  min_std: 0.05
-  n_gaussian_samples: 512
-  n_pi_samples: 51
-  uncertainty_regularizer_coeff: 1.0
-  n_elites: 50
-  elite_weighting_temperature: 0.5
-  gaussian_mean_momentum: 0.1
-
-  # Training and loss computation.
-  max_random_shift_ratio: 0.0476
-  # Loss coefficients.
-  reward_coeff: 0.5
-  expectile_weight: 0.9
-  value_coeff: 0.1
-  consistency_coeff: 20.0
-  advantage_scaling: 3.0
-  pi_coeff: 0.5
-  temporal_decay_coeff: 0.5
-  # Target model.
-  target_model_momentum: 0.995
--- a/lerobot/configs/policy/tdmpc_pusht_keypoints.yaml
+++ b/lerobot/configs/policy/tdmpc_pusht_keypoints.yaml
@@ -1,105 +0,0 @@
-# @package _global_
-
-# Train with:
-#
-# python lerobot/scripts/train.py \
-#   env=pusht \
-#   env.gym.obs_type=environment_state_agent_pos \
-#   policy=tdmpc_pusht_keypoints \
-#   eval.batch_size=50 \
-#   eval.n_episodes=50 \
-#   eval.use_async_envs=true \
-#   device=cuda \
-#   use_amp=true
-
-seed: 1
-dataset_repo_id: lerobot/pusht_keypoints
-
-training:
-  offline_steps: 0
-
-  # Offline training dataloader
-  num_workers: 4
-
-  batch_size: 256
-  grad_clip_norm: 10.0
-  lr: 3e-4
-
-  eval_freq: 10000
-  log_freq: 500
-  save_freq: 50000
-
-  online_steps: 1000000
-  online_rollout_n_episodes: 10
-  online_rollout_batch_size: 10
-  online_steps_between_rollouts: 1000
-  online_sampling_ratio: 1.0
-  online_env_seed: 10000
-  online_buffer_capacity: 40000
-  online_buffer_seed_size: 0
-  do_online_rollout_async: false
-
-  delta_timestamps:
-    observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
-    observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
-    action: "[i / ${fps} for i in range(${policy.horizon})]"
-    next.reward: "[i / ${fps} for i in range(${policy.horizon})]"
-
-policy:
-  name: tdmpc
-
-  pretrained_model_path:
-
-  # Input / output structure.
-  n_action_repeats: 1
-  horizon: 5
-  n_action_steps: 5
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.environment_state: [16]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.environment_state: min_max
-    observation.state: min_max
-  output_normalization_modes:
-    action: min_max
-
-  # Architecture / modeling.
-  # Neural networks.
-  image_encoder_hidden_dim: 32
-  state_encoder_hidden_dim: 256
-  latent_dim: 50
-  q_ensemble_size: 5
-  mlp_dim: 512
-  # Reinforcement learning.
-  discount: 0.98
-
-  # Inference.
-  use_mpc: true
-  cem_iterations: 6
-  max_std: 2.0
-  min_std: 0.05
-  n_gaussian_samples: 512
-  n_pi_samples: 51
-  uncertainty_regularizer_coeff: 1.0
-  n_elites: 50
-  elite_weighting_temperature: 0.5
-  gaussian_mean_momentum: 0.1
-
-  # Training and loss computation.
-  max_random_shift_ratio: 0.0476
-  # Loss coefficients.
-  reward_coeff: 0.5
-  expectile_weight: 0.9
-  value_coeff: 0.1
-  consistency_coeff: 20.0
-  advantage_scaling: 3.0
-  pi_coeff: 0.5
-  temporal_decay_coeff: 0.5
-  # Target model.
-  target_model_momentum: 0.995
--- a/lerobot/configs/policy/vqbet.yaml
+++ b/lerobot/configs/policy/vqbet.yaml
@@ -1,103 +0,0 @@
-# @package _global_
-
-# Defaults for training for the PushT dataset.
-
-seed: 100000
-dataset_repo_id: lerobot/pusht
-
-override_dataset_stats:
-  # TODO(rcadene, alexander-soare): should we remove image stats as well? do we use a pretrained vision model?
-  observation.image:
-    mean: [[[0.5]], [[0.5]], [[0.5]]]  # (c,1,1)
-    std: [[[0.5]], [[0.5]], [[0.5]]]  # (c,1,1)
-  # TODO(rcadene, alexander-soare): we override state and action stats to use the same as the pretrained model
-  # from the original codebase, but we should remove these and train our own pretrained model
-  observation.state:
-    min: [13.456424, 32.938293]
-    max: [496.14618, 510.9579]
-  action:
-    min: [12.0, 25.0]
-    max: [511.0, 511.0]
-
-training:
-  offline_steps: 250000
-  online_steps: 0
-  eval_freq: 25000
-  save_freq: 25000
-  save_checkpoint: true
-
-  batch_size: 64
-  grad_clip_norm: 10
-  lr: 1.0e-4
-  lr_scheduler: cosine
-  lr_warmup_steps: 500
-  adam_betas: [0.95, 0.999]
-  adam_eps: 1.0e-8
-  adam_weight_decay: 1.0e-6
-  online_steps_between_rollouts: 1
-
-  # VQ-BeT specific
-  vqvae_lr: 1.0e-3
-  n_vqvae_training_steps: 20000
-  bet_weight_decay: 2e-4
-  bet_learning_rate: 5.5e-5
-  bet_betas: [0.9, 0.999]
-
-  delta_timestamps:
-    observation.image: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
-    observation.state: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
-    action: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, ${policy.n_action_pred_token} + ${policy.action_chunk_size} - 1)]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-policy:
-  name: vqbet
-
-  # Input / output structure.
-  n_obs_steps: 5
-  n_action_pred_token: 7
-  action_chunk_size: 5
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.image: [3, 96, 96]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.image: mean_std
-    observation.state: min_max
-  output_normalization_modes:
-    action: min_max
-
-  # Architecture / modeling.
-  # Vision backbone.
-  vision_backbone: resnet18
-  crop_shape: [84, 84]
-  crop_is_random: True
-  pretrained_backbone_weights: null
-  use_group_norm: True
-  spatial_softmax_num_keypoints: 32
-  # VQ-VAE
-  n_vqvae_training_steps: ${training.n_vqvae_training_steps}
-  vqvae_n_embed: 16
-  vqvae_embedding_dim: 256
-  vqvae_enc_hidden_dim: 128
-  # VQ-BeT
-  gpt_block_size: 500
-  gpt_input_dim: 512
-  gpt_output_dim: 512
-  gpt_n_layer: 8
-  gpt_n_head: 8
-  gpt_hidden_dim: 512
-  dropout: 0.1
-  mlp_hidden_dim: 1024
-  offset_loss_weight: 10000.
-  primary_code_loss_weight: 5.0
-  secondary_code_loss_weight: 0.5
-  bet_softmax_temperature: 0.1
-  sequentially_select: False
--- a/lerobot/configs/robot/aloha.yaml
+++ b/lerobot/configs/robot/aloha.yaml
@@ -1,117 +0,0 @@
-# [Aloha: A Low-Cost Hardware for Bimanual Teleoperation](https://www.trossenrobotics.com/aloha-stationary)
-# https://aloha-2.github.io
-
-# Requires installing extra packages
-# With pip: `pip install -e ".[dynamixel intelrealsense]"`
-# With poetry: `poetry install --sync --extras "dynamixel intelrealsense"`
-
-# See [tutorial](https://github.com/huggingface/lerobot/blob/main/examples/9_use_aloha.md)
-
-
-_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot
-robot_type: aloha
-# Specific to Aloha, LeRobot comes with default calibration files. Assuming the motors have been
-# properly assembled, no manual calibration step is expected. If you need to run manual calibration,
-# simply update this path to ".cache/calibration/aloha"
-calibration_dir: .cache/calibration/aloha_default
-
-# /!\ FOR SAFETY, READ THIS /!\
-# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
-# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
-# the number of motors in your follower arms.
-# For Aloha, for every goal position request, motor rotations are capped at 5 degrees by default.
-# When you feel more confident with teleoperation or running the policy, you can extend
-# this safety limit or even remove it by setting it to `null`.
-# Also, everything is expected to work safely out-of-the-box, but we highly advise to
-# first try to teleoperate the grippers only (by commenting out the rest of the motors in this yaml),
-# then to gradually add more motors (by uncommenting), until you can teleoperate both arms fully
-max_relative_target: 5
-
-leader_arms:
-  left:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/ttyDXL_leader_left
-    motors:  # window_x
-      # name: (index, model)
-      waist: [1, xm430-w350]
-      shoulder: [2, xm430-w350]
-      shoulder_shadow: [3, xm430-w350]
-      elbow: [4, xm430-w350]
-      elbow_shadow: [5, xm430-w350]
-      forearm_roll: [6, xm430-w350]
-      wrist_angle: [7, xm430-w350]
-      wrist_rotate: [8, xl430-w250]
-      gripper: [9, xc430-w150]
-  right:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/ttyDXL_leader_right
-    motors:  # window_x
-      # name: (index, model)
-      waist: [1, xm430-w350]
-      shoulder: [2, xm430-w350]
-      shoulder_shadow: [3, xm430-w350]
-      elbow: [4, xm430-w350]
-      elbow_shadow: [5, xm430-w350]
-      forearm_roll: [6, xm430-w350]
-      wrist_angle: [7, xm430-w350]
-      wrist_rotate: [8, xl430-w250]
-      gripper: [9, xc430-w150]
-
-follower_arms:
-  left:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/ttyDXL_follower_left
-    motors:
-      # name: [index, model]
-      waist: [1, xm540-w270]
-      shoulder: [2, xm540-w270]
-      shoulder_shadow: [3, xm540-w270]
-      elbow: [4, xm540-w270]
-      elbow_shadow: [5, xm540-w270]
-      forearm_roll: [6, xm540-w270]
-      wrist_angle: [7, xm540-w270]
-      wrist_rotate: [8, xm430-w350]
-      gripper: [9, xm430-w350]
-  right:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/ttyDXL_follower_right
-    motors:
-      # name: [index, model]
-      waist: [1, xm540-w270]
-      shoulder: [2, xm540-w270]
-      shoulder_shadow: [3, xm540-w270]
-      elbow: [4, xm540-w270]
-      elbow_shadow: [5, xm540-w270]
-      forearm_roll: [6, xm540-w270]
-      wrist_angle: [7, xm540-w270]
-      wrist_rotate: [8, xm430-w350]
-      gripper: [9, xm430-w350]
-
-# Troubleshooting: If one of your IntelRealSense cameras freezes during
-# data recording due to bandwidth limit, you might need to plug the camera
-# on another USB hub or PCIe card.
-cameras:
-  cam_high:
-    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    serial_number: 128422271347
-    fps: 30
-    width: 640
-    height: 480
-  cam_low:
-    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    serial_number: 130322270656
-    fps: 30
-    width: 640
-    height: 480
-  cam_left_wrist:
-    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    serial_number: 218622272670
-    fps: 30
-    width: 640
-    height: 480
-  cam_right_wrist:
-    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    serial_number: 130322272300
-    fps: 30
-    width: 640
-    height: 480
--- a/lerobot/configs/robot/koch.yaml
+++ b/lerobot/configs/robot/koch.yaml
@@ -1,53 +0,0 @@
-_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot
-robot_type: koch
-calibration_dir: .cache/calibration/koch
-
-# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
-# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
-# the number of motors in your follower arms.
-max_relative_target: null
-
-leader_arms:
-  main:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/tty.usbmodem575E0031751
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "xl330-m077"]
-      shoulder_lift: [2, "xl330-m077"]
-      elbow_flex: [3, "xl330-m077"]
-      wrist_flex: [4, "xl330-m077"]
-      wrist_roll: [5, "xl330-m077"]
-      gripper: [6, "xl330-m077"]
-
-follower_arms:
-  main:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/tty.usbmodem575E0032081
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "xl430-w250"]
-      shoulder_lift: [2, "xl430-w250"]
-      elbow_flex: [3, "xl330-m288"]
-      wrist_flex: [4, "xl330-m288"]
-      wrist_roll: [5, "xl330-m288"]
-      gripper: [6, "xl330-m288"]
-
-cameras:
-  laptop:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 0
-    fps: 30
-    width: 640
-    height: 480
-  phone:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 1
-    fps: 30
-    width: 640
-    height: 480
-
-# ~ Koch specific settings ~
-# Sets the leader arm in torque mode with the gripper motor set to this angle. This makes it possible
-# to squeeze the gripper and have it spring back to an open position on its own.
-gripper_open_degree: 35.156
--- a/lerobot/configs/robot/koch_bimanual.yaml
+++ b/lerobot/configs/robot/koch_bimanual.yaml
@@ -1,75 +0,0 @@
-_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot
-robot_type: koch_bimanual
-calibration_dir: .cache/calibration/koch_bimanual
-
-# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
-# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
-# the number of motors in your follower arms.
-max_relative_target: null
-
-leader_arms:
-  left:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/tty.usbmodem585A0085511
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "xl330-m077"]
-      shoulder_lift: [2, "xl330-m077"]
-      elbow_flex: [3, "xl330-m077"]
-      wrist_flex: [4, "xl330-m077"]
-      wrist_roll: [5, "xl330-m077"]
-      gripper: [6, "xl330-m077"]
-  right:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/tty.usbmodem575E0031751
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "xl330-m077"]
-      shoulder_lift: [2, "xl330-m077"]
-      elbow_flex: [3, "xl330-m077"]
-      wrist_flex: [4, "xl330-m077"]
-      wrist_roll: [5, "xl330-m077"]
-      gripper: [6, "xl330-m077"]
-
-follower_arms:
-  left:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/tty.usbmodem585A0076891
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "xl430-w250"]
-      shoulder_lift: [2, "xl430-w250"]
-      elbow_flex: [3, "xl330-m288"]
-      wrist_flex: [4, "xl330-m288"]
-      wrist_roll: [5, "xl330-m288"]
-      gripper: [6, "xl330-m288"]
-  right:
-    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
-    port: /dev/tty.usbmodem575E0032081
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "xl430-w250"]
-      shoulder_lift: [2, "xl430-w250"]
-      elbow_flex: [3, "xl330-m288"]
-      wrist_flex: [4, "xl330-m288"]
-      wrist_roll: [5, "xl330-m288"]
-      gripper: [6, "xl330-m288"]
-
-cameras:
-  laptop:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 0
-    fps: 30
-    width: 640
-    height: 480
-  phone:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 1
-    fps: 30
-    width: 640
-    height: 480
-
-# ~ Koch specific settings ~
-# Sets the leader arm in torque mode with the gripper motor set to this angle. This makes it possible
-# to squeeze the gripper and have it spring back to an open position on its own.
-gripper_open_degree: 35.156
--- a/lerobot/configs/robot/moss.yaml
+++ b/lerobot/configs/robot/moss.yaml
@@ -1,56 +0,0 @@
-# [Moss v1 robot arm](https://github.com/jess-moss/moss-robot-arms)
-
-# Requires installing extras packages
-# With pip: `pip install -e ".[feetech]"`
-# With poetry: `poetry install --sync --extras "feetech"`
-
-# See [tutorial](https://github.com/huggingface/lerobot/blob/main/examples/11_use_moss.md)
-
-_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot
-robot_type: moss
-calibration_dir: .cache/calibration/moss
-
-# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
-# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
-# the number of motors in your follower arms.
-max_relative_target: null
-
-leader_arms:
-  main:
-    _target_: lerobot.common.robot_devices.motors.feetech.FeetechMotorsBus
-    port: /dev/tty.usbmodem58760431091
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "sts3215"]
-      shoulder_lift: [2, "sts3215"]
-      elbow_flex: [3, "sts3215"]
-      wrist_flex: [4, "sts3215"]
-      wrist_roll: [5, "sts3215"]
-      gripper: [6, "sts3215"]
-
-follower_arms:
-  main:
-    _target_: lerobot.common.robot_devices.motors.feetech.FeetechMotorsBus
-    port: /dev/tty.usbmodem58760431191
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "sts3215"]
-      shoulder_lift: [2, "sts3215"]
-      elbow_flex: [3, "sts3215"]
-      wrist_flex: [4, "sts3215"]
-      wrist_roll: [5, "sts3215"]
-      gripper: [6, "sts3215"]
-
-cameras:
-  laptop:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 0
-    fps: 30
-    width: 640
-    height: 480
-  phone:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 1
-    fps: 30
-    width: 640
-    height: 480
--- a/lerobot/configs/robot/so100.yaml
+++ b/lerobot/configs/robot/so100.yaml
@@ -1,56 +0,0 @@
-# [SO-100 robot arm](https://github.com/TheRobotStudio/SO-ARM100)
-
-# Requires installing extras packages
-# With pip: `pip install -e ".[feetech]"`
-# With poetry: `poetry install --sync --extras "feetech"`
-
-# See [tutorial](https://github.com/huggingface/lerobot/blob/main/examples/10_use_so100.md)
-
-_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot
-robot_type: so100
-calibration_dir: .cache/calibration/so100
-
-# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
-# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
-# the number of motors in your follower arms.
-max_relative_target: null
-
-leader_arms:
-  main:
-    _target_: lerobot.common.robot_devices.motors.feetech.FeetechMotorsBus
-    port: /dev/tty.usbmodem585A0077581
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "sts3215"]
-      shoulder_lift: [2, "sts3215"]
-      elbow_flex: [3, "sts3215"]
-      wrist_flex: [4, "sts3215"]
-      wrist_roll: [5, "sts3215"]
-      gripper: [6, "sts3215"]
-
-follower_arms:
-  main:
-    _target_: lerobot.common.robot_devices.motors.feetech.FeetechMotorsBus
-    port: /dev/tty.usbmodem585A0080971
-    motors:
-      # name: (index, model)
-      shoulder_pan: [1, "sts3215"]
-      shoulder_lift: [2, "sts3215"]
-      elbow_flex: [3, "sts3215"]
-      wrist_flex: [4, "sts3215"]
-      wrist_roll: [5, "sts3215"]
-      gripper: [6, "sts3215"]
-
-cameras:
-  laptop:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 0
-    fps: 30
-    width: 640
-    height: 480
-  phone:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: 1
-    fps: 30
-    width: 640
-    height: 480
--- a/lerobot/configs/robot/stretch.yaml
+++ b/lerobot/configs/robot/stretch.yaml
@@ -1,33 +0,0 @@
-# [Stretch3 from Hello Robot](https://hello-robot.com/stretch-3-product)
-
-# Requires installing extras packages
-# With pip: `pip install -e ".[stretch]"`
-# With poetry: `poetry install --sync --extras "stretch"`
-
-# See [tutorial](https://github.com/huggingface/lerobot/blob/main/examples/8_use_stretch.md)
-
-
-_target_: lerobot.common.robot_devices.robots.stretch.StretchRobot
-robot_type: stretch3
-
-cameras:
-  navigation:
-    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
-    camera_index: /dev/hello-nav-head-camera
-    fps: 10
-    width: 1280
-    height: 720
-    rotation: -90
-  head:
-    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera.init_from_name
-    name: Intel RealSense D435I
-    fps: 30
-    width: 640
-    height: 480
-    rotation: 90
-  wrist:
-    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera.init_from_name
-    name: Intel RealSense D405
-    fps: 30
-    width: 640
-    height: 480
--- a/lerobot/configs/train.py
+++ b/lerobot/configs/train.py
@@ -0,0 +1,236 @@
+import datetime as dt
+import logging
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Type
+
+import draccus
+from huggingface_hub import hf_hub_download
+from huggingface_hub.errors import HfHubHTTPError
+
+from lerobot.common import envs
+from lerobot.common.optim import OptimizerConfig
+from lerobot.common.optim.schedulers import LRSchedulerConfig
+from lerobot.common.utils.hub import HubMixin
+from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available
+from lerobot.configs import parser
+from lerobot.configs.default import DatasetConfig, EvalConfig, WandBConfig
+from lerobot.configs.policies import PreTrainedConfig
+
+TRAIN_CONFIG_NAME = "train_config.json"
+
+
+@dataclass
+class OfflineConfig:  # options for the offline part of training (see TrainPipelineConfig.offline)
+    steps: int = 100_000  # NOTE(review): presumably the number of offline training steps — confirm
+
+
+@dataclass
+class OnlineConfig:
+    """
+    The online training loop looks something like:
+
+    ```python
+    for i in range(steps):
+        do_online_rollout_and_update_online_buffer()
+        for j in range(steps_between_rollouts):
+            batch = next(dataloader_with_offline_and_online_data)
+            loss = policy(batch)
+            loss.backward()
+            optimizer.step()
+    ```
+
+    Note that the online training loop adopts most of the options from the offline loop unless specified
+    otherwise.
+    """
+
+    steps: int = 0
+    # How many episodes to collect at once when we reach the online rollout part of the training loop.
+    rollout_n_episodes: int = 1
+    # The number of environments to use in the gym.vector.VectorEnv. This ends up also being the batch size for
+    # the policy. Ideally you should set this to be an even divisor of rollout_n_episodes.
+    rollout_batch_size: int = 1
+    # How many optimization steps (forward, backward, optimizer step) to do between running rollouts.
+    steps_between_rollouts: int | None = None
+    # The proportion of online samples (vs offline samples) to include in the online training batches.
+    sampling_ratio: float = 0.5
+    # First seed to use for the online rollout environment. Seeds for subsequent rollouts are incremented by 1.
+    env_seed: int | None = None
+    # Sets the maximum number of frames that are stored in the online buffer for online training. The buffer is
+    # FIFO.
+    buffer_capacity: int | None = None
+    # The minimum number of frames to have in the online buffer before commencing online training.
+    # If buffer_seed_size > rollout_n_episodes, the rollout will be run multiple times until the
+    # seed size condition is satisfied.
+    buffer_seed_size: int = 0
+    # Whether to run the online rollouts asynchronously. This means we can run the online training steps in
+    # parallel with the rollouts. This might be advised if your GPU has the bandwidth to handle training
+    # + eval + environment rendering simultaneously.
+    do_rollout_async: bool = False
+
+    def __post_init__(self):
+        if self.steps == 0:
+            return
+
+        if self.steps_between_rollouts is None:
+            raise ValueError(
+                "'steps_between_rollouts' must be set to a positive integer, but it is currently None."
+            )
+        if self.env_seed is None:
+            raise ValueError("'env_seed' must be set to a positive integer, but it is currently None.")
+        if self.buffer_capacity is None:
+            raise ValueError("'buffer_capacity' must be set to a positive integer, but it is currently None.")
+
+
+@dataclass
+class TrainPipelineConfig(HubMixin):
+    dataset: DatasetConfig
+    env: envs.EnvConfig | None = None
+    policy: PreTrainedConfig | None = None
+    # Set `output_dir` to where you would like to save all of the run outputs. If it already exists as a
+    # directory and `resume` is false, `validate()` raises FileExistsError instead of overwriting it.
+    output_dir: Path | None = None
+    job_name: str | None = None
+    # Set `resume` to true to resume a previous run. In order for this to work, you will need to make sure
+    # `output_dir` is the directory of an existing run with at least one checkpoint in it.
+    # Note that when resuming a run, the default behavior is to use the configuration from the checkpoint,
+    # regardless of what's provided with the training command at the time of resumption.
+    resume: bool = False
+    device: str | None = None  # cuda | cpu | mps
+    # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
+    # automatic gradient scaling is used.
+    use_amp: bool = False
+    # `seed` is used for training (eg: model initialization, dataset shuffling)
+    # AND for the evaluation environments.
+    seed: int | None = 1000
+    # Number of workers for the dataloader.
+    num_workers: int = 4
+    batch_size: int = 8
+    eval_freq: int = 20_000
+    log_freq: int = 200
+    save_checkpoint: bool = True
+    # Checkpoint is saved every `save_freq` training iterations and after the last training step.
+    save_freq: int = 20_000
+    offline: OfflineConfig = field(default_factory=OfflineConfig)
+    online: OnlineConfig = field(default_factory=OnlineConfig)
+    use_policy_training_preset: bool = True
+    optimizer: OptimizerConfig | None = None
+    scheduler: LRSchedulerConfig | None = None
+    eval: EvalConfig = field(default_factory=EvalConfig)
+    wandb: WandBConfig = field(default_factory=WandBConfig)
+
+    def __post_init__(self):
+        self.checkpoint_path = None  # only set by validate() when resuming from a config_path
+
+    def validate(self):  # fills in device, policy, job_name, output_dir and optimizer/scheduler; mutates self
+        if not self.device:
+            logging.warning("No device specified, trying to infer device automatically")
+            device = auto_select_torch_device()
+            self.device = device.type
+
+        # Automatically deactivate AMP if necessary
+        if self.use_amp and not is_amp_available(self.device):
+            logging.warning(
+                f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP."
+            )
+            self.use_amp = False
+
+        # HACK: We parse again the cli args here to get the pretrained paths if there was some.
+        policy_path = parser.get_path_arg("policy")
+        if policy_path:
+            # Only load the policy config
+            cli_overrides = parser.get_cli_overrides("policy")
+            self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
+            self.policy.pretrained_path = policy_path
+        elif self.resume:
+            # The entire train config is already loaded, we just need to get the checkpoint dir
+            config_path = parser.parse_arg("config_path")
+            if not config_path:
+                raise ValueError("A config_path is expected when resuming a run.")
+            policy_path = Path(config_path).parent
+            self.policy.pretrained_path = policy_path
+            self.checkpoint_path = policy_path.parent
+
+        if not self.job_name:
+            if self.env is None:
+                self.job_name = f"{self.policy.type}"
+            else:
+                self.job_name = f"{self.env.type}_{self.policy.type}"
+
+        if not self.resume and isinstance(self.output_dir, Path) and self.output_dir.is_dir():
+            raise FileExistsError(
+                f"Output directory {self.output_dir} already exists and resume is {self.resume}. "
+                f"Please change your output directory so that {self.output_dir} is not overwritten."
+            )
+        elif not self.output_dir:
+            now = dt.datetime.now()
+            train_dir = f"{now:%Y-%m-%d}/{now:%H-%M-%S}_{self.job_name}"
+            self.output_dir = Path("outputs/train") / train_dir
+
+        if self.online.steps > 0:
+            if isinstance(self.dataset.repo_id, list):
+                raise NotImplementedError("Online training with LeRobotMultiDataset is not implemented.")
+            if self.env is None:
+                raise ValueError("An environment is required for online training")
+
+        if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
+            raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
+        elif self.use_policy_training_preset and not self.resume:
+            self.optimizer = self.policy.get_optimizer_preset()
+            self.scheduler = self.policy.get_scheduler_preset()
+
+    @classmethod
+    def __get_path_fields__(cls) -> list[str]:
+        """This enables the parser to load config from the policy using `--policy.path=local/dir`"""
+        return ["policy"]
+
+    def _save_pretrained(self, save_directory: Path) -> None:
+        with open(save_directory / TRAIN_CONFIG_NAME, "w") as f, draccus.config_type("json"):
+            draccus.dump(self, f, indent=4)
+
+    @classmethod
+    def from_pretrained(
+        cls: Type["TrainPipelineConfig"],
+        pretrained_name_or_path: str | Path,
+        *,
+        force_download: bool = False,
+        resume_download: bool | None = None,
+        proxies: dict | None = None,
+        token: str | bool | None = None,
+        cache_dir: str | Path | None = None,
+        local_files_only: bool = False,
+        revision: str | None = None,
+        **kwargs,
+    ) -> "TrainPipelineConfig":
+        model_id = str(pretrained_name_or_path)
+        config_file: str | None = None
+        if Path(model_id).is_dir():
+            if TRAIN_CONFIG_NAME in os.listdir(model_id):
+                config_file = os.path.join(model_id, TRAIN_CONFIG_NAME)
+            else:  # NOTE(review): config_file stays None, so the parse below gets no config file — confirm intended
+                print(f"{TRAIN_CONFIG_NAME} not found in {Path(model_id).resolve()}")
+        elif Path(model_id).is_file():
+            config_file = model_id
+        else:
+            try:
+                config_file = hf_hub_download(
+                    repo_id=model_id,
+                    filename=TRAIN_CONFIG_NAME,
+                    revision=revision,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    token=token,
+                    local_files_only=local_files_only,
+                )
+            except HfHubHTTPError as e:
+                raise FileNotFoundError(
+                    f"{TRAIN_CONFIG_NAME} not found on the HuggingFace Hub in {model_id}"
+                ) from e
+
+        cli_args = kwargs.pop("cli_args", [])
+        cfg = draccus.parse(cls, config_file, args=cli_args)
+
+        return cfg
--- a/lerobot/configs/types.py
+++ b/lerobot/configs/types.py
@@ -0,0 +1,28 @@
+# Note: We subclass str so that serialization is straightforward
+# https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Protocol
+
+
+class FeatureType(str, Enum):  # str mixin: each member is also a plain string equal to its name
+    STATE = "STATE"
+    VISUAL = "VISUAL"
+    ENV = "ENV"
+    ACTION = "ACTION"
+
+
+class NormalizationMode(str, Enum):  # str mixin: each member is also a plain string equal to its name
+    MIN_MAX = "MIN_MAX"
+    MEAN_STD = "MEAN_STD"
+    IDENTITY = "IDENTITY"
+
+
+class DictLike(Protocol):  # structural type: any object that supports `obj[key]`
+    def __getitem__(self, key: Any) -> Any: ...
+
+
+@dataclass
+class PolicyFeature:
+    type: FeatureType  # which FeatureType category this feature belongs to
+    shape: tuple  # NOTE(review): presumably the feature's array/tensor shape — confirm with callers