updated params

First commit of tdmpc2 taken from NHansen code
Support for converting OpenX datasets from RLDS format to LeRobotDataset (#354 )
2024-09-02 06:34:24 +00:00 · 2024-08-29 12:48:01 +00:00 · 2024-08-27 09:07:00 +02:00 · 2024-08-26 17:38:48 +02:00 · 2024-08-26 14:30:18 +01:00 · 2024-08-26 12:28:16 +01:00
650 changed files with 9308 additions and 2398 deletions
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -14,20 +14,14 @@ env:
 jobs:
  latest-cpu:
    name: CPU
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    steps:
-      - name: Cleanup disk
+      - name: Install Git LFS
        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
+          sudo apt-get update
+          sudo apt-get install git-lfs
+          git lfs install

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -55,20 +49,15 @@ jobs:

  latest-cuda:
    name: GPU
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    steps:
-      - name: Cleanup disk
+      - name: Install Git LFS
        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
+          sudo apt-get update
+          sudo apt-get install git-lfs
+          git lfs install
+
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

@@ -95,20 +84,9 @@ jobs:

  latest-cuda-dev:
    name: GPU Dev
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    steps:
-      - name: Cleanup disk
-        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

--- a/.github/workflows/nightly-tests.yml
+++ b/.github/workflows/nightly-tests.yml
@@ -16,7 +16,8 @@ jobs:
    name: CPU
    strategy:
      fail-fast: false
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    container:
      image: huggingface/lerobot-cpu:latest
      options: --shm-size "16gb"
@@ -43,7 +44,8 @@ jobs:
    name: GPU
    strategy:
      fail-fast: false
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0"
      TEST_TYPE: "single_gpu"
--- a/.github/workflows/quality.yml
+++ b/.github/workflows/quality.yml
@@ -54,3 +54,31 @@ jobs:

      - name: Poetry check
        run: poetry check
+
+
+  # Checks the project's dependency constraints with the poetry-relax plugin.
+  poetry_relax:
+    name: Poetry relax
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v3
+
+      - name: Install poetry
+        run: pipx install poetry
+
+      # poetry-relax is installed as a poetry plugin (`poetry self add`).
+      - name: Install poetry-relax
+        run: poetry self add poetry-relax
+
+      # Captures stdout and stderr of `poetry relax --check`, always prints it,
+      # and fails the step when the output contains "Proposing updates".
+      - name: Poetry relax
+        id: poetry_relax
+        run: |
+          output=$(poetry relax --check 2>&1)
+          if echo "$output" | grep -q "Proposing updates"; then
+            echo "$output"
+            echo ""
+            echo "Some dependencies have caret '^' version requirement added by poetry by default."
+            echo "Please replace them with '>='. You can do this by hand or use poetry-relax to do this."
+            exit 1
+          else
+            echo "$output"
+          fi
--- a/.github/workflows/test-docker-build.yml
+++ b/.github/workflows/test-docker-build.yml
@@ -42,26 +42,14 @@ jobs:
  build_modified_dockerfiles:
    name: Build modified Docker images
    needs: get_changed_files
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    if: ${{ needs.get_changed_files.outputs.matrix }} != ''
    strategy:
      fail-fast: false
      matrix:
        docker-file: ${{ fromJson(needs.get_changed_files.outputs.matrix) }}
    steps:
-      - name: Cleanup disk
-        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
-
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -16,3 +16,5 @@ jobs:
        fetch-depth: 0
    - name: Secret Scanning
      uses: trufflesecurity/trufflehog@main
+      with:
+        extra_args: --only-verified
--- a/.gitignore
+++ b/.gitignore
@@ -121,6 +121,7 @@ celerybeat.pid
 # Environments
 .env
 .venv
+env/
 venv/
 env.bak/
 venv.bak/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,11 +14,11 @@ repos:
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/asottile/pyupgrade
-    rev: v3.15.2
+    rev: v3.16.0
    hooks:
    -   id: pyupgrade
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.5.2
    hooks:
      - id: ruff
        args: [--fix]
@@ -31,3 +31,7 @@ repos:
        args:
          - "--check"
          - "--no-update"
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.18.4
+    hooks:
+      - id: gitleaks
--- a/24
+++ b/24
@@ -26,6 +26,7 @@ test-end-to-end:
 	${MAKE} DEVICE=$(DEVICE) test-diffusion-ete-train
 	${MAKE} DEVICE=$(DEVICE) test-diffusion-ete-eval
 	${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-train
+	${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-train-with-online
 	${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-eval
 	${MAKE} DEVICE=$(DEVICE) test-default-ete-eval
 	${MAKE} DEVICE=$(DEVICE) test-act-pusht-tutorial
@@ -113,7 +114,6 @@ test-diffusion-ete-eval:
 		env.episode_length=8 \
 		device=$(DEVICE) \

-# TODO(alexander-soare): Restore online_steps to 2 when it is reinstated.
 test-tdmpc-ete-train:
 	python lerobot/scripts/train.py \
 		policy=tdmpc \
@@ -133,6 +133,28 @@ test-tdmpc-ete-train:
 		training.image_transforms.enable=true \
 		hydra.run.dir=tests/outputs/tdmpc/

+# Runs train.py with the tdmpc_pusht_keypoints policy on pusht: 2 offline steps followed by 20 online
+# steps, with checkpoint saving disabled. Outputs go to tests/outputs/tdmpc_online/.
+test-tdmpc-ete-train-with-online:
+	python lerobot/scripts/train.py \
+		env=pusht \
+		env.gym.obs_type=environment_state_agent_pos \
+		policy=tdmpc_pusht_keypoints \
+		eval.n_episodes=1 \
+		eval.batch_size=1 \
+		env.episode_length=10 \
+		device=$(DEVICE) \
+		training.offline_steps=2 \
+		training.online_steps=20 \
+		training.save_checkpoint=false \
+		training.save_freq=10 \
+		training.batch_size=2 \
+		training.online_rollout_n_episodes=2 \
+		training.online_rollout_batch_size=2 \
+		training.online_steps_between_rollouts=10 \
+		training.online_buffer_capacity=15 \
+		eval.use_async_envs=true \
+		hydra.run.dir=tests/outputs/tdmpc_online/
+
+
 test-tdmpc-ete-eval:
 	python lerobot/scripts/eval.py \
 		-p tests/outputs/tdmpc/checkpoints/000002/pretrained_model \
--- a/README.md
+++ b/README.md
@@ -22,8 +22,22 @@

 </div>

+<h2 align="center">
+    <p><a href="https://github.com/huggingface/lerobot/blob/main/examples/7_get_started_with_real_robot.md">Hot new tutorial: Getting started with real-world robots</a></p>
+</h2>
+
+<div align="center">
+    <img src="media/tutorial/koch_v1_1_leader_follower.webp?raw=true" alt="Koch v1.1 leader and follower arms" title="Koch v1.1 leader and follower arms" width="50%">
+    <p>We just dropped an in-depth tutorial on how to build your own robot!</p>
+    <p>Teach it new skills by showing it a few moves with just a laptop.</p>
+    <p>Then watch your homemade robot act autonomously 🤯</p>
+    <p>For more info, see <a href="https://x.com/RemiCadene/status/1825455895561859185">our thread on X</a> or <a href="https://github.com/huggingface/lerobot/blob/main/examples/7_get_started_with_real_robot.md">our tutorial page</a>.</p>
+</div>
+
+<br/>
+
 <h3 align="center">
-    <p>State-of-the-art Machine Learning for real-world robotics</p>
+    <p>LeRobot: State-of-the-art AI for real-world robotics</p>
 </h3>

 ---
@@ -65,17 +79,19 @@

 Download our source code:
 ```bash
-git clone https://github.com/huggingface/lerobot.git && cd lerobot
+git clone https://github.com/huggingface/lerobot.git
+cd lerobot
 ```

 Create a virtual environment with Python 3.10 and activate it, e.g. with [`miniconda`](https://docs.anaconda.com/free/miniconda/index.html):
 ```bash
-conda create -y -n lerobot python=3.10 && conda activate lerobot
+conda create -y -n lerobot python=3.10
+conda activate lerobot
 ```

 Install 🤗 LeRobot:
 ```bash
-pip install .
+pip install -e .
 ```

 > **NOTE:** Depending on your platform, If you encounter any build errors during this step
@@ -89,7 +105,7 @@ For simulations, 🤗 LeRobot comes with gymnasium environments that can be inst

 For instance, to install 🤗 LeRobot with aloha and pusht, use:
 ```bash
-pip install ".[aloha, pusht]"
+pip install -e ".[aloha, pusht]"
 ```

 To use [Weights and Biases](https://docs.wandb.ai/quickstart) for experiment tracking, log in with
@@ -114,10 +130,12 @@ wandb login
 |   |   ├── datasets       # various datasets of human demonstrations: aloha, pusht, xarm
 |   |   ├── envs           # various sim environments: aloha, pusht, xarm
 |   |   ├── policies       # various policies: act, diffusion, tdmpc
+|   |   ├── robot_devices  # various real devices: dynamixel motors, opencv cameras, koch robots
 |   |   └── utils          # various utilities
 |   └── scripts          # contains functions to execute via command line
 |       ├── eval.py                 # load policy and evaluate it on an environment
 |       ├── train.py                # train a policy via imitation learning and/or reinforcement learning
+|       ├── control_robot.py        # teleoperate a real robot, record data, run a policy
 |       ├── push_dataset_to_hub.py  # convert your dataset into LeRobot dataset format and upload it to the Hugging Face hub
 |       └── visualize_dataset.py    # load a dataset and render its demonstrations
 ├── outputs               # contains results of scripts execution: logs, videos, model checkpoints
@@ -180,8 +198,10 @@ dataset attributes:
  │  ├ observation.images.cam_high: {'max': tensor with same number of dimensions (e.g. `(c, 1, 1)` for images, `(c,)` for states), etc.}
  │  ...
  ├ info: a dictionary of metadata on the dataset
+  │  ├ codebase_version (str): this is to keep track of the codebase version the dataset was created with
  │  ├ fps (float): frame per second the dataset is recorded/synchronized to
-  │  └ video (bool): indicates if frames are encoded in mp4 video files to save space or stored as png files
+  │  ├ video (bool): indicates if frames are encoded in mp4 video files to save space or stored as png files
+  │  └ encoding (dict): if video, this documents the main options that were used with ffmpeg to encode the videos
  ├ videos_dir (Path): where the mp4 videos or png images are stored/accessed
  └ camera_keys (list of string): the keys to access camera features in the item returned by the dataset (e.g. `["observation.images.cam_high", ...]`)
 ```
@@ -247,13 +267,20 @@ checkpoints
 │   └── training_state.pth  # optimizer/scheduler/rng state and training step
 ```

+To resume training from a checkpoint, you can add these to the `train.py` python command:
+```bash
+    hydra.run.dir=your/original/experiment/dir resume=true
+```
+
+It will load the pretrained model, optimizer and scheduler states for training. For more information please see our tutorial on training resumption [here](https://github.com/huggingface/lerobot/blob/main/examples/5_resume_training.md).
+
 To use wandb for logging training and evaluation curves, make sure you've run `wandb login` as a one-time setup step. Then, when running the training command above, enable WandB in the configuration by adding:

 ```bash
    wandb.enable=true
 ```

-A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser:
+A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser. Please also check [here](https://github.com/huggingface/lerobot/blob/main/examples/4_train_policy_with_script.md#typical-logs-and-metrics) for an explanation of some commonly used metrics in the logs.

 ![](media/wandb.png)

--- a/benchmarks/video/run_video_benchmark.py
+++ b/benchmarks/video/run_video_benchmark.py
@@ -257,10 +257,10 @@ def benchmark_encoding_decoding(
            imgs_dir=imgs_dir,
            video_path=video_path,
            fps=fps,
-            video_codec=encoding_cfg["vcodec"],
-            pixel_format=encoding_cfg["pix_fmt"],
-            group_of_pictures_size=encoding_cfg.get("g"),
-            constant_rate_factor=encoding_cfg.get("crf"),
+            vcodec=encoding_cfg["vcodec"],
+            pix_fmt=encoding_cfg["pix_fmt"],
+            g=encoding_cfg.get("g"),
+            crf=encoding_cfg.get("crf"),
            # fast_decode=encoding_cfg.get("fastdecode"),
            overwrite=True,
        )
--- a/docker/lerobot-cpu/Dockerfile
+++ b/docker/lerobot-cpu/Dockerfile
@@ -9,6 +9,7 @@ ARG DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential cmake \
    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
+    speech-dispatcher \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

 # Create virtual environment
--- a/docker/lerobot-gpu-dev/Dockerfile
+++ b/docker/lerobot-gpu-dev/Dockerfile
@@ -13,6 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    sed gawk grep curl wget zip unzip \
    tcpdump sysstat screen tmux \
    libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
+    speech-dispatcher \
    python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

--- a/docker/lerobot-gpu/Dockerfile
+++ b/docker/lerobot-gpu/Dockerfile
@@ -9,6 +9,7 @@ ARG DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential cmake \
    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
+    speech-dispatcher \
    python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

--- a/examples/2_evaluate_pretrained_policy.py
+++ b/examples/2_evaluate_pretrained_policy.py
@@ -18,8 +18,6 @@ from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
 output_directory = Path("outputs/eval/example_pusht_diffusion")
 output_directory.mkdir(parents=True, exist_ok=True)

-device = torch.device("cuda")
-
 # Download the diffusion policy for pusht environment
 pretrained_policy_path = Path(snapshot_download("lerobot/diffusion_pusht"))
 # OR uncomment the following to evaluate a policy from the local outputs/train folder.
@@ -27,6 +25,17 @@ pretrained_policy_path = Path(snapshot_download("lerobot/diffusion_pusht"))

 policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
 policy.eval()
+
+# Check if GPU is available
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print("GPU is available. Device set to:", device)
+else:
+    device = torch.device("cpu")
+    print(f"GPU is not available. Device set to: {device}. Inference will be slower than on GPU.")
+    # Decrease the number of reverse-diffusion steps (trades off a bit of quality for 10x speed)
+    policy.diffusion.num_inference_steps = 10
+
 policy.to(device)

 # Initialize evaluation environment to render two observation types:
--- a/examples/4_train_policy_with_script.md
+++ b/examples/4_train_policy_with_script.md
@@ -170,6 +170,36 @@ python lerobot/scripts/train.py --config-dir outputs/train/my_experiment/checkpo

 Note that you may still use the regular syntax for config parameter overrides (eg: by adding `training.offline_steps=200000`).

+## Typical logs and metrics
+
+When you start the training process, you will first see your full configuration being printed in the terminal. You can check it to make sure that you configured it correctly and that your config is not overridden by other files. The final configuration will also be saved with the checkpoint.
+
+After that, you will see a training log like this one:
+
+```
+INFO 2024-08-14 13:35:12 ts/train.py:192 step:0 smpl:64 ep:1 epch:0.00 loss:1.112 grdn:15.387 lr:2.0e-07 updt_s:1.738 data_s:4.774
+```
+
+or evaluation log like:
+
+```
+INFO 2024-08-14 13:38:45 ts/train.py:226 step:100 smpl:6K ep:52 epch:0.25 ∑rwrd:20.693 success:0.0% eval_s:120.266
+```
+
+These logs will also be saved in wandb if `wandb.enable` is set to `true`. Here are the meanings of some abbreviations:
+
+- `smpl`: number of samples seen during training.
+- `ep`: number of episodes seen during training. An episode contains multiple samples in a complete manipulation task.
+- `epch`: number of times all unique samples are seen (epoch).
+- `grdn`: gradient norm.
+- `∑rwrd`: compute the sum of rewards in every evaluation episode and then take an average of them.
+- `success`: average success rate of eval episodes. Reward and success are usually different except for the sparse reward setting, where reward=1 only when the task is completed successfully.
+- `eval_s`: time to evaluate the policy in the environment, in seconds.
+- `updt_s`: time to update the network parameters, in seconds.
+- `data_s`: time to load a batch of data, in seconds.
+
+Some metrics are useful for initial performance profiling. For example, if you find the current GPU utilization is low via the `nvidia-smi` command and `data_s` sometimes is too high, you may need to modify batch size or number of dataloading workers to accelerate dataloading. We also recommend [pytorch profiler](https://github.com/huggingface/lerobot?tab=readme-ov-file#improve-your-code-with-profiling) for detailed performance probing.
+
 ---

 So far we've seen how to train Diffusion Policy for PushT and ACT for ALOHA. Now, what if we want to train ACT for PushT? Well, there are aspects of the ACT configuration that are specific to the ALOHA environments, and these happen to be incompatible with PushT. Therefore, trying to run the following will almost certainly raise an exception of sorts (eg: feature dimension mismatch):
--- a/examples/7_get_started_with_real_robot.md
+++ b/examples/7_get_started_with_real_robot.md
--- a/lerobot/init.py
+++ b/lerobot/init.py
@@ -125,6 +125,57 @@ available_real_world_datasets = [
    "lerobot/aloha_static_vinh_cup_left",
    "lerobot/aloha_static_ziploc_slide",
    "lerobot/umi_cup_in_the_wild",
+    "lerobot/unitreeh1_fold_clothes",
+    "lerobot/unitreeh1_rearrange_objects",
+    "lerobot/unitreeh1_two_robot_greeting",
+    "lerobot/unitreeh1_warehouse",
+    "lerobot/nyu_rot_dataset",
+    "lerobot/utokyo_saytap",
+    "lerobot/imperialcollege_sawyer_wrist_cam",
+    "lerobot/utokyo_xarm_bimanual",
+    "lerobot/tokyo_u_lsmo",
+    "lerobot/utokyo_pr2_opening_fridge",
+    "lerobot/cmu_franka_exploration_dataset",
+    "lerobot/cmu_stretch",
+    "lerobot/asu_table_top",
+    "lerobot/utokyo_pr2_tabletop_manipulation",
+    "lerobot/utokyo_xarm_pick_and_place",
+    "lerobot/ucsd_kitchen_dataset",
+    "lerobot/austin_buds_dataset",
+    "lerobot/dlr_sara_grid_clamp",
+    "lerobot/conq_hose_manipulation",
+    "lerobot/columbia_cairlab_pusht_real",
+    "lerobot/dlr_sara_pour",
+    "lerobot/dlr_edan_shared_control",
+    "lerobot/ucsd_pick_and_place_dataset",
+    "lerobot/berkeley_cable_routing",
+    "lerobot/nyu_franka_play_dataset",
+    "lerobot/austin_sirius_dataset",
+    "lerobot/cmu_play_fusion",
+    "lerobot/berkeley_gnm_sac_son",
+    "lerobot/nyu_door_opening_surprising_effectiveness",
+    "lerobot/berkeley_fanuc_manipulation",
+    "lerobot/jaco_play",
+    "lerobot/viola",
+    "lerobot/kaist_nonprehensile",
+    "lerobot/berkeley_mvp",
+    "lerobot/uiuc_d3field",
+    "lerobot/berkeley_gnm_recon",
+    "lerobot/austin_sailor_dataset",
+    "lerobot/utaustin_mutex",
+    "lerobot/roboturk",
+    "lerobot/stanford_hydra_dataset",
+    "lerobot/berkeley_autolab_ur5",
+    "lerobot/stanford_robocook",
+    "lerobot/toto",
+    "lerobot/fmb",
+    "lerobot/droid_100",
+    "lerobot/berkeley_rpt",
+    "lerobot/stanford_kuka_multimodal_dataset",
+    "lerobot/iamlab_cmu_pickup_insert",
+    "lerobot/taco_play",
+    "lerobot/berkeley_gnm_cory_hall",
+    "lerobot/usc_cloth_sim",
 ]

 available_datasets = list(
--- a/lerobot/common/datasets/compute_stats.py
+++ b/lerobot/common/datasets/compute_stats.py
@@ -40,6 +40,10 @@ def get_stats_einops_patterns(dataset, num_workers=0):

    stats_patterns = {}
    for key, feats_type in dataset.features.items():
+        # NOTE: skip language_instruction embedding in stats computation
+        if key == "language_instruction":
+            continue
+
        # sanity check that tensors are not float64
        assert batch[key].dtype != torch.float64

--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -35,9 +35,8 @@ from lerobot.common.datasets.utils import (
 )
 from lerobot.common.datasets.video_utils import VideoFrame, load_from_videos

-# For maintainers, see lerobot/common/datasets/push_dataset_to_hub/codebase_version.md
-CODEBASE_VERSION = "v1.5"
-
+# For maintainers, see lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
+CODEBASE_VERSION = "v1.6"
 DATA_DIR = Path(os.environ["DATA_DIR"]) if "DATA_DIR" in os.environ else None


--- a/lerobot/common/datasets/online_buffer.py
+++ b/lerobot/common/datasets/online_buffer.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""An online buffer for the online training loop in train.py
+
+Note to maintainers: This duplicates some logic from LeRobotDataset and EpisodeAwareSampler. We should
+consider converging to one approach. Here we have opted to use numpy.memmap to back the data buffer. It's much
+faster than using HuggingFace Datasets as there's no conversion to an intermediate non-python object. Also it
+supports in-place slicing and mutation which is very handy for a dynamic buffer.
+"""
+
+import os
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+
+
+def _make_memmap_safe(**kwargs) -> np.memmap:
+    """Make a numpy memmap with checks on available disk space first.
+
+    Expected kwargs are: "filename", "dtype" (must be np.dtype), "mode" and "shape". They are forwarded
+    unchanged to `np.memmap`.
+
+    The disk space check only runs for write modes (mode starting with "w"), i.e. when a new file is about
+    to be created. The parent directory of "filename" must already exist.
+
+    For information on dtypes:
+    https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes-constructing
+
+    Raises:
+        RuntimeError: If the new file would take up 80% or more of the available disk space.
+    """
+    # Imported locally so this function does not depend on a new module-level import.
+    # `shutil.disk_usage` is used instead of `os.statvfs` because the latter only exists on Unix.
+    import shutil
+
+    if kwargs["mode"].startswith("w"):
+        required_space = kwargs["dtype"].itemsize * np.prod(kwargs["shape"])  # bytes
+        available_space = shutil.disk_usage(Path(kwargs["filename"]).parent).free  # bytes
+        # Keep a 20% safety margin rather than filling the disk completely.
+        if required_space >= available_space * 0.8:
+            raise RuntimeError(
+                f"You're about to take up {required_space} of {available_space} bytes available."
+            )
+    return np.memmap(**kwargs)
+
+
+class OnlineBuffer(torch.utils.data.Dataset):
+    """FIFO data buffer for the online training loop in train.py.
+
+    Follows the protocol of LeRobotDataset as much as is required to have it be used by the online training
+    loop in the same way that a LeRobotDataset would be used.
+
+    The underlying data structure will have data inserted in a circular fashion. Always insert after the
+    last index, and when you reach the end, wrap around to the start.
+
+    The data is stored in a numpy memmap.
+    """
+
+    NEXT_INDEX_KEY = "_next_index"
+    OCCUPANCY_MASK_KEY = "_occupancy_mask"
+    INDEX_KEY = "index"
+    FRAME_INDEX_KEY = "frame_index"
+    EPISODE_INDEX_KEY = "episode_index"
+    TIMESTAMP_KEY = "timestamp"
+    IS_PAD_POSTFIX = "_is_pad"
+
+    def __init__(
+        self,
+        write_dir: str | Path,
+        data_spec: dict[str, Any] | None,
+        buffer_capacity: int | None,
+        fps: float | None = None,
+        delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None,
+    ):
+        """
+        The online buffer can be provided from scratch or you can load an existing online buffer by passing
+        a `write_dir` associated with an existing buffer.
+
+        Args:
+            write_dir: Where to keep the numpy memmap files. One memmap file will be stored for each data key.
+                Note that if the files already exist, they are opened in read-write mode (used for training
+                resumption.)
+            data_spec: A mapping from data key to data specification, like {data_key: {"shape": tuple[int],
+                "dtype": np.dtype}}. This should include all the data that you wish to record into the buffer,
+                but note that "index", "frame_index", "episode_index" and "timestamp" are already accounted
+                for by this class, so you don't need to include them.
+            buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your
+                system's available disk space when choosing this.
+            fps: Same as the fps concept in LeRobot dataset. Here it needs to be provided for the
+                 delta_timestamps logic. You can pass None if you are not using delta_timestamps.
+            delta_timestamps: Same as the delta_timestamps concept in LeRobotDataset. This is internally
+                converted to dict[str, np.ndarray] for optimization purposes.
+
+        """
+        self.set_delta_timestamps(delta_timestamps)
+        self._fps = fps
+        # Tolerance in seconds used to discard loaded frames when their timestamps are not close enough from
+        # the requested frames. It is only used when `delta_timestamps` is provided.
+        # minus 1e-4 to account for possible numerical error
+        self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None
+        self._buffer_capacity = buffer_capacity
+        # Add the internally managed keys and prepend the capacity dimension to every user-provided shape.
+        # NOTE(review): `data_spec` is annotated as optional, but `_make_data_spec` iterates over it, so
+        # passing None would raise here.
+        data_spec = self._make_data_spec(data_spec, buffer_capacity)
+        Path(write_dir).mkdir(parents=True, exist_ok=True)
+        self._data = {}
+        # One memmap file per key, named after the key. An existing file is reopened in "r+" mode so that
+        # its contents are kept; otherwise a new file is created in "w+" mode.
+        for k, v in data_spec.items():
+            self._data[k] = _make_memmap_safe(
+                filename=Path(write_dir) / k,
+                dtype=v["dtype"] if v is not None else None,
+                mode="r+" if (Path(write_dir) / k).exists() else "w+",
+                shape=tuple(v["shape"]) if v is not None else None,
+            )
+
+    @property
+    def delta_timestamps(self) -> dict[str, np.ndarray] | None:
+        """The delta timestamps as stored by `set_delta_timestamps` (numpy arrays per key, or None)."""
+        return self._delta_timestamps
+
+    def set_delta_timestamps(self, value: dict[str, list[float]] | None):
+        """Store `value` as the buffer's delta_timestamps, with every entry turned into a numpy array.
+
+        Doing the conversion once here keeps it out of `__getitem__`, whose loop is much slower if the
+        arrays have to be converted on every access. Passing None clears the delta timestamps.
+        """
+        if value is None:
+            self._delta_timestamps = None
+            return
+        self._delta_timestamps = {key: np.array(deltas) for key, deltas in value.items()}
+
+    def _make_data_spec(self, data_spec: dict[str, Any], buffer_capacity: int) -> dict[str, dict[str, Any]]:
+        """Build the complete {key: {"dtype", "shape"}} specification used to create the memmaps.
+
+        The internally managed keys come first, followed by the user-provided keys with `buffer_capacity`
+        prepended to their shape.
+
+        Raises:
+            ValueError: If a user key starts with "_" or collides with an internally managed key.
+        """
+        uses_reserved_prefix = any(key.startswith("_") for key in data_spec)
+        if uses_reserved_prefix:
+            raise ValueError(
+                "data_spec keys should not start with '_'. This prefix is reserved for internal logic."
+            )
+        preset_keys = {
+            OnlineBuffer.INDEX_KEY,
+            OnlineBuffer.FRAME_INDEX_KEY,
+            OnlineBuffer.EPISODE_INDEX_KEY,
+            OnlineBuffer.TIMESTAMP_KEY,
+        }
+        intersection = set(data_spec).intersection(preset_keys)
+        if intersection:
+            raise ValueError(
+                f"data_spec should not contain any of {preset_keys} as these are handled internally. "
+                f"The provided data_spec has {intersection}."
+            )
+        per_frame = (buffer_capacity,)
+        complete_data_spec = {
+            # Pointer to the next index that should be filled when more data is added.
+            OnlineBuffer.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": ()},
+            # The memmap starts out as all-zeros, so this mask records which indices hold real data as
+            # opposed to the dummy initialization.
+            OnlineBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": per_frame},
+            OnlineBuffer.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": per_frame},
+            OnlineBuffer.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": per_frame},
+            OnlineBuffer.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": per_frame},
+            OnlineBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": per_frame},
+        }
+        complete_data_spec.update(
+            {
+                key: {"dtype": spec["dtype"], "shape": (buffer_capacity, *spec["shape"])}
+                for key, spec in data_spec.items()
+            }
+        )
+        return complete_data_spec
+
+    def add_data(self, data: dict[str, np.ndarray]):
+        """Add new data to the buffer, which could potentially mean overwriting the oldest data.
+
+        The new data should contain all the frames (in order) of any number of episodes. The indices should
+        start from 0 (note to the developer: this can easily be generalized). See the `rollout` and
+        `eval_policy` functions in `eval.py` for more information on how the data is constructed.
+
+        The incoming `index` and `episode_index` arrays are shifted so that they continue on from the last
+        frame already in the buffer. Note that this is done in place on `data`!
+
+        Args:
+            data: A mapping from every key in `self.data_keys` to an array whose first dimension is the
+                number of new frames. All arrays must have the same length.
+
+        Raises:
+            ValueError: If a data key is missing, if the arrays have different lengths, or if there are
+                more new frames than the buffer capacity.
+        """
+        if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0:
+            raise ValueError(f"Missing data keys: {missing_keys}")
+        new_data_length = len(data[self.data_keys[0]])
+        if not all(len(data[k]) == new_data_length for k in self.data_keys):
+            raise ValueError("All data items should have the same length")
+        # The insertion below wraps around the end of the buffer at most once, so the new data must fit.
+        if new_data_length > self._buffer_capacity:
+            raise ValueError(
+                f"Cannot add {new_data_length} frames to a buffer with a capacity of "
+                f"{self._buffer_capacity} frames."
+            )
+
+        # Read the pointer as a plain int. The array that stores it is only ever written to in place (see
+        # the end of this method) so that the value is kept in the backing memmap file.
+        next_index = int(self._data[OnlineBuffer.NEXT_INDEX_KEY])
+
+        # Sanity check to make sure that the new data indices start from 0.
+        assert data[OnlineBuffer.EPISODE_INDEX_KEY][0].item() == 0
+        assert data[OnlineBuffer.INDEX_KEY][0].item() == 0
+
+        # Shift the incoming indices if necessary.
+        if self.num_samples > 0:
+            last_episode_index = self._data[OnlineBuffer.EPISODE_INDEX_KEY][next_index - 1]
+            last_data_index = self._data[OnlineBuffer.INDEX_KEY][next_index - 1]
+            data[OnlineBuffer.EPISODE_INDEX_KEY] += last_episode_index + 1
+            data[OnlineBuffer.INDEX_KEY] += last_data_index + 1
+
+        # Insert the new data starting from next_index. It may be necessary to wrap around to the start.
+        n_surplus = max(0, new_data_length - (self._buffer_capacity - next_index))
+        occupancy_mask = self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]
+        if n_surplus == 0:
+            slc = slice(next_index, next_index + new_data_length)
+            for k in self.data_keys:
+                self._data[k][slc] = data[k]
+            occupancy_mask[slc] = True
+            new_next_index = next_index + new_data_length
+        else:
+            for k in self.data_keys:
+                # Fill up to the end of the buffer, then put the remaining frames at the start.
+                self._data[k][next_index:] = data[k][:-n_surplus]
+                self._data[k][:n_surplus] = data[k][-n_surplus:]
+            occupancy_mask[next_index:] = True
+            occupancy_mask[:n_surplus] = True
+            new_next_index = n_surplus
+        # Write into the existing array instead of rebinding the dictionary entry: rebinding would replace
+        # the memmap with a plain number and the pointer would never reach the file on disk.
+        self._data[OnlineBuffer.NEXT_INDEX_KEY][...] = new_next_index
+
+    @property
+    def data_keys(self) -> list[str]:
+        keys = set(self._data)
+        keys.remove(OnlineBuffer.OCCUPANCY_MASK_KEY)  # bookkeeping entry, not frame data
+        keys.remove(OnlineBuffer.NEXT_INDEX_KEY)  # bookkeeping entry, not frame data
+        return sorted(keys)  # sorted so the key order is deterministic
+
+    @property
+    def fps(self) -> float | None:
+        return self._fps  # read-only accessor for the private `_fps` attribute
+
+    @property
+    def num_episodes(self) -> int:
+        return len(  # number of distinct episode indices found in occupied slots
+            np.unique(self._data[OnlineBuffer.EPISODE_INDEX_KEY][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
+        )
+
+    @property
+    def num_samples(self) -> int:
+        return np.count_nonzero(self._data[OnlineBuffer.OCCUPANCY_MASK_KEY])  # count of occupied slots
+
+    def __len__(self):
+        return self.num_samples  # length is the occupied-frame count, not the buffer capacity
+
+    def _item_to_tensors(self, item: dict) -> dict:
+        """Return a new dict with the same keys as `item` and every value converted to a torch.Tensor."""
+        def _convert(value):
+            if isinstance(value, torch.Tensor):
+                return value
+            if isinstance(value, np.ndarray):
+                return torch.from_numpy(value)
+            return torch.tensor(value)
+
+        return {key: _convert(value) for key, value in item.items()}
+
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        """Return frame `idx` as tensors; with `delta_timestamps` set, also gather neighbouring frames."""
+        if idx >= len(self) or idx < -len(self):
+            raise IndexError
+        idx %= len(self)  # negative idx must count back from the last occupied frame, not the buffer end
+        item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")}
+
+        if self.delta_timestamps is None:
+            return self._item_to_tensors(item)
+
+        episode_index = item[OnlineBuffer.EPISODE_INDEX_KEY]
+        current_ts = item[OnlineBuffer.TIMESTAMP_KEY]
+        episode_data_indices = np.where(
+            np.bitwise_and(
+                self._data[OnlineBuffer.EPISODE_INDEX_KEY] == episode_index,
+                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY],
+            )
+        )[0]
+        episode_timestamps = self._data[OnlineBuffer.TIMESTAMP_KEY][episode_data_indices]
+
+        for data_key in self.delta_timestamps:
+            # Note: The logic in this loop is copied from `load_previous_and_future_frames`.
+            # Get timestamps used as query to retrieve data of previous/future frames.
+            query_ts = current_ts + self.delta_timestamps[data_key]
+
+            # Compute distances between each query timestamp and all timestamps of all the frames belonging to
+            # the episode.
+            dist = np.abs(query_ts[:, None] - episode_timestamps[None, :])
+            argmin_ = np.argmin(dist, axis=1)
+            min_ = dist[np.arange(dist.shape[0]), argmin_]
+
+            is_pad = min_ > self.tolerance_s
+
+            # Check violated query timestamps are all outside the episode range.
+            assert (
+                (query_ts[is_pad] < episode_timestamps[0]) | (episode_timestamps[-1] < query_ts[is_pad])
+            ).all(), (
+                f"One or several timestamps unexpectedly violate the tolerance ({min_} > {self.tolerance_s=}"
+                ") inside the episode range."
+            )
+
+            # Load frames for this data key.
+            item[data_key] = self._data[data_key][episode_data_indices[argmin_]]
+            item[f"{data_key}{OnlineBuffer.IS_PAD_POSTFIX}"] = is_pad
+
+        return self._item_to_tensors(item)
+
+    def get_data_by_key(self, key: str) -> torch.Tensor:
+        """Returns the data stored in occupied slots for a given data key as a Tensor."""
+        return torch.from_numpy(self._data[key][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
+
+
+def compute_sampler_weights(
+    offline_dataset: LeRobotDataset,
+    offline_drop_n_last_frames: int = 0,
+    online_dataset: OnlineBuffer | None = None,
+    online_sampling_ratio: float | None = None,
+    online_drop_n_last_frames: int = 0,
+) -> torch.Tensor:
+    """Compute the sampling weights for the online training dataloader in train.py.
+
+    Args:
+        offline_dataset: The LeRobotDataset used for offline pre-training.
+        offline_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode.
+        online_dataset: The OnlineBuffer used in online training.
+        online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an
+            online dataset is provided, this value must also be provided.
+        online_drop_n_last_frames: See `offline_drop_n_last_frames`. This is the same, but for the online
+            dataset.
+    Returns:
+        Tensor of weights for [offline_dataset; online_dataset], normalized to 1.
+
+    Notes to maintainers:
+        - This duplicates some logic from EpisodeAwareSampler. We should consider converging to one approach.
+        - When used with `torch.utils.data.WeightedRandomSampler`, it could completely replace
+          `EpisodeAwareSampler` as the online dataset related arguments are optional. The only missing feature
+          is the ability to turn shuffling off.
+        - Options `drop_first_n_frames` and `episode_indices_to_use` can be added easily. They were not
+          included here to avoid adding complexity.
+    """
+    if len(offline_dataset) == 0 and (online_dataset is None or len(online_dataset) == 0):
+        raise ValueError("At least one of `offline_dataset` or `online_dataset` should contain data.")
+    if (online_dataset is None) ^ (online_sampling_ratio is None):
+        raise ValueError(
+            "`online_dataset` and `online_sampling_ratio` must be provided together or not at all."
+        )
+    # Offline-only: give the offline data all the mass so `offline_drop_n_last_frames` is still honoured.
+    offline_sampling_ratio = 1 if online_sampling_ratio is None else 1 - online_sampling_ratio
+    weights = []
+
+    if len(offline_dataset) > 0:
+        offline_data_mask_indices = []
+        for start_index, end_index in zip(
+            offline_dataset.episode_data_index["from"],
+            offline_dataset.episode_data_index["to"],
+            strict=True,
+        ):
+            offline_data_mask_indices.extend(
+                range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
+            )
+        offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
+        offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
+        weights.append(
+            torch.full(
+                size=(len(offline_dataset),),
+                fill_value=offline_sampling_ratio / offline_data_mask.sum(),
+            )
+            * offline_data_mask  # zero the weight of the dropped frames
+        )
+
+    if online_dataset is not None and len(online_dataset) > 0:
+        online_data_mask_indices = []
+        episode_indices = online_dataset.get_data_by_key("episode_index")
+        for episode_idx in torch.unique(episode_indices):
+            where_episode = torch.where(episode_indices == episode_idx)
+            start_index = where_episode[0][0]
+            end_index = where_episode[0][-1] + 1
+            online_data_mask_indices.extend(
+                range(start_index.item(), end_index.item() - online_drop_n_last_frames)
+            )
+        online_data_mask = torch.zeros(len(online_dataset), dtype=torch.bool)
+        online_data_mask[torch.tensor(online_data_mask_indices)] = True
+        weights.append(
+            torch.full(
+                size=(len(online_dataset),),
+                fill_value=online_sampling_ratio / online_data_mask.sum(),
+            )
+            * online_data_mask  # zero the weight of the dropped frames
+        )
+
+    weights = torch.cat(weights)
+
+    if weights.sum() == 0:
+        weights += 1 / len(weights)  # nothing carries mass (e.g. ratio 0 on the only dataset): go uniform
+    else:
+        weights /= weights.sum()
+
+    return weights
--- a/lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
+++ b/lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
@@ -10,7 +10,8 @@ For instance, [`lerobot/pusht`](https://huggingface.co/datasets/lerobot/pusht) h
 - [v1.2](https://huggingface.co/datasets/lerobot/pusht/tree/v1.2)
 - [v1.3](https://huggingface.co/datasets/lerobot/pusht/tree/v1.3)
 - [v1.4](https://huggingface.co/datasets/lerobot/pusht/tree/v1.4)
- [v1.5](https://huggingface.co/datasets/lerobot/pusht/tree/v1.5) <-- last version
+- [v1.5](https://huggingface.co/datasets/lerobot/pusht/tree/v1.5)
+- [v1.6](https://huggingface.co/datasets/lerobot/pusht/tree/v1.6) <-- last version
 - [main](https://huggingface.co/datasets/lerobot/pusht/tree/main) <-- points to the last version

 Starting with v1.6, every dataset pushed to the hub or saved locally also have this version number in their
@@ -45,13 +46,11 @@ for repo_id in available_datasets:
    dataset_info = api.list_repo_refs(repo_id, repo_type="dataset")
    branches = [b.name for b in dataset_info.branches]
    if CODEBASE_VERSION in branches:
-        # First check if the newer version already exists.
-        print(f"Found existing branch for {repo_id}. Please contact a member of the core LeRobot team.")
-        print("Exiting early")
-        break
+        print(f"{repo_id} already @{CODEBASE_VERSION}, skipping.")
+        continue
    else:
        # Now create a branch named after the new version by branching out from "main"
        # which is expected to be the preceding version
        api.create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION, revision="main")
-        print(f"{repo_id} successfully updated")
+        print(f"{repo_id} successfully updated @{CODEBASE_VERSION}")
 ```
--- a/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
@@ -19,8 +19,8 @@ This file contains download scripts for raw datasets.
 Example of usage:
 ```
 python lerobot/common/datasets/push_dataset_to_hub/_download_raw.py \
--raw-dir data/cadene/pusht_raw \
--repo-id cadene/pusht_raw
+--raw-dir data/lerobot-raw/pusht_raw \
+--repo-id lerobot-raw/pusht_raw
 ```
 """

@@ -31,63 +31,133 @@ from pathlib import Path

 from huggingface_hub import snapshot_download

-AVAILABLE_RAW_REPO_IDS = [
-    "lerobot-raw/aloha_mobile_cabinet_raw",
-    "lerobot-raw/aloha_mobile_chair_raw",
-    "lerobot-raw/aloha_mobile_elevator_raw",
-    "lerobot-raw/aloha_mobile_shrimp_raw",
-    "lerobot-raw/aloha_mobile_wash_pan_raw",
-    "lerobot-raw/aloha_mobile_wipe_wine_raw",
-    "lerobot-raw/aloha_sim_insertion_human_raw",
-    "lerobot-raw/aloha_sim_insertion_scripted_raw",
-    "lerobot-raw/aloha_sim_transfer_cube_human_raw",
-    "lerobot-raw/aloha_sim_transfer_cube_scripted_raw",
-    "lerobot-raw/aloha_static_battery_raw",
-    "lerobot-raw/aloha_static_candy_raw",
-    "lerobot-raw/aloha_static_coffee_new_raw",
-    "lerobot-raw/aloha_static_coffee_raw",
-    "lerobot-raw/aloha_static_cups_open_raw",
-    "lerobot-raw/aloha_static_fork_pick_up_raw",
-    "lerobot-raw/aloha_static_pingpong_test_raw",
-    "lerobot-raw/aloha_static_pro_pencil_raw",
-    "lerobot-raw/aloha_static_screw_driver_raw",
-    "lerobot-raw/aloha_static_tape_raw",
-    "lerobot-raw/aloha_static_thread_velcro_raw",
-    "lerobot-raw/aloha_static_towel_raw",
-    "lerobot-raw/aloha_static_vinh_cup_left_raw",
-    "lerobot-raw/aloha_static_vinh_cup_raw",
-    "lerobot-raw/aloha_static_ziploc_slide_raw",
-    "lerobot-raw/pusht_raw",
-    "lerobot-raw/umi_cup_in_the_wild_raw",
-    "lerobot-raw/unitreeh1_fold_clothes_raw",
-    "lerobot-raw/unitreeh1_rearrange_objects_raw",
-    "lerobot-raw/unitreeh1_two_robot_greeting_raw",
-    "lerobot-raw/unitreeh1_warehouse_raw",
-    "lerobot-raw/xarm_lift_medium_raw",
-    "lerobot-raw/xarm_lift_medium_replay_raw",
-    "lerobot-raw/xarm_push_medium_raw",
-    "lerobot-raw/xarm_push_medium_replay_raw",
-]
+from lerobot.common.datasets.push_dataset_to_hub.utils import check_repo_id
+
+# {raw_repo_id: raw_format}
+AVAILABLE_RAW_REPO_IDS = {
+    "lerobot-raw/aloha_mobile_cabinet_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_chair_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_elevator_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_shrimp_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_wash_pan_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_wipe_wine_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_insertion_human_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_insertion_scripted_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_transfer_cube_human_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_transfer_cube_scripted_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_battery_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_candy_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_coffee_new_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_coffee_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_cups_open_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_fork_pick_up_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_pingpong_test_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_pro_pencil_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_screw_driver_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_tape_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_thread_velcro_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_towel_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_vinh_cup_left_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_vinh_cup_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_ziploc_slide_raw": "aloha_hdf5",
+    "lerobot-raw/umi_cup_in_the_wild_raw": "umi_zarr",
+    "lerobot-raw/pusht_raw": "pusht_zarr",
+    "lerobot-raw/unitreeh1_fold_clothes_raw": "aloha_hdf5",
+    "lerobot-raw/unitreeh1_rearrange_objects_raw": "aloha_hdf5",
+    "lerobot-raw/unitreeh1_two_robot_greeting_raw": "aloha_hdf5",
+    "lerobot-raw/unitreeh1_warehouse_raw": "aloha_hdf5",
+    "lerobot-raw/xarm_lift_medium_raw": "xarm_pkl",
+    "lerobot-raw/xarm_lift_medium_replay_raw": "xarm_pkl",
+    "lerobot-raw/xarm_push_medium_raw": "xarm_pkl",
+    "lerobot-raw/xarm_push_medium_replay_raw": "xarm_pkl",
+    "lerobot-raw/fractal20220817_data_raw": "openx_rlds.fractal20220817_data",
+    "lerobot-raw/kuka_raw": "openx_rlds.kuka",
+    "lerobot-raw/bridge_openx_raw": "openx_rlds.bridge_openx",
+    "lerobot-raw/taco_play_raw": "openx_rlds.taco_play",
+    "lerobot-raw/jaco_play_raw": "openx_rlds.jaco_play",
+    "lerobot-raw/berkeley_cable_routing_raw": "openx_rlds.berkeley_cable_routing",
+    "lerobot-raw/roboturk_raw": "openx_rlds.roboturk",
+    "lerobot-raw/nyu_door_opening_surprising_effectiveness_raw": "openx_rlds.nyu_door_opening_surprising_effectiveness",
+    "lerobot-raw/viola_raw": "openx_rlds.viola",
+    "lerobot-raw/berkeley_autolab_ur5_raw": "openx_rlds.berkeley_autolab_ur5",
+    "lerobot-raw/toto_raw": "openx_rlds.toto",
+    "lerobot-raw/language_table_raw": "openx_rlds.language_table",
+    "lerobot-raw/columbia_cairlab_pusht_real_raw": "openx_rlds.columbia_cairlab_pusht_real",
+    "lerobot-raw/stanford_kuka_multimodal_dataset_raw": "openx_rlds.stanford_kuka_multimodal_dataset",
+    "lerobot-raw/nyu_rot_dataset_raw": "openx_rlds.nyu_rot_dataset",
+    "lerobot-raw/io_ai_tech_raw": "openx_rlds.io_ai_tech",
+    "lerobot-raw/stanford_hydra_dataset_raw": "openx_rlds.stanford_hydra_dataset",
+    "lerobot-raw/austin_buds_dataset_raw": "openx_rlds.austin_buds_dataset",
+    "lerobot-raw/nyu_franka_play_dataset_raw": "openx_rlds.nyu_franka_play_dataset",
+    "lerobot-raw/maniskill_dataset_raw": "openx_rlds.maniskill_dataset",
+    "lerobot-raw/furniture_bench_dataset_raw": "openx_rlds.furniture_bench_dataset",
+    "lerobot-raw/cmu_franka_exploration_dataset_raw": "openx_rlds.cmu_franka_exploration_dataset",
+    "lerobot-raw/ucsd_kitchen_dataset_raw": "openx_rlds.ucsd_kitchen_dataset",
+    "lerobot-raw/ucsd_pick_and_place_dataset_raw": "openx_rlds.ucsd_pick_and_place_dataset",
+    "lerobot-raw/spoc_raw": "openx_rlds.spoc",
+    "lerobot-raw/austin_sailor_dataset_raw": "openx_rlds.austin_sailor_dataset",
+    "lerobot-raw/austin_sirius_dataset_raw": "openx_rlds.austin_sirius_dataset",
+    "lerobot-raw/bc_z_raw": "openx_rlds.bc_z",
+    "lerobot-raw/utokyo_pr2_opening_fridge_raw": "openx_rlds.utokyo_pr2_opening_fridge",
+    "lerobot-raw/utokyo_pr2_tabletop_manipulation_raw": "openx_rlds.utokyo_pr2_tabletop_manipulation",
+    "lerobot-raw/utokyo_xarm_pick_and_place_raw": "openx_rlds.utokyo_xarm_pick_and_place",
+    "lerobot-raw/utokyo_xarm_bimanual_raw": "openx_rlds.utokyo_xarm_bimanual",
+    "lerobot-raw/utokyo_saytap_raw": "openx_rlds.utokyo_saytap",
+    "lerobot-raw/robo_net_raw": "openx_rlds.robo_net",
+    "lerobot-raw/robo_set_raw": "openx_rlds.robo_set",
+    "lerobot-raw/berkeley_mvp_raw": "openx_rlds.berkeley_mvp",
+    "lerobot-raw/berkeley_rpt_raw": "openx_rlds.berkeley_rpt",
+    "lerobot-raw/kaist_nonprehensile_raw": "openx_rlds.kaist_nonprehensile",
+    "lerobot-raw/stanford_mask_vit_raw": "openx_rlds.stanford_mask_vit",
+    "lerobot-raw/tokyo_u_lsmo_raw": "openx_rlds.tokyo_u_lsmo",
+    "lerobot-raw/dlr_sara_pour_raw": "openx_rlds.dlr_sara_pour",
+    "lerobot-raw/dlr_sara_grid_clamp_raw": "openx_rlds.dlr_sara_grid_clamp",
+    "lerobot-raw/dlr_edan_shared_control_raw": "openx_rlds.dlr_edan_shared_control",
+    "lerobot-raw/asu_table_top_raw": "openx_rlds.asu_table_top",
+    "lerobot-raw/stanford_robocook_raw": "openx_rlds.stanford_robocook",
+    "lerobot-raw/imperialcollege_sawyer_wrist_cam_raw": "openx_rlds.imperialcollege_sawyer_wrist_cam",
+    "lerobot-raw/iamlab_cmu_pickup_insert_raw": "openx_rlds.iamlab_cmu_pickup_insert",
+    "lerobot-raw/uiuc_d3field_raw": "openx_rlds.uiuc_d3field",
+    "lerobot-raw/utaustin_mutex_raw": "openx_rlds.utaustin_mutex",
+    "lerobot-raw/berkeley_fanuc_manipulation_raw": "openx_rlds.berkeley_fanuc_manipulation",
+    "lerobot-raw/cmu_playing_with_food_raw": "openx_rlds.cmu_playing_with_food",
+    "lerobot-raw/cmu_play_fusion_raw": "openx_rlds.cmu_play_fusion",
+    "lerobot-raw/cmu_stretch_raw": "openx_rlds.cmu_stretch",
+    "lerobot-raw/berkeley_gnm_recon_raw": "openx_rlds.berkeley_gnm_recon",
+    "lerobot-raw/berkeley_gnm_cory_hall_raw": "openx_rlds.berkeley_gnm_cory_hall",
+    "lerobot-raw/berkeley_gnm_sac_son_raw": "openx_rlds.berkeley_gnm_sac_son",
+    "lerobot-raw/droid_raw": "openx_rlds.droid",
+    "lerobot-raw/droid_100_raw": "openx_rlds.droid100",
+    "lerobot-raw/fmb_raw": "openx_rlds.fmb",
+    "lerobot-raw/dobbe_raw": "openx_rlds.dobbe",
+    "lerobot-raw/usc_cloth_sim_raw": "openx_rlds.usc_cloth_sim",
+    "lerobot-raw/plex_robosuite_raw": "openx_rlds.plex_robosuite",
+    "lerobot-raw/conq_hose_manipulation_raw": "openx_rlds.conq_hose_manipulation",
+    "lerobot-raw/vima_raw": "openx_rlds.vima",
+    "lerobot-raw/robot_vqa_raw": "openx_rlds.robot_vqa",
+    "lerobot-raw/mimic_play_raw": "openx_rlds.mimic_play",
+    "lerobot-raw/tidybot_raw": "openx_rlds.tidybot",
+    "lerobot-raw/eth_agent_affordances_raw": "openx_rlds.eth_agent_affordances",
+}


 def download_raw(raw_dir: Path, repo_id: str):
-    # Check repo_id is well formated
-    if len(repo_id.split("/")) != 2:
-        raise ValueError(
-            f"`repo_id` is expected to contain a community or user id `/` the name of the dataset (e.g. 'lerobot/pusht'), but contains '{repo_id}'."
-        )
+    check_repo_id(repo_id)
    user_id, dataset_id = repo_id.split("/")

    if not dataset_id.endswith("_raw"):
        warnings.warn(
-            f"`dataset_id` ({dataset_id}) doesn't end with '_raw' (e.g. 'lerobot/pusht_raw'). Following this naming convention by renaming your repository is advised, but not mandatory.",
+            f"""`dataset_id` ({dataset_id}) doesn't end with '_raw' (e.g. 'lerobot/pusht_raw'). Following this
+             naming convention by renaming your repository is advised, but not mandatory.""",
            stacklevel=1,
        )

    # Send warning if raw_dir isn't well formated
    if raw_dir.parts[-2] != user_id or raw_dir.parts[-1] != dataset_id:
        warnings.warn(
-            f"`raw_dir` ({raw_dir}) doesn't contain a community or user id `/` the name of the dataset that match the `repo_id` (e.g. 'data/lerobot/pusht_raw'). Following this naming convention is advised, but not mandatory.",
+            f"""`raw_dir` ({raw_dir}) doesn't contain a community or user id `/` the name of the dataset that
+             match the `repo_id` (e.g. 'data/lerobot/pusht_raw'). Following this naming convention is advised,
+             but not mandatory.""",
            stacklevel=1,
        )
    raw_dir.mkdir(parents=True, exist_ok=True)
@@ -97,8 +167,9 @@ def download_raw(raw_dir: Path, repo_id: str):
    logging.info(f"Finish downloading from huggingface.co/{user_id} for {dataset_id}")


-def download_all_raw_datasets():
-    data_dir = Path("data")
+def download_all_raw_datasets(data_dir: Path | None = None):
+    if data_dir is None:
+        data_dir = Path("data")
    for repo_id in AVAILABLE_RAW_REPO_IDS:
        raw_dir = data_dir / repo_id
        download_raw(raw_dir, repo_id)
@@ -106,7 +177,8 @@ def download_all_raw_datasets():

 def main():
    parser = argparse.ArgumentParser(
-        description=f"A script to download raw datasets from Hugging Face hub to a local directory. Here is a non exhaustive list of available repositories to use in `--repo-id`: {AVAILABLE_RAW_REPO_IDS}",
+        description=f"""A script to download raw datasets from Hugging Face hub to a local directory. Here is a
+            non exhaustive list of available repositories to use in `--repo-id`: {list(AVAILABLE_RAW_REPO_IDS.keys())}""",
    )

    parser.add_argument(
@@ -119,7 +191,8 @@ def main():
        "--repo-id",
        type=str,
        required=True,
-        help="Repositery identifier on Hugging Face: a community or a user name `/` the name of the dataset (e.g. `lerobot/pusht_raw`, `cadene/aloha_sim_insertion_human_raw`).",
+        help="""Repositery identifier on Hugging Face: a community or a user name `/` the name of
+        the dataset (e.g. `lerobot/pusht_raw`, `cadene/aloha_sim_insertion_human_raw`).""",
    )
    args = parser.parse_args()
    download_raw(**vars(args))
--- a/lerobot/common/datasets/push_dataset_to_hub/_encode_datasets.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_encode_datasets.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Use this script to batch encode lerobot dataset from their raw format to LeRobotDataset and push their updated
+version to the hub. Under the hood, this script reuses 'push_dataset_to_hub.py'. It assumes that you already
+downloaded raw datasets, which you can do with the related '_download_raw.py' script.
+
+For instance, for codebase_version = 'v1.6', the following command was run, assuming raw datasets from
+lerobot-raw were downloaded in 'raw/datasets/directory':
+```bash
+python lerobot/common/datasets/push_dataset_to_hub/_encode_datasets.py \
+  --raw-dir raw/datasets/directory \
+  --raw-repo-ids lerobot-raw \
+  --local-dir push/datasets/directory \
+  --tests-data-dir tests/data \
+  --push-repo lerobot \
+  --vcodec libsvtav1 \
+  --pix-fmt yuv420p \
+  --g 2 \
+  --crf 30
+```
+"""
+
+import argparse
+from pathlib import Path
+
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
+from lerobot.common.datasets.push_dataset_to_hub._download_raw import AVAILABLE_RAW_REPO_IDS
+from lerobot.common.datasets.push_dataset_to_hub.utils import check_repo_id
+from lerobot.scripts.push_dataset_to_hub import push_dataset_to_hub
+
+
+def get_push_repo_id_from_raw(raw_repo_id: str, push_repo: str) -> str:
+    """Build the '<push_repo>/<dataset>' repo id, dropping a trailing '_raw' from the raw dataset name."""
+    raw_name = raw_repo_id.split("/")[1]
+    return f"{push_repo}/{raw_name.removesuffix('_raw')}"
+
+
+def encode_datasets(
+    raw_dir: Path,
+    raw_repo_ids: list[str],
+    push_repo: str,
+    vcodec: str,
+    pix_fmt: str,
+    g: int,
+    crf: int,
+    local_dir: Path | None = None,
+    tests_data_dir: Path | None = None,
+    raw_format: str | None = None,
+    dry_run: bool = False,
+) -> None:
+    """Convert each raw dataset under `raw_dir` and push it to `push_repo`; only print the plan if `dry_run`."""
+    if len(raw_repo_ids) == 1 and raw_repo_ids[0].lower() == "lerobot-raw":
+        raw_repo_ids_format = AVAILABLE_RAW_REPO_IDS
+    else:
+        if raw_format is None:
+            raise ValueError("`raw_format` must be provided when `raw_repo_ids` is not 'lerobot-raw'.")
+        raw_repo_ids_format = {id_: raw_format for id_ in raw_repo_ids}
+    for raw_repo_id, repo_raw_format in raw_repo_ids_format.items():
+        check_repo_id(raw_repo_id)
+        dataset_repo_id_push = get_push_repo_id_from_raw(raw_repo_id, push_repo)
+        dataset_raw_dir = raw_dir / raw_repo_id
+        dataset_dir = local_dir / dataset_repo_id_push if local_dir is not None else None
+        encoding = {
+            "vcodec": vcodec,
+            "pix_fmt": pix_fmt,
+            "g": g,
+            "crf": crf,
+        }
+
+        if not (dataset_raw_dir).is_dir():
+            raise NotADirectoryError(dataset_raw_dir)
+
+        if not dry_run:
+            push_dataset_to_hub(
+                dataset_raw_dir,
+                raw_format=repo_raw_format,
+                repo_id=dataset_repo_id_push,
+                local_dir=dataset_dir,
+                resume=True,
+                encoding=encoding,
+                tests_data_dir=tests_data_dir,
+            )
+        else:
+            print(
+                f"DRY RUN: {dataset_raw_dir}  -->  {dataset_dir}  -->  {dataset_repo_id_push}@{CODEBASE_VERSION}"
+            )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--raw-dir",
+        type=Path,
+        default=Path("data"),
+        help="Directory where raw datasets are located.",
+    )
+    parser.add_argument(
+        "--raw-repo-ids",
+        type=str,
+        nargs="*",
+        default=["lerobot-raw"],
+        help="""Raw dataset repo ids. if 'lerobot-raw', the keys from `AVAILABLE_RAW_REPO_IDS` will be
+            used and raw datasets will be fetched from the 'lerobot-raw/' repo and pushed with their
+            associated format. It is assumed that each dataset is located at `raw_dir / raw_repo_id` """,
+    )
+    parser.add_argument(
+        "--raw-format",
+        type=str,
+        default=None,
+        help="""Raw format to use for the raw repo-ids. Must be specified if --raw-repo-ids is not
+            'lerobot-raw'""",
+    )
+    parser.add_argument(
+        "--local-dir",
+        type=Path,
+        default=None,
+        help="""When provided, writes the dataset converted to LeRobotDataset format in this directory
+        (e.g. `data/lerobot/aloha_mobile_chair`).""",
+    )
+    parser.add_argument(
+        "--push-repo",
+        type=str,
+        default="lerobot",
+        help="Repo to upload datasets to",
+    )
+    parser.add_argument(
+        "--vcodec",
+        type=str,
+        default="libsvtav1",
+        help="Codec to use for encoding videos",
+    )
+    parser.add_argument(
+        "--pix-fmt",
+        type=str,
+        default="yuv420p",
+        help="Pixel formats (chroma subsampling) to be used for encoding",
+    )
+    parser.add_argument(
+        "--g",
+        type=int,
+        default=2,
+        help="Group of pictures sizes to be used for encoding.",
+    )
+    parser.add_argument(
+        "--crf",
+        type=int,
+        default=30,
+        help="Constant rate factors to be used for encoding.",
+    )
+    parser.add_argument(
+        "--tests-data-dir",
+        type=Path,
+        default=None,
+        help=(
+            "When provided, save tests artifacts into the given directory "
+            "(e.g. `--tests-data-dir tests/data` will save to tests/data/{--repo-id})."
+        ),
+    )
+    parser.add_argument(
+        "--dry-run",
+        type=int,  # integer flag rather than store_true; any non-zero value enables the dry run
+        default=0,
+        help="If not set to 0, this script won't download or upload anything.",
+    )
+    args = parser.parse_args()
+    encode_datasets(**vars(args))  # argparse dest names (dashes -> underscores) match the parameter names
+
+
+if __name__ == "__main__":
+    main()
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
@@ -29,7 +29,11 @@ from datasets import Dataset, Features, Image, Sequence, Value
 from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -72,7 +76,14 @@ def check_format(raw_dir) -> bool:
                    assert c < h and c < w, f"Expect (h,w,c) image format but ({h=},{w=},{c=}) provided."


-def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episodes: list[int] | None = None):
+def load_from_raw(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int,
+    video: bool,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+):
    # only frames from simulation are uncompressed
    compressed_images = "sim" not in raw_dir.name

@@ -123,7 +134,7 @@ def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episod
                    # encode images to a mp4 video
                    fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
                    video_path = videos_dir / fname
-                    encode_video_frames(tmp_imgs_dir, video_path, fps)
+                    encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

                    # clean temporary images directory
                    shutil.rmtree(tmp_imgs_dir)
@@ -200,6 +211,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -207,7 +219,7 @@ def from_raw_to_lerobot_format(
    if fps is None:
        fps = 50

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding)
    hf_dataset = to_hf_dataset(data_dict, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -215,4 +227,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
@@ -81,8 +81,9 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
-    if video or episodes is not None:
+    if video or episodes is not None or encoding is not None:
        # TODO(aliberts): support this
        raise NotImplementedError

--- a/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py
@@ -18,6 +18,7 @@ Contains utilities to process raw data format from dora-record
 """

 import re
+import warnings
 from pathlib import Path

 import pandas as pd
@@ -199,6 +200,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -211,6 +213,12 @@ def from_raw_to_lerobot_format(
    if not video:
        raise NotImplementedError()

+    if encoding is not None:
+        warnings.warn(
+            "Video encoding is currently done outside of LeRobot for the dora_parquet format.",
+            stacklevel=1,
+        )
+
    data_df = load_from_raw(raw_dir, videos_dir, fps, episodes)
    hf_dataset = to_hf_dataset(data_df, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
@@ -219,4 +227,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = "unknown"
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml
+++ b/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml
@@ -0,0 +1,640 @@
+OPENX_DATASET_CONFIGS:
+  fractal20220817_data:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - base_pose_tool_reached
+      - gripper_closed
+    fps: 3
+  
+  kuka:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - clip_function_input/base_pose_tool_reached
+      - gripper_closed
+    fps: 10
+  
+  bridge_openx:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - EEF_state
+      - gripper_state
+    fps: 5
+  
+  taco_play:
+    image_obs_keys:
+      - rgb_static
+      - rgb_gripper
+    depth_obs_keys:
+      - depth_static
+      - depth_gripper
+    state_obs_keys:
+      - state_eef
+      - state_gripper
+    fps: 15
+  
+  jaco_play:
+    image_obs_keys:
+      - image
+      - image_wrist
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state_eef
+      - state_gripper
+    fps: 10
+  
+  berkeley_cable_routing:
+    image_obs_keys:
+      - image
+      - top_image
+      - wrist45_image
+      - wrist225_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - robot_state
+    fps: 10
+
+  roboturk:
+    image_obs_keys:
+      - front_rgb
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - null
+    fps: 10
+  
+  nyu_door_opening_surprising_effectiveness:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - null
+    fps: 3
+
+  viola:
+    image_obs_keys:
+      - agentview_rgb
+      - eye_in_hand_rgb
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - joint_states
+      - gripper_states
+    fps: 20
+
+  berkeley_autolab_ur5:
+    image_obs_keys:
+      - image
+      - hand_image
+    depth_obs_keys:
+      - image_with_depth
+    state_obs_keys:
+      - state
+    fps: 5
+
+  toto:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 30
+
+  language_table:
+    image_obs_keys:
+      - rgb
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - effector_translation
+    fps: 10
+
+  columbia_cairlab_pusht_real:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - robot_state
+    fps: 10
+
+  stanford_kuka_multimodal_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - depth_image
+    state_obs_keys:
+      - ee_position
+      - ee_orientation
+    fps: 20
+
+  nyu_rot_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 3
+
+  io_ai_tech:
+    image_obs_keys:
+      - image
+      - image_fisheye
+      - image_left_side
+      - image_right_side
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 3
+
+  stanford_hydra_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 10
+
+  austin_buds_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 20
+
+  nyu_franka_play_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - image_additional_view
+    depth_obs_keys:
+      - depth
+      - depth_additional_view
+    state_obs_keys:
+      - eef_state
+    fps: 3
+
+  maniskill_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - depth
+      - wrist_depth
+    state_obs_keys:
+      - tcp_pose
+      - gripper_state
+    fps: 20
+
+  furniture_bench_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 10
+
+  cmu_franka_exploration_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - highres_image
+    depth_obs_keys:
+      - null  
+    state_obs_keys:
+      - null
+    fps: 10
+
+  ucsd_kitchen_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - joint_state
+    fps: 2
+  
+  ucsd_pick_and_place_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 3
+  
+  spoc:
+    image_obs_keys:
+      - image
+      - image_manipulation
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - null
+    fps: 3
+  
+  austin_sailor_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 20
+  
+  austin_sirius_dataset_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 20
+  
+  bc_z:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - present/xyz
+      - present/axis_angle
+      - present/sensed_close
+    fps: 10
+  
+  utokyo_pr2_opening_fridge_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 10
+  
+  utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 10
+  
+  utokyo_xarm_pick_and_place_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - image2
+      - hand_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - end_effector_pose
+    fps: 10
+  
+  utokyo_xarm_bimanual_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - pose_r
+    fps: 10
+  
+  robo_net:
+    image_obs_keys:
+      - image
+      - image1
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 1
+  
+  robo_set:
+    image_obs_keys:
+      - image_left
+      - image_right
+      - image_wrist
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+      - state_velocity
+    fps: 5
+  
+  berkeley_mvp_converted_externally_to_rlds:
+    image_obs_keys:
+      - hand_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - gripper
+      - pose
+      - joint_pos
+    fps: 5
+  
+  berkeley_rpt_converted_externally_to_rlds:
+    image_obs_keys:
+      - hand_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - joint_pos
+      - gripper
+    fps: 30
+  
+  kaist_nonprehensile_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 10
+  
+  # NOTE(review): this is the only entry in this file without an `fps` key. Confirm that
+  # the consumer tolerates a missing `fps`, or add this dataset's control frequency here.
+  stanford_mask_vit_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+  
+  tokyo_u_lsmo_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 10
+  
+  dlr_sara_pour_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 10
+  
+  dlr_sara_grid_clamp_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state  
+    fps: 10
+  
+  dlr_edan_shared_control_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 5
+  
+  asu_table_top_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 12.5
+
+  stanford_robocook_converted_externally_to_rlds:
+    image_obs_keys:
+      - image_1
+      - image_2
+    depth_obs_keys:
+      - depth_1
+      - depth_2
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 5
+
+  imperialcollege_sawyer_wrist_cam:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 10
+
+  iamlab_cmu_pickup_insert_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - joint_state
+      - gripper_state
+    fps: 20
+
+  uiuc_d3field:
+    image_obs_keys:
+      - image_1
+      - image_2
+    depth_obs_keys:
+      - depth_1
+      - depth_2
+    state_obs_keys:
+      - null
+    fps: 1
+  
+  utaustin_mutex:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 20
+  
+  berkeley_fanuc_manipulation:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - joint_state
+      - gripper_state
+    fps: 10
+  
+  cmu_playing_with_food:
+    image_obs_keys:
+      - image
+      - finger_vision_1
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 10
+  
+  cmu_play_fusion:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 5
+  
+  cmu_stretch:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - eef_state
+      - gripper_state
+    fps: 10
+  
+  berkeley_gnm_recon:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+      - position
+      - yaw
+    fps: 3
+ 
+  berkeley_gnm_cory_hall:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+      - position
+      - yaw
+    fps: 5
+ 
+  berkeley_gnm_sac_son:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+      - position
+      - yaw
+    fps: 10
+  
+  droid:
+    image_obs_keys:
+      - exterior_image_1_left
+      - exterior_image_2_left
+      - wrist_image_left
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - proprio
+    fps: 15
+  
+  droid_100:
+    image_obs_keys:
+      - exterior_image_1_left
+      - exterior_image_2_left
+      - wrist_image_left
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - proprio
+    fps: 15
+  
+  fmb:
+    image_obs_keys:
+      - image_side_1
+      - image_side_2
+      - image_wrist_1
+      - image_wrist_2
+    depth_obs_keys:
+      - image_side_1_depth
+      - image_side_2_depth
+      - image_wrist_1_depth
+      - image_wrist_2_depth
+    state_obs_keys:
+      - proprio
+    fps: 10
+  
+  dobbe:
+    image_obs_keys:
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - proprio
+    fps: 3.75
+  
+  usc_cloth_sim_converted_externally_to_rlds:
+    image_obs_keys:
+      - image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - null
+    fps: 10
+  
+  plex_robosuite:
+    image_obs_keys:
+      - image
+      - wrist_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 20
+  
+  conq_hose_manipulation:
+    image_obs_keys:
+      - frontleft_fisheye_image
+      - frontright_fisheye_image
+      - hand_color_image
+    depth_obs_keys:
+      - null
+    state_obs_keys:
+      - state
+    fps: 30
+  
--- a/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+NOTE(YL): Adapted from:
+    Octo: https://github.com/octo-models/octo/blob/main/octo/data/utils/data_utils.py
+
+data_utils.py
+
+Additional utils for data processing.
+"""
+
+from typing import Any, Dict, List
+
+import tensorflow as tf
+
+
+def binarize_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
+    """
+    Converts gripper actions from continuous to binary values (0 and 1).
+
+    We exploit the fact that most of the time, the gripper is fully open (near 1.0) or fully closed (near 0.0). As it
+    transitions between the two, it sometimes passes through a few intermediate values. We relabel those intermediate
+    values based on the state that is reached _after_ those intermediate values.
+
+    In the edge case that the trajectory ends with an intermediate value, we give up on binarizing and relabel that
+    chunk of intermediate values as the last action in the trajectory.
+
+    The `scan_fn` implements the following logic:
+        new_actions = np.empty_like(actions)
+        carry = actions[-1]
+        for i in reversed(range(actions.shape[0])):
+            if in_between_mask[i]:
+                carry = carry
+            else:
+                carry = float(open_mask[i])
+            new_actions[i] = carry
+
+    Args:
+        actions: gripper actions over one trajectory (assumed 1-D along time, since each step is
+            used as a scalar `tf.cond` predicate -- TODO confirm against callers).
+
+    Returns:
+        Tensor with one value per input step, as produced by the reverse `tf.scan`.
+    """
+    # Values above 0.95 count as open, below 0.05 as closed; everything else is "in between".
+    open_mask, closed_mask = actions > 0.95, actions < 0.05
+    in_between_mask = tf.logical_not(tf.logical_or(open_mask, closed_mask))
+    is_open_float = tf.cast(open_mask, tf.float32)
+
+    def scan_fn(carry, i):
+        # Keep the carried value for in-between steps; otherwise emit 1.0 (open) or 0.0 (closed).
+        return tf.cond(in_between_mask[i], lambda: tf.cast(carry, tf.float32), lambda: is_open_float[i])
+
+    # Walk the trajectory backwards, seeded with the final action.
+    return tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), actions[-1], reverse=True)
+
+
+def invert_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
+    """Flips gripper actions by returning `1 - actions` element-wise."""
+    flipped = 1 - actions
+    return flipped
+
+
+def rel2abs_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
+    """
+    Converts relative gripper actions (+1 for closing, -1 for opening) to absolute actions (0 = closed; 1 = open).
+
+    Assumes that the first relative gripper is not redundant (i.e. close when already closed)!
+
+    Inputs within [-0.1, 0.1] are treated as "no change" and inherit the previous absolute state.
+    """
+    # Note =>> -1 for closing, 1 for opening, 0 for no change
+    opening_mask, closing_mask = actions < -0.1, actions > 0.1
+    thresholded_actions = tf.where(opening_mask, 1, tf.where(closing_mask, -1, 0))
+
+    def scan_fn(carry, i):
+        # A zero entry means "no change", so the previous state is carried forward.
+        return tf.cond(thresholded_actions[i] == 0, lambda: carry, lambda: thresholded_actions[i])
+
+    # If no relative grasp, assumes open for whole trajectory
+    # The initial state is the opposite of the first non-zero command. When no command is
+    # non-zero, the looked-up value is 0 and `start` falls back to 1 (open).
+    start = -1 * thresholded_actions[tf.argmax(thresholded_actions != 0, axis=0)]
+    start = tf.cond(start == 0, lambda: 1, lambda: start)
+
+    # Note =>> -1 for closed, 1 for open
+    new_actions = tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), start)
+    # Map {-1, 1} onto {0.0, 1.0}.
+    new_actions = tf.cast(new_actions, tf.float32) / 2 + 0.5
+
+    return new_actions
+
+
+# === Bridge-V2 =>> Dataset-Specific Transform ===
+def relabel_bridge_actions(traj: Dict[str, Any]) -> Dict[str, Any]:
+    """Relabels actions to use reached proprioceptive state; discards last timestep (no-action).
+
+    The first 6 action dimensions become the difference between consecutive states (first 6
+    state dimensions); the last action dimension is kept from the original action.
+    """
+    movement_actions = traj["observation"]["state"][1:, :6] - traj["observation"]["state"][:-1, :6]
+    # Drop the final timestep from every entry so all tensors line up with `movement_actions`.
+    traj_truncated = tf.nest.map_structure(lambda x: x[:-1], traj)
+    traj_truncated["action"] = tf.concat([movement_actions, traj["action"][:-1, -1:]], axis=1)
+
+    return traj_truncated
+
+
+# === RLDS Dataset Initialization Utilities ===
+def pprint_data_mixture(dataset_kwargs_list: List[Dict[str, Any]], dataset_weights: List[int]) -> None:
+    """Prints a boxed summary listing each dataset name next to its sampling weight.
+
+    Datasets and weights are paired positionally; extra items in the longer list are ignored.
+    """
+    print("\n######################################################################################")
+    print(f"# Loading the following {len(dataset_kwargs_list)} datasets (incl. sampling weight):{'': >24} #")
+    for kwargs, w in zip(dataset_kwargs_list, dataset_weights, strict=False):
+        name = kwargs["name"]
+        # Right-align the weight with '=' fill so that name plus weight always span 80 columns.
+        width = 80 - len(name)
+        print(f"# {name}: {w:=>{width}f} #")
+    print("######################################################################################\n")
--- a/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+NOTE(YL): Adapted from:
+    OpenVLA: https://github.com/openvla/openvla
+
+Episode transforms for DROID dataset.
+"""
+
+from typing import Any, Dict
+
+import tensorflow as tf
+import tensorflow_graphics.geometry.transformation as tfg
+
+
+def rmat_to_euler(rot_mat):
+    """Thin wrapper around `tfg.euler.from_rotation_matrix`."""
+    euler_angles = tfg.euler.from_rotation_matrix(rot_mat)
+    return euler_angles
+
+
+def euler_to_rmat(euler):
+    """Thin wrapper around `tfg.rotation_matrix_3d.from_euler`."""
+    rot_mat = tfg.rotation_matrix_3d.from_euler(euler)
+    return rot_mat
+
+
+def invert_rmat(rot_mat):
+    """Thin wrapper around `tfg.rotation_matrix_3d.inverse`."""
+    inverse = tfg.rotation_matrix_3d.inverse(rot_mat)
+    return inverse
+
+
+def rotmat_to_rot6d(mat):
+    """
+    Builds the R6 rotation representation by concatenating the first two rows of a rotation matrix.
+    Args:
+        mat: rotation matrix (rows on the second-to-last axis)
+
+    Returns: 6d vector (row 0 followed by row 1, joined along the last axis)
+
+    """
+    first_row = mat[..., 0, :]
+    second_row = mat[..., 1, :]
+    return tf.concat([first_row, second_row], axis=-1)
+
+
+def velocity_act_to_wrist_frame(velocity, wrist_in_robot_frame):
+    """
+    Translates velocity actions (translation + rotation) from base frame of the robot to wrist frame.
+    Args:
+        velocity: 6d velocity action (3 x translation, 3 x rotation)
+        wrist_in_robot_frame: 6d pose of the end-effector in robot base frame
+
+    Returns: 9d velocity action in robot wrist frame (3 x translation, 6 x rotation as R6)
+
+    """
+    # Rotation of the wrist in the base frame, built from the Euler part (columns 3:6) of the pose.
+    r_frame = euler_to_rmat(wrist_in_robot_frame[:, 3:6])
+    r_frame_inv = invert_rmat(r_frame)
+
+    # world to wrist: dT_pi = R^-1 dT_rbt
+    # `[..., None]` turns the translation into a column vector for the matmul; `[..., 0]` drops that axis again.
+    vel_t = (r_frame_inv @ velocity[:, :3][..., None])[..., 0]
+
+    # world to wrist: dR_pi = R^-1 dR_rbt R
+    dr_ = euler_to_rmat(velocity[:, 3:6])
+    dr_ = r_frame_inv @ (dr_ @ r_frame)
+    dr_r6 = rotmat_to_rot6d(dr_)
+    return tf.concat([vel_t, dr_r6], axis=-1)
+
+
+def rand_swap_exterior_images(img1, img2):
+    """
+    Randomly swaps the two exterior images (for training with single exterior input).
+
+    Returns `(img1, img2)` when a uniform sample exceeds 0.5, otherwise `(img2, img1)`.
+    """
+    return tf.cond(tf.random.uniform(shape=[]) > 0.5, lambda: (img1, img2), lambda: (img2, img1))
+
+
+def droid_baseact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    DROID dataset transformation for actions expressed in *base* frame of the robot.
+
+    Mutates and returns `trajectory`, writing:
+        - "action": cartesian velocity (columns 0:3 and 3:6) followed by `1 - gripper_position`.
+        - the two exterior images, randomly swapped.
+        - observation "proprio": cartesian position followed by gripper position.
+    """
+    dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
+    dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
+
+    trajectory["action"] = tf.concat(
+        (
+            dt,
+            dr_,
+            # gripper command is inverted here
+            1 - trajectory["action_dict"]["gripper_position"],
+        ),
+        axis=-1,
+    )
+    trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
+        rand_swap_exterior_images(
+            trajectory["observation"]["exterior_image_1_left"],
+            trajectory["observation"]["exterior_image_2_left"],
+        )
+    )
+    trajectory["observation"]["proprio"] = tf.concat(
+        (
+            trajectory["observation"]["cartesian_position"],
+            trajectory["observation"]["gripper_position"],
+        ),
+        axis=-1,
+    )
+    return trajectory
+
+
+def droid_wristact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    DROID dataset transformation for actions expressed in *wrist* frame of the robot.
+
+    Mutates and returns `trajectory`, writing:
+        - "action": wrist-frame velocity from `velocity_act_to_wrist_frame` followed by the gripper position.
+        - the two exterior images, randomly swapped.
+        - observation "proprio": cartesian position followed by gripper position.
+    """
+    wrist_act = velocity_act_to_wrist_frame(
+        trajectory["action_dict"]["cartesian_velocity"], trajectory["observation"]["cartesian_position"]
+    )
+    trajectory["action"] = tf.concat(
+        (
+            wrist_act,
+            # NOTE(review): gripper position is used as-is here, whereas droid_baseact_transform
+            # uses `1 - gripper_position` -- confirm the difference is intended.
+            trajectory["action_dict"]["gripper_position"],
+        ),
+        axis=-1,
+    )
+    trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
+        rand_swap_exterior_images(
+            trajectory["observation"]["exterior_image_1_left"],
+            trajectory["observation"]["exterior_image_2_left"],
+        )
+    )
+    trajectory["observation"]["proprio"] = tf.concat(
+        (
+            trajectory["observation"]["cartesian_position"],
+            trajectory["observation"]["gripper_position"],
+        ),
+        axis=-1,
+    )
+    return trajectory
+
+
+def droid_finetuning_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    DROID dataset transformation for actions expressed in *base* frame of the robot.
+
+    Builds the same "action" and "proprio" entries as `droid_baseact_transform`, but leaves the
+    exterior images untouched (no random swap). Mutates and returns `trajectory`.
+    """
+    dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
+    dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
+    trajectory["action"] = tf.concat(
+        (
+            dt,
+            dr_,
+            # gripper command is inverted here
+            1 - trajectory["action_dict"]["gripper_position"],
+        ),
+        axis=-1,
+    )
+    trajectory["observation"]["proprio"] = tf.concat(
+        (
+            trajectory["observation"]["cartesian_position"],
+            trajectory["observation"]["gripper_position"],
+        ),
+        axis=-1,
+    )
+    return trajectory
+
+
+def zero_action_filter(traj: Dict) -> bool:
+    """
+    Filters transitions whose actions are all-0 (only relative actions, no gripper action).
+    Note: this filter is applied *after* action normalization, so need to compare to "normalized 0".
+
+    Returns True when any of the first 6 action dimensions, at any step, differs from the
+    normalized zero action by more than 1e-5.
+    """
+    # Per-dimension lower bounds used by the normalization (presumably the 1st-percentile
+    # action statistics of DROID -- TODO confirm the source of these constants).
+    droid_q01 = tf.convert_to_tensor(
+        [
+            -0.7776297926902771,
+            -0.5803514122962952,
+            -0.5795090794563293,
+            -0.6464047729969025,
+            -0.7041108310222626,
+            -0.8895104378461838,
+        ]
+    )
+    # Matching per-dimension upper bounds (presumably the 99th percentile -- TODO confirm).
+    droid_q99 = tf.convert_to_tensor(
+        [
+            0.7597932070493698,
+            0.5726242214441299,
+            0.7351000607013702,
+            0.6705610305070877,
+            0.6464948207139969,
+            0.8897542208433151,
+        ]
+    )
+    # Where a raw all-zero action lands after the bounds-based rescaling: 2 * (x - q01) / (q99 - q01) - 1.
+    droid_norm_0_act = (
+        2 * (tf.zeros_like(traj["action"][:, :6]) - droid_q01) / (droid_q99 - droid_q01 + 1e-8) - 1
+    )
+
+    return tf.reduce_any(tf.math.abs(traj["action"][:, :6] - droid_norm_0_act) > 1e-5)
--- a/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py
@@ -0,0 +1,859 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+NOTE(YL): Adapted from:
+    OpenVLA: https://github.com/openvla/openvla
+    Octo: https://github.com/octo-models/octo
+
+transforms.py
+
+Defines a registry of per-dataset standardization transforms for each dataset in Open-X Embodiment.
+
+Transforms adopt the following structure:
+    Input: Dictionary of *batched* features (i.e., has leading time dimension)
+    Output: Dictionary `step` =>> {
+        "observation": {
+            <image_keys, depth_image_keys>
+            State (in chosen state representation)
+        },
+        "action": Action (in chosen action representation),
+        "language_instruction": str
+    }
+"""
+
+from typing import Any, Dict
+
+import tensorflow as tf
+
+from lerobot.common.datasets.push_dataset_to_hub.openx.data_utils import (
+    binarize_gripper_actions,
+    invert_gripper_actions,
+    rel2abs_gripper_actions,
+    relabel_bridge_actions,
+)
+
+
+def droid_baseact_transform_fn():
+    """Returns `droid_baseact_transform`, importing `droid_utils` only when this function is called."""
+    from lerobot.common.datasets.push_dataset_to_hub.openx import droid_utils
+
+    return droid_utils.droid_baseact_transform
+
+
+def bridge_openx_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Applies to version of Bridge V2 in Open X-Embodiment mixture.
+
+    Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
+
+    The action is flattened to world_vector + rotation_delta + open_gripper, relabeled via
+    `relabel_bridge_actions`, and the state is split into "EEF_state" and "gripper_state".
+    """
+    # Drop the first timestep everywhere except "traj_metadata"; "observation" and "action" are
+    # dicts here, so their entries are sliced one by one.
+    for key in trajectory:
+        if key == "traj_metadata":
+            continue
+        elif key in ["observation", "action"]:
+            for key2 in trajectory[key]:
+                trajectory[key][key2] = trajectory[key][key2][1:]
+        else:
+            trajectory[key] = trajectory[key][1:]
+
+    trajectory["action"] = tf.concat(
+        (
+            trajectory["action"]["world_vector"],
+            trajectory["action"]["rotation_delta"],
+            # add a trailing axis so the gripper flag can be concatenated as one column
+            tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
+        ),
+        axis=-1,
+    )
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    trajectory = relabel_bridge_actions(trajectory)
+    # First 6 state dims -> end-effector state, last dim -> gripper state.
+    trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
+    trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
+    return trajectory
+
+
+def bridge_orig_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Applies to original version of Bridge V2 from the official project website.
+
+    Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
+
+    The last action dimension is binarized, actions are relabeled via `relabel_bridge_actions`,
+    and the state is split into "EEF_state" and "gripper_state".
+    """
+    # Drop the first timestep everywhere except "traj_metadata"; only "observation" is treated
+    # as a dict here, "action" is sliced directly.
+    for key in trajectory:
+        if key == "traj_metadata":
+            continue
+        elif key == "observation":
+            for key2 in trajectory[key]:
+                trajectory[key][key2] = trajectory[key][key2][1:]
+        else:
+            trajectory[key] = trajectory[key][1:]
+
+    trajectory["action"] = tf.concat(
+        [
+            trajectory["action"][:, :6],
+            binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
+        ],
+        axis=1,
+    )
+    trajectory = relabel_bridge_actions(trajectory)
+    # First 6 state dims -> end-effector state, last dim -> gripper state.
+    trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
+    trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
+    return trajectory
+
+
+def ppgm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Keeps the first 6 action dimensions and binarizes the last one, then exposes
+    `cartesian_position[:, :6]` as "EEF_state" and `gripper_position[:, -1:]` as "gripper_state".
+    Mutates and returns `trajectory`.
+    """
+    trajectory["action"] = tf.concat(
+        [
+            trajectory["action"][:, :6],
+            binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
+        ],
+        axis=1,
+    )
+    trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
+    trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
+    return trajectory
+
+
+def rt1_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Flattens the action dict into world_vector + rotation_delta + absolute gripper action and
+    copies the natural-language instruction to "language_instruction". Mutates and returns `trajectory`.
+    """
+    # make gripper action absolute action, +1 = open, 0 = close
+    gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
+    gripper_action = rel2abs_gripper_actions(gripper_action)
+
+    trajectory["action"] = tf.concat(
+        (
+            trajectory["action"]["world_vector"],
+            trajectory["action"]["rotation_delta"],
+            gripper_action[:, None],
+        ),
+        axis=-1,
+    )
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def kuka_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Flattens the action dict into world_vector + rotation_delta + absolute gripper action, decodes
+    the ZLIB-compressed state observations into float32 tensors, and copies the natural-language
+    instruction to "language_instruction". Mutates and returns `trajectory`.
+    """
+    # make gripper action absolute action, +1 = open, 0 = close
+    gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
+    gripper_action = rel2abs_gripper_actions(gripper_action)
+
+    trajectory["action"] = tf.concat(
+        (
+            trajectory["action"]["world_vector"],
+            trajectory["action"]["rotation_delta"],
+            gripper_action[:, None],
+        ),
+        axis=-1,
+    )
+    # decode compressed state
+    eef_value = tf.io.decode_compressed(
+        trajectory["observation"]["clip_function_input/base_pose_tool_reached"],
+        compression_type="ZLIB",
+    )
+    eef_value = tf.io.decode_raw(eef_value, tf.float32)
+    # reshaped to 7 values per row
+    trajectory["observation"]["clip_function_input/base_pose_tool_reached"] = tf.reshape(eef_value, (-1, 7))
+    gripper_value = tf.io.decode_compressed(
+        trajectory["observation"]["gripper_closed"], compression_type="ZLIB"
+    )
+    gripper_value = tf.io.decode_raw(gripper_value, tf.float32)
+    # reshaped to 1 value per row
+    trajectory["observation"]["gripper_closed"] = tf.reshape(gripper_value, (-1, 1))
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def taco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Splits `robot_obs` into "state_eef" (columns 0:6) and "state_gripper" (column 7), uses
+    `rel_actions_world` as the action with its last dimension clipped to [0, 1], and copies the
+    natural-language instruction to "language_instruction". Mutates and returns `trajectory`.
+    """
+    trajectory["observation"]["state_eef"] = trajectory["observation"]["robot_obs"][:, :6]
+    trajectory["observation"]["state_gripper"] = trajectory["observation"]["robot_obs"][:, 7:8]
+    trajectory["action"] = trajectory["action"]["rel_actions_world"]
+
+    # clip gripper action to [0, 1], +1 = open, 0 = close
+    # NOTE(review): the original comment said "invert gripper action + clip", but only a clip
+    # is applied here -- confirm whether an inversion is missing.
+    trajectory["action"] = tf.concat(
+        (
+            trajectory["action"][:, :6],
+            tf.clip_by_value(trajectory["action"][:, -1:], 0, 1),
+        ),
+        axis=-1,
+    )
+
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def jaco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Derive state features from `end_effector_cartesian_pos` and flatten the action dict.
+
+    `state_eef` is the first six columns and `state_gripper` the last column of the cartesian pose.
+    The action is the concatenation of `world_vector`, a zero block shaped like `world_vector` (in
+    the slot where sibling transforms place `rotation_delta`) and the gripper command returned by
+    `rel2abs_gripper_actions`. The trajectory dict is modified in place and returned.
+    """
+    obs = trajectory["observation"]
+    action = trajectory["action"]
+
+    cartesian_pos = obs["end_effector_cartesian_pos"]
+    obs["state_eef"] = cartesian_pos[:, :6]
+    obs["state_gripper"] = cartesian_pos[:, -1:]
+
+    # make gripper action absolute action, +1 = open, 0 = close
+    abs_gripper = rel2abs_gripper_actions(action["gripper_closedness_action"][:, 0])
+
+    world_vector = action["world_vector"]
+    trajectory["action"] = tf.concat(
+        [world_vector, tf.zeros_like(world_vector), abs_gripper[:, None]],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = obs["natural_language_instruction"]
+    return trajectory
+
+
+def berkeley_cable_routing_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Flatten the action dict, filling the final (gripper) column with zeros.
+
+    The action is the concatenation of `world_vector`, `rotation_delta` and a single zero column
+    shaped like the first column of `world_vector`. The language instruction is copied from the
+    observation dict. The trajectory dict is modified in place and returned.
+    """
+    action = trajectory["action"]
+    world_vector = action["world_vector"]
+    zero_column = tf.zeros_like(world_vector[:, :1])
+
+    trajectory["action"] = tf.concat([world_vector, action["rotation_delta"], zero_column], axis=-1)
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def roboturk_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Flatten the roboturk action dict and surface the language fields at the top level.
+
+    The gripper channel is `gripper_closedness_action` clipped to [0, 1] and passed through
+    `invert_gripper_actions`; it is concatenated after `world_vector` and `rotation_delta`. Both the
+    instruction and its embedding are copied from the observation dict.
+    """
+    action = trajectory["action"]
+    obs = trajectory["observation"]
+
+    # invert absolute gripper action, +1 = open, 0 = close
+    clipped = tf.clip_by_value(action["gripper_closedness_action"], 0, 1)
+    gripper = invert_gripper_actions(clipped)
+
+    trajectory["action"] = tf.concat(
+        [action["world_vector"], action["rotation_delta"], gripper],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = obs["natural_language_instruction"]
+    trajectory["language_embedding"] = obs["natural_language_embedding"]
+    return trajectory
+
+
+def nyu_door_opening_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Flatten the action dict into `world_vector`, `rotation_delta` and a gripper column.
+
+    The gripper column is column 0 of `gripper_closedness_action` after conversion by
+    `rel2abs_gripper_actions`. The language instruction is copied from the observation dict.
+    """
+    action = trajectory["action"]
+
+    # make gripper action absolute action, +1 = open, 0 = close
+    abs_gripper = rel2abs_gripper_actions(action["gripper_closedness_action"][:, 0])
+
+    trajectory["action"] = tf.concat(
+        [action["world_vector"], action["rotation_delta"], abs_gripper[:, None]],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def viola_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Flatten the action dict with a clipped and inverted gripper column.
+
+    `gripper_closedness_action` gets a trailing axis, is clipped to [0, 1] and passed through
+    `invert_gripper_actions` before being concatenated after `world_vector` and `rotation_delta`.
+    The language instruction is copied from the observation dict.
+    """
+    action = trajectory["action"]
+
+    # make gripper action, +1 = open, 0 = close
+    closedness = action["gripper_closedness_action"][:, None]
+    gripper = invert_gripper_actions(tf.clip_by_value(closedness, 0, 1))
+
+    trajectory["action"] = tf.concat(
+        [action["world_vector"], action["rotation_delta"], gripper],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def berkeley_autolab_ur5_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Extract `state` from `robot_state` and flatten the action dict.
+
+    `observation/state` is columns 6-13 of `observation/robot_state`. The action is the concatenation
+    of `world_vector`, `rotation_delta` and `gripper_closedness_action` converted by
+    `rel2abs_gripper_actions` (with a trailing axis added). The language instruction is copied from
+    the observation dict.
+    """
+    obs = trajectory["observation"]
+    action = trajectory["action"]
+
+    obs["state"] = obs["robot_state"][:, 6:14]
+
+    # make gripper action absolute action, +1 = open, 0 = close
+    abs_gripper = rel2abs_gripper_actions(action["gripper_closedness_action"])
+
+    trajectory["action"] = tf.concat(
+        [action["world_vector"], action["rotation_delta"], abs_gripper[:, None]],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = obs["natural_language_instruction"]
+    return trajectory
+
+
+def toto_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Flatten the action dict, using `open_gripper` cast to float32 as the last column.
+
+    The action is the concatenation of `world_vector`, `rotation_delta` and `open_gripper` (with a
+    trailing axis added). The language instruction is copied from the observation dict.
+    """
+    action = trajectory["action"]
+    gripper = tf.cast(action["open_gripper"][:, None], tf.float32)
+
+    trajectory["action"] = tf.concat(
+        [action["world_vector"], action["rotation_delta"], gripper],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def language_table_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Pad the action with zeros plus a constant-one gripper column and decode the instruction.
+
+    The action is followed by two zero blocks of its own shape and a single column of ones. The
+    instruction stored in `observation/instruction` is UTF-8 encoded, split on "\\x00", and the first
+    segment of every row is written to `language_instruction`.
+    """
+    raw_action = trajectory["action"]
+    zero_block = tf.zeros_like(raw_action)
+    # default to "open" gripper
+    open_gripper = tf.ones_like(raw_action[:, :1])
+    trajectory["action"] = tf.concat([raw_action, zero_block, zero_block, open_gripper], axis=-1)
+
+    # decode language instruction
+    instruction_codes = trajectory["observation"]["instruction"]
+    padded_text = tf.strings.unicode_encode(instruction_codes, output_encoding="UTF-8")
+    # Remove trailing padding --> convert RaggedTensor to regular Tensor.
+    first_segment = tf.strings.split(padded_text, "\x00")[:, :1]
+    trajectory["language_instruction"] = first_segment.to_tensor()[:, 0]
+    return trajectory
+
+
+def pusht_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Flatten the action dict, passing `gripper_closedness_action` through unchanged.
+
+    The action is the concatenation of `world_vector`, `rotation_delta` and
+    `gripper_closedness_action` (with a trailing axis added). The language instruction is copied
+    from the observation dict.
+    """
+    action = trajectory["action"]
+    gripper = action["gripper_closedness_action"][:, None]
+
+    trajectory["action"] = tf.concat(
+        [action["world_vector"], action["rotation_delta"], gripper],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def stanford_kuka_multimodal_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Drop the last axis of `depth_image` and rebuild the action with a zero block in the middle.
+
+    `depth_image` keeps only index 0 of its last axis. The action becomes its first three columns,
+    three zero columns, and its last column.
+    """
+    obs = trajectory["observation"]
+    obs["depth_image"] = obs["depth_image"][..., 0]
+
+    raw_action = trajectory["action"]
+    first_three = raw_action[:, :3]
+    trajectory["action"] = tf.concat(
+        [first_three, tf.zeros_like(first_three), raw_action[:, -1:]],
+        axis=-1,
+    )
+    return trajectory
+
+
+def nyu_rot_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Split `observation/state` into `eef_state` / `gripper_state` and truncate the action.
+
+    `eef_state` is the first six entries and `gripper_state` the last entry along the final axis of
+    `state`; the action keeps its first seven entries along the final axis.
+    """
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["eef_state"] = state[..., :6]
+    obs["gripper_state"] = state[..., -1:]
+
+    raw_action = trajectory["action"]
+    trajectory["action"] = raw_action[..., :7]
+    return trajectory
+
+
+def stanford_hydra_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Invert the gripper column of the action and derive `eef_state` / `gripper_state`.
+
+    The action keeps its first six columns and replaces the last one with the output of
+    `invert_gripper_actions`. `eef_state` joins state columns 0-2 with columns 7-9; `gripper_state`
+    is the third-from-last state column.
+    """
+    raw_action = trajectory["action"]
+    obs = trajectory["observation"]
+
+    # invert gripper action, +1 = open, 0 = close
+    gripper = invert_gripper_actions(raw_action[:, -1:])
+    trajectory["action"] = tf.concat([raw_action[:, :6], gripper], axis=-1)
+
+    state = obs["state"]
+    obs["eef_state"] = tf.concat([state[:, :3], state[:, 7:10]], axis=-1)
+    obs["gripper_state"] = state[:, -3:-2]
+    return trajectory
+
+
+def austin_buds_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Clip and invert the gripper column of the action and truncate the state to 8 columns.
+
+    The action keeps its first six columns; its last column is clipped to [0, 1] and passed through
+    `invert_gripper_actions`. `observation/state` is replaced by its first eight columns.
+    """
+    raw_action = trajectory["action"]
+
+    # invert gripper action + clip, +1 = open, 0 = close
+    gripper = invert_gripper_actions(tf.clip_by_value(raw_action[:, -1:], 0, 1))
+    trajectory["action"] = tf.concat([raw_action[:, :6], gripper], axis=-1)
+
+    obs = trajectory["observation"]
+    obs["state"] = obs["state"][:, :8]
+    return trajectory
+
+
+def nyu_franka_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Squeeze and cast the depth images, derive `eef_state`, and rebuild the action.
+
+    Both depth entries keep index 0 of their last axis and are cast to float32. `eef_state` is the
+    last six state columns. The action is columns [-8:-2] followed by column [-2:-1] clipped to
+    [0, 1].
+    """
+    obs = trajectory["observation"]
+    obs["depth"] = tf.cast(obs["depth"][..., 0], tf.float32)
+    obs["depth_additional_view"] = tf.cast(obs["depth_additional_view"][..., 0], tf.float32)
+    obs["eef_state"] = obs["state"][:, -6:]
+
+    raw_action = trajectory["action"]
+    # clip gripper action, +1 = open, 0 = close
+    gripper = tf.clip_by_value(raw_action[:, -2:-1], 0, 1)
+    trajectory["action"] = tf.concat([raw_action[:, -8:-2], gripper], axis=-1)
+    return trajectory
+
+
+def maniskill_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Expose index 7 of the last axis of `observation/state` as `observation/gripper_state`."""
+    obs = trajectory["observation"]
+    obs["gripper_state"] = obs["state"][..., 7:8]
+    return trajectory
+
+
+def furniture_bench_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Trim the state and convert the action's quaternion columns to Euler angles.
+
+    `observation/state` becomes its first seven columns plus its last column. The action becomes
+    columns 0-2, the result of `tft.euler.from_quaternion` on columns 3-6, and the last column
+    clipped to [0, 1] and passed through `invert_gripper_actions`.
+    """
+    import tensorflow_graphics.geometry.transformation as tft
+
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["state"] = tf.concat([state[:, :7], state[:, -1:]], axis=-1)
+
+    raw_action = trajectory["action"]
+    euler = tft.euler.from_quaternion(raw_action[:, 3:7])
+    # invert gripper action + clip, +1 = open, 0 = close
+    gripper = invert_gripper_actions(tf.clip_by_value(raw_action[:, -1:], 0, 1))
+    trajectory["action"] = tf.concat([raw_action[:, :3], euler, gripper], axis=-1)
+    return trajectory
+
+
+def cmu_franka_exploration_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Drop the final entry along the last axis of the action."""
+    raw_action = trajectory["action"]
+    trajectory["action"] = raw_action[..., :-1]
+    return trajectory
+
+
+def ucsd_kitchen_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Expose the first seven state columns as `joint_state` and drop the action's last entry."""
+    obs = trajectory["observation"]
+    obs["joint_state"] = obs["state"][:, :7]
+
+    raw_action = trajectory["action"]
+    trajectory["action"] = raw_action[..., :-1]
+    return trajectory
+
+
+def ucsd_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Split the state into `eef_state` / `gripper_state` and pad the action with zeros.
+
+    `eef_state` is the first six and `gripper_state` the last state column. The action becomes its
+    first three columns, three zero columns, and its last column.
+    """
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["eef_state"] = state[:, :6]
+    obs["gripper_state"] = state[:, -1:]
+
+    raw_action = trajectory["action"]
+    first_three = raw_action[:, :3]
+    trajectory["action"] = tf.concat(
+        [first_three, tf.zeros_like(first_three), raw_action[:, -1:]],
+        axis=-1,
+    )
+    return trajectory
+
+
+def austin_sailor_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Keep the first six action columns and replace the last with a clipped, inverted gripper value.
+
+    The last action column is clipped to [0, 1] and passed through `invert_gripper_actions`.
+    """
+    raw_action = trajectory["action"]
+
+    # invert gripper action + clip, +1 = open, 0 = close
+    gripper = invert_gripper_actions(tf.clip_by_value(raw_action[:, -1:], 0, 1))
+    trajectory["action"] = tf.concat([raw_action[:, :6], gripper], axis=-1)
+    return trajectory
+
+
+def austin_sirius_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Keep the first six action columns and replace the last with a clipped, inverted gripper value.
+
+    The last action column is clipped to [0, 1] and passed through `invert_gripper_actions`.
+    """
+    raw_action = trajectory["action"]
+
+    # invert gripper action + clip, +1 = open, 0 = close
+    gripper = invert_gripper_actions(tf.clip_by_value(raw_action[:, -1:], 0, 1))
+    trajectory["action"] = tf.concat([raw_action[:, :6], gripper], axis=-1)
+    return trajectory
+
+
+def bc_z_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Flatten the bc_z action dict from its `future/*` entries.
+
+    The action is the concatenation of the first three columns of `future/xyz_residual`, the first
+    three columns of `future/axis_angle_residual`, and the first column of `future/target_close`
+    cast to float32 and passed through `invert_gripper_actions`. The language instruction is copied
+    from the observation dict.
+    """
+    action = trajectory["action"]
+    target_close = tf.cast(action["future/target_close"][:, :1], tf.float32)
+
+    trajectory["action"] = tf.concat(
+        [
+            action["future/xyz_residual"][:, :3],
+            action["future/axis_angle_residual"][:, :3],
+            invert_gripper_actions(target_close),
+        ],
+        axis=-1,
+    )
+    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
+    return trajectory
+
+
+def tokyo_pr2_opening_fridge_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Split the state into `eef_state` / `gripper_state` and drop the action's last entry.
+
+    `eef_state` is the first six and `gripper_state` the last state column.
+    """
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["eef_state"] = state[:, :6]
+    obs["gripper_state"] = state[:, -1:]
+
+    raw_action = trajectory["action"]
+    trajectory["action"] = raw_action[..., :-1]
+    return trajectory
+
+
+def tokyo_pr2_tabletop_manipulation_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Split the state into `eef_state` / `gripper_state` and drop the action's last entry.
+
+    `eef_state` is the first six and `gripper_state` the last state column.
+    """
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["eef_state"] = state[:, :6]
+    obs["gripper_state"] = state[:, -1:]
+
+    raw_action = trajectory["action"]
+    trajectory["action"] = raw_action[..., :-1]
+    return trajectory
+
+
+def utokyo_xarm_bimanual_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Keep only the last seven entries along the final axis of the action."""
+    raw_action = trajectory["action"]
+    trajectory["action"] = raw_action[..., -7:]
+    return trajectory
+
+
+def robo_net_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Zero-pad state and action after their first four columns.
+
+    `eef_state` is the first four state columns followed by two zero columns; `gripper_state` is the
+    last state column. The action becomes its first four columns, two zero columns, and its last
+    column.
+    """
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["eef_state"] = tf.concat([state[:, :4], tf.zeros_like(state[:, :2])], axis=-1)
+    obs["gripper_state"] = state[:, -1:]
+
+    raw_action = trajectory["action"]
+    trajectory["action"] = tf.concat(
+        [raw_action[:, :4], tf.zeros_like(raw_action[:, :2]), raw_action[:, -1:]],
+        axis=-1,
+    )
+    return trajectory
+
+
+def berkeley_mvp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Add a trailing axis to `observation/gripper` and cast it to float32.
+
+    The trajectory dict is modified in place and returned.
+    """
+    # Disabled alternative, kept for reference (previously stored in the docstring position):
+    # trajectory["observation"]["state"] = tf.concat((
+    #     tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32),
+    #     trajectory["observation"]["pose"],
+    #     trajectory["observation"]["joint_pos"],),
+    #     axis=-1,)
+    obs = trajectory["observation"]
+    obs["gripper"] = tf.cast(obs["gripper"][:, None], tf.float32)
+    return trajectory
+
+
+def berkeley_rpt_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Add a trailing axis to `observation/gripper` and cast it to float32."""
+    obs = trajectory["observation"]
+    gripper = obs["gripper"][:, None]
+    obs["gripper"] = tf.cast(gripper, tf.float32)
+    return trajectory
+
+
+def kaist_nonprehensible_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Keep the last seven state columns and append a zero column to the first six action columns."""
+    obs = trajectory["observation"]
+    obs["state"] = obs["state"][:, -7:]
+
+    raw_action = trajectory["action"]
+    zero_column = tf.zeros_like(raw_action[:, :1])
+    trajectory["action"] = tf.concat([raw_action[:, :6], zero_column], axis=-1)
+    return trajectory
+
+
+def stanford_mask_vit_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Zero-pad `end_effector_pose` and the action after their first four columns.
+
+    `eef_state` is the first four pose columns followed by two zero columns; `gripper_state` is the
+    last pose column. The action becomes its first four columns, two zero columns, and its last
+    column.
+    """
+    obs = trajectory["observation"]
+    pose = obs["end_effector_pose"]
+    obs["eef_state"] = tf.concat([pose[:, :4], tf.zeros_like(pose[:, :2])], axis=-1)
+    obs["gripper_state"] = pose[:, -1:]
+
+    raw_action = trajectory["action"]
+    trajectory["action"] = tf.concat(
+        [raw_action[:, :4], tf.zeros_like(raw_action[:, :2]), raw_action[:, -1:]],
+        axis=-1,
+    )
+    return trajectory
+
+
+def tokyo_lsmo_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Expose the first six state columns as `eef_state` and the last one as `gripper_state`."""
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["eef_state"] = state[:, :6]
+    obs["gripper_state"] = state[:, -1:]
+    return trajectory
+
+
+def dlr_sara_grid_clamp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Replace `observation/state` with its first six columns."""
+    obs = trajectory["observation"]
+    obs["state"] = obs["state"][:, :6]
+    return trajectory
+
+
+def dlr_edan_shared_control_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Keep the first six action columns and pass the last one through `invert_gripper_actions`."""
+    raw_action = trajectory["action"]
+
+    # invert gripper action, +1 = open, 0 = close
+    gripper = invert_gripper_actions(raw_action[:, -1:])
+    trajectory["action"] = tf.concat([raw_action[:, :6], gripper], axis=-1)
+    return trajectory
+
+
+def asu_table_top_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Copy `ground_truth_states/EE` into `eef_state` and the last state column into `gripper_state`."""
+    obs = trajectory["observation"]
+    obs["eef_state"] = trajectory["ground_truth_states"]["EE"]
+    obs["gripper_state"] = obs["state"][:, -1:]
+    return trajectory
+
+
+def robocook_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Expose the first six state columns as `eef_state` and the last one as `gripper_state`."""
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["eef_state"] = state[:, :6]
+    obs["gripper_state"] = state[:, -1:]
+    return trajectory
+
+
+def imperial_wristcam_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Drop the final entry along the last axis of the action."""
+    raw_action = trajectory["action"]
+    trajectory["action"] = raw_action[..., :-1]
+    return trajectory
+
+
+def iamlab_pick_insert_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Split the state and convert the action's quaternion columns to Euler angles.
+
+    `joint_state` is state columns 0-6 and `gripper_state` is state column 7. The action becomes
+    columns 0-2, the result of `tft.euler.from_quaternion` on columns 3-6, and column 7.
+    """
+    import tensorflow_graphics.geometry.transformation as tft
+
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["joint_state"] = state[:, :7]
+    obs["gripper_state"] = state[:, 7:8]
+
+    raw_action = trajectory["action"]
+    euler = tft.euler.from_quaternion(raw_action[:, 3:7])
+    trajectory["action"] = tf.concat([raw_action[:, :3], euler, raw_action[:, 7:8]], axis=-1)
+    return trajectory
+
+
+def uiuc_d3field_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Pad the action with a zero block of its own shape followed by one zero column."""
+    raw_action = trajectory["action"]
+    trajectory["action"] = tf.concat(
+        [raw_action, tf.zeros_like(raw_action), tf.zeros_like(raw_action[:, :1])],
+        axis=-1,
+    )
+    return trajectory
+
+
+def utaustin_mutex_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Truncate the state to eight columns and clip + invert the gripper column of the action.
+
+    The action keeps its first six columns; its last column is clipped to [0, 1] and passed through
+    `invert_gripper_actions`.
+    """
+    obs = trajectory["observation"]
+    obs["state"] = obs["state"][:, :8]
+
+    raw_action = trajectory["action"]
+    # invert gripper action + clip, +1 = open, 0 = close
+    gripper = invert_gripper_actions(tf.clip_by_value(raw_action[:, -1:], 0, 1))
+    trajectory["action"] = tf.concat([raw_action[:, :6], gripper], axis=-1)
+    return trajectory
+
+
+def berkeley_fanuc_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Split the state and append a gripper column derived from the gripper state to the action.
+
+    `joint_state` is state columns 0-5 and `gripper_state` is state column 6. The action gets one
+    extra column: `gripper_state` passed through `invert_gripper_actions`.
+    """
+    obs = trajectory["observation"]
+    state = obs["state"]
+    obs["joint_state"] = state[:, :6]
+    obs["gripper_state"] = state[:, 6:7]
+
+    # dataset does not store gripper actions, so use gripper state info, invert so +1 = open, 0 = close
+    gripper = invert_gripper_actions(obs["gripper_state"])
+    trajectory["action"] = tf.concat([trajectory["action"], gripper], axis=-1)
+    return trajectory
+
+
+def cmu_playing_with_food_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Convert the action's quaternion columns to Euler angles.
+
+    The action becomes columns 0-2, the result of `tft.euler.from_quaternion` on columns 3-6, and
+    the last column.
+    """
+    import tensorflow_graphics.geometry.transformation as tft
+
+    raw_action = trajectory["action"]
+    euler = tft.euler.from_quaternion(raw_action[:, 3:7])
+    trajectory["action"] = tf.concat([raw_action[:, :3], euler, raw_action[:, -1:]], axis=-1)
+    return trajectory
+
+
+def playfusion_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Rebuild the action from its first three and last four columns."""
+    raw_action = trajectory["action"]
+    trajectory["action"] = tf.concat([raw_action[:, :3], raw_action[:, -4:]], axis=-1)
+    return trajectory
+
+
+def cmu_stretch_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Build a zero-padded `eef_state`, expose `gripper_state`, and drop the action's last entry.
+
+    `eef_state` is the first three state columns followed by three zero columns; `gripper_state` is
+    the last state column.
+    """
+    obs = trajectory["observation"]
+    state = obs["state"]
+    first_three = state[:, :3]
+    obs["eef_state"] = tf.concat([first_three, tf.zeros_like(first_three)], axis=-1)
+    obs["gripper_state"] = state[:, -1:]
+
+    trajectory["action"] = trajectory["action"][..., :-1]
+    return trajectory
+
+
+def gnm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Rebuild the state from `position` / `yaw` and zero-pad the action.
+
+    `observation/state` becomes `position`, three zero columns (shaped like the first three columns
+    of the previous `state`), and `yaw`. The action is followed by two zero blocks of its own shape
+    and one zero column.
+    """
+    obs = trajectory["observation"]
+    obs["state"] = tf.concat(
+        [obs["position"], tf.zeros_like(obs["state"][:, :3]), obs["yaw"]],
+        axis=-1,
+    )
+
+    raw_action = trajectory["action"]
+    zero_block = tf.zeros_like(raw_action)
+    trajectory["action"] = tf.concat(
+        [raw_action, zero_block, zero_block, tf.zeros_like(raw_action[:, :1])],
+        axis=-1,
+    )
+    return trajectory
+
+
+def fmb_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Build `observation/proprio` from `eef_pose` and `state_gripper_pose`.
+
+    `state_gripper_pose` gets a trailing axis and is concatenated after `eef_pose` on the last axis.
+    """
+    # every input feature is batched, ie has leading batch dimension
+    obs = trajectory["observation"]
+    gripper_pose = obs["state_gripper_pose"][..., None]
+    obs["proprio"] = tf.concat([obs["eef_pose"], gripper_pose], axis=-1)
+    return trajectory
+
+
+def dobbe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Alias `observation/state` as `observation/proprio` (same object, no copy)."""
+    # every input feature is batched, ie has leading batch dimension
+    obs = trajectory["observation"]
+    obs["proprio"] = obs["state"]
+    return trajectory
+
+
+def robo_set_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Keep the first seven action columns and append a clipped, inverted gripper column.
+
+    The last action column is clipped to [0, 1] and passed through `invert_gripper_actions`.
+    """
+    raw_action = trajectory["action"]
+
+    # gripper action is in -1...1 --> clip to 0...1, flip
+    clipped = tf.clip_by_value(raw_action[:, -1:], 0, 1)
+    gripper = invert_gripper_actions(clipped)
+
+    trajectory["action"] = tf.concat([raw_action[:, :7], gripper], axis=-1)
+    return trajectory
+
+
+def identity_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
+    """Return the trajectory unchanged; registered for datasets that need no standardization."""
+    return trajectory
+
+
+# === Registry ===
+# Maps a dataset name to the transform that standardizes its trajectories.
+# Several names share one transform (e.g. the `bridge_*`, `ppgm*` and `berkeley_gnm_*` entries), and
+# datasets that need no changes map to `identity_transform`.
+# The `droid` entries are built by *calling* `droid_baseact_transform_fn()` when this module is imported;
+# every other value is a plain function reference.
+OPENX_STANDARDIZATION_TRANSFORMS = {
+    "bridge_openx": bridge_openx_dataset_transform,
+    "bridge_orig": bridge_orig_dataset_transform,
+    "bridge_dataset": bridge_orig_dataset_transform,
+    "ppgm": ppgm_dataset_transform,
+    "ppgm_static": ppgm_dataset_transform,
+    "ppgm_wrist": ppgm_dataset_transform,
+    "fractal20220817_data": rt1_dataset_transform,
+    "kuka": kuka_dataset_transform,
+    "taco_play": taco_play_dataset_transform,
+    "jaco_play": jaco_play_dataset_transform,
+    "berkeley_cable_routing": berkeley_cable_routing_dataset_transform,
+    "roboturk": roboturk_dataset_transform,
+    "nyu_door_opening_surprising_effectiveness": nyu_door_opening_dataset_transform,
+    "viola": viola_dataset_transform,
+    "berkeley_autolab_ur5": berkeley_autolab_ur5_dataset_transform,
+    "toto": toto_dataset_transform,
+    "language_table": language_table_dataset_transform,
+    "columbia_cairlab_pusht_real": pusht_dataset_transform,
+    "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": stanford_kuka_multimodal_dataset_transform,
+    "nyu_rot_dataset_converted_externally_to_rlds": nyu_rot_dataset_transform,
+    "stanford_hydra_dataset_converted_externally_to_rlds": stanford_hydra_dataset_transform,
+    "austin_buds_dataset_converted_externally_to_rlds": austin_buds_dataset_transform,
+    "nyu_franka_play_dataset_converted_externally_to_rlds": nyu_franka_play_dataset_transform,
+    "maniskill_dataset_converted_externally_to_rlds": maniskill_dataset_transform,
+    "furniture_bench_dataset_converted_externally_to_rlds": furniture_bench_dataset_transform,
+    "cmu_franka_exploration_dataset_converted_externally_to_rlds": cmu_franka_exploration_dataset_transform,
+    "ucsd_kitchen_dataset_converted_externally_to_rlds": ucsd_kitchen_dataset_transform,
+    "ucsd_pick_and_place_dataset_converted_externally_to_rlds": ucsd_pick_place_dataset_transform,
+    "austin_sailor_dataset_converted_externally_to_rlds": austin_sailor_dataset_transform,
+    "austin_sirius_dataset_converted_externally_to_rlds": austin_sirius_dataset_transform,
+    "bc_z": bc_z_dataset_transform,
+    "utokyo_pr2_opening_fridge_converted_externally_to_rlds": tokyo_pr2_opening_fridge_dataset_transform,
+    "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": tokyo_pr2_tabletop_manipulation_dataset_transform,
+    "utokyo_xarm_pick_and_place_converted_externally_to_rlds": identity_transform,
+    "utokyo_xarm_bimanual_converted_externally_to_rlds": utokyo_xarm_bimanual_dataset_transform,
+    "robo_net": robo_net_dataset_transform,
+    "berkeley_mvp_converted_externally_to_rlds": berkeley_mvp_dataset_transform,
+    "berkeley_rpt_converted_externally_to_rlds": berkeley_rpt_dataset_transform,
+    "kaist_nonprehensile_converted_externally_to_rlds": kaist_nonprehensible_dataset_transform,
+    "stanford_mask_vit_converted_externally_to_rlds": stanford_mask_vit_dataset_transform,
+    "tokyo_u_lsmo_converted_externally_to_rlds": tokyo_lsmo_dataset_transform,
+    "dlr_sara_pour_converted_externally_to_rlds": identity_transform,
+    "dlr_sara_grid_clamp_converted_externally_to_rlds": dlr_sara_grid_clamp_dataset_transform,
+    "dlr_edan_shared_control_converted_externally_to_rlds": dlr_edan_shared_control_dataset_transform,
+    "asu_table_top_converted_externally_to_rlds": asu_table_top_dataset_transform,
+    "stanford_robocook_converted_externally_to_rlds": robocook_dataset_transform,
+    "imperialcollege_sawyer_wrist_cam": imperial_wristcam_dataset_transform,
+    "iamlab_cmu_pickup_insert_converted_externally_to_rlds": iamlab_pick_insert_dataset_transform,
+    "uiuc_d3field": uiuc_d3field_dataset_transform,
+    "utaustin_mutex": utaustin_mutex_dataset_transform,
+    "berkeley_fanuc_manipulation": berkeley_fanuc_dataset_transform,
+    "cmu_playing_with_food": cmu_playing_with_food_dataset_transform,
+    "cmu_play_fusion": playfusion_dataset_transform,
+    "cmu_stretch": cmu_stretch_dataset_transform,
+    "berkeley_gnm_recon": gnm_dataset_transform,
+    "berkeley_gnm_cory_hall": gnm_dataset_transform,
+    "berkeley_gnm_sac_son": gnm_dataset_transform,
+    "droid": droid_baseact_transform_fn(),
+    "droid_100": droid_baseact_transform_fn(),  # first 100 episodes of droid
+    "fmb": fmb_transform,
+    "dobbe": dobbe_dataset_transform,
+    "robo_set": robo_set_dataset_transform,
+    "usc_cloth_sim_converted_externally_to_rlds": identity_transform,
+    "plex_robosuite": identity_transform,
+    "conq_hose_manipulation": identity_transform,
+    "io_ai_tech": identity_transform,
+    "spoc": identity_transform,
+}
--- a/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+For https://github.com/google-deepmind/open_x_embodiment (OPENX) datasets.
+
+Example:
+    python lerobot/scripts/push_dataset_to_hub.py \
+        --raw-dir /hdd/tensorflow_datasets/bridge_dataset/1.0.0/ \
+        --repo-id youliangtan/sampled_bridge_data_v2 \
+        --raw-format openx_rlds.bridge_orig \
+        --episodes 3 4 5 8 9
+
+Exact dataset fps is defined in openx/configs.yaml, obtained from:
+    https://docs.google.com/spreadsheets/d/1rPBD77tk60AEIGZrGSODwyyzs5FgCU9Uz3h-3_t2A9g/edit?gid=0#gid=0&range=R:R
+"""
+
+import shutil
+from pathlib import Path
+
+import numpy as np
+import tensorflow as tf
+import tensorflow_datasets as tfds
+import torch
+import tqdm
+import yaml
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
+from lerobot.common.datasets.push_dataset_to_hub.openx.transforms import OPENX_STANDARDIZATION_TRANSFORMS
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
+from lerobot.common.datasets.utils import (
+    calculate_episode_data_index,
+    hf_transform_to_torch,
+)
+from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames
+
+# Resolve the config file relative to this module (not the current working directory) so that importing
+# this module works no matter where the script is launched from.
+_OPENX_CONFIGS_PATH = Path(__file__).parent / "openx" / "configs.yaml"
+with open(_OPENX_CONFIGS_PATH) as f:
+    _openx_list = yaml.safe_load(f)
+
+# Per-dataset settings, keyed by openx dataset name (entries read in this module: "image_obs_keys",
+# "state_obs_keys" and "fps").
+OPENX_DATASET_CONFIGS = _openx_list["OPENX_DATASET_CONFIGS"]
+
+np.set_printoptions(precision=2)
+
+
+def tf_to_torch(data):
+    """Convert an object exposing `.numpy()` (the tensors of an episode in this module) to a torch tensor."""
+    as_numpy = data.numpy()
+    return torch.from_numpy(as_numpy)
+
+
+def tf_img_convert(img):
+    """Return an image tensor as a numpy array, decoding it first when it is stored as a string.
+
+    Raises:
+        ValueError: If the tensor dtype is neither `tf.string` nor `tf.uint8`.
+    """
+    is_encoded = img.dtype == tf.string
+    if not is_encoded and img.dtype != tf.uint8:
+        raise ValueError(f"Unsupported image dtype: found with dtype {img.dtype}")
+    if is_encoded:
+        # encoded image bytes: decode to uint8, keeping a single frame for animated formats
+        img = tf.io.decode_image(img, expand_animations=False, dtype=tf.uint8)
+    return img.numpy()
+
+
+def _broadcast_metadata_rlds(i: tf.Tensor, traj: dict) -> dict:
+    """
+    Flatten one RLDS trajectory: the content of its "steps" entry becomes the top level of the returned
+    dict, and every other top-level entry is repeated to the trajectory length and stored under
+    "traj_metadata". Three bookkeeping fields are added as well: `_len`, `_traj_index` and `_frame_index`.
+
+    Meant to be mapped over an enumerated dataset, `i` being the enumeration index of the trajectory.
+
+    NOTE: adapted from DLimp library https://github.com/kvablack/dlimp/
+    """
+    steps = traj.pop("steps")
+    assert "traj_metadata" not in steps
+
+    # length of the first flattened step tensor along its leading axis
+    num_steps = tf.shape(tf.nest.flatten(steps)[0])[0]
+
+    def _repeat(value):
+        return tf.repeat(value, num_steps)
+
+    # what is left in `traj` after the pop is the trajectory-level metadata
+    flat_traj = dict(steps)
+    flat_traj["traj_metadata"] = tf.nest.map_structure(_repeat, traj)
+
+    for reserved_key in ("_len", "_traj_index", "_frame_index"):
+        assert reserved_key not in flat_traj
+    flat_traj["_len"] = _repeat(num_steps)
+    flat_traj["_traj_index"] = _repeat(i)
+    flat_traj["_frame_index"] = tf.range(num_steps)
+
+    return flat_traj
+
+
+def load_from_raw(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int,
+    video: bool,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+    openx_dataset_name: str | None = None,
+):
+    """Load an RLDS dataset from disk and convert its episodes into a single concatenated data dict.
+
+    Each converted episode is also saved with `torch.save` in an `ep_dicts` directory next to `videos_dir`,
+    so that an interrupted conversion can resume after the last saved episode instead of starting over.
+
+    Args:
+        raw_dir (Path): Directory of the dataset, passed to `tfds.builder_from_directory`.
+        videos_dir (Path): Directory in which the encoded mp4 videos are written when `video` is True.
+        fps (int): Frame rate used to encode the videos and to compute the per-frame timestamps.
+        video (bool): If True, encode every camera stream to mp4 and store references to the video frames,
+            otherwise store the frames as PIL images.
+        episodes (list[int] | None, optional): Indices of the episodes to convert. Defaults to None, in
+            which case every episode is converted.
+        encoding (dict | None, optional): Extra keyword arguments forwarded to `encode_video_frames`.
+            Defaults to None.
+        openx_dataset_name (str | None, optional): Key of the dataset in `OPENX_STANDARDIZATION_TRANSFORMS`
+            and `OPENX_DATASET_CONFIGS`. When None, no standardization transform is applied, the image
+            keys are the observation keys containing "image" and the state is read from
+            `observation["state"]`. Defaults to None.
+
+    Returns:
+        dict: The result of `concatenate_episodes` over the restored and newly converted episodes, with an
+            additional "index" entry numbering all frames.
+
+    Raises:
+        ValueError: If `episodes` is given but the dataset contains no episode.
+    """
+    ds_builder = tfds.builder_from_directory(str(raw_dir))
+    dataset = ds_builder.as_dataset(
+        split="all",
+        decoders={"steps": tfds.decode.SkipDecoding()},
+    )
+
+    dataset_info = ds_builder.info
+    print("dataset_info: ", dataset_info)
+
+    ds_length = len(dataset)
+    dataset = dataset.take(ds_length)
+    # "flatten" the dataset as such we can apply trajectory level map() easily
+    # each [obs][key] has a shape of (frame_size, ...)
+    dataset = dataset.enumerate().map(_broadcast_metadata_rlds)
+
+    # we will apply the standardization transform if the dataset_name is provided
+    # if the dataset name is not provided and the goal is to convert any rlds formatted dataset
+    # search for 'image' keys in the observations
+    if openx_dataset_name is not None:
+        print(" - applying standardization transform for dataset: ", openx_dataset_name)
+        assert openx_dataset_name in OPENX_STANDARDIZATION_TRANSFORMS
+        transform_fn = OPENX_STANDARDIZATION_TRANSFORMS[openx_dataset_name]
+        dataset = dataset.map(transform_fn)
+
+        image_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["image_obs_keys"]
+    else:
+        obs_keys = dataset_info.features["steps"]["observation"].keys()
+        image_keys = [key for key in obs_keys if "image" in key]
+
+    lang_key = "language_instruction" if "language_instruction" in dataset.element_spec else None
+
+    print(" - image_keys: ", image_keys)
+    print(" - lang_key: ", lang_key)
+
+    it = iter(dataset)
+
+    ep_dicts = []
+    # Init temp path to save ep_dicts in case of crash
+    tmp_ep_dicts_dir = videos_dir.parent.joinpath("ep_dicts")
+    tmp_ep_dicts_dir.mkdir(parents=True, exist_ok=True)
+
+    # check if ep_dicts have already been saved by a previous (interrupted) run
+    starting_ep_idx = 0
+    saved_ep_dicts = sorted(tmp_ep_dicts_dir.glob("ep_dict_*.pt"))
+    if len(saved_ep_dicts) > 0:
+        # file names look like "ep_dict_0000000012.pt": the zero padding makes the last file after sorting
+        # the one with the highest episode index
+        starting_ep_idx = int(saved_ep_dicts[-1].stem.split("_")[-1]) + 1
+        # advance the iterator past the episodes that were already handled
+        for _ in range(starting_ep_idx):
+            next(it)
+        # Load every saved file instead of indexing the list by episode number: when `episodes` was used
+        # only a subset of [0, starting_ep_idx) exists on disk.
+        ep_dicts.extend(torch.load(path) for path in saved_ep_dicts)
+
+    # if the user specified episodes, skip the ones not in the list
+    if episodes is not None:
+        if ds_length == 0:
+            raise ValueError("No episodes found.")
+        # Sorted, de-duplicated copy (the caller's list is left untouched) without the episodes located
+        # before the resume point: a stale or repeated head of the list would otherwise never match
+        # `ep_idx` below and block the selection of all following episodes.
+        episodes = sorted({ep for ep in episodes if ep >= starting_ep_idx})
+
+    for ep_idx in tqdm.tqdm(range(starting_ep_idx, ds_length)):
+        if episodes is not None and len(episodes) == 0:
+            # every requested episode has been processed
+            break
+
+        episode = next(it)
+
+        if episodes is not None:
+            if ep_idx != episodes[0]:
+                continue  # skip
+            # process this episode
+            print(" selecting episode idx: ", ep_idx)
+            episodes.pop(0)
+
+        num_frames = episode["action"].shape[0]
+
+        ###########################################################
+        # Handle the episodic data
+
+        # last step of demonstration is considered done
+        done = torch.zeros(num_frames, dtype=torch.bool)
+        done[-1] = True
+        ep_dict = {}
+
+        # We will create the state observation tensor by stacking the state
+        # obs keys defined in the openx/configs.yaml
+        if openx_dataset_name is not None:
+            state_obs_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["state_obs_keys"]
+            # stack the state observations, if a key is missing from the observation, pad with zeros
+            states = []
+            for key in state_obs_keys:
+                if key in episode["observation"]:
+                    states.append(tf_to_torch(episode["observation"][key]))
+                else:
+                    states.append(torch.zeros(num_frames, 1))  # pad with zeros
+            states = torch.cat(states, dim=1)
+        else:
+            states = tf_to_torch(episode["observation"]["state"])
+
+        actions = tf_to_torch(episode["action"])
+        rewards = tf_to_torch(episode["reward"]).float()
+
+        langs = []  # TODO: might be located in "observation"
+        if lang_key is not None:
+            for lang in episode[lang_key]:
+                # `str()` on a tensorflow string tensor returns its repr ("tf.Tensor(b'...', ...)"), so go
+                # through numpy and decode the raw bytes to get the actual instruction text
+                value = lang.numpy()
+                langs.append(value.decode("utf-8") if isinstance(value, bytes) else str(value))
+
+        # simple assertions
+        for item in [states, actions, rewards, done]:
+            assert len(item) == num_frames
+
+        ###########################################################
+
+        # loop through all cameras
+        for im_key in image_keys:
+            img_key = f"observation.images.{im_key}"
+            imgs_array = np.array([tf_img_convert(img) for img in episode["observation"][im_key]])
+            if video:
+                # save png images in temporary directory
+                tmp_imgs_dir = videos_dir / "tmp_images"
+                save_images_concurrently(imgs_array, tmp_imgs_dir)
+
+                # encode images to a mp4 video
+                fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
+                video_path = videos_dir / fname
+                encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))
+
+                # clean temporary images directory
+                shutil.rmtree(tmp_imgs_dir)
+
+                # store the reference to the video frame
+                ep_dict[img_key] = [
+                    {"path": f"videos/{fname}", "timestamp": i / fps} for i in range(num_frames)
+                ]
+            else:
+                ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
+
+        if lang_key is not None:
+            ep_dict["language_instruction"] = langs
+
+        ep_dict["observation.state"] = states
+        ep_dict["action"] = actions
+        ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+        ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
+        ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+        ep_dict["next.reward"] = rewards
+        ep_dict["next.done"] = done
+
+        # checkpoint the episode; the 10-digit zero padding is what the resume logic above relies on
+        path_ep_dict = tmp_ep_dicts_dir.joinpath(f"ep_dict_{ep_idx:010d}.pt")
+        torch.save(ep_dict, path_ep_dict)
+
+        ep_dicts.append(ep_dict)
+
+    data_dict = concatenate_episodes(ep_dicts)
+
+    total_frames = data_dict["frame_index"].shape[0]
+    data_dict["index"] = torch.arange(0, total_frames, 1)
+    return data_dict
+
+
+def to_hf_dataset(data_dict, video) -> Dataset:
+    """Wrap `data_dict` in a Hugging Face `Dataset` with explicit features and the torch transform.
+
+    Image entries ("observation.images.*") are declared as `VideoFrame` when `video` is truthy and as
+    `Image` otherwise. "observation.velocity", "observation.effort" and "language_instruction" are
+    only declared when present in `data_dict`.
+    """
+
+    def _float_vector(key):
+        # fixed-length float32 sequence whose length is the second dimension of the stored data
+        return Sequence(length=data_dict[key].shape[1], feature=Value(dtype="float32", id=None))
+
+    image_feature = VideoFrame if video else Image
+    features = {key: image_feature() for key in data_dict if "observation.images." in key}
+
+    features["observation.state"] = _float_vector("observation.state")
+    for optional_key in ("observation.velocity", "observation.effort"):
+        if optional_key in data_dict:
+            features[optional_key] = _float_vector(optional_key)
+    if "language_instruction" in data_dict:
+        features["language_instruction"] = Value(dtype="string", id=None)
+
+    features["action"] = _float_vector("action")
+
+    scalar_dtypes = {
+        "episode_index": "int64",
+        "frame_index": "int64",
+        "timestamp": "float32",
+        "next.reward": "float32",
+        "next.done": "bool",
+        "index": "int64",
+    }
+    for name, dtype in scalar_dtypes.items():
+        features[name] = Value(dtype=dtype, id=None)
+
+    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
+    hf_dataset.set_transform(hf_transform_to_torch)
+    return hf_dataset
+
+
+def from_raw_to_lerobot_format(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int | None = None,
+    video: bool = True,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+    openx_dataset_name: str | None = None,
+):
+    """Convert a raw RLDS dataset to the LeRobot format (test implementation of the rlds conversion).
+
+    Args:
+        raw_dir (Path): Directory of the raw dataset, forwarded to `load_from_raw`.
+        videos_dir (Path): Directory where the encoded videos are written, forwarded to `load_from_raw`.
+        fps (int | None, optional): Frame rate used when `openx_dataset_name` is None (30 if left to
+            None). For an openx dataset the frame rate always comes from `OPENX_DATASET_CONFIGS`.
+        video (bool, optional): Whether to encode the images as videos. Defaults to True.
+        episodes (list[int] | None, optional): Indices of the episodes to convert. Defaults to None.
+        encoding (dict | None, optional): Video encoding arguments, forwarded to `load_from_raw`.
+        openx_dataset_name (str | None, optional): Name of the openx dataset, if the raw dataset is one.
+
+    Returns:
+        tuple: `(hf_dataset, episode_data_index, info)`.
+
+    Raises:
+        ValueError: If the openx dataset has no "fps" entry in `OPENX_DATASET_CONFIGS`.
+    """
+    if openx_dataset_name is None:
+        # not an openx dataset: set a default rlds frame rate unless the caller provided one
+        if fps is None:
+            fps = 30
+    elif "fps" not in OPENX_DATASET_CONFIGS[openx_dataset_name]:
+        raise ValueError(
+            "fps for this dataset is not specified in openx/configs.yaml yet, means it is not yet tested"
+        )
+    else:
+        # Only look the dataset up in this branch: doing it unconditionally raised a KeyError whenever
+        # `openx_dataset_name` was None.
+        fps = OPENX_DATASET_CONFIGS[openx_dataset_name]["fps"]
+
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding, openx_dataset_name)
+    hf_dataset = to_hf_dataset(data_dict, video)
+    episode_data_index = calculate_episode_data_index(hf_dataset)
+    info = {
+        "codebase_version": CODEBASE_VERSION,
+        "fps": fps,
+        "video": video,
+    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
+    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py
@@ -26,7 +26,11 @@ from datasets import Dataset, Features, Image, Sequence, Value
 from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -62,6 +66,7 @@ def load_from_raw(
    video: bool,
    episodes: list[int] | None = None,
    keypoints_instead_of_image: bool = False,
+    encoding: dict | None = None,
 ):
    try:
        import pymunk
@@ -172,7 +177,7 @@ def load_from_raw(
                # encode images to a mp4 video
                fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
                video_path = videos_dir / fname
-                encode_video_frames(tmp_imgs_dir, video_path, fps)
+                encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

                # clean temporary images directory
                shutil.rmtree(tmp_imgs_dir)
@@ -244,6 +249,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # Manually change this to True to use keypoints of the T instead of an image observation (but don't merge
    # with True). Also make sure to use video = 0 in the `push_dataset_to_hub.py` script.
@@ -255,7 +261,7 @@ def from_raw_to_lerobot_format(
    if fps is None:
        fps = 10

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, keypoints_instead_of_image)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, keypoints_instead_of_image, encoding)
    hf_dataset = to_hf_dataset(data_dict, video, keypoints_instead_of_image)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -263,4 +269,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video if not keypoints_instead_of_image else 0,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py
@@ -27,7 +27,11 @@ from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
 from lerobot.common.datasets.push_dataset_to_hub._umi_imagecodecs_numcodecs import register_codecs
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -60,7 +64,14 @@ def check_format(raw_dir) -> bool:
    assert all(nb_frames == zarr_data[dataset].shape[0] for dataset in required_datasets)


-def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episodes: list[int] | None = None):
+def load_from_raw(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int,
+    video: bool,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+):
    zarr_path = raw_dir / "cup_in_the_wild.zarr"
    zarr_data = zarr.open(zarr_path, mode="r")

@@ -88,49 +99,61 @@ def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episod
        to_ids.append(to_idx)
        from_idx = to_idx

+    ep_dicts_dir = videos_dir / "ep_dicts"
+    ep_dicts_dir.mkdir(exist_ok=True, parents=True)
    ep_dicts = []
+
    ep_ids = episodes if episodes else range(num_episodes)
    for ep_idx, selected_ep_idx in tqdm.tqdm(enumerate(ep_ids)):
-        from_idx = from_ids[selected_ep_idx]
-        to_idx = to_ids[selected_ep_idx]
-        num_frames = to_idx - from_idx
+        ep_dict_path = ep_dicts_dir / f"{ep_idx}"
+        if not ep_dict_path.is_file():
+            from_idx = from_ids[selected_ep_idx]
+            to_idx = to_ids[selected_ep_idx]
+            num_frames = to_idx - from_idx

-        # TODO(rcadene): save temporary images of the episode?
+            # TODO(rcadene): save temporary images of the episode?

-        state = states[from_idx:to_idx]
+            state = states[from_idx:to_idx]

-        ep_dict = {}
+            ep_dict = {}

-        # load 57MB of images in RAM (400x224x224x3 uint8)
-        imgs_array = zarr_data["data/camera0_rgb"][from_idx:to_idx]
-        img_key = "observation.image"
-        if video:
-            # save png images in temporary directory
-            tmp_imgs_dir = videos_dir / "tmp_images"
-            save_images_concurrently(imgs_array, tmp_imgs_dir)
+            # load 57MB of images in RAM (400x224x224x3 uint8)
+            imgs_array = zarr_data["data/camera0_rgb"][from_idx:to_idx]
+            img_key = "observation.image"
+            if video:
+                fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
+                video_path = videos_dir / fname
+                if not video_path.is_file():
+                    # save png images in temporary directory
+                    tmp_imgs_dir = videos_dir / "tmp_images"
+                    save_images_concurrently(imgs_array, tmp_imgs_dir)

-            # encode images to a mp4 video
-            fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
-            video_path = videos_dir / fname
-            encode_video_frames(tmp_imgs_dir, video_path, fps)
+                    # encode images to a mp4 video
+                    encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

-            # clean temporary images directory
-            shutil.rmtree(tmp_imgs_dir)
+                    # clean temporary images directory
+                    shutil.rmtree(tmp_imgs_dir)

-            # store the reference to the video frame
-            ep_dict[img_key] = [{"path": f"videos/{fname}", "timestamp": i / fps} for i in range(num_frames)]
+                # store the reference to the video frame
+                ep_dict[img_key] = [
+                    {"path": f"videos/{fname}", "timestamp": i / fps} for i in range(num_frames)
+                ]
+            else:
+                ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
+
+            ep_dict["observation.state"] = state
+            ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int64)
+            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+            ep_dict["episode_data_index_from"] = torch.tensor([from_idx] * num_frames)
+            ep_dict["episode_data_index_to"] = torch.tensor([from_idx + num_frames] * num_frames)
+            ep_dict["end_pose"] = end_pose[from_idx:to_idx]
+            ep_dict["start_pos"] = start_pos[from_idx:to_idx]
+            ep_dict["gripper_width"] = gripper_width[from_idx:to_idx]
+            torch.save(ep_dict, ep_dict_path)
        else:
-            ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
+            ep_dict = torch.load(ep_dict_path)

-        ep_dict["observation.state"] = state
-        ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int64)
-        ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
-        ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
-        ep_dict["episode_data_index_from"] = torch.tensor([from_idx] * num_frames)
-        ep_dict["episode_data_index_to"] = torch.tensor([from_idx + num_frames] * num_frames)
-        ep_dict["end_pose"] = end_pose[from_idx:to_idx]
-        ep_dict["start_pos"] = start_pos[from_idx:to_idx]
-        ep_dict["gripper_width"] = gripper_width[from_idx:to_idx]
        ep_dicts.append(ep_dict)

    data_dict = concatenate_episodes(ep_dicts)
@@ -183,6 +206,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -196,7 +220,7 @@ def from_raw_to_lerobot_format(
            "Generating UMI dataset without `video=True` creates ~150GB on disk and requires ~80GB in RAM."
        )

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding)
    hf_dataset = to_hf_dataset(data_dict, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -204,4 +228,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/utils.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/utils.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path

@@ -20,6 +21,8 @@ import numpy
 import PIL
 import torch

+from lerobot.common.datasets.video_utils import encode_video_frames
+

 def concatenate_episodes(ep_dicts):
    data_dict = {}
@@ -51,3 +54,21 @@ def save_images_concurrently(imgs_array: numpy.array, out_dir: Path, max_workers
    num_images = len(imgs_array)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        [executor.submit(save_image, imgs_array[i], i, out_dir) for i in range(num_images)]
+
+
+def get_default_encoding() -> dict:
+    """Collect the default values of the ffmpeg encoding arguments of `encode_video_frames`.
+
+    Only "vcodec", "pix_fmt", "g" and "crf" are considered, and only when they declare a default.
+    """
+    encoding_keys = ("vcodec", "pix_fmt", "g", "crf")
+    defaults = {}
+    for name, param in inspect.signature(encode_video_frames).parameters.items():
+        if name in encoding_keys and param.default is not inspect.Parameter.empty:
+            defaults[name] = param.default
+    return defaults
+
+
+def check_repo_id(repo_id: str) -> None:
+    """Raise a `ValueError` unless `repo_id` is made of exactly two parts separated by a `/`."""
+    has_single_separator = repo_id.count("/") == 1
+    if has_single_separator:
+        return
+    raise ValueError(
+        f"""`repo_id` is expected to contain a community or user id `/` the name of the dataset
+            (e.g. 'lerobot/pusht'), but contains '{repo_id}'."""
+    )
--- a/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py
@@ -26,7 +26,11 @@ from datasets import Dataset, Features, Image, Sequence, Value
 from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -56,7 +60,14 @@ def check_format(raw_dir):
        assert all(len(nested_dict[subkey]) == expected_len for subkey in subkeys if subkey in nested_dict)


-def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episodes: list[int] | None = None):
+def load_from_raw(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int,
+    video: bool,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+):
    pkl_path = raw_dir / "buffer.pkl"

    with open(pkl_path, "rb") as f:
@@ -105,7 +116,7 @@ def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episod
            # encode images to a mp4 video
            fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
            video_path = videos_dir / fname
-            encode_video_frames(tmp_imgs_dir, video_path, fps)
+            encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

            # clean temporary images directory
            shutil.rmtree(tmp_imgs_dir)
@@ -167,6 +178,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -174,7 +186,7 @@ def from_raw_to_lerobot_format(
    if fps is None:
        fps = 15

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding)
    hf_dataset = to_hf_dataset(data_dict, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -182,4 +194,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/utils.py
+++ b/lerobot/common/datasets/utils.py
@@ -23,11 +23,19 @@ from typing import Dict
 import datasets
 import torch
 from datasets import load_dataset, load_from_disk
-from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from huggingface_hub import DatasetCard, HfApi, hf_hub_download, snapshot_download
 from PIL import Image as PILImage
 from safetensors.torch import load_file
 from torchvision import transforms

+# Initial content handed to `DatasetCard` in `create_lerobot_dataset_card`. The section between the two
+# `---` lines is a placeholder for the card metadata, followed by the default card text.
+DATASET_CARD_TEMPLATE = """
+---
+# Metadata will go there
+---
+This dataset was created using [🤗 LeRobot](https://github.com/huggingface/lerobot).
+
+"""
+

 def flatten_dict(d, parent_key="", sep="/"):
    """Flatten a nested dictionary structure by collapsing nested keys into one key with a separator.
@@ -72,6 +80,11 @@ def hf_transform_to_torch(items_dict: dict[torch.Tensor | None]):
        if isinstance(first_item, PILImage.Image):
            to_tensor = transforms.ToTensor()
            items_dict[key] = [to_tensor(img) for img in items_dict[key]]
+        elif isinstance(first_item, str):
+            # TODO (michel-aractingi): add str2embedding via language tokenizer
+            # For now we leave this part up to the user to choose how to address
+            # language conditioned tasks
+            pass
        elif isinstance(first_item, dict) and "path" in first_item and "timestamp" in first_item:
            # video frame will be processed downstream
            pass
@@ -385,3 +398,29 @@ def cycle(iterable):
            yield next(iterator)
        except StopIteration:
            iterator = iter(iterable)
+
+
+def create_branch(repo_id, *, branch: str, repo_type: str | None = None):
+    """Create `branch` on an existing Hugging Face repo.
+
+    If a branch with that name already exists, it is deleted first and then created again.
+    """
+    api = HfApi()
+
+    target_ref = f"refs/heads/{branch}"
+    existing_refs = {existing.ref for existing in api.list_repo_refs(repo_id, repo_type=repo_type).branches}
+    if target_ref in existing_refs:
+        api.delete_branch(repo_id, repo_type=repo_type, branch=branch)
+
+    api.create_branch(repo_id, repo_type=repo_type, branch=branch)
+
+
+def create_lerobot_dataset_card(tags: list | None = None, text: str | None = None) -> DatasetCard:
+    """Build a `DatasetCard` from `DATASET_CARD_TEMPLATE` with the "robotics" task category.
+
+    The card is always tagged "LeRobot"; `tags` are added after it and `text` is appended to the card text.
+    """
+    all_tags = ["LeRobot"]
+    if tags is not None:
+        all_tags += tags
+
+    card = DatasetCard(DATASET_CARD_TEMPLATE)
+    card.data.task_categories = ["robotics"]
+    card.data.tags = all_tags
+    if text is not None:
+        card.text += text
+    return card
+    return card
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -166,10 +166,10 @@ def encode_video_frames(
    imgs_dir: Path,
    video_path: Path,
    fps: int,
-    video_codec: str = "libsvtav1",
-    pixel_format: str = "yuv420p",
-    group_of_pictures_size: int | None = 2,
-    constant_rate_factor: int | None = 30,
+    vcodec: str = "libsvtav1",
+    pix_fmt: str = "yuv420p",
+    g: int | None = 2,
+    crf: int | None = 30,
    fast_decode: int = 0,
    log_level: str | None = "error",
    overwrite: bool = False,
@@ -183,20 +183,20 @@ def encode_video_frames(
            ("-f", "image2"),
            ("-r", str(fps)),
            ("-i", str(imgs_dir / "frame_%06d.png")),
-            ("-vcodec", video_codec),
-            ("-pix_fmt", pixel_format),
+            ("-vcodec", vcodec),
+            ("-pix_fmt", pix_fmt),
        ]
    )

-    if group_of_pictures_size is not None:
-        ffmpeg_args["-g"] = str(group_of_pictures_size)
+    if g is not None:
+        ffmpeg_args["-g"] = str(g)

-    if constant_rate_factor is not None:
-        ffmpeg_args["-crf"] = str(constant_rate_factor)
+    if crf is not None:
+        ffmpeg_args["-crf"] = str(crf)

    if fast_decode:
-        key = "-svtav1-params" if video_codec == "libsvtav1" else "-tune"
-        value = f"fast-decode={fast_decode}" if video_codec == "libsvtav1" else "fastdecode"
+        key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune"
+        value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
        ffmpeg_args[key] = value

    if log_level is not None:
@@ -210,6 +210,12 @@ def encode_video_frames(
    # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
    subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)

+    # ffmpeg can exit successfully without producing the file, so check for the output explicitly
+    if not video_path.exists():
+        raise OSError(
+            f"Video encoding did not work. File not found: {video_path}. "
+            # join with spaces so that the printed command can be copy-pasted and run as is
+            f"Try running the command manually to debug: `{' '.join(ffmpeg_cmd)}`"
+        )
+

@dataclass
 class VideoFrame:
--- a/lerobot/common/policies/act/modeling_act.py
+++ b/lerobot/common/policies/act/modeling_act.py
@@ -38,7 +38,13 @@ from lerobot.common.policies.act.configuration_act import ACTConfig
 from lerobot.common.policies.normalize import Normalize, Unnormalize


-class ACTPolicy(nn.Module, PyTorchModelHubMixin):
+class ACTPolicy(
+    nn.Module,
+    PyTorchModelHubMixin,
+    library_name="lerobot",
+    repo_url="https://github.com/huggingface/lerobot",
+    tags=["robotics", "act"],
+):
    """
    Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost
    Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act)
@@ -101,6 +107,7 @@ class ACTPolicy(nn.Module, PyTorchModelHubMixin):

        batch = self.normalize_inputs(batch)
        if len(self.expected_image_keys) > 0:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
            batch["observation.images"] = torch.stack([batch[k] for k in self.expected_image_keys], dim=-4)

        # If we are doing temporal ensembling, do online updates where we keep track of the number of actions
@@ -128,6 +135,7 @@ class ACTPolicy(nn.Module, PyTorchModelHubMixin):
        """Run the batch through the model and compute the loss for training or validation."""
        batch = self.normalize_inputs(batch)
        if len(self.expected_image_keys) > 0:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
            batch["observation.images"] = torch.stack([batch[k] for k in self.expected_image_keys], dim=-4)
        batch = self.normalize_targets(batch)
        actions_hat, (mu_hat, log_sigma_x2_hat) = self.model(batch)
@@ -467,10 +475,9 @@ class ACT(nn.Module):
        if self.use_images:
            all_cam_features = []
            all_cam_pos_embeds = []
-            images = batch["observation.images"]

-            for cam_index in range(images.shape[-4]):
-                cam_features = self.backbone(images[:, cam_index])["feature_map"]
+            for cam_index in range(batch["observation.images"].shape[-4]):
+                cam_features = self.backbone(batch["observation.images"][:, cam_index])["feature_map"]
                # TODO(rcadene, alexander-soare): remove call to `.to` to speedup forward ; precompute and use
                # buffer
                cam_pos_embed = self.encoder_cam_feat_pos_embed(cam_features).to(dtype=cam_features.dtype)
--- a/lerobot/common/policies/diffusion/modeling_diffusion.py
+++ b/lerobot/common/policies/diffusion/modeling_diffusion.py
@@ -43,7 +43,13 @@ from lerobot.common.policies.utils import (
 )


-class DiffusionPolicy(nn.Module, PyTorchModelHubMixin):
+class DiffusionPolicy(
+    nn.Module,
+    PyTorchModelHubMixin,
+    library_name="lerobot",
+    repo_url="https://github.com/huggingface/lerobot",
+    tags=["robotics", "diffusion-policy"],
+):
    """
    Diffusion Policy as per "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion"
    (paper: https://arxiv.org/abs/2303.04137, code: https://github.com/real-stanford/diffusion_policy).
@@ -111,17 +117,18 @@ class DiffusionPolicy(nn.Module, PyTorchModelHubMixin):
        Schematically this looks like:
            ----------------------------------------------------------------------------------------------
            (legend: o = n_obs_steps, h = horizon, a = n_action_steps)
-            |timestep            | n-o+1 | n-o+2 | ..... | n     | ..... | n+a-1 | n+a   | ..... |n-o+1+h|
-            |observation is used | YES   | YES   | YES   | NO    | NO    | NO    | NO    | NO    | NO    |
+            |timestep            | n-o+1 | n-o+2 | ..... | n     | ..... | n+a-1 | n+a   | ..... | n-o+h |
+            |observation is used | YES   | YES   | YES   | YES   | NO    | NO    | NO    | NO    | NO    |
            |action is generated | YES   | YES   | YES   | YES   | YES   | YES   | YES   | YES   | YES   |
            |action is used      | NO    | NO    | NO    | YES   | YES   | YES   | NO    | NO    | NO    |
            ----------------------------------------------------------------------------------------------
-        Note that this means we require: `n_action_steps < horizon - n_obs_steps + 1`. Also, note that
+        Note that this means we require: `n_action_steps <= horizon - n_obs_steps + 1`. Also, note that
        "horizon" may not the best name to describe what the variable actually means, because this period is
        actually measured from the first observation which (if `n_obs_steps` > 1) happened in the past.
        """
        batch = self.normalize_inputs(batch)
        if len(self.expected_image_keys) > 0:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
            batch["observation.images"] = torch.stack([batch[k] for k in self.expected_image_keys], dim=-4)
        # Note: It's important that this happens after stacking the images into a single key.
        self._queues = populate_queues(self._queues, batch)
@@ -143,6 +150,7 @@ class DiffusionPolicy(nn.Module, PyTorchModelHubMixin):
        """Run the batch through the model and compute the loss for training or validation."""
        batch = self.normalize_inputs(batch)
        if len(self.expected_image_keys) > 0:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
            batch["observation.images"] = torch.stack([batch[k] for k in self.expected_image_keys], dim=-4)
        batch = self.normalize_targets(batch)
        loss = self.diffusion.compute_loss(batch)
--- a/lerobot/common/policies/factory.py
+++ b/lerobot/common/policies/factory.py
@@ -51,6 +51,11 @@ def get_policy_and_config_classes(name: str) -> tuple[Policy, object]:
        from lerobot.common.policies.tdmpc.modeling_tdmpc import TDMPCPolicy

        return TDMPCPolicy, TDMPCConfig
+    elif name == "tdmpc2":
+        from lerobot.common.policies.tdmpc2.configuration_tdmpc2 import TDMPC2Config
+        from lerobot.common.policies.tdmpc2.modeling_tdmpc2 import TDMPC2Policy
+
+        return TDMPC2Policy, TDMPC2Config
    elif name == "diffusion":
        from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig
        from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
--- a/lerobot/common/policies/normalize.py
+++ b/lerobot/common/policies/normalize.py
@@ -132,6 +132,7 @@ class Normalize(nn.Module):
    # TODO(rcadene): should we remove torch.no_grad?
    @torch.no_grad
    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
+        batch = dict(batch)  # shallow copy avoids mutating the input batch
        for key, mode in self.modes.items():
            buffer = getattr(self, "buffer_" + key.replace(".", "_"))

@@ -197,6 +198,7 @@ class Unnormalize(nn.Module):
    # TODO(rcadene): should we remove torch.no_grad?
    @torch.no_grad
    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
+        batch = dict(batch)  # shallow copy avoids mutating the input batch
        for key, mode in self.modes.items():
            buffer = getattr(self, "buffer_" + key.replace(".", "_"))

--- a/lerobot/common/policies/tdmpc/configuration_tdmpc.py
+++ b/lerobot/common/policies/tdmpc/configuration_tdmpc.py
@@ -25,12 +25,16 @@ class TDMPCConfig:
    camera observations.

    The parameters you will most likely need to change are the ones which depend on the environment / sensors.
-    Those are: `input_shapes`, `output_shapes`, and perhaps `max_random_shift`.
+    Those are: `input_shapes`, `output_shapes`, and perhaps `max_random_shift_ratio`.

    Args:
        n_action_repeats: The number of times to repeat the action returned by the planning. (hint: Google
            action repeats in Q-learning or ask your favorite chatbot)
        horizon: Horizon for model predictive control.
+        n_action_steps: Number of action steps to take from the plan given by model predictive control. This
+            is an alternative to using action repeats. If this is set to more than 1, then we require
+            `n_action_repeats == 1`, `use_mpc == True` and `n_action_steps <= horizon`. Note that this
+            approach of using multiple steps from the plan is not in the original implementation.
        input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents
            the input data name, and the value is a list indicating the dimensions of the corresponding data.
            For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96],
@@ -100,6 +104,7 @@ class TDMPCConfig:
    # Input / output structure.
    n_action_repeats: int = 2
    horizon: int = 5
+    n_action_steps: int = 1

    input_shapes: dict[str, list[int]] = field(
        default_factory=lambda: {
@@ -158,17 +163,18 @@ class TDMPCConfig:
        """Input validation (not exhaustive)."""
        # There should only be one image key.
        image_keys = {k for k in self.input_shapes if k.startswith("observation.image")}
-        if len(image_keys) != 1:
+        if len(image_keys) > 1:
            raise ValueError(
-                f"{self.__class__.__name__} only handles one image for now. Got image keys {image_keys}."
-            )
-        image_key = next(iter(image_keys))
-        if self.input_shapes[image_key][-2] != self.input_shapes[image_key][-1]:
-            # TODO(alexander-soare): This limitation is solely because of code in the random shift
-            # augmentation. It should be able to be removed.
-            raise ValueError(
-                f"Only square images are handled now. Got image shape {self.input_shapes[image_key]}."
+                f"{self.__class__.__name__} handles at most one image for now. Got image keys {image_keys}."
            )
+        if len(image_keys) > 0:
+            image_key = next(iter(image_keys))
+            if self.input_shapes[image_key][-2] != self.input_shapes[image_key][-1]:
+                # TODO(alexander-soare): This limitation is solely because of code in the random shift
+                # augmentation. It should be able to be removed.
+                raise ValueError(
+                    f"Only square images are handled now. Got image shape {self.input_shapes[image_key]}."
+                )
        if self.n_gaussian_samples <= 0:
            raise ValueError(
                f"The number of guassian samples for CEM should be non-zero. Got `{self.n_gaussian_samples=}`"
@@ -179,3 +185,12 @@ class TDMPCConfig:
                f"advised that you stick with the default. See {self.__class__.__name__} docstring for more "
                "information."
            )
+        if self.n_action_steps > 1:
+            if self.n_action_repeats != 1:
+                raise ValueError(
+                    "If `n_action_steps > 1`, `n_action_repeats` must be left to its default value of 1."
+                )
+            if not self.use_mpc:
+                raise ValueError("If `n_action_steps > 1`, `use_mpc` must be set to `True`.")
+            if self.n_action_steps > self.horizon:
+                raise ValueError("`n_action_steps` must be less than or equal to `horizon`.")
--- a/lerobot/common/policies/tdmpc/modeling_tdmpc.py
+++ b/lerobot/common/policies/tdmpc/modeling_tdmpc.py
@@ -19,14 +19,10 @@
 The comments in this code may sometimes refer to these references:
    TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://arxiv.org/abs/2203.04955)
    FOWM paper: Finetuning Offline World Models in the Real World (https://arxiv.org/abs/2310.16029)
-
-TODO(alexander-soare): Make rollout work for batch sizes larger than 1.
-TODO(alexander-soare): Use batch-first throughout.
 """

 # ruff: noqa: N806

-import logging
 from collections import deque
 from copy import deepcopy
 from functools import partial
@@ -45,7 +41,13 @@ from lerobot.common.policies.tdmpc.configuration_tdmpc import TDMPCConfig
 from lerobot.common.policies.utils import get_device_from_parameters, populate_queues


-class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
+class TDMPCPolicy(
+    nn.Module,
+    PyTorchModelHubMixin,
+    library_name="lerobot",
+    repo_url="https://github.com/huggingface/lerobot",
+    tags=["robotics", "tdmpc"],
+):
    """Implementation of TD-MPC learning + inference.

    Please note several warnings for this policy.
@@ -56,9 +58,11 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
            process communication to use the xarm environment from FOWM. This is because our xarm
            environment uses newer dependencies and does not match the environment in FOWM. See
            https://github.com/huggingface/lerobot/pull/103 for implementation details.
-        - We have NOT checked that training on LeRobot reproduces SOTA results. This is a TODO.
+        - We have NOT checked that training on LeRobot reproduces the results from FOWM.
+        - Nevertheless, we have verified that we can train TD-MPC for PushT. See
+          `lerobot/configs/policy/tdmpc_pusht_keypoints.yaml`.
        - Our current xarm datasets were generated using the environment from FOWM. Therefore they do not
-            match our xarm environment.
+          match our xarm environment.
    """

    name = "tdmpc"
@@ -74,22 +78,6 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
                that they will be passed with a call to `load_state_dict` before the policy is used.
        """
        super().__init__()
-        logging.warning(
-            """
-            Please note several warnings for this policy.
-
-            - Evaluation of pretrained weights created with the original FOWM code
-              (https://github.com/fyhMer/fowm) works as expected. To be precise: we trained and evaluated a
-              model with the FOWM code for the xarm_lift_medium_replay dataset. We ported the weights across
-              to LeRobot, and were able to evaluate with the same success metric. BUT, we had to use inter-
-              process communication to use the xarm environment from FOWM. This is because our xarm
-              environment uses newer dependencies and does not match the environment in FOWM. See
-              https://github.com/huggingface/lerobot/pull/103 for implementation details.
-            - We have NOT checked that training on LeRobot reproduces SOTA results. This is a TODO.
-            - Our current xarm datasets were generated using the environment from FOWM. Therefore they do not
-              match our xarm environment.
-            """
-        )

        if config is None:
            config = TDMPCConfig()
@@ -114,8 +102,14 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):

        image_keys = [k for k in config.input_shapes if k.startswith("observation.image")]
        # Note: This check is covered in the post-init of the config but have a sanity check just in case.
-        assert len(image_keys) == 1
-        self.input_image_key = image_keys[0]
+        self._use_image = False
+        self._use_env_state = False
+        if len(image_keys) > 0:
+            assert len(image_keys) == 1
+            self._use_image = True
+            self.input_image_key = image_keys[0]
+        if "observation.environment_state" in config.input_shapes:
+            self._use_env_state = True

        self.reset()

@@ -125,10 +119,13 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
        called on `env.reset()`
        """
        self._queues = {
-            "observation.image": deque(maxlen=1),
            "observation.state": deque(maxlen=1),
-            "action": deque(maxlen=self.config.n_action_repeats),
+            "action": deque(maxlen=max(self.config.n_action_steps, self.config.n_action_repeats)),
        }
+        if self._use_image:
+            self._queues["observation.image"] = deque(maxlen=1)
+        if self._use_env_state:
+            self._queues["observation.environment_state"] = deque(maxlen=1)
        # Previous mean obtained from the cross-entropy method (CEM) used during MPC. It is used to warm start
        # CEM for the next step.
        self._prev_mean: torch.Tensor | None = None
@@ -137,7 +134,9 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
    def select_action(self, batch: dict[str, Tensor]) -> Tensor:
        """Select a single action given environment observations."""
        batch = self.normalize_inputs(batch)
-        batch["observation.image"] = batch[self.input_image_key]
+        if self._use_image:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
+            batch["observation.image"] = batch[self.input_image_key]

        self._queues = populate_queues(self._queues, batch)

@@ -151,49 +150,57 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
                batch[key] = batch[key][:, 0]

            # NOTE: Order of observations matters here.
-            z = self.model.encode({k: batch[k] for k in ["observation.image", "observation.state"]})
-            if self.config.use_mpc:
-                batch_size = batch["observation.image"].shape[0]
-                # Batch processing is not handled in MPC mode, so process the batch in a loop.
-                action = []  # will be a batch of actions for one step
-                for i in range(batch_size):
-                    # Note: self.plan does not handle batches, hence the squeeze.
-                    action.append(self.plan(z[i]))
-                action = torch.stack(action)
+            encode_keys = []
+            if self._use_image:
+                encode_keys.append("observation.image")
+            if self._use_env_state:
+                encode_keys.append("observation.environment_state")
+            encode_keys.append("observation.state")
+            z = self.model.encode({k: batch[k] for k in encode_keys})
+            if self.config.use_mpc:  # noqa: SIM108
+                actions = self.plan(z)  # (horizon, batch, action_dim)
            else:
-                # Plan with the policy (π) alone.
-                action = self.model.pi(z)
+                # Plan with the policy (π) alone. This always returns one action so unsqueeze to get a
+                # sequence dimension like in the MPC branch.
+                actions = self.model.pi(z).unsqueeze(0)

-            self.unnormalize_outputs({"action": action})["action"]
+            actions = torch.clamp(actions, -1, +1)

-            for _ in range(self.config.n_action_repeats):
-                self._queues["action"].append(action)
+            actions = self.unnormalize_outputs({"action": actions})["action"]
+
+            if self.config.n_action_repeats > 1:
+                for _ in range(self.config.n_action_repeats):
+                    self._queues["action"].append(actions[0])
+            else:
+                # `actions` is (steps, batch_size, action_dim); queue the first `n_action_steps` steps.
+                self._queues["action"].extend(actions[: self.config.n_action_steps])

        action = self._queues["action"].popleft()
-        return torch.clamp(action, -1, 1)
+        return action

    @torch.no_grad()
    def plan(self, z: Tensor) -> Tensor:
-        """Plan next action using TD-MPC inference.
+        """Plan sequence of actions using TD-MPC inference.

        Args:
-            z: (latent_dim,) tensor for the initial state.
+            z: (batch, latent_dim,) tensor for the initial state.
        Returns:
-            (action_dim,) tensor for the next action.
-
-        TODO(alexander-soare) Extend this to be able to work with batches.
+            (horizon, batch, action_dim,) tensor for the planned trajectory of actions.
        """
        device = get_device_from_parameters(self)

+        batch_size = z.shape[0]
+
        # Sample Nπ trajectories from the policy.
        pi_actions = torch.empty(
            self.config.horizon,
            self.config.n_pi_samples,
+            batch_size,
            self.config.output_shapes["action"][0],
            device=device,
        )
        if self.config.n_pi_samples > 0:
-            _z = einops.repeat(z, "d -> n d", n=self.config.n_pi_samples)
+            _z = einops.repeat(z, "b d -> n b d", n=self.config.n_pi_samples)
            for t in range(self.config.horizon):
                # Note: Adding a small amount of noise here doesn't hurt during inference and may even be
                # helpful for CEM.
@@ -202,12 +209,14 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):

        # In the CEM loop we will need this for a call to estimate_value with the gaussian sampled
        # trajectories.
-        z = einops.repeat(z, "d -> n d", n=self.config.n_gaussian_samples + self.config.n_pi_samples)
+        z = einops.repeat(z, "b d -> n b d", n=self.config.n_gaussian_samples + self.config.n_pi_samples)

        # Model Predictive Path Integral (MPPI) with the cross-entropy method (CEM) as the optimization
        # algorithm.
        # The initial mean and standard deviation for the cross-entropy method (CEM).
-        mean = torch.zeros(self.config.horizon, self.config.output_shapes["action"][0], device=device)
+        mean = torch.zeros(
+            self.config.horizon, batch_size, self.config.output_shapes["action"][0], device=device
+        )
        # Maybe warm start CEM with the mean from the previous step.
        if self._prev_mean is not None:
            mean[:-1] = self._prev_mean[1:]
@@ -218,6 +227,7 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
            std_normal_noise = torch.randn(
                self.config.horizon,
                self.config.n_gaussian_samples,
+                batch_size,
                self.config.output_shapes["action"][0],
                device=std.device,
            )
@@ -226,21 +236,24 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
            # Compute elite actions.
            actions = torch.cat([gaussian_actions, pi_actions], dim=1)
            value = self.estimate_value(z, actions).nan_to_num_(0)
-            elite_idxs = torch.topk(value, self.config.n_elites, dim=0).indices
-            elite_value, elite_actions = value[elite_idxs], actions[:, elite_idxs]
+            elite_idxs = torch.topk(value, self.config.n_elites, dim=0).indices  # (n_elites, batch)
+            elite_value = value.take_along_dim(elite_idxs, dim=0)  # (n_elites, batch)
+            # (horizon, n_elites, batch, action_dim)
+            elite_actions = actions.take_along_dim(einops.rearrange(elite_idxs, "n b -> 1 n b 1"), dim=1)

-            # Update guassian PDF parameters to be the (weighted) mean and standard deviation of the elites.
-            max_value = elite_value.max(0)[0]
+            # Update gaussian PDF parameters to be the (weighted) mean and standard deviation of the elites.
+            max_value = elite_value.max(0, keepdim=True)[0]  # (1, batch)
            # The weighting is a softmax over trajectory values. Note that this is not the same as the usage
            # of Ω in eqn 4 of the TD-MPC paper. Instead it is the normalized version of it: s = Ω/ΣΩ. This
            # makes the equations: μ = Σ(s⋅Γ), σ = Σ(s⋅(Γ-μ)²).
            score = torch.exp(self.config.elite_weighting_temperature * (elite_value - max_value))
-            score /= score.sum()
-            _mean = torch.sum(einops.rearrange(score, "n -> n 1") * elite_actions, dim=1)
+            score /= score.sum(axis=0, keepdim=True)
+            # (horizon, batch, action_dim)
+            _mean = torch.sum(einops.rearrange(score, "n b -> n b 1") * elite_actions, dim=1)
            _std = torch.sqrt(
                torch.sum(
-                    einops.rearrange(score, "n -> n 1")
-                    * (elite_actions - einops.rearrange(_mean, "h d -> h 1 d")) ** 2,
+                    einops.rearrange(score, "n b -> n b 1")
+                    * (elite_actions - einops.rearrange(_mean, "h b d -> h 1 b d")) ** 2,
                    dim=1,
                )
            )
@@ -255,11 +268,9 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):

        # Randomly select one of the elite actions from the last iteration of MPPI/CEM using the softmax
        # scores from the last iteration.
-        actions = elite_actions[:, torch.multinomial(score, 1).item()]
+        actions = elite_actions[:, torch.multinomial(score.T, 1).squeeze(), torch.arange(batch_size)]

-        # Select only the first action
-        action = actions[0]
-        return action
+        return actions

    @torch.no_grad()
    def estimate_value(self, z: Tensor, actions: Tensor):
@@ -311,12 +322,17 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
            G -= running_discount * self.config.uncertainty_regularizer_coeff * terminal_values.std(0)
        return G

-    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
-        """Run the batch through the model and compute the loss."""
+    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]:
+        """Run the batch through the model and compute the loss.
+
+        Returns a dictionary with loss as a tensor, and other information as native floats.
+        """
        device = get_device_from_parameters(self)

        batch = self.normalize_inputs(batch)
-        batch["observation.image"] = batch[self.input_image_key]
+        if self._use_image:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
+            batch["observation.image"] = batch[self.input_image_key]
        batch = self.normalize_targets(batch)

        info = {}
@@ -326,12 +342,12 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
            if batch[key].ndim > 1:
                batch[key] = batch[key].transpose(1, 0)

-        action = batch["action"]  # (t, b)
-        reward = batch["next.reward"]  # (t,)
+        action = batch["action"]  # (t, b, action_dim)
+        reward = batch["next.reward"]  # (t, b)
        observations = {k: v for k, v in batch.items() if k.startswith("observation.")}

        # Apply random image augmentations.
-        if self.config.max_random_shift_ratio > 0:
+        if self._use_image and self.config.max_random_shift_ratio > 0:
            observations["observation.image"] = flatten_forward_unflatten(
                partial(random_shifts_aug, max_random_shift_ratio=self.config.max_random_shift_ratio),
                observations["observation.image"],
@@ -343,7 +359,9 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
        for k in observations:
            current_observation[k] = observations[k][0]
            next_observations[k] = observations[k][1:]
-        horizon = next_observations["observation.image"].shape[0]
+        horizon, batch_size = next_observations[
+            "observation.image" if self._use_image else "observation.environment_state"
+        ].shape[:2]

        # Run latent rollout using the latent dynamics model and policy model.
        # Note this has shape `horizon+1` because there are `horizon` actions and a current `z`. Each action
@@ -413,7 +431,8 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
        # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble.
        q_value_loss = (
            (
-                F.mse_loss(
+                temporal_loss_coeffs
+                * F.mse_loss(
                    q_preds_ensemble,
                    einops.repeat(q_targets, "t b -> e t b", e=q_preds_ensemble.shape[0]),
                    reduction="none",
@@ -462,10 +481,11 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
        action_preds = self.model.pi(z_preds[:-1])  # (t, b, a)
        # Calculate the MSE between the actions and the action predictions.
        # Note: FOWM's original code calculates the log probability (wrt to a unit standard deviation
-        # gaussian) and sums over the action dimension. Computing the log probability amounts to multiplying
-        # the MSE by 0.5 and adding a constant offset (the log(2*pi) term) . Here we drop the constant offset
-        # as it doesn't change the optimization step, and we drop the 0.5 as we instead make a configuration
-        # parameter for it (see below where we compute the total loss).
+        # gaussian) and sums over the action dimension. Computing the (negative) log probability amounts to
+        # multiplying the MSE by 0.5 and adding a constant offset (the log(2*pi)/2 term, times the action
+        # dimension). Here we drop the constant offset as it doesn't change the optimization step, and we drop
+        # the 0.5 as we instead make a configuration parameter for it (see below where we compute the total
+        # loss).
        mse = F.mse_loss(action_preds, action, reduction="none").sum(-1)  # (t, b)
        # NOTE: The original implementation does not take the sum over the temporal dimension like with the
        # other losses.
@@ -726,6 +746,16 @@ class TDMPCObservationEncoder(nn.Module):
                nn.LayerNorm(config.latent_dim),
                nn.Sigmoid(),
            )
+        if "observation.environment_state" in config.input_shapes:
+            self.env_state_enc_layers = nn.Sequential(
+                nn.Linear(
+                    config.input_shapes["observation.environment_state"][0], config.state_encoder_hidden_dim
+                ),
+                nn.ELU(),
+                nn.Linear(config.state_encoder_hidden_dim, config.latent_dim),
+                nn.LayerNorm(config.latent_dim),
+                nn.Sigmoid(),
+            )

    def forward(self, obs_dict: dict[str, Tensor]) -> Tensor:
        """Encode the image and/or state vector.
@@ -734,8 +764,11 @@ class TDMPCObservationEncoder(nn.Module):
        over all features.
        """
        feat = []
+        # NOTE: Order of observations matters here.
        if "observation.image" in self.config.input_shapes:
            feat.append(flatten_forward_unflatten(self.image_enc_layers, obs_dict["observation.image"]))
+        if "observation.environment_state" in self.config.input_shapes:
+            feat.append(self.env_state_enc_layers(obs_dict["observation.environment_state"]))
        if "observation.state" in self.config.input_shapes:
            feat.append(self.state_enc_layers(obs_dict["observation.state"]))
        return torch.stack(feat, dim=0).mean(0)
--- a/lerobot/common/policies/tdmpc2/configuration_tdmpc2.py
+++ b/lerobot/common/policies/tdmpc2/configuration_tdmpc2.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python
+
+# Copyright 2024 Nicklas Hansen, Xiaolong Wang, Hao Su,
+# and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass, field
+
+
+@dataclass
+class TDMPC2Config:
+    """Configuration class for TDMPC2Policy.
+
+    Defaults are configured for training with xarm_lift_medium_replay providing proprioceptive and single
+    camera observations.
+
+    The parameters you will most likely need to change are the ones which depend on the environment / sensors.
+    Those are: `input_shapes`, `output_shapes`, and perhaps `max_random_shift_ratio`.
+
+    Args:
+        n_action_repeats: The number of times to repeat the action returned by the planning. (hint: Google
+            action repeats in Q-learning or ask your favorite chatbot)
+        horizon: Horizon for model predictive control.
+        n_action_steps: Number of action steps to take from the plan given by model predictive control. This
+            is an alternative to using action repeats. If this is set to more than 1, then we require
+            `n_action_repeats == 1`, `use_mpc == True` and `n_action_steps <= horizon`. Note that this
+            approach of using multiple steps from the plan is not in the original implementation.
+        input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents
+            the input data name, and the value is a list indicating the dimensions of the corresponding data.
+            For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96],
+            indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't
+            include batch dimension or temporal dimension.
+        output_shapes: A dictionary defining the shapes of the output data for the policy. The key represents
+            the output data name, and the value is a list indicating the dimensions of the corresponding data.
+            For example, "action" refers to an output shape of [14], indicating 14-dimensional actions.
+            Importantly, `output_shapes` doesn't include batch dimension or temporal dimension.
+        input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"),
+            and the value specifies the normalization mode to apply. The two available modes are "mean_std"
+            which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a
+            [-1, 1] range. Note that here this defaults to None meaning inputs are not normalized. This is to
+            match the original implementation.
+        output_normalization_modes: Similar dictionary as `input_normalization_modes`, but to unnormalize to the
+            original scale. Note that this is also used for normalizing the training targets. NOTE: Clipping
+            to [-1, +1] is used during MPPI/CEM. Therefore, it is recommended that you stick with "min_max"
+            normalization mode here.
+        image_encoder_hidden_dim: Number of channels for the convolutional layers used for image encoding.
+        state_encoder_hidden_dim: Hidden dimension for MLP used for state vector encoding.
+        latent_dim: Observation's latent embedding dimension.
+        q_ensemble_size: Number of Q function estimators to use in an ensemble for uncertainty estimation.
+        mlp_dim: Hidden dimension of MLPs used for modelling the dynamics encoder, reward function, policy
+            (π), Q ensemble, and V.
+        discount: Discount factor (γ) to use for the reinforcement learning formalism.
+        use_mpc: Whether to use model predictive control. The alternative is to just sample the policy model
+            (π) for each step.
+        cem_iterations: Number of iterations for the MPPI/CEM loop in MPC.
+        max_std: Maximum standard deviation for actions sampled from the gaussian PDF in CEM.
+        min_std: Minimum standard deviation for noise applied to actions sampled from the policy model (π).
+            Doubles up as the minimum standard deviation for actions sampled from the gaussian PDF in CEM.
+        n_gaussian_samples: Number of samples to draw from the gaussian distribution every CEM iteration. Must
+            be non-zero.
+        n_pi_samples: Number of samples to draw from the policy / world model rollout every CEM iteration. Can
+            be zero.
+        uncertainty_regularizer_coeff: Coefficient for the uncertainty regularization used when estimating
+            trajectory values (this is the λ coefficient in eqn 4 of FOWM).
+        n_elites: The number of elite samples to use for updating the gaussian parameters every CEM iteration.
+        elite_weighting_temperature: The temperature to use for softmax weighting (by trajectory value) of the
+            elites, when updating the gaussian parameters for CEM.
+        gaussian_mean_momentum: Momentum (α) used for EMA updates of the mean parameter μ of the gaussian
+            parameters optimized in CEM. Updates are calculated as μ⁻ ← αμ⁻ + (1-α)μ.
+        max_random_shift_ratio: Maximum random shift (as a proportion of the image size) to apply to the
+            image(s) (in units of pixels) for training-time augmentation. If set to 0, no such augmentation
+            is applied. Note that the input images are assumed to be square for this augmentation.
+        reward_coef: Loss weighting coefficient for the reward regression loss.
+        expectile_weight: Weighting (τ) used in expectile regression for the state value function (V).
+            v_pred < v_target is weighted by τ and v_pred >= v_target is weighted by (1-τ). τ is expected to
+            be in [0, 1]. Setting τ closer to 1 results in a more "optimistic" V. This is sensible to do
+            because v_target is obtained by evaluating the learned state-action value functions (Q) with
+            in-sample actions that may not be always optimal.
+        value_coef: Loss weighting coefficient for both the state-action value (Q) TD loss, and the state
+            value (V) expectile regression loss.
+        consistency_coeff: Loss weighting coefficient for the consistency loss.
+        advantage_scaling: A factor by which the advantages are scaled prior to exponentiation for advantage
+            weighted regression of the policy (π) estimator parameters. Note that the exponentiated advantages
+            are clamped at 100.0.
+        pi_coeff: Loss weighting coefficient for the action regression loss.
+        temporal_decay_coeff: Exponential decay coefficient for decaying the loss coefficient for future time-
+            steps. Hint: each loss computation involves `horizon` steps worth of actions starting from the
+            current time step.
+        target_model_momentum: Momentum (α) used for EMA updates of the target models. Updates are calculated
+            as ϕ ← αϕ + (1-α)θ where ϕ are the parameters of the target model and θ are the parameters of the
+            model being trained.
+    """
+
+    num_bins = 101
+    vmin = -10
+    vmax = +10
+    rho: float = 0.5
+    tau: float = 0.01
+    simnorm_dim: int = 8
+
+    # Input / output structure.
+    n_action_repeats: int = 2
+    horizon: int = 5
+    n_action_steps: int = 1
+
+    input_shapes: dict[str, list[int]] = field(
+        default_factory=lambda: {
+            "observation.image": [3, 64, 64],
+            "observation.state": [4],
+        }
+    )
+    output_shapes: dict[str, list[int]] = field(
+        default_factory=lambda: {
+            "action": [4],
+        }
+    )
+
+    # Normalization / Unnormalization
+    input_normalization_modes: dict[str, str] | None = None
+    output_normalization_modes: dict[str, str] = field(
+        default_factory=lambda: {"action": "min_max"},
+    )
+
+    # Architecture / modeling.
+    # Neural networks.
+    image_encoder_hidden_dim: int = 32
+    state_encoder_hidden_dim: int = 256
+    latent_dim: int = 8 #50
+    q_ensemble_size: int = 5
+    mlp_dim: int = 512
+    # Reinforcement learning.
+    discount: float = 0.9
+    lr: float = 3e-4
+    enc_lr_scale: float = 0.3
+
+    num_q: int = 5
+    dropout: float = 0.01
+
+    num_channels = 32
+    num_enc_layers = 2
+    enc_dim = 256
+
+    # Inference.
+    use_mpc: bool = True
+    cem_iterations: int = 6
+    max_std: float = 2.0
+    min_std: float = 0.05
+    n_gaussian_samples: int = 512
+    n_pi_samples: int = 51
+    uncertainty_regularizer_coeff: float = 1.0
+    n_elites: int = 50
+    elite_weighting_temperature: float = 0.5
+    gaussian_mean_momentum: float = 0.1
+
+    # Training and loss computation.
+    grad_clip_norm: float = 20
+
+    max_random_shift_ratio: float = 0.0476
+    # Loss coefficients.
+    consistency_coef: float = 20
+    entropy_coef: float = 1e-4
+
+    reward_coef: float = 0.1
+    expectile_weight: float = 0.9
+    value_coef: float = 0.1
+    consistency_coeff: float = 20.0
+    advantage_scaling: float = 3.0
+    pi_coeff: float = 0.5
+    temporal_decay_coeff: float = 0.5
+    # Target model.
+    target_model_momentum: float = 0.995
+
+    def __post_init__(self):
+        """Validate the configuration (not exhaustive). Raises `ValueError` on unsupported settings."""
+        # There should only be one image key.
+        image_keys = {k for k in self.input_shapes if k.startswith("observation.image")}
+        if len(image_keys) > 1:
+            raise ValueError(
+                f"{self.__class__.__name__} handles at most one image for now. Got image keys {image_keys}."
+            )
+        if len(image_keys) > 0:
+            image_key = next(iter(image_keys))
+            if self.input_shapes[image_key][-2] != self.input_shapes[image_key][-1]:
+                # TODO(alexander-soare): This limitation is solely because of code in the random shift
+                # augmentation. It should be able to be removed.
+                raise ValueError(
+                    f"Only square images are handled now. Got image shape {self.input_shapes[image_key]}."
+                )
+        if self.n_gaussian_samples <= 0:
+            raise ValueError(
+                f"The number of gaussian samples for CEM should be positive. Got `{self.n_gaussian_samples=}`"
+            )
+        if self.output_normalization_modes != {"action": "min_max"}:
+            raise ValueError(
+                "TD-MPC assumes the action space dimensions to all be in [-1, 1]. Therefore it is strongly "
+                f"advised that you stick with the default. See {self.__class__.__name__} docstring for more "
+                "information."
+            )
+        if self.n_action_steps > 1:
+            # NOTE: `n_action_repeats` defaults to 2 in this config, so it must be overridden explicitly.
+            if self.n_action_repeats != 1:
+                raise ValueError("If `n_action_steps > 1`, `n_action_repeats` must be set to 1.")
+
+            if not self.use_mpc:
+                raise ValueError("If `n_action_steps > 1`, `use_mpc` must be set to `True`.")
+            if self.n_action_steps > self.horizon:
+                raise ValueError("`n_action_steps` must be less than or equal to `horizon`.")
--- a/lerobot/common/policies/tdmpc2/modeling_tdmpc2.py
+++ b/lerobot/common/policies/tdmpc2/modeling_tdmpc2.py
@@ -0,0 +1,727 @@
+#!/usr/bin/env python
+
+# Copyright 2024 Nicklas Hansen, Xiaolong Wang, Hao Su,
+# and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Implementation of the TD-MPC2 policy.
+
+The comments in this code may sometimes refer to these references:
+    TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://arxiv.org/abs/2203.04955)
+    FOWM paper: Finetuning Offline World Models in the Real World (https://arxiv.org/abs/2310.16029)
+"""
+
+# ruff: noqa: N806
+
+import logging
+from collections import deque
+from copy import deepcopy
+from functools import partial
+from typing import Callable
+
+import einops
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F  # noqa: N812
+from huggingface_hub import PyTorchModelHubMixin
+from torch import Tensor
+
+import lerobot.common.policies.tdmpc2.tdmpc2_utils as utils
+from lerobot.common.policies.normalize import Normalize, Unnormalize
+from lerobot.common.policies.tdmpc2.configuration_tdmpc2 import TDMPC2Config
+from lerobot.common.policies.utils import get_device_from_parameters, populate_queues
+
+
+class TDMPC2Policy(nn.Module, PyTorchModelHubMixin):
+    """Implementation of TD-MPC2 learning + inference.
+
+    Please note several warnings for this policy.
+        - We have NOT checked that training on LeRobot reproduces SOTA results. This is a TODO.
+    """
+
+    name = "tdmpc2"
+
+    def __init__(
+        self, config: TDMPC2Config | None = None, dataset_stats: dict[str, dict[str, Tensor]] | None = None
+    ):
+        """
+        Args:
+            config: Policy configuration class instance or None, in which case the default instantiation of
+                the configuration class is used.
+            dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
+                that they will be passed with a call to `load_state_dict` before the policy is used.
+        """
+        super().__init__()
+        logging.warning(
+            """
+            Please note several warnings for this policy.
+            - We have NOT checked that training on LeRobot reproduces SOTA results. This is a TODO.
+            """
+        )
+
+        if config is None:
+            config = TDMPC2Config()
+        self.config = config
+        self.model = TDMPC2TOLD(config)
+
+        if config.input_normalization_modes is not None:
+            self.normalize_inputs = Normalize(
+                config.input_shapes, config.input_normalization_modes, dataset_stats
+            )
+        else:
+            self.normalize_inputs = nn.Identity()
+        self.normalize_targets = Normalize(
+            config.output_shapes, config.output_normalization_modes, dataset_stats
+        )
+        self.unnormalize_outputs = Unnormalize(
+            config.output_shapes, config.output_normalization_modes, dataset_stats
+        )
+
+        image_keys = [k for k in config.input_shapes if k.startswith("observation.image")]
+        # Note: This check is covered in the post-init of the config but have a sanity check just in case.
+        self._use_image = False
+        self._use_env_state = False
+        if len(image_keys) > 0:
+            assert len(image_keys) == 1
+            self._use_image = True
+            self.input_image_key = image_keys[0]
+        if "observation.environment_state" in config.input_shapes:
+            self._use_env_state = True
+
+        self.scale = utils.RunningScale(self.config)
+
+        self.queue_keys = None
+
+        self.reset()
+
+    def reset(self):
+        """
+        Clear observation and action queues. Clear previous means for warm starting of MPPI/CEM. Should be
+        called on `env.reset()`
+        """
+        self._queues = {
+            "observation.state": deque(maxlen=1),
+            "action": deque(maxlen=max(self.config.n_action_steps, self.config.n_action_repeats)),
+        }
+        if self._use_image:
+            self._queues["observation.image"] = deque(maxlen=1)
+        if self._use_env_state:
+            self._queues["observation.environment_state"] = deque(maxlen=1)
+        # Previous mean obtained from the cross-entropy method (CEM) used during MPC. It is used to warm start
+        # CEM for the next step.
+        self._prev_mean: torch.Tensor | None = None
+
+    @torch.no_grad()
+    def select_action(self, batch: dict[str, Tensor]) -> Tensor:
+        """Select a single action given environment observations."""
+        batch = self.normalize_inputs(batch)
+        if self._use_image:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
+            batch["observation.image"] = batch[self.input_image_key]
+
+        self._queues = populate_queues(self._queues, batch)
+
+        if self.queue_keys is None:
+            self.queue_keys = [k for k in batch if k in self._queues]
+
+        # When the action queue is depleted, populate it again by querying the policy.
+        if len(self._queues["action"]) == 0:
+            batch = {key: torch.stack(list(self._queues[key]), dim=1) for key in self.queue_keys}
+
+            # Remove the time dimensions as it is not handled yet.
+            for key in batch:
+                assert batch[key].shape[1] == 1
+                batch[key] = batch[key][:, 0]
+
+            # NOTE: Order of observations matters here.
+            encode_keys = []
+            if self._use_image:
+                encode_keys.append("observation.image")
+            if self._use_env_state:
+                encode_keys.append("observation.environment_state")
+            encode_keys.append("observation.state")
+
+            z = self.model.encode({k: batch[k] for k in encode_keys})
+
+            if self.config.use_mpc:  # noqa: SIM108
+                actions = self.plan(z)  # (horizon, batch, action_dim)
+            else:
+                # Plan with the policy (π) alone. `pi` returns a tuple whose element 1 is the action (as used
+                # in `estimate_value` and `forward`); unsqueeze to get a sequence dimension like in MPC.
+                actions = self.model.pi(z)[1].unsqueeze(0)
+
+            actions = torch.clamp(actions, -1, +1)
+
+            actions = self.unnormalize_outputs({"action": actions})["action"]
+
+            if self.config.n_action_repeats > 1:
+                for _ in range(self.config.n_action_repeats):
+                    self._queues["action"].append(actions[0])
+            else:
+                # Queue the first `n_action_steps` planned actions; each entry is (batch_size, action_dim).
+                self._queues["action"].extend(
+                    actions[: self.config.n_action_steps]
+                )  # NOTE: taking several steps from one plan is not in the original implementation.
+
+        action = self._queues["action"].popleft()
+        return action
+
+    @torch.no_grad()
+    def plan(self, z: Tensor) -> Tensor:
+        """Plan sequence of actions using TD-MPC inference.
+
+        Args:
+            z: (batch, latent_dim,) tensor for the initial state.
+        Returns:
+            (horizon, batch, action_dim,) tensor for the planned trajectory of actions.
+        """
+        device = get_device_from_parameters(self)
+
+        batch_size = z.shape[0]
+
+        # Sample Nπ trajectories from the policy.
+        pi_actions = torch.empty(
+            self.config.horizon,
+            self.config.n_pi_samples,
+            batch_size,
+            self.config.output_shapes["action"][0],
+            device=device,
+        )
+        if self.config.n_pi_samples > 0:
+            _z = einops.repeat(z, "b d -> n b d", n=self.config.n_pi_samples)
+            for t in range(self.config.horizon):
+                # Note: Adding a small amount of noise here doesn't hurt during inference and may even be
+                # helpful for CEM.
+                pi_actions[t] = self.model.pi_action(_z)
+                _z = self.model.latent_dynamics(_z, pi_actions[t])
+
+        # In the CEM loop we will need this for a call to estimate_value with the gaussian sampled
+        # trajectories.
+        z = einops.repeat(z, "b d -> n b d", n=self.config.n_gaussian_samples + self.config.n_pi_samples)
+
+        # Model Predictive Path Integral (MPPI) with the cross-entropy method (CEM) as the optimization
+        # algorithm.
+        # The initial mean and standard deviation for the cross-entropy method (CEM).
+        mean = torch.zeros(
+            self.config.horizon, batch_size, self.config.output_shapes["action"][0], device=device
+        )
+        # Maybe warm start CEM with the mean from the previous step.
+        if self._prev_mean is not None:
+            mean[:-1] = self._prev_mean[1:]
+        std = self.config.max_std * torch.ones_like(mean)
+
+        for _ in range(self.config.cem_iterations):
+            # Randomly sample action trajectories for the gaussian distribution.
+            std_normal_noise = torch.randn(
+                self.config.horizon,
+                self.config.n_gaussian_samples,
+                batch_size,
+                self.config.output_shapes["action"][0],
+                device=std.device,
+            )
+            gaussian_actions = torch.clamp(mean.unsqueeze(1) + std.unsqueeze(1) * std_normal_noise, -1, 1)
+
+            # Compute elite actions.
+            actions = torch.cat([gaussian_actions, pi_actions], dim=1)
+            value = self.estimate_value(z, actions).nan_to_num_(0).squeeze(-1)
+            elite_idxs = torch.topk(value, self.config.n_elites, dim=0).indices  # (n_elites, batch)
+            # NOTE: the `topk` above requires n_elites <= n_gaussian_samples + n_pi_samples.
+            elite_value = value.take_along_dim(elite_idxs, dim=0)  # (n_elites, batch)
+            # (horizon, n_elites, batch, action_dim)
+            elite_actions = actions.take_along_dim(einops.rearrange(elite_idxs, "n b -> 1 n b 1"), dim=1)
+
+            # Update gaussian PDF parameters to be the (weighted) mean and standard deviation of the elites.
+            max_value = elite_value.max(0, keepdim=True)[0]  # (1, batch)
+            # The weighting is a softmax over trajectory values. Note that this is not the same as the usage
+            # of Ω in eqn 4 of the TD-MPC paper. Instead it is the normalized version of it: s = Ω/ΣΩ. This
+            # makes the equations: μ = Σ(s⋅Γ), σ = Σ(s⋅(Γ-μ)²).
+            score = torch.exp(self.config.elite_weighting_temperature * (elite_value - max_value))
+            score /= score.sum(axis=0, keepdim=True)
+            # (horizon, batch, action_dim)
+            _mean = torch.sum(einops.rearrange(score, "n b -> n b 1") * elite_actions, dim=1)
+            _std = torch.sqrt(
+                torch.sum(
+                    einops.rearrange(score, "n b -> n b 1")
+                    * (elite_actions - einops.rearrange(_mean, "h b d -> h 1 b d")) ** 2,
+                    dim=1,
+                )
+            )
+            # Update mean with an exponential moving average, and std with a direct replacement.
+            mean = (
+                self.config.gaussian_mean_momentum * mean + (1 - self.config.gaussian_mean_momentum) * _mean
+            )
+            std = _std.clamp_(self.config.min_std, self.config.max_std)
+
+        # Keep track of the mean for warm-starting subsequent steps.
+        self._prev_mean = mean
+
+        # Randomly select one of the elite actions from the last iteration of MPPI/CEM using the softmax
+        # scores from the last iteration.
+        actions = elite_actions[:, torch.multinomial(score.T, 1).squeeze(), torch.arange(batch_size)]
+
+        return actions
+
+    @torch.no_grad()
+    def estimate_value(self, z, actions):
+        """Return discounted predicted rewards over `horizon` steps plus a discounted terminal Q estimate."""
+        G, discount = 0, 1
+        for t in range(self.config.horizon):
+            reward = utils.two_hot_inv(self.model._reward(torch.cat([z, actions[t]], dim=-1)), self.config)
+            z = self.model.next(z, actions[t])  # presumably one latent dynamics step — TODO confirm
+            G += discount * reward
+            discount *= self.config.discount
+        return G + discount * self.model.Qs(z, self.model.pi(z)[1], return_type="avg")
+
+    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]:
+        """Run the batch through the model and compute the loss.
+
+        Returns a dictionary with loss as a tensor, and other information as native floats.
+        """
+        device = get_device_from_parameters(self)
+
+        batch = self.normalize_inputs(batch)
+        if self._use_image:
+            batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
+            batch["observation.image"] = batch[self.input_image_key]
+        batch = self.normalize_targets(batch)
+
+        # (b, t) -> (t, b)
+        for key in batch:
+            if batch[key].ndim > 1:
+                batch[key] = batch[key].transpose(1, 0)
+
+        action = batch["action"]  # (t, b, action_dim)
+        reward = batch["next.reward"]  # (t, b)
+        reward = reward.unsqueeze(-1)  # (t, b, 1)
+        observations = {k: v for k, v in batch.items() if k.startswith("observation.")}
+
+        # Apply random image augmentations.
+        if self._use_image and self.config.max_random_shift_ratio > 0:
+            observations["observation.image"] = flatten_forward_unflatten(
+                partial(random_shifts_aug, max_random_shift_ratio=self.config.max_random_shift_ratio),
+                observations["observation.image"],
+            )
+
+        # Get the current observation for predicting trajectories, and all future observations for use in
+        # the latent consistency loss and TD loss.
+        current_observation, next_observations = {}, {}
+        for k in observations:
+            current_observation[k] = observations[k][0]
+            next_observations[k] = observations[k][1:]
+        horizon, batch_size = next_observations[
+            "observation.image" if self._use_image else "observation.environment_state"
+        ].shape[:2]
+
+        # Compute targets
+        with torch.no_grad():
+            next_z = self.model.encode(next_observations)
+            curr_z = self.model.encode(current_observation).unsqueeze(
+                0
+            )  # TODO: not necessary to do the whole thing. NOTE: under no_grad => encoder gets no gradient.
+            # get the next targets # _td_target in the original code
+            pi = self.model.pi(next_z)[1]
+            discount = self.config.discount
+
+            td_targets = reward + discount * self.model.Qs(next_z, pi, return_type="min", target=True)
+
+        # self.model.train()
+
+        # Latent rollout
+        zs = torch.empty(self.config.horizon + 1, batch_size, self.config.latent_dim, device=device)
+        zs[0] = z = curr_z[0]
+        consistency_loss = 0
+        for t in range(self.config.horizon):
+            x = torch.cat([z, action[t]], dim=-1)
+            z = self.model._dynamics(x)
+            consistency_loss += F.mse_loss(z, next_z[t]) * self.config.rho**t
+            zs[t + 1] = z
+
+        # Predictions
+        _zs = zs[:-1]
+        qs = self.model.Qs(_zs, action, return_type="all")
+        reward_preds = self.model._reward(torch.cat([_zs, action], dim=-1))
+
+        # Compute losses
+        reward_loss, value_loss = 0, 0
+        for t in range(self.config.horizon):
+            reward_loss += utils.soft_ce(reward_preds[t], reward[t], self.config).mean() * self.config.rho**t
+            for q in range(self.config.num_q):
+                value_loss += utils.soft_ce(qs[q][t], td_targets[t], self.config).mean() * self.config.rho**t
+        consistency_loss *= 1 / self.config.horizon
+        reward_loss *= 1 / self.config.horizon
+        value_loss *= 1 / (self.config.horizon * self.config.num_q)
+
+        ############################ deviation from NHansen
+        # total_loss = (
+        #    self.config.consistency_coef * consistency_loss
+        #    + self.config.reward_coef * reward_loss
+        #    + self.config.value_coef * value_loss
+        # )
+
+        # Update model########################
+        # total_loss.backward()
+        # grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_clip_norm)
+        # self.optim.step()
+        ##########################################
+
+        # Deviation from Hansen, since the optimizer step is called in train.py
+        # Update the policy using a sequence of latent states.
+        zs_for_pi = zs.detach()
+        # self.pi_optim.zero_grad(set_to_none=True)
+        self.model.track_q_grad(False)
+        _, pis, log_pis, _ = self.model.pi(zs_for_pi)
+        qs = self.model.Qs(zs_for_pi, pis, return_type="avg")
+        self.scale.update(qs[0])
+        qs = self.scale(qs)
+
+        # Loss is a weighted sum of Q-values
+        rho = torch.pow(self.config.rho, torch.arange(len(qs), device=device))
+        pi_loss = ((self.config.entropy_coef * log_pis - qs).mean(dim=(1, 2)) * rho).mean()
+        # pi_loss.backward()
+        # torch.nn.utils.clip_grad_norm_(self.model._pi.parameters(), self.config.grad_clip_norm)
+        # self.pi_optim.step()
+
+        # Re-enable gradient tracking for the Q ensemble now that the policy loss graph is built. Leaving it
+        # disabled would exclude the Q parameters from the value loss on every subsequent forward pass.
+        self.model.track_q_grad(True)
+
+        loss = (
+            self.config.consistency_coef * consistency_loss
+            + self.config.reward_coef * reward_loss
+            + self.config.value_coef * value_loss
+            + self.config.pi_coeff * pi_loss
+        )
+
+        # Update target Q-functions
+        # """
+        # Soft-update target Q-networks using Polyak averaging.
+        # """
+        # with torch.no_grad():
+        #    for p, p_target in zip(self.model._Qs.parameters(), self.model._target_Qs.parameters()):
+        #        p_target.data.lerp_(p.data, self.config.tau)
+
+        # Return training statistics
+        self.model.eval()
+        info = {
+            "loss": loss,
+            "consistency_loss": consistency_loss.mean().item(),
+            "reward_loss": reward_loss.mean().item(),
+            "value_loss": value_loss.mean().item(),
+            "pi_loss": pi_loss.item(),
+            "pi_scale": self.scale.value,
+        }
+
+        # Undo (b, t) -> (t, b).
+        for key in batch:
+            if batch[key].ndim > 1:
+                batch[key] = batch[key].transpose(1, 0)
+
+        return info
+
+    def update(self):
+        """Soft-update the target Q-networks in place via Polyak averaging with step size `config.tau`."""
+        with torch.no_grad():
+            for p, p_target in zip(
+                self.model._Qs.parameters(), self.model._target_Qs.parameters(), strict=False
+            ):
+                p_target.data.lerp_(p.data, self.config.tau)  # p_target += tau * (p - p_target)
+
+
+class TDMPC2TOLD(nn.Module):
+    """Task-Oriented Latent Dynamics (TOLD) model used in TD-MPC2."""
+
+    def __init__(self, config: TDMPC2Config):
+        super().__init__()
+        self.config = config
+
+        self.config.bin_size = (config.vmax - config.vmin) / (
+            config.num_bins - 1
+        )  # Bin size for discrete regression. NOTE: this writes `bin_size` onto the caller's config object.
+
+        action_dim = config.output_shapes["action"][0]
+
+        self._encoder = TDMPC2ObservationEncoder(config)
+        self._dynamics = utils.mlp(
+            config.latent_dim + action_dim,
+            2 * [config.mlp_dim],
+            config.latent_dim,
+            act=utils.SimNorm(config),
+        )
+        self._reward = utils.mlp(
+            config.latent_dim + action_dim, 2 * [config.mlp_dim], max(config.num_bins, 1)
+        )
+        # 2 * action_dim outputs: presumably a mean and a log-std per action dimension — TODO confirm.
+        self._pi = utils.mlp(config.latent_dim, 2 * [config.mlp_dim], 2 * action_dim)
+        self._Qs = utils.Ensemble(
+            [
+                utils.mlp(
+                    config.latent_dim + action_dim,
+                    2 * [config.mlp_dim],
+                    max(config.num_bins, 1),
+                    dropout=config.dropout,
+                )
+                for _ in range(config.num_q)
+            ]
+        )
+
+        self.apply(self.weight_init)
+        for p in [self._reward[-1].weight, self._Qs.params[-2]]:
+            p.data.fill_(0)  # presumably zero-inits the final-layer weights of the reward and Q heads
+
+        self._target_Qs = deepcopy(self._Qs).requires_grad_(False)
+        log_std_min, log_std_max = -10, 2  # TODO: add to config
+        self.log_std_min = torch.tensor(log_std_min)  # NOTE(review): plain tensor, not a registered buffer
+        self.log_std_dif = torch.tensor(log_std_max) - self.log_std_min
+
+    def track_q_grad(self, mode=True):
+        """
+        Enables/disables gradient tracking of Q-networks.
+        Avoids unnecessary computation during policy optimization.
+        This method also enables/disables gradients for task embeddings.
+        """
+        for p in self._Qs.parameters():
+            p.requires_grad_(mode)
+
+    def weight_init(self, m):  # lifted from Nicklas' code
+        """Custom weight initialization for TD-MPC2."""
+        if isinstance(m, nn.Linear):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Embedding):
+            nn.init.uniform_(m.weight, -0.02, 0.02)
+        elif isinstance(m, nn.ParameterList):
+            for i, p in enumerate(m):
+                if p.dim() == 3:  # Linear
+                    nn.init.trunc_normal_(p, std=0.02)  # Weight
+                    nn.init.constant_(m[i + 1], 0)  # Bias
+
+    def encode(self, obs: dict[str, Tensor]) -> Tensor:
+        """Encodes an observation into its latent representation."""
+        # from IPython import embed; embed()
+        # print(obs["observation.state"].shape, obs["observation.image"].shape)
+        return self._encoder(obs)
+
+    def latent_dynamics_and_reward(self, z: Tensor, a: Tensor) -> tuple[Tensor, Tensor]:
+        """Predict the next state's latent representation and the reward given a current latent and action.
+
+        Args:
+            z: (*, latent_dim) tensor for the current state's latent representation.
+            a: (*, action_dim) tensor for the action to be applied.
+        Returns:
+            A tuple containing:
+                - (*, latent_dim) tensor for the next state's latent representation.
+                - (*,) tensor for the estimated reward.
+        """
+        x = torch.cat([z, a], dim=-1)
+        r = self._reward(x)
+        r = utils.two_hot_inv(r, self.config).squeeze(-1)
+        # from IPython import embed; embed()
+        return self._dynamics(x), r
+
+    def latent_dynamics(self, z: Tensor, a: Tensor) -> Tensor:
+        """Predict the next state's latent representation given a current latent and action.
+
+        Args:
+            z: (*, latent_dim) tensor for the current state's latent representation.
+            a: (*, action_dim) tensor for the action to be applied.
+        Returns:
+            (*, latent_dim) tensor for the next state's latent representation.
+        """
+        x = torch.cat([z, a], dim=-1)
+        return self._dynamics(x)
+
+    def next(self, z: Tensor, a: Tensor) -> Tensor:
+        return self.latent_dynamics(z, a)  # just a wrapper
+
+    def pi(self, z):  # lifted from Nicklas' code
+        """
+        Samples an action from the policy prior.
+        The policy prior is a Gaussian distribution with
+        mean and (log) std predicted by a neural network.
+        """
+        # Gaussian policy prior
+        mu, log_std = self._pi(z).chunk(2, dim=-1)
+        log_std = utils.log_std_fn(log_std, self.log_std_min, self.log_std_dif)
+        eps = torch.randn_like(mu)
+
+        # No masking
+        action_dims = None
+
+        log_pi = utils.gaussian_logprob(eps, log_std, size=action_dims)
+        pi = mu + eps * log_std.exp()
+        mu, pi, log_pi = utils.squash(mu, pi, log_pi)
+
+        return mu, pi, log_pi, log_std
+
+    def pi_action(self, z):
+        return self.pi(z)[1]  # just return the action
+
+    def Qs(self, z: Tensor, a: Tensor, return_type: str = "min", target: bool = False) -> Tensor:  # noqa: N802
+        """Predict state-action value for all of the learned Q functions.
+
+        Args:
+            z: (*, latent_dim) tensor for the current state's latent representation.
+            a: (*, action_dim) tensor for the action to be applied.
+            return_type can be one of [`min`, `avg`, `all`]:
+                - `min`: return the minimum of two randomly subsampled Q-values.
+                - `avg`: return the average of two randomly subsampled Q-values.
+                - `all`: return all Q-values.
+            target: Set to true to use the target Q functions.
+        Returns:
+            (q_ensemble, *) tensor for the value predictions of each learned Q function in the ensemble OR
+            (*,) tensor if return_min=True.
+        """
+        assert return_type in {"min", "avg", "all"}
+
+        z = torch.cat([z, a], dim=-1)
+        out = (self._target_Qs if target else self._Qs)(z)
+
+        if return_type == "all":
+            return out
+
+        Q1, Q2 = out[np.random.choice(self.config.num_q, 2, replace=False)]
+        Q1, Q2 = utils.two_hot_inv(Q1, self.config), utils.two_hot_inv(Q2, self.config)
+        return torch.min(Q1, Q2) if return_type == "min" else (Q1 + Q2) / 2
+
+
+class TDMPC2ObservationEncoder(nn.Module):
+    """Encode image and/or state vector observations into the latent space."""
+
+    def __init__(self, config: TDMPC2Config):
+        """
+        Creates encoders for pixel and/or state modalities.
+
+        One encoder is built per matching entry of `config.input_shapes`, in the order the keys appear.
+        TODO(alexander-soare): The original work allows for multiple images by concatenating them along the
+            channel dimension. Re-implement this capability.
+        """
+        super().__init__()
+        self.config = config
+
+        def latent_mlp(in_dim):
+            # MLP trunk shared by every modality: hidden layers of width `enc_dim`, SimNorm on the output.
+            return utils.mlp(
+                in_dim,
+                max(config.num_enc_layers - 1, 1) * [config.enc_dim],
+                config.latent_dim,
+                act=utils.SimNorm(config),
+            )
+
+        for key in config.input_shapes:
+            if "observation.environment_state" in key:
+                self.env_state_enc_layers = latent_mlp(config.input_shapes["observation.environment_state"][0])
+            elif "observation.state" in key:
+                self.state_enc_layers = latent_mlp(config.input_shapes["observation.state"][0])
+            elif "observation.image" in key:
+                image_shape = config.input_shapes["observation.image"]
+                self.image_enc_layers = utils.conv(image_shape, config.num_channels, act=utils.SimNorm(config))
+                # Run a dummy batch through the conv stack to discover its flattened output width.
+                probe = torch.zeros(1, *config.input_shapes["observation.image"])
+                with torch.no_grad():
+                    conv_out_dim = self.image_enc_layers(probe).shape[1]
+                # Append the MLP trunk to the conv stack so images also end up in the latent space.
+                self.image_enc_layers.extend(latent_mlp(conv_out_dim))
+
+    def forward(self, obs_dict: dict[str, Tensor]) -> Tensor:
+        """Encode the image and/or state vector.
+
+        Each modality is encoded into a feature vector of size (latent_dim,) and then a uniform mean is taken
+        over all features.
+        """
+        # NOTE: Order of observations matters here.
+        encoders = (
+            ("observation.image", lambda img: flatten_forward_unflatten(self.image_enc_layers, img)),
+            ("observation.environment_state", lambda env_state: self.env_state_enc_layers(env_state)),
+            ("observation.state", lambda state: self.state_enc_layers(state)),
+        )
+        features = [encode(obs_dict[key]) for key, encode in encoders if key in self.config.input_shapes]
+        return torch.stack(features, dim=0).mean(0)
+
+
+def random_shifts_aug(x: Tensor, max_random_shift_ratio: float) -> Tensor:
+    """Apply an independent random horizontal/vertical shift to every image of a batch.
+
+    The images are replicate-padded by `round(max_random_shift_ratio * h)` pixels on each side and an
+    `h x w` window is then sampled at a random whole-pixel offset inside the padded image.
+
+    Adapted from https://github.com/facebookresearch/drqv2
+    """
+    batch_size, _, h, w = x.size()
+    assert h == w, "non-square images not handled yet"
+    pad = int(round(max_random_shift_ratio * h))
+    padded_size = h + 2 * pad
+    x = F.pad(x, (pad, pad, pad, pad), "replicate")
+
+    # Normalized coordinates of the first `h` pixel centres of the padded image.
+    half_pixel = 1.0 / padded_size
+    coords = torch.linspace(
+        -1.0 + half_pixel, 1.0 - half_pixel, padded_size, device=x.device, dtype=torch.float32
+    )[:h]
+    coords = einops.repeat(coords, "w -> h w 1", h=h)
+    grid_xy = torch.cat([coords, coords.transpose(1, 0)], dim=2)
+    grid_xy = einops.repeat(grid_xy, "h w c -> b h w c", b=batch_size)
+
+    # A random shift in units of pixels and within the boundaries of the padding.
+    offsets = torch.randint(0, 2 * pad + 1, size=(batch_size, 1, 1, 2), device=x.device, dtype=torch.float32)
+    # Convert the pixel offsets to normalized grid units.
+    offsets *= 2.0 / padded_size
+    return F.grid_sample(x, grid_xy + offsets, padding_mode="zeros", align_corners=False)
+
+
+def update_ema_parameters(ema_net: nn.Module, net: nn.Module, alpha: float):
+    """Update EMA parameters in place with ema_param <- alpha * ema_param + (1 - alpha) * param.
+
+    BatchNorm parameters and parameters of `net` that do not require grad are copied verbatim instead of
+    being averaged.
+
+    Args:
+        ema_net: Network holding the exponential moving average; modified in place.
+        net: Network providing the fresh parameters. Must have the same module structure as `ema_net`.
+        alpha: Weight kept from the previous EMA value.
+    Raises:
+        ValueError: (from `zip(strict=True)`) if the two networks differ in module or parameter count.
+        AssertionError: If parameter names do not line up.
+        RuntimeError: If a parameter turns out to be a dict.
+    """
+    # The whole update runs without autograd. Previously the direct copy ran outside `no_grad` and
+    # raised on leaf parameters that require grad (e.g. BatchNorm weights of a freshly copied network).
+    with torch.no_grad():
+        for ema_module, module in zip(ema_net.modules(), net.modules(), strict=True):
+            is_batchnorm = isinstance(module, nn.modules.batchnorm._BatchNorm)
+            for (n_p_ema, p_ema), (n_p, p) in zip(
+                ema_module.named_parameters(recurse=False), module.named_parameters(recurse=False), strict=True
+            ):
+                assert n_p_ema == n_p, "Parameter names don't match for EMA model update"
+                if isinstance(p, dict):
+                    raise RuntimeError("Dict parameter not supported")
+                source = p.to(dtype=p_ema.dtype)
+                if is_batchnorm or not p.requires_grad:
+                    # Copy BatchNorm parameters, and non-trainable parameters directly.
+                    p_ema.copy_(source)
+                else:
+                    p_ema.mul_(alpha)
+                    p_ema.add_(source, alpha=1 - alpha)
+
+
+def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tensor) -> Tensor:
+    """Apply `fn` to an image tensor that may carry extra leading dimensions.
+
+    All dimensions before (C, H, W) are merged into one batch dimension for the call and restored on the
+    result afterwards.
+
+    Args:
+        fn: Callable that the image tensor will be passed to. It should accept (B, C, H, W) and return
+            (B, *), where * is any number of dimensions.
+        image_tensor: An image tensor of shape (**, C, H, W), where ** is any number of dimensions, generally
+            different from *.
+    Returns:
+        A return value from the callable reshaped to (**, *).
+    """
+    leading_dims = image_tensor.shape[:-3]
+    if len(leading_dims) == 1:
+        # Already (B, C, H, W): nothing to flatten.
+        return fn(image_tensor)
+    merged_out = fn(image_tensor.flatten(end_dim=-4))
+    return merged_out.reshape((*leading_dims, *merged_out.shape[1:]))
--- a/lerobot/common/policies/tdmpc2/tdmpc2_utils.py
+++ b/lerobot/common/policies/tdmpc2/tdmpc2_utils.py
@@ -0,0 +1,305 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F  # noqa: N812
+from functorch import combine_state_for_ensemble
+
+# Lifted directly from https://github.com/nicklashansen/tdmpc2
+DREG_BINS = None
+
+
+def soft_ce(pred, target, cfg):
+    """Cross entropy between predicted logits and the soft two-hot encoding of `target`.
+
+    `pred` is normalized with a log-softmax over its last dimension, `target` is encoded with `two_hot`,
+    and the result keeps a trailing singleton dimension.
+    """
+    log_probs = F.log_softmax(pred, dim=-1)
+    soft_target = two_hot(target, cfg)
+    return (soft_target * log_probs).sum(-1, keepdim=True).neg()
+
+
+@torch.jit.script
+def log_std(x, low, dif):
+    # Squash `x` with tanh into the interval [low, low + dif].
+    # NOTE: same body as `log_std_fn` further down in this module.
+    return low + 0.5 * dif * (torch.tanh(x) + 1)
+
+
+@torch.jit.script
+def _gaussian_residual(eps, log_std):
+    # Element-wise part of a Gaussian log-density: -0.5 * eps^2 - log_std.
+    return -0.5 * eps.pow(2) - log_std
+
+
+@torch.jit.script
+def _gaussian_logprob(residual):
+    # Subtract the constant 0.5 * log(2 * pi) from the residual.
+    return residual - 0.5 * torch.log(2 * torch.pi)
+
+
+def gaussian_logprob(eps, log_std, size=None):
+    """Compute Gaussian log probability.
+
+    The per-dimension residuals are summed over the last dimension (kept as a singleton), the
+    0.5 * log(2 * pi) constant is subtracted, and the result is multiplied by `size`, which defaults
+    to the length of the last dimension of `eps`.
+    """
+    scale = eps.size(-1) if size is None else size
+    summed_residual = _gaussian_residual(eps, log_std).sum(-1, keepdim=True)
+    return _gaussian_logprob(summed_residual) * scale
+
+
+@torch.jit.script
+def _squash(pi):
+    # log(1 - pi^2): the relu clamps the argument at zero and the 1e-6 keeps the log finite.
+    return torch.log(F.relu(1 - pi.pow(2)) + 1e-6)
+
+
+def squash(mu, pi, log_pi):
+    """Apply tanh squashing to the mean and the sampled action and correct the log-probability.
+
+    NOTE: `log_pi` is updated in place; the returned `log_pi` is the same tensor that was passed in.
+    """
+    squashed_mu, squashed_pi = torch.tanh(mu), torch.tanh(pi)
+    # Correction term summed over the action dimension.
+    log_pi -= _squash(squashed_pi).sum(-1, keepdim=True)
+    return squashed_mu, squashed_pi, log_pi
+
+
+@torch.jit.script
+def symexp(x):
+    """
+    Symmetric exponential function: sign(x) * (exp(|x|) - 1).
+    This is the inverse of `symlog`.
+    Adapted from https://github.com/danijar/dreamerv3.
+    """
+    return torch.sign(x) * (torch.exp(torch.abs(x)) - 1)
+
+
+@torch.jit.script
+def symlog(x):
+    """
+    Symmetric logarithmic function: sign(x) * log(1 + |x|).
+    This is the inverse of `symexp`.
+    Adapted from https://github.com/danijar/dreamerv3.
+    """
+    return torch.sign(x) * torch.log(1 + torch.abs(x))
+
+
+@torch.jit.script
+def log_std_fn(x, low, dif):
+    # Squash `x` with tanh into the interval [low, low + dif].
+    return low + 0.5 * dif * (torch.tanh(x) + 1)
+
+
+def two_hot(x, cfg):
+    """Converts a batch of scalars to soft two-hot encoded targets for discrete regression.
+
+    With `cfg.num_bins == 0` the input is returned untouched and with `cfg.num_bins == 1` only `symlog`
+    is applied. Otherwise `x` is expected to have shape (N, 1) and the result has shape (N, num_bins):
+    each row spreads a total weight of 1 over the two bins that bracket the clamped `symlog(x)`.
+    """
+    if cfg.num_bins == 0:
+        return x
+    if cfg.num_bins == 1:
+        return symlog(x)
+
+    clamped = torch.clamp(symlog(x), cfg.vmin, cfg.vmax).squeeze(1)
+    # Continuous position of each value on the bin axis.
+    position = (clamped - cfg.vmin) / cfg.bin_size
+    lower_idx = torch.floor(position).long()
+    upper_weight = (position - lower_idx.float()).unsqueeze(-1)
+
+    encoded = torch.zeros(clamped.size(0), cfg.num_bins, device=clamped.device)
+    lower_col = lower_idx.unsqueeze(1)
+    encoded.scatter_(1, lower_col, 1 - upper_weight)
+    # The modulo keeps the upper index in range when the value sits exactly on the last bin.
+    encoded.scatter_(1, (lower_col + 1) % cfg.num_bins, upper_weight)
+    return encoded
+
+
+# Bin-centre tensors keyed by (vmin, vmax, num_bins, device).
+_DREG_BINS_CACHE = {}
+
+
+def two_hot_inv(x, cfg):
+    """Converts a batch of soft two-hot encoded vectors to scalars.
+
+    With `cfg.num_bins == 0` the input is returned untouched and with `cfg.num_bins == 1` only `symexp`
+    is applied. Otherwise the last dimension of `x` is treated as logits over `cfg.num_bins` bins spanning
+    [cfg.vmin, cfg.vmax]; the softmax-weighted bin centre is computed (last dimension kept as a singleton)
+    and mapped back through `symexp`.
+
+    The bin centres are cached per (vmin, vmax, num_bins, device). The previous single global cache was
+    built on the first call only and was silently reused for later calls with another config or device.
+    """
+    global DREG_BINS
+    if cfg.num_bins == 0:
+        return x
+    elif cfg.num_bins == 1:
+        return symexp(x)
+
+    cache_key = (float(cfg.vmin), float(cfg.vmax), int(cfg.num_bins), x.device)
+    bins = _DREG_BINS_CACHE.get(cache_key)
+    if bins is None:
+        bins = torch.linspace(cfg.vmin, cfg.vmax, cfg.num_bins, device=x.device)
+        _DREG_BINS_CACHE[cache_key] = bins
+    # Keep the legacy module-level global pointing at the bins used last.
+    DREG_BINS = bins
+
+    x = F.softmax(x, dim=-1)
+
+    # cloning bins to avoid the inference tensor error
+    x = torch.sum(x * bins.clone(), dim=-1, keepdim=True)
+
+    return symexp(x)
+
+
+class Ensemble(nn.Module):
+    """
+    Vectorized ensemble of modules.
+
+    The member modules are combined with functorch's `combine_state_for_ensemble` and evaluated through a
+    single `torch.vmap` call instead of a Python loop over members.
+    """
+
+    def __init__(self, modules, **kwargs):
+        super().__init__()
+        modules = nn.ModuleList(modules)
+        # Only the functional forward `fn` and the parameters are kept; the third return value is dropped.
+        fn, params, _ = combine_state_for_ensemble(modules)
+        # in_dims=(0, 0, None): the first two arguments are mapped over dim 0, the input is shared by all members.
+        # Extra **kwargs are forwarded to `torch.vmap`.
+        self.vmap = torch.vmap(fn, in_dims=(0, 0, None), randomness="different", **kwargs)
+        # Re-wrap the combined parameters so they are registered on this module.
+        self.params = nn.ParameterList([nn.Parameter(p) for p in params])
+        # Keep the textual description of the member modules for __repr__.
+        self._repr = str(modules)
+
+    def forward(self, *args, **kwargs):
+        # An empty tuple is passed in the second (mapped) position.
+        return self.vmap(list(self.params), (), *args, **kwargs)
+
+    def __repr__(self):
+        return "Vectorized " + self._repr
+
+
+class ShiftAug(nn.Module):
+    """
+    Random shift image augmentation: replicate-pad the batch by `pad` pixels on every side, then sample an
+    image-sized window at a random whole-pixel offset (drawn independently for every image).
+    Adapted from https://github.com/facebookresearch/drqv2
+    """
+
+    def __init__(self, pad=3):
+        super().__init__()
+        self.pad = pad
+
+    def forward(self, x):
+        x = x.float()
+        batch, _, h, w = x.size()
+        assert h == w
+        pad = self.pad
+        padded_size = h + 2 * pad
+        x = F.pad(x, (pad, pad, pad, pad), "replicate")
+
+        # Normalized coordinates of the first `h` pixel centres of the padded image.
+        half_pixel = 1.0 / padded_size
+        coords = torch.linspace(
+            -1.0 + half_pixel, 1.0 - half_pixel, padded_size, device=x.device, dtype=x.dtype
+        )[:h]
+        coords = coords.view(1, h, 1).expand(h, h, 1)
+        grid_xy = torch.cat([coords, coords.transpose(1, 0)], dim=2)
+        grid_xy = grid_xy.unsqueeze(0).expand(batch, -1, -1, -1)
+
+        # Whole-pixel offsets in [0, 2 * pad], converted to normalized grid units.
+        offsets = torch.randint(0, 2 * pad + 1, size=(batch, 1, 1, 2), device=x.device, dtype=x.dtype)
+        offsets *= 2.0 / padded_size
+        return F.grid_sample(x, grid_xy + offsets, padding_mode="zeros", align_corners=False)
+
+
+class PixelPreprocess(nn.Module):
+    """
+    Normalizes pixel observations from [0, 255] to [-0.5, 0.5].
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        # Out-of-place on purpose: the previous in-place `div_`/`sub_` overwrote the caller's tensor and
+        # failed on integer (e.g. uint8) inputs, which cannot hold the fractional result.
+        return x / 255.0 - 0.5
+
+
+class SimNorm(nn.Module):
+    """
+    Simplicial normalization: the last dimension is split into groups of `cfg.simnorm_dim` entries and a
+    softmax is applied inside every group. The output has the same shape as the input.
+    Adapted from https://arxiv.org/abs/2204.00616.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.dim = cfg.simnorm_dim
+
+    def forward(self, x):
+        original_shape = x.shape
+        groups = x.view(*original_shape[:-1], -1, self.dim)
+        return F.softmax(groups, dim=-1).view(*original_shape)
+
+    def __repr__(self):
+        return f"SimNorm(dim={self.dim})"
+
+
+class NormedLinear(nn.Linear):
+    """
+    Linear layer followed by (optional) dropout, LayerNorm and an activation.
+    The activation defaults to in-place Mish when `act` is not given.
+    """
+
+    def __init__(self, *args, dropout=0.0, act=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.ln = nn.LayerNorm(self.out_features)
+        if act is None:
+            act = nn.Mish(inplace=True)
+        self.act = act
+        # A falsy dropout rate disables dropout entirely (no module is created).
+        self.dropout = nn.Dropout(dropout, inplace=True) if dropout else None
+
+    def forward(self, x):
+        out = super().forward(x)
+        if self.dropout is not None:
+            out = self.dropout(out)
+        out = self.ln(out)
+        return self.act(out)
+
+    def __repr__(self):
+        dropout_part = "" if self.dropout is None else f", dropout={self.dropout.p}"
+        return (
+            f"NormedLinear(in_features={self.in_features}, "
+            f"out_features={self.out_features}, "
+            f"bias={self.bias is not None}{dropout_part}, "
+            f"act={self.act.__class__.__name__})"
+        )
+
+
+def mlp(in_dim, mlp_dims, out_dim, act=None, dropout=0.0):
+    """
+    Basic building block of TD-MPC2.
+    Stack of `NormedLinear` hidden layers (LayerNorm + Mish); dropout is applied in the first hidden layer
+    only. The output layer is a `NormedLinear` using `act` when `act` is given, otherwise a plain
+    `nn.Linear`. `mlp_dims` may be a single int or a list of hidden widths.
+    """
+    hidden_dims = [mlp_dims] if isinstance(mlp_dims, int) else mlp_dims
+    widths = [in_dim] + hidden_dims + [out_dim]
+
+    layers = []
+    for idx, (fan_in, fan_out) in enumerate(zip(widths[:-2], widths[1:-1])):
+        layers.append(NormedLinear(fan_in, fan_out, dropout=dropout if idx == 0 else 0.0))
+
+    if act:
+        head = NormedLinear(widths[-2], widths[-1], act=act)
+    else:
+        head = nn.Linear(widths[-2], widths[-1])
+    layers.append(head)
+    return nn.Sequential(*layers)
+
+
+def conv(in_shape, num_channels, act=None):
+    """
+    Basic convolutional encoder for TD-MPC2 with raw image observations.
+    Random-shift augmentation and pixel normalization, then 4 convolutions (kernel 7/5/3/3, stride 2/2/2/1)
+    with ReLU between them, a flatten, and finally `act` when one is given.
+    `in_shape[0]` is used as the number of input channels.
+    """
+    # (kernel_size, stride) of the three convolutions that follow the first one.
+    later_convs = ((5, 2), (3, 2), (3, 1))
+
+    stack = [ShiftAug(), PixelPreprocess(), nn.Conv2d(in_shape[0], num_channels, 7, stride=2)]
+    for kernel_size, stride in later_convs:
+        stack.append(nn.ReLU(inplace=True))
+        stack.append(nn.Conv2d(num_channels, num_channels, kernel_size, stride=stride))
+    stack.append(nn.Flatten())
+    if act:
+        stack.append(act)
+    return nn.Sequential(*stack)
+
+
+class RunningScale:
+    """Running trimmed scale estimator.
+
+    Tracks an exponential moving average of the spread between the 5th and 95th percentile (taken along
+    dim 0) of the values it is updated with; the spread is clamped to be at least 1. The estimate is used
+    to rescale inputs via `__call__`.
+    """
+
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg: Config object; only `cfg.tau` (the interpolation weight used in `update`) is read.
+        """
+        self.cfg = cfg
+        # Prefer CUDA as before, but fall back to CPU so the estimator also works on CPU-only hosts
+        # (the device used to be hard-coded to "cuda").
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self._value = torch.ones(1, dtype=torch.float32, device=device)
+        self._percentiles = torch.tensor([5, 95], dtype=torch.float32, device=device)
+
+    def state_dict(self):
+        """Return the tensors needed to restore this estimator."""
+        return {"value": self._value, "percentiles": self._percentiles}
+
+    def load_state_dict(self, state_dict):
+        """Copy `value` and `percentiles` from `state_dict` into the existing tensors."""
+        self._value.data.copy_(state_dict["value"])
+        self._percentiles.data.copy_(state_dict["percentiles"])
+
+    @property
+    def value(self):
+        """Current scale estimate as a Python float."""
+        return self._value.cpu().item()
+
+    def _percentile(self, x):
+        """Linearly interpolated percentiles of `x` along dim 0 (one row per tracked percentile)."""
+        x_dtype, x_shape = x.dtype, x.shape
+        x = x.view(x.shape[0], -1)
+        in_sorted, _ = torch.sort(x, dim=0)
+        # Fractional row positions of the percentiles, computed on the input's device.
+        positions = self._percentiles.to(x.device) * (x.shape[0] - 1) / 100
+        floored = torch.floor(positions)
+        ceiled = floored + 1
+        # Keep the upper neighbour inside the valid index range.
+        ceiled[ceiled > x.shape[0] - 1] = x.shape[0] - 1
+        weight_ceiled = positions - floored
+        weight_floored = 1.0 - weight_ceiled
+        d0 = in_sorted[floored.long(), :] * weight_floored[:, None]
+        d1 = in_sorted[ceiled.long(), :] * weight_ceiled[:, None]
+        return (d0 + d1).view(-1, *x_shape[1:]).type(x_dtype)
+
+    def update(self, x):
+        """Move the running estimate toward the current percentile spread of `x` with weight `cfg.tau`."""
+        percentiles = self._percentile(x.detach())
+        value = torch.clamp(percentiles[1] - percentiles[0], min=1.0)
+        # Match the state's device/dtype so inputs living elsewhere can still update the estimate.
+        value = value.to(device=self._value.device, dtype=self._value.dtype)
+        self._value.data.lerp_(value, self.cfg.tau)
+
+    def __call__(self, x, update=False):
+        """Return `x` divided by the current scale; optionally update the scale with `x` first."""
+        if update:
+            self.update(x)
+        return x * (1 / self.value)
+
+    def __repr__(self):
+        return f"RunningScale(S: {self.value})"
--- a/lerobot/common/policies/vqbet/modeling_vqbet.py
+++ b/lerobot/common/policies/vqbet/modeling_vqbet.py
@@ -38,7 +38,13 @@ from lerobot.common.policies.vqbet.vqbet_utils import GPT, ResidualVQ
 # ruff: noqa: N806


-class VQBeTPolicy(nn.Module, PyTorchModelHubMixin):
+class VQBeTPolicy(
+    nn.Module,
+    PyTorchModelHubMixin,
+    library_name="lerobot",
+    repo_url="https://github.com/huggingface/lerobot",
+    tags=["robotics", "vqbet"],
+):
    """
    VQ-BeT Policy as per "Behavior Generation with Latent Actions"
    """
@@ -98,6 +104,7 @@ class VQBeTPolicy(nn.Module, PyTorchModelHubMixin):
        """

        batch = self.normalize_inputs(batch)
+        batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
        batch["observation.images"] = torch.stack([batch[k] for k in self.expected_image_keys], dim=-4)
        # Note: It's important that this happens after stacking the images into a single key.
        self._queues = populate_queues(self._queues, batch)
@@ -123,6 +130,7 @@ class VQBeTPolicy(nn.Module, PyTorchModelHubMixin):
    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
        """Run the batch through the model and compute the loss for training or validation."""
        batch = self.normalize_inputs(batch)
+        batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
        batch["observation.images"] = torch.stack([batch[k] for k in self.expected_image_keys], dim=-4)
        batch = self.normalize_targets(batch)
        # VQ-BeT discretizes action using VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://arxiv.org/pdf/2403.03181)
@@ -287,7 +295,7 @@ class VQBeTModel(nn.Module):

        # To input state and observation features into GPT layers, we first project the features to fit the shape of input size of GPT.
        self.state_projector = MLP(
-            config.output_shapes["action"][0], hidden_channels=[self.config.gpt_input_dim]
+            config.input_shapes["observation.state"][0], hidden_channels=[self.config.gpt_input_dim]
        )
        self.rgb_feature_projector = MLP(
            self.rgb_encoder.feature_dim, hidden_channels=[self.config.gpt_input_dim]
--- a/lerobot/common/robot_devices/cameras/opencv.py
+++ b/lerobot/common/robot_devices/cameras/opencv.py
@@ -5,6 +5,7 @@ This file contains utilities for recording frames from cameras. For more info lo
 import argparse
 import concurrent.futures
 import math
+import platform
 import shutil
 import threading
 import time
@@ -33,8 +34,22 @@ MAX_OPENCV_INDEX = 60


 def find_camera_indices(raise_when_empty=False, max_index_search_range=MAX_OPENCV_INDEX):
+    if platform.system() == "Linux":
+        # Linux uses camera ports
+        print("Linux detected. Finding available camera indices through scanning '/dev/video*' ports")
+        possible_camera_ids = []
+        for port in Path("/dev").glob("video*"):
+            camera_idx = int(str(port).replace("/dev/video", ""))
+            possible_camera_ids.append(camera_idx)
+    else:
+        print(
+            "Mac or Windows detected. Finding available camera indices through "
+            f"scanning all indices from 0 to {MAX_OPENCV_INDEX}"
+        )
+        possible_camera_ids = range(max_index_search_range)
+
    camera_ids = []
-    for camera_idx in range(max_index_search_range):
+    for camera_idx in possible_camera_ids:
        camera = cv2.VideoCapture(camera_idx)
        is_open = camera.isOpened()
        camera.release()
@@ -45,7 +60,8 @@ def find_camera_indices(raise_when_empty=False, max_index_search_range=MAX_OPENC

    if raise_when_empty and len(camera_ids) == 0:
        raise OSError(
-            "Not a single camera was detected. Try re-plugging, or re-installing `opencv2`, or your camera driver, or make sure your camera is compatible with opencv2."
+            "Not a single camera was detected. Try re-plugging, or re-installing `opencv2`, "
+            "or your camera driver, or make sure your camera is compatible with opencv2."
        )

    return camera_ids
@@ -59,10 +75,9 @@ def save_image(img_array, camera_index, frame_index, images_dir):


 def save_images_from_cameras(
-    images_dir: Path, camera_ids=None, fps=None, width=None, height=None, record_time_s=2
+    images_dir: Path, camera_ids: list[int] | None = None, fps=None, width=None, height=None, record_time_s=2
 ):
    if camera_ids is None:
-        print("Finding available camera indices")
        camera_ids = find_camera_indices()

    print("Connecting cameras")
@@ -71,13 +86,12 @@ def save_images_from_cameras(
        camera = OpenCVCamera(cam_idx, fps=fps, width=width, height=height)
        camera.connect()
        print(
-            f"OpenCVCamera({camera.camera_index}, fps={camera.fps}, width={camera.width}, height={camera.height}, color_mode={camera.color_mode})"
+            f"OpenCVCamera({camera.camera_index}, fps={camera.fps}, width={camera.width}, "
+            f"height={camera.height}, color_mode={camera.color_mode})"
        )
        cameras.append(camera)

-    images_dir = Path(
-        images_dir,
-    )
+    images_dir = Path(images_dir)
    if images_dir.exists():
        shutil.rmtree(
            images_dir,
@@ -160,7 +174,7 @@ class OpenCVCamera:
    When an OpenCVCamera is instantiated, if no specific config is provided, the default fps, width, height and color_mode
    of the given camera will be used.

-    Example of usage of the class:
+    Example of usage:
    ```python
    camera = OpenCVCamera(camera_index=0)
    camera.connect()
@@ -194,11 +208,6 @@ class OpenCVCamera:
        self.height = config.height
        self.color_mode = config.color_mode

-        if not isinstance(self.camera_index, int):
-            raise ValueError(
-                f"Camera index must be provided as an int, but {self.camera_index} was given instead."
-            )
-
        self.camera = None
        self.is_connected = False
        self.thread = None
@@ -212,7 +221,13 @@ class OpenCVCamera:

        # First create a temporary camera trying to access `camera_index`,
        # and verify it is a valid camera by calling `isOpened`.
-        tmp_camera = cv2.VideoCapture(self.camera_index)
+
+        if platform.system() == "Linux":
+            # Linux uses ports for connecting to cameras
+            tmp_camera = cv2.VideoCapture(f"/dev/video{self.camera_index}")
+        else:
+            tmp_camera = cv2.VideoCapture(self.camera_index)
+
        is_camera_open = tmp_camera.isOpened()
        # Release camera to make it accessible for `find_camera_indices`
        del tmp_camera
@@ -224,7 +239,8 @@ class OpenCVCamera:
            available_cam_ids = find_camera_indices()
            if self.camera_index not in available_cam_ids:
                raise ValueError(
-                    f"`camera_index` is expected to be one of these available cameras {available_cam_ids}, but {self.camera_index} is provided instead."
+                    f"`camera_index` is expected to be one of these available cameras {available_cam_ids}, but {self.camera_index} is provided instead. "
+                    "To find the camera index you should use, run `python lerobot/common/robot_devices/cameras/opencv.py`."
                )

            raise OSError(f"Can't access camera {self.camera_index}.")
@@ -232,7 +248,10 @@ class OpenCVCamera:
        # Secondly, create the camera that will be used downstream.
        # Note: For some unknown reason, calling `isOpened` blocks the camera which then
        # needs to be re-created.
-        self.camera = cv2.VideoCapture(self.camera_index)
+        if platform.system() == "Linux":
+            self.camera = cv2.VideoCapture(f"/dev/video{self.camera_index}")
+        else:
+            self.camera = cv2.VideoCapture(self.camera_index)

        if self.fps is not None:
            self.camera.set(cv2.CAP_PROP_FPS, self.fps)
--- a/lerobot/common/robot_devices/cameras/utils.py
+++ b/lerobot/common/robot_devices/cameras/utils.py
@@ -2,6 +2,7 @@ from pathlib import Path
 from typing import Protocol

 import cv2
+import einops
 import numpy as np


@@ -39,6 +40,16 @@ def save_depth_image(depth, path, write_shape=False):
    cv2.imwrite(str(path), depth_image)


+def convert_torch_image_to_cv2(tensor, rgb_to_bgr=True):
+    """Convert a channel-first (c, h, w) torch image into a channel-last NumPy array for OpenCV.
+
+    The channel dimension is required to be smaller than both spatial dimensions. When `rgb_to_bgr` is
+    true the colour channels are additionally converted from RGB to BGR.
+    """
+    assert tensor.ndim == 3
+    channels, height, width = tensor.shape
+    assert channels < height and channels < width
+    hwc_image = einops.rearrange(tensor, "c h w -> h w c").numpy()
+    if not rgb_to_bgr:
+        return hwc_image
+    return cv2.cvtColor(hwc_image, cv2.COLOR_RGB2BGR)
+
+
 # Defines a camera type
 class Camera(Protocol):
    def connect(self): ...
--- a/lerobot/common/robot_devices/motors/dynamixel.py
+++ b/lerobot/common/robot_devices/motors/dynamixel.py
@@ -5,6 +5,7 @@ from copy import deepcopy
 from pathlib import Path

 import numpy as np
+import tqdm
 from dynamixel_sdk import (
    COMM_SUCCESS,
    DXL_HIBYTE,
@@ -21,9 +22,11 @@ from lerobot.common.robot_devices.utils import RobotDeviceAlreadyConnectedError,
 from lerobot.common.utils.utils import capture_timestamp_utc

 PROTOCOL_VERSION = 2.0
-BAUD_RATE = 1_000_000
+BAUDRATE = 1_000_000
 TIMEOUT_MS = 1000

+MAX_ID_RANGE = 252
+
 # https://emanual.robotis.com/docs/en/dxl/x/xl330-m077
 # https://emanual.robotis.com/docs/en/dxl/x/xl330-m288
 # https://emanual.robotis.com/docs/en/dxl/x/xl430-w250
@@ -86,6 +89,16 @@ X_SERIES_CONTROL_TABLE = {
    "Present_Temperature": (146, 1),
 }

+# Baudrates (bits per second) for X-series motors, keyed by a small integer index.
+# NOTE(review): the index is presumably the value stored in the motor's baud-rate register - confirm
+# against the e-manual pages linked above.
+X_SERIES_BAUDRATE_TABLE = {
+    0: 9_600,
+    1: 57_600,
+    2: 115_200,
+    3: 1_000_000,
+    4: 2_000_000,
+    5: 3_000_000,
+    6: 4_000_000,
+}
+
 CALIBRATION_REQUIRED = ["Goal_Position", "Present_Position"]
 CONVERT_UINT32_TO_INT32_REQUIRED = ["Goal_Position", "Present_Position"]

@@ -98,7 +111,67 @@ MODEL_CONTROL_TABLE = {
    "xm540-w270": X_SERIES_CONTROL_TABLE,
 }

+MODEL_RESOLUTION = {
+    "x_series": 4096,
+    "xl330-m077": 4096,
+    "xl330-m288": 4096,
+    "xl430-w250": 4096,
+    "xm430-w350": 4096,
+    "xm540-w270": 4096,
+}
+
+MODEL_BAUDRATE_TABLE = {
+    "x_series": X_SERIES_BAUDRATE_TABLE,
+    "xl330-m077": X_SERIES_BAUDRATE_TABLE,
+    "xl330-m288": X_SERIES_BAUDRATE_TABLE,
+    "xl430-w250": X_SERIES_BAUDRATE_TABLE,
+    "xm430-w350": X_SERIES_BAUDRATE_TABLE,
+    "xm540-w270": X_SERIES_BAUDRATE_TABLE,
+}
+
 NUM_READ_RETRY = 10
+NUM_WRITE_RETRY = 10
+
+
+def convert_degrees_to_steps(degrees: float | np.ndarray, models: str | list[str]):
+    """This function converts the degree range to the step range for indicating motors rotation.
+    It assumes a motor achieves a full rotation by going from -180 degree position to +180.
+    The motor resolution (e.g. 4096) corresponds to the number of steps needed to achieve a full rotation.
+    """
+    if isinstance(models, str):
+        models = [models]  # a lone model name would otherwise be iterated character by character
+
+    resolutions = [MODEL_RESOLUTION[model] for model in models]
+    steps = degrees / 180 * np.array(resolutions) / 2
+    steps = steps.astype(int)
+    return steps
+
+
+def convert_to_bytes(value, bytes):
+    # Note: No need to convert back into unsigned int, since this byte preprocessing
+    # already handles it for us.
+    if bytes == 1:
+        data = [
+            DXL_LOBYTE(DXL_LOWORD(value)),
+        ]
+    elif bytes == 2:
+        data = [
+            DXL_LOBYTE(DXL_LOWORD(value)),
+            DXL_HIBYTE(DXL_LOWORD(value)),
+        ]
+    elif bytes == 4:
+        data = [
+            DXL_LOBYTE(DXL_LOWORD(value)),
+            DXL_HIBYTE(DXL_LOWORD(value)),
+            DXL_LOBYTE(DXL_HIWORD(value)),
+            DXL_HIBYTE(DXL_HIWORD(value)),
+        ]
+    else:
+        raise NotImplementedError(
+            f"Value of the number of bytes to be sent is expected to be in [1, 2, 4], but "
+            f"{bytes} is provided instead."
+        )
+    return data


 def get_group_sync_key(data_name, motor_names):
@@ -207,13 +280,12 @@ class DynamixelMotorsBus:
    >>> The port of this DynamixelMotorsBus is /dev/tty.usbmodem575E0031751.
    >>> Reconnect the usb cable.
    ```
-    To find the motor indices, use [DynamixelWizzard2](https://emanual.robotis.com/docs/en/software/dynamixel/dynamixel_wizard2).

    Example of usage for 1 motor connected to the bus:
    ```python
    motor_name = "gripper"
    motor_index = 6
-    motor_model = "xl330-m077"
+    motor_model = "xl330-m288"

    motors_bus = DynamixelMotorsBus(
        port="/dev/tty.usbmodem575E0031751",
@@ -221,7 +293,11 @@ class DynamixelMotorsBus:
    )
    motors_bus.connect()

-    motors_bus.teleop_step()
+    position = motors_bus.read("Present_Position")
+
+    # move by a few motor steps as an example
+    few_steps = 30
+    motors_bus.write("Goal_Position", position + few_steps)

    # when done, consider disconnecting
    motors_bus.disconnect()
@@ -233,6 +309,7 @@ class DynamixelMotorsBus:
        port: str,
        motors: dict[str, tuple[int, str]],
        extra_model_control_table: dict[str, list[tuple]] | None = None,
+        extra_model_resolution: dict[str, int] | None = None,
    ):
        self.port = port
        self.motors = motors
@@ -241,6 +318,10 @@ class DynamixelMotorsBus:
        if extra_model_control_table:
            self.model_ctrl_table.update(extra_model_control_table)

+        self.model_resolution = deepcopy(MODEL_RESOLUTION)
+        if extra_model_resolution:
+            self.model_resolution.update(extra_model_resolution)
+
        self.port_handler = None
        self.packet_handler = None
        self.calibration = None
@@ -268,52 +349,286 @@ class DynamixelMotorsBus:
            )
            raise

-        self.port_handler.setBaudRate(BAUD_RATE)
-        self.port_handler.setPacketTimeoutMillis(TIMEOUT_MS)
-
+        # Allow to read and write
        self.is_connected = True

+        self.port_handler.setPacketTimeoutMillis(TIMEOUT_MS)
+
+        # Set expected baudrate for the bus
+        self.set_bus_baudrate(BAUDRATE)
+
+        if not self.are_motors_configured():
+            input(
+                "\n/!\\ A configuration issue has been detected with your motors: \n"
+                "If it's the first time that you use these motors, press enter to configure your motors... but before "
+                "verify that all the cables are connected the proper way. If you find an issue, before making a modification, "
+                "kill the python process, unplug the power cord to not damage the motors, rewire correctly, then plug the power "
+                "again and relaunch the script.\n"
+            )
+            print()
+            self.configure_motors()
+
+    def reconnect(self):
+        self.port_handler = PortHandler(self.port)
+        self.packet_handler = PacketHandler(PROTOCOL_VERSION)
+        if not self.port_handler.openPort():
+            raise OSError(f"Failed to open port '{self.port}'.")
+        self.is_connected = True
+
+    def are_motors_configured(self):
+        # Only check the motor indices and not baudrate, since if the motor baudrates are incorrect,
+        # a ConnectionError will be raised anyway.
+        try:
+            return (self.motor_indices == self.read("ID")).all()
+        except ConnectionError as e:
+            print(e)
+            return False
+
+    def configure_motors(self):
+        # TODO(rcadene): This script assumes motors follow the X_SERIES baudrates
+        # TODO(rcadene): Refactor this function with intermediate high-level functions
+
+        print("Scanning all baudrates and motor indices")
+        all_baudrates = set(X_SERIES_BAUDRATE_TABLE.values())
+        ids_per_baudrate = {}
+        for baudrate in all_baudrates:
+            self.set_bus_baudrate(baudrate)
+            present_ids = self.find_motor_indices()
+            if len(present_ids) > 0:
+                ids_per_baudrate[baudrate] = present_ids
+        print(f"Motor indices detected: {ids_per_baudrate}")
+        print()
+
+        possible_baudrates = list(ids_per_baudrate.keys())
+        possible_ids = list({idx for sublist in ids_per_baudrate.values() for idx in sublist})
+        untaken_ids = list(set(range(MAX_ID_RANGE)) - set(possible_ids) - set(self.motor_indices))
+
+        # Connect successively one motor to the chain and write a unique random index for each
+        for i in range(len(self.motors)):
+            self.disconnect()
+            input(
+                "1. Unplug the power cord\n"
+                "2. Plug/unplug minimal number of cables to only have the first "
+                f"{i+1} motor(s) ({self.motor_names[:i+1]}) connected.\n"
+                "3. Re-plug the power cord\n"
+                "Press Enter to continue..."
+            )
+            print()
+            self.reconnect()
+
+            if i > 0:
+                try:
+                    self._read_with_motor_ids(self.motor_models, untaken_ids[:i], "ID")
+                except ConnectionError:
+                    print(f"Failed to read from {untaken_ids[:i]}. Make sure the power cord is plugged in.")
+                    input("Press Enter to continue...")
+                    print()
+                    self.reconnect()
+
+            print("Scanning possible baudrates and motor indices")
+            motor_found = False
+            for baudrate in possible_baudrates:
+                self.set_bus_baudrate(baudrate)
+                present_ids = self.find_motor_indices(possible_ids)
+                if len(present_ids) == 1:
+                    present_idx = present_ids[0]
+                    print(f"Detected motor with index {present_idx}")
+
+                    if baudrate != BAUDRATE:
+                        print(f"Setting its baudrate to {BAUDRATE}")
+                        baudrate_idx = list(X_SERIES_BAUDRATE_TABLE.values()).index(BAUDRATE)
+
+                        # The write can fail, so we allow retries
+                        for _ in range(NUM_WRITE_RETRY):
+                            self._write_with_motor_ids(
+                                self.motor_models, present_idx, "Baud_Rate", baudrate_idx
+                            )
+                            time.sleep(0.5)
+                            self.set_bus_baudrate(BAUDRATE)
+                            try:
+                                present_baudrate_idx = self._read_with_motor_ids(
+                                    self.motor_models, present_idx, "Baud_Rate"
+                                )
+                            except ConnectionError:
+                                print("Failed to write baudrate. Retrying.")
+                                self.set_bus_baudrate(baudrate)
+                                continue
+                            break
+                        else:  # loop ended without `break`: every retry failed, no exception is active here
+                            raise OSError(f"Failed to write baudrate after {NUM_WRITE_RETRY} retries.")
+
+                        if present_baudrate_idx != baudrate_idx:
+                            raise OSError("Failed to write baudrate.")
+
+                    print(f"Setting its index to a temporary untaken index ({untaken_ids[i]})")
+                    self._write_with_motor_ids(self.motor_models, present_idx, "ID", untaken_ids[i])
+
+                    present_idx = self._read_with_motor_ids(self.motor_models, untaken_ids[i], "ID")
+                    if present_idx != untaken_ids[i]:
+                        raise OSError("Failed to write index.")
+
+                    motor_found = True
+                    break
+                elif len(present_ids) > 1:
+                    raise OSError(f"More than one motor detected ({present_ids}), but only one was expected.")
+
+            if not motor_found:
+                raise OSError(
+                    "No motor found, but one new motor expected. Verify power cord is plugged in and retry."
+                )
+            print()
+
+        print(f"Setting expected motor indices: {self.motor_indices}")
+        self.set_bus_baudrate(BAUDRATE)
+        self._write_with_motor_ids(
+            self.motor_models, untaken_ids[: len(self.motors)], "ID", self.motor_indices
+        )
+        print()
+
+        if (self.read("ID") != self.motor_indices).any():
+            raise OSError("Failed to write motors indices.")
+
+        print("Configuration is done!")
+
+    def find_motor_indices(self, possible_ids=None):
+        if possible_ids is None:
+            possible_ids = range(MAX_ID_RANGE)
+
+        indices = []
+        for idx in tqdm.tqdm(possible_ids):
+            try:
+                present_idx = self._read_with_motor_ids(self.motor_models, [idx], "ID")[0]
+            except ConnectionError:
+                continue
+
+            if idx != present_idx:
+                # sanity check
+                raise OSError(
+                    "Motor index used to communicate through the bus is not the same as the one present in the motor memory. The motor memory might be damaged."
+                )
+            indices.append(idx)
+
+        return indices
+
+    def set_bus_baudrate(self, baudrate):
+        present_bus_baudrate = self.port_handler.getBaudRate()
+        if present_bus_baudrate != baudrate:
+            print(f"Setting bus baud rate to {baudrate}. Previously {present_bus_baudrate}.")
+            self.port_handler.setBaudRate(baudrate)
+
+            if self.port_handler.getBaudRate() != baudrate:
+                raise OSError("Failed to write bus baud rate.")
+
    @property
-    def motor_names(self) -> list[int]:
+    def motor_names(self) -> list[str]:
        return list(self.motors.keys())

+    @property
+    def motor_models(self) -> list[str]:
+        return [model for _, model in self.motors.values()]
+
+    @property
+    def motor_indices(self) -> list[int]:
+        return [idx for idx, _ in self.motors.values()]
+
    def set_calibration(self, calibration: dict[str, tuple[int, bool]]):
        self.calibration = calibration

    def apply_calibration(self, values: np.ndarray | list, motor_names: list[str] | None):
-        if not self.calibration:
-            return values
+        """Convert from unsigned int32 joint position range [0, 2**32[ to the universal float32 nominal degree range ]-180.0, 180.0[ with
+        a "zero position" at 0 degree.

+        Note: We say "nominal degree range" since the motors can take values outside this range. For instance, 190 degrees, if the motor
+        rotates more than half a turn from the zero position. However, most motors can't rotate more than 180 degrees and will stay in this range.
+
+        Joint values are originally in [0, 2**32[ (unsigned int32). Each motor is expected to complete a full rotation
+        when given a goal position that is + or - its resolution. For instance, dynamixel xl330-m077 has a resolution of 4096, and
+        at any position in its original range, let's say the position 56734, it completes a full rotation clockwise by moving to 60830,
+        or anticlockwise by moving to 52638. The position in the original range is arbitrary and might change a lot between each motor.
+        To harmonize between motors of the same model, different robots, or even models of different brands, we propose to work
+        in the centered nominal degree range ]-180, 180[.
+        """
        if motor_names is None:
            motor_names = self.motor_names

+        # Convert from unsigned int32 original range [0, 2**32[ to centered signed int32 range [-2**31, 2**31[
+        values = values.astype(np.int32)
+
        for i, name in enumerate(motor_names):
            homing_offset, drive_mode = self.calibration[name]

-            if values[i] is not None:
-                if drive_mode:
-                    values[i] *= -1
-                values[i] += homing_offset
+            # Update direction of rotation of the motor to match between leader and follower. In fact, the motor of the leader for a given joint
+            # can be assembled in an opposite direction in terms of rotation than the motor of the follower on the same joint.
+            if drive_mode:
+                values[i] *= -1
+
+            # Convert from range [-2**31, 2**31[ to nominal range ]-resolution/2, resolution/2[ (e.g. ]-2048, 2048[)
+            values[i] += homing_offset
+
+        # Convert from range ]-resolution/2, resolution/2[ to the universal float32 centered degree range ]-180, 180[
+        values = values.astype(np.float32)
+        for i, name in enumerate(motor_names):
+            _, model = self.motors[name]
+            resolution = self.model_resolution[model]
+            values[i] = values[i] / (resolution // 2) * 180

        return values

    def revert_calibration(self, values: np.ndarray | list, motor_names: list[str] | None):
-        if not self.calibration:
-            return values
-
+        """Inverse of `apply_calibration`."""
        if motor_names is None:
            motor_names = self.motor_names

+        # Convert from the universal float32 centered degree range ]-180, 180[ to resolution range ]-resolution, resolution[
+        for i, name in enumerate(motor_names):
+            _, model = self.motors[name]
+            resolution = self.model_resolution[model]
+            values[i] = values[i] / 180 * (resolution // 2)
+
+        values = np.round(values).astype(np.int32)
+
+        # Convert from nominal range ]-resolution, resolution[ to centered signed int32 range [-2**31, 2**31[
        for i, name in enumerate(motor_names):
            homing_offset, drive_mode = self.calibration[name]
+            values[i] -= homing_offset

-            if values[i] is not None:
-                values[i] -= homing_offset
-                if drive_mode:
-                    values[i] *= -1
+            # Update direction of rotation of the motor that was matching between leader and follower to their original direction.
+            # In fact, the motor of the leader for a given joint can be assembled in an opposite direction in term of rotation
+            # than the motor of the follower on the same joint.
+            if drive_mode:
+                values[i] *= -1

        return values

+    def _read_with_motor_ids(self, motor_models, motor_ids, data_name):
+        return_list = True
+        if not isinstance(motor_ids, list):
+            return_list = False
+            motor_ids = [motor_ids]
+
+        assert_same_address(self.model_ctrl_table, motor_models, data_name)  # check the models actually passed in
+        addr, bytes = self.model_ctrl_table[motor_models[0]][data_name]
+        group = GroupSyncRead(self.port_handler, self.packet_handler, addr, bytes)
+        for idx in motor_ids:
+            group.addParam(idx)
+
+        comm = group.txRxPacket()
+        if comm != COMM_SUCCESS:
+            raise ConnectionError(
+                f"Read failed due to communication error on port {self.port_handler.port_name} for indices {motor_ids}: "
+                f"{self.packet_handler.getTxRxResult(comm)}"
+            )
+
+        values = []
+        for idx in motor_ids:
+            value = group.getData(idx, addr, bytes)
+            values.append(value)
+
+        if return_list:
+            return values
+        else:
+            return values[0]
+
    def read(self, data_name, motor_names: str | list[str] | None = None):
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(
@@ -367,9 +682,21 @@ class DynamixelMotorsBus:
        if data_name in CONVERT_UINT32_TO_INT32_REQUIRED:
            values = values.astype(np.int32)

-        if data_name in CALIBRATION_REQUIRED:
+        if data_name in CALIBRATION_REQUIRED and self.calibration is not None:
            values = self.apply_calibration(values, motor_names)

+            # We expect our motors to stay in a nominal range of [-180, 180] degrees
+            # which corresponds to a half turn rotation.
+            # However, some motors can turn a bit more, hence we extend the nominal range to [-270, 270]
+            # which is less than a full 360 degree rotation.
+            if not np.all((values > -270) & (values < 270)):
+                raise ValueError(
+                    f"Wrong motor position range detected. "
+                    f"Expected to be in [-270, +270] but in [{values.min()}, {values.max()}]. "
+                    "This might be due to a cable connection issue creating an artificial 360 degrees jump in motor values. "
+                    "You need to recalibrate by running: `python lerobot/scripts/control_robot.py calibrate`"
+                )
+
        # log the number of seconds it took to read the data from the motors
        delta_ts_name = get_log_name("delta_timestamp_s", "read", data_name, motor_names)
        self.logs[delta_ts_name] = time.perf_counter() - start_time
@@ -380,6 +707,26 @@ class DynamixelMotorsBus:

        return values

+    def _write_with_motor_ids(self, motor_models, motor_ids, data_name, values):
+        if not isinstance(motor_ids, list):
+            motor_ids = [motor_ids]
+        if not isinstance(values, list):
+            values = [values]
+
+        assert_same_address(self.model_ctrl_table, motor_models, data_name)
+        addr, bytes = self.model_ctrl_table[motor_models[0]][data_name]
+        group = GroupSyncWrite(self.port_handler, self.packet_handler, addr, bytes)
+        for idx, value in zip(motor_ids, values, strict=True):
+            data = convert_to_bytes(value, bytes)
+            group.addParam(idx, data)
+
+        comm = group.txPacket()
+        if comm != COMM_SUCCESS:
+            raise ConnectionError(
+                f"Write failed due to communication error on port {self.port_handler.port_name} for indices {motor_ids}: "
+                f"{self.packet_handler.getTxRxResult(comm)}"
+            )
+
    def write(self, data_name, values: int | float | np.ndarray, motor_names: str | list[str] | None = None):
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(
@@ -406,7 +753,7 @@ class DynamixelMotorsBus:
            motor_ids.append(motor_idx)
            models.append(model)

-        if data_name in CALIBRATION_REQUIRED:
+        if data_name in CALIBRATION_REQUIRED and self.calibration is not None:
            values = self.revert_calibration(values, motor_names)

        values = values.tolist()
@@ -422,30 +769,7 @@ class DynamixelMotorsBus:
            )

        for idx, value in zip(motor_ids, values, strict=True):
-            # Note: No need to convert back into unsigned int, since this byte preprocessing
-            # already handles it for us.
-            if bytes == 1:
-                data = [
-                    DXL_LOBYTE(DXL_LOWORD(value)),
-                ]
-            elif bytes == 2:
-                data = [
-                    DXL_LOBYTE(DXL_LOWORD(value)),
-                    DXL_HIBYTE(DXL_LOWORD(value)),
-                ]
-            elif bytes == 4:
-                data = [
-                    DXL_LOBYTE(DXL_LOWORD(value)),
-                    DXL_HIBYTE(DXL_LOWORD(value)),
-                    DXL_LOBYTE(DXL_HIWORD(value)),
-                    DXL_HIBYTE(DXL_HIWORD(value)),
-                ]
-            else:
-                raise NotImplementedError(
-                    f"Value of the number of bytes to be sent is expected to be in [1, 2, 4], but "
-                    f"{bytes} is provided instead."
-                )
-
+            data = convert_to_bytes(value, bytes)
            if init_group:
                self.group_writers[group_key].addParam(idx, data)
            else:
--- a/lerobot/common/robot_devices/robots/factory.py
+++ b/lerobot/common/robot_devices/robots/factory.py
@@ -1,46 +1,7 @@
-def make_robot(name):
-    if name == "koch":
-        # TODO(rcadene): Add configurable robot from command line and yaml config
-        # TODO(rcadene): Add example with and without cameras
-        from lerobot.common.robot_devices.cameras.opencv import OpenCVCamera
-        from lerobot.common.robot_devices.motors.dynamixel import DynamixelMotorsBus
-        from lerobot.common.robot_devices.robots.koch import KochRobot
+import hydra
+from omegaconf import DictConfig

-        robot = KochRobot(
-            leader_arms={
-                "main": DynamixelMotorsBus(
-                    port="/dev/tty.usbmodem575E0031751",
-                    motors={
-                        # name: (index, model)
-                        "shoulder_pan": (1, "xl330-m077"),
-                        "shoulder_lift": (2, "xl330-m077"),
-                        "elbow_flex": (3, "xl330-m077"),
-                        "wrist_flex": (4, "xl330-m077"),
-                        "wrist_roll": (5, "xl330-m077"),
-                        "gripper": (6, "xl330-m077"),
-                    },
-                ),
-            },
-            follower_arms={
-                "main": DynamixelMotorsBus(
-                    port="/dev/tty.usbmodem575E0032081",
-                    motors={
-                        # name: (index, model)
-                        "shoulder_pan": (1, "xl430-w250"),
-                        "shoulder_lift": (2, "xl430-w250"),
-                        "elbow_flex": (3, "xl330-m288"),
-                        "wrist_flex": (4, "xl330-m288"),
-                        "wrist_roll": (5, "xl330-m288"),
-                        "gripper": (6, "xl330-m288"),
-                    },
-                ),
-            },
-            cameras={
-                "laptop": OpenCVCamera(0, fps=30, width=640, height=480),
-                "phone": OpenCVCamera(1, fps=30, width=640, height=480),
-            },
-        )
-    else:
-        raise ValueError(f"Robot '{name}' not found.")

+def make_robot(cfg: DictConfig):
+    robot = hydra.utils.instantiate(cfg)
    return robot
--- a/lerobot/common/robot_devices/robots/koch.py
+++ b/lerobot/common/robot_devices/robots/koch.py
@@ -1,129 +1,51 @@
+import logging
 import pickle
 import time
 from dataclasses import dataclass, field, replace
 from pathlib import Path
+from typing import Sequence

 import numpy as np
 import torch

 from lerobot.common.robot_devices.cameras.utils import Camera
 from lerobot.common.robot_devices.motors.dynamixel import (
-    DriveMode,
-    DynamixelMotorsBus,
    OperatingMode,
    TorqueMode,
+    convert_degrees_to_steps,
 )
 from lerobot.common.robot_devices.motors.utils import MotorsBus
 from lerobot.common.robot_devices.utils import RobotDeviceAlreadyConnectedError, RobotDeviceNotConnectedError

-URL_HORIZONTAL_POSITION = {
-    "follower": "https://raw.githubusercontent.com/huggingface/lerobot/main/media/koch/follower_horizontal.png",
-    "leader": "https://raw.githubusercontent.com/huggingface/lerobot/main/media/koch/leader_horizontal.png",
-}
-URL_90_DEGREE_POSITION = {
-    "follower": "https://raw.githubusercontent.com/huggingface/lerobot/main/media/koch/follower_90_degree.png",
-    "leader": "https://raw.githubusercontent.com/huggingface/lerobot/main/media/koch/leader_90_degree.png",
-}
-
 ########################################################################
 # Calibration logic
 ########################################################################

-TARGET_HORIZONTAL_POSITION = np.array([0, -1024, 1024, 0, -1024, 0])
-TARGET_90_DEGREE_POSITION = np.array([1024, 0, 0, 1024, 0, -1024])
-GRIPPER_OPEN = np.array([-400])
+URL_TEMPLATE = (
+    "https://raw.githubusercontent.com/huggingface/lerobot/main/media/{robot}/{arm}_{position}.webp"
+)
+
+# In nominal degree range ]-180, +180[
+ZERO_POSITION_DEGREE = 0
+ROTATED_POSITION_DEGREE = 90


-def apply_homing_offset(values: np.array, homing_offset: np.array) -> np.array:
-    for i in range(len(values)):
-        if values[i] is not None:
-            values[i] += homing_offset[i]
-    return values
+def assert_drive_mode(drive_mode):
+    # `drive_mode` is in [0,1] with 0 means original rotation direction for the motor, and 1 means inverted.
+    if not np.all(np.isin(drive_mode, [0, 1])):
+        raise ValueError(f"`drive_mode` contains values other than 0 or 1: ({drive_mode})")


-def apply_drive_mode(values: np.array, drive_mode: np.array) -> np.array:
-    for i in range(len(values)):
-        if values[i] is not None and drive_mode[i]:
-            values[i] = -values[i]
-    return values
+def apply_drive_mode(position, drive_mode):
+    assert_drive_mode(drive_mode)
+    # Convert `drive_mode` from [0, 1] with 0 indicates original rotation direction and 1 inverted,
+    # to [-1, 1] with 1 indicates original rotation direction and -1 inverted.
+    signed_drive_mode = -(drive_mode * 2 - 1)
+    position *= signed_drive_mode
+    return position


-def apply_calibration(values: np.array, homing_offset: np.array, drive_mode: np.array) -> np.array:
-    values = apply_drive_mode(values, drive_mode)
-    values = apply_homing_offset(values, homing_offset)
-    return values
-
-
-def revert_calibration(values: np.array, homing_offset: np.array, drive_mode: np.array) -> np.array:
-    """
-    Transform working position into real position for the robot.
-    """
-    values = apply_homing_offset(
-        values,
-        np.array([-homing_offset if homing_offset is not None else None for homing_offset in homing_offset]),
-    )
-    values = apply_drive_mode(values, drive_mode)
-    return values
-
-
-def revert_appropriate_positions(positions: np.array, drive_mode: list[bool]) -> np.array:
-    for i, revert in enumerate(drive_mode):
-        if not revert and positions[i] is not None:
-            positions[i] = -positions[i]
-    return positions
-
-
-def compute_corrections(positions: np.array, drive_mode: list[bool], target_position: np.array) -> np.array:
-    correction = revert_appropriate_positions(positions, drive_mode)
-
-    for i in range(len(positions)):
-        if correction[i] is not None:
-            if drive_mode[i]:
-                correction[i] -= target_position[i]
-            else:
-                correction[i] += target_position[i]
-
-    return correction
-
-
-def compute_nearest_rounded_positions(positions: np.array) -> np.array:
-    return np.array(
-        [
-            round(positions[i] / 1024) * 1024 if positions[i] is not None else None
-            for i in range(len(positions))
-        ]
-    )
-
-
-def compute_homing_offset(
-    arm: DynamixelMotorsBus, drive_mode: list[bool], target_position: np.array
-) -> np.array:
-    # Get the present positions of the servos
-    present_positions = apply_calibration(
-        arm.read("Present_Position"), np.array([0, 0, 0, 0, 0, 0]), drive_mode
-    )
-
-    nearest_positions = compute_nearest_rounded_positions(present_positions)
-    correction = compute_corrections(nearest_positions, drive_mode, target_position)
-    return correction
-
-
-def compute_drive_mode(arm: DynamixelMotorsBus, offset: np.array):
-    # Get current positions
-    present_positions = apply_calibration(
-        arm.read("Present_Position"), offset, np.array([False, False, False, False, False, False])
-    )
-
-    nearest_positions = compute_nearest_rounded_positions(present_positions)
-
-    # construct 'drive_mode' list comparing nearest_positions and TARGET_90_DEGREE_POSITION
-    drive_mode = []
-    for i in range(len(nearest_positions)):
-        drive_mode.append(nearest_positions[i] != TARGET_90_DEGREE_POSITION[i])
-    return drive_mode
-
-
-def reset_arm(arm: MotorsBus):
+def reset_torque_mode(arm: MotorsBus):
    # To be configured, all servos must be in "torque disable" mode
    arm.write("Torque_Enable", TorqueMode.DISABLED.value)

@@ -132,55 +54,95 @@ def reset_arm(arm: MotorsBus):
    # you could end up with a servo with a position 0 or 4095 at a crucial point See [
    # https://emanual.robotis.com/docs/en/dxl/x/x_series/#operating-mode11]
    all_motors_except_gripper = [name for name in arm.motor_names if name != "gripper"]
-    arm.write("Operating_Mode", OperatingMode.EXTENDED_POSITION.value, all_motors_except_gripper)
+    if len(all_motors_except_gripper) > 0:
+        arm.write("Operating_Mode", OperatingMode.EXTENDED_POSITION.value, all_motors_except_gripper)

-    # TODO(rcadene): why?
-    # Use 'position control current based' for gripper
+    # Use 'position control current based' for gripper to be limited by the limit of the current.
+    # For the follower gripper, it means it can grasp an object without forcing too much even though
+    # its goal position is a complete grasp (both gripper fingers are ordered to join and reach a touch).
+    # For the leader gripper, it means we can use it as a physical trigger, since we can force with our finger
+    # to make it move, and it will move back to its original target position when we release the force.
    arm.write("Operating_Mode", OperatingMode.CURRENT_CONTROLLED_POSITION.value, "gripper")

-    # Make sure the native calibration (homing offset abd drive mode) is disabled, since we use our own calibration layer to be more generic
-    arm.write("Homing_Offset", 0)
-    arm.write("Drive_Mode", DriveMode.NON_INVERTED.value)
-

 def run_arm_calibration(arm: MotorsBus, name: str, arm_type: str):
-    """Example of usage:
+    """This function ensures that a neural network trained on data collected on a given robot
+    can work on another robot. For instance before calibration, setting the same goal position
+    for each motor of two different robots will get two very different positions. But after calibration,
+    the two robots will move to the same position. To this end, this function computes the homing offset
+    and the drive mode for each motor of a given robot.
+
+    Homing offset is used to shift the motor position to a ]-2048, +2048[ nominal range (when the motor uses 2048 steps
+    to complete half a turn). This range is set around an arbitrary "zero position" corresponding to all motor positions
+    being 0. During the calibration process, you will need to manually move the robot to this "zero position".
+
+    Drive mode is used to invert the rotation direction of the motor. This is useful when some motors have been assembled
+    in the opposite orientation for some robots. During the calibration process, you will need to manually move the robot
+    to the "rotated position".
+
+    After calibration, the homing offsets and drive modes are stored in a cache.
+
+    Example of usage:
    ```python
    run_arm_calibration(arm, "left", "follower")
    ```
    """
-    reset_arm(arm)
+    reset_torque_mode(arm)

-    # TODO(rcadene): document what position 1 mean
-    print(
-        f"Please move the '{name} {arm_type}' arm to the horizontal position (gripper fully closed, see {URL_HORIZONTAL_POSITION[arm_type]})"
-    )
+    print(f"\nRunning calibration of {name} {arm_type}...")
+
+    print("\nMove arm to zero position")
+    print("See: " + URL_TEMPLATE.format(robot="koch", arm=arm_type, position="zero"))
    input("Press Enter to continue...")

-    horizontal_homing_offset = compute_homing_offset(
-        arm, [False, False, False, False, False, False], TARGET_HORIZONTAL_POSITION
-    )
+    # We arbitrarily chose our zero target position to be a straight horizontal position with gripper upwards and closed.
+    # It is easy to identify and all motors are in a "quarter turn" position. Once calibration is done, this position will
+    # correspond to every motor angle being 0. If you set all 0 as Goal Position, the arm will move to this position.
+    zero_position = convert_degrees_to_steps(ZERO_POSITION_DEGREE, arm.motor_models)

-    # TODO(rcadene): document what position 2 mean
-    print(
-        f"Please move the '{name} {arm_type}' arm to the 90 degree position (gripper fully open, see {URL_90_DEGREE_POSITION[arm_type]})"
-    )
+    def _compute_nearest_rounded_position(position, models):
+        # TODO(rcadene): Rework this function since some motors cant physically rotate a quarter turn
+        # (e.g. the gripper of Aloha arms can only rotate ~50 degree)
+        quarter_turn_degree = 90
+        quarter_turn = convert_degrees_to_steps(quarter_turn_degree, models)
+        nearest_pos = np.round(position.astype(float) / quarter_turn) * quarter_turn
+        return nearest_pos.astype(position.dtype)
+
+    # Compute homing offset so that `present_position + homing_offset ~= target_position`.
+    position = arm.read("Present_Position")
+    position = _compute_nearest_rounded_position(position, arm.motor_models)
+    homing_offset = zero_position - position
+
+    print("\nMove arm to rotated target position")
+    print("See: " + URL_TEMPLATE.format(robot="koch", arm=arm_type, position="rotated"))
    input("Press Enter to continue...")

-    drive_mode = compute_drive_mode(arm, horizontal_homing_offset)
-    homing_offset = compute_homing_offset(arm, drive_mode, TARGET_90_DEGREE_POSITION)
+    # The rotated target position corresponds to a rotation of a quarter turn from the zero position.
+    # This allows to identify the rotation direction of each motor.
+    # For instance, if the motor rotates 90 degree, and its value is -90 after applying the homing offset, then we know its rotation direction
+    # is inverted. However, for the calibration to be successful, we need everyone to follow the same target position.
+    # Sometimes, there is only one possible rotation direction. For instance, if the gripper is closed, there is only one direction which
+    # corresponds to opening the gripper. When the rotation direction is ambiguous, we arbitrarily rotate clockwise from the point of view
+    # of the previous motor in the kinematic chain.
+    rotated_position = convert_degrees_to_steps(ROTATED_POSITION_DEGREE, arm.motor_models)

-    # Invert offset for all drive_mode servos
-    for i in range(len(drive_mode)):
-        if drive_mode[i]:
-            homing_offset[i] = -homing_offset[i]
+    # Find drive mode by rotating each motor by a quarter of a turn.
+    # Drive mode indicates if the motor rotation direction should be inverted (=1) or not (=0).
+    position = arm.read("Present_Position")
+    position += homing_offset
+    position = _compute_nearest_rounded_position(position, arm.motor_models)
+    drive_mode = (position != rotated_position).astype(np.int32)

-    print("Calibration is done!")
+    # Re-compute homing offset to take into account drive mode
+    position = arm.read("Present_Position")
+    position = apply_drive_mode(position, drive_mode)
+    position = _compute_nearest_rounded_position(position, arm.motor_models)
+    homing_offset = rotated_position - position

-    print("=====================================")
-    print("      HOMING_OFFSET: ", " ".join([str(i) for i in homing_offset]))
-    print("      DRIVE_MODE: ", " ".join([str(i) for i in drive_mode]))
-    print("=====================================")
+    print("\nMove arm to rest position")
+    print("See: " + URL_TEMPLATE.format(robot="koch", arm=arm_type, position="rest"))
+    input("Press Enter to continue...")
+    print()

    return homing_offset, drive_mode

@@ -204,10 +166,39 @@ class KochRobotConfig:
    follower_arms: dict[str, MotorsBus] = field(default_factory=lambda: {})
    cameras: dict[str, Camera] = field(default_factory=lambda: {})

+    # Optionally limit the magnitude of the relative positional target vector for safety purposes.
+    # Set this to a positive scalar to have the same value for all motors, or a list that is the same length
+    # as the number of motors in your follower arms (assumes all follower arms have the same number of
+    # motors).
+    max_relative_target: list[float] | float | None = None
+
+    # Optionally set the leader arm in torque mode with the gripper motor set to this angle. This makes it
+    # possible to squeeze the gripper and have it spring back to an open position on its own. If None, the
+    # gripper is not put in torque mode.
+    gripper_open_degree: float | None = None
+
+    def __setattr__(self, prop: str, val):
+        if prop == "max_relative_target" and val is not None and isinstance(val, Sequence):
+            for name in self.follower_arms:
+                if len(self.follower_arms[name].motors) != len(val):
+                    raise ValueError(
+                        f"len(max_relative_target)={len(val)} but the follower arm with name {name} has "
+                        f"{len(self.follower_arms[name].motors)} motors. Please make sure that the "
+                        f"`max_relative_target` list has as many parameters as there are motors per arm. "
+                        "Note: This feature does not yet work with robots where different follower arms have "
+                        "different numbers of motors."
+                    )
+        super().__setattr__(prop, val)
+

 class KochRobot:
    # TODO(rcadene): Implement force feedback
-    """Tau Robotics: https://tau-robotics.com
+    """This class allows to control any Koch robot of various number of motors.
+
+    A few versions are available:
+    - [Koch v1.0](https://github.com/AlexanderKoch-Koch/low_cost_robot), with and without the wrist-to-elbow expansion, which was developed
+    by Alexander Koch from [Tau Robotics](https://tau-robotics.com).
+    - [Koch v1.1](https://github.com/jess-moss/koch-v1-1), which was developed by Jess Moss.

    Example of highest frequency teleoperation without camera:
    ```python
@@ -240,7 +231,10 @@ class KochRobot:
            },
        ),
    }
-    robot = KochRobot(leader_arms, follower_arms)
+    robot = KochRobot(
+        leader_arms=leader_arms,
+        follower_arms=follower_arms,
+    )

    # Connect motors buses and cameras if any (Required)
    robot.connect()
@@ -252,7 +246,10 @@ class KochRobot:
    Example of highest frequency data collection without camera:
    ```python
    # Assumes leader and follower arms have been instantiated already (see first example)
-    robot = KochRobot(leader_arms, follower_arms)
+    robot = KochRobot(
+        leader_arms=leader_arms,
+        follower_arms=follower_arms,
+    )
    robot.connect()
    while True:
        observation, action = robot.teleop_step(record_data=True)
@@ -261,16 +258,20 @@ class KochRobot:
    Example of highest frequency data collection with cameras:
    ```python
    # Defines how to communicate with 2 cameras connected to the computer.
-    # Here, the webcam of the mackbookpro and the iphone (connected in USB to the macbookpro)
+    # Here, the webcam of the laptop and the phone (connected in USB to the laptop)
    # can be reached respectively using the camera indices 0 and 1. These indices can be
    # arbitrary. See the documentation of `OpenCVCamera` to find your own camera indices.
    cameras = {
-        "macbookpro": OpenCVCamera(camera_index=0, fps=30, width=640, height=480),
-        "iphone": OpenCVCamera(camera_index=1, fps=30, width=640, height=480),
+        "laptop": OpenCVCamera(camera_index=0, fps=30, width=640, height=480),
+        "phone": OpenCVCamera(camera_index=1, fps=30, width=640, height=480),
    }

    # Assumes leader and follower arms have been instantiated already (see first example)
-    robot = KochRobot(leader_arms, follower_arms, cameras)
+    robot = KochRobot(
+        leader_arms=leader_arms,
+        follower_arms=follower_arms,
+        cameras=cameras,
+    )
    robot.connect()
    while True:
        observation, action = robot.teleop_step(record_data=True)
@@ -279,7 +280,11 @@ class KochRobot:
    Example of controlling the robot with a policy (without running multiple policies in parallel to ensure highest frequency):
    ```python
    # Assumes leader and follower arms + cameras have been instantiated already (see previous example)
-    robot = KochRobot(leader_arms, follower_arms, cameras)
+    robot = KochRobot(
+        leader_arms=leader_arms,
+        follower_arms=follower_arms,
+        cameras=cameras,
+    )
    robot.connect()
    while True:
        # Uses the follower arms and cameras to capture an observation
@@ -330,23 +335,27 @@ class KochRobot:

        # Connect the arms
        for name in self.follower_arms:
+            print(f"Connecting {name} follower arm.")
            self.follower_arms[name].connect()
+            print(f"Connecting {name} leader arm.")
            self.leader_arms[name].connect()

        # Reset the arms and load or run calibration
        if self.calibration_path.exists():
            # Reset all arms before setting calibration
            for name in self.follower_arms:
-                reset_arm(self.follower_arms[name])
+                reset_torque_mode(self.follower_arms[name])
            for name in self.leader_arms:
-                reset_arm(self.leader_arms[name])
+                reset_torque_mode(self.leader_arms[name])

            with open(self.calibration_path, "rb") as f:
                calibration = pickle.load(f)
        else:
+            print(f"Missing calibration file '{self.calibration_path}'. Starting calibration precedure.")
            # Run calibration process which begins by reseting all arms
            calibration = self.run_calibration()

+            print(f"Calibration is done! Saving calibration file '{self.calibration_path}'")
            self.calibration_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.calibration_path, "wb") as f:
                pickle.dump(calibration, f)
@@ -366,13 +375,15 @@ class KochRobot:

        # Enable torque on all motors of the follower arms
        for name in self.follower_arms:
+            print(f"Activating torque on {name} follower arm.")
            self.follower_arms[name].write("Torque_Enable", 1)

-        # Enable torque on the gripper of the leader arms, and move it to 45 degrees,
-        # so that we can use it as a trigger to close the gripper of the follower arms.
-        for name in self.leader_arms:
-            self.leader_arms[name].write("Torque_Enable", 1, "gripper")
-            self.leader_arms[name].write("Goal_Position", GRIPPER_OPEN, "gripper")
+        if self.config.gripper_open_degree is not None:
+            # Set the leader arm in torque mode with the gripper motor set to an angle. This makes it possible
+            # to squeeze the gripper and have it spring back to an open position on its own.
+            for name in self.leader_arms:
+                self.leader_arms[name].write("Torque_Enable", 1, "gripper")
+                self.leader_arms[name].write("Goal_Position", self.config.gripper_open_degree, "gripper")

        # Connect the cameras
        for name in self.cameras:
@@ -407,12 +418,12 @@ class KochRobot:
                "KochRobot is not connected. You need to run `robot.connect()`."
            )

-        # Prepare to assign the positions of the leader to the follower
+        # Prepare to assign the position of the leader to the follower
        leader_pos = {}
        for name in self.leader_arms:
-            now = time.perf_counter()
+            before_lread_t = time.perf_counter()
            leader_pos[name] = self.leader_arms[name].read("Present_Position")
-            self.logs[f"read_leader_{name}_pos_dt_s"] = time.perf_counter() - now
+            self.logs[f"read_leader_{name}_pos_dt_s"] = time.perf_counter() - before_lread_t

        follower_goal_pos = {}
        for name in self.leader_arms:
@@ -420,9 +431,9 @@ class KochRobot:

        # Send action
        for name in self.follower_arms:
-            now = time.perf_counter()
-            self.follower_arms[name].write("Goal_Position", follower_goal_pos[name])
-            self.logs[f"write_follower_{name}_goal_pos_dt_s"] = time.perf_counter() - now
+            before_fwrite_t = time.perf_counter()
+            self.send_action(torch.tensor(follower_goal_pos[name]), [name])
+            self.logs[f"write_follower_{name}_goal_pos_dt_s"] = time.perf_counter() - before_fwrite_t

        # Early exit when recording data is not requested
        if not record_data:
@@ -432,9 +443,9 @@ class KochRobot:
        # Read follower position
        follower_pos = {}
        for name in self.follower_arms:
-            now = time.perf_counter()
+            before_fread_t = time.perf_counter()
            follower_pos[name] = self.follower_arms[name].read("Present_Position")
-            self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - now
+            self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - before_fread_t

        # Create state by concatenating follower current position
        state = []
@@ -453,10 +464,10 @@ class KochRobot:
        # Capture images from cameras
        images = {}
        for name in self.cameras:
-            now = time.perf_counter()
+            before_camread_t = time.perf_counter()
            images[name] = self.cameras[name].async_read()
            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
-            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - now
+            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t

        # Populate output dictionnaries and format to pytorch
        obs_dict, action_dict = {}, {}
@@ -477,9 +488,9 @@ class KochRobot:
        # Read follower position
        follower_pos = {}
        for name in self.follower_arms:
-            now = time.perf_counter()
+            before_fread_t = time.perf_counter()
            follower_pos[name] = self.follower_arms[name].read("Present_Position")
-            self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - now
+            self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - before_fread_t

        # Create state by concatenating follower current position
        state = []
@@ -491,37 +502,67 @@ class KochRobot:
        # Capture images from cameras
        images = {}
        for name in self.cameras:
-            now = time.perf_counter()
+            before_camread_t = time.perf_counter()
            images[name] = self.cameras[name].async_read()
            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
-            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - now
+            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t

        # Populate output dictionnaries and format to pytorch
        obs_dict = {}
        obs_dict["observation.state"] = torch.from_numpy(state)
        for name in self.cameras:
-            # Convert to pytorch format: channel first and float32 in [0,1]
-            img = torch.from_numpy(images[name])
-            img = img.type(torch.float32) / 255
-            img = img.permute(2, 0, 1).contiguous()
-            obs_dict[f"observation.images.{name}"] = img
+            obs_dict[f"observation.images.{name}"] = torch.from_numpy(images[name])
        return obs_dict

-    def send_action(self, action: torch.Tensor):
-        """The provided action is expected to be a vector."""
+    def send_action(self, action: torch.Tensor, follower_names: list[str] | None = None):
+        """Command the follower arms to move to a target joint configuration.
+
+        The relative action magnitude may be clipped depending on the configuration parameter
+        `max_relative_target`.
+
+        Args:
+            action: tensor of joint positions, concatenated in the order of the controlled follower arms.
+            follower_names: Pass follower arm names to only control a subset of all the follower arms.
+        """
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(
                "KochRobot is not connected. You need to run `robot.connect()`."
            )

+        if follower_names is None:
+            follower_names = list(self.follower_arms)
+        elif not set(follower_names).issubset(self.follower_arms):
+            raise ValueError(
+                f"You provided {follower_names=} but only the following arms are registered: "
+                f"{list(self.follower_arms)}"
+            )
+
        from_idx = 0
        to_idx = 0
        follower_goal_pos = {}
-        for name in self.follower_arms:
-            if name in self.follower_arms:
-                to_idx += len(self.follower_arms[name].motor_names)
-                follower_goal_pos[name] = action[from_idx:to_idx].numpy()
-                from_idx = to_idx
+        for name in follower_names:
+            to_idx += len(self.follower_arms[name].motor_names)
+            this_action = action[from_idx:to_idx]
+            # Without a configured limit, the requested action is sent unchanged.
+            safe_action = this_action
+            if self.config.max_relative_target is not None:
+                # A scalar limit broadcasts over all motors; a list provides one limit per motor.
+                max_relative_target = torch.tensor(self.config.max_relative_target)
+                # Cap relative action target magnitude for safety.
+                current_pos = torch.tensor(self.follower_arms[name].read("Present_Position"))
+                diff = this_action - current_pos
+                safe_diff = torch.minimum(diff, max_relative_target)
+                safe_diff = torch.maximum(safe_diff, -max_relative_target)
+                safe_action = current_pos + safe_diff
+                if not torch.allclose(safe_action, this_action):
+                    logging.warning(
+                        "Relative action magnitude had to be clamped to be safe.\n"
+                        f"  requested relative action target: {diff}\n"
+                        f"    clamped relative action target: {safe_diff}"
+                    )
+
+            follower_goal_pos[name] = safe_action.numpy()
+            from_idx = to_idx

-        for name in self.follower_arms:
+        for name in follower_names:
            self.follower_arms[name].write("Goal_Position", follower_goal_pos[name].astype(np.int32))
--- a/lerobot/common/utils/utils.py
+++ b/lerobot/common/utils/utils.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import os
 import os.path as osp
 import random
 from contextlib import contextmanager
@@ -27,6 +28,12 @@ import torch
 from omegaconf import DictConfig


+def inside_slurm():
+    """Check whether the python process was launched through slurm"""
+    # TODO(rcadene): return False for interactive mode `--pty bash`
+    return "SLURM_JOB_ID" in os.environ
+
+
 def get_safe_torch_device(cfg_device: str, log: bool = False) -> torch.device:
    """Given a string, return a torch.device with checks on whether the device is available."""
    match cfg_device:
--- a/lerobot/configs/default.yaml
+++ b/lerobot/configs/default.yaml
@@ -32,19 +32,54 @@ video_backend: pyav

 training:
  offline_steps: ???
-  # NOTE: `online_steps` is not implemented yet. It's here as a placeholder.
-  online_steps: ???
-  online_steps_between_rollouts: ???
-  online_sampling_ratio: 0.5
-  # `online_env_seed` is used for environments for online training data rollouts.
-  online_env_seed: ???
+
+  # Number of workers for the offline training dataloader.
+  num_workers: 4
+
+  batch_size: ???
+
  eval_freq: ???
  log_freq: 200
  save_checkpoint: true
  # Checkpoint is saved every `save_freq` training iterations and after the last training step.
  save_freq: ???
-  num_workers: 4
-  batch_size: ???
+
+  # Online training. Note that the online training loop adopts most of the options above apart from the
+  # dataloader options, unless otherwise specified.
+  # The online training loop looks something like:
+  #
+  # for i in range(online_steps):
+  #     do_online_rollout_and_update_online_buffer()
+  #     for j in range(online_steps_between_rollouts):
+  #         batch = next(dataloader_with_offline_and_online_data)
+  #         loss = policy(batch)
+  #         loss.backward()
+  #         optimizer.step()
+  #
+  online_steps: ???
+  # How many episodes to collect at once when we reach the online rollout part of the training loop.
+  online_rollout_n_episodes: 1
+  # The number of environments to use in the gym.vector.VectorEnv. This ends up also being the batch size for
+  # the policy. Ideally you should set this to be an even divisor of online_rollout_n_episodes.
+  online_rollout_batch_size: 1
+  # How many optimization steps (forward, backward, optimizer step) to do between running rollouts.
+  online_steps_between_rollouts: null
+  # The proportion of online samples (vs offline samples) to include in the online training batches.
+  online_sampling_ratio: 0.5
+  # First seed to use for the online rollout environment. Seeds for subsequent rollouts are incremented by 1.
+  online_env_seed: null
+  # Sets the maximum number of frames that are stored in the online buffer for online training. The buffer is
+  # FIFO.
+  online_buffer_capacity: null
+  # The minimum number of frames to have in the online buffer before commencing online training.
+  # If online_buffer_seed_size > online_rollout_n_episodes, the rollout will be run multiple times until the
+  # seed size condition is satisfied.
+  online_buffer_seed_size: 0
+  # Whether to run the online rollouts asynchronously. This means we can run the online training steps in
+  # parallel with the rollouts. This might be advised if your GPU has the bandwidth to handle training
+  # + eval + environment rendering simultaneously.
+  do_online_rollout_async: false
+
  image_transforms:
  # These transforms are all using standard torchvision.transforms.v2
  # You can find out how these transformations affect images here:
@@ -85,7 +120,7 @@ eval:
  # `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
  batch_size: 1
  # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
-  use_async_envs: false
+  use_async_envs: true

 wandb:
  enable: false
--- a/lerobot/configs/env/aloha.yaml
+++ b/lerobot/configs/env/aloha.yaml
@@ -2,6 +2,11 @@

 fps: 50

+eval:
+  # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
+  # set it to false to avoid some problems of the aloha env
+  use_async_envs: false
+
 env:
  name: aloha
  task: AlohaInsertion-v0
--- a/lerobot/configs/env/xarm.yaml
+++ b/lerobot/configs/env/xarm.yaml
@@ -2,6 +2,11 @@

 fps: 15

+eval:
+  # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
+  # set it to false to avoid some problems of the aloha env
+  use_async_envs: false
+
 env:
  name: xarm
  task: XarmLift-v0
@@ -9,7 +14,7 @@ env:
  state_dim: 4
  action_dim: 4
  fps: ${fps}
-  episode_length: 25
+  episode_length: 200
  gym:
    obs_type: pixels_agent_pos
    render_mode: rgb_array
--- a/lerobot/configs/policy/tdmpc.yaml
+++ b/lerobot/configs/policy/tdmpc.yaml
@@ -4,19 +4,30 @@ seed: 1
 dataset_repo_id: lerobot/xarm_lift_medium

 training:
-  offline_steps: 25000
-  # TODO(alexander-soare): uncomment when online training gets reinstated
-  online_steps: 0  # 25000 not implemented yet
-  eval_freq: 5000
-  online_steps_between_rollouts: 1
-  online_sampling_ratio: 0.5
-  online_env_seed: 10000
-  log_freq: 100
+  offline_steps: 50000
+
+  num_workers: 4

  batch_size: 256
  grad_clip_norm: 10.0
  lr: 3e-4

+  eval_freq: 5000
+  log_freq: 100
+
+  online_steps: 50000
+  online_rollout_n_episodes: 1
+  online_rollout_batch_size: 1
+  # Note: in FOWM `online_steps_between_rollouts` is actually dynamically set to match exactly the length of
+  # the last sampled episode.
+  online_steps_between_rollouts: 50
+  online_sampling_ratio: 0.5
+  online_env_seed: 10000
+  # FOWM Push uses 10000 for `online_buffer_capacity`. Given that their maximum episode length for this task
+  # is 25, 10000 is approx 400 of their episodes worth. Since our episodes are about 8 times longer, we'll use
+  # 80000.
+  online_buffer_capacity: 80000
+
  delta_timestamps:
    observation.image: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
    observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
@@ -31,6 +42,7 @@ policy:
  # Input / output structure.
  n_action_repeats: 2
  horizon: 5
+  n_action_steps: 1

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
--- a/lerobot/configs/policy/tdmpc_pusht_keypoints.yaml
+++ b/lerobot/configs/policy/tdmpc_pusht_keypoints.yaml
@@ -0,0 +1,105 @@
+# @package _global_
+
+# Train with:
+#
+# python lerobot/scripts/train.py \
+#   env=pusht \
+#   env.gym.obs_type=environment_state_agent_pos \
+#   policy=tdmpc_pusht_keypoints \
+#   eval.batch_size=50 \
+#   eval.n_episodes=50 \
+#   eval.use_async_envs=true \
+#   device=cuda \
+#   use_amp=true
+
+seed: 1
+dataset_repo_id: lerobot/pusht_keypoints
+
+training:
+  offline_steps: 0
+
+  # Offline training dataloader
+  num_workers: 4
+
+  batch_size: 256
+  grad_clip_norm: 10.0
+  lr: 3e-4
+
+  eval_freq: 10000
+  log_freq: 500
+  save_freq: 50000
+
+  online_steps: 1000000
+  online_rollout_n_episodes: 10
+  online_rollout_batch_size: 10
+  online_steps_between_rollouts: 1000
+  online_sampling_ratio: 1.0
+  online_env_seed: 10000
+  online_buffer_capacity: 40000
+  online_buffer_seed_size: 0
+  do_online_rollout_async: false
+
+  delta_timestamps:
+    observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
+    observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
+    action: "[i / ${fps} for i in range(${policy.horizon})]"
+    next.reward: "[i / ${fps} for i in range(${policy.horizon})]"
+
+policy:
+  name: tdmpc
+
+  pretrained_model_path:
+
+  # Input / output structure.
+  n_action_repeats: 1
+  horizon: 5
+  n_action_steps: 5
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.environment_state: [16]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.environment_state: min_max
+    observation.state: min_max
+  output_normalization_modes:
+    action: min_max
+
+  # Architecture / modeling.
+  # Neural networks.
+  image_encoder_hidden_dim: 32
+  state_encoder_hidden_dim: 256
+  latent_dim: 50
+  q_ensemble_size: 5
+  mlp_dim: 512
+  # Reinforcement learning.
+  discount: 0.98
+
+  # Inference.
+  use_mpc: true
+  cem_iterations: 6
+  max_std: 2.0
+  min_std: 0.05
+  n_gaussian_samples: 512
+  n_pi_samples: 51
+  uncertainty_regularizer_coeff: 1.0
+  n_elites: 50
+  elite_weighting_temperature: 0.5
+  gaussian_mean_momentum: 0.1
+
+  # Training and loss computation.
+  max_random_shift_ratio: 0.0476
+  # Loss coefficients.
+  reward_coeff: 0.5
+  expectile_weight: 0.9
+  value_coeff: 0.1
+  consistency_coeff: 20.0
+  advantage_scaling: 3.0
+  pi_coeff: 0.5
+  temporal_decay_coeff: 0.5
+  # Target model.
+  target_model_momentum: 0.995
--- a/lerobot/configs/robot/koch.yaml
+++ b/lerobot/configs/robot/koch.yaml
@@ -0,0 +1,46 @@
+_target_: lerobot.common.robot_devices.robots.koch.KochRobot
+calibration_path: .cache/calibration/koch.pkl
+leader_arms:
+  main:
+    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
+    port: /dev/tty.usbmodem575E0031751
+    motors:
+      # name: (index, model)
+      shoulder_pan: [1, "xl330-m077"]
+      shoulder_lift: [2, "xl330-m077"]
+      elbow_flex: [3, "xl330-m077"]
+      wrist_flex: [4, "xl330-m077"]
+      wrist_roll: [5, "xl330-m077"]
+      gripper: [6, "xl330-m077"]
+follower_arms:
+  main:
+    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
+    port: /dev/tty.usbmodem575E0032081
+    motors:
+      # name: (index, model)
+      shoulder_pan: [1, "xl430-w250"]
+      shoulder_lift: [2, "xl430-w250"]
+      elbow_flex: [3, "xl330-m288"]
+      wrist_flex: [4, "xl330-m288"]
+      wrist_roll: [5, "xl330-m288"]
+      gripper: [6, "xl330-m288"]
+cameras:
+  laptop:
+    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
+    camera_index: 0
+    fps: 30
+    width: 640
+    height: 480
+  phone:
+    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
+    camera_index: 1
+    fps: 30
+    width: 640
+    height: 480
+# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
+# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
+# the number of motors in your follower arms.
+max_relative_target: null
+# Sets the leader arm in torque mode with the gripper motor set to this angle. This makes it possible
+# to squeeze the gripper and have it spring back to an open position on its own.
+gripper_open_degree: 35.156
--- a/lerobot/scripts/control_robot.py
+++ b/lerobot/scripts/control_robot.py
@@ -1,9 +1,22 @@
 """
+Utilities to control a robot.
+
+Useful to record a dataset, replay a recorded episode, run the policy on your robot
+and record an evaluation dataset, and to recalibrate your robot if needed.
+
 Examples of usage:

+- Recalibrate your robot:
+```bash
+python lerobot/scripts/control_robot.py calibrate
+```
+
 - Unlimited teleoperation at highest frequency (~200 Hz is expected), to exit with CTRL+C:
 ```bash
 python lerobot/scripts/control_robot.py teleoperate
+
+# Remove the cameras from the robot definition. They are not used in 'teleoperate' anyway.
+python lerobot/scripts/control_robot.py teleoperate --robot-overrides '~cameras'
 ```

 - Unlimited teleoperation at a limited frequency of 30 Hz, to simulate data recording frequency:
@@ -14,7 +27,7 @@ python lerobot/scripts/control_robot.py teleoperate \

 - Record one episode in order to test replay:
 ```bash
-python lerobot/scripts/control_robot.py record_dataset \
+python lerobot/scripts/control_robot.py record \
    --fps 30 \
    --root tmp/data \
    --repo-id $USER/koch_test \
@@ -32,7 +45,7 @@ python lerobot/scripts/visualize_dataset.py \

 - Replay this test episode:
 ```bash
-python lerobot/scripts/control_robot.py replay_episode \
+python lerobot/scripts/control_robot.py replay \
    --fps 30 \
    --root tmp/data \
    --repo-id $USER/koch_test \
@@ -42,12 +55,11 @@ python lerobot/scripts/control_robot.py replay_episode \
 - Record a full dataset in order to train a policy, with 2 seconds of warmup,
 30 seconds of recording for each episode, and 10 seconds to reset the environment in between episodes:
 ```bash
-python lerobot/scripts/control_robot.py record_dataset \
+python lerobot/scripts/control_robot.py record \
    --fps 30 \
    --root data \
    --repo-id $USER/koch_pick_place_lego \
    --num-episodes 50 \
-    --run-compute-stats 1 \
    --warmup-time-s 2 \
    --episode-time-s 30 \
    --reset-time-s 10
@@ -74,7 +86,14 @@ DATA_DIR=data python lerobot/scripts/train.py \

 - Run the pretrained policy on the robot:
 ```bash
-python lerobot/scripts/control_robot.py run_policy \
+python lerobot/scripts/control_robot.py record \
+    --fps 30 \
+    --root data \
+    --repo-id $USER/eval_act_koch_real \
+    --num-episodes 10 \
+    --warmup-time-s 2 \
+    --episode-time-s 30 \
+    --reset-time-s 10 \
    -p outputs/train/act_koch_real/checkpoints/080000/pretrained_model
 ```
 """
@@ -87,12 +106,14 @@ import os
 import platform
 import shutil
 import time
+import traceback
 from contextlib import nullcontext
+from functools import cache
 from pathlib import Path

+import cv2
 import torch
 import tqdm
-from huggingface_hub import create_branch
 from omegaconf import DictConfig
 from PIL import Image
 from termcolor import colored
@@ -101,21 +122,46 @@ from termcolor import colored
 from lerobot.common.datasets.compute_stats import compute_stats
 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
 from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import to_hf_dataset
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
-from lerobot.common.datasets.utils import calculate_episode_data_index
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, get_default_encoding
+from lerobot.common.datasets.utils import calculate_episode_data_index, create_branch
 from lerobot.common.datasets.video_utils import encode_video_frames
 from lerobot.common.policies.factory import make_policy
 from lerobot.common.robot_devices.robots.factory import make_robot
 from lerobot.common.robot_devices.robots.utils import Robot
 from lerobot.common.utils.utils import get_safe_torch_device, init_hydra_config, init_logging, set_global_seed
 from lerobot.scripts.eval import get_pretrained_policy_path
-from lerobot.scripts.push_dataset_to_hub import push_meta_data_to_hub, push_videos_to_hub, save_meta_data
+from lerobot.scripts.push_dataset_to_hub import (
+    push_dataset_card_to_hub,
+    push_meta_data_to_hub,
+    push_videos_to_hub,
+    save_meta_data,
+)

 ########################################################################################
 # Utilities
 ########################################################################################


+def say(text, blocking=False):
+    # Check if mac, linux, or windows.
+    if platform.system() == "Darwin":
+        cmd = f'say "{text}"'
+    elif platform.system() == "Linux":
+        cmd = f'spd-say "{text}"'
+    elif platform.system() == "Windows":
+        cmd = (
+            'PowerShell -Command "Add-Type -AssemblyName System.Speech; '
+            f"(New-Object System.Speech.Synthesis.SpeechSynthesizer).Speak('{text}')\""
+        )
+
+    if not blocking and platform.system() in ["Darwin", "Linux"]:
+        # TODO(rcadene): Make it work for Windows
+        # Use the ampersand to run command in the background
+        cmd += " &"
+
+    os.system(cmd)
+
+
 def save_image(img_tensor, key, frame_index, episode_index, videos_dir):
    img = Image.fromarray(img_tensor.numpy())
    path = videos_dir / f"{key}_episode_{episode_index:06d}" / f"frame_{frame_index:06d}.png"
@@ -160,11 +206,11 @@ def log_control_info(robot, dt_s, episode_index=None, frame_index=None, fps=None
    for name in robot.follower_arms:
        key = f"write_follower_{name}_goal_pos_dt_s"
        if key in robot.logs:
-            log_dt("dtRfoll", robot.logs[key])
+            log_dt("dtWfoll", robot.logs[key])

        key = f"read_follower_{name}_pos_dt_s"
        if key in robot.logs:
-            log_dt("dtWfoll", robot.logs[key])
+            log_dt("dtRfoll", robot.logs[key])

    for name in robot.cameras:
        key = f"read_camera_{name}_dt_s"
@@ -179,12 +225,23 @@ def log_control_info(robot, dt_s, episode_index=None, frame_index=None, fps=None
    logging.info(info_str)


-def get_is_headless():
-    if platform.system() == "Linux":
-        display = os.environ.get("DISPLAY")
-        if display is None or display == "":
-            return True
-    return False
+@cache
+def is_headless():
+    """Detects if python is running without a monitor."""
+    try:
+        import pynput  # noqa
+
+        return False
+    except Exception:
+        print(
+            "Error trying to import pynput. Switching to headless mode. "
+            "As a result, the video stream from the cameras won't be shown, "
+            "and you won't be able to change the control flow with keyboards. "
+            "For more info, see traceback below.\n"
+        )
+        traceback.print_exc()
+        print()
+        return True


 ########################################################################################
@@ -192,29 +249,44 @@ def get_is_headless():
 ########################################################################################


+def calibrate(robot: Robot):
+    if robot.calibration_path.exists():
+        print(f"Removing '{robot.calibration_path}'")
+        robot.calibration_path.unlink()
+
+    if robot.is_connected:
+        robot.disconnect()
+
+    # Calling `connect` automatically runs calibration
+    # when the calibration file is missing
+    robot.connect()
+
+
 def teleoperate(robot: Robot, fps: int | None = None, teleop_time_s: float | None = None):
    # TODO(rcadene): Add option to record logs
    if not robot.is_connected:
        robot.connect()

-    start_time = time.perf_counter()
+    start_teleop_t = time.perf_counter()
    while True:
-        now = time.perf_counter()
+        start_loop_t = time.perf_counter()
        robot.teleop_step()

        if fps is not None:
-            dt_s = time.perf_counter() - now
+            dt_s = time.perf_counter() - start_loop_t
            busy_wait(1 / fps - dt_s)

-        dt_s = time.perf_counter() - now
+        dt_s = time.perf_counter() - start_loop_t
        log_control_info(robot, dt_s, fps=fps)

-        if teleop_time_s is not None and time.perf_counter() - start_time > teleop_time_s:
+        if teleop_time_s is not None and time.perf_counter() - start_teleop_t > teleop_time_s:
            break


-def record_dataset(
+def record(
    robot: Robot,
+    policy: torch.nn.Module | None = None,
+    hydra_cfg: DictConfig | None = None,
    fps: int | None = None,
    root="data",
    repo_id="lerobot/debug",
@@ -225,10 +297,18 @@ def record_dataset(
    video=True,
    run_compute_stats=True,
    push_to_hub=True,
+    tags=None,
    num_image_writers=8,
    force_override=False,
 ):
    # TODO(rcadene): Add option to record logs
+    # TODO(rcadene): Clean this function via decomposition in higher level functions
+
+    _, dataset_name = repo_id.split("/")
+    if dataset_name.startswith("eval_") and policy is None:
+        raise ValueError(
+            f"Your dataset name begins by 'eval_' ({dataset_name}) but no policy is provided ({policy})."
+        )

    if not video:
        raise NotImplementedError()
@@ -255,32 +335,10 @@ def record_dataset(
    else:
        episode_index = 0

-    is_headless = get_is_headless()
-
-    # Execute a few seconds without recording data, to give times
-    # to the robot devices to connect and start synchronizing.
-    timestamp = 0
-    start_time = time.perf_counter()
-    is_warmup_print = False
-    while timestamp < warmup_time_s:
-        if not is_warmup_print:
-            logging.info("Warming up (no data recording)")
-            os.system('say "Warmup" &')
-            is_warmup_print = True
-
-        now = time.perf_counter()
-        observation, action = robot.teleop_step(record_data=True)
-
-        if not is_headless:
-            image_keys = [key for key in observation if "image" in key]
-
-        dt_s = time.perf_counter() - now
-        busy_wait(1 / fps - dt_s)
-
-        dt_s = time.perf_counter() - now
-        log_control_info(robot, dt_s, fps=fps)
-
-        timestamp = time.perf_counter() - start_time
+    if is_headless():
+        logging.info(
+            "Headless environment detected. On-screen cameras display and keyboard inputs will not be available."
+        )

    # Allow to exit early while recording an episode or resetting the environment,
    # by tapping the right arrow key '->'. This might require a sudo permission
@@ -290,9 +348,7 @@ def record_dataset(
    stop_recording = False

    # Only import pynput if not in a headless environment
-    if is_headless:
-        logging.info("Headless environment detected. Keyboard input will not be available.")
-    else:
+    if not is_headless():
        from pynput import keyboard

        def on_press(key):
@@ -315,6 +371,53 @@ def record_dataset(
        listener = keyboard.Listener(on_press=on_press)
        listener.start()

+    # Load policy if any
+    if policy is not None:
+        # Check device is available
+        device = get_safe_torch_device(hydra_cfg.device, log=True)
+
+        policy.eval()
+        policy.to(device)
+
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        set_global_seed(hydra_cfg.seed)
+
+        # Override the `fps` argument with the fps stored in the policy config (`hydra_cfg.env.fps`)
+        fps = hydra_cfg.env.fps
+
+    # Run for a few seconds without recording data, to give the robot
+    # devices time to connect and start synchronizing.
+    timestamp = 0
+    start_warmup_t = time.perf_counter()
+    is_warmup_print = False
+    while timestamp < warmup_time_s:
+        if not is_warmup_print:
+            logging.info("Warming up (no data recording)")
+            say("Warming up")
+            is_warmup_print = True
+
+        start_loop_t = time.perf_counter()
+
+        if policy is None:
+            observation, action = robot.teleop_step(record_data=True)
+        else:
+            observation = robot.capture_observation()
+
+        if not is_headless():
+            image_keys = [key for key in observation if "image" in key]
+            for key in image_keys:
+                cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR))
+            cv2.waitKey(1)
+
+        dt_s = time.perf_counter() - start_loop_t
+        busy_wait(1 / fps - dt_s)
+
+        dt_s = time.perf_counter() - start_loop_t
+        log_control_info(robot, dt_s, fps=fps)
+
+        timestamp = time.perf_counter() - start_warmup_t
+
    # Save images using threads to reach high fps (30 and more)
    # Using `with` to exist smoothly if an execption is raised.
    # Using only 4 worker threads to avoid blocking the main thread.
@@ -323,14 +426,18 @@ def record_dataset(
        # Start recording all episodes
        while episode_index < num_episodes:
            logging.info(f"Recording episode {episode_index}")
-            os.system(f'say "Recording episode {episode_index}" &')
+            say(f"Recording episode {episode_index}")
            ep_dict = {}
            frame_index = 0
            timestamp = 0
-            start_time = time.perf_counter()
+            start_episode_t = time.perf_counter()
            while timestamp < episode_time_s:
-                now = time.perf_counter()
-                observation, action = robot.teleop_step(record_data=True)
+                start_loop_t = time.perf_counter()
+
+                if policy is None:
+                    observation, action = robot.teleop_step(record_data=True)
+                else:
+                    observation = robot.capture_observation()

                image_keys = [key for key in observation if "image" in key]
                not_image_keys = [key for key in observation if "image" not in key]
@@ -342,11 +449,46 @@ def record_dataset(
                        )
                    ]

+                if not is_headless():
+                    image_keys = [key for key in observation if "image" in key]
+                    for key in image_keys:
+                        cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR))
+                    cv2.waitKey(1)
+
                for key in not_image_keys:
                    if key not in ep_dict:
                        ep_dict[key] = []
                    ep_dict[key].append(observation[key])

+                if policy is not None:
+                    with (
+                        torch.inference_mode(),
+                        torch.autocast(device_type=device.type)
+                        if device.type == "cuda" and hydra_cfg.use_amp
+                        else nullcontext(),
+                    ):
+                        # Convert to pytorch format: channel first and float32 in [0,1] with batch dimension
+                        for name in observation:
+                            if "image" in name:
+                                observation[name] = observation[name].type(torch.float32) / 255
+                                observation[name] = observation[name].permute(2, 0, 1).contiguous()
+                            observation[name] = observation[name].unsqueeze(0)
+                            observation[name] = observation[name].to(device)
+
+                        # Compute the next action with the policy
+                        # based on the current observation
+                        action = policy.select_action(observation)
+
+                        # Remove batch dimension
+                        action = action.squeeze(0)
+
+                        # Move to cpu, if not already the case
+                        action = action.to("cpu")
+
+                    # Order the robot to move
+                    robot.send_action(action)
+                    action = {"action": action}
+
                for key in action:
                    if key not in ep_dict:
                        ep_dict[key] = []
@@ -354,14 +496,13 @@ def record_dataset(

                frame_index += 1

-                dt_s = time.perf_counter() - now
+                dt_s = time.perf_counter() - start_loop_t
                busy_wait(1 / fps - dt_s)

-                dt_s = time.perf_counter() - now
+                dt_s = time.perf_counter() - start_loop_t
                log_control_info(robot, dt_s, fps=fps)

-                timestamp = time.perf_counter() - start_time
-
+                timestamp = time.perf_counter() - start_episode_t
                if exit_early:
                    exit_early = False
                    break
@@ -369,10 +510,10 @@ def record_dataset(
            if not stop_recording:
                # Start resetting env while the executor are finishing
                logging.info("Reset the environment")
-                os.system('say "Reset the environment" &')
+                say("Reset the environment")

            timestamp = 0
-            start_time = time.perf_counter()
+            start_vencod_t = time.perf_counter()

            # During env reset we save the data and encode the videos
            num_frames = frame_index
@@ -418,7 +559,7 @@ def record_dataset(
            with tqdm.tqdm(total=reset_time_s, desc="Waiting") as pbar:
                while timestamp < reset_time_s and not is_last_episode:
                    time.sleep(1)
-                    timestamp = time.perf_counter() - start_time
+                    timestamp = time.perf_counter() - start_vencod_t
                    pbar.update(1)
                    if exit_early:
                        exit_early = False
@@ -433,8 +574,8 @@ def record_dataset(

            if is_last_episode:
                logging.info("Done recording")
-                os.system('say "Done recording"')
-                if not is_headless:
+                say("Done recording", blocking=True)
+                if not is_headless():
                    listener.stop()

                logging.info("Waiting for threads writing the images on disk to terminate...")
@@ -444,10 +585,14 @@ def record_dataset(
                    pass
                break

+    robot.disconnect()
+    if not is_headless():
+        cv2.destroyAllWindows()
+
    num_episodes = episode_index

    logging.info("Encoding videos")
-    os.system('say "Encoding videos" &')
+    say("Encoding videos")
    # Use ffmpeg to convert frames stored as png into mp4 videos
    for episode_index in tqdm.tqdm(range(num_episodes)):
        for key in image_keys:
@@ -455,6 +600,7 @@ def record_dataset(
            fname = f"{key}_episode_{episode_index:06d}.mp4"
            video_path = local_dir / "videos" / fname
            if video_path.exists():
+                # Skip if video is already encoded. Could be the case when resuming data recording.
                continue
            # note: `encode_video_frames` is a blocking call. Making it asynchronous shouldn't speedup encoding,
            # since video encoding with ffmpeg is already using multithreading.
@@ -479,6 +625,8 @@ def record_dataset(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = get_default_encoding()

    lerobot_dataset = LeRobotDataset.from_preloaded(
        repo_id=repo_id,
@@ -489,11 +637,12 @@ def record_dataset(
    )
    if run_compute_stats:
        logging.info("Computing dataset statistics")
-        os.system('say "Computing dataset statistics" &')
+        say("Computing dataset statistics")
        stats = compute_stats(lerobot_dataset)
        lerobot_dataset.stats = stats
    else:
-        logging.info("Skipping computation of the dataset statistrics")
+        stats = {}
+        logging.info("Skipping computation of the dataset statistics")

    hf_dataset = hf_dataset.with_format(None)  # to remove transforms that cant be saved
    hf_dataset.save_to_disk(str(local_dir / "train"))
@@ -504,17 +653,17 @@ def record_dataset(
    if push_to_hub:
        hf_dataset.push_to_hub(repo_id, revision="main")
        push_meta_data_to_hub(repo_id, meta_data_dir, revision="main")
+        push_dataset_card_to_hub(repo_id, revision="main", tags=tags)
        if video:
            push_videos_to_hub(repo_id, videos_dir, revision="main")
        create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION)

    logging.info("Exiting")
-    os.system('say "Exiting" &')
-
+    say("Exiting")
    return lerobot_dataset


-def replay_episode(robot: Robot, episode: int, fps: int | None = None, root="data", repo_id="lerobot/debug"):
+def replay(robot: Robot, episode: int, fps: int | None = None, root="data", repo_id="lerobot/debug"):
    # TODO(rcadene): Add option to record logs
    local_dir = Path(root) / repo_id
    if not local_dir.exists():
@@ -529,76 +678,20 @@ def replay_episode(robot: Robot, episode: int, fps: int | None = None, root="dat
        robot.connect()

    logging.info("Replaying episode")
-    os.system('say "Replaying episode"')
-
+    say("Replaying episode", blocking=True)
    for idx in range(from_idx, to_idx):
-        now = time.perf_counter()
+        start_episode_t = time.perf_counter()

        action = items[idx]["action"]
        robot.send_action(action)

-        dt_s = time.perf_counter() - now
+        dt_s = time.perf_counter() - start_episode_t
        busy_wait(1 / fps - dt_s)

-        dt_s = time.perf_counter() - now
+        dt_s = time.perf_counter() - start_episode_t
        log_control_info(robot, dt_s, fps=fps)


-def run_policy(robot: Robot, policy: torch.nn.Module, hydra_cfg: DictConfig, run_time_s: float | None = None):
-    # TODO(rcadene): Add option to record eval dataset and logs
-
-    # Check device is available
-    device = get_safe_torch_device(hydra_cfg.device, log=True)
-
-    policy.eval()
-    policy.to(device)
-
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cuda.matmul.allow_tf32 = True
-    set_global_seed(hydra_cfg.seed)
-
-    fps = hydra_cfg.env.fps
-
-    if not robot.is_connected:
-        robot.connect()
-
-    start_time = time.perf_counter()
-    while True:
-        now = time.perf_counter()
-
-        observation = robot.capture_observation()
-
-        with (
-            torch.inference_mode(),
-            torch.autocast(device_type=device.type)
-            if device.type == "cuda" and hydra_cfg.use_amp
-            else nullcontext(),
-        ):
-            # add batch dimension to 1
-            for name in observation:
-                observation[name] = observation[name].unsqueeze(0)
-
-            if device.type == "mps":
-                for name in observation:
-                    observation[name] = observation[name].to(device)
-
-            action = policy.select_action(observation)
-
-            # remove batch dimension
-            action = action.squeeze(0)
-
-        robot.send_action(action.to("cpu"))
-
-        dt_s = time.perf_counter() - now
-        busy_wait(1 / fps - dt_s)
-
-        dt_s = time.perf_counter() - now
-        log_control_info(robot, dt_s, fps=fps)
-
-        if run_time_s is not None and time.perf_counter() - start_time > run_time_s:
-            break
-
-
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="mode", required=True)
@@ -606,18 +699,26 @@ if __name__ == "__main__":
    # Set common options for all the subparsers
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument(
-        "--robot",
+        "--robot-path",
        type=str,
-        default="koch",
-        help="Name of the robot provided to the `make_robot(name)` factory function.",
+        default="lerobot/configs/robot/koch.yaml",
+        help="Path to robot yaml file used to instantiate the robot using `make_robot` factory function.",
    )
+    base_parser.add_argument(
+        "--robot-overrides",
+        type=str,
+        nargs="*",
+        help="Any key=value arguments to override config values (use dots for.nested=overrides)",
+    )
+
+    parser_calib = subparsers.add_parser("calibrate", parents=[base_parser])

    parser_teleop = subparsers.add_parser("teleoperate", parents=[base_parser])
    parser_teleop.add_argument(
        "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
    )

-    parser_record = subparsers.add_parser("record_dataset", parents=[base_parser])
+    parser_record = subparsers.add_parser("record", parents=[base_parser])
    parser_record.add_argument(
        "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
    )
@@ -636,19 +737,19 @@ if __name__ == "__main__":
    parser_record.add_argument(
        "--warmup-time-s",
        type=int,
-        default=2,
+        default=10,
        help="Number of seconds before starting data collection. It allows the robot devices to warmup and synchronize.",
    )
    parser_record.add_argument(
        "--episode-time-s",
        type=int,
-        default=10,
+        default=60,
        help="Number of seconds for data recording for each episode.",
    )
    parser_record.add_argument(
        "--reset-time-s",
        type=int,
-        default=5,
+        default=60,
        help="Number of seconds for resetting the environment after each episode.",
    )
    parser_record.add_argument("--num-episodes", type=int, default=50, help="Number of episodes to record.")
@@ -664,6 +765,12 @@ if __name__ == "__main__":
        default=1,
        help="Upload dataset to Hugging Face hub.",
    )
+    parser_record.add_argument(
+        "--tags",
+        type=str,
+        nargs="*",
+        help="Add tags to your dataset on the hub.",
+    )
    parser_record.add_argument(
        "--num-image-writers",
        type=int,
@@ -676,8 +783,23 @@ if __name__ == "__main__":
        default=0,
        help="By default, data recording is resumed. When set to 1, delete the local directory and start data recording from scratch.",
    )
+    parser_record.add_argument(
+        "-p",
+        "--pretrained-policy-name-or-path",
+        type=str,
+        help=(
+            "Either the repo ID of a model hosted on the Hub or a path to a directory containing weights "
+            "saved using `Policy.save_pretrained`."
+        ),
+    )
+    parser_record.add_argument(
+        "--policy-overrides",
+        type=str,
+        nargs="*",
+        help="Any key=value arguments to override config values (use dots for.nested=overrides)",
+    )

-    parser_replay = subparsers.add_parser("replay_episode", parents=[base_parser])
+    parser_replay = subparsers.add_parser("replay", parents=[base_parser])
    parser_replay.add_argument(
        "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
    )
@@ -695,41 +817,46 @@ if __name__ == "__main__":
    )
    parser_replay.add_argument("--episode", type=int, default=0, help="Index of the episode to replay.")

-    parser_policy = subparsers.add_parser("run_policy", parents=[base_parser])
-    parser_policy.add_argument(
-        "-p",
-        "--pretrained-policy-name-or-path",
-        type=str,
-        help=(
-            "Either the repo ID of a model hosted on the Hub or a path to a directory containing weights "
-            "saved using `Policy.save_pretrained`."
-        ),
-    )
-    parser_policy.add_argument(
-        "overrides",
-        nargs="*",
-        help="Any key=value arguments to override config values (use dots for.nested=overrides)",
-    )
    args = parser.parse_args()

    init_logging()

    control_mode = args.mode
-    robot_name = args.robot
+    robot_path = args.robot_path
+    robot_overrides = args.robot_overrides
    kwargs = vars(args)
    del kwargs["mode"]
-    del kwargs["robot"]
+    del kwargs["robot_path"]
+    del kwargs["robot_overrides"]

-    robot = make_robot(robot_name)
-    if control_mode == "teleoperate":
+    robot_cfg = init_hydra_config(robot_path, robot_overrides)
+    robot = make_robot(robot_cfg)
+
+    if control_mode == "calibrate":
+        calibrate(robot, **kwargs)
+
+    elif control_mode == "teleoperate":
        teleoperate(robot, **kwargs)
-    elif control_mode == "record_dataset":
-        record_dataset(robot, **kwargs)
-    elif control_mode == "replay_episode":
-        replay_episode(robot, **kwargs)

-    elif control_mode == "run_policy":
-        pretrained_policy_path = get_pretrained_policy_path(args.pretrained_policy_name_or_path)
-        hydra_cfg = init_hydra_config(pretrained_policy_path / "config.yaml", args.overrides)
-        policy = make_policy(hydra_cfg=hydra_cfg, pretrained_policy_name_or_path=pretrained_policy_path)
-        run_policy(robot, policy, hydra_cfg)
+    elif control_mode == "record":
+        pretrained_policy_name_or_path = args.pretrained_policy_name_or_path
+        policy_overrides = args.policy_overrides
+        del kwargs["pretrained_policy_name_or_path"]
+        del kwargs["policy_overrides"]
+
+        policy_cfg = None
+        if pretrained_policy_name_or_path is not None:
+            pretrained_policy_path = get_pretrained_policy_path(pretrained_policy_name_or_path)
+            policy_cfg = init_hydra_config(pretrained_policy_path / "config.yaml", policy_overrides)
+            policy = make_policy(hydra_cfg=policy_cfg, pretrained_policy_name_or_path=pretrained_policy_path)
+            record(robot, policy, policy_cfg, **kwargs)
+        else:
+            record(robot, **kwargs)
+
+    elif control_mode == "replay":
+        replay(robot, **kwargs)
+
+    if robot.is_connected:
+        # Disconnect manually to avoid a "Core dump" during process
+        # termination due to camera threads not properly exiting.
+        robot.disconnect()
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -56,16 +56,13 @@ import einops
 import gymnasium as gym
 import numpy as np
 import torch
-from datasets import Dataset, Features, Image, Sequence, Value, concatenate_datasets
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils._errors import RepositoryNotFoundError
 from huggingface_hub.utils._validators import HFValidationError
-from PIL import Image as PILImage
 from torch import Tensor, nn
 from tqdm import trange

 from lerobot.common.datasets.factory import make_dataset
-from lerobot.common.datasets.utils import hf_transform_to_torch
 from lerobot.common.envs.factory import make_env
 from lerobot.common.envs.utils import preprocess_observation
 from lerobot.common.logger import log_output_dir
@@ -73,7 +70,13 @@ from lerobot.common.policies.factory import make_policy
 from lerobot.common.policies.policy_protocol import Policy
 from lerobot.common.policies.utils import get_device_from_parameters
 from lerobot.common.utils.io_utils import write_video
-from lerobot.common.utils.utils import get_safe_torch_device, init_hydra_config, init_logging, set_global_seed
+from lerobot.common.utils.utils import (
+    get_safe_torch_device,
+    init_hydra_config,
+    init_logging,
+    inside_slurm,
+    set_global_seed,
+)


 def rollout(
@@ -82,7 +85,6 @@ def rollout(
    seeds: list[int] | None = None,
    return_observations: bool = False,
    render_callback: Callable[[gym.vector.VectorEnv], None] | None = None,
-    enable_progbar: bool = False,
 ) -> dict:
    """Run a batched policy rollout once through a batch of environments.

@@ -112,7 +114,6 @@ def rollout(
            are returned optionally because they typically take more memory to cache. Defaults to False.
        render_callback: Optional rendering callback to be used after the environments are reset, and after
            every step.
-        enable_progbar: Enable a progress bar over rollout steps.
    Returns:
        The dictionary described above.
    """
@@ -139,7 +140,7 @@ def rollout(
    progbar = trange(
        max_steps,
        desc=f"Running rollout with at most {max_steps} steps",
-        disable=not enable_progbar,
+        disable=inside_slurm(),  # we don't want a progress bar when we use slurm, since it clutters the logs
        leave=False,
    )
    while not np.all(done):
@@ -213,8 +214,6 @@ def eval_policy(
    videos_dir: Path | None = None,
    return_episode_data: bool = False,
    start_seed: int | None = None,
-    enable_progbar: bool = False,
-    enable_inner_progbar: bool = False,
 ) -> dict:
    """
    Args:
@@ -227,8 +226,6 @@ def eval_policy(
            the "episodes" key of the returned dictionary.
        start_seed: The first seed to use for the first individual rollout. For all subsequent rollouts the
            seed is incremented by 1. If not provided, the environments are not manually seeded.
-        enable_progbar: Enable progress bar over batches.
-        enable_inner_progbar: Enable progress bar over steps in each batch.
    Returns:
        Dictionary with metrics and data regarding the rollouts.
    """
@@ -269,7 +266,8 @@ def eval_policy(
    if return_episode_data:
        episode_data: dict | None = None

-    progbar = trange(n_batches, desc="Stepping through eval batches", disable=not enable_progbar)
+    # we don't want a progress bar when we use slurm, since it clutters the logs
+    progbar = trange(n_batches, desc="Stepping through eval batches", disable=inside_slurm())
    for batch_ix in progbar:
        # Cache frames for rendering videos. Each item will be (b, h, w, c), and the list indexes the rollout
        # step.
@@ -288,7 +286,6 @@ def eval_policy(
            seeds=list(seeds) if seeds else None,
            return_observations=return_episode_data,
            render_callback=render_frame if max_episodes_rendered > 0 else None,
-            enable_progbar=enable_inner_progbar,
        )

        # Figure out where in each rollout sequence the first done condition was encountered (results after
@@ -318,41 +315,17 @@ def eval_policy(
                rollout_data,
                done_indices,
                start_episode_index=batch_ix * env.num_envs,
-                start_data_index=(
-                    0 if episode_data is None else (episode_data["episode_data_index"]["to"][-1].item())
-                ),
+                start_data_index=(0 if episode_data is None else (episode_data["index"][-1].item() + 1)),
                fps=env.unwrapped.metadata["render_fps"],
            )
            if episode_data is None:
                episode_data = this_episode_data
            else:
-                # Some sanity checks to make sure we are not correctly compiling the data.
-                assert (
-                    episode_data["hf_dataset"]["episode_index"][-1] + 1
-                    == this_episode_data["hf_dataset"]["episode_index"][0]
-                )
-                assert (
-                    episode_data["hf_dataset"]["index"][-1] + 1 == this_episode_data["hf_dataset"]["index"][0]
-                )
-                assert torch.equal(
-                    episode_data["episode_data_index"]["to"][-1],
-                    this_episode_data["episode_data_index"]["from"][0],
-                )
+                # Some sanity checks to make sure we are correctly compiling the data.
+                assert episode_data["episode_index"][-1] + 1 == this_episode_data["episode_index"][0]
+                assert episode_data["index"][-1] + 1 == this_episode_data["index"][0]
                # Concatenate the episode data.
-                episode_data = {
-                    "hf_dataset": concatenate_datasets(
-                        [episode_data["hf_dataset"], this_episode_data["hf_dataset"]]
-                    ),
-                    "episode_data_index": {
-                        k: torch.cat(
-                            [
-                                episode_data["episode_data_index"][k],
-                                this_episode_data["episode_data_index"][k],
-                            ]
-                        )
-                        for k in ["from", "to"]
-                    },
-                }
+                episode_data = {k: torch.cat([episode_data[k], this_episode_data[k]]) for k in episode_data}

        # Maybe render video for visualization.
        if max_episodes_rendered > 0 and len(ep_frames) > 0:
@@ -434,89 +407,39 @@ def _compile_episode_data(
    Similar logic is implemented when datasets are pushed to hub (see: `push_to_hub`).
    """
    ep_dicts = []
-    episode_data_index = {"from": [], "to": []}
    total_frames = 0
-    data_index_from = start_data_index
    for ep_ix in range(rollout_data["action"].shape[0]):
-        num_frames = done_indices[ep_ix].item() + 1  # + 1 to include the first done frame
+        # + 2 to include the first done frame and the last observation frame.
+        num_frames = done_indices[ep_ix].item() + 2
        total_frames += num_frames

-        # TODO(rcadene): We need to add a missing last frame which is the observation
-        # of a done state. it is critical to have this frame for tdmpc to predict a "done observation/state"
+        # Here we do `num_frames - 1` as we don't want to include the last observation frame just yet.
        ep_dict = {
-            "action": rollout_data["action"][ep_ix, :num_frames],
-            "episode_index": torch.tensor([start_episode_index + ep_ix] * num_frames),
-            "frame_index": torch.arange(0, num_frames, 1),
-            "timestamp": torch.arange(0, num_frames, 1) / fps,
-            "next.done": rollout_data["done"][ep_ix, :num_frames],
-            "next.reward": rollout_data["reward"][ep_ix, :num_frames].type(torch.float32),
+            "action": rollout_data["action"][ep_ix, : num_frames - 1],
+            "episode_index": torch.tensor([start_episode_index + ep_ix] * (num_frames - 1)),
+            "frame_index": torch.arange(0, num_frames - 1, 1),
+            "timestamp": torch.arange(0, num_frames - 1, 1) / fps,
+            "next.done": rollout_data["done"][ep_ix, : num_frames - 1],
+            "next.success": rollout_data["success"][ep_ix, : num_frames - 1],
+            "next.reward": rollout_data["reward"][ep_ix, : num_frames - 1].type(torch.float32),
        }
+
+        # For the last observation frame, all other keys will just be copy padded.
+        for k in ep_dict:
+            ep_dict[k] = torch.cat([ep_dict[k], ep_dict[k][-1:]])
+
        for key in rollout_data["observation"]:
-            ep_dict[key] = rollout_data["observation"][key][ep_ix][:num_frames]
+            ep_dict[key] = rollout_data["observation"][key][ep_ix, :num_frames]
+
        ep_dicts.append(ep_dict)

-        episode_data_index["from"].append(data_index_from)
-        episode_data_index["to"].append(data_index_from + num_frames)
-
-        data_index_from += num_frames
-
    data_dict = {}
    for key in ep_dicts[0]:
-        if "image" not in key:
-            data_dict[key] = torch.cat([x[key] for x in ep_dicts])
-        else:
-            if key not in data_dict:
-                data_dict[key] = []
-            for ep_dict in ep_dicts:
-                for img in ep_dict[key]:
-                    # sanity check that images are channel first
-                    c, h, w = img.shape
-                    assert c < h and c < w, f"expect channel first images, but instead {img.shape}"
-
-                    # sanity check that images are float32 in range [0,1]
-                    assert img.dtype == torch.float32, f"expect torch.float32, but instead {img.dtype=}"
-                    assert img.max() <= 1, f"expect pixels lower than 1, but instead {img.max()=}"
-                    assert img.min() >= 0, f"expect pixels greater than 1, but instead {img.min()=}"
-
-                    # from float32 in range [0,1] to uint8 in range [0,255]
-                    img *= 255
-                    img = img.type(torch.uint8)
-
-                    # convert to channel last and numpy as expected by PIL
-                    img = PILImage.fromarray(img.permute(1, 2, 0).numpy())
-
-                    data_dict[key].append(img)
+        data_dict[key] = torch.cat([x[key] for x in ep_dicts])

    data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1)
-    episode_data_index["from"] = torch.tensor(episode_data_index["from"])
-    episode_data_index["to"] = torch.tensor(episode_data_index["to"])

-    # TODO(rcadene): clean this
-    features = {}
-    for key in rollout_data["observation"]:
-        if "image" in key:
-            features[key] = Image()
-        else:
-            features[key] = Sequence(length=data_dict[key].shape[1], feature=Value(dtype="float32", id=None))
-    features.update(
-        {
-            "action": Sequence(length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)),
-            "episode_index": Value(dtype="int64", id=None),
-            "frame_index": Value(dtype="int64", id=None),
-            "timestamp": Value(dtype="float32", id=None),
-            "next.reward": Value(dtype="float32", id=None),
-            "next.done": Value(dtype="bool", id=None),
-            #'next.success': Value(dtype='bool', id=None),
-            "index": Value(dtype="int64", id=None),
-        }
-    )
-    features = Features(features)
-    hf_dataset = Dataset.from_dict(data_dict, features=features)
-    hf_dataset.set_transform(hf_transform_to_torch)
-    return {
-        "hf_dataset": hf_dataset,
-        "episode_data_index": episode_data_index,
-    }
+    return data_dict


 def main(
@@ -531,6 +454,16 @@ def main(
    else:
        hydra_cfg = init_hydra_config(hydra_cfg_path, config_overrides)

+    if hydra_cfg.eval.batch_size > hydra_cfg.eval.n_episodes:
+        raise ValueError(
+            "The eval batch size is greater than the number of eval episodes "
+            f"({hydra_cfg.eval.batch_size} > {hydra_cfg.eval.n_episodes}). As a result, {hydra_cfg.eval.batch_size} "
+            f"eval environments will be instantiated, but only {hydra_cfg.eval.n_episodes} will be used. "
+            "This might significantly slow down evaluation. To fix this, you should update your command "
+            f"to increase the number of episodes to match the batch size (e.g. `eval.n_episodes={hydra_cfg.eval.batch_size}`), "
+            f"or lower the batch size (e.g. `eval.batch_size={hydra_cfg.eval.n_episodes}`)."
+        )
+
    if out_dir is None:
        out_dir = f"outputs/eval/{dt.now().strftime('%Y-%m-%d/%H-%M-%S')}_{hydra_cfg.env.name}_{hydra_cfg.policy.name}"

@@ -564,8 +497,6 @@ def main(
            max_episodes_rendered=10,
            videos_dir=Path(out_dir) / "videos",
            start_seed=hydra_cfg.seed,
-            enable_progbar=True,
-            enable_inner_progbar=True,
        )
    print(info["aggregated"])

--- a/lerobot/scripts/push_dataset_to_hub.py
+++ b/lerobot/scripts/push_dataset_to_hub.py
@@ -55,7 +55,8 @@ from safetensors.torch import save_file

 from lerobot.common.datasets.compute_stats import compute_stats
 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
-from lerobot.common.datasets.utils import flatten_dict
+from lerobot.common.datasets.push_dataset_to_hub.utils import check_repo_id
+from lerobot.common.datasets.utils import create_branch, create_lerobot_dataset_card, flatten_dict


 def get_from_raw_to_lerobot_format_fn(raw_format: str):
@@ -65,6 +66,8 @@ def get_from_raw_to_lerobot_format_fn(raw_format: str):
        from lerobot.common.datasets.push_dataset_to_hub.umi_zarr_format import from_raw_to_lerobot_format
    elif raw_format == "aloha_hdf5":
        from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import from_raw_to_lerobot_format
+    elif "openx_rlds" in raw_format:
+        from lerobot.common.datasets.push_dataset_to_hub.openx_rlds_format import from_raw_to_lerobot_format
    elif raw_format == "dora_parquet":
        from lerobot.common.datasets.push_dataset_to_hub.dora_parquet_format import from_raw_to_lerobot_format
    elif raw_format == "xarm_pkl":
@@ -113,6 +116,14 @@ def push_meta_data_to_hub(repo_id: str, meta_data_dir: str | Path, revision: str
    )


+def push_dataset_card_to_hub(
+    repo_id: str, revision: str | None, tags: list | None = None, text: str | None = None
+):
+    """Creates and pushes a LeRobotDataset Card with appropriate tags to easily find it on the hub."""
+    card = create_lerobot_dataset_card(tags=tags, text=text)
+    card.push_to_hub(repo_id=repo_id, repo_type="dataset", revision=revision)
+
+
 def push_videos_to_hub(repo_id: str, videos_dir: str | Path, revision: str | None):
    """Expect mp4 files to be all stored in a single "videos" directory.
    On the hugging face repositery, they will be uploaded in a "videos" directory at the root.
@@ -140,14 +151,12 @@ def push_dataset_to_hub(
    num_workers: int = 8,
    episodes: list[int] | None = None,
    force_override: bool = False,
+    resume: bool = False,
    cache_dir: Path = Path("/tmp"),
    tests_data_dir: Path | None = None,
+    encoding: dict | None = None,
 ):
-    # Check repo_id is well formated
-    if len(repo_id.split("/")) != 2:
-        raise ValueError(
-            f"`repo_id` is expected to contain a community or user id `/` the name of the dataset (e.g. 'lerobot/pusht'), but instead contains '{repo_id}'."
-        )
+    check_repo_id(repo_id)
    user_id, dataset_id = repo_id.split("/")

    # Robustify when `raw_dir` is str instead of Path
@@ -173,7 +182,7 @@ def push_dataset_to_hub(
        if local_dir.exists():
            if force_override:
                shutil.rmtree(local_dir)
-            else:
+            elif not resume:
                raise ValueError(f"`local_dir` already exists ({local_dir}). Use `--force-override 1`.")

        meta_data_dir = local_dir / "meta_data"
@@ -190,9 +199,25 @@ def push_dataset_to_hub(

    # convert dataset from original raw format to LeRobot format
    from_raw_to_lerobot_format = get_from_raw_to_lerobot_format_fn(raw_format)
-    hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(
-        raw_dir, videos_dir, fps, video, episodes
-    )
+
+    fmt_kwgs = {
+        "raw_dir": raw_dir,
+        "videos_dir": videos_dir,
+        "fps": fps,
+        "video": video,
+        "episodes": episodes,
+        "encoding": encoding,
+    }
+
+    if "openx_rlds." in raw_format:
+        # Support for official OXE dataset name inside `raw_format`.
+        # For instance, `raw_format="openx_rlds"` uses the default formatting (TODO what does that mean?),
+        # and `raw_format="openx_rlds.bridge_orig"` uses the bridge_orig formatting.
+        _, openx_dataset_name = raw_format.split(".")
+        print(f"Converting dataset [{openx_dataset_name}] from 'openx_rlds' to LeRobot format.")
+        fmt_kwgs["openx_dataset_name"] = openx_dataset_name
+
+    hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(**fmt_kwgs)

    lerobot_dataset = LeRobotDataset.from_preloaded(
        repo_id=repo_id,
@@ -214,10 +239,10 @@ def push_dataset_to_hub(
    if push_to_hub:
        hf_dataset.push_to_hub(repo_id, revision="main")
        push_meta_data_to_hub(repo_id, meta_data_dir, revision="main")
+        push_dataset_card_to_hub(repo_id, revision="main")
        if video:
            push_videos_to_hub(repo_id, videos_dir, revision="main")
-        api = HfApi()
-        api.create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION)
+        create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION)

    if tests_data_dir:
        # get the first episode
@@ -261,7 +286,7 @@ def main():
        "--raw-format",
        type=str,
        required=True,
-        help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`).",
+        help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`, `openx_rlds`).",
    )
    parser.add_argument(
        "--repo-id",
@@ -315,6 +340,19 @@ def main():
        default=0,
        help="When set to 1, removes provided output directory if it already exists. By default, raises a ValueError exception.",
    )
+    parser.add_argument(
+        "--resume",
+        type=int,
+        default=0,
+        help="When set to 1, resumes a previous run.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        required=False,
+        default="/tmp",
+        help="Directory to store the temporary videos and images generated while creating the dataset.",
+    )
    parser.add_argument(
        "--tests-data-dir",
        type=Path,
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -15,20 +15,25 @@
 # limitations under the License.
 import logging
 import time
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import nullcontext
+from copy import deepcopy
 from pathlib import Path
 from pprint import pformat
+from threading import Lock

 import hydra
+import numpy as np
 import torch
 from deepdiff import DeepDiff
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import DictConfig, ListConfig, OmegaConf
 from termcolor import colored
 from torch import nn
 from torch.cuda.amp import GradScaler

 from lerobot.common.datasets.factory import make_dataset, resolve_delta_timestamps
 from lerobot.common.datasets.lerobot_dataset import MultiLeRobotDataset
+from lerobot.common.datasets.online_buffer import OnlineBuffer, compute_sampler_weights
 from lerobot.common.datasets.sampler import EpisodeAwareSampler
 from lerobot.common.datasets.utils import cycle
 from lerobot.common.envs.factory import make_env
@@ -88,6 +93,16 @@ def make_optimizer_and_scheduler(cfg, policy):
    elif policy.name == "tdmpc":
        optimizer = torch.optim.Adam(policy.parameters(), cfg.training.lr)
        lr_scheduler = None
+    elif policy.name == "tdmpc2":
+        params_group = [
+            {"params": policy.model._encoder.parameters(), "lr": cfg.training.lr * cfg.training.enc_lr_scale},
+            {"params": policy.model._dynamics.parameters()},
+            {"params": policy.model._reward.parameters()},
+            {"params": policy.model._Qs.parameters()},
+            {"params": policy.model._pi.parameters(), "eps": 1e-5},
+        ]
+        optimizer = torch.optim.Adam(params_group, lr=cfg.training.lr)
+        lr_scheduler = None
    elif cfg.policy.name == "vqbet":
        from lerobot.common.policies.vqbet.modeling_vqbet import VQBeTOptimizer, VQBeTScheduler

@@ -107,6 +122,7 @@ def update_policy(
    grad_scaler: GradScaler,
    lr_scheduler=None,
    use_amp: bool = False,
+    lock=None,
 ):
    """Returns a dictionary of items for logging."""
    start_time = time.perf_counter()
@@ -129,7 +145,8 @@ def update_policy(

    # Optimizer's gradients are already unscaled, so scaler.step does not unscale them,
    # although it still skips optimizer.step() if the gradients contain infs or NaNs.
-    grad_scaler.step(optimizer)
+    with lock if lock is not None else nullcontext():
+        grad_scaler.step(optimizer)
    # Updates the scale for next iteration.
    grad_scaler.update()

@@ -149,11 +166,12 @@ def update_policy(
        "update_s": time.perf_counter() - start_time,
        **{k: v for k, v in output_dict.items() if k != "loss"},
    }
+    info.update({k: v for k, v in output_dict.items() if k not in info})

    return info


-def log_train_info(logger: Logger, info, step, cfg, dataset, is_offline):
+def log_train_info(logger: Logger, info, step, cfg, dataset, is_online):
    loss = info["loss"]
    grad_norm = info["grad_norm"]
    lr = info["lr"]
@@ -187,12 +205,12 @@ def log_train_info(logger: Logger, info, step, cfg, dataset, is_offline):
    info["num_samples"] = num_samples
    info["num_episodes"] = num_episodes
    info["num_epochs"] = num_epochs
-    info["is_offline"] = is_offline
+    info["is_online"] = is_online

    logger.log_dict(info, step, mode="train")


-def log_eval_info(logger, info, step, cfg, dataset, is_offline):
+def log_eval_info(logger, info, step, cfg, dataset, is_online):
    eval_s = info["eval_s"]
    avg_sum_reward = info["avg_sum_reward"]
    pc_success = info["pc_success"]
@@ -221,7 +239,7 @@ def log_eval_info(logger, info, step, cfg, dataset, is_offline):
    info["num_samples"] = num_samples
    info["num_episodes"] = num_episodes
    info["num_epochs"] = num_epochs
-    info["is_offline"] = is_offline
+    info["is_online"] = is_online

    logger.log_dict(info, step, mode="eval")

@@ -233,6 +251,10 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
        raise NotImplementedError()

    init_logging()
+    logging.info(pformat(OmegaConf.to_container(cfg)))
+
+    if cfg.training.online_steps > 0 and isinstance(cfg.dataset_repo_id, ListConfig):
+        raise NotImplementedError("Online training with LeRobotMultiDataset is not implemented.")

    # If we are resuming a run, we need to check that a checkpoint exists in the log directory, and we need
    # to check for any differences between the provided config and the checkpoint's config.
@@ -276,12 +298,19 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
            "you meant to resume training, please use `resume=true` in your command or yaml configuration."
        )

+    if cfg.eval.batch_size > cfg.eval.n_episodes:
+        raise ValueError(
+            "The eval batch size is greater than the number of eval episodes "
+            f"({cfg.eval.batch_size} > {cfg.eval.n_episodes}). As a result, {cfg.eval.batch_size} "
+            f"eval environments will be instantiated, but only {cfg.eval.n_episodes} will be used. "
+            "This might significantly slow down evaluation. To fix this, you should update your command "
+            f"to increase the number of episodes to match the batch size (e.g. `eval.n_episodes={cfg.eval.batch_size}`), "
+            f"or lower the batch size (e.g. `eval.batch_size={cfg.eval.n_episodes}`)."
+        )
+
    # log metrics to terminal and wandb
    logger = Logger(cfg, out_dir, wandb_job_name=job_name)

-    if cfg.training.online_steps > 0:
-        raise NotImplementedError("Online training is not implemented yet.")
-
    set_global_seed(cfg.seed)

    # Check device is available
@@ -336,7 +365,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
    logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")

    # Note: this helper will be used in offline and online training loops.
-    def evaluate_and_checkpoint_if_needed(step):
+    def evaluate_and_checkpoint_if_needed(step, is_online):
        _num_digits = max(6, len(str(cfg.training.offline_steps + cfg.training.online_steps)))
        step_identifier = f"{step:0{_num_digits}d}"

@@ -352,7 +381,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
                    max_episodes_rendered=4,
                    start_seed=cfg.seed,
                )
-            log_eval_info(logger, eval_info["aggregated"], step, cfg, offline_dataset, is_offline=True)
+            log_eval_info(logger, eval_info["aggregated"], step, cfg, offline_dataset, is_online=is_online)
            if cfg.wandb.enable:
                logger.log_video(eval_info["video_paths"][0], step, mode="eval")
            logging.info("Resume training")
@@ -396,8 +425,9 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
    dl_iter = cycle(dataloader)

    policy.train()
+    offline_step = 0
    for _ in range(step, cfg.training.offline_steps):
-        if step == 0:
+        if offline_step == 0:
            logging.info("Start offline training on a fixed dataset")

        start_time = time.perf_counter()
@@ -420,13 +450,207 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
        train_info["dataloading_s"] = dataloading_s

        if step % cfg.training.log_freq == 0:
-            log_train_info(logger, train_info, step, cfg, offline_dataset, is_offline=True)
+            log_train_info(logger, train_info, step, cfg, offline_dataset, is_online=False)

        # Note: evaluate_and_checkpoint_if_needed happens **after** the `step`th training update has completed,
        # so we pass in step + 1.
-        evaluate_and_checkpoint_if_needed(step + 1)
+        evaluate_and_checkpoint_if_needed(step + 1, is_online=False)

        step += 1
+        offline_step += 1  # noqa: SIM113
+
+    if cfg.training.online_steps == 0:
+        if eval_env:
+            eval_env.close()
+        logging.info("End of training")
+        return
+
+    # Online training.
+
+    # Create an env dedicated to online episodes collection from policy rollout.
+    online_env = make_env(cfg, n_envs=cfg.training.online_rollout_batch_size)
+    resolve_delta_timestamps(cfg)
+    online_buffer_path = logger.log_dir / "online_buffer"
+    if cfg.resume and not online_buffer_path.exists():
+        # If we are resuming a run, we default to the data shapes and buffer capacity from the saved online
+        # buffer.
+        logging.warning(
+            "When online training is resumed, we load the latest online buffer from the prior run, "
+            "and this might not coincide with the state of the buffer as it was at the moment the checkpoint "
+            "was made. This is because the online buffer is updated on disk during training, independently "
+            "of our explicit checkpointing mechanisms."
+        )
+    online_dataset = OnlineBuffer(
+        online_buffer_path,
+        data_spec={
+            **{k: {"shape": v, "dtype": np.dtype("float32")} for k, v in policy.config.input_shapes.items()},
+            **{k: {"shape": v, "dtype": np.dtype("float32")} for k, v in policy.config.output_shapes.items()},
+            "next.reward": {"shape": (), "dtype": np.dtype("float32")},
+            "next.done": {"shape": (), "dtype": np.dtype("?")},
+            "next.success": {"shape": (), "dtype": np.dtype("?")},
+        },
+        buffer_capacity=cfg.training.online_buffer_capacity,
+        fps=online_env.unwrapped.metadata["render_fps"],
+        delta_timestamps=cfg.training.delta_timestamps,
+    )
+
+    # If we are doing online rollouts asynchronously, deepcopy the policy to use for online rollouts (this
+    # makes it possible to do online rollouts in parallel with training updates).
+    online_rollout_policy = deepcopy(policy) if cfg.training.do_online_rollout_async else policy
+
+    # Create dataloader for online training.
+    concat_dataset = torch.utils.data.ConcatDataset([offline_dataset, online_dataset])
+    sampler_weights = compute_sampler_weights(
+        offline_dataset,
+        offline_drop_n_last_frames=cfg.training.get("drop_n_last_frames", 0),
+        online_dataset=online_dataset,
+        # +1 because online rollouts return an extra frame for the "final observation". Note: we don't have
+        # this final observation in the offline datasets, but we might add them in future.
+        online_drop_n_last_frames=cfg.training.get("drop_n_last_frames", 0) + 1,
+        online_sampling_ratio=cfg.training.online_sampling_ratio,
+    )
+    sampler = torch.utils.data.WeightedRandomSampler(
+        sampler_weights,
+        num_samples=len(concat_dataset),
+        replacement=True,
+    )
+    dataloader = torch.utils.data.DataLoader(
+        concat_dataset,
+        batch_size=cfg.training.batch_size,
+        num_workers=cfg.training.num_workers,
+        sampler=sampler,
+        pin_memory=device.type != "cpu",
+        drop_last=True,
+    )
+    dl_iter = cycle(dataloader)
+
+    # Lock and thread pool executor for asynchronous online rollouts. When asynchronous mode is disabled,
+    # these are still used but effectively do nothing.
+    lock = Lock()
+    # Note: 1 worker because we only ever want to run one set of online rollouts at a time. Batch
+    # parallelization of rollouts is handled within the job.
+    executor = ThreadPoolExecutor(max_workers=1)
+
+    online_step = 0
+    online_rollout_s = 0  # time taken to do the online rollout
+    update_online_buffer_s = 0  # time taken to update the online buffer with the online rollout data
+    # Time taken waiting for the online buffer to finish being updated. This is relevant when using the async
+    # online rollout option.
+    await_update_online_buffer_s = 0
+    rollout_start_seed = cfg.training.online_env_seed
+
+    while True:
+        if online_step == cfg.training.online_steps:
+            break
+
+        if online_step == 0:
+            logging.info("Start online training by interacting with environment")
+
+        def sample_trajectory_and_update_buffer():
+            nonlocal rollout_start_seed
+            with lock:
+                online_rollout_policy.load_state_dict(policy.state_dict())
+            online_rollout_policy.eval()
+            start_rollout_time = time.perf_counter()
+            with torch.no_grad():
+                eval_info = eval_policy(
+                    online_env,
+                    online_rollout_policy,
+                    n_episodes=cfg.training.online_rollout_n_episodes,
+                    max_episodes_rendered=min(10, cfg.training.online_rollout_n_episodes),
+                    videos_dir=logger.log_dir / "online_rollout_videos",
+                    return_episode_data=True,
+                    start_seed=(
+                        rollout_start_seed := (rollout_start_seed + cfg.training.batch_size) % 1000000
+                    ),
+                )
+            online_rollout_s = time.perf_counter() - start_rollout_time
+
+            with lock:
+                start_update_buffer_time = time.perf_counter()
+                online_dataset.add_data(eval_info["episodes"])
+
+                # Update the concatenated dataset length used during sampling.
+                concat_dataset.cumulative_sizes = concat_dataset.cumsum(concat_dataset.datasets)
+
+                # Update the sampling weights.
+                sampler.weights = compute_sampler_weights(
+                    offline_dataset,
+                    offline_drop_n_last_frames=cfg.training.get("drop_n_last_frames", 0),
+                    online_dataset=online_dataset,
+                    # +1 because online rollouts return an extra frame for the "final observation". Note: we don't have
+                    # this final observation in the offline datasets, but we might add them in future.
+                    online_drop_n_last_frames=cfg.training.get("drop_n_last_frames", 0) + 1,
+                    online_sampling_ratio=cfg.training.online_sampling_ratio,
+                )
+                sampler.num_samples = len(concat_dataset)
+
+                update_online_buffer_s = time.perf_counter() - start_update_buffer_time
+
+            return online_rollout_s, update_online_buffer_s
+
+        future = executor.submit(sample_trajectory_and_update_buffer)
+        # If we aren't doing async rollouts, or if we haven't yet gotten enough examples in our buffer, wait
+        # here until the rollout and buffer update is done, before proceeding to the policy update steps.
+        if (
+            not cfg.training.do_online_rollout_async
+            or len(online_dataset) <= cfg.training.online_buffer_seed_size
+        ):
+            online_rollout_s, update_online_buffer_s = future.result()
+
+        if len(online_dataset) <= cfg.training.online_buffer_seed_size:
+            logging.info(
+                f"Seeding online buffer: {len(online_dataset)}/{cfg.training.online_buffer_seed_size}"
+            )
+            continue
+
+        policy.train()
+        for _ in range(cfg.training.online_steps_between_rollouts):
+            with lock:
+                start_time = time.perf_counter()
+                batch = next(dl_iter)
+                dataloading_s = time.perf_counter() - start_time
+
+            for key in batch:
+                batch[key] = batch[key].to(cfg.device, non_blocking=True)
+
+            train_info = update_policy(
+                policy,
+                batch,
+                optimizer,
+                cfg.training.grad_clip_norm,
+                grad_scaler=grad_scaler,
+                lr_scheduler=lr_scheduler,
+                use_amp=cfg.use_amp,
+                lock=lock,
+            )
+
+            train_info["dataloading_s"] = dataloading_s
+            train_info["online_rollout_s"] = online_rollout_s
+            train_info["update_online_buffer_s"] = update_online_buffer_s
+            train_info["await_update_online_buffer_s"] = await_update_online_buffer_s
+            with lock:
+                train_info["online_buffer_size"] = len(online_dataset)
+
+            if step % cfg.training.log_freq == 0:
+                log_train_info(logger, train_info, step, cfg, online_dataset, is_online=True)
+
+            # Note: evaluate_and_checkpoint_if_needed happens **after** the `step`th training update has completed,
+            # so we pass in step + 1.
+            evaluate_and_checkpoint_if_needed(step + 1, is_online=True)
+
+            step += 1
+            online_step += 1
+
+        # If we're doing async rollouts, we should now wait until we've completed them before proceeding
+        # to do the next batch of rollouts.
+        if future.running():
+            start = time.perf_counter()
+            online_rollout_s, update_online_buffer_s = future.result()
+            await_update_online_buffer_s = time.perf_counter() - start
+
+        if online_step >= cfg.training.online_steps:
+            break

    if eval_env:
        eval_env.close()
--- a/lerobot/scripts/visualize_dataset.py
+++ b/lerobot/scripts/visualize_dataset.py
@@ -108,8 +108,8 @@ def visualize_dataset(
    web_port: int = 9090,
    ws_port: int = 9087,
    save: bool = False,
-    output_dir: Path | None = None,
    root: Path | None = None,
+    output_dir: Path | None = None,
 ) -> Path | None:
    if save:
        assert (
@@ -209,6 +209,18 @@ def main():
        required=True,
        help="Episode to visualize.",
    )
+    parser.add_argument(
+        "--root",
+        type=Path,
+        default=None,
+        help="Root directory for a dataset stored locally (e.g. `--root data`). By default, the dataset will be loaded from hugging face cache folder, or downloaded from the hub if available.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=None,
+        help="Directory path to write a .rrd file when `--save 1` is set.",
+    )
    parser.add_argument(
        "--batch-size",
        type=int,
@@ -254,17 +266,6 @@ def main():
            "Visualize the data by running `rerun path/to/file.rrd` on your local machine."
        ),
    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        help="Directory path to write a .rrd file when `--save 1` is set.",
-    )
-
-    parser.add_argument(
-        "--root",
-        type=str,
-        help="Root directory for a dataset stored on a local machine.",
-    )

    args = parser.parse_args()
    visualize_dataset(**vars(args))
--- a/lerobot/scripts/visualize_dataset_html.py
+++ b/lerobot/scripts/visualize_dataset_html.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Visualize data of **all** frames of any episode of a dataset of type LeRobotDataset.
+
+Note: The last frame of the episode doesn't always correspond to a final state.
+That's because our datasets are composed of transition from state to state up to
+the antepenultimate state associated to the ultimate action to arrive in the final state.
+However, there might not be a transition from a final state to another state.
+
+Note: This script aims to visualize the data used to train the neural networks.
+~What you see is what you get~. When visualizing image modality, it is often expected to observe
+lossy compression artifacts since these images have been decoded from compressed mp4 videos to
+save disk space. The compression factor applied has been tuned to not affect success rate.
+
+Example of usage:
+
+- Visualize data stored on a local machine:
+```bash
+local$ python lerobot/scripts/visualize_dataset_html.py \
+    --repo-id lerobot/pusht
+
+local$ open http://localhost:9090
+```
+
+- Visualize data stored on a distant machine with a local viewer:
+```bash
+distant$ python lerobot/scripts/visualize_dataset_html.py \
+    --repo-id lerobot/pusht
+
+local$ ssh -L 9090:localhost:9090 distant  # create a ssh tunnel
+local$ open http://localhost:9090
+```
+
+- Select episodes to visualize:
+```bash
+python lerobot/scripts/visualize_dataset_html.py \
+    --repo-id lerobot/pusht \
+    --episodes 7 3 5 1 4
+```
+"""
+
+import argparse
+import logging
+import shutil
+from pathlib import Path
+
+import torch
+import tqdm
+from flask import Flask, redirect, render_template, url_for
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.utils.utils import init_logging
+
+
+class EpisodeSampler(torch.utils.data.Sampler):
+    """Sampler yielding, in order, the frame indices of a single episode.
+
+    The frame range is read from `dataset.episode_data_index`: it starts at the
+    episode's "from" entry and stops just before its "to" entry.
+    """
+
+    def __init__(self, dataset, episode_index):
+        bounds = dataset.episode_data_index
+        first_frame = bounds["from"][episode_index].item()
+        end_frame = bounds["to"][episode_index].item()
+        # Half-open interval [first_frame, end_frame).
+        self.frame_ids = range(first_frame, end_frame)
+
+    def __len__(self):
+        return len(self.frame_ids)
+
+    def __iter__(self):
+        return iter(self.frame_ids)
+
+
+def run_server(
+    dataset: LeRobotDataset,
+    episodes: list[int],
+    host: str,
+    port: int,
+    static_folder: Path,
+    template_folder: Path,
+):
+    """Start a Flask app that renders one page per episode (blocks inside `app.run`).
+
+    Args:
+        dataset: Dataset whose metadata and video paths are shown on each page.
+        episodes: Episode indices listed in the page; the home page redirects to the first one.
+        host: Host given to `app.run`.
+        port: Port given to `app.run`.
+        static_folder: Folder served as Flask static files (episode CSV files and videos).
+        template_folder: Folder containing `visualize_dataset_template.html`.
+
+    Raises:
+        ValueError: If `episodes` is empty.
+    """
+    if not episodes:
+        # The home page redirects to `episodes[0]`: fail here rather than with an
+        # IndexError on the first request.
+        raise ValueError("`episodes` must contain at least one episode index to serve.")
+
+    app = Flask(__name__, static_folder=static_folder.resolve(), template_folder=template_folder.resolve())
+    app.config["SEND_FILE_MAX_AGE_DEFAULT"] = 0  # specifying not to cache
+
+    @app.route("/")
+    def index():
+        # home page redirects to the first episode page
+        [dataset_namespace, dataset_name] = dataset.repo_id.split("/")
+        first_episode_id = episodes[0]
+        return redirect(
+            url_for(
+                "show_episode",
+                dataset_namespace=dataset_namespace,
+                dataset_name=dataset_name,
+                episode_id=first_episode_id,
+            )
+        )
+
+    @app.route("/<string:dataset_namespace>/<string:dataset_name>/episode_<int:episode_id>")
+    def show_episode(dataset_namespace, dataset_name, episode_id):
+        # NOTE(review): the namespace/name URL parts are not used to pick a dataset;
+        # the page always shows `dataset`.
+        dataset_info = {
+            "repo_id": dataset.repo_id,
+            "num_samples": dataset.num_samples,
+            "num_episodes": dataset.num_episodes,
+            "fps": dataset.fps,
+        }
+        video_paths = get_episode_video_paths(dataset, episode_id)
+        videos_info = [
+            {"url": url_for("static", filename=video_path), "filename": Path(video_path).name}
+            for video_path in video_paths
+        ]
+        ep_csv_url = url_for("static", filename=get_ep_csv_fname(episode_id))
+        return render_template(
+            "visualize_dataset_template.html",
+            episode_id=episode_id,
+            episodes=episodes,
+            dataset_info=dataset_info,
+            videos_info=videos_info,
+            ep_csv_url=ep_csv_url,
+            has_policy=False,
+        )
+
+    app.run(host=host, port=port)
+
+
+def get_ep_csv_fname(episode_id: int):
+    """Return the name of the CSV file holding the timeseries of episode `episode_id`."""
+    return f"episode_{episode_id}.csv"
+
+
+def write_episode_data_csv(output_dir, file_name, episode_index, dataset):
+    """Write a csv file containing timeseries data of an episode (e.g. state and action).
+    This file will be loaded by Dygraph javascript to plot data in real time.
+
+    Args:
+        output_dir: Directory (a `Path`) in which the file is written; created if missing.
+        file_name: Name of the csv file inside `output_dir`.
+        episode_index: Index of the episode in `dataset.episode_data_index`.
+        dataset: Object exposing `episode_data_index` and `hf_dataset`.
+
+    The header is `timestamp`, followed by one `state_i` column per state dimension (when the
+    dataset has an "observation.state" feature) and one `action_i` column per action dimension
+    (when it has an "action" feature). One row is written per frame of the episode.
+    """
+    # int() accepts plain ints as well as integer scalars implementing __index__/__int__
+    # (presumably 0-dim tensors here, since sibling code calls `.item()` on them).
+    from_idx = int(dataset.episode_data_index["from"][episode_index])
+    to_idx = int(dataset.episode_data_index["to"][episode_index])
+
+    has_state = "observation.state" in dataset.hf_dataset.features
+    has_action = "action" in dataset.hf_dataset.features
+
+    columns = ["timestamp"]
+    if has_state:
+        columns += ["observation.state"]
+    if has_action:
+        columns += ["action"]
+    data = dataset.hf_dataset.select_columns(columns)
+
+    # init header of csv with state and action names
+    header = ["timestamp"]
+    if has_state or has_action:
+        # Read the dimensions from a single frame instead of loading whole columns.
+        first_frame = data[0]
+        if has_state:
+            dim_state = len(first_frame["observation.state"])
+            header += [f"state_{i}" for i in range(dim_state)]
+        if has_action:
+            dim_action = len(first_frame["action"])
+            header += [f"action_{i}" for i in range(dim_action)]
+
+    rows = []
+    for i in range(from_idx, to_idx):
+        # Index the dataset once per frame: every `data[i]` fetches the row again.
+        frame = data[i]
+        row = [frame["timestamp"].item()]
+        if has_state:
+            row += frame["observation.state"].tolist()
+        if has_action:
+            row += frame["action"].tolist()
+        rows.append(row)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    with open(output_dir / file_name, "w") as f:
+        f.write(",".join(header) + "\n")
+        for row in rows:
+            f.write(",".join(str(col) for col in row) + "\n")
+
+
+def get_episode_video_paths(dataset: LeRobotDataset, ep_index: int) -> list[str]:
+    """Return one video path per key of `dataset.video_frame_keys` for episode `ep_index`.
+
+    The paths are read from the first frame of the episode (hack to get video_path of the episode).
+    """
+    episode_start = dataset.episode_data_index["from"][ep_index].item()
+    video_paths = []
+    for video_key in dataset.video_frame_keys:
+        first_frame = dataset.hf_dataset.select_columns(video_key)[episode_start]
+        video_paths.append(first_frame[video_key]["path"])
+    return video_paths
+
+
+def visualize_dataset_html(
+    repo_id: str,
+    root: Path | None = None,
+    episodes: list[int] | None = None,
+    output_dir: Path | None = None,
+    serve: bool = True,
+    host: str = "127.0.0.1",
+    port: int = 9090,
+    force_override: bool = False,
+) -> Path | None:
+    """Export the per-episode CSV files of a video dataset and optionally serve the HTML viewer.
+
+    Args:
+        repo_id: Identifier passed to `LeRobotDataset`; also used to build the default output directory.
+        root: Passed as `root` to `LeRobotDataset`.
+        episodes: Episode indices to export. `None` selects every episode of the dataset.
+        output_dir: Directory in which the `static` folder is created. Defaults to
+            `outputs/visualize_dataset_html/{repo_id}`.
+        serve: When truthy, start the web server once the CSV files are written.
+        host: Host given to the web server.
+        port: Port given to the web server.
+        force_override: When truthy, delete an already existing output directory first.
+
+    Raises:
+        NotImplementedError: If `dataset.video` is falsy (image datasets are not supported).
+
+    NOTE(review): despite the return annotation, no value is explicitly returned.
+    """
+    init_logging()
+
+    dataset = LeRobotDataset(repo_id, root=root)
+
+    if not dataset.video:
+        raise NotImplementedError(f"Image datasets ({dataset.video=}) are currently not supported.")
+
+    if output_dir is None:
+        output_dir = f"outputs/visualize_dataset_html/{repo_id}"
+
+    output_dir = Path(output_dir)
+    if output_dir.exists():
+        if force_override:
+            shutil.rmtree(output_dir)
+        else:
+            logging.info(f"Output directory already exists. Loading from it: '{output_dir}'")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create a symlink from the dataset video folder containing mp4 files to the output directory
+    # so that the http server can get access to the mp4 files.
+    static_dir = output_dir / "static"
+    static_dir.mkdir(parents=True, exist_ok=True)
+    ln_videos_dir = static_dir / "videos"
+    videos_target = dataset.videos_dir.resolve()
+    if ln_videos_dir.is_symlink():
+        # `exists()` follows symlinks, so a dangling link reports False and creating it again
+        # raises FileExistsError. Re-point links that are dangling or that target another folder.
+        if ln_videos_dir.resolve() != videos_target:
+            ln_videos_dir.unlink()
+            ln_videos_dir.symlink_to(videos_target)
+    elif not ln_videos_dir.exists():
+        ln_videos_dir.symlink_to(videos_target)
+
+    template_dir = Path(__file__).resolve().parent.parent / "templates"
+
+    if episodes is None:
+        episodes = list(range(dataset.num_episodes))
+
+    logging.info("Writing CSV files")
+    for episode_index in tqdm.tqdm(episodes):
+        # write states and actions in a csv (it can be slow for big datasets)
+        ep_csv_fname = get_ep_csv_fname(episode_index)
+        # TODO(rcadene): speedup script by loading directly from dataset, pyarrow, parquet, safetensors?
+        write_episode_data_csv(static_dir, ep_csv_fname, episode_index, dataset)
+
+    if serve:
+        run_server(dataset, episodes, host, port, static_dir, template_dir)
+
+
+def main():
+    """Parse the command-line arguments and forward them to `visualize_dataset_html`.
+
+    Every option name maps to the keyword argument of the same name (dashes become
+    underscores), since the parsed namespace is expanded with `**vars(args)`.
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--repo-id",
+        type=str,
+        required=True,
+        help="Name of hugging face repository containing a LeRobotDataset dataset (e.g. `lerobot/pusht` for https://huggingface.co/datasets/lerobot/pusht).",
+    )
+    parser.add_argument(
+        "--root",
+        type=Path,
+        default=None,
+        help="Root directory for a dataset stored locally (e.g. `--root data`). By default, the dataset will be loaded from hugging face cache folder, or downloaded from the hub if available.",
+    )
+    parser.add_argument(
+        "--episodes",
+        type=int,
+        nargs="*",
+        default=None,
+        help="Episode indices to visualize (e.g. `0 1 5 6` to load episodes of index 0, 1, 5 and 6). By default loads all episodes.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=None,
+        help="Directory path to write html files and kickoff a web server. By default write them to 'outputs/visualize_dataset_html/REPO_ID'.",
+    )
+    # `--serve` and `--force-override` are integers used as booleans (0 or 1).
+    parser.add_argument(
+        "--serve",
+        type=int,
+        default=1,
+        help="Launch web server.",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="127.0.0.1",
+        help="Web host used by the http server.",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=9090,
+        help="Web port used by the http server.",
+    )
+    parser.add_argument(
+        "--force-override",
+        type=int,
+        default=0,
+        help="Delete the output directory if it exists already.",
+    )
+
+    args = parser.parse_args()
+    visualize_dataset_html(**vars(args))
+
+
+if __name__ == "__main__":
+    main()
--- a/lerobot/scripts/visualize_image_transforms.py
+++ b/lerobot/scripts/visualize_image_transforms.py
@@ -25,7 +25,7 @@ Increase hue jitter
 ```
 python lerobot/scripts/visualize_image_transforms.py \
    dataset_repo_id=lerobot/aloha_mobile_shrimp \
-    training.image_transforms.hue.min_max=[-0.25,0.25]
+    training.image_transforms.hue.min_max="[-0.25,0.25]"
 ```

 Increase brightness & brightness weight
@@ -33,7 +33,7 @@ Increase brightness & brightness weight
 python lerobot/scripts/visualize_image_transforms.py \
    dataset_repo_id=lerobot/aloha_mobile_shrimp \
    training.image_transforms.brightness.weight=10.0 \
-    training.image_transforms.brightness.min_max=[1.0,2.0]
+    training.image_transforms.brightness.min_max="[1.0,2.0]"
 ```

 Blur images and disable saturation & hue
@@ -41,7 +41,7 @@ Blur images and disable saturation & hue
 python lerobot/scripts/visualize_image_transforms.py \
    dataset_repo_id=lerobot/aloha_mobile_shrimp \
    training.image_transforms.sharpness.weight=10.0 \
-    training.image_transforms.sharpness.min_max=[0.0,1.0] \
+    training.image_transforms.sharpness.min_max="[0.0,1.0]" \
    training.image_transforms.saturation.weight=0.0 \
    training.image_transforms.hue.weight=0.0
 ```
@@ -172,4 +172,4 @@ def visualize_transforms_cli(cfg):


 if __name__ == "__main__":
-    visualize_transforms()
+    visualize_transforms_cli()
--- a/lerobot/templates/visualize_dataset_template.html
+++ b/lerobot/templates/visualize_dataset_template.html
@@ -0,0 +1,368 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <!-- # TODO(rcadene, mishig25): store the js files locally -->
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/alpinejs/3.13.5/cdn.min.js" defer></script>
+    <script src="https://cdn.jsdelivr.net/npm/dygraphs@2.2.1/dist/dygraph.min.js" type="text/javascript"></script>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <title>{{ dataset_info.repo_id }} episode {{ episode_id }}</title>
+</head>
+
+<!-- Use [Alpine.js](https://alpinejs.dev), a lightweight and easy to learn JS framework -->
+<!-- Use [tailwindcss](https://tailwindcss.com/), CSS classes for styling html -->
+<!-- Use [dygraphs](https://dygraphs.com/), a lightweight JS charting library -->
+<body class="flex h-screen max-h-screen bg-slate-950 text-gray-200" x-data="createAlpineData()" @keydown.window="(e) => {
+    // Use the space bar to play and pause, instead of default action (e.g. scrolling)
+    const { keyCode, key } = e;
+    if (keyCode === 32 || key === ' ') {
+        e.preventDefault();
+        $refs.btnPause.classList.contains('hidden') ? $refs.btnPlay.click() : $refs.btnPause.click();
+    }else if (key === 'ArrowDown' || key === 'ArrowUp'){
+        // Move by position in the served episode list (same order as the sidebar), so that
+        // non-contiguous or unsorted selections such as `7 3 5 1 4` can still be browsed.
+        const episodeIds = {{ episodes }};
+        const currentIdx = episodeIds.indexOf({{ episode_id }});
+        const nextIdx = key === 'ArrowDown' ? currentIdx + 1 : currentIdx - 1;
+        if(currentIdx !== -1 && nextIdx >= 0 && nextIdx < episodeIds.length){
+            window.location.href = `./episode_${episodeIds[nextIdx]}`;
+        }
+    }
+}">
+    <!-- Sidebar -->
+    <div x-ref="sidebar" class="w-60 bg-slate-900 p-5 break-words max-h-screen overflow-y-auto">
+        <h1 class="mb-4 text-xl font-semibold">{{ dataset_info.repo_id }}</h1>
+
+        <ul>
+            <li>
+                Number of samples/frames: {{ dataset_info.num_samples }}
+            </li>
+            <li>
+                Number of episodes: {{ dataset_info.num_episodes }}
+            </li>
+            <li>
+                Frames per second: {{ dataset_info.fps }}
+            </li>
+        </ul>
+
+        <p>Episodes:</p>
+        <ul class="ml-2">
+            {% for episode in episodes %}
+            <li class="font-mono text-sm mt-0.5">
+                <a href="episode_{{ episode }}" class="underline {% if episode_id == episode %}font-bold -ml-1{% endif %}">
+                    Episode {{ episode }}
+                </a>
+            </li>
+            {% endfor %}
+        </ul>
+
+    </div>
+
+    <!-- Toggle sidebar button -->
+    <button class="flex items-center opacity-50 hover:opacity-100 mx-1"
+        @click="() => ($refs.sidebar.classList.toggle('hidden'))" title="Toggle sidebar">
+        <div class="bg-slate-500 w-2 h-10 rounded-full"></div>
+    </button>
+
+    <!-- Content -->
+    <div class="flex-1 max-h-screen flex flex-col gap-4 overflow-y-auto">
+        <h1 class="text-xl font-bold mt-4 font-mono">
+            Episode {{ episode_id }}
+        </h1>
+
+        <!-- Videos -->
+        <div class="flex flex-wrap gap-1">
+            {% for video_info in videos_info %}
+            <div class="max-w-96">
+                <p class="text-sm text-gray-300 bg-gray-800 px-2 rounded-t-xl truncate">{{ video_info.filename }}</p>
+                <video muted loop type="video/mp4" class="min-w-64" @canplay="videoCanPlay" @timeupdate="() => {
+                    if (video.duration) {
+                      const time = video.currentTime;
+                      const pc = (100 / video.duration) * time;
+                      $refs.slider.value = pc;
+                      dygraphTime = time;
+                      dygraphIndex = Math.floor(pc * dygraph.numRows() / 100);
+                      dygraph.setSelection(dygraphIndex, undefined, true, true);
+
+                      $refs.timer.textContent = formatTime(time) + ' / ' + formatTime(video.duration);
+
+                      updateTimeQuery(time.toFixed(2));
+                    }
+                }" @ended="() => {
+                    $refs.btnPlay.classList.remove('hidden');
+                    $refs.btnPause.classList.add('hidden');
+                }"
+                    @loadedmetadata="() => ($refs.timer.textContent = formatTime(0) + ' / ' + formatTime(video.duration))">
+                    <source src="{{ video_info.url }}">
+                    Your browser does not support the video tag.
+                </video>
+            </div>
+            {% endfor %}
+        </div>
+
+        <!-- Shortcuts info -->
+        <div class="text-sm hidden md:block">
+            Hotkeys: <span class="font-mono">Space</span> to pause/unpause, <span class="font-mono">Arrow Down</span> to go to next episode, <span class="font-mono">Arrow Up</span> to go to previous episode.
+        </div>
+
+        <!-- Controllers -->
+        <div class="flex gap-1 text-3xl items-center">
+            <button x-ref="btnPlay" class="-rotate-90" title="Play. Toggle with Space" @click="() => {
+                videos.forEach(video => video.play());
+                $refs.btnPlay.classList.toggle('hidden');
+                $refs.btnPause.classList.toggle('hidden');
+            }">🔽</button>
+            <button x-ref="btnPause" class="hidden" title="Pause. Toggle with Space" @click="() => {
+                videos.forEach(video => video.pause());
+                $refs.btnPlay.classList.toggle('hidden');
+                $refs.btnPause.classList.toggle('hidden');
+            }">⏸️</button>
+            <button title="Jump backward 5 seconds"
+                @click="() => (videos.forEach(video => (video.currentTime -= 5)))">⏪</button>
+            <button title="Jump forward 5 seconds"
+                @click="() => (videos.forEach(video => (video.currentTime += 5)))">⏩</button>
+            <button title="Rewind from start"
+                @click="() => (videos.forEach(video => (video.currentTime = 0.0)))">↩️</button>
+            <input x-ref="slider" max="100" min="0" step="1" type="range" value="0" class="w-80 mx-2" @input="() => {
+                const sliderValue = $refs.slider.value;
+                videos.forEach(video => {
+                    const time = (video.duration * sliderValue) / 100;
+                    video.currentTime = time;
+                });
+            }" />
+            <div x-ref="timer" class="font-mono text-sm border border-slate-500 rounded-lg px-1 py-0.5 shrink-0">0:00 /
+                0:00
+            </div>
+        </div>
+
+        <!-- Graph -->
+        <div class="flex gap-2 mb-4 flex-wrap">
+            <div>
+                <div id="graph" @mouseleave="() => {
+                    dygraph.setSelection(dygraphIndex, undefined, true, true);
+                    dygraphTime = video.currentTime;
+                }">
+                </div>
+                <p x-ref="graphTimer" class="font-mono ml-14 mt-4"
+                    x-init="$watch('dygraphTime', value => ($refs.graphTimer.innerText = `Time: ${dygraphTime.toFixed(2)}s`))">
+                    Time: 0.00s
+                </p>
+            </div>
+
+            <table class="text-sm border-collapse border border-slate-700" x-show="currentFrameData">
+                <thead>
+                    <tr>
+                        <th></th>
+                        <template x-for="(_, colIndex) in Array.from({length: nColumns}, (_, index) => index)">
+                            <th class="border border-slate-700">
+                                <div class="flex gap-x-2 justify-between px-2">
+                                    <input type="checkbox" :checked="isColumnChecked(colIndex)"
+                                        @change="toggleColumn(colIndex)">
+                                    <p x-text="`${columnNames[colIndex]}`"></p>
+                                </div>
+                            </th>
+                        </template>
+                    </tr>
+                </thead>
+                <tbody>
+                    <template x-for="(row, rowIndex) in rows">
+                        <tr class="odd:bg-gray-800 even:bg-gray-900">
+                            <td class="border border-slate-700">
+                                <div class="flex gap-x-2 w-24 font-semibold px-1">
+                                    <input type="checkbox" :checked="isRowChecked(rowIndex)"
+                                        @change="toggleRow(rowIndex)">
+                                    <p x-text="`Motor ${rowIndex}`"></p>
+                                </div>
+                            </td>
+                            <template x-for="(cell, colIndex) in row">
+                                <td x-show="cell" class="border border-slate-700">
+                                    <div class="flex gap-x-2 w-24 justify-between px-2">
+                                        <input type="checkbox" x-model="cell.checked" @change="updateTableValues()">
+                                        <span x-text="`${cell.value.toFixed(2)}`"
+                                            :style="`color: ${cell.color}`"></span>
+                                    </div>
+                                </td>
+                            </template>
+                        </tr>
+                    </template>
+                </tbody>
+            </table>
+
+            <div id="labels" class="hidden">
+            </div>
+        </div>
+    </div>
+
+    <script>
+        function createAlpineData() {
+            return {
+                // state
+                dygraph: null,
+                currentFrameData: null,
+                columnNames: ["state", "action", "pred action"],
+                nColumns: {% if has_policy %}3{% else %}2{% endif %},
+                checked: [],
+                dygraphTime: 0.0,
+                dygraphIndex: 0,
+                videos: null,
+                video: null,
+                colors: null,
+                nVideos: {{ videos_info | length }},
+                nVideoReadyToPlay: 0,
+
+                // alpine initialization
+                init() {
+                    // keep a handle on every video; the first one drives the time/graph synchronization
+                    this.videos = document.querySelectorAll('video');
+                    this.video = this.videos[0];
+                    this.dygraph = new Dygraph(document.getElementById("graph"), '{{ ep_csv_url }}', {
+                        pixelsPerPoint: 0.01,
+                        legend: 'always',
+                        labelsDiv: document.getElementById('labels'),
+                        labelsKMB: true,
+                        strokeWidth: 1.5,
+                        pointClickCallback: (event, point) => {
+                            this.dygraphTime = point.xval;
+                            this.updateTableValues(this.dygraphTime);
+                        },
+                        highlightCallback: (event, x, points, row, seriesName) => {
+                            this.dygraphTime = x;
+                            this.updateTableValues(this.dygraphTime);
+                        },
+                        drawCallback: (dygraph, is_initial) => {
+                            if (is_initial) {
+                                // dygraph initialization
+                                this.dygraph.setSelection(this.dygraphIndex, undefined, true, true);
+                                this.colors = this.dygraph.getColors();
+                                this.checked = Array(this.colors.length).fill(true);
+
+                                const seriesNames = this.dygraph.getLabels().slice(1);
+                                const colors = [];
+                                const LIGHTNESS = [30, 65, 85]; // state_lightness, action_lightness, pred_action_lightness
+                                let lightnessIdx = 0;
+                                const chunkSize = Math.ceil(seriesNames.length / this.nColumns);
+                                for (let i = 0; i < seriesNames.length; i += chunkSize) {
+                                    const lightness = LIGHTNESS[lightnessIdx];
+                                    // Emit exactly `chunkSize` hues per column. Stepping the hue by
+                                    // parseInt(360/chunkSize) yields extra colors when 360 is not divisible
+                                    // by chunkSize (shifting the colors of the next columns) and never
+                                    // terminates when chunkSize > 360.
+                                    for (let j = 0; j < chunkSize; j += 1) {
+                                        const hue = Math.floor((360 * j) / chunkSize);
+                                        const color = `hsl(${hue}, 100%, ${lightness}%)`;
+                                        colors.push(color);
+                                    }
+                                    lightnessIdx += 1;
+                                }
+                                this.dygraph.updateOptions({ colors });
+                                this.colors = colors;
+
+                                this.updateTableValues();
+
+                                // restore the playback position stored in the `t` query parameter, if any
+                                let url = new URL(window.location.href);
+                                let params = new URLSearchParams(url.search);
+                                let time = params.get("t");
+                                if(time){
+                                    time = parseFloat(time);
+                                    this.videos.forEach(video => (video.currentTime = time));
+                                }
+                            }
+                        },
+                    });
+                },
+
+                //#region Table Data
+
+                // turn dygraph's 1D data (at a given time t) into 2D data whose column names are defined in this.columnNames.
+                // 2d data view is used to create html table element.
+                get rows() {
+                    // one table row per motor, one cell per column (state, action and, with a policy, pred action)
+                    const frame = this.currentFrameData;
+                    if (!frame) {
+                        return [];
+                    }
+                    const nRows = Math.ceil(frame.length / this.nColumns);
+                    // missing entries are represented by null
+                    const cellAt = (idx) => frame[idx] || null;
+                    const table = [];
+                    for (let r = 0; r < nRows; r += 1) {
+                        const cells = [cellAt(r), cellAt(r + nRows)];
+                        if (this.nColumns === 3) {
+                            cells.push(cellAt(r + 2 * nRows));
+                        }
+                        table.push(cells);
+                    }
+                    return table;
+                },
+                // true when every cell of the row exists and is checked
+                isRowChecked(rowIndex) {
+                    const cells = this.rows[rowIndex];
+                    return cells.every((cell) => cell && cell.checked);
+                },
+                // true when every row has an existing, checked cell in this column
+                isColumnChecked(colIndex) {
+                    return this.rows.every((row) => {
+                        const cell = row[colIndex];
+                        return cell && cell.checked;
+                    });
+                },
+                // check the whole row unless it is already fully checked, in which case uncheck it
+                toggleRow(rowIndex) {
+                    const shouldCheck = !this.isRowChecked(rowIndex);
+                    for (const cell of this.rows[rowIndex]) {
+                        if (cell) cell.checked = shouldCheck;
+                    }
+                    this.updateTableValues();
+                },
+                // same as toggleRow, applied to one column across all rows
+                toggleColumn(colIndex) {
+                    const shouldCheck = !this.isColumnChecked(colIndex);
+                    for (const row of this.rows) {
+                        const cell = row[colIndex];
+                        if (cell) cell.checked = shouldCheck;
+                    }
+                    this.updateTableValues();
+                },
+
+                // given time t, update the values in the html table with "data[t]"
+                updateTableValues(time) {
+                    // colors are only set once the graph has been drawn
+                    if (!this.colors) {
+                        return;
+                    }
+                    const numRows = this.dygraph.numRows();
+                    if (numRows === 0) {
+                        // no data row to display
+                        return;
+                    }
+                    // position in the episode, in percent; falls back to the current video time
+                    let pc = (100 / this.video.duration) * (time === undefined ? this.video.currentTime : time);
+                    if (isNaN(pc)) pc = 0;
+                    // Clamp to a valid row: at the end of the video (pc === 100), or for a timestamp
+                    // at or past the video duration, the raw index would be >= numRows and
+                    // rawData_[index] would be undefined.
+                    const index = Math.min(Math.max(Math.floor(pc * numRows / 100), 0), numRows - 1);
+                    // slice(1) to remove the timestamp point that we do not need
+                    const labels = this.dygraph.getLabels().slice(1);
+                    const values = this.dygraph.rawData_[index].slice(1);
+                    // keep the current checkbox states; everything is checked on the first call
+                    const checkedNew = this.currentFrameData ? this.currentFrameData.map(cell => cell.checked) : Array(
+                        this.colors.length).fill(true);
+                    this.currentFrameData = labels.map((label, idx) => ({
+                        label,
+                        value: values[idx],
+                        color: this.colors[idx],
+                        checked: checkedNew[idx],
+                    }));
+                    // only touch the graph when a checkbox actually changed
+                    const shouldUpdateVisibility = !this.checked.every((value, index) => value === checkedNew[index]);
+                    if (shouldUpdateVisibility) {
+                        this.checked = checkedNew;
+                        this.dygraph.setVisibility(this.checked);
+                    }
+                },
+
+                //#endregion
+
+                // store `time` in the `t` query parameter of the current URL without adding a history entry
+                updateTimeQuery(time) {
+                    const pageUrl = new URL(window.location.href);
+                    const query = new URLSearchParams(pageUrl.search);
+                    query.set("t", time);
+                    pageUrl.search = query.toString();
+                    window.history.replaceState({}, '', pageUrl.toString());
+                },
+
+                // format a duration in seconds as `mm:ss`, or `h:mm:ss` once it reaches one hour
+                formatTime(time) {
+                    const pad = (n) => (n < 10 ? '0' + n : n);
+                    const hours = Math.floor(time / 3600);
+                    const minutes = Math.floor((time % 3600) / 60);
+                    const seconds = Math.floor(time % 60);
+                    const prefix = hours > 0 ? hours + ':' : '';
+                    return prefix + pad(minutes) + ':' + pad(seconds);
+                },
+
+                // count the videos that can play; once all of them can, click play to start them together
+                videoCanPlay() {
+                    this.nVideoReadyToPlay++;
+                    const allReady = this.nVideoReadyToPlay == this.nVideos;
+                    if (allReady) {
+                        // start autoplay all videos in sync
+                        this.$refs.btnPlay.click();
+                    }
+                }
+            };
+        }
+    </script>
+</body>
+
+</html>
--- a/media/koch/follower_90_degree.png
+++ b/media/koch/follower_90_degree.png
--- a/media/koch/follower_horizontal.png
+++ b/media/koch/follower_horizontal.png
--- a/media/koch/follower_rest.webp
+++ b/media/koch/follower_rest.webp
--- a/media/koch/follower_rotated.webp
+++ b/media/koch/follower_rotated.webp
--- a/media/koch/follower_zero.webp
+++ b/media/koch/follower_zero.webp
--- a/media/koch/leader_90_degree.png
+++ b/media/koch/leader_90_degree.png
--- a/media/koch/leader_horizontal.png
+++ b/media/koch/leader_horizontal.png
--- a/media/koch/leader_rest.webp
+++ b/media/koch/leader_rest.webp
--- a/media/koch/leader_rotated.webp
+++ b/media/koch/leader_rotated.webp
--- a/media/koch/leader_zero.webp
+++ b/media/koch/leader_zero.webp
--- a/media/tutorial/koch_v1_1_leader_follower.webp
+++ b/media/tutorial/koch_v1_1_leader_follower.webp
--- a/media/tutorial/visualize_dataset_html.webp
+++ b/media/tutorial/visualize_dataset_html.webp
--- a/poetry.lock
+++ b/poetry.lock
@@ -192,6 +192,17 @@ charset-normalizer = ["charset-normalizer"]
 html5lib = ["html5lib"]
 lxml = ["lxml"]

+[[package]]
+name = "blinker"
+version = "1.8.2"
+description = "Fast, simple object-to-object and broadcast signaling"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "blinker-1.8.2-py3-none-any.whl", hash = "sha256:1779309f71bf239144b9399d06ae925637cf6634cf6bd131104184531bf67c01"},
+    {file = "blinker-1.8.2.tar.gz", hash = "sha256:8f77b09d3bf7c795e969e9486f39c2c5e9c39d4ee07424be2bc594ece9642d83"},
+]
+
 [[package]]
 name = "certifi"
 version = "2024.7.4"
@@ -444,63 +455,63 @@ files = [

 [[package]]
 name = "coverage"
-version = "7.5.4"
+version = "7.6.0"
 description = "Code coverage measurement for Python"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "coverage-7.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6cfb5a4f556bb51aba274588200a46e4dd6b505fb1a5f8c5ae408222eb416f99"},
-    {file = "coverage-7.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2174e7c23e0a454ffe12267a10732c273243b4f2d50d07544a91198f05c48f47"},
-    {file = "coverage-7.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2214ee920787d85db1b6a0bd9da5f8503ccc8fcd5814d90796c2f2493a2f4d2e"},
-    {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1137f46adb28e3813dec8c01fefadcb8c614f33576f672962e323b5128d9a68d"},
-    {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b385d49609f8e9efc885790a5a0e89f2e3ae042cdf12958b6034cc442de428d3"},
-    {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b4a474f799456e0eb46d78ab07303286a84a3140e9700b9e154cfebc8f527016"},
-    {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5cd64adedf3be66f8ccee418473c2916492d53cbafbfcff851cbec5a8454b136"},
-    {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e564c2cf45d2f44a9da56f4e3a26b2236504a496eb4cb0ca7221cd4cc7a9aca9"},
-    {file = "coverage-7.5.4-cp310-cp310-win32.whl", hash = "sha256:7076b4b3a5f6d2b5d7f1185fde25b1e54eb66e647a1dfef0e2c2bfaf9b4c88c8"},
-    {file = "coverage-7.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:018a12985185038a5b2bcafab04ab833a9a0f2c59995b3cec07e10074c78635f"},
-    {file = "coverage-7.5.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:db14f552ac38f10758ad14dd7b983dbab424e731588d300c7db25b6f89e335b5"},
-    {file = "coverage-7.5.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3257fdd8e574805f27bb5342b77bc65578e98cbc004a92232106344053f319ba"},
-    {file = "coverage-7.5.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a6612c99081d8d6134005b1354191e103ec9705d7ba2754e848211ac8cacc6b"},
-    {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d45d3cbd94159c468b9b8c5a556e3f6b81a8d1af2a92b77320e887c3e7a5d080"},
-    {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed550e7442f278af76d9d65af48069f1fb84c9f745ae249c1a183c1e9d1b025c"},
-    {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7a892be37ca35eb5019ec85402c3371b0f7cda5ab5056023a7f13da0961e60da"},
-    {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8192794d120167e2a64721d88dbd688584675e86e15d0569599257566dec9bf0"},
-    {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:820bc841faa502e727a48311948e0461132a9c8baa42f6b2b84a29ced24cc078"},
-    {file = "coverage-7.5.4-cp311-cp311-win32.whl", hash = "sha256:6aae5cce399a0f065da65c7bb1e8abd5c7a3043da9dceb429ebe1b289bc07806"},
-    {file = "coverage-7.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:d2e344d6adc8ef81c5a233d3a57b3c7d5181f40e79e05e1c143da143ccb6377d"},
-    {file = "coverage-7.5.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:54317c2b806354cbb2dc7ac27e2b93f97096912cc16b18289c5d4e44fc663233"},
-    {file = "coverage-7.5.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:042183de01f8b6d531e10c197f7f0315a61e8d805ab29c5f7b51a01d62782747"},
-    {file = "coverage-7.5.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6bb74ed465d5fb204b2ec41d79bcd28afccf817de721e8a807d5141c3426638"},
-    {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3d45ff86efb129c599a3b287ae2e44c1e281ae0f9a9bad0edc202179bcc3a2e"},
-    {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5013ed890dc917cef2c9f765c4c6a8ae9df983cd60dbb635df8ed9f4ebc9f555"},
-    {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1014fbf665fef86cdfd6cb5b7371496ce35e4d2a00cda501cf9f5b9e6fced69f"},
-    {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3684bc2ff328f935981847082ba4fdc950d58906a40eafa93510d1b54c08a66c"},
-    {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:581ea96f92bf71a5ec0974001f900db495488434a6928a2ca7f01eee20c23805"},
-    {file = "coverage-7.5.4-cp312-cp312-win32.whl", hash = "sha256:73ca8fbc5bc622e54627314c1a6f1dfdd8db69788f3443e752c215f29fa87a0b"},
-    {file = "coverage-7.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:cef4649ec906ea7ea5e9e796e68b987f83fa9a718514fe147f538cfeda76d7a7"},
-    {file = "coverage-7.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdd31315fc20868c194130de9ee6bfd99755cc9565edff98ecc12585b90be882"},
-    {file = "coverage-7.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:02ff6e898197cc1e9fa375581382b72498eb2e6d5fc0b53f03e496cfee3fac6d"},
-    {file = "coverage-7.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d05c16cf4b4c2fc880cb12ba4c9b526e9e5d5bb1d81313d4d732a5b9fe2b9d53"},
-    {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5986ee7ea0795a4095ac4d113cbb3448601efca7f158ec7f7087a6c705304e4"},
-    {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df54843b88901fdc2f598ac06737f03d71168fd1175728054c8f5a2739ac3e4"},
-    {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ab73b35e8d109bffbda9a3e91c64e29fe26e03e49addf5b43d85fc426dde11f9"},
-    {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:aea072a941b033813f5e4814541fc265a5c12ed9720daef11ca516aeacd3bd7f"},
-    {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:16852febd96acd953b0d55fc842ce2dac1710f26729b31c80b940b9afcd9896f"},
-    {file = "coverage-7.5.4-cp38-cp38-win32.whl", hash = "sha256:8f894208794b164e6bd4bba61fc98bf6b06be4d390cf2daacfa6eca0a6d2bb4f"},
-    {file = "coverage-7.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:e2afe743289273209c992075a5a4913e8d007d569a406ffed0bd080ea02b0633"},
-    {file = "coverage-7.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b95c3a8cb0463ba9f77383d0fa8c9194cf91f64445a63fc26fb2327e1e1eb088"},
-    {file = "coverage-7.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7564cc09dd91b5a6001754a5b3c6ecc4aba6323baf33a12bd751036c998be4"},
-    {file = "coverage-7.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44da56a2589b684813f86d07597fdf8a9c6ce77f58976727329272f5a01f99f7"},
-    {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e16f3d6b491c48c5ae726308e6ab1e18ee830b4cdd6913f2d7f77354b33f91c8"},
-    {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbc5958cb471e5a5af41b0ddaea96a37e74ed289535e8deca404811f6cb0bc3d"},
-    {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a04e990a2a41740b02d6182b498ee9796cf60eefe40cf859b016650147908029"},
-    {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ddbd2f9713a79e8e7242d7c51f1929611e991d855f414ca9996c20e44a895f7c"},
-    {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b1ccf5e728ccf83acd313c89f07c22d70d6c375a9c6f339233dcf792094bcbf7"},
-    {file = "coverage-7.5.4-cp39-cp39-win32.whl", hash = "sha256:56b4eafa21c6c175b3ede004ca12c653a88b6f922494b023aeb1e836df953ace"},
-    {file = "coverage-7.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:65e528e2e921ba8fd67d9055e6b9f9e34b21ebd6768ae1c1723f4ea6ace1234d"},
-    {file = "coverage-7.5.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:79b356f3dd5b26f3ad23b35c75dbdaf1f9e2450b6bcefc6d0825ea0aa3f86ca5"},
-    {file = "coverage-7.5.4.tar.gz", hash = "sha256:a44963520b069e12789d0faea4e9fdb1e410cdc4aab89d94f7f55cbb7fef0353"},
+    {file = "coverage-7.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dff044f661f59dace805eedb4a7404c573b6ff0cdba4a524141bc63d7be5c7fd"},
+    {file = "coverage-7.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8659fd33ee9e6ca03950cfdcdf271d645cf681609153f218826dd9805ab585c"},
+    {file = "coverage-7.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7792f0ab20df8071d669d929c75c97fecfa6bcab82c10ee4adb91c7a54055463"},
+    {file = "coverage-7.6.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d4b3cd1ca7cd73d229487fa5caca9e4bc1f0bca96526b922d61053ea751fe791"},
+    {file = "coverage-7.6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7e128f85c0b419907d1f38e616c4f1e9f1d1b37a7949f44df9a73d5da5cd53c"},
+    {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a94925102c89247530ae1dab7dc02c690942566f22e189cbd53579b0693c0783"},
+    {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dcd070b5b585b50e6617e8972f3fbbee786afca71b1936ac06257f7e178f00f6"},
+    {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d50a252b23b9b4dfeefc1f663c568a221092cbaded20a05a11665d0dbec9b8fb"},
+    {file = "coverage-7.6.0-cp310-cp310-win32.whl", hash = "sha256:0e7b27d04131c46e6894f23a4ae186a6a2207209a05df5b6ad4caee6d54a222c"},
+    {file = "coverage-7.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:54dece71673b3187c86226c3ca793c5f891f9fc3d8aa183f2e3653da18566169"},
+    {file = "coverage-7.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7b525ab52ce18c57ae232ba6f7010297a87ced82a2383b1afd238849c1ff933"},
+    {file = "coverage-7.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bea27c4269234e06f621f3fac3925f56ff34bc14521484b8f66a580aacc2e7d"},
+    {file = "coverage-7.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed8d1d1821ba5fc88d4a4f45387b65de52382fa3ef1f0115a4f7a20cdfab0e94"},
+    {file = "coverage-7.6.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01c322ef2bbe15057bc4bf132b525b7e3f7206f071799eb8aa6ad1940bcf5fb1"},
+    {file = "coverage-7.6.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03cafe82c1b32b770a29fd6de923625ccac3185a54a5e66606da26d105f37dac"},
+    {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0d1b923fc4a40c5832be4f35a5dab0e5ff89cddf83bb4174499e02ea089daf57"},
+    {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4b03741e70fb811d1a9a1d75355cf391f274ed85847f4b78e35459899f57af4d"},
+    {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a73d18625f6a8a1cbb11eadc1d03929f9510f4131879288e3f7922097a429f63"},
+    {file = "coverage-7.6.0-cp311-cp311-win32.whl", hash = "sha256:65fa405b837060db569a61ec368b74688f429b32fa47a8929a7a2f9b47183713"},
+    {file = "coverage-7.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:6379688fb4cfa921ae349c76eb1a9ab26b65f32b03d46bb0eed841fd4cb6afb1"},
+    {file = "coverage-7.6.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f7db0b6ae1f96ae41afe626095149ecd1b212b424626175a6633c2999eaad45b"},
+    {file = "coverage-7.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bbdf9a72403110a3bdae77948b8011f644571311c2fb35ee15f0f10a8fc082e8"},
+    {file = "coverage-7.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cc44bf0315268e253bf563f3560e6c004efe38f76db03a1558274a6e04bf5d5"},
+    {file = "coverage-7.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da8549d17489cd52f85a9829d0e1d91059359b3c54a26f28bec2c5d369524807"},
+    {file = "coverage-7.6.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0086cd4fc71b7d485ac93ca4239c8f75732c2ae3ba83f6be1c9be59d9e2c6382"},
+    {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fad32ee9b27350687035cb5fdf9145bc9cf0a094a9577d43e909948ebcfa27b"},
+    {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:044a0985a4f25b335882b0966625270a8d9db3d3409ddc49a4eb00b0ef5e8cee"},
+    {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:76d5f82213aa78098b9b964ea89de4617e70e0d43e97900c2778a50856dac605"},
+    {file = "coverage-7.6.0-cp312-cp312-win32.whl", hash = "sha256:3c59105f8d58ce500f348c5b56163a4113a440dad6daa2294b5052a10db866da"},
+    {file = "coverage-7.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca5d79cfdae420a1d52bf177de4bc2289c321d6c961ae321503b2ca59c17ae67"},
+    {file = "coverage-7.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d39bd10f0ae453554798b125d2f39884290c480f56e8a02ba7a6ed552005243b"},
+    {file = "coverage-7.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:beb08e8508e53a568811016e59f3234d29c2583f6b6e28572f0954a6b4f7e03d"},
+    {file = "coverage-7.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2e16f4cd2bc4d88ba30ca2d3bbf2f21f00f382cf4e1ce3b1ddc96c634bc48ca"},
+    {file = "coverage-7.6.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6616d1c9bf1e3faea78711ee42a8b972367d82ceae233ec0ac61cc7fec09fa6b"},
+    {file = "coverage-7.6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad4567d6c334c46046d1c4c20024de2a1c3abc626817ae21ae3da600f5779b44"},
+    {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d17c6a415d68cfe1091d3296ba5749d3d8696e42c37fca5d4860c5bf7b729f03"},
+    {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9146579352d7b5f6412735d0f203bbd8d00113a680b66565e205bc605ef81bc6"},
+    {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cdab02a0a941af190df8782aafc591ef3ad08824f97850b015c8c6a8b3877b0b"},
+    {file = "coverage-7.6.0-cp38-cp38-win32.whl", hash = "sha256:df423f351b162a702c053d5dddc0fc0ef9a9e27ea3f449781ace5f906b664428"},
+    {file = "coverage-7.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:f2501d60d7497fd55e391f423f965bbe9e650e9ffc3c627d5f0ac516026000b8"},
+    {file = "coverage-7.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7221f9ac9dad9492cecab6f676b3eaf9185141539d5c9689d13fd6b0d7de840c"},
+    {file = "coverage-7.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ddaaa91bfc4477d2871442bbf30a125e8fe6b05da8a0015507bfbf4718228ab2"},
+    {file = "coverage-7.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4cbe651f3904e28f3a55d6f371203049034b4ddbce65a54527a3f189ca3b390"},
+    {file = "coverage-7.6.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:831b476d79408ab6ccfadaaf199906c833f02fdb32c9ab907b1d4aa0713cfa3b"},
+    {file = "coverage-7.6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46c3d091059ad0b9c59d1034de74a7f36dcfa7f6d3bde782c49deb42438f2450"},
+    {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4d5fae0a22dc86259dee66f2cc6c1d3e490c4a1214d7daa2a93d07491c5c04b6"},
+    {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:07ed352205574aad067482e53dd606926afebcb5590653121063fbf4e2175166"},
+    {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:49c76cdfa13015c4560702574bad67f0e15ca5a2872c6a125f6327ead2b731dd"},
+    {file = "coverage-7.6.0-cp39-cp39-win32.whl", hash = "sha256:482855914928c8175735a2a59c8dc5806cf7d8f032e4820d52e845d1f731dca2"},
+    {file = "coverage-7.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:543ef9179bc55edfd895154a51792b01c017c87af0ebaae092720152e19e42ca"},
+    {file = "coverage-7.6.0-pp38.pp39.pp310-none-any.whl", hash = "sha256:6fe885135c8a479d3e37a7aae61cbd3a0fb2deccb4dda3c25f92a49189f766d6"},
+    {file = "coverage-7.6.0.tar.gz", hash = "sha256:289cc803fa1dc901f84701ac10c9ee873619320f2f9aff38794db4a4a0268d51"},
 ]

 [package.dependencies]
@@ -584,17 +595,6 @@ files = [
    {file = "debugpy-1.8.2.zip", hash = "sha256:95378ed08ed2089221896b9b3a8d021e642c24edc8fef20e5d4342ca8be65c00"},
 ]

-[[package]]
-name = "decorator"
-version = "4.4.2"
-description = "Decorators for Humans"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*"
-files = [
-    {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
-    {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
-]
-
 [[package]]
 name = "deepdiff"
 version = "7.0.1"
@@ -615,18 +615,18 @@ optimize = ["orjson"]

 [[package]]
 name = "diffusers"
-version = "0.27.2"
+version = "0.29.2"
 description = "State-of-the-art diffusion in PyTorch and JAX."
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "diffusers-0.27.2-py3-none-any.whl", hash = "sha256:85da5cd1098ab428535d592136973ec0c3f12f78148c94b379cb9f02d2414e75"},
-    {file = "diffusers-0.27.2.tar.gz", hash = "sha256:6cefd7770d7fc1d139614233aa17cdcd639c138d0c3517b8d8bbc8cf573050a0"},
+    {file = "diffusers-0.29.2-py3-none-any.whl", hash = "sha256:d5e9bb13c8097b4eed10df23d1294d2e5a418f53e3f89c7ef228b5b982970428"},
+    {file = "diffusers-0.29.2.tar.gz", hash = "sha256:b85f277668e22089cf68b40dd9b76940db7d24ba9cdac107533ed10ab8e4e9db"},
 ]

 [package.dependencies]
 filelock = "*"
-huggingface-hub = ">=0.20.2"
+huggingface-hub = ">=0.23.2"
 importlib-metadata = "*"
 numpy = "*"
 Pillow = "*"
@@ -635,13 +635,13 @@ requests = "*"
 safetensors = ">=0.3.1"

 [package.extras]
-dev = ["GitPython (<3.1.19)", "Jinja2", "accelerate (>=0.11.0)", "compel (==0.1.8)", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.1.5)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "torch (>=1.4)", "torchvision", "transformers (>=4.25.1)", "urllib3 (<=2.0.0)"]
+dev = ["GitPython (<3.1.19)", "Jinja2", "accelerate (>=0.29.3)", "compel (==0.1.8)", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.1.5)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "torch (>=1.4)", "torchvision", "transformers (>=4.25.1)", "urllib3 (<=2.0.0)"]
 docs = ["hf-doc-builder (>=0.3.0)"]
 flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"]
 quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<=2.0.0)"]
 test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "torchvision", "transformers (>=4.25.1)"]
-torch = ["accelerate (>=0.11.0)", "torch (>=1.4)"]
-training = ["Jinja2", "accelerate (>=0.11.0)", "datasets", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "tensorboard"]
+torch = ["accelerate (>=0.29.3)", "torch (>=1.4)"]
+training = ["Jinja2", "accelerate (>=0.29.3)", "datasets", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "tensorboard"]

 [[package]]
 name = "dill"
@@ -843,13 +843,13 @@ files = [

 [[package]]
 name = "exceptiongroup"
-version = "1.2.1"
+version = "1.2.2"
 description = "Backport of PEP 654 (exception groups)"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
-    {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
+    {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
+    {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
 ]

 [package.extras]
@@ -893,6 +893,28 @@ docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1
 testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"]
 typing = ["typing-extensions (>=4.8)"]

+[[package]]
+name = "flask"
+version = "3.0.3"
+description = "A simple framework for building complex web applications."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "flask-3.0.3-py3-none-any.whl", hash = "sha256:34e815dfaa43340d1d15a5c3a02b8476004037eb4840b34910c6e21679d288f3"},
+    {file = "flask-3.0.3.tar.gz", hash = "sha256:ceb27b0af3823ea2737928a4d99d125a06175b8512c445cbd9a9ce200ef76842"},
+]
+
+[package.dependencies]
+blinker = ">=1.6.2"
+click = ">=8.1.3"
+itsdangerous = ">=2.1.2"
+Jinja2 = ">=3.1.2"
+Werkzeug = ">=3.0.0"
+
+[package.extras]
+async = ["asgiref (>=3.2)"]
+dotenv = ["python-dotenv"]
+
 [[package]]
 name = "frozenlist"
 version = "1.4.1"
@@ -1132,7 +1154,7 @@ pyarrow = ">=12.0.0"
 type = "git"
 url = "https://github.com/dora-rs/dora-lerobot.git"
 reference = "HEAD"
-resolved_reference = "2addd1131a3c94f7b70b805577901b7967853e98"
+resolved_reference = "fda22deba84c46695369736edd34dc740aef45eb"
 subdirectory = "gym_dora"

 [[package]]
@@ -1338,19 +1360,20 @@ files = [

 [[package]]
 name = "huggingface-hub"
-version = "0.23.4"
+version = "0.23.5"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.23.4-py3-none-any.whl", hash = "sha256:3a0b957aa87150addf0cc7bd71b4d954b78e749850e1e7fb29ebbd2db64ca037"},
-    {file = "huggingface_hub-0.23.4.tar.gz", hash = "sha256:35d99016433900e44ae7efe1c209164a5a81dbbcd53a52f99c281dcd7ce22431"},
+    {file = "huggingface_hub-0.23.5-py3-none-any.whl", hash = "sha256:d7a7d337615e11a45cc14a0ce5a605db6b038dc24af42866f731684825226e90"},
+    {file = "huggingface_hub-0.23.5.tar.gz", hash = "sha256:67a9caba79b71235be3752852ca27da86bd54311d2424ca8afdb8dda056edf98"},
 ]

 [package.dependencies]
 filelock = "*"
 fsspec = ">=2023.5.0"
 hf-transfer = {version = ">=0.1.4", optional = true, markers = "extra == \"hf-transfer\""}
+InquirerPy = {version = "0.3.4", optional = true, markers = "extra == \"cli\""}
 packaging = ">=20.9"
 pyyaml = ">=5.1"
 requests = "*"
@@ -1389,13 +1412,13 @@ packaging = "*"

 [[package]]
 name = "identify"
-version = "2.5.36"
+version = "2.6.0"
 description = "File identification library for Python"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "identify-2.5.36-py2.py3-none-any.whl", hash = "sha256:37d93f380f4de590500d9dba7db359d0d3da95ffe7f9de1753faa159e71e7dfa"},
-    {file = "identify-2.5.36.tar.gz", hash = "sha256:e5e00f54165f9047fbebeb4a560f9acfb8af4c88232be60a488e9b68d122745d"},
+    {file = "identify-2.6.0-py2.py3-none-any.whl", hash = "sha256:e79ae4406387a9d300332b5fd366d8994f1525e8414984e1a59e058b2eda2dd0"},
+    {file = "identify-2.6.0.tar.gz", hash = "sha256:cb171c685bdc31bcc4c1734698736a7d5b6c8bf2e0c15117f4d469c8640ae5cf"},
 ]

 [package.extras]
@@ -1537,6 +1560,24 @@ files = [
    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
 ]

+[[package]]
+name = "inquirerpy"
+version = "0.3.4"
+description = "Python port of Inquirer.js (A collection of common interactive command-line user interfaces)"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4"},
+    {file = "InquirerPy-0.3.4.tar.gz", hash = "sha256:89d2ada0111f337483cb41ae31073108b2ec1e618a49d7110b0d7ade89fc197e"},
+]
+
+[package.dependencies]
+pfzy = ">=0.3.1,<0.4.0"
+prompt-toolkit = ">=3.0.1,<4.0.0"
+
+[package.extras]
+docs = ["Sphinx (>=4.1.2,<5.0.0)", "furo (>=2021.8.17-beta.43,<2022.0.0)", "myst-parser (>=0.15.1,<0.16.0)", "sphinx-autobuild (>=2021.3.14,<2022.0.0)", "sphinx-copybutton (>=0.4.0,<0.5.0)"]
+
 [[package]]
 name = "intel-openmp"
 version = "2021.4.0"
@@ -1551,6 +1592,17 @@ files = [
    {file = "intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:eef4c8bcc8acefd7f5cd3b9384dbf73d59e2c99fc56545712ded913f43c4a94f"},
 ]

+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+description = "Safely pass data to untrusted environments and back."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"},
+    {file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"},
+]
+
 [[package]]
 name = "jinja2"
 version = "3.1.4"
@@ -1906,30 +1958,6 @@ files = [
 intel-openmp = "==2021.*"
 tbb = "==2021.*"

-[[package]]
-name = "moviepy"
-version = "1.0.3"
-description = "Video editing with Python"
-optional = false
-python-versions = "*"
-files = [
-    {file = "moviepy-1.0.3.tar.gz", hash = "sha256:2884e35d1788077db3ff89e763c5ba7bfddbd7ae9108c9bc809e7ba58fa433f5"},
-]
-
-[package.dependencies]
-decorator = ">=4.0.2,<5.0"
-imageio = {version = ">=2.5,<3.0", markers = "python_version >= \"3.4\""}
-imageio_ffmpeg = {version = ">=0.2.0", markers = "python_version >= \"3.4\""}
-numpy = {version = ">=1.17.3", markers = "python_version > \"2.7\""}
-proglog = "<=1.0.0"
-requests = ">=2.8.1,<3.0"
-tqdm = ">=4.11.2,<5.0"
-
-[package.extras]
-doc = ["Sphinx (>=1.5.2,<2.0)", "numpydoc (>=0.6.0,<1.0)", "pygame (>=1.9.3,<2.0)", "sphinx_rtd_theme (>=0.1.10b0,<1.0)"]
-optional = ["matplotlib (>=2.0.0,<3.0)", "opencv-python (>=3.0,<4.0)", "scikit-image (>=0.13.0,<1.0)", "scikit-learn", "scipy (>=0.19.0,<1.5)", "youtube_dl"]
-test = ["coverage (<5.0)", "coveralls (>=1.1,<2.0)", "pytest (>=3.0.0,<4.0)", "pytest-cov (>=2.5.1,<3.0)", "requests (>=2.8.1,<3.0)"]
-
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@@ -2175,43 +2203,36 @@ numpy = ">=1.22,<2.1"

 [[package]]
 name = "numcodecs"
-version = "0.12.1"
+version = "0.13.0"
 description = "A Python package providing buffer compression and transformation codecs for use in data storage and communication applications."
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.10"
 files = [
-    {file = "numcodecs-0.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d37f628fe92b3699e65831d5733feca74d2e33b50ef29118ffd41c13c677210e"},
-    {file = "numcodecs-0.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:941b7446b68cf79f089bcfe92edaa3b154533dcbcd82474f994b28f2eedb1c60"},
-    {file = "numcodecs-0.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e79bf9d1d37199ac00a60ff3adb64757523291d19d03116832e600cac391c51"},
-    {file = "numcodecs-0.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:82d7107f80f9307235cb7e74719292d101c7ea1e393fe628817f0d635b7384f5"},
-    {file = "numcodecs-0.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eeaf42768910f1c6eebf6c1bb00160728e62c9343df9e2e315dc9fe12e3f6071"},
-    {file = "numcodecs-0.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:135b2d47563f7b9dc5ee6ce3d1b81b0f1397f69309e909f1a35bb0f7c553d45e"},
-    {file = "numcodecs-0.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a191a8e347ecd016e5c357f2bf41fbcb026f6ffe78fff50c77ab12e96701d155"},
-    {file = "numcodecs-0.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:21d8267bd4313f4d16f5b6287731d4c8ebdab236038f29ad1b0e93c9b2ca64ee"},
-    {file = "numcodecs-0.12.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:2f84df6b8693206365a5b37c005bfa9d1be486122bde683a7b6446af4b75d862"},
-    {file = "numcodecs-0.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:760627780a8b6afdb7f942f2a0ddaf4e31d3d7eea1d8498cf0fd3204a33c4618"},
-    {file = "numcodecs-0.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c258bd1d3dfa75a9b708540d23b2da43d63607f9df76dfa0309a7597d1de3b73"},
-    {file = "numcodecs-0.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:e04649ea504aff858dbe294631f098fbfd671baf58bfc04fc48d746554c05d67"},
-    {file = "numcodecs-0.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:caf1a1e6678aab9c1e29d2109b299f7a467bd4d4c34235b1f0e082167846b88f"},
-    {file = "numcodecs-0.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c17687b1fd1fef68af616bc83f896035d24e40e04e91e7e6dae56379eb59fe33"},
-    {file = "numcodecs-0.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29dfb195f835a55c4d490fb097aac8c1bcb96c54cf1b037d9218492c95e9d8c5"},
-    {file = "numcodecs-0.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:2f1ba2f4af3fd3ba65b1bcffb717fe65efe101a50a91c368f79f3101dbb1e243"},
-    {file = "numcodecs-0.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2fbb12a6a1abe95926f25c65e283762d63a9bf9e43c0de2c6a1a798347dfcb40"},
-    {file = "numcodecs-0.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f2207871868b2464dc11c513965fd99b958a9d7cde2629be7b2dc84fdaab013b"},
-    {file = "numcodecs-0.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abff3554a6892a89aacf7b642a044e4535499edf07aeae2f2e6e8fc08c9ba07f"},
-    {file = "numcodecs-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:ef964d4860d3e6b38df0633caf3e51dc850a6293fd8e93240473642681d95136"},
-    {file = "numcodecs-0.12.1.tar.gz", hash = "sha256:05d91a433733e7eef268d7e80ec226a0232da244289614a8f3826901aec1098e"},
+    {file = "numcodecs-0.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56e49f68ce6aeba29f144992524c8897d94f846d02bbcc820dd29d7c5c2a073e"},
+    {file = "numcodecs-0.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:17bc4b568214582f4c623700592f633f3afd920848630049c584fa1e535253ad"},
+    {file = "numcodecs-0.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eed420a9c62d0a569aa94a387f93045f068ad3e7bbd787c6ce70bc5fefbaa7d9"},
+    {file = "numcodecs-0.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:e7d3b9693df52eeaf978d2a56971d01cf9b4e284ae769ec764807f2087cce51d"},
+    {file = "numcodecs-0.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f208a1b8b5e66c767ed043812ca74d9045e09b7b2e085d064a585c30b9efc8e7"},
+    {file = "numcodecs-0.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a68368d3ce625ec76fcacd84785f6110d30a232909d5c6093a7aa25628880477"},
+    {file = "numcodecs-0.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5904216811f2e9d312c23ffaad3b3d4c7442a3583d3a8bf81ca8319e9f5deb5"},
+    {file = "numcodecs-0.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:208cab0f4d9cf4409e9c4a4c935e165833786614822c81dee9d865af372da9df"},
+    {file = "numcodecs-0.13.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f3cf462d2357998d7f6baaa0427657b0eeda3eb79fba2b146d2d04542912a513"},
+    {file = "numcodecs-0.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ac4dd5556fb126271e93bd1a02266e21b01d3617db448d70d00eec8e034506b4"},
+    {file = "numcodecs-0.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:820be89729583c91601a6b35c052008cdd2665b25bfedb91b367cc155fb34ba0"},
+    {file = "numcodecs-0.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:d67a859dd8a7f026829e91cb1799c26720cc9d29ee4ae0060cc7a581670abc06"},
+    {file = "numcodecs-0.13.0.tar.gz", hash = "sha256:ba4fac7036ea5a078c7afe1d4dffeb9685080d42f19c9c16b12dad866703aa2e"},
 ]

 [package.dependencies]
 numpy = ">=1.7"

 [package.extras]
-docs = ["mock", "numpydoc", "sphinx (<7.0.0)", "sphinx-issues"]
+docs = ["mock", "numpydoc", "pydata-sphinx-theme", "sphinx (<7.0.0)", "sphinx-issues"]
 msgpack = ["msgpack"]
-test = ["coverage", "flake8", "pytest", "pytest-cov"]
+pcodec = ["pcodec (>=0.2.0)"]
+test = ["coverage", "pytest", "pytest-cov"]
 test-extras = ["importlib-metadata"]
-zfpy = ["zfpy (>=1.0.0)"]
+zfpy = ["numpy (<2.0.0)", "zfpy (>=1.0.0)"]

 [[package]]
 name = "numpy"
@@ -2562,6 +2583,20 @@ other = ["pillow (>=8.0.1)"]
 sisl = ["box2d-py (==2.3.5)", "pygame (==2.3.0)", "pymunk (==6.2.0)", "scipy (>=1.4.1)"]
 testing = ["AutoROM", "pre-commit", "pynput", "pytest", "pytest-cov", "pytest-markdown-docs", "pytest-xdist"]

+[[package]]
+name = "pfzy"
+version = "0.3.4"
+description = "Python port of the fzy fuzzy string matching algorithm"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "pfzy-0.3.4-py3-none-any.whl", hash = "sha256:5f50d5b2b3207fa72e7ec0ef08372ef652685470974a107d0d4999fc5a903a96"},
+    {file = "pfzy-0.3.4.tar.gz", hash = "sha256:717ea765dd10b63618e7298b2d98efd819e0b30cd5905c9707223dceeb94b3f1"},
+]
+
+[package.extras]
+docs = ["Sphinx (>=4.1.2,<5.0.0)", "furo (>=2021.8.17-beta.43,<2022.0.0)", "myst-parser (>=0.15.1,<0.16.0)", "sphinx-autobuild (>=2021.3.14,<2022.0.0)", "sphinx-copybutton (>=0.4.0,<0.5.0)"]
+
 [[package]]
 name = "pillow"
 version = "10.4.0"
@@ -2709,18 +2744,18 @@ pyyaml = ">=5.1"
 virtualenv = ">=20.10.0"

 [[package]]
-name = "proglog"
-version = "0.1.10"
-description = "Log and progress bar manager for console, notebooks, web..."
+name = "prompt-toolkit"
+version = "3.0.47"
+description = "Library for building powerful interactive command lines in Python"
 optional = false
-python-versions = "*"
+python-versions = ">=3.7.0"
 files = [
-    {file = "proglog-0.1.10-py3-none-any.whl", hash = "sha256:19d5da037e8c813da480b741e3fa71fb1ac0a5b02bf21c41577c7f327485ec50"},
-    {file = "proglog-0.1.10.tar.gz", hash = "sha256:658c28c9c82e4caeb2f25f488fff9ceace22f8d69b15d0c1c86d64275e4ddab4"},
+    {file = "prompt_toolkit-3.0.47-py3-none-any.whl", hash = "sha256:0d7bfa67001d5e39d02c224b663abc33687405033a8c422d0d675a5a13361d10"},
+    {file = "prompt_toolkit-3.0.47.tar.gz", hash = "sha256:1e1b29cb58080b1e69f207c893a1a7bf16d127a5c30c9d17a25a5d77792e5360"},
 ]

 [package.dependencies]
-tqdm = "*"
+wcwidth = "*"

 [[package]]
 name = "protobuf"
@@ -2773,52 +2808,55 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]

 [[package]]
 name = "pyarrow"
-version = "16.1.0"
+version = "17.0.0"
 description = "Python library for Apache Arrow"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"},
-    {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"},
-    {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"},
-    {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"},
-    {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"},
-    {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"},
-    {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"},
-    {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"},
-    {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"},
-    {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"},
-    {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"},
-    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"},
-    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"},
-    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"},
-    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"},
-    {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"},
-    {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"},
-    {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"},
-    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"},
-    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"},
-    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"},
-    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"},
-    {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"},
-    {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"},
+    {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"},
+    {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"},
+    {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"},
+    {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"},
+    {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"},
+    {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"},
+    {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"},
+    {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"},
+    {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
+    {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
+    {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
+    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"},
+    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"},
+    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"},
+    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"},
+    {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"},
+    {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"},
+    {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"},
+    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"},
+    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"},
+    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"},
+    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"},
+    {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"},
+    {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
 ]

 [package.dependencies]
 numpy = ">=1.16.6"

+[package.extras]
+test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
+
 [[package]]
 name = "pyarrow-hotfix"
 version = "0.6"
@@ -3221,23 +3259,6 @@ pytest = ">=4.6"
 [package.extras]
 testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]

-[[package]]
-name = "pytest-mock"
-version = "3.14.0"
-description = "Thin-wrapper around the mock package for easier use with pytest"
-optional = true
-python-versions = ">=3.8"
-files = [
-    {file = "pytest-mock-3.14.0.tar.gz", hash = "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"},
-    {file = "pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f"},
-]
-
-[package.dependencies]
-pytest = ">=6.2.5"
-
-[package.extras]
-dev = ["pre-commit", "pytest-asyncio", "tox"]
-
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -3449,16 +3470,16 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

 [[package]]
 name = "rerun-sdk"
-version = "0.16.1"
+version = "0.17.0"
 description = "The Rerun Logging SDK"
 optional = false
 python-versions = "<3.13,>=3.8"
 files = [
-    {file = "rerun_sdk-0.16.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:170c6976634008611753e10dfef8cdc395ce8180e634c169e7c61cef2f89a277"},
-    {file = "rerun_sdk-0.16.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c9a76eab7eb5559276737dad655200e9350df0837158dbc5a896970ab4201454"},
-    {file = "rerun_sdk-0.16.1-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:4d6436752d57e8b8038489a0e7e37f0c760b088e96db5fb81667d3a376d63fea"},
-    {file = "rerun_sdk-0.16.1-cp38-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:37b7b47948471873e84f224b16f417a94a91c7cbd6c72c68281eeff1ba414b8f"},
-    {file = "rerun_sdk-0.16.1-cp38-abi3-win_amd64.whl", hash = "sha256:be88799c8afdf68eafa99e64e2e4f0a484e187e017a180219abbe6bb988acd4e"},
+    {file = "rerun_sdk-0.17.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:abd34f746eada83b8bb0bc50007183151981d7ccf18306f3d42165819a3f6fcb"},
+    {file = "rerun_sdk-0.17.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:8b0a8a6feab3f8e679801d158216a71d88a81480021587719330f50d083c4d26"},
+    {file = "rerun_sdk-0.17.0-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:ad55807abafb01e527846742e087819aac8e103f1ec15aadc563a4038bb44e1d"},
+    {file = "rerun_sdk-0.17.0-cp38-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:9d41f1f475270b1e0d50ddb8cb62e0d828988f0c371ac8457af25c8be5aa1dc0"},
+    {file = "rerun_sdk-0.17.0-cp38-abi3-win_amd64.whl", hash = "sha256:34e5595a326cbdddfebdf00b08e877358c564fce74cc8c6d617fc89ef3a6aa70"},
 ]

 [package.dependencies]
@@ -3469,6 +3490,7 @@ pyarrow = ">=14.0.2"
 typing-extensions = ">=4.5"

 [package.extras]
+notebook = ["rerun-notebook (==0.17.0)"]
 tests = ["pytest (==7.1.2)"]

 [[package]]
@@ -3595,27 +3617,32 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"]

 [[package]]
 name = "scikit-image"
-version = "0.23.2"
+version = "0.24.0"
 description = "Image processing in Python"
 optional = true
-python-versions = ">=3.10"
+python-versions = ">=3.9"
 files = [
-    {file = "scikit_image-0.23.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f9a8db6c52f8d0e1474ea8320d7b8db442b4d6baa29dd0acbd02f8a49572f18a"},
-    {file = "scikit_image-0.23.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:524b51a7440e46ed2ebbde7bc288bf2dde1dee2caafdd9513b2aca38a48223b7"},
-    {file = "scikit_image-0.23.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b335c229170d787b3fb8c60d220f72049ccf862d5191a3cfda6ac84b995ac4e"},
-    {file = "scikit_image-0.23.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08b10781efbd6b084f3c847ff4049b657241ea866b9e331b14bf791dcb3e6661"},
-    {file = "scikit_image-0.23.2-cp310-cp310-win_amd64.whl", hash = "sha256:a207352e9a1956dda1424bbe872c7795345187138118e8be6a421aef3b988c2a"},
-    {file = "scikit_image-0.23.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee83fdb1843ee938eabdfeb9498623282935ea30aa20dffc5d5d16698efb4b2a"},
-    {file = "scikit_image-0.23.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:a158f50d3df4867bbd1c698520ede8bc493e430ad83f54ac1f0d8f57b328779b"},
-    {file = "scikit_image-0.23.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55de3326be124334b89314e9e04c8971ad98d6681e11a243f71bfb85ef9554b0"},
-    {file = "scikit_image-0.23.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fce619a6d84fe40c1208fa579b646e93ce13ef0afc3652a23e9782b2c183291a"},
-    {file = "scikit_image-0.23.2-cp311-cp311-win_amd64.whl", hash = "sha256:ee65669aa586e110346f567ed5c92d1bd63799a19e951cb83da3f54b0caf7c52"},
-    {file = "scikit_image-0.23.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:15bfb4e8d7bd90a967e6a3c3ab6be678063fc45e950b730684a8db46a02ff892"},
-    {file = "scikit_image-0.23.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5736e66d01b11cd90988ec24ab929c80a03af28f690189c951886891ebf63154"},
-    {file = "scikit_image-0.23.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3597ac5d8f51dafbcb7433ef1fdefdefb535f50745b2002ae0a5d651df4f063b"},
-    {file = "scikit_image-0.23.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1978be2abe3c3c3189a99a411d48bbb1306f7c2debb3aefbf426e23947f26623"},
-    {file = "scikit_image-0.23.2-cp312-cp312-win_amd64.whl", hash = "sha256:ae32bf0cb02b672ed74d28880ca6f88928ae8dd794d67e04fa3ff4836feb9bd6"},
-    {file = "scikit_image-0.23.2.tar.gz", hash = "sha256:c9da4b2c3117e3e30364a3d14496ee5c72b09eb1a4ab1292b302416faa360590"},
+    {file = "scikit_image-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cb3bc0264b6ab30b43c4179ee6156bc18b4861e78bb329dd8d16537b7bbf827a"},
+    {file = "scikit_image-0.24.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:9c7a52e20cdd760738da38564ba1fed7942b623c0317489af1a598a8dedf088b"},
+    {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93f46e6ce42e5409f4d09ce1b0c7f80dd7e4373bcec635b6348b63e3c886eac8"},
+    {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39ee0af13435c57351a3397eb379e72164ff85161923eec0c38849fecf1b4764"},
+    {file = "scikit_image-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:7ac7913b028b8aa780ffae85922894a69e33d1c0bf270ea1774f382fe8bf95e7"},
+    {file = "scikit_image-0.24.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:272909e02a59cea3ed4aa03739bb88df2625daa809f633f40b5053cf09241831"},
+    {file = "scikit_image-0.24.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:190ebde80b4470fe8838764b9b15f232a964f1a20391663e31008d76f0c696f7"},
+    {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59c98cc695005faf2b79904e4663796c977af22586ddf1b12d6af2fa22842dc2"},
+    {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c"},
+    {file = "scikit_image-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:dacf591ac0c272a111181afad4b788a27fe70d213cfddd631d151cbc34f8ca2c"},
+    {file = "scikit_image-0.24.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6fccceb54c9574590abcddc8caf6cefa57c13b5b8b4260ab3ff88ad8f3c252b3"},
+    {file = "scikit_image-0.24.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ccc01e4760d655aab7601c1ba7aa4ddd8b46f494ac46ec9c268df6f33ccddf4c"},
+    {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563"},
+    {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8579bda9c3f78cb3b3ed8b9425213c53a25fa7e994b7ac01f2440b395babf660"},
+    {file = "scikit_image-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:82ab903afa60b2da1da2e6f0c8c65e7c8868c60a869464c41971da929b3e82bc"},
+    {file = "scikit_image-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009"},
+    {file = "scikit_image-0.24.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e9aadb442360a7e76f0c5c9d105f79a83d6df0e01e431bd1d5757e2c5871a1f3"},
+    {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e37de6f4c1abcf794e13c258dc9b7d385d5be868441de11c180363824192ff7"},
+    {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4688c18bd7ec33c08d7bf0fd19549be246d90d5f2c1d795a89986629af0a1e83"},
+    {file = "scikit_image-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:56dab751d20b25d5d3985e95c9b4e975f55573554bd76b0aedf5875217c93e69"},
+    {file = "scikit_image-0.24.0.tar.gz", hash = "sha256:5d16efe95da8edbeb363e0c4157b99becbd650a60b77f6e3af5768b66cf007ab"},
 ]

 [package.dependencies]
@@ -3680,13 +3707,13 @@ test = ["Cython", "array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "me

 [[package]]
 name = "sentry-sdk"
-version = "2.7.1"
+version = "2.10.0"
 description = "Python client for Sentry (https://sentry.io)"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "sentry_sdk-2.7.1-py2.py3-none-any.whl", hash = "sha256:ef1b3d54eb715825657cd4bb3cb42bb4dc85087bac14c56b0fd8c21abd968c9a"},
-    {file = "sentry_sdk-2.7.1.tar.gz", hash = "sha256:25006c7e68b75aaa5e6b9c6a420ece22e8d7daec4b7a906ffd3a8607b67c037b"},
+    {file = "sentry_sdk-2.10.0-py2.py3-none-any.whl", hash = "sha256:87b3d413c87d8e7f816cc9334bff255a83d8b577db2b22042651c30c19c09190"},
+    {file = "sentry_sdk-2.10.0.tar.gz", hash = "sha256:545fcc6e36c335faa6d6cda84669b6e17025f31efbf3b2211ec14efe008b75d1"},
 ]

 [package.dependencies]
@@ -3830,67 +3857,63 @@ test = ["pytest"]

 [[package]]
 name = "setuptools"
-version = "70.2.0"
+version = "71.0.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-70.2.0-py3-none-any.whl", hash = "sha256:b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05"},
-    {file = "setuptools-70.2.0.tar.gz", hash = "sha256:bd63e505105011b25c3c11f753f7e3b8465ea739efddaccef8f0efac2137bac1"},
+    {file = "setuptools-71.0.0-py3-none-any.whl", hash = "sha256:f06fbe978a91819d250a30e0dc4ca79df713d909e24438a42d0ec300fc52247f"},
+    {file = "setuptools-71.0.0.tar.gz", hash = "sha256:98da3b8aca443b9848a209ae4165e2edede62633219afa493a58fbba57f72e2e"},
 ]

 [package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.text (>=3.7)", "more-itertools (>=8.8)", "ordered-set (>=3.1.1)", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (<7.4)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (<0.4)", "pytest-ruff (>=0.2.1)", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]

 [[package]]
 name = "shapely"
-version = "2.0.4"
+version = "2.0.5"
 description = "Manipulation and analysis of geometric objects"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "shapely-2.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:011b77153906030b795791f2fdfa2d68f1a8d7e40bce78b029782ade3afe4f2f"},
-    {file = "shapely-2.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9831816a5d34d5170aa9ed32a64982c3d6f4332e7ecfe62dc97767e163cb0b17"},
-    {file = "shapely-2.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5c4849916f71dc44e19ed370421518c0d86cf73b26e8656192fcfcda08218fbd"},
-    {file = "shapely-2.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:841f93a0e31e4c64d62ea570d81c35de0f6cea224568b2430d832967536308e6"},
-    {file = "shapely-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b4431f522b277c79c34b65da128029a9955e4481462cbf7ebec23aab61fc58"},
-    {file = "shapely-2.0.4-cp310-cp310-win32.whl", hash = "sha256:92a41d936f7d6743f343be265ace93b7c57f5b231e21b9605716f5a47c2879e7"},
-    {file = "shapely-2.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:30982f79f21bb0ff7d7d4a4e531e3fcaa39b778584c2ce81a147f95be1cd58c9"},
-    {file = "shapely-2.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de0205cb21ad5ddaef607cda9a3191eadd1e7a62a756ea3a356369675230ac35"},
-    {file = "shapely-2.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7d56ce3e2a6a556b59a288771cf9d091470116867e578bebced8bfc4147fbfd7"},
-    {file = "shapely-2.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:58b0ecc505bbe49a99551eea3f2e8a9b3b24b3edd2a4de1ac0dc17bc75c9ec07"},
-    {file = "shapely-2.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:790a168a808bd00ee42786b8ba883307c0e3684ebb292e0e20009588c426da47"},
-    {file = "shapely-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4310b5494271e18580d61022c0857eb85d30510d88606fa3b8314790df7f367d"},
-    {file = "shapely-2.0.4-cp311-cp311-win32.whl", hash = "sha256:63f3a80daf4f867bd80f5c97fbe03314348ac1b3b70fb1c0ad255a69e3749879"},
-    {file = "shapely-2.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:c52ed79f683f721b69a10fb9e3d940a468203f5054927215586c5d49a072de8d"},
-    {file = "shapely-2.0.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:5bbd974193e2cc274312da16b189b38f5f128410f3377721cadb76b1e8ca5328"},
-    {file = "shapely-2.0.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:41388321a73ba1a84edd90d86ecc8bfed55e6a1e51882eafb019f45895ec0f65"},
-    {file = "shapely-2.0.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0776c92d584f72f1e584d2e43cfc5542c2f3dd19d53f70df0900fda643f4bae6"},
-    {file = "shapely-2.0.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c75c98380b1ede1cae9a252c6dc247e6279403fae38c77060a5e6186c95073ac"},
-    {file = "shapely-2.0.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3e700abf4a37b7b8b90532fa6ed5c38a9bfc777098bc9fbae5ec8e618ac8f30"},
-    {file = "shapely-2.0.4-cp312-cp312-win32.whl", hash = "sha256:4f2ab0faf8188b9f99e6a273b24b97662194160cc8ca17cf9d1fb6f18d7fb93f"},
-    {file = "shapely-2.0.4-cp312-cp312-win_amd64.whl", hash = "sha256:03152442d311a5e85ac73b39680dd64a9892fa42bb08fd83b3bab4fe6999bfa0"},
-    {file = "shapely-2.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:994c244e004bc3cfbea96257b883c90a86e8cbd76e069718eb4c6b222a56f78b"},
-    {file = "shapely-2.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05ffd6491e9e8958b742b0e2e7c346635033d0a5f1a0ea083547fcc854e5d5cf"},
-    {file = "shapely-2.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbdc1140a7d08faa748256438291394967aa54b40009f54e8d9825e75ef6113"},
-    {file = "shapely-2.0.4-cp37-cp37m-win32.whl", hash = "sha256:5af4cd0d8cf2912bd95f33586600cac9c4b7c5053a036422b97cfe4728d2eb53"},
-    {file = "shapely-2.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:464157509ce4efa5ff285c646a38b49f8c5ef8d4b340f722685b09bb033c5ccf"},
-    {file = "shapely-2.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:489c19152ec1f0e5c5e525356bcbf7e532f311bff630c9b6bc2db6f04da6a8b9"},
-    {file = "shapely-2.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b79bbd648664aa6f44ef018474ff958b6b296fed5c2d42db60078de3cffbc8aa"},
-    {file = "shapely-2.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:674d7baf0015a6037d5758496d550fc1946f34bfc89c1bf247cabdc415d7747e"},
-    {file = "shapely-2.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6cd4ccecc5ea5abd06deeaab52fcdba372f649728050c6143cc405ee0c166679"},
-    {file = "shapely-2.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb5cdcbbe3080181498931b52a91a21a781a35dcb859da741c0345c6402bf00c"},
-    {file = "shapely-2.0.4-cp38-cp38-win32.whl", hash = "sha256:55a38dcd1cee2f298d8c2ebc60fc7d39f3b4535684a1e9e2f39a80ae88b0cea7"},
-    {file = "shapely-2.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:ec555c9d0db12d7fd777ba3f8b75044c73e576c720a851667432fabb7057da6c"},
-    {file = "shapely-2.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3f9103abd1678cb1b5f7e8e1af565a652e036844166c91ec031eeb25c5ca8af0"},
-    {file = "shapely-2.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:263bcf0c24d7a57c80991e64ab57cba7a3906e31d2e21b455f493d4aab534aaa"},
-    {file = "shapely-2.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ddf4a9bfaac643e62702ed662afc36f6abed2a88a21270e891038f9a19bc08fc"},
-    {file = "shapely-2.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:485246fcdb93336105c29a5cfbff8a226949db37b7473c89caa26c9bae52a242"},
-    {file = "shapely-2.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8de4578e838a9409b5b134a18ee820730e507b2d21700c14b71a2b0757396acc"},
-    {file = "shapely-2.0.4-cp39-cp39-win32.whl", hash = "sha256:9dab4c98acfb5fb85f5a20548b5c0abe9b163ad3525ee28822ffecb5c40e724c"},
-    {file = "shapely-2.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:31c19a668b5a1eadab82ff070b5a260478ac6ddad3a5b62295095174a8d26398"},
-    {file = "shapely-2.0.4.tar.gz", hash = "sha256:5dc736127fac70009b8d309a0eeb74f3e08979e530cf7017f2f507ef62e6cfb8"},
+    {file = "shapely-2.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:89d34787c44f77a7d37d55ae821f3a784fa33592b9d217a45053a93ade899375"},
+    {file = "shapely-2.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:798090b426142df2c5258779c1d8d5734ec6942f778dab6c6c30cfe7f3bf64ff"},
+    {file = "shapely-2.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45211276900c4790d6bfc6105cbf1030742da67594ea4161a9ce6812a6721e68"},
+    {file = "shapely-2.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e119444bc27ca33e786772b81760f2028d930ac55dafe9bc50ef538b794a8e1"},
+    {file = "shapely-2.0.5-cp310-cp310-win32.whl", hash = "sha256:9a4492a2b2ccbeaebf181e7310d2dfff4fdd505aef59d6cb0f217607cb042fb3"},
+    {file = "shapely-2.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:1e5cb5ee72f1bc7ace737c9ecd30dc174a5295fae412972d3879bac2e82c8fae"},
+    {file = "shapely-2.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5bbfb048a74cf273db9091ff3155d373020852805a37dfc846ab71dde4be93ec"},
+    {file = "shapely-2.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93be600cbe2fbaa86c8eb70656369f2f7104cd231f0d6585c7d0aa555d6878b8"},
+    {file = "shapely-2.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f8e71bb9a46814019f6644c4e2560a09d44b80100e46e371578f35eaaa9da1c"},
+    {file = "shapely-2.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5251c28a29012e92de01d2e84f11637eb1d48184ee8f22e2df6c8c578d26760"},
+    {file = "shapely-2.0.5-cp311-cp311-win32.whl", hash = "sha256:35110e80070d664781ec7955c7de557456b25727a0257b354830abb759bf8311"},
+    {file = "shapely-2.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:6c6b78c0007a34ce7144f98b7418800e0a6a5d9a762f2244b00ea560525290c9"},
+    {file = "shapely-2.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:03bd7b5fa5deb44795cc0a503999d10ae9d8a22df54ae8d4a4cd2e8a93466195"},
+    {file = "shapely-2.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ff9521991ed9e201c2e923da014e766c1aa04771bc93e6fe97c27dcf0d40ace"},
+    {file = "shapely-2.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b65365cfbf657604e50d15161ffcc68de5cdb22a601bbf7823540ab4918a98d"},
+    {file = "shapely-2.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21f64e647a025b61b19585d2247137b3a38a35314ea68c66aaf507a1c03ef6fe"},
+    {file = "shapely-2.0.5-cp312-cp312-win32.whl", hash = "sha256:3ac7dc1350700c139c956b03d9c3df49a5b34aaf91d024d1510a09717ea39199"},
+    {file = "shapely-2.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:30e8737983c9d954cd17feb49eb169f02f1da49e24e5171122cf2c2b62d65c95"},
+    {file = "shapely-2.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ff7731fea5face9ec08a861ed351734a79475631b7540ceb0b66fb9732a5f529"},
+    {file = "shapely-2.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff9e520af0c5a578e174bca3c18713cd47a6c6a15b6cf1f50ac17dc8bb8db6a2"},
+    {file = "shapely-2.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b299b91557b04acb75e9732645428470825061f871a2edc36b9417d66c1fc5"},
+    {file = "shapely-2.0.5-cp37-cp37m-win32.whl", hash = "sha256:b5870633f8e684bf6d1ae4df527ddcb6f3895f7b12bced5c13266ac04f47d231"},
+    {file = "shapely-2.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:401cb794c5067598f50518e5a997e270cd7642c4992645479b915c503866abed"},
+    {file = "shapely-2.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e91ee179af539100eb520281ba5394919067c6b51824e6ab132ad4b3b3e76dd0"},
+    {file = "shapely-2.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8af6f7260f809c0862741ad08b1b89cb60c130ae30efab62320bbf4ee9cc71fa"},
+    {file = "shapely-2.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5456dd522800306ba3faef77c5ba847ec30a0bd73ab087a25e0acdd4db2514f"},
+    {file = "shapely-2.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b714a840402cde66fd7b663bb08cacb7211fa4412ea2a209688f671e0d0631fd"},
+    {file = "shapely-2.0.5-cp38-cp38-win32.whl", hash = "sha256:7e8cf5c252fac1ea51b3162be2ec3faddedc82c256a1160fc0e8ddbec81b06d2"},
+    {file = "shapely-2.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:4461509afdb15051e73ab178fae79974387f39c47ab635a7330d7fee02c68a3f"},
+    {file = "shapely-2.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7545a39c55cad1562be302d74c74586f79e07b592df8ada56b79a209731c0219"},
+    {file = "shapely-2.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4c83a36f12ec8dee2066946d98d4d841ab6512a6ed7eb742e026a64854019b5f"},
+    {file = "shapely-2.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89e640c2cd37378480caf2eeda9a51be64201f01f786d127e78eaeff091ec897"},
+    {file = "shapely-2.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06efe39beafde3a18a21dde169d32f315c57da962826a6d7d22630025200c5e6"},
+    {file = "shapely-2.0.5-cp39-cp39-win32.whl", hash = "sha256:8203a8b2d44dcb366becbc8c3d553670320e4acf0616c39e218c9561dd738d92"},
+    {file = "shapely-2.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:7fed9dbfbcfec2682d9a047b9699db8dcc890dfca857ecba872c42185fc9e64e"},
+    {file = "shapely-2.0.5.tar.gz", hash = "sha256:bff2366bc786bfa6cb353d6b47d0443c570c32776612e527ee47b6df63fcfe32"},
 ]

 [package.dependencies]
@@ -3935,17 +3958,20 @@ files = [

 [[package]]
 name = "sympy"
-version = "1.12.1"
+version = "1.13.0"
 description = "Computer algebra system (CAS) in Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"},
-    {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"},
+    {file = "sympy-1.13.0-py3-none-any.whl", hash = "sha256:6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92"},
+    {file = "sympy-1.13.0.tar.gz", hash = "sha256:3b6af8f4d008b9a1a6a4268b335b984b23835f26d1d60b0526ebc71d48a25f57"},
 ]

 [package.dependencies]
-mpmath = ">=1.1.0,<1.4.0"
+mpmath = ">=1.1.0,<1.4"
+
+[package.extras]
+dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]

 [[package]]
 name = "tbb"
@@ -4237,6 +4263,34 @@ perf = ["orjson"]
 sweeps = ["sweeps (>=0.2.0)"]
 workspaces = ["wandb-workspaces"]

+[[package]]
+name = "wcwidth"
+version = "0.2.13"
+description = "Measures the displayed width of unicode strings in a terminal"
+optional = false
+python-versions = "*"
+files = [
+    {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"},
+    {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
+]
+
+[[package]]
+name = "werkzeug"
+version = "3.0.3"
+description = "The comprehensive WSGI web application library."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
+    {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
+]
+
+[package.dependencies]
+MarkupSafe = ">=2.1.1"
+
+[package.extras]
+watchdog = ["watchdog (>=2.3)"]
+
 [[package]]
 name = "xxhash"
 version = "3.4.1"
@@ -4499,7 +4553,7 @@ dev = ["debugpy", "pre-commit"]
 dora = ["gym-dora"]
 koch = ["dynamixel-sdk", "pynput"]
 pusht = ["gym-pusht"]
-test = ["pytest", "pytest-cov", "pytest-mock"]
+test = ["pytest", "pytest-cov"]
 umi = ["imagecodecs"]
 video-benchmark = ["pandas", "scikit-image"]
 xarm = ["gym-xarm"]
@@ -4507,4 +4561,4 @@ xarm = ["gym-xarm"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "2c59d869c6b1f2132070387f3d371b5b004765ae853501bbd522eb400738f2d0"
+content-hash = "a340f2ed23db2f3c371c494cbc9a33392e122ed6713e6098277a87b3fb805f2b"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,12 +38,12 @@ einops = ">=0.8.0"
 pymunk = ">=6.6.0"
 zarr = ">=2.17.0"
 numba = ">=0.59.0"
-torch = "^2.2.1"
+torch = ">=2.2.1"
 opencv-python = ">=4.9.0"
-diffusers = "^0.27.2"
+diffusers = ">=0.27.2"
 torchvision = ">=0.17.1"
 h5py = ">=3.10.0"
-huggingface-hub = {extras = ["hf-transfer"], version = "^0.23.0"}
+huggingface-hub = {extras = ["hf-transfer", "cli"], version = ">=0.23.0"}
 gymnasium = ">=0.29.1"
 cmake = ">=3.29.0.1"
 gym-dora = { git = "https://github.com/dora-rs/dora-lerobot.git", subdirectory = "gym_dora", optional = true }
@@ -54,17 +54,18 @@ pre-commit = {version = ">=3.7.0", optional = true}
 debugpy = {version = ">=1.8.1", optional = true}
 pytest = {version = ">=8.1.0", optional = true}
 pytest-cov = {version = ">=5.0.0", optional = true}
-datasets = "^2.19.0"
+datasets = ">=2.19.0"
 imagecodecs = { version = ">=2024.1.1", optional = true }
 pyav = ">=12.0.5"
-moviepy = ">=1.0.3"
 rerun-sdk = ">=0.15.1"
 deepdiff = ">=7.0.1"
-scikit-image = {version = "^0.23.2", optional = true}
-pandas = {version = "^2.2.2", optional = true}
-pytest-mock = {version = "^3.14.0", optional = true}
-dynamixel-sdk = {version = "^3.7.31", optional = true}
-pynput = {version = "^1.7.7", optional = true}
+flask = ">=3.0.3"
+pandas = {version = ">=2.2.2", optional = true}
+scikit-image = {version = ">=0.23.2", optional = true}
+dynamixel-sdk = {version = ">=3.7.31", optional = true}
+pynput = {version = ">=1.7.7", optional = true}
+# TODO(rcadene, salibert): 71.0.1 has a bug
+setuptools = {version = "!=71.0.1", optional = true}



@@ -74,7 +75,7 @@ pusht = ["gym-pusht"]
 xarm = ["gym-xarm"]
 aloha = ["gym-aloha"]
 dev = ["pre-commit", "debugpy"]
-test = ["pytest", "pytest-cov", "pytest-mock"]
+test = ["pytest", "pytest-cov"]
 umi = ["imagecodecs"]
 video_benchmark = ["scikit-image", "pandas"]
 koch = ["dynamixel-sdk", "pynput"]
@@ -110,7 +111,6 @@ exclude = [

 [tool.ruff.lint]
 select = ["E4", "E7", "E9", "F", "I", "N", "B", "C4", "SIM"]
-ignore-init-module-imports = true


 [build-system]
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,7 +15,9 @@
 # limitations under the License.
 import pytest

-from .utils import DEVICE
+from lerobot.common.utils.utils import init_hydra_config
+
+from .utils import DEVICE, KOCH_ROBOT_CONFIG_PATH


 def pytest_collection_finish():
@@ -27,11 +29,12 @@ def is_koch_available():
    try:
        from lerobot.common.robot_devices.robots.factory import make_robot

-        robot = make_robot("koch")
+        robot_cfg = init_hydra_config(KOCH_ROBOT_CONFIG_PATH)
+        robot = make_robot(robot_cfg)
        robot.connect()
        del robot
        return True
    except Exception as e:
-        print("An alexander koch robot is not available.")
+        print("A koch robot is not available.")
        print(e)
        return False
--- a/tests/data/lerobot/aloha_mobile_cabinet/meta_data/episode_data_index.safetensors
+++ b/tests/data/lerobot/aloha_mobile_cabinet/meta_data/episode_data_index.safetensors
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f9347c8d9ac90ee44e6dd86f65043438168df6bbe4bab2d2b875e55ef7376ef
-size 1488
+oid sha256:7841afb9ef99c0601448c43a20c25eb029440c73816319c67c5d7e1c5cde2445
+size 136
--- a/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json
+++ b/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf148247bf191c7f7e8af738a7b9e147f9ffffeec0e4b9d1c4783c4e384da7eb
-size 33
+oid sha256:50e40e4c2bb523fca0b54e9a9635281312e9c6f9d757db03c06a0865c5508f29
+size 188
--- a/tests/data/lerobot/aloha_mobile_cabinet/meta_data/stats.safetensors
+++ b/tests/data/lerobot/aloha_mobile_cabinet/meta_data/stats.safetensors
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02fc4ea25766269f65752a60b0594c43d799b0ae528cd773bf024b064b5aa329
+oid sha256:03508d82db846a804aef1a28aec3cb9572e3105b55a02b6ddbb09b2522d57b84
 size 4344
--- a/tests/data/lerobot/aloha_mobile_cabinet/train/data-00000-of-00001.arrow
+++ b/tests/data/lerobot/aloha_mobile_cabinet/train/data-00000-of-00001.arrow
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:55d7b1a06fe3e3051482752740074348bdb5fc98fb2e305b06d6203994117b27
+oid sha256:7009b3d2f14d6af497eeb32a52332e79cb9c07db24a6c2bbfbeffbaa8151dd69
 size 592448
--- a/tests/data/lerobot/aloha_mobile_cabinet/train/state.json
+++ b/tests/data/lerobot/aloha_mobile_cabinet/train/state.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:98329e4b40e9be0d63f7d36da9d86c44bbe7eeeb1b10d3ba973c923f3be70867
+oid sha256:34ece24fb6b302db0b68987858509f31713fb299faa9a9d34b8fd68f10bc3100
 size 247
--- a/tests/data/lerobot/aloha_mobile_cabinet/videos/observation.images.cam_high_episode_000000.mp4
+++ b/tests/data/lerobot/aloha_mobile_cabinet/videos/observation.images.cam_high_episode_000000.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54e42cdfd016a0ced2ab1fe2966a8c15a2384e0dbe1a2fe87433a2d1b8209ac0
-size 5220057
+oid sha256:a70cc17019407cf6bee44fa2c78b4f29e48eb1696aa1a4ff4c048ba256574523
+size 6356921
--- a/tests/data/lerobot/aloha_mobile_cabinet/videos/observation.images.cam_left_wrist_episode_000000.mp4
+++ b/tests/data/lerobot/aloha_mobile_cabinet/videos/observation.images.cam_left_wrist_episode_000000.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af1ded2a244cb47a96255b75f584a643edf6967e13bb5464b330ffdd9d7ad859
-size 5284692
+oid sha256:2b35992036e6dcee7d4df6d1675d55d1dd2d658b2d65442737e709895699a2f0
+size 5084448
--- a/tests/data/lerobot/aloha_mobile_cabinet/videos/observation.images.cam_right_wrist_episode_000000.mp4
+++ b/tests/data/lerobot/aloha_mobile_cabinet/videos/observation.images.cam_right_wrist_episode_000000.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13d1bebabd79984fd6715971be758ef9a354495adea5e8d33f4e7904365e112b
-size 5258380
+oid sha256:3aa92e6b6bd0e39f6de530ea6a270671db7350cdc101c9d9030c775539c708c1
+size 5441406
--- a/tests/data/lerobot/aloha_mobile_chair/meta_data/episode_data_index.safetensors
+++ b/tests/data/lerobot/aloha_mobile_chair/meta_data/episode_data_index.safetensors
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f33bc6810f0b91817a42610364cb49ed1b99660f058f0f9407e6f5920d0aee02
-size 1008
+oid sha256:4ee862b1a6dc1d11df77c36c47ea00db88ad35a48e4d71c2940ad26b55fe2167
+size 136
--- a/tests/data/lerobot/aloha_mobile_chair/meta_data/info.json
+++ b/tests/data/lerobot/aloha_mobile_chair/meta_data/info.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf148247bf191c7f7e8af738a7b9e147f9ffffeec0e4b9d1c4783c4e384da7eb
-size 33
+oid sha256:50e40e4c2bb523fca0b54e9a9635281312e9c6f9d757db03c06a0865c5508f29
+size 188
--- a/tests/data/lerobot/aloha_mobile_chair/meta_data/stats.safetensors
+++ b/tests/data/lerobot/aloha_mobile_chair/meta_data/stats.safetensors
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b58d6c89e936a781a307805ebecf0dd473fbc02d52a7094da62e54bffb9454a
+oid sha256:095c30bfe3c5da168c85aceef905e74e2142866332282965aa6812f6e6e48448
 size 4344
--- a/tests/data/lerobot/aloha_mobile_chair/train/data-00000-of-00001.arrow
+++ b/tests/data/lerobot/aloha_mobile_chair/train/data-00000-of-00001.arrow
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a08be578285cbe2d35b78f150d464ff3e10604a9865398c976983e0d711774f9
+oid sha256:98859f2d87e1a0abb9a930a82af623504b3efb26f70fe576f05bab7f19024427
 size 788528
--- a/tests/data/lerobot/aloha_mobile_chair/train/state.json
+++ b/tests/data/lerobot/aloha_mobile_chair/train/state.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34e36233477c8aa0b0840314ddace072062d4f486d06546bbd6550832c370065
+oid sha256:38cf4116a65cb92a5c43f9b9da7a7b81cfa9168b17605c8c456f7d3a3a23b77a
 size 247
--- a/tests/data/lerobot/aloha_mobile_chair/videos/observation.images.cam_high_episode_000000.mp4
+++ b/tests/data/lerobot/aloha_mobile_chair/videos/observation.images.cam_high_episode_000000.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:66e7349a4a82ca6042a7189608d01eb1cfa38d100d039b5445ae1a9e65d824ab
-size 14470946
+oid sha256:596dda720d378a44b6b61a6a72b44bec3e55e85198bca37f9dace6fe84af7ff0
+size 16062396
--- a/tests/data/lerobot/aloha_mobile_chair/videos/observation.images.cam_left_wrist_episode_000000.mp4
+++ b/tests/data/lerobot/aloha_mobile_chair/videos/observation.images.cam_left_wrist_episode_000000.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a2146f0c10c9f2611e57e617983aa4f91ad681b4fc50d91b992b97abd684f926
-size 11662185
+oid sha256:c614bbaf93d65354a82001b357682a0bd36f9603685f6c735c5e377b763d0bdb
+size 10317415
--- a/tests/data/lerobot/aloha_mobile_chair/videos/observation.images.cam_right_wrist_episode_000000.mp4
+++ b/tests/data/lerobot/aloha_mobile_chair/videos/observation.images.cam_right_wrist_episode_000000.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5affbaf1c48895ba3c626e0d8cf1309e5f4ec6bbaa135313096f52a22de66c05
-size 11410342
+oid sha256:868788028a38334b6b566cb17ffcc2ace2ec2b2b68ff2a58b6d29eb3c3e2ec1f
+size 9516445
--- a/tests/data/lerobot/aloha_mobile_elevator/meta_data/episode_data_index.safetensors
+++ b/tests/data/lerobot/aloha_mobile_elevator/meta_data/episode_data_index.safetensors
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6c2b195ca91b88fd16422128d386d2cabd808a1862c6d127e6bf2e83e1fe819a
-size 448
+oid sha256:f365a02b052a2697b1558f4ab9b813f0d4ba46a5bc6ae3da30bbc4b135426aa6
+size 136
--- a/tests/data/lerobot/aloha_mobile_elevator/meta_data/info.json
+++ b/tests/data/lerobot/aloha_mobile_elevator/meta_data/info.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf148247bf191c7f7e8af738a7b9e147f9ffffeec0e4b9d1c4783c4e384da7eb
-size 33
+oid sha256:50e40e4c2bb523fca0b54e9a9635281312e9c6f9d757db03c06a0865c5508f29
+size 188
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Michel Aractingi	d5fb8e9802	updated params	2024-09-02 06:34:24 +00:00
Michel Aractingi	53d67bb5b7	First commit of tdmpc2 taken from NHansen code	2024-08-29 12:48:01 +00:00
Michel Aractingi	eb4c505cff	Support for converting OpenX datasets from RLDS format to LeRobotDataset (#354 ) Signed-off-by: youliangtan <tan_you_liang@hotmail.com> Co-authored-by: Simon Alibert <alibert.sim@gmail.com> Co-authored-by: youliangtan <tan_you_liang@hotmail.com> Co-authored-by: Remi <re.cadene@gmail.com>	2024-08-27 09:07:00 +02:00
Mishig	aad59e6b6b	Fix videos in visualize_dataset are not in sync (#382 )	2024-08-26 17:38:48 +02:00
Alexander Soare	9ce98bb93c	Add safety limits on relative action target (#373 )	2024-08-26 14:30:18 +01:00
Alexander Soare	97086cdcdf	Make gripper_open_degree a config param (#379 )	2024-08-26 12:28:16 +01:00
Alexander Soare	9c7649f140	Make sure `init_hydra_config` does not require any keys (#376 )	2024-08-23 12:27:08 +01:00
Zhuoheng Li	a2592a5563	Provide more information to the user (#358 ) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> Co-authored-by: Remi <re.cadene@gmail.com>	2024-08-23 11:00:35 +01:00
ellacroix	b5ad79a7d3	Fix typo in tutorial (#371 )	2024-08-21 14:14:01 +02:00
Remi	996468bcce	Update README.md	2024-08-20 16:45:57 +02:00
Remi	f98200297d	Slightly improve tutorial and README (#370 ) Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>	2024-08-20 16:41:39 +02:00
NielsRogge	86bbd16d43	Improve discoverability on the hub (#325 ) Co-authored-by: Lucain <lucainp@gmail.com> Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>	2024-08-19 15:16:46 +02:00
Alexander Soare	0f6e0f6d74	Fix input dim (#365 )	2024-08-19 11:42:32 +01:00
Remi	fc3e545e03	Update README.md	2024-08-19 11:14:10 +02:00
Simon Alibert	b98ea415c1	Add dataset cards (#363 )	2024-08-16 10:08:44 +02:00
Remi	bbe9057225	Improve control robot ; Add process to configure motor indices (#326 ) Co-authored-by: Simon Alibert <alibert.sim@gmail.com> Co-authored-by: jess-moss <jess.moss@dextrousrobotics.com> Co-authored-by: Marina Barannikov <marina.barannikov@huggingface.co> Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>	2024-08-15 18:11:33 +02:00
Alexander Soare	8c4643687c	fix bug in example 2 (#361 )	2024-08-15 13:59:47 +01:00
Julien Perez	fab037f78d	feat for the GPU poors : Add GPU availability check in evaluate_pretr… (#359 ) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>	2024-08-13 16:03:05 +01:00
Simon Alibert	03d647269e	Fix CI builds (#357 )	2024-08-12 17:57:03 +02:00
Remi	2252b42337	Add visualize_dataset_html with `http.server` (#188 )	2024-08-08 20:19:06 +03:00
Adrien	bc6384bb80	fix ci (#351 ) Signed-off-by: Adrien <adrien@huggingface.co>	2024-08-05 16:12:26 +02:00
resolver101757	8df7e63d61	Update README for cross-platform installation compatibility (#347 )	2024-07-30 00:48:41 +02:00
Halvard Bariller	7a3cb1ad34	Adjust the timestamps' description in Diffusion Policy (#343 ) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>	2024-07-26 12:47:03 +01:00
Alexander Soare	f8a6574698	Add online training with TD-MPC as proof of concept (#338 )	2024-07-25 11:16:38 +01:00
Alexander Soare	abbb1d2367	Make sure policies don't mutate the batch (#323 )	2024-07-22 20:38:33 +01:00
Simon Alibert	0b21210d72	Convert datasets to av1 encoding (#302 )	2024-07-22 20:08:59 +02:00
Simon Alibert	461d5472d3	Fix visualize_image_transforms (#333 )	2024-07-18 22:26:00 +02:00
Simon Alibert	c75ea789a8	Detect secrets in pre-commit (#332 )	2024-07-18 19:39:15 +02:00
Simon Alibert	ee200e86cb	Ensure no upper bound constraints on dependencies (#327 )	2024-07-18 12:07:15 +02:00