Disconnect after scan_port

Remove comment
Refactor feetech _broadcast_ping
2025-06-04 17:12:30 +02:00 · 2025-06-04 16:59:44 +02:00 · 2025-06-04 16:41:33 +02:00 · 2025-06-04 14:54:29 +02:00 · 2025-06-04 14:27:57 +02:00 · 2025-06-04 13:16:41 +02:00
938 changed files with 40771 additions and 14060 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,17 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Misc
 .git
 tmp
@@ -59,13 +73,12 @@ pip-log.txt
 pip-delete-this-directory.txt

 # Unit test / coverage reports
-!tests/data
+!tests/artifacts
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
-.cache
 nosetests.xml
 coverage.xml
 *.cover
@@ -73,6 +86,11 @@ coverage.xml
 .hypothesis/
 .pytest_cache/

+# Ignore .cache except calibration
+.cache/*
+!.cache/calibration/
+!.cache/calibration/**
+
 # Translations
 *.mo
 *.pot
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,6 +1,21 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 *.memmap filter=lfs diff=lfs merge=lfs -text
 *.stl filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
-*.json filter=lfs diff=lfs merge=lfs -text
+*.json !text !filter !merge !diff
+tests/artifacts/cameras/*.png filter=lfs diff=lfs merge=lfs -text
+*.bag filter=lfs diff=lfs merge=lfs -text
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -1,3 +1,17 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 name: "\U0001F41B Bug Report"
 description: Submit a bug report to help us improve LeRobot
 body:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -21,7 +21,7 @@ Provide a simple way for the reviewer to try out your changes.

 Examples:
 ```bash
-DATA_DIR=tests/data pytest -sx tests/test_stuff.py::test_something
+pytest -sx tests/test_stuff.py::test_something
 ```
 ```bash
 python lerobot/scripts/train.py --some.option=true
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -1,3 +1,17 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Inspired by
 # https://github.com/huggingface/peft/blob/main/.github/workflows/build_docker_images.yml
 name: Builds
@@ -8,43 +22,42 @@ on:
  schedule:
    - cron: "0 1 * * *"

+permissions: {}
+
 env:
  PYTHON_VERSION: "3.10"

 jobs:
  latest-cpu:
    name: CPU
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    steps:
-      - name: Cleanup disk
+      - name: Install Git LFS
        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
+          sudo apt-get update
+          sudo apt-get install git-lfs
+          git lfs install

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
+        with:
+          cache-binary: false

      - name: Check out code
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          lfs: true
+          persist-credentials: false

      - name: Login to DockerHub
-        uses: docker/login-action@v3
+        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push CPU
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 # v5.4.0
        with:
          context: .
          file: ./docker/lerobot-cpu/Dockerfile
@@ -55,36 +68,34 @@ jobs:

  latest-cuda:
    name: GPU
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    steps:
-      - name: Cleanup disk
+      - name: Install Git LFS
        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
+          sudo apt-get update
+          sudo apt-get install git-lfs
+          git lfs install
+
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
+        with:
+          cache-binary: false

      - name: Check out code
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          lfs: true
+          persist-credentials: false

      - name: Login to DockerHub
-        uses: docker/login-action@v3
+        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push GPU
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 # v5.4.0
        with:
          context: .
          file: ./docker/lerobot-gpu/Dockerfile
@@ -95,34 +106,27 @@ jobs:

  latest-cuda-dev:
    name: GPU Dev
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    steps:
-      - name: Cleanup disk
-        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
+        with:
+          cache-binary: false

      - name: Check out code
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: false

      - name: Login to DockerHub
-        uses: docker/login-action@v3
+        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push GPU dev
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 # v5.4.0
        with:
          context: .
          file: ./docker/lerobot-gpu-dev/Dockerfile
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -0,0 +1,23 @@
+name: Build documentation
+
+on:
+  workflow_dispatch:
+  push:
+    paths:
+      - "docs/**"
+    branches:
+    - main
+    - doc-builder*
+    - v*-release
+
+
+jobs:
+  build:  # zizmor: ignore[excessive-permissions] We follow the same pattern as in Transformers
+    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+    with:
+      commit_sha: ${{ github.sha }}
+      package: lerobot
+      additional_args: --not_python_module
+    secrets:
+      token: ${{ secrets.HUGGINGFACE_PUSH }}
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -0,0 +1,19 @@
+name: Build PR Documentation
+
+on:
+  pull_request:
+    paths:
+      - "docs/**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:  # zizmor: ignore[excessive-permissions] We follow the same pattern as in Transformers
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    with:
+      commit_sha: ${{ github.event.pull_request.head.sha }}
+      pr_number: ${{ github.event.number }}
+      package: lerobot
+      additional_args: --not_python_module
--- a/.github/workflows/nightly-tests.yml
+++ b/.github/workflows/nightly-tests.yml
@@ -1,3 +1,17 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Inspired by
 # https://github.com/huggingface/peft/blob/main/.github/workflows/nightly.yml
 name: Nightly
@@ -7,18 +21,19 @@ on:
  schedule:
    - cron: "0 2 * * *"

-env:
-  DATA_DIR: tests/data
-  # SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
+permissions: {}

+# env:
+  # SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
 jobs:
  run_all_tests_cpu:
    name: CPU
    strategy:
      fail-fast: false
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
    container:
-      image: huggingface/lerobot-cpu:latest
+      image: huggingface/lerobot-cpu:latest  # zizmor: ignore[unpinned-images]
      options: --shm-size "16gb"
      credentials:
        username: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -29,13 +44,9 @@ jobs:
        working-directory: /lerobot
    steps:
      - name: Tests
-        env:
-          DATA_DIR: tests/data
        run: pytest -v --cov=./lerobot --disable-warnings tests

      - name: Tests end-to-end
-        env:
-          DATA_DIR: tests/data
        run: make test-end-to-end


@@ -43,12 +54,13 @@ jobs:
    name: GPU
    strategy:
      fail-fast: false
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0"
      TEST_TYPE: "single_gpu"
    container:
-      image: huggingface/lerobot-gpu:latest
+      image: huggingface/lerobot-gpu:latest  # zizmor: ignore[unpinned-images]
      options: --gpus all --shm-size "16gb"
      credentials:
        username: ${{ secrets.DOCKERHUB_USERNAME }}
--- a/.github/workflows/quality.yml
+++ b/.github/workflows/quality.yml
@@ -1,15 +1,29 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 name: Quality

 on:
  workflow_dispatch:
  workflow_call:
  pull_request:
-    branches:
-      - main
  push:
    branches:
      - main

+permissions: {}
+
 env:
  PYTHON_VERSION: "3.10"

@@ -19,10 +33,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: false

      - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
        with:
          python-version: ${{ env.PYTHON_VERSION }}

@@ -30,27 +46,27 @@ jobs:
        id: get-ruff-version
        run: |
          RUFF_VERSION=$(awk '/repo: https:\/\/github.com\/astral-sh\/ruff-pre-commit/{flag=1;next}/rev:/{if(flag){print $2;exit}}' .pre-commit-config.yaml)
-          echo "RUFF_VERSION=${RUFF_VERSION}" >> $GITHUB_ENV
+          echo "ruff_version=${RUFF_VERSION}" >> $GITHUB_OUTPUT

      - name: Install Ruff
-        run: python -m pip install "ruff==${{ env.RUFF_VERSION }}"
+        env:
+          RUFF_VERSION: ${{ steps.get-ruff-version.outputs.ruff_version }}
+        run: python -m pip install "ruff==${RUFF_VERSION}"

      - name: Ruff check
-        run: ruff check
+        run: ruff check --output-format=github

      - name: Ruff format
        run: ruff format --diff

-
-  poetry_check:
-    name: Poetry check
+  typos:
+    name: Typos
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: false

-      - name: Install poetry
-        run: pipx install poetry
-
-      - name: Poetry check
-        run: poetry check
+      - name: typos-action
+        uses: crate-ci/typos@db35ee91e80fbb447f33b0e5fbddb24d2a1a884f # v1.29.10
--- a/.github/workflows/test-docker-build.yml
+++ b/.github/workflows/test-docker-build.yml
@@ -1,15 +1,29 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Inspired by
 # https://github.com/huggingface/peft/blob/main/.github/workflows/test-docker-build.yml
 name: Test Dockerfiles

 on:
  pull_request:
-    branches:
-      - main
    paths:
      # Run only when DockerFile files are modified
      - "docker/**"

+permissions: {}
+
 env:
  PYTHON_VERSION: "3.10"

@@ -21,55 +35,46 @@ jobs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Check out code
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: false

      - name: Get changed files
        id: changed-files
-        uses: tj-actions/changed-files@v44
+        uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 # v42
        with:
          files: docker/**
          json: "true"

-      - name: Run step if only the files listed above change
+      - name: Run step if only the files listed above change  # zizmor: ignore[template-injection]
        if: steps.changed-files.outputs.any_changed == 'true'
        id: set-matrix
-        env:
-          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        run: |
          echo "matrix=${{ steps.changed-files.outputs.all_changed_files}}" >> $GITHUB_OUTPUT

-
  build_modified_dockerfiles:
    name: Build modified Docker images
    needs: get_changed_files
-    runs-on: ubuntu-latest
-    if: ${{ needs.get_changed_files.outputs.matrix }} != ''
+    runs-on:
+      group: aws-general-8-plus
+    if: needs.get_changed_files.outputs.matrix != ''
    strategy:
      fail-fast: false
      matrix:
        docker-file: ${{ fromJson(needs.get_changed_files.outputs.matrix) }}
    steps:
-      - name: Cleanup disk
-        run: |
-          sudo df -h
-          # sudo ls -l /usr/local/lib/
-          # sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo df -h
-
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
+        with:
+          cache-binary: false

      - name: Check out code
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: false

      - name: Build Docker image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 # v5.4.0
        with:
          file: ${{ matrix.docker-file }}
          context: .
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,15 +1,30 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 name: Tests

 on:
  pull_request:
-    branches:
-      - main
    paths:
      - "lerobot/**"
      - "tests/**"
      - "examples/**"
      - ".github/**"
-      - "poetry.lock"
+      - "pyproject.toml"
+      - ".pre-commit-config.yaml"
+      - "Makefile"
+      - ".cache/**"
  push:
    branches:
      - main
@@ -18,109 +33,116 @@ on:
      - "tests/**"
      - "examples/**"
      - ".github/**"
-      - "poetry.lock"
+      - "pyproject.toml"
+      - ".pre-commit-config.yaml"
+      - "Makefile"
+      - ".cache/**"
+
+permissions: {}
+
+env:
+  UV_VERSION: "0.6.0"

 jobs:
  pytest:
    name: Pytest
    runs-on: ubuntu-latest
    env:
-      DATA_DIR: tests/data
      MUJOCO_GL: egl
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          lfs: true  # Ensure LFS files are pulled
+          persist-credentials: false

-      - name: Install EGL
-        run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
-
-      - name: Install poetry
+      - name: Install apt dependencies
+      # portaudio19-dev is needed to install pyaudio
        run: |
-          pipx install poetry && poetry config virtualenvs.in-project true
-          echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
+          sudo apt-get update && \
+          sudo apt-get install -y libegl1-mesa-dev ffmpeg portaudio19-dev

-      - name: Set up Python 3.10
-        uses: actions/setup-python@v5
+      - name: Install uv and python
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
        with:
+          enable-cache: true
+          version: ${{ env.UV_VERSION }}
          python-version: "3.10"
-          cache: "poetry"

-      - name: Install poetry dependencies
-        run: |
-          poetry install --all-extras
+      - name: Install lerobot (all extras)
+        run: uv sync --all-extras

      - name: Test with pytest
        run: |
-          pytest tests -v --cov=./lerobot --durations=0 \
+          uv run pytest tests -v --cov=./lerobot --durations=0 \
            -W ignore::DeprecationWarning:imageio_ffmpeg._utils:7 \
            -W ignore::UserWarning:torch.utils.data.dataloader:558 \
            -W ignore::UserWarning:gymnasium.utils.env_checker:247 \
            && rm -rf tests/outputs outputs

-
  pytest-minimal:
    name: Pytest (minimal install)
    runs-on: ubuntu-latest
    env:
-      DATA_DIR: tests/data
      MUJOCO_GL: egl
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          lfs: true  # Ensure LFS files are pulled
+          persist-credentials: false

-      - name: Install poetry
-        run: |
-          pipx install poetry && poetry config virtualenvs.in-project true
-          echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
+      - name: Install apt dependencies
+        run: sudo apt-get update && sudo apt-get install -y ffmpeg

-      - name: Set up Python 3.10
-        uses: actions/setup-python@v5
+      - name: Install uv and python
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
        with:
+          enable-cache: true
+          version: ${{ env.UV_VERSION }}
          python-version: "3.10"

-      - name: Install poetry dependencies
-        run: |
-          poetry install --extras "test"
+      - name: Install lerobot
+        run: uv sync --extra "test"

      - name: Test with pytest
        run: |
-          pytest tests -v --cov=./lerobot --durations=0 \
+          uv run pytest tests -v --cov=./lerobot --durations=0 \
            -W ignore::DeprecationWarning:imageio_ffmpeg._utils:7 \
            -W ignore::UserWarning:torch.utils.data.dataloader:558 \
            -W ignore::UserWarning:gymnasium.utils.env_checker:247 \
            && rm -rf tests/outputs outputs

-
  end-to-end:
    name: End-to-end
    runs-on: ubuntu-latest
    env:
-      DATA_DIR: tests/data
      MUJOCO_GL: egl
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          lfs: true  # Ensure LFS files are pulled
+          persist-credentials: false

-      - name: Install EGL
-        run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
-
-      - name: Install poetry
+      - name: Install apt dependencies
+      # portaudio19-dev is needed to install pyaudio
        run: |
-          pipx install poetry && poetry config virtualenvs.in-project true
-          echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
+          sudo apt-get update && \
+          sudo apt-get install -y libegl1-mesa-dev ffmpeg portaudio19-dev

-      - name: Set up Python 3.10
-        uses: actions/setup-python@v5
+      - name: Install uv and python
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
        with:
+          enable-cache: true
+          version: ${{ env.UV_VERSION }}
          python-version: "3.10"
-          cache: "poetry"

-      - name: Install poetry dependencies
+      - name: Install lerobot (all extras)
        run: |
-          poetry install --all-extras
+          uv venv
+          uv sync --all-extras
+
+      - name: venv
+        run: |
+          echo "PYTHON_PATH=${{ github.workspace }}/.venv/bin/python" >> $GITHUB_ENV

      - name: Test end-to-end
        run: |
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -0,0 +1,35 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+on:
+  push:
+
+name: Secret Leaks
+
+permissions: {}
+
+jobs:
+  trufflehog:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        fetch-depth: 0
+        persist-credentials: false
+
+    - name: Secret Scanning
+      uses: trufflesecurity/trufflehog@90694bf9af66e7536abc5824e7a87246dbf933cb # v3.88.35
+      with:
+        extra_args: --only-verified
--- a/.github/workflows/upload_pr_documentation.yml
+++ b/.github/workflows/upload_pr_documentation.yml
@@ -0,0 +1,16 @@
+name: Upload PR Documentation
+
+on: # zizmor: ignore[dangerous-triggers] We follow the same pattern as in Transformers
+  workflow_run:
+    workflows: [ "Build PR Documentation" ]
+    types:
+    - completed
+
+jobs:
+  build:  # zizmor: ignore[excessive-permissions] We follow the same pattern as in Transformers
+    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
+    with:
+      package_name: lerobot
+    secrets:
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,20 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Dev scripts
+.dev
+
 # Logging
 logs
 tmp
@@ -49,6 +66,10 @@ share/python-wheels/
 *.egg
 MANIFEST

+# uv/poetry lock files
+poetry.lock
+uv.lock
+
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
@@ -60,13 +81,12 @@ pip-log.txt
 pip-delete-this-directory.txt

 # Unit test / coverage reports
-!tests/data
+!tests/artifacts
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
-.cache
 nosetests.xml
 coverage.xml
 *.cover
@@ -74,6 +94,9 @@ coverage.xml
 .hypothesis/
 .pytest_cache/

+# Ignore .cache
+.cache/*
+
 # Translations
 *.mo
 *.pot
@@ -123,7 +146,6 @@ celerybeat.pid
 .venv
 env/
 venv/
-ENV/
 env.bak/
 venv.bak/

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,9 +1,31 @@
-exclude: ^(tests/data)
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exclude: "tests/artifacts/.*\\.safetensors$"
 default_language_version:
    python: python3.10
 repos:
+  ##### Meta #####
+  - repo: meta
+    hooks:
+      - id: check-useless-excludes
+      - id: check-hooks-apply
+
+
+  ##### Style / Misc. #####
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
    hooks:
      - id: check-added-large-files
      - id: debug-statements
@@ -13,21 +35,40 @@ repos:
      - id: check-toml
      - id: end-of-file-fixer
      - id: trailing-whitespace
+
+  - repo: https://github.com/adhtruong/mirrors-typos
+    rev: v1.32.0
+    hooks:
+      - id: typos
+        args: [--force-exclude]
+
  - repo: https://github.com/asottile/pyupgrade
-    rev: v3.15.2
+    rev: v3.20.0
    hooks:
    -   id: pyupgrade
+
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.11.11
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
-  - repo: https://github.com/python-poetry/poetry
-    rev: 1.8.0
+
+
+  ##### Security #####
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.26.0
    hooks:
-      - id: poetry-check
-      - id: poetry-lock
-        args:
-          - "--check"
-          - "--no-update"
+      - id: gitleaks
+
+  - repo: https://github.com/woodruffw/zizmor-pre-commit
+    rev: v1.8.0
+    hooks:
+      - id: zizmor
+
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.8.3
+    hooks:
+    - id: bandit
+      args: ["-c", "pyproject.toml"]
+      additional_dependencies: ["bandit[toml]"]
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,7 +20,7 @@ Some of the ways you can contribute to 🤗 LeRobot:
 * Contributing to the examples or to the documentation.
 * Submitting issues related to bugs or desired new features.

-Following the guides below, feel free to open issues and PRs and to coordinate your efforts with the community on our [Discord Channel](https://discord.gg/VjFz58wn3R). For specific inquiries, reach out to [Remi Cadene](remi.cadene@huggingface.co).
+Following the guides below, feel free to open issues and PRs and to coordinate your efforts with the community on our [Discord Channel](https://discord.gg/VjFz58wn3R). For specific inquiries, reach out to [Remi Cadene](mailto:remi.cadene@huggingface.co).

 If you are not sure how to contribute or want to know the next features we working on, look on this project page: [LeRobot TODO](https://github.com/orgs/huggingface/projects/46)

@@ -129,38 +129,71 @@ Follow these steps to start contributing:

   🚨 **Do not** work on the `main` branch.

-4. for development, we use `poetry` instead of just `pip` to easily track our dependencies.
-   If you don't have it already, follow the [instructions](https://python-poetry.org/docs/#installation) to install it.
+4. For development, we advise using a tool like `poetry` or `uv` instead of just `pip` to easily track our dependencies.
+   Follow the instructions to [install poetry](https://python-poetry.org/docs/#installation) (use a version >=2.1.0) or to [install uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) if you don't have one of them already.

   Set up a development environment with conda or miniconda:
   ```bash
   conda create -y -n lerobot-dev python=3.10 && conda activate lerobot-dev
   ```

-   To develop on 🤗 LeRobot, you will at least need to install the `dev` and `test` extras dependencies along with the core library:
+   If you're using `uv`, it can manage Python versions, so you can instead run:
   ```bash
-   poetry install --sync --extras "dev test"
+   uv venv --python 3.10 && source .venv/bin/activate
+   ```
+
+   To develop on 🤗 LeRobot, you will at least need to install the `dev` and `test` extras dependencies along with the core library:
+
+   using `poetry`
+   ```bash
+   poetry sync --extras "dev test"
+   ```
+
+   using `uv`
+   ```bash
+   uv sync --extra dev --extra test
   ```

   You can also install the project with all its dependencies (including environments):
+
+   using `poetry`
   ```bash
-   poetry install --sync --all-extras
+   poetry sync --all-extras
+   ```
+
+   using `uv`
+   ```bash
+   uv sync --all-extras
   ```

   > **Note:** If you don't install simulation environments with `--all-extras`, the tests that require them will be skipped when running the pytest suite locally. However, they *will* be tested in the CI. In general, we advise you to install everything and test locally before pushing.

-   Whichever command you chose to install the project (e.g. `poetry install --sync --all-extras`), you should run it again when pulling code with an updated version of `pyproject.toml` and `poetry.lock` in order to synchronize your virtual environment with the new dependencies.
+   Whichever command you chose to install the project (e.g. `poetry sync --all-extras`), you should run it again when pulling code with an updated version of `pyproject.toml` and `poetry.lock` in order to synchronize your virtual environment with the new dependencies.

   The equivalent of `pip install some-package`, would just be:
+
+   using `poetry`
   ```bash
   poetry add some-package
   ```

-   When making changes to the poetry sections of the `pyproject.toml`, you should run the following command to lock dependencies.
+   using `uv`
   ```bash
-   poetry lock --no-update
+   uv add some-package
   ```

+   When making changes to the dependency sections of the `pyproject.toml`, you should run one of the following commands to lock dependencies.
+   using `poetry`
+   ```bash
+   poetry lock
+   ```
+
+   using `uv`
+   ```bash
+   uv lock
+   ```
+
+
 5. Develop the features on your branch.

   As you work on the features, you should make sure that the test suite
@@ -195,7 +228,7 @@ Follow these steps to start contributing:
   git commit
   ```

-   Note, if you already commited some changes that have a wrong formatting, you can use:
+   Note, if you already committed some changes that have a wrong formatting, you can use:
   ```bash
   pre-commit run --all-files
   ```
@@ -236,9 +269,6 @@ Follow these steps to start contributing:
   the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate
   it from PRs ready to be merged;
 4. Make sure existing tests pass;
-<!-- 5. Add high-coverage tests. No quality testing = no merge.
-
-See an example of a good PR here: https://github.com/huggingface/lerobot/pull/ -->

 ### Tests

@@ -258,7 +288,7 @@ sudo apt-get install git-lfs
 git lfs install
 ```

-Pull artifacts if they're not in [tests/data](tests/data)
+Pull artifacts if they're not in [tests/artifacts](tests/artifacts)
 ```bash
 git lfs pull
 ```
@@ -267,7 +297,7 @@ We use `pytest` in order to run the tests. From the root of the
 repository, here's how to run tests with `pytest` for the library:

 ```bash
-DATA_DIR="tests/data" python -m pytest -sv ./tests
+python -m pytest -sv ./tests
 ```


--- a/223
+++ b/223
@@ -1,11 +1,25 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 .PHONY: tests

 PYTHON_PATH := $(shell which python)

-# If Poetry is installed, redefine PYTHON_PATH to use the Poetry-managed Python
-POETRY_CHECK := $(shell command -v poetry)
-ifneq ($(POETRY_CHECK),)
-    PYTHON_PATH := $(shell poetry run which python)
+# If uv is installed and .venv/bin/python exists, use that interpreter's path; otherwise keep the `which python` result (never execute the interpreter here)
+UV_CHECK := $(shell command -v uv)
+ifneq ($(UV_CHECK),)
+    PYTHON_PATH := $(or $(abspath $(wildcard .venv/bin/python)),$(PYTHON_PATH))
 endif

 export PATH := $(dir $(PYTHON_PATH)):$(PATH)
@@ -20,144 +34,109 @@ build-gpu:

 test-end-to-end:
 	${MAKE} DEVICE=$(DEVICE) test-act-ete-train
+	${MAKE} DEVICE=$(DEVICE) test-act-ete-train-resume
 	${MAKE} DEVICE=$(DEVICE) test-act-ete-eval
-	${MAKE} DEVICE=$(DEVICE) test-act-ete-train-amp
-	${MAKE} DEVICE=$(DEVICE) test-act-ete-eval-amp
 	${MAKE} DEVICE=$(DEVICE) test-diffusion-ete-train
 	${MAKE} DEVICE=$(DEVICE) test-diffusion-ete-eval
 	${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-train
 	${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-eval
-	${MAKE} DEVICE=$(DEVICE) test-default-ete-eval
-	${MAKE} DEVICE=$(DEVICE) test-act-pusht-tutorial

 test-act-ete-train:
 	python lerobot/scripts/train.py \
-		policy=act \
-		policy.dim_model=64 \
-		env=aloha \
-		wandb.enable=False \
-		training.offline_steps=2 \
-		training.online_steps=0 \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		device=$(DEVICE) \
-		training.save_checkpoint=true \
-		training.save_freq=2 \
-		policy.n_action_steps=20 \
-		policy.chunk_size=20 \
-		training.batch_size=2 \
-		hydra.run.dir=tests/outputs/act/
+		--policy.type=act \
+		--policy.dim_model=64 \
+		--policy.n_action_steps=20 \
+		--policy.chunk_size=20 \
+		--policy.device=$(DEVICE) \
+		--env.type=aloha \
+		--env.episode_length=5 \
+		--dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
+		--dataset.image_transforms.enable=true \
+		--dataset.episodes="[0]" \
+		--batch_size=2 \
+		--steps=4 \
+		--eval_freq=2 \
+		--eval.n_episodes=1 \
+		--eval.batch_size=1 \
+		--save_freq=2 \
+		--save_checkpoint=true \
+		--log_freq=1 \
+		--wandb.enable=false \
+		--output_dir=tests/outputs/act/
+
+test-act-ete-train-resume:
+	python lerobot/scripts/train.py \
+		--config_path=tests/outputs/act/checkpoints/000002/pretrained_model/train_config.json \
+		--resume=true

 test-act-ete-eval:
 	python lerobot/scripts/eval.py \
-		-p tests/outputs/act/checkpoints/000002/pretrained_model \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		env.episode_length=8 \
-		device=$(DEVICE) \
-
-test-act-ete-train-amp:
-	python lerobot/scripts/train.py \
-		policy=act \
-		policy.dim_model=64 \
-		env=aloha \
-		wandb.enable=False \
-		training.offline_steps=2 \
-		training.online_steps=0 \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		device=$(DEVICE) \
-		training.save_checkpoint=true \
-		training.save_freq=2 \
-		policy.n_action_steps=20 \
-		policy.chunk_size=20 \
-		training.batch_size=2 \
-		hydra.run.dir=tests/outputs/act_amp/ \
-		use_amp=true
-
-test-act-ete-eval-amp:
-	python lerobot/scripts/eval.py \
-		-p tests/outputs/act_amp/checkpoints/000002/pretrained_model \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		env.episode_length=8 \
-		device=$(DEVICE) \
-		use_amp=true
+		--policy.path=tests/outputs/act/checkpoints/000004/pretrained_model \
+		--policy.device=$(DEVICE) \
+		--env.type=aloha \
+		--env.episode_length=5 \
+		--eval.n_episodes=1 \
+		--eval.batch_size=1

 test-diffusion-ete-train:
 	python lerobot/scripts/train.py \
-		policy=diffusion \
-		policy.down_dims=\[64,128,256\] \
-		policy.diffusion_step_embed_dim=32 \
-		policy.num_inference_steps=10 \
-		env=pusht \
-		wandb.enable=False \
-		training.offline_steps=2 \
-		training.online_steps=0 \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		device=$(DEVICE) \
-		training.save_checkpoint=true \
-		training.save_freq=2 \
-		training.batch_size=2 \
-		hydra.run.dir=tests/outputs/diffusion/
+		--policy.type=diffusion \
+		--policy.down_dims='[64,128,256]' \
+		--policy.diffusion_step_embed_dim=32 \
+		--policy.num_inference_steps=10 \
+		--policy.device=$(DEVICE) \
+		--env.type=pusht \
+		--env.episode_length=5 \
+		--dataset.repo_id=lerobot/pusht \
+		--dataset.image_transforms.enable=true \
+		--dataset.episodes="[0]" \
+		--batch_size=2 \
+		--steps=2 \
+		--eval_freq=2 \
+		--eval.n_episodes=1 \
+		--eval.batch_size=1 \
+		--save_checkpoint=true \
+		--save_freq=2 \
+		--log_freq=1 \
+		--wandb.enable=false \
+		--output_dir=tests/outputs/diffusion/

 test-diffusion-ete-eval:
 	python lerobot/scripts/eval.py \
-		-p tests/outputs/diffusion/checkpoints/000002/pretrained_model \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		env.episode_length=8 \
-		device=$(DEVICE) \
+		--policy.path=tests/outputs/diffusion/checkpoints/000002/pretrained_model \
+		--policy.device=$(DEVICE) \
+		--env.type=pusht \
+		--env.episode_length=5 \
+		--eval.n_episodes=1 \
+		--eval.batch_size=1

-# TODO(alexander-soare): Restore online_steps to 2 when it is reinstated.
 test-tdmpc-ete-train:
 	python lerobot/scripts/train.py \
-		policy=tdmpc \
-		env=xarm \
-		env.task=XarmLift-v0 \
-		dataset_repo_id=lerobot/xarm_lift_medium \
-		wandb.enable=False \
-		training.offline_steps=2 \
-		training.online_steps=0 \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		env.episode_length=2 \
-		device=$(DEVICE) \
-		training.save_checkpoint=true \
-		training.save_freq=2 \
-		training.batch_size=2 \
-		hydra.run.dir=tests/outputs/tdmpc/
+		--policy.type=tdmpc \
+		--policy.device=$(DEVICE) \
+		--env.type=xarm \
+		--env.task=XarmLift-v0 \
+		--env.episode_length=5 \
+		--dataset.repo_id=lerobot/xarm_lift_medium \
+		--dataset.image_transforms.enable=true \
+		--dataset.episodes="[0]" \
+		--batch_size=2 \
+		--steps=2 \
+		--eval_freq=2 \
+		--eval.n_episodes=1 \
+		--eval.batch_size=1 \
+		--save_checkpoint=true \
+		--save_freq=2 \
+		--log_freq=1 \
+		--wandb.enable=false \
+		--output_dir=tests/outputs/tdmpc/

 test-tdmpc-ete-eval:
 	python lerobot/scripts/eval.py \
-		-p tests/outputs/tdmpc/checkpoints/000002/pretrained_model \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		env.episode_length=8 \
-		device=$(DEVICE) \
-
-test-default-ete-eval:
-	python lerobot/scripts/eval.py \
-		--config lerobot/configs/default.yaml \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		env.episode_length=8 \
-		device=$(DEVICE) \
-
-test-act-pusht-tutorial:
-	cp examples/advanced/1_train_act_pusht/act_pusht.yaml lerobot/configs/policy/created_by_Makefile.yaml
-	python lerobot/scripts/train.py \
-		policy=created_by_Makefile.yaml \
-		env=pusht \
-		wandb.enable=False \
-		training.offline_steps=2 \
-		eval.n_episodes=1 \
-		eval.batch_size=1 \
-		env.episode_length=2 \
-		device=$(DEVICE) \
-		training.save_model=true \
-		training.save_freq=2 \
-		training.batch_size=2 \
-		hydra.run.dir=tests/outputs/act_pusht/
-	rm lerobot/configs/policy/created_by_Makefile.yaml
+		--policy.path=tests/outputs/tdmpc/checkpoints/000002/pretrained_model \
+		--policy.device=$(DEVICE) \
+		--env.type=xarm \
+		--env.episode_length=5 \
+		--env.task=XarmLift-v0 \
+		--eval.n_episodes=1 \
+		--eval.batch_size=1
--- a/README.md
+++ b/README.md
@@ -22,13 +22,49 @@

 </div>

+<h2 align="center">
+    <p><a href="https://github.com/huggingface/lerobot/blob/main/examples/12_use_so101.md">
+        Build Your Own SO-101 Robot!</a></p>
+</h2>
+
+<div align="center">
+  <div style="display: flex; gap: 1rem; justify-content: center; align-items: center;" >
+    <img
+      src="media/so101/so101.webp?raw=true"
+      alt="SO-101 follower arm"
+      title="SO-101 follower arm"
+      style="width: 40%;"
+    />
+    <img
+      src="media/so101/so101-leader.webp?raw=true"
+      alt="SO-101 leader arm"
+      title="SO-101 leader arm"
+      style="width: 40%;"
+    />
+  </div>
+
+
+  <p><strong>Meet the updated SO100, the SO-101 – Just €114 per arm!</strong></p>
+  <p>Train it in minutes with a few simple moves on your laptop.</p>
+  <p>Then sit back and watch your creation act autonomously! 🤯</p>
+
+  <p><a href="https://github.com/huggingface/lerobot/blob/main/examples/12_use_so101.md">
+      See the full SO-101 tutorial here.</a></p>
+
+  <p>Want to take it to the next level? Make your SO-101 mobile by building LeKiwi!</p>
+  <p>Check out the <a href="https://github.com/huggingface/lerobot/blob/main/examples/11_use_lekiwi.md">LeKiwi tutorial</a> and bring your robot to life on wheels.</p>
+
+  <img src="media/lekiwi/kiwi.webp?raw=true" alt="LeKiwi mobile robot" title="LeKiwi mobile robot" width="50%">
+</div>
+
+<br/>
+
 <h3 align="center">
-    <p>State-of-the-art Machine Learning for real-world robotics</p>
+    <p>LeRobot: State-of-the-art AI for real-world robotics</p>
 </h3>

 ---

-
 🤗 LeRobot aims to provide models, datasets, and tools for real-world robotics in PyTorch. The goal is to lower the barrier to entry to robotics so that everyone can contribute and benefit from sharing datasets and pretrained models.

 🤗 LeRobot contains state-of-the-art approaches that have been shown to transfer to the real-world with a focus on imitation learning and reinforcement learning.
@@ -41,9 +77,9 @@

 <table>
  <tr>
-    <td><img src="http://remicadene.com/assets/gif/aloha_act.gif" width="100%" alt="ACT policy on ALOHA env"/></td>
-    <td><img src="http://remicadene.com/assets/gif/simxarm_tdmpc.gif" width="100%" alt="TDMPC policy on SimXArm env"/></td>
-    <td><img src="http://remicadene.com/assets/gif/pusht_diffusion.gif" width="100%" alt="Diffusion policy on PushT env"/></td>
+    <td><img src="media/gym/aloha_act.gif" width="100%" alt="ACT policy on ALOHA env"/></td>
+    <td><img src="media/gym/simxarm_tdmpc.gif" width="100%" alt="TDMPC policy on SimXArm env"/></td>
+    <td><img src="media/gym/pusht_diffusion.gif" width="100%" alt="Diffusion policy on PushT env"/></td>
  </tr>
  <tr>
    <td align="center">ACT policy on ALOHA env</td>
@@ -54,32 +90,46 @@

 ### Acknowledgment

- Thanks to Tony Zaho, Zipeng Fu and colleagues for open sourcing ACT policy, ALOHA environments and datasets. Ours are adapted from [ALOHA](https://tonyzhaozh.github.io/aloha) and [Mobile ALOHA](https://mobile-aloha.github.io).
+- Thanks to Tony Zhao, Zipeng Fu and colleagues for open sourcing ACT policy, ALOHA environments and datasets. Ours are adapted from [ALOHA](https://tonyzhaozh.github.io/aloha) and [Mobile ALOHA](https://mobile-aloha.github.io).
 - Thanks to Cheng Chi, Zhenjia Xu and colleagues for open sourcing Diffusion policy, Pusht environment and datasets, as well as UMI datasets. Ours are adapted from [Diffusion Policy](https://diffusion-policy.cs.columbia.edu) and [UMI Gripper](https://umi-gripper.github.io).
 - Thanks to Nicklas Hansen, Yunhai Feng and colleagues for open sourcing TDMPC policy, Simxarm environments and datasets. Ours are adapted from [TDMPC](https://github.com/nicklashansen/tdmpc) and [FOWM](https://www.yunhaifeng.com/FOWM).
 - Thanks to Antonio Loquercio and Ashish Kumar for their early support.
+- Thanks to [Seungjae (Jay) Lee](https://sjlee.cc/), [Mahi Shafiullah](https://mahis.life/) and colleagues for open sourcing [VQ-BeT](https://sjlee.cc/vq-bet/) policy and helping us adapt the codebase to our repository. The policy is adapted from [VQ-BeT repo](https://github.com/jayLEE0301/vq_bet_official).


 ## Installation

 Download our source code:
 ```bash
-git clone https://github.com/huggingface/lerobot.git && cd lerobot
+git clone https://github.com/huggingface/lerobot.git
+cd lerobot
 ```

 Create a virtual environment with Python 3.10 and activate it, e.g. with [`miniconda`](https://docs.anaconda.com/free/miniconda/index.html):
 ```bash
-conda create -y -n lerobot python=3.10 && conda activate lerobot
+conda create -y -n lerobot python=3.10
+conda activate lerobot
 ```

+When using `miniconda`, install `ffmpeg` in your environment:
+```bash
+conda install ffmpeg -c conda-forge
+```
+
+> **NOTE:** This usually installs `ffmpeg 7.X` for your platform compiled with the `libsvtav1` encoder. If `libsvtav1` is not supported (check supported encoders with `ffmpeg -encoders`), you can:
+>  - _[On any platform]_ Explicitly install `ffmpeg 7.X` using:
+>  ```bash
+>  conda install ffmpeg=7.1.1 -c conda-forge
+>  ```
+>  - _[On Linux only]_ Install [ffmpeg build dependencies](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#GettheDependencies) and [compile ffmpeg from source with libsvtav1](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#libsvtav1), and make sure the ffmpeg binary you use is the one from your install (check with `which ffmpeg`).
+
 Install 🤗 LeRobot:
 ```bash
-pip install .
+pip install -e .
 ```

-> **NOTE:** Depending on your platform, If you encounter any build errors during this step
-you may need to install `cmake` and `build-essential` for building some of our dependencies.
-On linux: `sudo apt-get install cmake build-essential`
+> **NOTE:** If you encounter build errors, you may need to install additional dependencies (`cmake`, `build-essential`, and `ffmpeg libs`). On Linux, run:
+`sudo apt-get install cmake build-essential python3-dev pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libswscale-dev libswresample-dev libavfilter-dev`. For other systems, see: [Compiling PyAV](https://pyav.org/docs/develop/overview/installation.html#bring-your-own-ffmpeg)

 For simulations, 🤗 LeRobot comes with gymnasium environments that can be installed as extras:
 - [aloha](https://github.com/huggingface/gym-aloha)
@@ -88,7 +138,7 @@ For simulations, 🤗 LeRobot comes with gymnasium environments that can be inst

 For instance, to install 🤗 LeRobot with aloha and pusht, use:
 ```bash
-pip install ".[aloha, pusht]"
+pip install -e ".[aloha, pusht]"
 ```

 To use [Weights and Biases](https://docs.wandb.ai/quickstart) for experiment tracking, log in with
@@ -105,18 +155,17 @@ wandb login
 ├── examples             # contains demonstration examples, start here to learn about LeRobot
 |   └── advanced         # contains even more examples for those who have mastered the basics
 ├── lerobot
-|   ├── configs          # contains hydra yaml files with all options that you can override in the command line
-|   |   ├── default.yaml   # selected by default, it loads pusht environment and diffusion policy
-|   |   ├── env            # various sim environments and their datasets: aloha.yaml, pusht.yaml, xarm.yaml
-|   |   └── policy         # various policies: act.yaml, diffusion.yaml, tdmpc.yaml
+|   ├── configs          # contains config classes with all options that you can override in the command line
 |   ├── common           # contains classes and utilities
 |   |   ├── datasets       # various datasets of human demonstrations: aloha, pusht, xarm
 |   |   ├── envs           # various sim environments: aloha, pusht, xarm
 |   |   ├── policies       # various policies: act, diffusion, tdmpc
+|   |   ├── robot_devices  # various real devices: dynamixel motors, opencv cameras, koch robots
 |   |   └── utils          # various utilities
 |   └── scripts          # contains functions to execute via command line
 |       ├── eval.py                 # load policy and evaluate it on an environment
 |       ├── train.py                # train a policy via imitation learning and/or reinforcement learning
+|       ├── control_robot.py        # teleoperate a real robot, record data, run a policy
 |       ├── push_dataset_to_hub.py  # convert your dataset into LeRobot dataset format and upload it to the Hugging Face hub
 |       └── visualize_dataset.py    # load a dataset and render its demonstrations
 ├── outputs               # contains results of scripts execution: logs, videos, model checkpoints
@@ -125,15 +174,25 @@ wandb login

 ### Visualize datasets

-Check out [example 1](./examples/1_load_lerobot_dataset.py) that illustrates how to use our dataset class which automatically download data from the Hugging Face hub.
+Check out [example 1](./examples/1_load_lerobot_dataset.py) that illustrates how to use our dataset class which automatically downloads data from the Hugging Face hub.

-You can also locally visualize episodes from a dataset by executing our script from the command line:
+You can also locally visualize episodes from a dataset on the hub by executing our script from the command line:
 ```bash
 python lerobot/scripts/visualize_dataset.py \
    --repo-id lerobot/pusht \
    --episode-index 0
 ```

+or from a dataset in a local folder with the `--root` option and the `--local-files-only` flag (in the following case, the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`):
+```bash
+python lerobot/scripts/visualize_dataset.py \
+    --repo-id lerobot/pusht \
+    --root ./my_local_data_dir \
+    --local-files-only 1 \
+    --episode-index 0
+```
+
+
 It will open `rerun.io` and display the camera streams, robot states and actions, like this:

 https://github-production-user-asset-6210df.s3.amazonaws.com/4681518/328035972-fd46b787-b532-47e2-bb6f-fd536a55a7ed.mov?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240505%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240505T172924Z&X-Amz-Expires=300&X-Amz-Signature=d680b26c532eeaf80740f08af3320d22ad0b8a4e4da1bcc4f33142c15b509eda&X-Amz-SignedHeaders=host&actor_id=24889239&key_id=0&repo_id=748713144
@@ -141,6 +200,51 @@ https://github-production-user-asset-6210df.s3.amazonaws.com/4681518/328035972-f

 Our script can also visualize datasets stored on a distant server. See `python lerobot/scripts/visualize_dataset.py --help` for more instructions.

+### The `LeRobotDataset` format
+
+A dataset in `LeRobotDataset` format is very simple to use. It can be loaded from a repository on the Hugging Face hub or a local folder simply with e.g. `dataset = LeRobotDataset("lerobot/aloha_static_coffee")` and can be indexed into like any Hugging Face and PyTorch dataset. For instance `dataset[0]` will retrieve a single temporal frame from the dataset containing observation(s) and an action as PyTorch tensors ready to be fed to a model.
+
+A specificity of `LeRobotDataset` is that, rather than retrieving a single frame by its index, we can retrieve several frames based on their temporal relationship with the indexed frame, by setting `delta_timestamps` to a list of relative times with respect to the indexed frame. For example, with `delta_timestamps = {"observation.image": [-1, -0.5, -0.2, 0]}`  one can retrieve, for a given index, 4 frames: 3 "previous" frames 1 second, 0.5 seconds, and 0.2 seconds before the indexed frame, and the indexed frame itself (corresponding to the 0 entry). See example [1_load_lerobot_dataset.py](examples/1_load_lerobot_dataset.py) for more details on `delta_timestamps`.
+
+Under the hood, the `LeRobotDataset` format makes use of several ways to serialize data, which can be useful to understand if you plan to work more closely with this format. We tried to make a flexible yet simple dataset format that would cover most types of features and specificities present in reinforcement learning and robotics, in simulation and in the real world, with a focus on cameras and robot states but easily extended to other types of sensory inputs as long as they can be represented by a tensor.
+
+Here are the important details and internal structure organization of a typical `LeRobotDataset` instantiated with `dataset = LeRobotDataset("lerobot/aloha_static_coffee")`. The exact features will change from dataset to dataset but not the main aspects:
+
+```
+dataset attributes:
+  ├ hf_dataset: a Hugging Face dataset (backed by Arrow/parquet). Typical features example:
+  │  ├ observation.images.cam_high (VideoFrame):
+  │  │   VideoFrame = {'path': path to a mp4 video, 'timestamp' (float32): timestamp in the video}
+  │  ├ observation.state (list of float32): position of an arm joints (for instance)
+  │  ... (more observations)
+  │  ├ action (list of float32): goal position of an arm joints (for instance)
+  │  ├ episode_index (int64): index of the episode for this sample
+  │  ├ frame_index (int64): index of the frame for this sample in the episode ; starts at 0 for each episode
+  │  ├ timestamp (float32): timestamp in the episode
+  │  ├ next.done (bool): indicates the end of an episode ; True for the last frame in each episode
+  │  └ index (int64): general index in the whole dataset
+  ├ episode_data_index: contains 2 tensors with the start and end indices of each episode
+  │  ├ from (1D int64 tensor): first frame index for each episode — shape (num episodes,) starts with 0
+  │  └ to (1D int64 tensor): last frame index for each episode — shape (num episodes,)
+  ├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance
+  │  ├ observation.images.cam_high: {'max': tensor with same number of dimensions (e.g. `(c, 1, 1)` for images, `(c,)` for states), etc.}
+  │  ...
+  ├ info: a dictionary of metadata on the dataset
+  │  ├ codebase_version (str): this is to keep track of the codebase version the dataset was created with
+  │  ├ fps (float): frames per second the dataset is recorded/synchronized to
+  │  ├ video (bool): indicates if frames are encoded in mp4 video files to save space or stored as png files
+  │  └ encoding (dict): if video, this documents the main options that were used with ffmpeg to encode the videos
+  ├ videos_dir (Path): where the mp4 videos or png images are stored/accessed
+  └ camera_keys (list of string): the keys to access camera features in the item returned by the dataset (e.g. `["observation.images.cam_high", ...]`)
+```
+
+A `LeRobotDataset` is serialized using several widespread file formats for each of its parts, namely:
+- hf_dataset stored using Hugging Face datasets library serialization to parquet
+- videos are stored in mp4 format to save space
+- metadata are stored in plain json/jsonl files
+
+Datasets can be uploaded to and downloaded from the Hugging Face hub seamlessly. To work on a local dataset, you can specify its location with the `root` argument if it's not in the default `~/.cache/huggingface/lerobot` location.
+
 ### Evaluate a pretrained policy

 Check out [example 2](./examples/2_evaluate_pretrained_policy.py) that illustrates how to download a pretrained policy from Hugging Face hub, and run an evaluation on its corresponding environment.
@@ -148,15 +252,18 @@ Check out [example 2](./examples/2_evaluate_pretrained_policy.py) that illustrat
 We also provide a more capable script to parallelize the evaluation over multiple environments during the same rollout. Here is an example with a pretrained model hosted on [lerobot/diffusion_pusht](https://huggingface.co/lerobot/diffusion_pusht):
 ```bash
 python lerobot/scripts/eval.py \
-    -p lerobot/diffusion_pusht \
-    eval.n_episodes=10 \
-    eval.batch_size=10
+    --policy.path=lerobot/diffusion_pusht \
+    --env.type=pusht \
+    --eval.batch_size=10 \
+    --eval.n_episodes=10 \
+    --policy.use_amp=false \
+    --policy.device=cuda
 ```

 Note: After training your own policy, you can re-evaluate the checkpoints with:

 ```bash
-python lerobot/scripts/eval.py -p {OUTPUT_DIR}/checkpoints/last/pretrained_model
+python lerobot/scripts/eval.py --policy.path={OUTPUT_DIR}/checkpoints/last/pretrained_model
 ```

 See `python lerobot/scripts/eval.py --help` for more instructions.
@@ -165,81 +272,46 @@ See `python lerobot/scripts/eval.py --help` for more instructions.

 Check out [example 3](./examples/3_train_policy.py) that illustrates how to train a model using our core library in python, and [example 4](./examples/4_train_policy_with_script.md) that shows how to use our training script from command line.

-In general, you can use our training script to easily train any policy. Here is an example of training the ACT policy on trajectories collected by humans on the Aloha simulation environment for the insertion task:
+To use wandb for logging training and evaluation curves, make sure you've run `wandb login` as a one-time setup step. Then, when running the training script, enable WandB in the configuration by adding `--wandb.enable=true`.

-```bash
-python lerobot/scripts/train.py \
-    policy=act \
-    env=aloha \
-    env.task=AlohaInsertion-v0 \
-    dataset_repo_id=lerobot/aloha_sim_insertion_human \
-```
-
-The experiment directory is automatically generated and will show up in yellow in your terminal. It looks like `outputs/train/2024-05-05/20-21-12_aloha_act_default`. You can manually specify an experiment directory by adding this argument to the `train.py` python command:
-```bash
-    hydra.run.dir=your/new/experiment/dir
-```
-
-In the experiment directory there will be a folder called `checkpoints` which will have the following structure:
-
-```bash
-checkpoints
-├── 000250  # checkpoint_dir for training step 250
-│   ├── pretrained_model  # Hugging Face pretrained model dir
-│   │   ├── config.json  # Hugging Face pretrained model config
-│   │   ├── config.yaml  # consolidated Hydra config
-│   │   ├── model.safetensors  # model weights
-│   │   └── README.md  # Hugging Face model card
-│   └── training_state.pth  # optimizer/scheduler/rng state and training step
-```
-
-To use wandb for logging training and evaluation curves, make sure you've run `wandb login` as a one-time setup step. Then, when running the training command above, enable WandB in the configuration by adding:
-
-```bash
-    wandb.enable=true
-```
-
-A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser:
+A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser. Please also check [here](./examples/4_train_policy_with_script.md#typical-logs-and-metrics) for the explanation of some commonly used metrics in logs.

 ![](media/wandb.png)

-Note: For efficiency, during training every checkpoint is evaluated on a low number of episodes. You may use `eval.n_episodes=500` to evaluate on more episodes than the default. Or, after training, you may want to re-evaluate your best checkpoints on more episodes or change the evaluation settings. See `python lerobot/scripts/eval.py --help` for more instructions.
+Note: For efficiency, during training every checkpoint is evaluated on a low number of episodes. You may use `--eval.n_episodes=500` to evaluate on more episodes than the default. Or, after training, you may want to re-evaluate your best checkpoints on more episodes or change the evaluation settings. See `python lerobot/scripts/eval.py --help` for more instructions.

 #### Reproduce state-of-the-art (SOTA)

-We have organized our configuration files (found under [`lerobot/configs`](./lerobot/configs)) such that they reproduce SOTA results from a given model variant in their respective original works. Simply running:
-
+We provide some pretrained policies on our [hub page](https://huggingface.co/lerobot) that can achieve state-of-the-art performance.
+You can reproduce their training by loading the config from their run. Simply running:
 ```bash
-python lerobot/scripts/train.py policy=diffusion env=pusht
+python lerobot/scripts/train.py --config_path=lerobot/diffusion_pusht
 ```
-
 reproduces SOTA results for Diffusion Policy on the PushT task.

-Pretrained policies, along with reproduction details, can be found under the "Models" section of https://huggingface.co/lerobot.
-
 ## Contribute

 If you would like to contribute to 🤗 LeRobot, please check out our [contribution guide](https://github.com/huggingface/lerobot/blob/main/CONTRIBUTING.md).

-### Add a new dataset
+<!-- ### Add a new dataset

 To add a dataset to the hub, you need to login using a write-access token, which can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens):
 ```bash
 huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
 ```

-Then move your dataset folder in `data` directory (e.g. `data/aloha_static_pingpong_test`), and push your dataset to the hub with:
+Then point to your raw dataset folder (e.g. `data/aloha_static_pingpong_test_raw`), and push your dataset to the hub with:
 ```bash
 python lerobot/scripts/push_dataset_to_hub.py \
--data-dir data \
--dataset-id aloha_static_pingpong_test \
--raw-format aloha_hdf5 \
--community-id lerobot
+--raw-dir data/aloha_static_pingpong_test_raw \
+--out-dir data \
+--repo-id lerobot/aloha_static_pingpong_test \
+--raw-format aloha_hdf5
 ```

 See `python lerobot/scripts/push_dataset_to_hub.py --help` for more instructions.

-If your dataset format is not supported, implement your own in `lerobot/common/datasets/push_dataset_to_hub/${raw_format}_format.py` by copying examples like [pusht_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py), [umi_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py), [aloha_hdf5](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py), or [xarm_pkl](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py).
+If your dataset format is not supported, implement your own in `lerobot/common/datasets/push_dataset_to_hub/${raw_format}_format.py` by copying examples like [pusht_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py), [umi_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py), [aloha_hdf5](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py), or [xarm_pkl](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py). -->


 ### Add a pretrained policy
@@ -249,7 +321,7 @@ Once you have trained a policy you may upload it to the Hugging Face hub using a
 You first need to find the checkpoint folder located inside your experiment directory (e.g. `outputs/train/2024-05-05/20-21-12_aloha_act_default/checkpoints/002500`). Within that there is a `pretrained_model` directory which should contain:
 - `config.json`: A serialized version of the policy configuration (following the policy's dataclass config).
 - `model.safetensors`: A set of `torch.nn.Module` parameters, saved in [Hugging Face Safetensors](https://huggingface.co/docs/safetensors/index) format.
- `config.yaml`: A consolidated Hydra training configuration containing the policy, environment, and dataset configs. The policy configuration should match `config.json` exactly. The environment config is useful for anyone who wants to evaluate your policy. The dataset config just serves as a paper trail for reproducibility.
+- `train_config.json`: A consolidated configuration containing all parameters used for training. The policy configuration should match `config.json` exactly. This is useful for anyone who wants to evaluate your policy or for reproducibility.

 To upload these to the hub, run the following:
 ```bash
@@ -286,11 +358,56 @@ with profile(
 ## Citation

 If you want, you can cite this work with:
-```
+```bibtex
@misc{cadene2024lerobot,
-    author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Gallouedec, Quentin and Zouitine, Adil and Wolf, Thomas},
+    author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Gallouedec, Quentin and Zouitine, Adil and Palma, Steven and Kooijmans, Pepijn and Aractingi, Michel and Shukor, Mustafa and Aubakirova, Dana and Russi, Martino and Capuano, Francesco and Pascale, Caroline and Choghari, Jade and Moss, Jess and Wolf, Thomas},
    title = {LeRobot: State-of-the-art Machine Learning for Real-World Robotics in Pytorch},
    howpublished = "\url{https://github.com/huggingface/lerobot}",
    year = {2024}
 }
 ```
+
+Additionally, if you are using any of the particular policy architectures, pretrained models, or datasets, it is recommended to cite the original authors of the work as they appear below:
+
+- [Diffusion Policy](https://diffusion-policy.cs.columbia.edu)
+```bibtex
+@article{chi2024diffusionpolicy,
+	author = {Cheng Chi and Zhenjia Xu and Siyuan Feng and Eric Cousineau and Yilun Du and Benjamin Burchfiel and Russ Tedrake and Shuran Song},
+	title ={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion},
+	journal = {The International Journal of Robotics Research},
+	year = {2024},
+}
+```
+- [ACT or ALOHA](https://tonyzhaozh.github.io/aloha)
+```bibtex
+@article{zhao2023learning,
+  title={Learning fine-grained bimanual manipulation with low-cost hardware},
+  author={Zhao, Tony Z and Kumar, Vikash and Levine, Sergey and Finn, Chelsea},
+  journal={arXiv preprint arXiv:2304.13705},
+  year={2023}
+}
+```
+
+- [TDMPC](https://www.nicklashansen.com/td-mpc/)
+
+```bibtex
+@inproceedings{Hansen2022tdmpc,
+	title={Temporal Difference Learning for Model Predictive Control},
+	author={Nicklas Hansen and Xiaolong Wang and Hao Su},
+	booktitle={ICML},
+	year={2022}
+}
+```
+
+- [VQ-BeT](https://sjlee.cc/vq-bet/)
+```bibtex
+@article{lee2024behavior,
+  title={Behavior generation with latent actions},
+  author={Lee, Seungjae and Wang, Yibin and Etukuru, Haritheja and Kim, H Jin and Shafiullah, Nur Muhammad Mahi and Pinto, Lerrel},
+  journal={arXiv preprint arXiv:2403.03181},
+  year={2024}
+}
+```
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=huggingface/lerobot&type=Timeline)](https://star-history.com/#huggingface/lerobot&Timeline)
--- a/benchmarks/video/README.md
+++ b/benchmarks/video/README.md
@@ -0,0 +1,271 @@
+# Video benchmark
+
+
+## Questions
+What is the optimal trade-off between:
+- minimizing loading time with random access,
+- minimizing memory space on disk,
+- maximizing success rate of policies,
+- compatibility across devices/platforms for decoding videos (e.g. video players, web browsers).
+
+How to encode videos?
+- Which video codec (`-vcodec`) to use? h264, h265, AV1?
+- What pixel format to use (`-pix_fmt`)? `yuv444p` or `yuv420p`?
+- How much compression (`-crf`)? No compression with `0`, intermediate compression with `25` or extreme with `50+`?
+- Which frequency to choose for key frames (`-g`)? A key frame every `10` frames?
+
+How to decode videos?
+- Which `decoder`? `torchvision`, `torchaudio`, `ffmpegio`, `decord`, or `nvc`?
+- What scenarios to use for requesting timestamps during the benchmark? (`timestamps_mode`)
+
+
+## Variables
+**Image content & size**
+We don't expect the same optimal settings for a dataset of images from a simulation, or from real-world in an apartment, or in a factory, or outdoor, or with lots of moving objects in the scene, etc. Similarly, loading times might not vary linearly with the image size (resolution).
+For these reasons, we run this benchmark on four representative datasets:
+- `lerobot/pusht_image`: (96 x 96 pixels) simulation with simple geometric shapes, fixed camera.
+- `aliberts/aloha_mobile_shrimp_image`: (480 x 640 pixels) real-world indoor, moving camera.
+- `aliberts/paris_street`: (720 x 1280 pixels) real-world outdoor, moving camera.
+- `aliberts/kitchen`: (1080 x 1920 pixels) real-world indoor, fixed camera.
+
+Note: The datasets used for this benchmark need to be image datasets, not video datasets.
+
+**Data augmentations**
+We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
+
+### Encoding parameters
+| parameter   | values                                                       |
+|-------------|--------------------------------------------------------------|
+| **vcodec**  | `libx264`, `libx265`, `libsvtav1`                            |
+| **pix_fmt** | `yuv444p`, `yuv420p`                                         |
+| **g**       | `1`, `2`, `3`, `4`, `5`, `6`, `10`, `15`, `20`, `40`, `None` |
+| **crf**     | `0`, `5`, `10`, `15`, `20`, `25`, `30`, `40`, `50`, `None`   |
+
+Note that `crf` value might be interpreted differently by various video codecs. In other words, the same value used with one codec doesn't necessarily translate into the same compression level with another codec. In fact, the default value (`None`) isn't the same amongst the different video codecs. Importantly, it is also the case for many other ffmpeg arguments like `g` which specifies the frequency of the key frames.
+
+For a comprehensive list and documentation of these parameters, see the ffmpeg documentation depending on the video codec used:
+- h264: https://trac.ffmpeg.org/wiki/Encode/H.264
+- h265: https://trac.ffmpeg.org/wiki/Encode/H.265
+- AV1: https://trac.ffmpeg.org/wiki/Encode/AV1
+
+### Decoding parameters
+**Decoder**
+We tested two video decoding backends from torchvision:
+- `pyav`
+- `video_reader` (requires to build torchvision from source)
+
+**Requested timestamps**
+Given the way video decoding works, once a keyframe has been loaded, the decoding of subsequent frames is fast.
+This of course is affected by the `-g` parameter during encoding, which specifies the frequency of the keyframes. Given our typical use cases in robotics policies which might request a few timestamps in different random places, we want to replicate these use cases with the following scenarios:
+- `1_frame`: 1 frame,
+- `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
+- `6_frames`: 6 consecutive frames (e.g. `[t + i / fps for i in range(6)]`)
+
+Note that this differs significantly from a typical use case like watching a movie, in which every frame is loaded sequentially from the beginning to the end and it's acceptable to have big values for `-g`.
+
+Additionally, because some policies might request single timestamps that are a few frames apart, we also have the following scenario:
+- `2_frames_4_space`: 2 frames with 4 consecutive frames of spacing in between (e.g `[t, t + 5 / fps]`),
+
+However, due to how video decoding is implemented with `pyav`, we don't have access to an accurate seek so in practice this scenario is essentially the same as `6_frames` since all 6 frames between `t` and `t + 5 / fps` will be decoded.
+
+
+## Metrics
+**Data compression ratio (lower is better)**
+`video_images_size_ratio` is the ratio of the memory space on disk taken by the encoded video over the memory space taken by the original images. For instance, `video_images_size_ratio=25%` means that the video takes 4 times less memory space on disk compared to the original images.
+
+**Loading time ratio (lower is better)**
+`video_images_load_time_ratio` is the ratio of the time it takes to decode frames from the video at the given timestamps over the time it takes to load the exact same original images. Lower is better. For instance, `video_images_load_time_ratio=200%` means that decoding from video is 2 times slower than loading the original images.
+
+**Average Mean Square Error (lower is better)**
+`avg_mse` is the average mean square error between each decoded frame and its corresponding original image over all requested timestamps, and also divided by the number of pixels in the image to be comparable when switching to different image sizes.
+
+**Average Peak Signal to Noise Ratio (higher is better)**
+`avg_psnr` measures the ratio between the maximum possible power of a signal and the power of corrupting noise that affects the fidelity of its representation. Higher PSNR indicates better quality.
+
+**Average Structural Similarity Index Measure (higher is better)**
+`avg_ssim` evaluates the perceived quality of images by comparing luminance, contrast, and structure. SSIM values range from -1 to 1, where 1 indicates perfect similarity.
+
+One aspect that can't be measured here with those metrics is the compatibility of the encoding across platforms, in particular on web browser, for visualization purposes.
+h264, h265 and AV1 are all commonly used codecs and should not pose an issue. However, the chroma subsampling (`pix_fmt`) format might affect compatibility:
+- `yuv420p` is more widely supported across various platforms, including web browsers.
+- `yuv444p` offers higher color fidelity but might not be supported as broadly.
+
+
+<!-- **Loss of a pretrained policy (higher is better)** (not available)
+`loss_pretrained` is the result of evaluating with the selected encoding/decoding settings a policy pretrained on original images. It is easier to understand than `avg_l2_error`.
+
+**Success rate after retraining (higher is better)** (not available)
+`success_rate` is the result of training and evaluating a policy with the selected encoding/decoding settings. It is the most difficult metric to get but also the very best. -->
+
+
+## How the benchmark works
+The benchmark evaluates both encoding and decoding of video frames on the first episode of each dataset.
+
+**Encoding:** for each `vcodec` and `pix_fmt` pair, we use a default value for `g` and `crf` upon which we change a single value (either `g` or `crf`) to one of the specified values (we don't test every combination of those as this would be computationally too heavy).
+This gives a unique set of encoding parameters which is used to encode the episode.
+
+**Decoding:** Then, for each of those unique encodings, we iterate through every combination of the decoding parameters `backend` and `timestamps_mode`. For each of them, we record the metrics of a number of samples (given by `--num-samples`). This is parallelized for efficiency and the number of processes can be controlled with `--num-workers`. Ideally, it's best to have a `--num-samples` that is divisible by `--num-workers`.
+
+Intermediate results are saved for each `vcodec` and `pix_fmt` combination in csv tables.
+These are then all concatenated to a single table ready for analysis.
+
+## Caveats
+We tried to measure the most impactful parameters for both encoding and decoding. However, for computational reasons we can't test out every combination.
+
+Additional encoding parameters exist that are not included in this benchmark. In particular:
+- `-preset` which allows for selecting encoding presets. This represents a collection of options that will provide a certain encoding speed to compression ratio. By leaving this parameter unspecified, it is considered to be `medium` for libx264 and libx265 and `8` for libsvtav1.
+- `-tune` which allows to optimize the encoding for certain aspects (e.g. film quality, fast decoding, etc.).
+
+See the documentation mentioned above for more detailed info on these settings and for a more comprehensive list of other parameters.
+
+Similarly on the decoding side, other decoders exist but are not implemented in our current benchmark. To name a few:
+- `torchaudio`
+- `ffmpegio`
+- `decord`
+- `nvc`
+
+Note as well that since we are mostly interested in the performance at decoding time (also because encoding is done only once before uploading a dataset), we did not measure encoding times nor have any metrics regarding encoding.
+However, besides the necessity to build ffmpeg from source, encoding did not pose any issue and it didn't take a significant amount of time during this benchmark.
+
+
+## Install
+Building ffmpeg from source is required to include libx265 and libaom/libsvtav1 (av1) video codecs ([compilation guide](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu)).
+
+**Note:** While you still need to build torchvision with a conda-installed `ffmpeg<4.3` to use the `video_reader` decoder (as described in [#220](https://github.com/huggingface/lerobot/pull/220)), you also need another version which is custom-built with all the video codecs for encoding. For the script to then use that version, you can prepend the command above with `PATH="$HOME/bin:$PATH"`, which is where ffmpeg should be built.
+
+
+## Adding a video decoder
+Right now, we're only benchmarking the two video decoders available with torchvision: `pyav` and `video_reader`.
+You can easily add a new decoder to benchmark by adding it to this function in the script:
+```diff
+def decode_video_frames(
+    video_path: str,
+    timestamps: list[float],
+    tolerance_s: float,
+    backend: str,
+) -> torch.Tensor:
+    if backend in ["pyav", "video_reader"]:
+        return decode_video_frames_torchvision(
+            video_path, timestamps, tolerance_s, backend
+        )
+    elif backend in ["your_decoder"]:
+        return your_decoder_function(
+            video_path, timestamps, tolerance_s, backend
+        )
+    else:
+        raise NotImplementedError(backend)
+```
+
+
+## Example
+For a quick run, you can try these parameters:
+```bash
+python benchmarks/video/run_video_benchmark.py \
+    --output-dir outputs/video_benchmark \
+    --repo-ids \
+        lerobot/pusht_image \
+        aliberts/aloha_mobile_shrimp_image \
+    --vcodec libx264 libx265 \
+    --pix-fmt yuv444p yuv420p \
+    --g 2 20 None \
+    --crf 10 40 None \
+    --timestamps-modes 1_frame 2_frames \
+    --backends pyav video_reader \
+    --num-samples 5 \
+    --num-workers 5 \
+    --save-frames 0
+```
+
+
+## Results
+
+### Reproduce
+We ran the benchmark with the following parameters:
+```bash
+# h264 and h265 encodings
+python benchmarks/video/run_video_benchmark.py \
+    --output-dir outputs/video_benchmark \
+    --repo-ids \
+        lerobot/pusht_image \
+        aliberts/aloha_mobile_shrimp_image \
+        aliberts/paris_street \
+        aliberts/kitchen \
+    --vcodec libx264 libx265 \
+    --pix-fmt yuv444p yuv420p \
+    --g 1 2 3 4 5 6 10 15 20 40 None \
+    --crf 0 5 10 15 20 25 30 40 50 None \
+    --timestamps-modes 1_frame 2_frames 6_frames \
+    --backends pyav video_reader \
+    --num-samples 50 \
+    --num-workers 5 \
+    --save-frames 1
+
+# av1 encoding (only compatible with yuv420p and pyav decoder)
+python benchmarks/video/run_video_benchmark.py \
+    --output-dir outputs/video_benchmark \
+    --repo-ids \
+        lerobot/pusht_image \
+        aliberts/aloha_mobile_shrimp_image \
+        aliberts/paris_street \
+        aliberts/kitchen \
+    --vcodec libsvtav1 \
+    --pix-fmt yuv420p \
+    --g 1 2 3 4 5 6 10 15 20 40 None \
+    --crf 0 5 10 15 20 25 30 40 50 None \
+    --timestamps-modes 1_frame 2_frames 6_frames \
+    --backends pyav \
+    --num-samples 50 \
+    --num-workers 5 \
+    --save-frames 1
+```
+
+The full results are available [here](https://docs.google.com/spreadsheets/d/1OYJB43Qu8fC26k_OyoMFgGBBKfQRCi4BIuYitQnq3sw/edit?usp=sharing)
+
+
+### Parameters selected for LeRobotDataset
+Considering these results, we chose what we think is the best set of encoding parameters:
+- vcodec: `libsvtav1`
+- pix-fmt: `yuv420p`
+- g: `2`
+- crf: `30`
+
+Since we're using av1 encoding, we're choosing the `pyav` decoder as `video_reader` does not support it (and `pyav` doesn't require a custom build of `torchvision`).
+
+### Summary
+
+These tables show the results for `g=2` and `crf=30`, using `timestamps-modes=6_frames` and `backend=pyav`
+
+| video_images_size_ratio            | vcodec     | pix_fmt |           |           |           |
+|------------------------------------|------------|---------|-----------|-----------|-----------|
+|                                    | libx264    |         | libx265   |           | libsvtav1 |
+| repo_id                            | yuv420p    | yuv444p | yuv420p   | yuv444p   | yuv420p   |
+| lerobot/pusht_image                | **16.97%** | 17.58%  | 18.57%    | 18.86%    | 22.06%    |
+| aliberts/aloha_mobile_shrimp_image | 2.14%      | 2.11%   | 1.38%     | **1.37%** | 5.59%     |
+| aliberts/paris_street              | 2.12%      | 2.13%   | **1.54%** | **1.54%** | 4.43%     |
+| aliberts/kitchen                   | 1.40%      | 1.39%   | **1.00%** | **1.00%** | 2.52%     |
+
+| video_images_load_time_ratio       | vcodec  | pix_fmt |          |         |           |
+|------------------------------------|---------|---------|----------|---------|-----------|
+|                                    | libx264 |         | libx265  |         | libsvtav1 |
+| repo_id                            | yuv420p | yuv444p | yuv420p  | yuv444p | yuv420p   |
+| lerobot/pusht_image                | 6.45    | 5.19    | **1.90** | 2.12    | 2.47      |
+| aliberts/aloha_mobile_shrimp_image | 11.80   | 7.92    | 0.71     | 0.85    | **0.48**  |
+| aliberts/paris_street              | 2.21    | 2.05    | 0.36     | 0.49    | **0.30**  |
+| aliberts/kitchen                   | 1.46    | 1.46    | 0.28     | 0.51    | **0.26**  |
+
+|                                    |          | vcodec   | pix_fmt      |          |           |              |
+|------------------------------------|----------|----------|--------------|----------|-----------|--------------|
+|                                    |          | libx264  |              | libx265  |           | libsvtav1    |
+| repo_id                            | metric   | yuv420p  | yuv444p      | yuv420p  | yuv444p   | yuv420p      |
+| lerobot/pusht_image                | avg_mse  | 2.90E-04 | **2.03E-04** | 3.13E-04 | 2.29E-04  | 2.19E-04     |
+|                                    | avg_psnr | 35.44    | 37.07        | 35.49    | **37.30** | 37.20        |
+|                                    | avg_ssim | 98.28%   | **98.85%**   | 98.31%   | 98.84%    | 98.72%       |
+| aliberts/aloha_mobile_shrimp_image | avg_mse  | 2.76E-04 | 2.59E-04     | 3.17E-04 | 3.06E-04  | **1.30E-04** |
+|                                    | avg_psnr | 35.91    | 36.21        | 35.88    | 36.09     | **40.17**    |
+|                                    | avg_ssim | 95.19%   | 95.18%       | 95.00%   | 95.05%    | **97.73%**   |
+| aliberts/paris_street              | avg_mse  | 6.89E-04 | 6.70E-04     | 4.03E-03 | 4.02E-03  | **3.09E-04** |
+|                                    | avg_psnr | 33.48    | 33.68        | 32.05    | 32.15     | **35.40**    |
+|                                    | avg_ssim | 93.76%   | 93.75%       | 89.46%   | 89.46%    | **95.46%**   |
+| aliberts/kitchen                   | avg_mse  | 2.50E-04 | 2.24E-04     | 4.28E-04 | 4.18E-04  | **1.53E-04** |
+|                                    | avg_psnr | 36.73    | 37.33        | 36.56    | 36.75     | **39.12**    |
+|                                    | avg_ssim | 95.47%   | 95.58%       | 95.52%   | 95.53%    | **96.82%**   |
--- a/benchmarks/video/capture_camera_feed.py
+++ b/benchmarks/video/capture_camera_feed.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Capture video feed from a camera as raw images."""
+
+import argparse
+import datetime as dt
+import os
+import time
+from pathlib import Path
+
+import cv2
+import rerun as rr
+
+# see https://rerun.io/docs/howto/visualization/limit-ram
+RERUN_MEMORY_LIMIT = os.getenv("LEROBOT_RERUN_MEMORY_LIMIT", "5%")
+
+
+def display_and_save_video_stream(output_dir: Path, fps: int, width: int, height: int, duration: int):
+    """Stream the default webcam to a Rerun viewer and save every captured frame as a PNG.
+
+    Frames are written to `output_dir/<YYYY-MM-DD>/<HH-MM-SS>/frame_<index>.png`, where the date and
+    time are those at which the capture started and `<index>` is a 6-digit zero-padded counter.
+
+    Args:
+        output_dir: Root directory under which the dated capture folder is created.
+        fps: Frame rate requested from the camera.
+        width: Frame width requested from the camera.
+        height: Frame height requested from the camera.
+        duration: Wall-clock duration of the capture, in seconds.
+    """
+    rr.init("lerobot_capture_camera_feed")
+    rr.spawn(memory_limit=RERUN_MEMORY_LIMIT)
+
+    now = dt.datetime.now()
+    capture_dir = output_dir / f"{now:%Y-%m-%d}" / f"{now:%H-%M-%S}"
+    capture_dir.mkdir(parents=True, exist_ok=True)
+
+    # Opens the default webcam
+    cap = cv2.VideoCapture(0)
+    if not cap.isOpened():
+        print("Error: Could not open video stream.")
+        return
+
+    # Release the capture even if logging or writing a frame raises (or the user interrupts the run).
+    try:
+        cap.set(cv2.CAP_PROP_FPS, fps)
+        cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
+        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
+
+        frame_index = 0
+        start_time = time.time()
+        while time.time() - start_time < duration:
+            ret, frame = cap.read()
+
+            if not ret:
+                print("Error: Could not read frame.")
+                break
+            # `cap.read()` already returns a numpy array (it has no `.numpy()` method) in OpenCV's BGR
+            # channel order; convert to RGB for the viewer only and keep BGR for `cv2.imwrite`.
+            rr.log("video/stream", rr.Image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)), static=True)
+            cv2.imwrite(str(capture_dir / f"frame_{frame_index:06d}.png"), frame)
+            frame_index += 1
+    finally:
+        cap.release()
+
+    # TODO(Steven): Add a graceful shutdown via a close() method for the Viewer context, though not currently supported in the Rerun API.
+
+
+if __name__ == "__main__":
+    # CLI entry point: each option maps one-to-one onto a keyword argument of
+    # `display_and_save_video_stream`.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("outputs/cam_capture/"),
+        help="Directory where the capture images are written. A subfolder named with the current date & time will be created inside it for each capture.",
+    )
+
+    # The remaining options are all integers and differ only by flag, default and help text.
+    int_options = (
+        ("--fps", 30, "Frames Per Second of the capture."),
+        ("--width", 1280, "Width of the captured images."),
+        ("--height", 720, "Height of the captured images."),
+        ("--duration", 20, "Duration in seconds for which the video stream should be captured."),
+    )
+    for flag, default_value, help_text in int_options:
+        parser.add_argument(flag, type=int, default=default_value, help=help_text)
+
+    cli_kwargs = vars(parser.parse_args())
+    display_and_save_video_stream(**cli_kwargs)
--- a/benchmarks/video/run_video_benchmark.py
+++ b/benchmarks/video/run_video_benchmark.py
@@ -0,0 +1,490 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Assess the performance of video decoding in various configurations.
+
+This script will benchmark different video encoding and decoding parameters.
+See the provided README.md or run `python benchmarks/video/run_video_benchmark.py --help` for usage info.
+"""
+
+import argparse
+import datetime as dt
+import random
+import shutil
+from collections import OrderedDict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+import einops
+import numpy as np
+import pandas as pd
+import PIL
+import torch
+from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
+from tqdm import tqdm
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.video_utils import (
+    decode_video_frames_torchvision,
+    encode_video_frames,
+)
+from lerobot.common.utils.benchmark import TimeBenchmark
+
+# Baseline encoding parameters, kept in insertion order.
+# NOTE(review): presumably the defaults on top of which the benchmark varies one value at a time;
+# the code using this constant is outside this excerpt.
+BASE_ENCODING = OrderedDict(
+    vcodec="libx264",
+    pix_fmt="yuv444p",
+    g=2,
+    crf=None,
+    # TODO(aliberts): Add fastdecode
+    # fastdecode=0,
+)
+
+
+# TODO(rcadene, aliberts): move to `utils.py` folder when we want to refactor
+def parse_int_or_none(value) -> int | None:
+    """Parse a command-line string as an integer, or as `None` when it spells "none".
+
+    The "none" check is case-insensitive. Meant to be used as an argparse `type=` callable.
+
+    Raises:
+        argparse.ArgumentTypeError: If `value` is neither "none" nor a valid integer literal.
+    """
+    if value.lower() != "none":
+        try:
+            parsed = int(value)
+        except ValueError as e:
+            raise argparse.ArgumentTypeError(f"Invalid int or None: {value}") from e
+        return parsed
+    return None
+
+
+def check_datasets_formats(repo_ids: list) -> None:
+    """Ensure every dataset in `repo_ids` is an image dataset.
+
+    Each repo id is loaded as a `LeRobotDataset`; the first one exposing at least one video key aborts
+    the check.
+
+    Raises:
+        ValueError: If a dataset has one or more entries in `meta.video_keys`.
+    """
+    for repo_id in repo_ids:
+        num_video_keys = len(LeRobotDataset(repo_id).meta.video_keys)
+        if num_video_keys == 0:
+            continue
+        raise ValueError(
+            f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}"
+        )
+
+
+def get_directory_size(directory: Path) -> int:
+    """Return the cumulated size, in bytes, of all regular files found recursively under `directory`."""
+    return sum(entry.stat().st_size for entry in directory.rglob("*") if entry.is_file())
+
+
+def load_original_frames(imgs_dir: Path, timestamps: list[float], fps: int) -> torch.Tensor:
+    """Load the PNG frames matching `timestamps` and stack them into a single float tensor.
+
+    Args:
+        imgs_dir: Directory holding the images, named `frame_<index>.png` with a 6-digit zero-padded index.
+        timestamps: Timestamps (in seconds) of the frames to load.
+        fps: Frame rate used to convert each timestamp into a frame index.
+
+    Returns:
+        The stacked frames, one entry per timestamp, as float32 values divided by 255 and rearranged
+        from "h w c" to "c h w".
+    """
+    frames = []
+    for ts in timestamps:
+        # NOTE(review): int() truncates, so this assumes `ts * fps` never lands just below the intended
+        # integer index because of float rounding — TODO confirm (round() would be more tolerant).
+        idx = int(ts * fps)
+        frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
+        frame = torch.from_numpy(np.array(frame))
+        # Scale to float32 by dividing by 255 (presumably uint8 pixels, giving values in [0, 1]).
+        frame = frame.type(torch.float32) / 255
+        frame = einops.rearrange(frame, "h w c -> c h w")
+        frames.append(frame)
+    return torch.stack(frames)
+
+
+def save_decoded_frames(
+    imgs_dir: Path, save_dir: Path, frames: torch.Tensor, timestamps: list[float], fps: int
+) -> None:
+    """Save decoded frames next to a copy of their original images for visual comparison.
+
+    For each timestamp, `frame_<index>_decoded.png` (from `frames`) and `frame_<index>_original.png`
+    (copied from `imgs_dir`) are written to `save_dir`. Nothing is done when `save_dir` already holds
+    one decoded frame per timestamp.
+
+    Args:
+        imgs_dir: Directory holding the original images, named `frame_<index>.png`.
+        save_dir: Output directory, created if needed.
+        frames: Decoded frames, indexed like `timestamps`; each is permuted from (c, h, w) to (h, w, c)
+            and multiplied by 255 before being saved (presumably float values in [0, 1]).
+        timestamps: Timestamps (in seconds) of the frames.
+        fps: Frame rate used to convert each timestamp into a frame index.
+    """
+    # Two files are written per timestamp, so count only the decoded ones: counting every
+    # `frame_*.png` would never equal `len(timestamps)` and the frames would be re-saved on each run.
+    if save_dir.exists() and len(list(save_dir.glob("frame_*_decoded.png"))) == len(timestamps):
+        return
+
+    save_dir.mkdir(parents=True, exist_ok=True)
+    for i, ts in enumerate(timestamps):
+        idx = int(ts * fps)
+        frame_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy()
+        PIL.Image.fromarray(frame_hwc).save(save_dir / f"frame_{idx:06d}_decoded.png")
+        shutil.copyfile(imgs_dir / f"frame_{idx:06d}.png", save_dir / f"frame_{idx:06d}_original.png")
+
+
+def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None:
+    """Dump the images of the dataset's first episode to `imgs_dir` as `frame_<index>.png` files.
+
+    Only the first image key starting with "observation.image" is saved. Nothing is done when
+    `imgs_dir` already contains as many `frame_*.png` files as the episode has images.
+
+    Args:
+        imgs_dir: Output directory, created if needed.
+        dataset: Dataset to read the images from.
+    """
+    # "to" index of episode 0 — presumably the exclusive end index, i.e. the number of frames in the
+    # first episode; verify against `episode_data_index`.
+    ep_num_images = dataset.episode_data_index["to"][0].item()
+    if imgs_dir.exists() and len(list(imgs_dir.glob("frame_*.png"))) == ep_num_images:
+        return
+
+    imgs_dir.mkdir(parents=True, exist_ok=True)
+    # Reset the output format; presumably so that images come back as PIL objects (they are `.save()`d below).
+    hf_dataset = dataset.hf_dataset.with_format(None)
+
+    # We only save images from the first camera
+    img_keys = [key for key in hf_dataset.features if key.startswith("observation.image")]
+    imgs_dataset = hf_dataset.select_columns(img_keys[0])
+
+    for i, item in enumerate(
+        tqdm(imgs_dataset, desc=f"saving {dataset.repo_id} first episode images", leave=False)
+    ):
+        img = item[img_keys[0]]
+        # NOTE(review): `quality` looks like a JPEG-style option and likely has no effect on PNG output — confirm.
+        img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)
+
+        # Stop once the last image of the first episode has been written.
+        if i >= ep_num_images - 1:
+            break
+
+
+def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> list[float]:
+    """Draw a random set of timestamps (in seconds) following `timestamps_mode`.
+
+    A last frame index is drawn uniformly in `[5, ep_num_images - 1]`; the mode then selects which
+    frames at or before it are requested:
+    - "1_frame": the drawn frame only,
+    - "2_frames": the previous frame and the drawn frame,
+    - "2_frames_4_space": the frame 5 indices earlier and the drawn frame,
+    - "6_frames": the 6 consecutive frames ending at the drawn frame.
+
+    Raises:
+        ValueError: If `timestamps_mode` is not one of the modes above.
+    """
+    # Start at 5 to allow for 2_frames_4_space and 6_frames
+    last_index = random.randint(5, ep_num_images - 1)
+
+    # Backward offsets from the drawn index, listed so that timestamps come out in increasing order.
+    if timestamps_mode == "1_frame":
+        offsets = (0,)
+    elif timestamps_mode == "2_frames":
+        offsets = (1, 0)
+    elif timestamps_mode == "2_frames_4_space":
+        offsets = (5, 0)
+    elif timestamps_mode == "6_frames":
+        offsets = (5, 4, 3, 2, 1, 0)
+    else:
+        raise ValueError(timestamps_mode)
+
+    return [(last_index - offset) / fps for offset in offsets]
+
+
+def decode_video_frames(
+    video_path: str,
+    timestamps: list[float],
+    tolerance_s: float,
+    backend: str,
+) -> torch.Tensor:
+    """Decode the frames of `video_path` at `timestamps` with the requested `backend`.
+
+    The "pyav" and "video_reader" backends are both delegated to `decode_video_frames_torchvision`.
+
+    Raises:
+        NotImplementedError: If `backend` is not a supported decoder.
+    """
+    torchvision_backends = ("pyav", "video_reader")
+    if backend not in torchvision_backends:
+        raise NotImplementedError(backend)
+    return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
+
+
+def benchmark_decoding(
+    imgs_dir: Path,
+    video_path: Path,
+    timestamps_mode: str,
+    backend: str,
+    ep_num_images: int,
+    fps: int,
+    num_samples: int = 50,
+    num_workers: int = 4,
+    save_frames: bool = False,
+) -> dict:
+    """Measure decoding time and image quality of an encoded video against its original images.
+
+    Args:
+        imgs_dir: Directory holding the original images (`frame_<index>.png`).
+        video_path: Path of the encoded video to decode.
+        timestamps_mode: Sampling scenario passed to `sample_timestamps`.
+        backend: Decoder backend passed to `decode_video_frames`.
+        ep_num_images: Number of images in the episode, used to bound the sampled frame indices.
+        fps: Frame rate used to convert between frame indices and timestamps.
+        num_samples: Number of independent samples to draw.
+        num_workers: Number of threads processing the samples.
+        save_frames: Whether to save the decoded and original frames of sample 0 for inspection.
+
+    Returns:
+        A dict with the average per-frame loading times of the video and of the images (in ms), their
+        ratio, and the average MSE, PSNR and SSIM over all decoded frames.
+    """
+
+    def process_sample(sample: int) -> dict:
+        # Draw one set of timestamps, time both loading paths, then compare the frames pairwise.
+        time_benchmark = TimeBenchmark()
+        timestamps = sample_timestamps(timestamps_mode, ep_num_images, fps)
+        num_frames = len(timestamps)
+        result = {
+            "psnr_values": [],
+            "ssim_values": [],
+            "mse_values": [],
+        }
+
+        with time_benchmark:
+            frames = decode_video_frames(video_path, timestamps=timestamps, tolerance_s=5e-1, backend=backend)
+        # Loading times are normalized by the number of requested frames (per-frame time).
+        result["load_time_video_ms"] = time_benchmark.result_ms / num_frames
+
+        with time_benchmark:
+            original_frames = load_original_frames(imgs_dir, timestamps, fps)
+        result["load_time_images_ms"] = time_benchmark.result_ms / num_frames
+
+        frames_np, original_frames_np = frames.numpy(), original_frames.numpy()
+        for i in range(num_frames):
+            result["mse_values"].append(mean_squared_error(original_frames_np[i], frames_np[i]))
+            result["psnr_values"].append(
+                peak_signal_noise_ratio(original_frames_np[i], frames_np[i], data_range=1.0)
+            )
+            result["ssim_values"].append(
+                structural_similarity(original_frames_np[i], frames_np[i], data_range=1.0, channel_axis=0)
+            )
+
+        # Only the frames of the first sample are written to disk.
+        if save_frames and sample == 0:
+            save_dir = video_path.with_suffix("") / f"{timestamps_mode}_{backend}"
+            save_decoded_frames(imgs_dir, save_dir, frames, timestamps, fps)
+
+        return result
+
+    load_times_video_ms = []
+    load_times_images_ms = []
+    mse_values = []
+    psnr_values = []
+    ssim_values = []
+
+    # A sample is a single set of decoded frames specified by timestamps_mode (e.g. a single frame, 2 frames, etc.).
+    # For each sample, we record metrics (loading time and quality metrics) which are then averaged over all samples.
+    # As these samples are independent, we run them in parallel threads to speed up the benchmark.
+    with ThreadPoolExecutor(max_workers=num_workers) as executor:
+        futures = [executor.submit(process_sample, i) for i in range(num_samples)]
+        for future in tqdm(as_completed(futures), total=num_samples, desc="samples", leave=False):
+            # `future.result()` re-raises any exception raised inside `process_sample`.
+            result = future.result()
+            load_times_video_ms.append(result["load_time_video_ms"])
+            load_times_images_ms.append(result["load_time_images_ms"])
+            psnr_values.extend(result["psnr_values"])
+            ssim_values.extend(result["ssim_values"])
+            mse_values.extend(result["mse_values"])
+
+    # Loading times are averaged per sample; quality metrics are averaged per frame.
+    avg_load_time_video_ms = float(np.array(load_times_video_ms).mean())
+    avg_load_time_images_ms = float(np.array(load_times_images_ms).mean())
+    video_images_load_time_ratio = avg_load_time_video_ms / avg_load_time_images_ms
+
+    return {
+        "avg_load_time_video_ms": avg_load_time_video_ms,
+        "avg_load_time_images_ms": avg_load_time_images_ms,
+        "video_images_load_time_ratio": video_images_load_time_ratio,
+        "avg_mse": float(np.mean(mse_values)),
+        "avg_psnr": float(np.mean(psnr_values)),
+        "avg_ssim": float(np.mean(ssim_values)),
+    }
+
+
+def benchmark_encoding_decoding(
+    dataset: LeRobotDataset,
+    video_path: Path,
+    imgs_dir: Path,
+    encoding_cfg: dict,
+    decoding_cfg: dict,
+    num_samples: int,
+    num_workers: int,
+    save_frames: bool,
+    overwrite: bool = False,
+    seed: int = 1337,
+) -> list[dict]:
+    fps = dataset.fps
+
+    if overwrite or not video_path.is_file():
+        tqdm.write(f"encoding {video_path}")
+        encode_video_frames(
+            imgs_dir=imgs_dir,
+            video_path=video_path,
+            fps=fps,
+            vcodec=encoding_cfg["vcodec"],
+            pix_fmt=encoding_cfg["pix_fmt"],
+            g=encoding_cfg.get("g"),
+            crf=encoding_cfg.get("crf"),
+            # fast_decode=encoding_cfg.get("fastdecode"),
+            overwrite=True,
+        )
+
+    ep_num_images = dataset.episode_data_index["to"][0].item()
+    width, height = tuple(dataset[0][dataset.meta.camera_keys[0]].shape[-2:])
+    num_pixels = width * height
+    video_size_bytes = video_path.stat().st_size
+    images_size_bytes = get_directory_size(imgs_dir)
+    video_images_size_ratio = video_size_bytes / images_size_bytes
+
+    random.seed(seed)
+    benchmark_table = []
+    for timestamps_mode in tqdm(
+        decoding_cfg["timestamps_modes"], desc="decodings (timestamps_modes)", leave=False
+    ):
+        for backend in tqdm(decoding_cfg["backends"], desc="decodings (backends)", leave=False):
+            benchmark_row = benchmark_decoding(
+                imgs_dir,
+                video_path,
+                timestamps_mode,
+                backend,
+                ep_num_images,
+                fps,
+                num_samples,
+                num_workers,
+                save_frames,
+            )
+            benchmark_row.update(
+                **{
+                    "repo_id": dataset.repo_id,
+                    "resolution": f"{width} x {height}",
+                    "num_pixels": num_pixels,
+                    "video_size_bytes": video_size_bytes,
+                    "images_size_bytes": images_size_bytes,
+                    "video_images_size_ratio": video_images_size_ratio,
+                    "timestamps_mode": timestamps_mode,
+                    "backend": backend,
+                },
+                **encoding_cfg,
+            )
+            benchmark_table.append(benchmark_row)
+
+    return benchmark_table
+
+
+def main(
+    output_dir: Path,
+    repo_ids: list[str],
+    vcodec: list[str],
+    pix_fmt: list[str],
+    g: list[int],
+    crf: list[int],
+    # fastdecode: list[int],
+    timestamps_modes: list[str],
+    backends: list[str],
+    num_samples: int,
+    num_workers: int,
+    save_frames: bool,
+):
+    check_datasets_formats(repo_ids)
+    encoding_benchmarks = {
+        "g": g,
+        "crf": crf,
+        # "fastdecode": fastdecode,
+    }
+    decoding_benchmarks = {
+        "timestamps_modes": timestamps_modes,
+        "backends": backends,
+    }
+    headers = ["repo_id", "resolution", "num_pixels"]
+    headers += list(BASE_ENCODING.keys())
+    headers += [
+        "timestamps_mode",
+        "backend",
+        "video_size_bytes",
+        "images_size_bytes",
+        "video_images_size_ratio",
+        "avg_load_time_video_ms",
+        "avg_load_time_images_ms",
+        "video_images_load_time_ratio",
+        "avg_mse",
+        "avg_psnr",
+        "avg_ssim",
+    ]
+    file_paths = []
+    for video_codec in tqdm(vcodec, desc="encodings (vcodec)"):
+        for pixel_format in tqdm(pix_fmt, desc="encodings (pix_fmt)", leave=False):
+            benchmark_table = []
+            for repo_id in tqdm(repo_ids, desc="encodings (datasets)", leave=False):
+                dataset = LeRobotDataset(repo_id)
+                imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_")
+                # We only use the first episode
+                save_first_episode(imgs_dir, dataset)
+                for key, values in tqdm(encoding_benchmarks.items(), desc="encodings (g, crf)", leave=False):
+                    for value in tqdm(values, desc=f"encodings ({key})", leave=False):
+                        encoding_cfg = BASE_ENCODING.copy()
+                        encoding_cfg["vcodec"] = video_codec
+                        encoding_cfg["pix_fmt"] = pixel_format
+                        encoding_cfg[key] = value
+                        args_path = Path("_".join(str(value) for value in encoding_cfg.values()))
+                        video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4"
+                        benchmark_table += benchmark_encoding_decoding(
+                            dataset,
+                            video_path,
+                            imgs_dir,
+                            encoding_cfg,
+                            decoding_benchmarks,
+                            num_samples,
+                            num_workers,
+                            save_frames,
+                        )
+
+            # Save intermediate results
+            benchmark_df = pd.DataFrame(benchmark_table, columns=headers)
+            now = dt.datetime.now()
+            csv_path = (
+                output_dir
+                / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_{video_codec}_{pixel_format}_{num_samples}-samples.csv"
+            )
+            benchmark_df.to_csv(csv_path, header=True, index=False)
+            file_paths.append(csv_path)
+            del benchmark_df
+
+    # Concatenate all results
+    df_list = [pd.read_csv(csv_path) for csv_path in file_paths]
+    concatenated_df = pd.concat(df_list, ignore_index=True)
+    concatenated_path = output_dir / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_all_{num_samples}-samples.csv"
+    concatenated_df.to_csv(concatenated_path, header=True, index=False)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("outputs/video_benchmark"),
+        help="Directory where the video benchmark outputs are written.",
+    )
+    parser.add_argument(
+        "--repo-ids",
+        type=str,
+        nargs="*",
+        default=[
+            "lerobot/pusht_image",
+            "aliberts/aloha_mobile_shrimp_image",
+            "aliberts/paris_street",
+            "aliberts/kitchen",
+        ],
+        help="Datasets repo-ids to test against. First episodes only are used. Must be images.",
+    )
+    parser.add_argument(
+        "--vcodec",
+        type=str,
+        nargs="*",
+        default=["libx264", "hevc", "libsvtav1"],
+        help="Video codecs to be tested",
+    )
+    parser.add_argument(
+        "--pix-fmt",
+        type=str,
+        nargs="*",
+        default=["yuv444p", "yuv420p"],
+        help="Pixel formats (chroma subsampling) to be tested",
+    )
+    parser.add_argument(
+        "--g",
+        type=parse_int_or_none,
+        nargs="*",
+        default=[1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None],
+        help="Group of pictures sizes to be tested.",
+    )
+    parser.add_argument(
+        "--crf",
+        type=parse_int_or_none,
+        nargs="*",
+        default=[0, 5, 10, 15, 20, 25, 30, 40, 50, None],
+        help="Constant rate factors to be tested.",
+    )
+    # parser.add_argument(
+    #     "--fastdecode",
+    #     type=int,
+    #     nargs="*",
+    #     default=[0, 1],
+    #     help="Use the fastdecode tuning option. 0 disables it. "
+    #         "For libx264 and libx265/hevc, only 1 is possible. "
+    #         "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
+    # )
+    parser.add_argument(
+        "--timestamps-modes",
+        type=str,
+        nargs="*",
+        default=[
+            "1_frame",
+            "2_frames",
+            "2_frames_4_space",
+            "6_frames",
+        ],
+        help="Timestamps scenarios to be tested.",
+    )
+    parser.add_argument(
+        "--backends",
+        type=str,
+        nargs="*",
+        default=["pyav", "video_reader"],
+        help="Torchvision decoding backend to be tested.",
+    )
+    parser.add_argument(
+        "--num-samples",
+        type=int,
+        default=50,
+        help="Number of samples for each encoding x decoding config.",
+    )
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=10,
+        help="Number of processes for parallelized sample processing.",
+    )
+    parser.add_argument(
+        "--save-frames",
+        type=int,
+        default=0,
+        help="Whether to save decoded frames or not. Enter a non-zero number for true.",
+    )
+    args = parser.parse_args()
+    main(**vars(args))
--- a/docker/lerobot-cpu/Dockerfile
+++ b/docker/lerobot-cpu/Dockerfile
@@ -1,31 +1,29 @@
 # Configure image
 ARG PYTHON_VERSION=3.10
-
 FROM python:${PYTHON_VERSION}-slim
+
+# Configure environment variables
 ARG PYTHON_VERSION
-ARG DEBIAN_FRONTEND=noninteractive
-
-# Install apt dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential cmake \
-    libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
-
-# Create virtual environment
-RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
-RUN python -m venv /opt/venv
+ENV DEBIAN_FRONTEND=noninteractive
+ENV MUJOCO_GL="egl"
 ENV PATH="/opt/venv/bin:$PATH"
-RUN echo "source /opt/venv/bin/activate" >> /root/.bashrc

-# Install LeRobot
+# Install dependencies and set up Python in a single layer
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git \
+    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
+    speech-dispatcher libgeos-dev \
+    && ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
+    && python -m venv /opt/venv \
+    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && echo "source /opt/venv/bin/activate" >> /root/.bashrc
+
+# Clone repository and install LeRobot in a single layer
 COPY . /lerobot
 WORKDIR /lerobot
-RUN pip install --upgrade --no-cache-dir pip
-RUN pip install --no-cache-dir ".[test, aloha, xarm, pusht]" \
-    --extra-index-url https://download.pytorch.org/whl/cpu
-
-# Set EGL as the rendering backend for MuJoCo
-ENV MUJOCO_GL="egl"
+RUN /opt/venv/bin/pip install --upgrade --no-cache-dir pip \
+    && /opt/venv/bin/pip install --no-cache-dir ".[test, aloha, xarm, pusht]" \
+        --extra-index-url https://download.pytorch.org/whl/cpu

 # Execute in bash shell rather than python
 CMD ["/bin/bash"]
--- a/docker/lerobot-gpu-dev/Dockerfile
+++ b/docker/lerobot-gpu-dev/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04
+FROM nvidia/cuda:12.2.2-devel-ubuntu22.04

 # Configure image
 ARG PYTHON_VERSION=3.10
@@ -8,14 +8,42 @@ ARG DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential cmake \
    git git-lfs openssh-client \
-    nano vim less util-linux \
+    nano vim less util-linux tree \
    htop atop nvtop \
-    sed gawk grep curl wget \
+    sed gawk grep curl wget zip unzip \
    tcpdump sysstat screen tmux \
-    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
-    python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
+    libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
+    speech-dispatcher portaudio19-dev libgeos-dev \
+    python${PYTHON_VERSION} python${PYTHON_VERSION}-venv python${PYTHON_VERSION}-dev \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

+# Install ffmpeg build dependencies. See:
+# https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu
+# TODO(aliberts): create image to build dependencies from source instead
+# (each package is listed once; `yasm` was previously repeated further down the list)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    autoconf automake yasm \
+    libass-dev \
+    libfreetype6-dev \
+    libgnutls28-dev \
+    libunistring-dev \
+    libmp3lame-dev \
+    libtool \
+    libvorbis-dev \
+    meson \
+    ninja-build \
+    pkg-config \
+    texinfo \
+    zlib1g-dev \
+    nasm \
+    libx264-dev \
+    libx265-dev libnuma-dev \
+    libvpx-dev \
+    libfdk-aac-dev \
+    libopus-dev \
+    libsvtav1-dev libsvtav1enc-dev libsvtav1dec-dev \
+    libdav1d-dev
+
 # Install gh cli tool
 RUN (type -p wget >/dev/null || (apt update && apt-get install wget -y)) \
    && mkdir -p -m 755 /etc/apt/keyrings \
--- a/docker/lerobot-gpu/Dockerfile
+++ b/docker/lerobot-gpu/Dockerfile
@@ -1,29 +1,24 @@
 FROM nvidia/cuda:12.4.1-base-ubuntu22.04

-# Configure image
+# Configure environment variables
 ARG PYTHON_VERSION=3.10
-ARG DEBIAN_FRONTEND=noninteractive
-
-
-# Install apt dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential cmake \
-    libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
-    python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
-
-
-# Create virtual environment
-RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
-RUN python -m venv /opt/venv
+ENV DEBIAN_FRONTEND=noninteractive
+ENV MUJOCO_GL="egl"
 ENV PATH="/opt/venv/bin:$PATH"
-RUN echo "source /opt/venv/bin/activate" >> /root/.bashrc

-# Install LeRobot
+# Install dependencies and set up Python in a single layer
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git \
+    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
+    speech-dispatcher libgeos-dev \
+    python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
+    && python -m venv /opt/venv \
+    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && echo "source /opt/venv/bin/activate" >> /root/.bashrc
+
+# Clone repository and install LeRobot in a single layer
 COPY . /lerobot
 WORKDIR /lerobot
-RUN pip install --upgrade --no-cache-dir pip
-RUN pip install --no-cache-dir ".[test, aloha, xarm, pusht]"
-
-# Set EGL as the rendering backend for MuJoCo
-ENV MUJOCO_GL="egl"
+RUN /opt/venv/bin/pip install --upgrade --no-cache-dir pip \
+    && /opt/venv/bin/pip install --no-cache-dir ".[test, aloha, xarm, pusht, dynamixel]"
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1,137 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Generating the documentation
+
+To generate the documentation, you first have to build it. Several packages are necessary to build the doc;
+you can install them with the following command, at the root of the code repository:
+
+```bash
+pip install -e ".[docs]"
+```
+
+You will also need `nodejs`. Please refer to their [installation page](https://nodejs.org/en/download)
+
+---
+**NOTE**
+
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to
+check how they look before committing for instance). You don't have to `git commit` the built documentation.
+
+---
+
+## Building the documentation
+
+Once you have set up the `doc-builder` and additional packages, you can generate the documentation by
+typing the following command:
+
+```bash
+doc-builder build lerobot docs/source/ --build_dir ~/tmp/test-build
+```
+
+You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
+the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
+Markdown editor.
+
+## Previewing the documentation
+
+To preview the docs, first install the `watchdog` module with:
+
+```bash
+pip install watchdog
+```
+
+Then run the following command:
+
+```bash
+doc-builder preview lerobot docs/source/
+```
+
+The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment with a link to where the documentation with your changes lives.
+
+---
+**NOTE**
+
+The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
+
+---
+
+## Adding a new element to the navigation bar
+
+Accepted files are Markdown (.md).
+
+Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
+the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/lerobot/blob/main/docs/source/_toctree.yml) file.
+
+## Renaming section headers and moving sections
+
+It helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums, and Social media and it'd make for a much better user experience if users reading those months later could still easily navigate to the originally intended information.
+
+Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
+
+So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
+
+```
+Sections that were moved:
+
+[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
+```
+and of course, if you moved it to another file, then:
+
+```
+Sections that were moved:
+
+[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
+```
+
+Use the relative style to link to the new file so that the versioned docs continue to work.
+
+For an example of a rich moved sections set please see the very end of [the transformers Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.md).
+
+### Adding a new tutorial
+
+Adding a new tutorial or section is done in two steps:
+
+- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
+- Link that file in `./source/_toctree.yml` on the correct toc-tree.
+
+Make sure to put your new file under the proper section. If you have a doubt, feel free to ask in a Github Issue or PR.
+
+### Writing source documentation
+
+Values that should be put in `code` should be surrounded by backticks: \`like so\`. Note that argument names
+and objects like True, None or any strings should usually be put in `code`.
+
+#### Writing a multi-line code block
+
+Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
+
+
+````
+```
+# first line of code
+# second line
+# etc
+```
+````
+
+#### Adding an image
+
+Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
+the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
+them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
+If you are an external contributor, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+to this dataset.
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -0,0 +1,26 @@
+- sections:
+  - local: index
+    title: LeRobot
+  - local: installation
+    title: Installation
+  title: Get started
+- sections:
+  - local: getting_started_real_world_robot
+    title: Getting Started with Real-World Robots
+  - local: cameras
+    title: Cameras
+  title: "Tutorials"
+- sections:
+  - local: so101
+    title: SO-101
+  - local: so100
+    title: SO-100
+  - local: koch
+    title: Koch v1.1
+  - local: lekiwi
+    title: LeKiwi
+  title: "Robots"
+- sections:
+  - local: contributing
+    title: Contribute to LeRobot
+  title: "Contribute"
--- a/docs/source/cameras.mdx
+++ b/docs/source/cameras.mdx
@@ -0,0 +1,173 @@
+# Cameras
+
+LeRobot offers multiple options for video capture, including phone cameras, built-in laptop cameras, external webcams, and Intel RealSense cameras. To efficiently record frames from most cameras, you can use either the `OpenCVCamera` or `RealSenseCamera` class. For additional compatibility details on the `OpenCVCamera` class, refer to the [Video I/O with OpenCV Overview](https://docs.opencv.org/4.x/d0/da7/videoio_overview.html).
+
+### Finding your camera
+
+To instantiate a camera, you need a camera identifier. This identifier might change if you reboot your computer or re-plug your camera, a behavior mostly dependent on your operating system.
+
+To find the camera indices of the cameras plugged into your system, run the following script:
+```bash
+python lerobot/find_cameras.py opencv # or realsense for Intel Realsense cameras
+```
+
+The output will look something like this if you have two cameras connected:
+```
+--- Detected Cameras ---
+Camera #0:
+  Name: OpenCV Camera @ 0
+  Type: OpenCV
+  Id: 0
+  Backend api: AVFOUNDATION
+  Default stream profile:
+    Format: 16.0
+    Width: 1920
+    Height: 1080
+    Fps: 15.0
+--------------------
+(more cameras ...)
+```
+
+> [!WARNING]
+> When using Intel RealSense cameras on `macOS`, you could get this [error](https://github.com/IntelRealSense/librealsense/issues/12307): `Error finding RealSense cameras: failed to set power state`. This can be solved by running the same command with `sudo` permissions. Note that using RealSense cameras on `macOS` is unstable.
+
+
+## Use Cameras
+
+Below are two examples, demonstrating how to work with the API.
+
+- **Asynchronous frame capture** using an OpenCV-based camera
+- **Color and depth capture** using an Intel RealSense camera
+
+
+<hfoptions id="shell_restart">
+<hfoption id="Open CV Camera">
+
+```python
+from lerobot.common.cameras.opencv.configuration_opencv import OpenCVCameraConfig
+from lerobot.common.cameras.opencv.camera_opencv import OpenCVCamera
+from lerobot.common.cameras.configs import ColorMode, Cv2Rotation
+
+# Construct an `OpenCVCameraConfig` with your desired FPS, resolution, color mode, and rotation.
+config = OpenCVCameraConfig(
+    index_or_path=0,
+    fps=15,
+    width=1920,
+    height=1080,
+    color_mode=ColorMode.RGB,
+    rotation=Cv2Rotation.NO_ROTATION
+)
+
+# Instantiate and connect an `OpenCVCamera`, performing a warm-up read (default).
+camera = OpenCVCamera(config)
+camera.connect()
+
+# Read frames asynchronously in a loop via `async_read(timeout_ms)`
+try:
+    for i in range(10):
+        frame = camera.async_read(timeout_ms=200)
+        print(f"Async frame {i} shape:", frame.shape)
+finally:
+    camera.disconnect()
+```
+
+</hfoption>
+<hfoption id="Intel Realsense Camera">
+
+```python
+from lerobot.common.cameras.intel.configuration_realsense import RealSenseCameraConfig
+from lerobot.common.cameras.intel.camera_realsense import RealSenseCamera
+from lerobot.common.cameras.configs import ColorMode, Cv2Rotation
+
+# Create a `RealSenseCameraConfig` specifying your camera’s serial number and enabling depth.
+config = RealSenseCameraConfig(
+    serial_number="233522074606",
+    fps=15,
+    width=640,
+    height=480,
+    color_mode=ColorMode.RGB,
+    use_depth=True,
+    rotation=Cv2Rotation.NO_ROTATION
+)
+
+# Instantiate and connect a `RealSenseCamera` with warm-up read (default).
+camera = RealSenseCamera(config)
+camera.connect()
+
+# Capture a color frame via `read()` and a depth map via `read_depth()`.
+try:
+    color_frame = camera.read()
+    depth_map = camera.read_depth()
+    print("Color frame shape:", color_frame.shape)
+    print("Depth map shape:", depth_map.shape)
+finally:
+    camera.disconnect()
+```
+</hfoption>
+</hfoptions>
+
+
+## Use your phone
+<hfoptions id="use phone">
+<hfoption id="Mac">
+
+To use your iPhone as a camera on macOS, enable the Continuity Camera feature:
+- Ensure your Mac is running macOS 13 or later, and your iPhone is on iOS 16 or later.
+- Sign in both devices with the same Apple ID.
+- Connect your devices with a USB cable or turn on Wi-Fi and Bluetooth for a wireless connection.
+
+For more details, visit [Apple support](https://support.apple.com/en-gb/guide/mac-help/mchl77879b8a/mac).
+
+Your iPhone should be detected automatically when running the camera setup script in the next section.
+
+</hfoption>
+<hfoption id="Linux">
+
+If you want to use your phone as a camera on Linux, follow these steps to set up a virtual camera:
+
+1. *Install `v4l2loopback-dkms` and `v4l-utils`*. Those packages are required to create virtual camera devices (`v4l2loopback`) and verify their settings with the `v4l2-ctl` utility from `v4l-utils`. Install them using:
+```bash
+sudo apt install v4l2loopback-dkms v4l-utils
+```
+2. *Install [DroidCam](https://droidcam.app) on your phone*. This app is available for both iOS and Android.
+3. *Install [OBS Studio](https://obsproject.com)*. This software will help you manage the camera feed. Install it using [Flatpak](https://flatpak.org):
+```bash
+flatpak install flathub com.obsproject.Studio
+```
+4. *Install the DroidCam OBS plugin*. This plugin integrates DroidCam with OBS Studio. Install it with:
+```bash
+flatpak install flathub com.obsproject.Studio.Plugin.DroidCam
+```
+5. *Start OBS Studio*. Launch with:
+```bash
+flatpak run com.obsproject.Studio
+```
+6. *Add your phone as a source*. Follow the instructions [here](https://droidcam.app/obs/usage). Be sure to set the resolution to `640x480`.
+7. *Adjust resolution settings*. In OBS Studio, go to `File > Settings > Video`. Change the `Base(Canvas) Resolution` and the `Output(Scaled) Resolution` to `640x480` by manually typing it in.
+8. *Start virtual camera*. In OBS Studio, follow the instructions [here](https://obsproject.com/kb/virtual-camera-guide).
+9. *Verify the virtual camera setup*. Use `v4l2-ctl` to list the devices:
+```bash
+v4l2-ctl --list-devices
+```
+You should see an entry like:
+```
+VirtualCam (platform:v4l2loopback-000):
+/dev/video1
+```
+10. *Check the camera resolution*. Use `v4l2-ctl` to ensure that the virtual camera output resolution is `640x480`. Change `/dev/video1` to the port of your virtual camera from the output of `v4l2-ctl --list-devices`.
+```bash
+v4l2-ctl -d /dev/video1 --get-fmt-video
+```
+You should see an entry like:
+```
+>>> Format Video Capture:
+>>>	Width/Height      : 640/480
+>>>	Pixel Format      : 'YUYV' (YUYV 4:2:2)
+```
+
+Troubleshooting: If the resolution is not correct you will have to delete the Virtual Camera port and try again as it cannot be changed.
+
+If everything is set up correctly, you can proceed with the rest of the tutorial.
+
+</hfoption>
+</hfoptions>
--- a/docs/source/contributing.md
+++ b/docs/source/contributing.md
@@ -0,0 +1 @@
+../../CONTRIBUTING.md
--- a/docs/source/getting_started_real_world_robot.mdx
+++ b/docs/source/getting_started_real_world_robot.mdx
@@ -0,0 +1,311 @@
+# Getting Started with Real-World Robots
+
+This tutorial will explain how to train a neural network to control a real robot autonomously.
+
+**You'll learn:**
+1. How to record and visualize your dataset.
+2. How to train a policy using your data and prepare it for evaluation.
+3. How to evaluate your policy and visualize the results.
+
+By following these steps, you'll be able to replicate tasks, such as picking up a Lego block and placing it in a bin with a high success rate, as shown in the video below.
+
+<details>
+<summary><strong>Video: pickup lego block task</strong></summary>
+
+<div class="video-container">
+  <video controls width="600">
+    <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot_task.mp4" type="video/mp4" />
+  </video>
+</div>
+
+</details>
+
+This tutorial isn’t tied to a specific robot: we walk you through the commands and API snippets you can adapt for any supported platform.
+
+During data collection, you’ll use a “teleoperation” device, such as a leader arm or keyboard, to teleoperate the robot and record its motion trajectories.
+
+Once you’ve gathered enough trajectories, you’ll train a neural network to imitate these trajectories and deploy the trained model so your robot can perform the task autonomously.
+
+If you run into any issues at any point, jump into our [Discord community](https://discord.com/invite/s3KuuzsPFb) for support.
+
+## Set up and Calibrate
+
+If you haven't yet set up and calibrated your robot and teleop device, please do so by following the robot-specific tutorial.
+
+## Teleoperate
+
+In this example, we’ll demonstrate how to teleoperate the SO101 robot. For each command, we also provide a corresponding API example.
+
+Note that the `id` associated with a robot is used to store the calibration file. It's important to use the same `id` when teleoperating, recording, and evaluating when using the same setup.
+
+<hfoptions id="teleoperate_so101">
+<hfoption id="Command">
+```bash
+python -m lerobot.teleoperate \
+    --robot.type=so101_follower \
+    --robot.port=/dev/tty.usbmodem58760431541 \
+    --robot.id=my_awesome_follower_arm \
+    --teleop.type=so101_leader \
+    --teleop.port=/dev/tty.usbmodem58760431551 \
+    --teleop.id=my_awesome_leader_arm
+```
+</hfoption>
+<hfoption id="API example">
+```python
+from lerobot.common.teleoperators.so101_leader import SO101LeaderConfig, SO101Leader
+from lerobot.common.robots.so101_follower import SO101FollowerConfig, SO101Follower
+
+robot_config = SO101FollowerConfig(
+    port="/dev/tty.usbmodem58760431541",
+    id="my_red_robot_arm",
+)
+
+teleop_config = SO101LeaderConfig(
+    port="/dev/tty.usbmodem58760431551",
+    id="my_blue_leader_arm",
+)
+
+robot = SO101Follower(robot_config)
+teleop_device = SO101Leader(teleop_config)
+robot.connect()
+teleop_device.connect()
+
+while True:
+    action = teleop_device.get_action()
+    robot.send_action(action)
+```
+</hfoption>
+</hfoptions>
+
+The teleoperate command will automatically:
+1. Identify any missing calibrations and initiate the calibration procedure.
+2. Connect the robot and teleop device and start teleoperation.
+
+## Cameras
+
+To add cameras to your setup, follow this [Guide](./cameras#setup-cameras).
+
+## Teleoperate with cameras
+
+With `rerun`, you can teleoperate again while simultaneously visualizing the camera feeds and joint positions. In this example, we’re using the Koch arm.
+
+<hfoptions id="teleoperate_koch_camera">
+<hfoption id="Command">
+```bash
+python -m lerobot.teleoperate \
+    --robot.type=koch_follower \
+    --robot.port=/dev/tty.usbmodem58760431541 \
+    --robot.id=my_awesome_follower_arm \
+    --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}}" \
+    --teleop.type=koch_leader \
+    --teleop.port=/dev/tty.usbmodem58760431551 \
+    --teleop.id=my_awesome_leader_arm \
+    --display_data=true
+```
+</hfoption>
+<hfoption id="API example">
+```python
+from lerobot.common.cameras.opencv.configuration_opencv import OpenCVCameraConfig
+from lerobot.common.teleoperators.koch_leader import KochLeaderConfig, KochLeader
+from lerobot.common.robots.koch_follower import KochFollowerConfig, KochFollower
+
+camera_config = {
+    "front": OpenCVCameraConfig(index_or_path=0, width=1920, height=1080, fps=30)
+}
+
+robot_config = KochFollowerConfig(
+    port="/dev/tty.usbmodem585A0076841",
+    id="my_red_robot_arm",
+    cameras=camera_config
+)
+
+teleop_config = KochLeaderConfig(
+    port="/dev/tty.usbmodem58760431551",
+    id="my_blue_leader_arm",
+)
+
+robot = KochFollower(robot_config)
+teleop_device = KochLeader(teleop_config)
+robot.connect()
+teleop_device.connect()
+
+while True:
+    observation = robot.get_observation()
+    action = teleop_device.get_action()
+    robot.send_action(action)
+```
+</hfoption>
+</hfoptions>
+
+## Record a dataset
+
+Once you're familiar with teleoperation, you can record your first dataset.
+
+We use the Hugging Face hub features for uploading your dataset. If you haven't previously used the Hub, make sure you can log in via the CLI using a write-access token. This token can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens).
+
+Add your token to the CLI by running this command:
+```bash
+huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
+```
+
+Then store your Hugging Face repository name in a variable:
+```bash
+HF_USER=$(huggingface-cli whoami | head -n 1)
+echo $HF_USER
+```
+
+Now you can record a dataset. To record 2 episodes and upload your dataset to the hub, execute this command tailored to the SO101.
+```bash
+python -m lerobot.record \
+    --robot.type=so101_follower \
+    --robot.port=/dev/tty.usbmodem585A0076841 \
+    --robot.id=my_awesome_follower_arm \
+    --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}}" \
+    --teleop.type=so101_leader \
+    --teleop.port=/dev/tty.usbmodem58760431551 \
+    --teleop.id=my_awesome_leader_arm \
+    --display_data=true \
+    --dataset.repo_id=${HF_USER}/record-test \
+    --dataset.num_episodes=2 \
+    --dataset.single_task="Grab the black cube"
+```
+
+#### Dataset upload
+Locally, your dataset is stored in this folder: `~/.cache/huggingface/lerobot/{repo-id}`. At the end of data recording, your dataset will be uploaded on your Hugging Face page (e.g. https://huggingface.co/datasets/cadene/so101_test) that you can obtain by running:
+```bash
+echo https://huggingface.co/datasets/${HF_USER}/record-test
+```
+Your dataset will be automatically tagged with `LeRobot` for the community to find it easily, and you can also add custom tags (in this case `tutorial` for example).
+
+You can look for other LeRobot datasets on the hub by searching for `LeRobot` [tags](https://huggingface.co/datasets?other=LeRobot).
+
+#### Record function
+
+The `record` function provides a suite of tools for capturing and managing data during robot operation:
+
+##### 1. Data Storage
+- Data is saved to disk during recording, using the `LeRobotDataset` format.
+- By default, the dataset is pushed to your Hugging Face page after recording.
+  - To disable uploading, use `--dataset.push_to_hub=False`.
+
+##### 2. Checkpointing and Resuming
+- Checkpoints are automatically created during recording.
+- If an issue occurs, you can resume by re-running the same command with `--resume=true`.
+- To start recording from scratch, **manually delete** the dataset directory.
+
+##### 3. Recording Parameters
+Set the flow of data recording using command-line arguments:
+- `--dataset.episode_time_s=60`
+  Duration of each data recording episode (default: **60 seconds**).
+- `--dataset.reset_time_s=60`
+  Duration for resetting the environment after each episode (default: **60 seconds**).
+- `--dataset.num_episodes=50`
+  Total number of episodes to record (default: **50**).
+
+##### 4. Keyboard Controls During Recording
+Control the data recording flow using keyboard shortcuts:
+- Press **Right Arrow (`→`)**: Stop the current episode (or the reset period) early and move on to the next one.
+- Press **Left Arrow (`←`)**: Cancel the current episode and re-record it.
+- Press **Escape (`ESC`)**: Immediately stop the session, encode videos, and upload the dataset.
+
+#### Tips for gathering data
+
+Once you're comfortable with data recording, you can create a larger dataset for training. A good starting task is grasping an object at different locations and placing it in a bin. We suggest recording at least 50 episodes, with 10 episodes per location. Keep the cameras fixed and maintain consistent grasping behavior throughout the recordings. Also make sure the object you are manipulating is visible on the cameras. A good rule of thumb is that you should be able to do the task yourself by only looking at the camera images.
+
+In the following sections, you’ll train your neural network. After achieving reliable grasping performance, you can start introducing more variations during data collection, such as additional grasp locations, different grasping techniques, and altering camera positions.
+
+Avoid adding too much variation too quickly, as it may hinder your results.
+
+If you want to dive deeper into this important topic, you can check out the [blog post](https://huggingface.co/blog/lerobot-datasets#what-makes-a-good-dataset) we wrote on what makes a good dataset.
+
+
+#### Troubleshooting:
+- On Linux, if the left and right arrow keys and escape key don't have any effect during data recording, make sure you've set the `$DISPLAY` environment variable. See [pynput limitations](https://pynput.readthedocs.io/en/latest/limitations.html#linux).
+
+## Visualize a dataset
+
+If you uploaded your dataset to the hub with `--dataset.push_to_hub=true`, you can [visualize your dataset online](https://huggingface.co/spaces/lerobot/visualize_dataset) by copy-pasting your repo id given by:
+```bash
+echo ${HF_USER}/record-test
+```
+
+## Replay an episode
+
+A useful feature is the `replay` function, which allows you to replay any episode that you've recorded or episodes from any dataset out there. This function helps you test the repeatability of your robot's actions and assess transferability across robots of the same model.
+
+You can replay the first episode on your robot with:
+```bash
+python -m lerobot.replay \
+    --robot.type=so101_follower \
+    --robot.port=/dev/tty.usbmodem58760431541 \
+    --robot.id=my_awesome_follower_arm \
+    --dataset.repo_id=${HF_USER}/record-test \
+    --dataset.episode=0 # choose the episode you want to replay
+```
+
+Your robot should replicate movements similar to those you recorded. For example, check out [this video](https://x.com/RemiCadene/status/1793654950905680090) where we use `replay` on an Aloha robot from [Trossen Robotics](https://www.trossenrobotics.com).
+
+## Train a policy
+
+To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command:
+```bash
+python lerobot/scripts/train.py \
+  --dataset.repo_id=${HF_USER}/so101_test \
+  --policy.type=act \
+  --output_dir=outputs/train/act_so101_test \
+  --job_name=act_so101_test \
+  --policy.device=cuda \
+  --wandb.enable=true
+```
+
+Let's explain the command:
+1. We provided the dataset as argument with `--dataset.repo_id=${HF_USER}/so101_test`.
+2. We provided the policy with `policy.type=act`. This loads configurations from [`configuration_act.py`](../lerobot/common/policies/act/configuration_act.py). Importantly, this policy will automatically adapt to the number of motor states, motor actions and cameras of your robot (e.g. `laptop` and `phone`) which have been saved in your dataset.
+3. We provided `policy.device=cuda` since we are training on an Nvidia GPU, but you could use `policy.device=mps` to train on Apple silicon.
+4. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`.
+
+Training should take several hours. You will find checkpoints in `outputs/train/act_so101_test/checkpoints`.
+
+To resume training from a checkpoint, below is an example command to resume from `last` checkpoint of the `act_so101_test` policy:
+```bash
+python lerobot/scripts/train.py \
+  --config_path=outputs/train/act_so101_test/checkpoints/last/pretrained_model/train_config.json \
+  --resume=true
+```
+
+#### Upload policy checkpoints
+
+Once training is done, upload the latest checkpoint with:
+```bash
+huggingface-cli upload ${HF_USER}/act_so101_test \
+  outputs/train/act_so101_test/checkpoints/last/pretrained_model
+```
+
+You can also upload intermediate checkpoints with:
+```bash
+CKPT=010000
+huggingface-cli upload ${HF_USER}/act_so101_test${CKPT} \
+  outputs/train/act_so101_test/checkpoints/${CKPT}/pretrained_model
+```
+
+## Evaluate your policy
+
+You can use the `record` script from [`lerobot/record.py`](https://github.com/huggingface/lerobot/blob/main/lerobot/record.py) but with a policy checkpoint as input. For instance, run this command to record 10 evaluation episodes:
+```bash
+python -m lerobot.record  \
+  --robot.type=so100_follower \
+  --robot.port=/dev/ttyACM1 \
+  --robot.cameras="{ up: {type: opencv, index_or_path: /dev/video10, width: 640, height: 480, fps: 30}, side: {type: intelrealsense, serial_number_or_name: 233522074606, width: 640, height: 480, fps: 30}}" \
+  --robot.id=my_awesome_follower_arm \
+  --teleop.type=so100_leader \
+  --teleop.port=/dev/ttyACM0 \
+  --teleop.id=my_awesome_leader_arm \
+  --display_data=false \
+  --dataset.repo_id=$HF_USER/eval_so100 \
+  --dataset.single_task="Put lego brick into the transparent box" \
+  --policy.path=${HF_USER}/my_policy
+```
+
+As you can see, it's almost the same command as previously used to record your training dataset. Two things changed:
+1. There is an additional `--policy.path` argument, which indicates the path to your policy checkpoint (e.g. `outputs/train/eval_act_so101_test/checkpoints/last/pretrained_model`). You can also use the model repository if you uploaded a model checkpoint to the hub (e.g. `${HF_USER}/act_so101_test`).
+2. The name of the dataset begins with `eval` to reflect that you are running inference (e.g. `${HF_USER}/eval_act_so101_test`).
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -0,0 +1,19 @@
+<div class="flex justify-center">
+  <a target="_blank" href="https://huggingface.co/lerobot">
+      <img alt="LeRobot logo" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot-logo-thumbnail.png" style="width: 100%"></img>
+  </a>
+</div>
+
+# LeRobot
+
+**State-of-the-art machine learning for real-world robotics**
+
+🤗 LeRobot aims to provide models, datasets, and tools for real-world robotics in PyTorch. The goal is to lower the barrier to entry to robotics so that everyone can contribute and benefit from sharing datasets and pretrained models.
+
+🤗 LeRobot contains state-of-the-art approaches that have been shown to transfer to the real-world with a focus on imitation learning and reinforcement learning.
+
+🤗 LeRobot already provides a set of pretrained models, datasets with human collected demonstrations, and simulated environments so that everyone can get started.
+
+🤗 LeRobot hosts pretrained models and datasets on the LeRobot HuggingFace page.
+
+Join the LeRobot community on [Discord](https://discord.gg/s3KuuzsPFb)
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -0,0 +1,70 @@
+# Installation
+
+## Install LeRobot
+
+Currently only available from source.
+
+Download our source code:
+```bash
+git clone https://github.com/huggingface/lerobot.git
+cd lerobot
+```
+
+Create a virtual environment with Python 3.10, using [`Miniconda`](https://docs.anaconda.com/miniconda/install/#quick-command-line-install)
+```bash
+conda create -y -n lerobot python=3.10
+```
+
+Then activate your conda environment. You have to do this each time you open a shell to use lerobot:
+```bash
+conda activate lerobot
+```
+
+When using `miniconda`, install `ffmpeg` in your environment:
+```bash
+conda install ffmpeg -c conda-forge
+```
+
+> [!TIP]
+> This usually installs `ffmpeg 7.X` for your platform compiled with the `libsvtav1` encoder. If `libsvtav1` is not supported (check supported encoders with `ffmpeg -encoders`), you can:
+>  - _[On any platform]_ Explicitly install `ffmpeg 7.X` using:
+>  ```bash
+>  conda install ffmpeg=7.1.1 -c conda-forge
+>  ```
+>  - _[On Linux only]_ If you want to bring your own ffmpeg: Install [ffmpeg build dependencies](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#GettheDependencies) and [compile ffmpeg from source with libsvtav1](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#libsvtav1), and make sure the ffmpeg binary you use is the one from your install (check with `which ffmpeg`).
+
+Install 🤗 LeRobot:
+```bash
+pip install -e .
+```
+
+### Troubleshooting
+If you encounter build errors, you may need to install additional dependencies: `cmake`, `build-essential`, and `ffmpeg libs`.
+To install these on Linux, run:
+```bash
+sudo apt-get install cmake build-essential python-dev pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libswscale-dev libswresample-dev libavfilter-dev pkg-config
+```
+For other systems, see: [Compiling PyAV](https://pyav.org/docs/develop/overview/installation.html#bring-your-own-ffmpeg)
+
+## Optional dependencies
+
+LeRobot provides optional extras for specific functionalities. Multiple extras can be combined (e.g., `.[aloha,feetech]`). For all available extras, refer to `pyproject.toml`.
+
+### Simulations
+Install environment packages: `aloha` ([gym-aloha](https://github.com/huggingface/gym-aloha)), `xarm` ([gym-xarm](https://github.com/huggingface/gym-xarm)), or `pusht` ([gym-pusht](https://github.com/huggingface/gym-pusht))
+Example:
+```bash
+pip install -e ".[aloha]" # or "[pusht]" for example
+```
+
+### Motor Control
+For Koch v1.1, install the Dynamixel SDK; for SO100/SO101/Moss, install the Feetech SDK.
+```bash
+pip install -e ".[feetech]" # or "[dynamixel]" for example
+```
+
+### Experiment Tracking
+To use [Weights and Biases](https://docs.wandb.ai/quickstart) for experiment tracking, log in with
+```bash
+wandb login
+```
--- a/docs/source/koch.mdx
+++ b/docs/source/koch.mdx
@@ -0,0 +1 @@
+../../lerobot/common/robots/koch_follower/koch.mdx
--- a/docs/source/lekiwi.mdx
+++ b/docs/source/lekiwi.mdx
@@ -0,0 +1 @@
+../../lerobot/common/robots/lekiwi/lekiwi.mdx
--- a/docs/source/so100.mdx
+++ b/docs/source/so100.mdx
@@ -0,0 +1 @@
+../../lerobot/common/robots/so100_follower/so100.mdx
--- a/docs/source/so101.mdx
+++ b/docs/source/so101.mdx
@@ -0,0 +1 @@
+../../lerobot/common/robots/so101_follower/so101.mdx
--- a/examples/1_load_lerobot_dataset.py
+++ b/examples/1_load_lerobot_dataset.py
@@ -1,80 +1,136 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 This script demonstrates the use of `LeRobotDataset` class for handling and processing robotic datasets from Hugging Face.
 It illustrates how to load datasets, manipulate them, and apply transformations suitable for machine learning tasks in PyTorch.

 Features included in this script:
- Loading a dataset and accessing its properties.
- Filtering data by episode number.
- Converting tensor data for visualization.
- Saving video files from dataset frames.
+- Viewing a dataset's metadata and exploring its properties.
+- Loading an existing dataset from the hub or a subset of it.
+- Accessing frames by episode number.
 - Using advanced dataset features like timestamp-based frame selection.
 - Demonstrating compatibility with PyTorch DataLoader for batch processing.

 The script ends with examples of how to batch process data using PyTorch's DataLoader.
 """

-from pathlib import Path
 from pprint import pprint

-import imageio
 import torch
+from huggingface_hub import HfApi

 import lerobot
-from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata

+# We ported a number of existing datasets ourselves, use this to see the list:
 print("List of available datasets:")
 pprint(lerobot.available_datasets)

-# Let's take one for this example
-repo_id = "lerobot/pusht"
+# You can also browse through the datasets created/ported by the community on the hub using the hub api:
+hub_api = HfApi()
+repo_ids = [info.id for info in hub_api.list_datasets(task_categories="robotics", tags=["LeRobot"])]
+pprint(repo_ids)

-# You can easily load a dataset from a Hugging Face repository
+# Or simply explore them in your web browser directly at:
+# https://huggingface.co/datasets?other=LeRobot
+
+# Let's take this one for this example
+repo_id = "lerobot/aloha_mobile_cabinet"
+# We can have a look and fetch its metadata to know more about it:
+ds_meta = LeRobotDatasetMetadata(repo_id)
+
+# By instantiating just this class, you can quickly access useful information about the content and the
+# structure of the dataset without downloading the actual data yet (only metadata files — which are
+# lightweight).
+print(f"Total number of episodes: {ds_meta.total_episodes}")
+print(f"Average number of frames per episode: {ds_meta.total_frames / ds_meta.total_episodes:.3f}")
+print(f"Frames per second used during data collection: {ds_meta.fps}")
+print(f"Robot type: {ds_meta.robot_type}")
+print(f"keys to access images from cameras: {ds_meta.camera_keys=}\n")
+
+print("Tasks:")
+print(ds_meta.tasks)
+print("Features:")
+pprint(ds_meta.features)
+
+# You can also get a short summary by simply printing the object:
+print(ds_meta)
+
+# You can then load the actual dataset from the hub.
+# Either load any subset of episodes:
+dataset = LeRobotDataset(repo_id, episodes=[0, 10, 11, 23])
+
+# And see how many frames you have:
+print(f"Selected episodes: {dataset.episodes}")
+print(f"Number of episodes selected: {dataset.num_episodes}")
+print(f"Number of frames selected: {dataset.num_frames}")
+
+# Or simply load the entire dataset:
 dataset = LeRobotDataset(repo_id)
+print(f"Number of episodes selected: {dataset.num_episodes}")
+print(f"Number of frames selected: {dataset.num_frames}")

-# LeRobotDataset is actually a thin wrapper around an underlying Hugging Face dataset
-# (see https://huggingface.co/docs/datasets/index for more information).
-print(dataset)
+# The previous metadata class is contained in the 'meta' attribute of the dataset:
+print(dataset.meta)
+
+# LeRobotDataset actually wraps an underlying Hugging Face dataset
+# (see https://huggingface.co/docs/datasets for more information).
 print(dataset.hf_dataset)

-# And provides additional utilities for robotics and compatibility with Pytorch
-print(f"\naverage number of frames per episode: {dataset.num_samples / dataset.num_episodes:.3f}")
-print(f"frames per second used during data collection: {dataset.fps=}")
-print(f"keys to access images from cameras: {dataset.camera_keys=}\n")
-
-# Access frame indexes associated to first episode
+# LeRobot datasets also subclass PyTorch datasets so you can do everything you know and love from working
+# with the latter, like iterating through the dataset.
+# The __getitem__ iterates over the frames of the dataset. Since our datasets are also structured by
+# episodes, you can access the frame indices of any episode using the episode_data_index. Here, we access
+# frame indices associated to the first episode:
 episode_index = 0
 from_idx = dataset.episode_data_index["from"][episode_index].item()
 to_idx = dataset.episode_data_index["to"][episode_index].item()

-# LeRobot datasets actually subclass PyTorch datasets so you can do everything you know and love from working
-# with the latter, like iterating through the dataset. Here we grab all the image frames.
-frames = [dataset[idx]["observation.image"] for idx in range(from_idx, to_idx)]
+# Then we grab all the image frames from the first camera:
+camera_key = dataset.meta.camera_keys[0]
+frames = [dataset[idx][camera_key] for idx in range(from_idx, to_idx)]

-# Video frames are now float32 in range [0,1] channel first (c,h,w) to follow pytorch convention. To visualize
-# them, we convert to uint8 in range [0,255]
-frames = [(frame * 255).type(torch.uint8) for frame in frames]
-# and to channel last (h,w,c).
-frames = [frame.permute((1, 2, 0)).numpy() for frame in frames]
+# The objects returned by the dataset are all torch.Tensors
+print(type(frames[0]))
+print(frames[0].shape)

-# Finally, we save the frames to a mp4 video for visualization.
-Path("outputs/examples/1_load_lerobot_dataset").mkdir(parents=True, exist_ok=True)
-imageio.mimsave("outputs/examples/1_load_lerobot_dataset/episode_0.mp4", frames, fps=dataset.fps)
+# Since we're using pytorch, the shape is in pytorch, channel-first convention (c, h, w).
+# We can compare this shape with the information available for that feature
+pprint(dataset.features[camera_key])
+# In particular:
+print(dataset.features[camera_key]["shape"])
+# The shape is in (h, w, c) which is a more universal format.

 # For many machine learning applications we need to load the history of past observations or trajectories of
 # future actions. Our datasets can load previous and future frames for each key/modality, using timestamps
 # differences with the current loaded frame. For instance:
 delta_timestamps = {
    # loads 4 images: 1 second before current frame, 500 ms before, 200 ms before, and current frame
-    "observation.image": [-1, -0.5, -0.20, 0],
-    # loads 8 state vectors: 1.5 seconds before, 1 second before, ... 20 ms, 10 ms, and current frame
-    "observation.state": [-1.5, -1, -0.5, -0.20, -0.10, -0.02, -0.01, 0],
+    camera_key: [-1, -0.5, -0.20, 0],
+    # loads 6 state vectors: 1.5 seconds before, 1 second before, ... 200 ms, 100 ms, and current frame
+    "observation.state": [-1.5, -1, -0.5, -0.20, -0.10, 0],
    # loads 64 action vectors: current frame, 1 frame in the future, 2 frames, ... 63 frames in the future
    "action": [t / dataset.fps for t in range(64)],
 }
+# Note that in any case, these delta_timestamps values need to be multiples of (1/fps) so that added to any
+# timestamp, you still get a valid timestamp.
+
 dataset = LeRobotDataset(repo_id, delta_timestamps=delta_timestamps)
-print(f"\n{dataset[0]['observation.image'].shape=}")  # (4,c,h,w)
-print(f"{dataset[0]['observation.state'].shape=}")  # (8,c)
-print(f"{dataset[0]['action'].shape=}\n")  # (64,c)
+print(f"\n{dataset[0][camera_key].shape=}")  # (4, c, h, w)
+print(f"{dataset[0]['observation.state'].shape=}")  # (6, c)
+print(f"{dataset[0]['action'].shape=}\n")  # (64, c)

 # Finally, our datasets are fully compatible with PyTorch dataloaders and samplers because they are just
 # PyTorch datasets.
@@ -84,8 +140,9 @@ dataloader = torch.utils.data.DataLoader(
    batch_size=32,
    shuffle=True,
 )
+
 for batch in dataloader:
-    print(f"{batch['observation.image'].shape=}")  # (32,4,c,h,w)
-    print(f"{batch['observation.state'].shape=}")  # (32,8,c)
-    print(f"{batch['action'].shape=}")  # (32,64,c)
+    print(f"{batch[camera_key].shape=}")  # (32, 4, c, h, w)
+    print(f"{batch['observation.state'].shape=}")  # (32, 6, c)
+    print(f"{batch['action'].shape=}")  # (32, 64, c)
    break
--- a/examples/2_evaluate_pretrained_policy.py
+++ b/examples/2_evaluate_pretrained_policy.py
@@ -1,6 +1,25 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
-This scripts demonstrates how to evaluate a pretrained policy from the HuggingFace Hub or from your local
+This script demonstrates how to evaluate a pretrained policy from the HuggingFace Hub or from your local
 training outputs directory. In the latter case, you might want to run examples/3_train_policy.py first.
+
+It requires the installation of the 'gym_pusht' simulation environment. Install it by running:
+```bash
+pip install -e ".[pusht]"
+```
 """

 from pathlib import Path
@@ -10,7 +29,6 @@ import gymnasium as gym
 import imageio
 import numpy
 import torch
-from huggingface_hub import snapshot_download

 from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy

@@ -18,16 +36,15 @@ from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
 output_directory = Path("outputs/eval/example_pusht_diffusion")
 output_directory.mkdir(parents=True, exist_ok=True)

-device = torch.device("cuda")
+# Select your device
+device = "cuda"

-# Download the diffusion policy for pusht environment
-pretrained_policy_path = Path(snapshot_download("lerobot/diffusion_pusht"))
-# OR uncomment the following to evaluate a policy from the local outputs/train folder.
+# Provide the [hugging face repo id](https://huggingface.co/lerobot/diffusion_pusht):
+pretrained_policy_path = "lerobot/diffusion_pusht"
+# OR a path to a local outputs/train folder.
 # pretrained_policy_path = Path("outputs/train/example_pusht_diffusion")

 policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
-policy.eval()
-policy.to(device)

 # Initialize evaluation environment to render two observation types:
 # an image of the scene and state/position of the agent. The environment
@@ -38,7 +55,17 @@ env = gym.make(
    max_episode_steps=300,
 )

-# Reset the policy and environmens to prepare for rollout
+# We can verify that the shapes of the features expected by the policy match the ones from the observations
+# produced by the environment
+print(policy.config.input_features)
+print(env.observation_space)
+
+# Similarly, we can check that the actions produced by the policy will match the actions expected by the
+# environment
+print(policy.config.output_features)
+print(env.action_space)
+
+# Reset the policy and environments to prepare for rollout
 policy.reset()
 numpy_observation, info = env.reset(seed=42)

@@ -92,7 +119,7 @@ while not done:
    rewards.append(reward)
    frames.append(env.render())

-    # The rollout is considered done when the success state is reach (i.e. terminated is True),
+    # The rollout is considered done when the success state is reached (i.e. terminated is True),
    # or the maximum number of iterations is reached (i.e. truncated is True)
    done = terminated | truncated | done
    step += 1
--- a/examples/3_train_policy.py
+++ b/examples/3_train_policy.py
@@ -1,4 +1,18 @@
-"""This scripts demonstrates how to train Diffusion Policy on the PushT environment.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This script demonstrates how to train Diffusion Policy on the PushT environment.

 Once you have trained a model with this script, you can try to evaluate it on
 examples/2_evaluate_pretrained_policy.py
@@ -8,72 +22,99 @@ from pathlib import Path

 import torch

-from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
+from lerobot.common.datasets.utils import dataset_to_policy_features
 from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig
 from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
+from lerobot.configs.types import FeatureType

-# Create a directory to store the training checkpoint.
-output_directory = Path("outputs/train/example_pusht_diffusion")
-output_directory.mkdir(parents=True, exist_ok=True)

-# Number of offline training steps (we'll only do offline training for this example.)
-# Adjust as you prefer. 5000 steps are needed to get something worth evaluating.
-training_steps = 5000
-device = torch.device("cuda")
-log_freq = 250
+def main():
+    # Create a directory to store the training checkpoint.
+    output_directory = Path("outputs/train/example_pusht_diffusion")
+    output_directory.mkdir(parents=True, exist_ok=True)

-# Set up the dataset.
-delta_timestamps = {
-    # Load the previous image and state at -0.1 seconds before current frame,
-    # then load current image and state corresponding to 0.0 second.
-    "observation.image": [-0.1, 0.0],
-    "observation.state": [-0.1, 0.0],
-    # Load the previous action (-0.1), the next action to be executed (0.0),
-    # and 14 future actions with a 0.1 seconds spacing. All these actions will be
-    # used to supervise the policy.
-    "action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
-}
-dataset = LeRobotDataset("lerobot/pusht", delta_timestamps=delta_timestamps)
+    # Select your device
+    device = torch.device("cuda")

-# Set up the the policy.
-# Policies are initialized with a configuration class, in this case `DiffusionConfig`.
-# For this example, no arguments need to be passed because the defaults are set up for PushT.
-# If you're doing something different, you will likely need to change at least some of the defaults.
-cfg = DiffusionConfig()
-policy = DiffusionPolicy(cfg, dataset_stats=dataset.stats)
-policy.train()
-policy.to(device)
+    # Number of offline training steps (we'll only do offline training for this example.)
+    # Adjust as you prefer. 5000 steps are needed to get something worth evaluating.
+    training_steps = 5000
+    log_freq = 1

-optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)
+    # When starting from scratch (i.e. not from a pretrained policy), we need to specify 2 things before
+    # creating the policy:
+    #   - input/output shapes: to properly size the policy
+    #   - dataset stats: for normalization and denormalization of input/outputs
+    dataset_metadata = LeRobotDatasetMetadata("lerobot/pusht")
+    features = dataset_to_policy_features(dataset_metadata.features)
+    output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
+    input_features = {key: ft for key, ft in features.items() if key not in output_features}

-# Create dataloader for offline training.
-dataloader = torch.utils.data.DataLoader(
-    dataset,
-    num_workers=4,
-    batch_size=64,
-    shuffle=True,
-    pin_memory=device != torch.device("cpu"),
-    drop_last=True,
-)
+    # Policies are initialized with a configuration class, in this case `DiffusionConfig`. For this example,
+    # we'll just use the defaults and so no arguments other than input/output features need to be passed.
+    cfg = DiffusionConfig(input_features=input_features, output_features=output_features)

-# Run training loop.
-step = 0
-done = False
-while not done:
-    for batch in dataloader:
-        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
-        output_dict = policy.forward(batch)
-        loss = output_dict["loss"]
-        loss.backward()
-        optimizer.step()
-        optimizer.zero_grad()
+    # We can now instantiate our policy with this config and the dataset stats.
+    policy = DiffusionPolicy(cfg, dataset_stats=dataset_metadata.stats)
+    policy.train()
+    policy.to(device)

-        if step % log_freq == 0:
-            print(f"step: {step} loss: {loss.item():.3f}")
-        step += 1
-        if step >= training_steps:
-            done = True
-            break
+    # Another policy-dataset interaction is with the delta_timestamps. Each policy expects a given number of frames
+    # which can differ for inputs, outputs and rewards (if there are some).
+    delta_timestamps = {
+        "observation.image": [i / dataset_metadata.fps for i in cfg.observation_delta_indices],
+        "observation.state": [i / dataset_metadata.fps for i in cfg.observation_delta_indices],
+        "action": [i / dataset_metadata.fps for i in cfg.action_delta_indices],
+    }

-# Save a policy checkpoint.
-policy.save_pretrained(output_directory)
+    # In this case with the standard configuration for Diffusion Policy, it is equivalent to this:
+    delta_timestamps = {
+        # Load the previous image and state at -0.1 seconds before current frame,
+        # then load current image and state corresponding to 0.0 second.
+        "observation.image": [-0.1, 0.0],
+        "observation.state": [-0.1, 0.0],
+        # Load the previous action (-0.1), the next action to be executed (0.0),
+        # and 14 future actions with a 0.1 seconds spacing. All these actions will be
+        # used to supervise the policy.
+        "action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
+    }
+
+    # We can then instantiate the dataset with this delta_timestamps configuration.
+    dataset = LeRobotDataset("lerobot/pusht", delta_timestamps=delta_timestamps)
+
+    # Then we create our optimizer and dataloader for offline training.
+    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=4,
+        batch_size=64,
+        shuffle=True,
+        pin_memory=device.type != "cpu",
+        drop_last=True,
+    )
+
+    # Run training loop.
+    step = 0
+    done = False
+    while not done:
+        for batch in dataloader:
+            batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
+            loss, _ = policy.forward(batch)
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+
+            if step % log_freq == 0:
+                print(f"step: {step} loss: {loss.item():.3f}")
+            step += 1
+            if step >= training_steps:
+                done = True
+                break
+
+    # Save a policy checkpoint.
+    policy.save_pretrained(output_directory)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/4_train_policy_with_script.md
+++ b/examples/4_train_policy_with_script.md
@@ -1,183 +1,274 @@
-This tutorial will explain the training script, how to use it, and particularly the use of Hydra to configure everything needed for the training run.
+This tutorial will explain the training script, how to use it, and particularly how to configure everything needed for the training run.
+> **Note:** The following assumes you're running these commands on a machine equipped with a cuda GPU. If you don't have one (or if you're using a Mac), you can add `--policy.device=cpu` (`--policy.device=mps` respectively). However, be advised that the code executes much slower on cpu.
+

 ## The training script

-LeRobot offers a training script at [`lerobot/scripts/train.py`](../../lerobot/scripts/train.py). At a high level it does the following:
+LeRobot offers a training script at [`lerobot/scripts/train.py`](../lerobot/scripts/train.py). At a high level it does the following:

- Loads a Hydra configuration file for the following steps (more on Hydra in a moment).
- Makes a simulation environment.
- Makes a dataset corresponding to that simulation environment.
- Makes a policy.
+- Initializes/loads a configuration for the following steps.
+- Instantiates a dataset.
+- (Optional) Instantiates a simulation environment corresponding to that dataset.
+- Instantiates a policy.
 - Runs a standard training loop with forward pass, backward pass, optimization step, and occasional logging, evaluation (of the policy on the environment), and checkpointing.

-## Basics of how we use Hydra
-
-Explaining the ins and outs of [Hydra](https://hydra.cc/docs/intro/) is beyond the scope of this document, but here we'll share the main points you need to know.
-
-First, `lerobot/configs` has a directory structure like this:
-
-```
-.
-├── default.yaml
-├── env
-│   ├── aloha.yaml
-│   ├── pusht.yaml
-│   └── xarm.yaml
-└── policy
-    ├── act.yaml
-    ├── diffusion.yaml
-    └── tdmpc.yaml
-```
-
-**_For brevity, in the rest of this document we'll drop the leading `lerobot/configs` path. So `default.yaml` really refers to `lerobot/configs/default.yaml`._**
-
-When you run the training script with
+## Overview of the configuration system

+In the training script, the main function `train` expects a `TrainPipelineConfig` object:
 ```python
-python lerobot/scripts/train.py
+# train.py
+@parser.wrap()
+def train(cfg: TrainPipelineConfig):
 ```

-Hydra is set up to read `default.yaml` (via the `@hydra.main` decorator). If you take a look at the `@hydra.main`'s arguments you will see `config_path="../configs", config_name="default"`. At the top of `default.yaml`, is a `defaults` section which looks likes this:
+You can inspect the `TrainPipelineConfig` defined in [`lerobot/configs/train.py`](../lerobot/configs/train.py) (which is heavily commented and meant to be a reference to understand any option)

-```yaml
-defaults:
-  - _self_
-  - env: pusht
-  - policy: diffusion
+When running the script, inputs for the command line are parsed thanks to the `@parser.wrap()` decorator and an instance of this class is automatically generated. Under the hood, this is done with [Draccus](https://github.com/dlwh/draccus) which is a tool dedicated to this purpose. If you're familiar with Hydra, Draccus can similarly load configurations from config files (.json, .yaml) and also override their values through command line inputs. Unlike Hydra, these configurations are pre-defined in the code through dataclasses rather than being defined entirely in config files. This allows for more rigorous serialization/deserialization, typing, and to manipulate configuration as objects directly in the code and not as dictionaries or namespaces (which enables nice features in an IDE such as autocomplete, jump-to-def, etc.)
+
+Let's have a look at a simplified example. Amongst other attributes, the training config has the following attributes:
+```python
+@dataclass
+class TrainPipelineConfig:
+    dataset: DatasetConfig
+    env: envs.EnvConfig | None = None
+    policy: PreTrainedConfig | None = None
+```
+in which `DatasetConfig` for example is defined as such:
+```python
+@dataclass
+class DatasetConfig:
+    repo_id: str
+    episodes: list[int] | None = None
+    video_backend: str = "pyav"
 ```

-This logic tells Hydra to incorporate configuration parameters from `env/pusht.yaml` and `policy/diffusion.yaml`. _Note: Be aware of the order as any configuration parameters with the same name will be overidden. Thus, `default.yaml` is overriden by `env/pusht.yaml`  which is overidden by `policy/diffusion.yaml`_.
+This creates a hierarchical relationship where, for example assuming we have a `cfg` instance of `TrainPipelineConfig`, we can access the `repo_id` value with `cfg.dataset.repo_id`.
+From the command line, we can specify this value by using a very similar syntax `--dataset.repo_id=repo/id`.

-Then, `default.yaml` also contains common configuration parameters such as `device: cuda` or `use_amp: false` (for enabling fp16 training). Some other parameters are set to `???` which indicates that they are expected to be set in additional yaml files. For instance, `training.offline_steps: ???` in `default.yaml` is set to `200000` in `diffusion.yaml`.
+By default, every field takes its default value specified in the dataclass. If a field doesn't have a default value, it needs to be specified either from the command line or from a config file – whose path is also given in the command line (more on this below). In the example above, the `dataset` field doesn't have a default value, which means it must be specified.

-Thanks to this `defaults` section in `default.yaml`, if you want to train Diffusion Policy with PushT, you really only need to run:

-```bash
-python lerobot/scripts/train.py
-```
-
-However, you can be more explicit and launch the exact same Diffusion Policy training on PushT with:
-
-```bash
-python lerobot/scripts/train.py policy=diffusion env=pusht
-```
-
-This way of overriding defaults via the CLI is especially useful when you want to change the policy and/or environment. For instance, you can train ACT on the default Aloha environment with:
-
-```bash
-python lerobot/scripts/train.py policy=act env=aloha
-```
-
-There are two things to note here:
- Config overrides are passed as `param_name=param_value`.
- Here we have overridden the defaults section. `policy=act` tells Hydra to use `policy/act.yaml`, and `env=aloha` tells Hydra to use `env/pusht.yaml`.
-
-_As an aside: we've set up all of our configurations so that they reproduce state-of-the-art results from papers in the literature._
-
-## Overriding configuration parameters in the CLI
-
-Now let's say that we want to train on a different task in the Aloha environment. If you look in `env/aloha.yaml` you will see something like:
-
-```yaml
-# lerobot/configs/env/aloha.yaml
-env:
-  task: AlohaInsertion-v0
-```
-
-And if you look in `policy/act.yaml` you will see something like:
-
-```yaml
-# lerobot/configs/policy/act.yaml
-dataset_repo_id: lerobot/aloha_sim_insertion_human
-```
-
-But our Aloha environment actually supports a cube transfer task as well. To train for this task, you could manually modify the two yaml configuration files respectively.
-
-First, we'd need to switch to using the cube transfer task for the ALOHA environment.
-
-```diff
-# lerobot/configs/env/aloha.yaml
-env:
-  task: AlohaInsertion-v0
-+  task: AlohaTransferCube-v0
-```
-
-Then, we'd also need to switch to using the cube transfer dataset.
-
-```diff
-# lerobot/configs/policy/act.yaml
-dataset_repo_id: lerobot/aloha_sim_insertion_human
-+dataset_repo_id: lerobot/aloha_sim_transfer_cube_human
-```
-
-Then, you'd be able to run:
-
-```bash
-python lerobot/scripts/train.py policy=act env=aloha
-```
-
-and you'd be training and evaluating on the cube transfer task.
-
-An alternative approach to editing the yaml configuration files, would be to override the defaults via the command line:
+## Specifying values from the CLI

+Let's say that we want to train [Diffusion Policy](../lerobot/common/policies/diffusion) on the [pusht](https://huggingface.co/datasets/lerobot/pusht) dataset, using the [gym_pusht](https://github.com/huggingface/gym-pusht) environment for evaluation. The command to do so would look like this:
 ```bash
 python lerobot/scripts/train.py \
-    policy=act \
-    dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
-    env=aloha \
-    env.task=AlohaTransferCube-v0
+    --dataset.repo_id=lerobot/pusht \
+    --policy.type=diffusion \
+    --env.type=pusht
 ```

-There's something new here. Notice the `.` delimiter used to traverse the configuration hierarchy. _But be aware that the `defaults` section is an exception. As you saw above, we didn't need to write `defaults.policy=act` in the CLI. `policy=act` was enough._
-
-Putting all that knowledge together, here's the command that was used to train https://huggingface.co/lerobot/act_aloha_sim_transfer_cube_human.
+Let's break this down:
+- To specify the dataset, we just need to specify its `repo_id` on the hub which is the only required argument in the `DatasetConfig`. The rest of the fields have default values and in this case we are fine with those so we can just add the option `--dataset.repo_id=lerobot/pusht`.
+- To specify the policy, we can just select diffusion policy using `--policy` appended with `.type`. Here, `.type` is a special argument which allows us to select config classes inheriting from `draccus.ChoiceRegistry` and that have been decorated with the `register_subclass()` method. To have a better explanation of this feature, have a look at this [Draccus demo](https://github.com/dlwh/draccus?tab=readme-ov-file#more-flexible-configuration-with-choice-types). In our code, we use this mechanism mainly to select policies, environments, robots, and some other components like optimizers. The policies available to select are located in [lerobot/common/policies](../lerobot/common/policies)
+- Similarly, we select the environment with `--env.type=pusht`. The different environment configs are available in [`lerobot/common/envs/configs.py`](../lerobot/common/envs/configs.py)

+Let's see another example. Let's say you've been training [ACT](../lerobot/common/policies/act) on [lerobot/aloha_sim_insertion_human](https://huggingface.co/datasets/lerobot/aloha_sim_insertion_human) using the [gym-aloha](https://github.com/huggingface/gym-aloha) environment for evaluation with:
 ```bash
 python lerobot/scripts/train.py \
-    hydra.run.dir=outputs/train/act_aloha_sim_transfer_cube_human \
-    device=cuda
-    env=aloha \
-    env.task=AlohaTransferCube-v0 \
-    dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
-    policy=act \
-    training.eval_freq=10000 \
-    training.log_freq=250 \
-    training.offline_steps=100000 \
-    training.save_model=true \
-    training.save_freq=25000 \
-    eval.n_episodes=50 \
-    eval.batch_size=50 \
-    wandb.enable=false \
+    --policy.type=act \
+    --dataset.repo_id=lerobot/aloha_sim_insertion_human \
+    --env.type=aloha \
+    --output_dir=outputs/train/act_aloha_insertion
 ```
+> Notice we added `--output_dir` to explicitly tell where to write outputs from this run (checkpoints, training state, configs etc.). This is not mandatory and if you don't specify it, a default directory will be created from the current date and time, env.type and policy.type. This will typically look like `outputs/train/2025-01-24/16-10-05_aloha_act`.

-There's one new thing here: `hydra.run.dir=outputs/train/act_aloha_sim_transfer_cube_human`, which specifies where to save the training output.
-
-## Using a configuration file not in `lerobot/configs`
-
-Above we discusses the our training script is set up such that Hydra looks for `default.yaml` in `lerobot/configs`. But, if you have a configuration file elsewhere in your filesystem you may use:
-
+We now want to train a different policy for aloha on another task. We'll change the dataset and use [lerobot/aloha_sim_transfer_cube_human](https://huggingface.co/datasets/lerobot/aloha_sim_transfer_cube_human) instead. Of course, we also need to change the task of the environment as well to match this other task.
+Looking at the [`AlohaEnv`](../lerobot/common/envs/configs.py) config, the task is `"AlohaInsertion-v0"` by default, which corresponds to the task we trained on in the command above. The [gym-aloha](https://github.com/huggingface/gym-aloha?tab=readme-ov-file#description) environment also has the `AlohaTransferCube-v0` task which corresponds to this other task we want to train on. Putting this together, we can train this new policy on this different task using:
 ```bash
-python lerobot/scripts/train.py --config-dir PARENT/PATH --config-name FILE_NAME_WITHOUT_EXTENSION
+python lerobot/scripts/train.py \
+    --policy.type=act \
+    --dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
+    --env.type=aloha \
+    --env.task=AlohaTransferCube-v0 \
+    --output_dir=outputs/train/act_aloha_transfer
 ```

-Note: here we use regular syntax for providing CLI arguments to a Python script, not Hydra's `param_name=param_value` syntax.
+## Loading from a config file

-As a concrete example, this becomes particularly handy when you have a folder with training outputs, and would like to re-run the training. For example, say you previously ran the training script with one of the earlier commands and have `outputs/train/my_experiment/checkpoints/pretrained_model/config.yaml`. This `config.yaml` file will have the full set of configuration parameters within it. To run the training with the same configuration again, do:
+Now, let's assume that we want to reproduce the run just above. That run has produced a `train_config.json` file in its checkpoints, which serializes the `TrainPipelineConfig` instance it used:
+```json
+{
+    "dataset": {
+        "repo_id": "lerobot/aloha_sim_transfer_cube_human",
+        "episodes": null,
+        ...
+    },
+    "env": {
+        "type": "aloha",
+        "task": "AlohaTransferCube-v0",
+        "fps": 50,
+        ...
+    },
+    "policy": {
+        "type": "act",
+        "n_obs_steps": 1,
+        ...
+    },
+    ...
+}
+```

+We can then simply load the config values from this file using:
 ```bash
-python lerobot/scripts/train.py --config-dir outputs/train/my_experiment/checkpoints/last/pretrained_model --config-name config
+python lerobot/scripts/train.py \
+    --config_path=outputs/train/act_aloha_transfer/checkpoints/last/pretrained_model/ \
+    --output_dir=outputs/train/act_aloha_transfer_2
+```
+`--config_path` is also a special argument which allows initializing the config from a local config file. It can point to a directory that contains `train_config.json` or to the config file itself directly.
+
+Similarly to Hydra, we can still override some parameters in the CLI if we want to, e.g.:
+```bash
+python lerobot/scripts/train.py \
+    --config_path=outputs/train/act_aloha_transfer/checkpoints/last/pretrained_model/ \
+    --output_dir=outputs/train/act_aloha_transfer_2 \
+    --policy.n_action_steps=80
+```
+> Note: While `--output_dir` is not required in general, in this case we need to specify it since it will otherwise take the value from the `train_config.json` (which is `outputs/train/act_aloha_transfer`). In order to prevent accidental deletion of previous run checkpoints, we raise an error if you're trying to write in an existing directory. This is not the case when resuming a run, which is what you'll learn next.
+
+`--config_path` can also accept the repo_id of a repo on the hub that contains a `train_config.json` file, e.g. running:
+```bash
+python lerobot/scripts/train.py --config_path=lerobot/diffusion_pusht
+```
+will start a training run with the same configuration used for training [lerobot/diffusion_pusht](https://huggingface.co/lerobot/diffusion_pusht)
+
+
+## Resume training
+
+Being able to resume a training run is important in case it crashed or aborted for any reason. We'll demonstrate how to do that here.
+
+Let's reuse the command from the previous run and add a few more options:
+```bash
+python lerobot/scripts/train.py \
+    --policy.type=act \
+    --dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
+    --env.type=aloha \
+    --env.task=AlohaTransferCube-v0 \
+    --log_freq=25 \
+    --save_freq=100 \
+    --output_dir=outputs/train/run_resumption
 ```

-Note that you may still use the regular syntax for config parameter overrides (eg: by adding `training.offline_steps=200000`).
+Here we've taken care to set up the log frequency and checkpointing frequency to low numbers so we can showcase resumption. You should be able to see some logging and have a first checkpoint within 1 minute (depending on hardware). Wait for the first checkpoint to happen; you should see a line that looks like this in your terminal:
+```
+INFO 2025-01-24 16:10:56 ts/train.py:263 Checkpoint policy after step 100
+```
+Now let's simulate a crash by killing the process (hit `ctrl`+`c`). We can then simply resume this run from the last checkpoint available with:
+```bash
+python lerobot/scripts/train.py \
+    --config_path=outputs/train/run_resumption/checkpoints/last/pretrained_model/ \
+    --resume=true
+```
+You should see from the logging that your training picks up from where it left off.
+
+Another reason for which you might want to resume a run is simply to extend training and add more training steps. The number of training steps is set by the option `--steps`, which is 100 000 by default.
+You could double the number of steps of the previous run with:
+```bash
+python lerobot/scripts/train.py \
+    --config_path=outputs/train/run_resumption/checkpoints/last/pretrained_model/ \
+    --resume=true \
+    --steps=200000
+```
+
+## Outputs of a run
+In the output directory, there will be a folder called `checkpoints` with the following structure:
+```bash
+outputs/train/run_resumption/checkpoints
+├── 000100  # checkpoint_dir for training step 100
+│   ├── pretrained_model/
+│   │   ├── config.json  # policy config
+│   │   ├── model.safetensors  # policy weights
+│   │   └── train_config.json  # train config
+│   └── training_state/
+│       ├── optimizer_param_groups.json  #  optimizer param groups
+│       ├── optimizer_state.safetensors  # optimizer state
+│       ├── rng_state.safetensors  # rng states
+│       ├── scheduler_state.json  # scheduler state
+│       └── training_step.json  # training step
+├── 000200
+└── last -> 000200  # symlink to the last available checkpoint
+```
+
+## Fine-tuning a pre-trained policy
+
+In addition to the features currently in Draccus, we've added a special `.path` argument for the policy, which allows loading a policy as you would with `PreTrainedPolicy.from_pretrained()`. In that case, `path` can be a local directory that contains a checkpoint or a repo_id pointing to a pretrained policy on the hub.
+
+For example, we could fine-tune a [policy pre-trained on the aloha transfer task](https://huggingface.co/lerobot/act_aloha_sim_transfer_cube_human) on the aloha insertion task. We can achieve this with:
+```bash
+python lerobot/scripts/train.py \
+    --policy.path=lerobot/act_aloha_sim_transfer_cube_human \
+    --dataset.repo_id=lerobot/aloha_sim_insertion_human \
+    --env.type=aloha \
+    --env.task=AlohaInsertion-v0
+```
+
+When doing so, keep in mind that the features of the fine-tuning dataset would have to match the input/output features of the pretrained policy.
+
+## Typical logs and metrics
+
+When you start the training process, you will first see your full configuration being printed in the terminal. You can check it to make sure that you configured your run correctly. The final configuration will also be saved with the checkpoint.
+
+After that, you will see a training log like this one:
+```
+INFO 2024-08-14 13:35:12 ts/train.py:192 step:0 smpl:64 ep:1 epch:0.00 loss:1.112 grdn:15.387 lr:2.0e-07 updt_s:1.738 data_s:4.774
+```
+or evaluation log:
+```
+INFO 2024-08-14 13:38:45 ts/train.py:226 step:100 smpl:6K ep:52 epch:0.25 ∑rwrd:20.693 success:0.0% eval_s:120.266
+```
+
+These logs will also be saved in wandb if `wandb.enable` is set to `true`. Here is the meaning of some abbreviations:
+- `smpl`: number of samples seen during training.
+- `ep`: number of episodes seen during training. An episode contains multiple samples in a complete manipulation task.
+- `epch`: number of times all unique samples are seen (epoch).
+- `grdn`: gradient norm.
+- `∑rwrd`: compute the sum of rewards in every evaluation episode and then take an average of them.
+- `success`: average success rate of eval episodes. Reward and success are usually different except for the sparse reward setting, where reward=1 only when the task is completed successfully.
+- `eval_s`: time to evaluate the policy in the environment, in seconds.
+- `updt_s`: time to update the network parameters, in seconds.
+- `data_s`: time to load a batch of data, in seconds.
+
+Some metrics are useful for initial performance profiling. For example, if you find the current GPU utilization is low via the `nvidia-smi` command and `data_s` sometimes is too high, you may need to modify batch size or number of dataloading workers to accelerate dataloading. We also recommend [pytorch profiler](https://github.com/huggingface/lerobot?tab=readme-ov-file#improve-your-code-with-profiling) for detailed performance probing.
+
+## In short
+
+We'll summarize here the main use cases to remember from this tutorial.
+
+#### Train a policy from scratch – CLI
+```bash
+python lerobot/scripts/train.py \
+    --policy.type=act \  # <- select 'act' policy
+    --env.type=pusht \  # <- select 'pusht' environment
+    --dataset.repo_id=lerobot/pusht  # <- train on this dataset
+```
+
+#### Train a policy from scratch - config file + CLI
+```bash
+python lerobot/scripts/train.py \
+    --config_path=path/to/pretrained_model \  # <- can also be a repo_id
+    --policy.n_action_steps=80  # <- you may still override values
+```
+
+#### Resume/continue a training run
+```bash
+python lerobot/scripts/train.py \
+    --config_path=checkpoint/pretrained_model/ \
+    --resume=true \
+    --steps=200000  # <- you can change some training parameters
+```
+
+#### Fine-tuning
+```bash
+python lerobot/scripts/train.py \
+    --policy.path=lerobot/act_aloha_sim_transfer_cube_human \  # <- can also be a local path to a checkpoint
+    --dataset.repo_id=lerobot/aloha_sim_insertion_human \
+    --env.type=aloha \
+    --env.task=AlohaInsertion-v0
+```

 ---

-So far we've seen how to train Diffusion Policy for PushT and ACT for ALOHA. Now, what if we want to train ACT for PushT? Well, there are aspects of the ACT configuration that are specific to the ALOHA environments, and these happen to be incompatible with PushT. Therefore, trying to run the following will almost certainly raise an exception of sorts (eg: feature dimension mismatch):
+Now that you know the basics of how to train a policy, you might want to know how to apply this knowledge to actual robots, or how to record your own datasets and train policies on your specific task.
+If that's the case, head over to the next tutorial [`7_get_started_with_real_robot.md`](./7_get_started_with_real_robot.md).

-```bash
-python lerobot/scripts/train.py policy=act env=pusht dataset_repo_id=lerobot/pusht
-```
-
-Please, head on over to our [advanced tutorial on adapting policy configuration to various environments](./advanced/train_act_pusht/train_act_pusht.md) to learn more.
-
-Or in the meantime, happy coding! 🤗
+Or in the meantime, happy training! 🤗
--- a/examples/5_resume_training.md
+++ b/examples/5_resume_training.md
@@ -1,37 +0,0 @@
-This tutorial explains how to resume a training run that you've started with the training script. If you don't know how our training script and configuration system works, please read [4_train_policy_with_script.md](./4_train_policy_with_script.md) first.
-
-## Basic training resumption
-
-Let's consider the example of training ACT for one of the ALOHA tasks. Here's a command that can achieve that:
-
-```bash
-python lerobot/scripts/train.py \
-    hydra.run.dir=outputs/train/run_resumption \
-    policy=act \
-    dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
-    env=aloha \
-    env.task=AlohaTransferCube-v0 \
-    training.log_freq=25 \
-    training.save_checkpoint=true \
-    training.save_freq=100
-```
-
-Here we're using the default dataset and environment for ACT, and we've taken care to set up the log frequency and checkpointing frequency to low numbers so we can test resumption. You should be able to see some logging and have a first checkpoint within 1 minute. Please interrupt the training after the first checkpoint.
-
-To resume, all that we have to do is run the training script, providing the run directory, and the resume option:
-
-```bash
-python lerobot/scripts/train.py \
-    hydra.run.dir=outputs/train/run_resumption \
-    resume=true
-```
-
-You should see from the logging that your training picks up from where it left off.
-
-Note that with `resume=true`, the configuration file from the last checkpoint in the training output directory is loaded. So it doesn't matter that we haven't provided all the other configuration parameters from our previous command (although there may be warnings to notify you that your command has a different configuration than than the checkpoint).
-
---
-
-Now you should know how to resume your training run in case it gets interrupted or you want to extend a finished training run.
-
-Happy coding! 🤗
--- a/examples/advanced/1_add_image_transforms.py
+++ b/examples/advanced/1_add_image_transforms.py
@@ -0,0 +1,67 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script demonstrates how to use torchvision's image transformation with LeRobotDataset for data
+augmentation purposes. The transformations are passed to the dataset as an argument upon creation, and
+transforms are applied to the observation images before they are returned in the dataset's __getitem__.
+"""
+
+from pathlib import Path
+
+from torchvision.transforms import ToPILImage, v2
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+
+dataset_repo_id = "lerobot/aloha_static_screw_driver"
+
+# Reference dataset: built without passing any `image_transforms`
+dataset = LeRobotDataset(dataset_repo_id, episodes=[0])
+# This is equivalent to `dataset = LeRobotDataset(dataset_repo_id, image_transforms=None)`
+
+# Index of the first observation in the first episode
+first_idx = dataset.episode_data_index["from"][0].item()
+# Frame of the first camera at that index
+frame = dataset[first_idx][dataset.meta.camera_keys[0]]
+
+# Image transformations, chained with `v2.Compose`
+transforms = v2.Compose(
+    [
+        v2.ColorJitter(brightness=(0.5, 1.5)),
+        v2.ColorJitter(contrast=(0.5, 1.5)),
+        v2.ColorJitter(hue=(-0.1, 0.1)),
+        v2.RandomAdjustSharpness(sharpness_factor=2, p=1),
+    ]
+)
+
+# Same episode again, this time with the transformations attached to the dataset
+transformed_dataset = LeRobotDataset(dataset_repo_id, episodes=[0], image_transforms=transforms)
+
+# Same frame index and first camera, read from the transformed dataset
+transformed_frame = transformed_dataset[first_idx][transformed_dataset.meta.camera_keys[0]]
+
+# Output directory and the two image paths written below
+output_dir = Path("outputs/image_transforms")
+output_dir.mkdir(parents=True, exist_ok=True)
+original_path = output_dir / "original_frame.png"
+transformed_path = output_dir / "transformed_frame.png"
+
+# Save the original frame
+to_pil = ToPILImage()
+to_pil(frame).save(original_path, quality=100)
+print(f"Original frame saved to {original_path}.")
+
+# Save the transformed frame
+to_pil(transformed_frame).save(transformed_path, quality=100)
+print(f"Transformed frame saved to {transformed_path}.")
--- a/examples/advanced/1_train_act_pusht/act_pusht.yaml
+++ b/examples/advanced/1_train_act_pusht/act_pusht.yaml
@@ -1,87 +0,0 @@
-# @package _global_
-
-# Change the seed to match what PushT eval uses
-# (to avoid evaluating on seeds used for generating the training data).
-seed: 100000
-# Change the dataset repository to the PushT one.
-dataset_repo_id: lerobot/pusht
-
-override_dataset_stats:
-  observation.image:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-training:
-  offline_steps: 80000
-  online_steps: 0
-  eval_freq: 10000
-  save_freq: 100000
-  log_freq: 250
-  save_model: true
-
-  batch_size: 8
-  lr: 1e-5
-  lr_backbone: 1e-5
-  weight_decay: 1e-4
-  grad_clip_norm: 10
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-# See `configuration_act.py` for more details.
-policy:
-  name: act
-
-  # Input / output structure.
-  n_obs_steps: 1
-  chunk_size: 100 # chunk_size
-  n_action_steps: 100
-
-  input_shapes:
-    observation.image: [3, 96, 96]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.image: mean_std
-    # Use min_max normalization just because it's more standard.
-    observation.state: min_max
-  output_normalization_modes:
-    # Use min_max normalization just because it's more standard.
-    action: min_max
-
-  # Architecture.
-  # Vision backbone.
-  vision_backbone: resnet18
-  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
-  replace_final_stride_with_dilation: false
-  # Transformer layers.
-  pre_norm: false
-  dim_model: 512
-  n_heads: 8
-  dim_feedforward: 3200
-  feedforward_activation: relu
-  n_encoder_layers: 4
-    # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
-  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
-  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
-  n_decoder_layers: 1
-  # VAE.
-  use_vae: true
-  latent_dim: 32
-  n_vae_encoder_layers: 4
-
-  # Inference.
-  temporal_ensemble_momentum: null
-
-  # Training and loss computation.
-  dropout: 0.1
-  kl_weight: 10.0
--- a/examples/advanced/1_train_act_pusht/train_act_pusht.md
+++ b/examples/advanced/1_train_act_pusht/train_act_pusht.md
@@ -1,70 +0,0 @@
-In this tutorial we will learn how to adapt a policy configuration to be compatible with a new environment and dataset. As a concrete example, we will adapt the default configuration for ACT to be compatible with the PushT environment and dataset.
-
-If you haven't already read our tutorial on the [training script and configuration tooling](../4_train_policy_with_script.md) please do so prior to tackling this tutorial.
-
-Let's get started!
-
-Suppose we want to train ACT for PushT. Well, there are aspects of the ACT configuration that are specific to the ALOHA environments, and these happen to be incompatible with PushT. Therefore, trying to run the following will almost certainly raise an exception of sorts (eg: feature dimension mismatch):
-
-```bash
-python lerobot/scripts/train.py policy=act env=pusht dataset_repo_id=lerobot/pusht
-```
-
-We need to adapt the parameters of the ACT policy configuration to the PushT environment. The most important ones are the image keys.
-
-ALOHA's datasets and environments typically use a variable number of cameras. In `lerobot/configs/policy/act.yaml` you may notice two relevant sections. Here we show you the minimal diff needed to adjust to PushT:
-
-```diff
-override_dataset_stats:
-  observation.images.top:
-+  observation.image:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-policy:
-  input_shapes:
-    observation.images.top: [3, 480, 640]
-+    observation.image: [3, 96, 96]
-    observation.state: ["${env.state_dim}"]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  input_normalization_modes:
-    observation.images.top: mean_std
-+    observation.image: mean_std
-     observation.state: min_max
-  output_normalization_modes:
-    action: min_max
-```
-
-Here we've accounted for the following:
- PushT uses "observation.image" for its image key.
- PushT provides smaller images.
-
-_Side note: technically we could override these via the CLI, but with many changes it gets a bit messy, and we also have a bit of a challenge in that we're using `.` in our observation keys which is treated by Hydra as a hierarchical separator_.
-
-For your convenience, we provide [`act_pusht.yaml`](./act_pusht.yaml) in this directory. It contains the diff above, plus some other (optional) ones that are explained within. Please copy it into `lerobot/configs/policy` with:
-
-```bash
-cp examples/advanced/1_train_act_pusht/act_pusht.yaml lerobot/configs/policy/act_pusht.yaml
-```
-
-(remember from a [previous tutorial](../4_train_policy_with_script.md) that Hydra will look in the `lerobot/configs` directory). Now try running the following.
-
-<!-- Note to contributor: are you changing this command? Note that it's tested in `Makefile`, so change it there too! -->
-```bash
-python lerobot/scripts/train.py policy=act_pusht env=pusht
-```
-
-Notice that this is much the same as the command that failed at the start of the tutorial, only:
- Now we are using `policy=act_pusht` to point to our new configuration file.
- We can drop `dataset_repo_id=lerobot/pusht` as the change is incorporated in our new configuration file.
-
-Hurrah! You're now training ACT for the PushT environment.
-
---
-
-The bottom line of this tutorial is that when training policies for different environments and datasets you will need to understand what parts of the policy configuration are specific to those and make changes accordingly.
-
-Happy coding! 🤗
--- a/examples/advanced/2_calculate_validation_loss.py
+++ b/examples/advanced/2_calculate_validation_loss.py
@@ -1,3 +1,17 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """This script demonstrates how to slice a dataset and calculate the loss on a subset of the data.

 This technique can be useful for debugging and testing purposes, as well as identifying whether a policy
@@ -9,82 +23,82 @@ on the target environment, whether that be in simulation or the real world.
 """

 import math
-from pathlib import Path

 import torch
-from huggingface_hub import snapshot_download

-from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
 from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy

-device = torch.device("cuda")

-# Download the diffusion policy for pusht environment
-pretrained_policy_path = Path(snapshot_download("lerobot/diffusion_pusht"))
-# OR uncomment the following to evaluate a policy from the local outputs/train folder.
-# pretrained_policy_path = Path("outputs/train/example_pusht_diffusion")
+def main():
+    device = torch.device("cuda")

-policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
-policy.eval()
-policy.to(device)
+    # Download the diffusion policy for pusht environment
+    pretrained_policy_path = "lerobot/diffusion_pusht"
+    # OR uncomment the following to evaluate a policy from the local outputs/train folder.
+    # pretrained_policy_path = "outputs/train/example_pusht_diffusion"

-# Set up the dataset.
-delta_timestamps = {
-    # Load the previous image and state at -0.1 seconds before current frame,
-    # then load current image and state corresponding to 0.0 second.
-    "observation.image": [-0.1, 0.0],
-    "observation.state": [-0.1, 0.0],
-    # Load the previous action (-0.1), the next action to be executed (0.0),
-    # and 14 future actions with a 0.1 seconds spacing. All these actions will be
-    # used to calculate the loss.
-    "action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
-}
+    policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
+    policy.eval()
+    policy.to(device)

-# Load the last 10% of episodes of the dataset as a validation set.
-# - Load full dataset
-full_dataset = LeRobotDataset("lerobot/pusht", split="train")
-# - Calculate train and val subsets
-num_train_episodes = math.floor(full_dataset.num_episodes * 90 / 100)
-num_val_episodes = full_dataset.num_episodes - num_train_episodes
-print(f"Number of episodes in full dataset: {full_dataset.num_episodes}")
-print(f"Number of episodes in training dataset (90% subset): {num_train_episodes}")
-print(f"Number of episodes in validation dataset (10% subset): {num_val_episodes}")
-# - Get first frame index of the validation set
-first_val_frame_index = full_dataset.episode_data_index["from"][num_train_episodes].item()
-# - Load frames subset belonging to validation set using the `split` argument.
-#   It utilizes the `datasets` library's syntax for slicing datasets.
-#   For more information on the Slice API, please see:
-#   https://huggingface.co/docs/datasets/v2.19.0/loading#slice-splits
-train_dataset = LeRobotDataset(
-    "lerobot/pusht", split=f"train[:{first_val_frame_index}]", delta_timestamps=delta_timestamps
-)
-val_dataset = LeRobotDataset(
-    "lerobot/pusht", split=f"train[{first_val_frame_index}:]", delta_timestamps=delta_timestamps
-)
-print(f"Number of frames in training dataset (90% subset): {len(train_dataset)}")
-print(f"Number of frames in validation dataset (10% subset): {len(val_dataset)}")
+    # Set up the dataset.
+    delta_timestamps = {
+        # Load the previous image and state at -0.1 seconds before current frame,
+        # then load current image and state corresponding to 0.0 second.
+        "observation.image": [-0.1, 0.0],
+        "observation.state": [-0.1, 0.0],
+        # Load the previous action (-0.1), the next action to be executed (0.0),
+        # and 14 future actions with a 0.1 seconds spacing. All these actions will be
+        # used to calculate the loss.
+        "action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
+    }

-# Create dataloader for evaluation.
-val_dataloader = torch.utils.data.DataLoader(
-    val_dataset,
-    num_workers=4,
-    batch_size=64,
-    shuffle=False,
-    pin_memory=device != torch.device("cpu"),
-    drop_last=False,
-)
+    # Load the last 10% of episodes of the dataset as a validation set.
+    # - Load dataset metadata
+    dataset_metadata = LeRobotDatasetMetadata("lerobot/pusht")
+    # - Calculate train and val episodes
+    total_episodes = dataset_metadata.total_episodes
+    episodes = list(range(dataset_metadata.total_episodes))
+    num_train_episodes = math.floor(total_episodes * 90 / 100)
+    train_episodes = episodes[:num_train_episodes]
+    val_episodes = episodes[num_train_episodes:]
+    print(f"Number of episodes in full dataset: {total_episodes}")
+    print(f"Number of episodes in training dataset (90% subset): {len(train_episodes)}")
+    print(f"Number of episodes in validation dataset (10% subset): {len(val_episodes)}")
+    # - Load train and val datasets
+    train_dataset = LeRobotDataset(
+        "lerobot/pusht", episodes=train_episodes, delta_timestamps=delta_timestamps
+    )
+    val_dataset = LeRobotDataset("lerobot/pusht", episodes=val_episodes, delta_timestamps=delta_timestamps)
+    print(f"Number of frames in training dataset (90% subset): {len(train_dataset)}")
+    print(f"Number of frames in validation dataset (10% subset): {len(val_dataset)}")

-# Run validation loop.
-loss_cumsum = 0
-n_examples_evaluated = 0
-for batch in val_dataloader:
-    batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
-    output_dict = policy.forward(batch)
+    # Create dataloader for evaluation.
+    val_dataloader = torch.utils.data.DataLoader(
+        val_dataset,
+        num_workers=4,
+        batch_size=64,
+        shuffle=False,
+        pin_memory=device != torch.device("cpu"),
+        drop_last=False,
+    )

-    loss_cumsum += output_dict["loss"].item()
-    n_examples_evaluated += batch["index"].shape[0]
+    # Run validation loop.
+    loss_cumsum = 0
+    n_examples_evaluated = 0
+    for batch in val_dataloader:
+        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
+        loss, _ = policy.forward(batch)

-# Calculate the average loss over the validation set.
-average_loss = loss_cumsum / n_examples_evaluated
+        loss_cumsum += loss.item()
+        n_examples_evaluated += batch["index"].shape[0]

-print(f"Average loss on validation set: {average_loss:.4f}")
+    # Calculate the average loss over the validation set.
+    average_loss = loss_cumsum / n_examples_evaluated
+
+    print(f"Average loss on validation set: {average_loss:.4f}")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/lekiwi/evaluate.py
+++ b/examples/lekiwi/evaluate.py
@@ -0,0 +1,38 @@
+import torch
+
+from lerobot.common.policies.act.modeling_act import ACTPolicy
+from lerobot.common.robots.lekiwi.config_lekiwi import LeKiwiClientConfig
+from lerobot.common.robots.lekiwi.lekiwi_client import LeKiwiClient
+from lerobot.common.utils.control_utils import predict_action
+from lerobot.common.utils.utils import get_safe_torch_device
+
+NB_CYCLES_CLIENT_CONNECTION = 1000
+
+robot_config = LeKiwiClientConfig(remote_ip="172.18.134.136", id="lekiwi")
+robot = LeKiwiClient(robot_config)
+
+robot.connect()
+
+policy = ACTPolicy.from_pretrained("pepijn223/act_lekiwi_circle")
+policy.reset()
+
+print("Running inference")
+try:
+    for _ in range(NB_CYCLES_CLIENT_CONNECTION):
+        obs = robot.get_observation()
+        # Convert tensor observations to NumPy arrays before passing them to `predict_action`.
+        for key, value in obs.items():
+            if isinstance(value, torch.Tensor):
+                obs[key] = value.numpy()
+
+        action_values = predict_action(
+            obs, policy, get_safe_torch_device(policy.config.device), policy.config.use_amp
+        )
+        action = {
+            key: action_values[i].item() if isinstance(action_values[i], torch.Tensor) else action_values[i]
+            for i, key in enumerate(robot.action_features)
+        }
+        robot.send_action(action)
+finally:
+    # Release the client connection even if a cycle raises or the run is interrupted.
+    robot.disconnect()
--- a/examples/lekiwi/record.py
+++ b/examples/lekiwi/record.py
@@ -0,0 +1,67 @@
+import time
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.utils import hw_to_dataset_features
+from lerobot.common.robots.lekiwi.config_lekiwi import LeKiwiClientConfig
+from lerobot.common.robots.lekiwi.lekiwi_client import LeKiwiClient
+from lerobot.common.teleoperators.keyboard import KeyboardTeleop, KeyboardTeleopConfig
+from lerobot.common.teleoperators.so100_leader import SO100Leader, SO100LeaderConfig
+
+NB_CYCLES_CLIENT_CONNECTION = 250
+
+leader_arm_config = SO100LeaderConfig(port="/dev/tty.usbmodem58760431551")
+leader_arm = SO100Leader(leader_arm_config)
+
+keyboard_config = KeyboardTeleopConfig()
+keyboard = KeyboardTeleop(keyboard_config)
+
+robot_config = LeKiwiClientConfig(remote_ip="172.18.134.136", id="lekiwi")
+robot = LeKiwiClient(robot_config)
+
+action_features = hw_to_dataset_features(robot.action_features, "action")
+obs_features = hw_to_dataset_features(robot.observation_features, "observation")
+dataset_features = {**action_features, **obs_features}
+
+dataset = LeRobotDataset.create(
+    repo_id="user/lekiwi" + str(int(time.time())),
+    fps=10,
+    features=dataset_features,
+    robot_type=robot.name,
+)
+
+leader_arm.connect()
+keyboard.connect()
+robot.connect()
+
+if not robot.is_connected or not leader_arm.is_connected or not keyboard.is_connected:
+    exit()
+
+print("Starting LeKiwi teleoperation")
+# The same task label is attached to every recorded frame.
+task = "Dummy Example Task Dataset"
+try:
+    for _ in range(NB_CYCLES_CLIENT_CONNECTION):
+        # Prefix the leader-arm keys with "arm_" before merging them with the base action.
+        arm_action = leader_arm.get_action()
+        arm_action = {f"arm_{k}": v for k, v in arm_action.items()}
+
+        keyboard_keys = keyboard.get_action()
+        base_action = robot._from_keyboard_to_base_action(keyboard_keys)
+
+        action = {**arm_action, **base_action} if len(base_action) > 0 else arm_action
+
+        # A frame is the action returned by `send_action` merged with the observation read after it.
+        action_sent = robot.send_action(action)
+        observation = robot.get_observation()
+        frame = {**action_sent, **observation}
+        dataset.add_frame(frame, task)
+finally:
+    # Release every device even if a cycle raises or the run is interrupted.
+    print("Disconnecting Teleop Devices and LeKiwi Client")
+    robot.disconnect()
+    leader_arm.disconnect()
+    keyboard.disconnect()
+
+print("Uploading dataset to the hub")
+dataset.save_episode()
+dataset.push_to_hub()
--- a/examples/lekiwi/replay.py
+++ b/examples/lekiwi/replay.py
@@ -0,0 +1,25 @@
+import time
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.robots.lekiwi.config_lekiwi import LeKiwiClientConfig
+from lerobot.common.robots.lekiwi.lekiwi_client import LeKiwiClient
+from lerobot.common.utils.robot_utils import busy_wait
+
+robot_config = LeKiwiClientConfig(remote_ip="172.18.134.136", id="lekiwi")
+robot = LeKiwiClient(robot_config)
+
+dataset = LeRobotDataset("pepijn223/lekiwi1749025613", episodes=[0])
+action_names = dataset.features["action"]["names"]
+
+robot.connect()
+print("Replaying episode…")
+try:
+    for action_array in dataset.hf_dataset["action"]:
+        t0 = time.perf_counter()
+        action = {name: float(action_array[i]) for i, name in enumerate(action_names)}
+        robot.send_action(action)
+        # Wait for whatever is left of the 1/fps frame period after sending the action.
+        busy_wait(max(1.0 / dataset.fps - (time.perf_counter() - t0), 0.0))
+finally:
+    print("Disconnecting LeKiwi Client")
+    robot.disconnect()
--- a/examples/lekiwi/teleoperate.py
+++ b/examples/lekiwi/teleoperate.py
@@ -0,0 +1,32 @@
+from lerobot.common.robots.lekiwi import LeKiwiClient, LeKiwiClientConfig
+from lerobot.common.teleoperators.keyboard.teleop_keyboard import KeyboardTeleop, KeyboardTeleopConfig
+from lerobot.common.teleoperators.so100_leader import SO100Leader, SO100LeaderConfig
+
+robot_config = LeKiwiClientConfig(remote_ip="172.18.134.136", id="my_lekiwi")
+teleop_arm_config = SO100LeaderConfig(port="/dev/tty.usbmodem58760431551", id="my_awesome_leader_arm")
+teleop_keyboard_config = KeyboardTeleopConfig(id="my_laptop_keyboard")
+
+robot = LeKiwiClient(robot_config)
+teleop_arm = SO100Leader(teleop_arm_config)
+teleop_keyboard = KeyboardTeleop(teleop_keyboard_config)
+robot.connect()
+teleop_arm.connect()
+teleop_keyboard.connect()
+
+# Run until interrupted (e.g. Ctrl-C); the `finally` clause then disconnects every device.
+try:
+    while True:
+        observation = robot.get_observation()
+
+        # Prefix the leader-arm keys with "arm_" before merging them with the base action.
+        arm_action = teleop_arm.get_action()
+        arm_action = {f"arm_{k}": v for k, v in arm_action.items()}
+
+        keyboard_keys = teleop_keyboard.get_action()
+        base_action = robot._from_keyboard_to_base_action(keyboard_keys)
+
+        robot.send_action(arm_action | base_action)
+finally:
+    robot.disconnect()
+    teleop_arm.disconnect()
+    teleop_keyboard.disconnect()
--- a/lerobot/init.py
+++ b/lerobot/init.py
@@ -27,6 +27,9 @@ Example:
        print(lerobot.available_real_world_datasets)
        print(lerobot.available_policies)
        print(lerobot.available_policies_per_env)
+        print(lerobot.available_robots)
+        print(lerobot.available_cameras)
+        print(lerobot.available_motors)
    ```

 When implementing a new dataset loadable with LeRobotDataset follow these steps:
@@ -45,6 +48,9 @@ import itertools

 from lerobot.__version__ import __version__  # noqa: F401

+# TODO(rcadene): Improve policies and envs. As of now, an item in `available_policies`
+# refers to a yaml file AND a modeling name. Same for `available_envs` which refers to
+# a yaml file AND an environment name. The difference should be more obvious.
 available_tasks_per_env = {
    "aloha": [
        "AlohaInsertion-v0",
@@ -66,6 +72,8 @@ available_datasets_per_env = {
        "lerobot/aloha_sim_transfer_cube_human_image",
        "lerobot/aloha_sim_transfer_cube_scripted_image",
    ],
+    # TODO(alexander-soare): Add "lerobot/pusht_keypoints". Right now we can't because this is too tightly
+    # coupled with tests.
    "pusht": ["lerobot/pusht", "lerobot/pusht_image"],
    "xarm": [
        "lerobot/xarm_lift_medium",
@@ -102,22 +110,94 @@ available_real_world_datasets = [
    "lerobot/aloha_static_vinh_cup_left",
    "lerobot/aloha_static_ziploc_slide",
    "lerobot/umi_cup_in_the_wild",
+    "lerobot/unitreeh1_fold_clothes",
+    "lerobot/unitreeh1_rearrange_objects",
+    "lerobot/unitreeh1_two_robot_greeting",
+    "lerobot/unitreeh1_warehouse",
+    "lerobot/nyu_rot_dataset",
+    "lerobot/utokyo_saytap",
+    "lerobot/imperialcollege_sawyer_wrist_cam",
+    "lerobot/utokyo_xarm_bimanual",
+    "lerobot/tokyo_u_lsmo",
+    "lerobot/utokyo_pr2_opening_fridge",
+    "lerobot/cmu_franka_exploration_dataset",
+    "lerobot/cmu_stretch",
+    "lerobot/asu_table_top",
+    "lerobot/utokyo_pr2_tabletop_manipulation",
+    "lerobot/utokyo_xarm_pick_and_place",
+    "lerobot/ucsd_kitchen_dataset",
+    "lerobot/austin_buds_dataset",
+    "lerobot/dlr_sara_grid_clamp",
+    "lerobot/conq_hose_manipulation",
+    "lerobot/columbia_cairlab_pusht_real",
+    "lerobot/dlr_sara_pour",
+    "lerobot/dlr_edan_shared_control",
+    "lerobot/ucsd_pick_and_place_dataset",
+    "lerobot/berkeley_cable_routing",
+    "lerobot/nyu_franka_play_dataset",
+    "lerobot/austin_sirius_dataset",
+    "lerobot/cmu_play_fusion",
+    "lerobot/berkeley_gnm_sac_son",
+    "lerobot/nyu_door_opening_surprising_effectiveness",
+    "lerobot/berkeley_fanuc_manipulation",
+    "lerobot/jaco_play",
+    "lerobot/viola",
+    "lerobot/kaist_nonprehensile",
+    "lerobot/berkeley_mvp",
+    "lerobot/uiuc_d3field",
+    "lerobot/berkeley_gnm_recon",
+    "lerobot/austin_sailor_dataset",
+    "lerobot/utaustin_mutex",
+    "lerobot/roboturk",
+    "lerobot/stanford_hydra_dataset",
+    "lerobot/berkeley_autolab_ur5",
+    "lerobot/stanford_robocook",
+    "lerobot/toto",
+    "lerobot/fmb",
+    "lerobot/droid_100",
+    "lerobot/berkeley_rpt",
+    "lerobot/stanford_kuka_multimodal_dataset",
+    "lerobot/iamlab_cmu_pickup_insert",
+    "lerobot/taco_play",
+    "lerobot/berkeley_gnm_cory_hall",
+    "lerobot/usc_cloth_sim",
 ]

-available_datasets = list(
-    itertools.chain(*available_datasets_per_env.values(), available_real_world_datasets)
+available_datasets = sorted(
+    set(itertools.chain(*available_datasets_per_env.values(), available_real_world_datasets))
 )

-available_policies = [
-    "act",
-    "diffusion",
-    "tdmpc",
+# lists all available policies from `lerobot/common/policies`
+available_policies = ["act", "diffusion", "tdmpc", "vqbet"]
+
+# lists all available robots from `lerobot/common/robot_devices/robots`
+available_robots = [
+    "koch",
+    "koch_bimanual",
+    "aloha",
+    "so100",
+    "so101",
 ]

+# lists all available cameras from `lerobot/common/robot_devices/cameras`
+available_cameras = [
+    "opencv",
+    "intelrealsense",
+]
+
+# lists all available motors from `lerobot/common/robot_devices/motors`
+available_motors = [
+    "dynamixel",
+    "feetech",
+]
+
+# keys and values refer to yaml files
 available_policies_per_env = {
    "aloha": ["act"],
-    "pusht": ["diffusion"],
+    "pusht": ["diffusion", "vqbet"],
    "xarm": ["tdmpc"],
+    "koch_real": ["act_koch_real"],
+    "aloha_real": ["act_aloha_real"],
 }

 env_task_pairs = [(env, task) for env, tasks in available_tasks_per_env.items() for task in tasks]
--- a/lerobot/calibrate.py
+++ b/lerobot/calibrate.py
@@ -0,0 +1,84 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Helper to recalibrate your device (robot or teleoperator).
+
+Example:
+
+```shell
+python -m lerobot.calibrate \
+    --teleop.type=so100_leader \
+    --teleop.port=/dev/tty.usbmodem58760431551 \
+    --teleop.id=blue
+```
+"""
+
+import logging
+from dataclasses import asdict, dataclass
+from pprint import pformat
+
+import draccus
+
+from lerobot.common.cameras.opencv.configuration_opencv import OpenCVCameraConfig  # noqa: F401
+from lerobot.common.cameras.realsense.configuration_realsense import RealSenseCameraConfig  # noqa: F401
+from lerobot.common.robots import (  # noqa: F401
+    Robot,
+    RobotConfig,
+    koch_follower,
+    lekiwi,
+    make_robot_from_config,
+    so100_follower,
+    so101_follower,
+)
+from lerobot.common.teleoperators import (  # noqa: F401
+    Teleoperator,
+    TeleoperatorConfig,
+    koch_leader,
+    make_teleoperator_from_config,
+    so100_leader,
+    so101_leader,
+)
+from lerobot.common.utils.utils import init_logging
+
+
+@dataclass
+class CalibrateConfig:
+    """Device to calibrate: exactly one of `teleop` or `robot` must be set."""
+    teleop: TeleoperatorConfig | None = None
+    robot: RobotConfig | None = None
+
+    def __post_init__(self):
+        if bool(self.teleop) == bool(self.robot):  # both given, or neither given
+            raise ValueError("Choose either a teleop or a robot.")
+        self.device = self.robot if self.robot else self.teleop  # whichever config was provided
+
+
+@draccus.wrap()
+def calibrate(cfg: CalibrateConfig):
+    """Build the configured device, run its calibration and always disconnect afterwards."""
+    init_logging()
+    logging.info(pformat(asdict(cfg)))
+    # `CalibrateConfig.__post_init__` sets `cfg.device` to the robot or the teleoperator config.
+    is_robot = isinstance(cfg.device, RobotConfig)
+    device = make_robot_from_config(cfg.device) if is_robot else make_teleoperator_from_config(cfg.device)
+    device.connect(calibrate=False)
+    try:
+        device.calibrate()
+    finally:
+        device.disconnect()
+
+
+if __name__ == "__main__":
+    calibrate()
--- a/lerobot/common/cameras/init.py
+++ b/lerobot/common/cameras/init.py
@@ -0,0 +1,17 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .camera import Camera
+from .configs import CameraConfig, ColorMode, Cv2Rotation
+from .utils import make_cameras_from_configs
--- a/lerobot/common/cameras/camera.py
+++ b/lerobot/common/cameras/camera.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+from typing import Any, Dict, List
+
+import numpy as np
+
+from .configs import CameraConfig, ColorMode
+
+
+class Camera(abc.ABC):
+    """Abstract interface implemented by every camera backend.
+
+    The base class only stores the basic capture properties taken from the
+    configuration. Subclasses must implement all abstract members:
+    - Connection/disconnection
+    - Frame capture (sync/async)
+    - Camera discovery
+
+    Attributes:
+        fps (int | None): Configured frames per second
+        width (int | None): Frame width in pixels
+        height (int | None): Frame height in pixels
+
+    Example:
+        class MyCamera(Camera):
+            def __init__(self, config): ...
+            @property
+            def is_connected(self) -> bool: ...
+            def connect(self, warmup=True): ...
+            # Plus other required methods
+    """
+
+    def __init__(self, config: CameraConfig):
+        """Store the FPS and resolution requested by the configuration.
+
+        Args:
+            config: Camera configuration containing FPS and resolution
+                (each of which may be None).
+        """
+        self.fps: int | None = config.fps
+        self.width: int | None = config.width
+        self.height: int | None = config.height
+
+    @property
+    @abc.abstractmethod
+    def is_connected(self) -> bool:
+        """Tell whether the camera is currently connected.
+
+        Returns:
+            bool: True if the camera is connected and ready to capture frames,
+                  False otherwise.
+        """
+
+    @staticmethod
+    @abc.abstractmethod
+    def find_cameras() -> List[Dict[str, Any]]:
+        """Detect the cameras of this backend available on the system.
+
+        Returns:
+            List[Dict[str, Any]]: One dictionary per detected camera, holding
+            information about that camera.
+        """
+
+    @abc.abstractmethod
+    def connect(self, warmup: bool = True) -> None:
+        """Open the connection to the camera.
+
+        Args:
+            warmup: If True (default), captures a warmup frame before returning. Useful
+                   for cameras that require time to adjust capture settings.
+                   If False, skips the warmup frame.
+        """
+
+    @abc.abstractmethod
+    def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
+        """Capture one frame and return it.
+
+        Args:
+            color_mode: Desired color mode for the output frame. If None,
+                        uses the camera's default color mode.
+
+        Returns:
+            np.ndarray: Captured frame as a numpy array.
+        """
+
+    @abc.abstractmethod
+    def async_read(self, timeout_ms: float = ...) -> np.ndarray:
+        """Return one frame captured asynchronously.
+
+        Args:
+            timeout_ms: Maximum time to wait for a frame in milliseconds.
+                        Defaults to implementation-specific timeout.
+
+        Returns:
+            np.ndarray: Captured frame as a numpy array.
+        """
+
+    @abc.abstractmethod
+    def disconnect(self) -> None:
+        """Close the connection to the camera and release its resources."""
--- a/lerobot/common/cameras/configs.py
+++ b/lerobot/common/cameras/configs.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+from dataclasses import dataclass
+from enum import Enum
+
+import draccus
+
+
+class ColorMode(str, Enum):
+    """Channel ordering used for color frames."""
+
+    RGB = "rgb"
+    BGR = "bgr"
+
+
+class Cv2Rotation(int, Enum):
+    """Supported image rotations, expressed in degrees."""
+
+    NO_ROTATION = 0
+    ROTATE_90 = 90
+    ROTATE_180 = 180
+    # The 270 degree rotation is stored as -90.
+    ROTATE_270 = -90
+
+
+@dataclass(kw_only=True)
+class CameraConfig(draccus.ChoiceRegistry, abc.ABC):
+    """Base configuration shared by camera backends.
+
+    All fields are keyword-only and default to None.
+    """
+
+    # Requested frames per second.
+    fps: int | None = None
+    # Requested frame width in pixels.
+    width: int | None = None
+    # Requested frame height in pixels.
+    height: int | None = None
+
+    @property
+    def type(self) -> str:
+        """Return the draccus choice name associated with this config's class."""
+        return self.get_choice_name(self.__class__)
--- a/lerobot/common/cameras/opencv/__init__.py
+++ b/lerobot/common/cameras/opencv/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .camera_opencv import OpenCVCamera
+from .configuration_opencv import OpenCVCameraConfig
--- a/lerobot/common/cameras/opencv/camera_opencv.py
+++ b/lerobot/common/cameras/opencv/camera_opencv.py
@@ -0,0 +1,479 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Provides the OpenCVCamera class for capturing frames from cameras using OpenCV.
+"""
+
+import logging
+import math
+import platform
+import time
+from pathlib import Path
+from threading import Event, Lock, Thread
+from typing import Any, Dict, List
+
+import cv2
+import numpy as np
+
+from lerobot.common.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+
+from ..camera import Camera
+from ..utils import get_cv2_backend, get_cv2_rotation
+from .configuration_opencv import ColorMode, OpenCVCameraConfig
+
+# NOTE(Steven): The maximum opencv device index depends on your operating system. For instance,
+# if you have 3 cameras, they should be associated to index 0, 1, and 2. This is the case
+# on MacOS. However, on Ubuntu, the indices are different like 6, 16, 23.
+# When you change the USB port or reboot the computer, the operating system might
+# treat the same cameras as new devices. Thus we select a higher bound to search indices.
+MAX_OPENCV_INDEX = 60
+
+logger = logging.getLogger(__name__)
+
+
+class OpenCVCamera(Camera):
+    """
+    Manages camera interactions using OpenCV for efficient frame recording.
+
+    This class provides a high-level interface to connect to, configure, and read
+    frames from cameras compatible with OpenCV's VideoCapture. It supports both
+    synchronous and asynchronous frame reading.
+
+    An OpenCVCamera instance requires a camera index (e.g., 0) or a device path
+    (e.g., '/dev/video0' on Linux). Camera indices can be unstable across reboots
+    or port changes, especially on Linux. Use the provided utility script to find
+    available camera indices or paths:
+    ```bash
+    python -m lerobot.find_cameras opencv
+    ```
+
+    The camera's default settings (FPS, resolution, color mode) are used unless
+    overridden in the configuration.
+
+    Example:
+        ```python
+        from lerobot.common.cameras.opencv import OpenCVCamera
+        from lerobot.common.cameras.opencv.configuration_opencv import OpenCVCameraConfig, ColorMode, Cv2Rotation
+
+        # Basic usage with camera index 0
+        config = OpenCVCameraConfig(index_or_path=0)
+        camera = OpenCVCamera(config)
+        camera.connect()
+
+        # Read 1 frame synchronously
+        color_image = camera.read()
+        print(color_image.shape)
+
+        # Read 1 frame asynchronously
+        async_image = camera.async_read()
+
+        # When done, properly disconnect the camera using
+        camera.disconnect()
+
+        # Example with custom settings
+        custom_config = OpenCVCameraConfig(
+            index_or_path='/dev/video0', # Or use an index
+            fps=30,
+            width=1280,
+            height=720,
+            color_mode=ColorMode.RGB,
+            rotation=Cv2Rotation.ROTATE_90
+        )
+        custom_camera = OpenCVCamera(custom_config)
+        # ... connect, read, disconnect ...
+        ```
+    """
+
+    def __init__(self, config: OpenCVCameraConfig):
+        """
+        Builds an unconnected OpenCVCamera from its configuration.
+
+        No device is opened here; `connect()` creates the VideoCapture.
+
+        Args:
+            config: The configuration settings for the camera.
+        """
+        super().__init__(config)
+
+        self.config = config
+        self.index_or_path = config.index_or_path
+
+        self.fps = config.fps
+        self.color_mode = config.color_mode
+        self.warmup_s = config.warmup_s
+
+        self.rotation: int | None = get_cv2_rotation(config.rotation)
+        self.backend: int = get_cv2_backend()
+
+        # Created by connect().
+        self.videocapture: cv2.VideoCapture | None = None
+
+        # State shared with the background reader used by async_read().
+        self.thread: Thread | None = None
+        self.stop_event: Event | None = None
+        self.frame_lock: Lock = Lock()
+        self.latest_frame: np.ndarray | None = None
+        self.new_frame_event: Event = Event()
+
+        if self.height and self.width:
+            quarter_turn = self.rotation in (cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE)
+            if quarter_turn:
+                # A 90 degree rotation swaps the axes, so capture at the transposed size.
+                self.capture_width, self.capture_height = self.height, self.width
+            else:
+                self.capture_width, self.capture_height = self.width, self.height
+
+    def __str__(self) -> str:
+        """Return the class name followed by the camera index or path."""
+        class_name = type(self).__name__
+        return f"{class_name}({self.index_or_path})"
+
+    @property
+    def is_connected(self) -> bool:
+        """True when a VideoCapture object exists and reports itself as opened."""
+        capture = self.videocapture
+        return isinstance(capture, cv2.VideoCapture) and capture.isOpened()
+
+    def connect(self, warmup: bool = True):
+        """
+        Connects to the OpenCV camera specified in the configuration.
+
+        Initializes the OpenCV VideoCapture object, sets desired camera properties
+        (FPS, width, height), and optionally reads frames for a warmup period.
+
+        Args:
+            warmup: If True (default), repeatedly reads frames for `warmup_s`
+                seconds before returning. If False, returns right after configuration.
+
+        Raises:
+            DeviceAlreadyConnectedError: If the camera is already connected.
+            ConnectionError: If the specified camera index/path is not found or the camera is found but fails to open.
+            RuntimeError: If the camera opens but fails to apply requested FPS/resolution settings,
+                or if a warmup read fails.
+        """
+        if self.is_connected:
+            raise DeviceAlreadyConnectedError(f"{self} is already connected.")
+
+        # Use 1 thread for OpenCV operations to avoid potential conflicts or
+        # blocking in multi-threaded applications, especially during data collection.
+        cv2.setNumThreads(1)
+
+        self.videocapture = cv2.VideoCapture(self.index_or_path, self.backend)
+
+        if not self.videocapture.isOpened():
+            self.videocapture.release()
+            self.videocapture = None
+            # The trailing space keeps the two sentences from running together.
+            raise ConnectionError(
+                f"Failed to open {self}. "
+                "Run `python -m lerobot.find_cameras opencv` to find available cameras."
+            )
+
+        self._configure_capture_settings()
+
+        if warmup:
+            start_time = time.time()
+            while time.time() - start_time < self.warmup_s:
+                self.read()
+                time.sleep(0.1)
+
+        logger.info(f"{self} connected.")
+
+    def _configure_capture_settings(self) -> None:
+        """
+        Applies the configured FPS and resolution to the connected camera.
+
+        A configured FPS is set on the device and verified; when `self.fps` is
+        None the value reported by the camera is adopted instead. Likewise, when
+        both `self.width` and `self.height` are configured they are set and
+        verified; if either is None, the resolution reported by the camera is
+        adopted (with width and height swapped for 90 degree rotations).
+
+        Raises:
+            RuntimeError: If the camera fails to set any of the specified properties
+                          to the requested value.
+            DeviceNotConnectedError: If the camera is not connected when attempting
+                                     to configure settings.
+        """
+        if not self.is_connected:
+            raise DeviceNotConnectedError(f"Cannot configure settings for {self} as it is not connected.")
+
+        if self.fps is not None:
+            self._validate_fps()
+        else:
+            self.fps = self.videocapture.get(cv2.CAP_PROP_FPS)
+
+        default_width = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_WIDTH)))
+        default_height = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
+
+        if self.width is not None and self.height is not None:
+            self._validate_width_and_height()
+            return
+
+        # The capture size is always the size reported by the device; only the
+        # output size is transposed when frames get rotated by 90 degrees.
+        self.capture_width, self.capture_height = default_width, default_height
+        if self.rotation in (cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE):
+            self.width, self.height = default_height, default_width
+        else:
+            self.width, self.height = default_width, default_height
+
+    def _validate_fps(self) -> None:
+        """Sets the requested FPS on the camera and checks that it was applied.
+
+        Raises:
+            RuntimeError: If setting the property fails or the camera reports a
+                different FPS afterwards.
+        """
+        applied = self.videocapture.set(cv2.CAP_PROP_FPS, float(self.fps))
+        actual_fps = self.videocapture.get(cv2.CAP_PROP_FPS)
+
+        # Floats are compared with a relative tolerance rather than exact equality.
+        if applied and math.isclose(self.fps, actual_fps, rel_tol=1e-3):
+            return
+
+        raise RuntimeError(f"{self} failed to set fps={self.fps} ({actual_fps=}).")
+
+    def _validate_width_and_height(self) -> None:
+        """Sets the capture width and height on the camera and checks they were applied.
+
+        Raises:
+            RuntimeError: If setting either property fails or the camera reports
+                a different value afterwards.
+        """
+
+        success = self.videocapture.set(cv2.CAP_PROP_FRAME_WIDTH, float(self.capture_width))
+        actual_width = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_WIDTH)))
+        if not success or self.capture_width != actual_width:
+            raise RuntimeError(f"{self} failed to set capture_width={self.capture_width} ({actual_width=}).")
+
+        success = self.videocapture.set(cv2.CAP_PROP_FRAME_HEIGHT, float(self.capture_height))
+        actual_height = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
+        if not success or self.capture_height != actual_height:
+            # Same `name=value` form as the width message above.
+            raise RuntimeError(
+                f"{self} failed to set capture_height={self.capture_height} ({actual_height=})."
+            )
+
+    @staticmethod
+    def find_cameras() -> List[Dict[str, Any]]:
+        """
+        Detects available OpenCV cameras connected to the system.
+
+        On Linux, it scans '/dev/video*' paths. On other systems (like macOS, Windows),
+        it checks indices from 0 up to `MAX_OPENCV_INDEX` (exclusive).
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries,
+            where each dictionary contains 'name', 'type', 'id' (port index or path),
+            'backend_api', and 'default_stream_profile' with the default profile
+            properties (format, width, height, fps).
+        """
+        found_cameras_info = []
+
+        if platform.system() == "Linux":
+            possible_paths = sorted(Path("/dev").glob("video*"), key=lambda p: p.name)
+            targets_to_scan = [str(p) for p in possible_paths]
+        else:
+            targets_to_scan = list(range(MAX_OPENCV_INDEX))
+
+        for target in targets_to_scan:
+            camera = cv2.VideoCapture(target)
+            try:
+                if not camera.isOpened():
+                    continue
+
+                default_width = int(camera.get(cv2.CAP_PROP_FRAME_WIDTH))
+                default_height = int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT))
+                default_fps = camera.get(cv2.CAP_PROP_FPS)
+                default_format = camera.get(cv2.CAP_PROP_FORMAT)
+                found_cameras_info.append(
+                    {
+                        "name": f"OpenCV Camera @ {target}",
+                        "type": "OpenCV",
+                        "id": target,
+                        "backend_api": camera.getBackendName(),
+                        "default_stream_profile": {
+                            "format": default_format,
+                            "width": default_width,
+                            "height": default_height,
+                            "fps": default_fps,
+                        },
+                    }
+                )
+            finally:
+                # Release every probed handle, including ones that failed to open
+                # or raised while being queried.
+                camera.release()
+
+        return found_cameras_info
+
+    def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
+        """
+        Grabs one frame from the camera, blocking until OpenCV returns it.
+
+        Args:
+            color_mode (Optional[ColorMode]): If specified, overrides the default
+                color mode (`self.color_mode`) for this read operation (e.g.,
+                request RGB even if default is BGR).
+
+        Returns:
+            np.ndarray: The captured frame after post-processing (color mode
+                       conversion and configured rotation).
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected.
+            RuntimeError: If reading the frame from the camera fails or if the
+                          received frame dimensions don't match expectations before rotation.
+            ValueError: If an invalid `color_mode` is requested.
+        """
+        if not self.is_connected:
+            raise DeviceNotConnectedError(f"{self} is not connected.")
+
+        t_start = time.perf_counter()
+
+        grabbed, raw_frame = self.videocapture.read()
+        if raw_frame is None or not grabbed:
+            raise RuntimeError(f"{self} read failed (status={grabbed}).")
+
+        result = self._postprocess_image(raw_frame, color_mode)
+
+        elapsed_ms = (time.perf_counter() - t_start) * 1e3
+        logger.debug(f"{self} read took: {elapsed_ms:.1f}ms")
+
+        return result
+
+    def _postprocess_image(self, image: np.ndarray, color_mode: ColorMode | None = None) -> np.ndarray:
+        """
+        Applies dimension validation, color conversion, and rotation to a raw frame.
+
+        Args:
+            image (np.ndarray): The raw image frame (expected BGR format from OpenCV).
+            color_mode (Optional[ColorMode]): The target color mode (RGB or BGR). If None,
+                                             uses the instance's default `self.color_mode`.
+
+        Returns:
+            np.ndarray: The processed image frame.
+
+        Raises:
+            ValueError: If the requested `color_mode` is invalid.
+            RuntimeError: If the raw frame dimensions do not match the expected
+                          `capture_width` and `capture_height`, or if the frame
+                          does not have 3 channels.
+        """
+        requested_color_mode = self.color_mode if color_mode is None else color_mode
+
+        if requested_color_mode not in (ColorMode.RGB, ColorMode.BGR):
+            raise ValueError(
+                f"Invalid color mode '{requested_color_mode}'. Expected {ColorMode.RGB} or {ColorMode.BGR}."
+            )
+
+        h, w, c = image.shape
+
+        if h != self.capture_height or w != self.capture_width:
+            raise RuntimeError(
+                f"{self} frame width={w} or height={h} do not match configured width={self.capture_width} or height={self.capture_height}."
+            )
+
+        if c != 3:
+            raise RuntimeError(f"{self} frame channels={c} do not match expected 3 channels (RGB/BGR).")
+
+        processed_image = image
+        if requested_color_mode == ColorMode.RGB:
+            processed_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        # Include ROTATE_180 so a configured half-turn is applied too; previously
+        # only the two 90 degree rotations were honored.
+        # NOTE(review): assumes get_cv2_rotation maps a 180 degree setting to
+        # cv2.ROTATE_180 - confirm against ..utils.
+        if self.rotation in (cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, cv2.ROTATE_180):
+            processed_image = cv2.rotate(processed_image, self.rotation)
+
+        return processed_image
+
+    def _read_loop(self):
+        """
+        Internal loop run by the background thread for asynchronous reading.
+
+        Runs until `self.stop_event` is set. On each iteration:
+        1. Reads a color frame with `read()`
+        2. Stores result in latest_frame while holding frame_lock
+        3. Sets new_frame_event to notify listeners
+
+        Stops on DeviceNotConnectedError, logs other errors and continues.
+        """
+        # `self.stop_event` is looked up again on every iteration.
+        while not self.stop_event.is_set():
+            try:
+                color_image = self.read()
+
+                with self.frame_lock:
+                    self.latest_frame = color_image
+                self.new_frame_event.set()
+
+            except DeviceNotConnectedError:
+                # The camera is gone; end the thread.
+                break
+            except Exception as e:
+                # Any other failure is logged and the loop retries immediately.
+                logger.warning(f"Error reading frame in background thread for {self}: {e}")
+    def _start_read_thread(self) -> None:
+        """Starts the background read thread, first asking any previous one to stop.
+
+        A previous thread is signalled through its stop event and given a short
+        time to exit before a fresh stop event and thread are created.
+        """
+        # Signal the old loop BEFORE joining it: joining a thread that was never
+        # asked to stop can only time out, leaving it running next to the new one.
+        if self.stop_event is not None:
+            self.stop_event.set()
+        if self.thread is not None and self.thread.is_alive():
+            self.thread.join(timeout=0.1)
+
+        self.stop_event = Event()
+        self.thread = Thread(target=self._read_loop, args=(), name=f"{self}_read_loop")
+        self.thread.daemon = True
+        self.thread.start()
+
+    def _stop_read_thread(self) -> None:
+        """Asks the background read thread to stop and waits up to 2 seconds for it.
+
+        The thread and stop-event references are cleared afterwards.
+        """
+        stop_signal = self.stop_event
+        if stop_signal is not None:
+            stop_signal.set()
+
+        worker = self.thread
+        if worker is not None and worker.is_alive():
+            worker.join(timeout=2.0)
+
+        self.thread = None
+        self.stop_event = None
+
+    def async_read(self, timeout_ms: float = 200) -> np.ndarray:
+        """
+        Returns the most recent frame produced by the background read thread.
+
+        The thread is started on demand if it is not running. The call does not
+        read from the camera itself, but may wait up to `timeout_ms` for the
+        thread to signal that a new frame is available.
+
+        Args:
+            timeout_ms (float): Maximum time in milliseconds to wait for a frame
+                to become available. Defaults to 200ms (0.2 seconds).
+
+        Returns:
+            np.ndarray: The latest captured frame as a NumPy array, processed
+                       according to configuration.
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected.
+            TimeoutError: If no frame becomes available within the specified timeout.
+            RuntimeError: If the event was set but no frame was stored.
+        """
+        if not self.is_connected:
+            raise DeviceNotConnectedError(f"{self} is not connected.")
+
+        if self.thread is None or not self.thread.is_alive():
+            self._start_read_thread()
+
+        timeout_s = timeout_ms / 1000.0
+        frame_ready = self.new_frame_event.wait(timeout=timeout_s)
+        if not frame_ready:
+            thread_alive = self.thread is not None and self.thread.is_alive()
+            raise TimeoutError(
+                f"Timed out waiting for frame from camera {self} after {timeout_ms} ms. "
+                f"Read thread alive: {thread_alive}."
+            )
+
+        # Take the frame and reset the event under the same lock the writer uses.
+        with self.frame_lock:
+            latest = self.latest_frame
+            self.new_frame_event.clear()
+
+        if latest is None:
+            raise RuntimeError(f"Internal error: Event set but no frame available for {self}.")
+
+        return latest
+
+    def disconnect(self):
+        """
+        Stops the background read thread (if any) and releases the VideoCapture.
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected and no read
+                thread exists.
+        """
+        has_reader = self.thread is not None
+        if not (self.is_connected or has_reader):
+            raise DeviceNotConnectedError(f"{self} not connected.")
+
+        if has_reader:
+            self._stop_read_thread()
+
+        capture = self.videocapture
+        if capture is not None:
+            capture.release()
+            self.videocapture = None
+
+        logger.info(f"{self} disconnected.")
--- a/lerobot/common/cameras/opencv/configuration_opencv.py
+++ b/lerobot/common/cameras/opencv/configuration_opencv.py
@@ -0,0 +1,73 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from ..configs import CameraConfig, ColorMode, Cv2Rotation
+
+
+@CameraConfig.register_subclass("opencv")
+@dataclass
+class OpenCVCameraConfig(CameraConfig):
+    """Configuration class for OpenCV-based camera devices or video files.
+
+    This class provides configuration options for cameras accessed through OpenCV,
+    supporting both physical camera devices and video files. It includes settings
+    for resolution, frame rate, color mode, and image rotation.
+
+    Example configurations:
+    ```python
+    # Basic configurations (fps, width and height are keyword-only)
+    OpenCVCameraConfig(index_or_path=0, fps=30, width=1280, height=720)   # 1280x720 @ 30FPS
+    OpenCVCameraConfig(index_or_path=Path("/dev/video4"), fps=60, width=640, height=480)   # 640x480 @ 60FPS
+
+    # Advanced configurations
+    OpenCVCameraConfig(index_or_path=0, fps=30, width=640, height=480, rotation=Cv2Rotation.ROTATE_90)     # With 90° rotation
+    ```
+
+    Attributes:
+        index_or_path: Either an integer representing the camera device index,
+                      or a Path object pointing to a device or video file.
+        fps: Requested frames per second for the color stream.
+        width: Requested frame width in pixels for the color stream.
+        height: Requested frame height in pixels for the color stream.
+        color_mode: Color mode for image output (RGB or BGR). Defaults to RGB.
+        rotation: Image rotation setting (0°, 90°, 180°, or 270°). Defaults to no rotation.
+        warmup_s: Time reading frames before returning from connect (in seconds)
+
+    Raises:
+        ValueError: If `color_mode` or `rotation` is not one of the supported values.
+
+    Note:
+        - Only 3-channel color output (RGB/BGR) is currently supported.
+    """
+
+    index_or_path: int | Path
+    color_mode: ColorMode = ColorMode.RGB
+    rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION
+    warmup_s: int = 1
+
+    def __post_init__(self):
+        if self.color_mode not in (ColorMode.RGB, ColorMode.BGR):
+            raise ValueError(
+                f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided."
+            )
+
+        # Single source of truth for both the check and the error message.
+        valid_rotations = (
+            Cv2Rotation.NO_ROTATION,
+            Cv2Rotation.ROTATE_90,
+            Cv2Rotation.ROTATE_180,
+            Cv2Rotation.ROTATE_270,
+        )
+        if self.rotation not in valid_rotations:
+            raise ValueError(
+                f"`rotation` is expected to be in {valid_rotations}, but {self.rotation} is provided."
+            )
--- a/lerobot/common/cameras/realsense/__init__.py
+++ b/lerobot/common/cameras/realsense/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .camera_realsense import RealSenseCamera
+from .configuration_realsense import RealSenseCameraConfig
--- a/lerobot/common/cameras/realsense/camera_realsense.py
+++ b/lerobot/common/cameras/realsense/camera_realsense.py
@@ -0,0 +1,556 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Provides the RealSenseCamera class for capturing frames from Intel RealSense cameras.
+"""
+
+import logging
+import time
+from threading import Event, Lock, Thread
+from typing import Any, Dict, List
+
+import cv2
+import numpy as np
+
+try:
+    import pyrealsense2 as rs
+except Exception as e:
+    logging.info(f"Could not import realsense: {e}")
+
+from lerobot.common.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+
+from ..camera import Camera
+from ..configs import ColorMode
+from ..utils import get_cv2_rotation
+from .configuration_realsense import RealSenseCameraConfig
+
+logger = logging.getLogger(__name__)
+
+
+class RealSenseCamera(Camera):
+    """
+    Manages interactions with Intel RealSense cameras for frame and depth recording.
+
+    This class provides an interface similar to `OpenCVCamera` but tailored for
+    RealSense devices, leveraging the `pyrealsense2` library. It uses the camera's
+    unique serial number for identification, offering more stability than device
+    indices, especially on Linux. It also supports capturing depth maps alongside
+    color frames.
+
+    Use the provided utility script to find available camera indices and default profiles:
+    ```bash
+    python -m lerobot.find_cameras realsense
+    ```
+
+    A `RealSenseCamera` instance requires a configuration object specifying the
+    camera's serial number or a unique device name. If using the name, ensure only
+    one camera with that name is connected.
+
+    The camera's default settings (FPS, resolution, color mode) from the stream
+    profile are used unless overridden in the configuration.
+
+    Example:
+        ```python
+        from lerobot.common.cameras.realsense import RealSenseCamera, RealSenseCameraConfig
+        from lerobot.common.cameras import ColorMode, Cv2Rotation
+
+        # Basic usage with serial number
+        config = RealSenseCameraConfig(serial_number_or_name="0123456789") # Replace with actual SN
+        camera = RealSenseCamera(config)
+        camera.connect()
+
+        # Read 1 frame synchronously
+        color_image = camera.read()
+        print(color_image.shape)
+
+        # Read 1 frame asynchronously
+        async_image = camera.async_read()
+
+        # When done, properly disconnect the camera using
+        camera.disconnect()
+
+        # Example with depth capture and custom settings
+        custom_config = RealSenseCameraConfig(
+            serial_number_or_name="0123456789", # Replace with actual SN
+            fps=30,
+            width=1280,
+            height=720,
+            color_mode=ColorMode.BGR, # Request BGR output
+            rotation=Cv2Rotation.NO_ROTATION,
+            use_depth=True
+        )
+        depth_camera = RealSenseCamera(custom_config)
+        depth_camera.connect()
+
+        # Read 1 depth frame
+        depth_map = depth_camera.read_depth()
+
+        # Example using a unique camera name
+        name_config = RealSenseCameraConfig(serial_number_or_name="Intel RealSense D435") # If unique
+        name_camera = RealSenseCamera(name_config)
+        # ... connect, read, disconnect ...
+        ```
+    """
+
+    def __init__(self, config: RealSenseCameraConfig):
+        """
+        Prepares the camera wrapper from its configuration; the pipeline is not started here.
+
+        When `config.serial_number_or_name` is not made only of digits it is treated as a
+        device name and resolved to a serial number via `_find_serial_number_from_name`.
+
+        Args:
+            config: The configuration settings for the camera.
+        """
+
+        super().__init__(config)
+
+        self.config = config
+
+        identifier = config.serial_number_or_name
+        self.serial_number = (
+            identifier if identifier.isdigit() else self._find_serial_number_from_name(identifier)
+        )
+
+        # Stream and output settings copied from the configuration.
+        self.fps = config.fps
+        self.color_mode = config.color_mode
+        self.use_depth = config.use_depth
+        self.warmup_s = config.warmup_s
+
+        # RealSense handles; both stay None until `connect()` succeeds.
+        self.rs_pipeline: rs.pipeline | None = None
+        self.rs_profile: rs.pipeline_profile | None = None
+
+        # State shared with the background read thread used by `async_read()`.
+        self.thread: Thread | None = None
+        self.stop_event: Event | None = None
+        self.frame_lock: Lock = Lock()
+        self.latest_frame: np.ndarray | None = None
+        self.new_frame_event: Event = Event()
+
+        self.rotation: int | None = get_cv2_rotation(config.rotation)
+
+        # `self.width` / `self.height` are not assigned in this method; presumably they are
+        # set by the base class constructor from the config (verify against `Camera`).
+        if self.height and self.width:
+            quarter_turn = self.rotation in (cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE)
+            if quarter_turn:
+                # A 90° rotation swaps the axes, so the sensor is asked for the transposed size.
+                self.capture_width, self.capture_height = self.height, self.width
+            else:
+                self.capture_width, self.capture_height = self.width, self.height
+
+    def __str__(self) -> str:
+        """Returns `<ClassName>(<serial_number>)`, the label used in log and error messages."""
+        class_name = type(self).__name__
+        return f"{class_name}({self.serial_number})"
+
+    @property
+    def is_connected(self) -> bool:
+        """True only while both the pipeline object and its started profile are held."""
+        if self.rs_pipeline is None:
+            return False
+        return self.rs_profile is not None
+
+    def connect(self, warmup: bool = True):
+        """
+        Connects to the RealSense camera specified in the configuration.
+
+        Creates the RealSense pipeline, enables the required streams (color and
+        optionally depth), starts the pipeline, and fills in any unset fps/width/height
+        from the color stream that was actually started.
+
+        Args:
+            warmup: If True, sleeps for one second and then keeps reading frames for
+                `warmup_s` seconds before returning.
+
+        Raises:
+            DeviceAlreadyConnectedError: If the camera is already connected.
+            ConnectionError: If the pipeline fails to start.
+            RuntimeError: If a read performed during warmup fails.
+        """
+        if self.is_connected:
+            raise DeviceAlreadyConnectedError(f"{self} is already connected.")
+
+        self.rs_pipeline = rs.pipeline()
+        rs_config = rs.config()
+        self._configure_rs_pipeline_config(rs_config)
+
+        try:
+            self.rs_profile = self.rs_pipeline.start(rs_config)
+        except RuntimeError as e:
+            # Reset both handles so `is_connected` stays False after a failed start.
+            self.rs_profile = None
+            self.rs_pipeline = None
+            raise ConnectionError(
+                f"Failed to open {self}. "
+                "Run `python -m lerobot.find_cameras realsense` to find available cameras."
+            ) from e
+
+        self._configure_capture_settings()
+
+        if warmup:
+            # NOTE(Steven): RS cameras need a bit of time to warm up before the first read.
+            # If we don't wait, the first read from the warmup will raise.
+            time.sleep(1)
+            start_time = time.time()
+            while time.time() - start_time < self.warmup_s:
+                self.read()
+                time.sleep(0.1)
+
+        logger.info(f"{self} connected.")
+
+    @staticmethod
+    def find_cameras() -> List[Dict[str, Any]]:
+        """
+        Detects available Intel RealSense cameras connected to the system.
+
+        Returns:
+            List[Dict[str, Any]]: One dictionary per detected device with the keys
+            'name', 'type' (always "RealSense"), 'id' (the serial number),
+            'firmware_version', 'usb_type_descriptor', 'physical_port', 'product_id'
+            and 'product_line'. When the device exposes at least one default video
+            stream profile, 'default_stream_profile' holds its 'stream_type', 'format',
+            'width', 'height' and 'fps'; if several sensors expose one, the last
+            profile visited wins.
+
+        Raises:
+            NameError: If `pyrealsense2` could not be imported. The module-level import
+                only logs the failure, which leaves `rs` undefined.
+        """
+        found_cameras_info = []
+        context = rs.context()
+        devices = context.query_devices()
+
+        for device in devices:
+            camera_info = {
+                "name": device.get_info(rs.camera_info.name),
+                "type": "RealSense",
+                # The serial number is exposed under "id", not "serial_number".
+                "id": device.get_info(rs.camera_info.serial_number),
+                "firmware_version": device.get_info(rs.camera_info.firmware_version),
+                "usb_type_descriptor": device.get_info(rs.camera_info.usb_type_descriptor),
+                "physical_port": device.get_info(rs.camera_info.physical_port),
+                "product_id": device.get_info(rs.camera_info.product_id),
+                "product_line": device.get_info(rs.camera_info.product_line),
+            }
+
+            # Get stream profiles for each sensor
+            sensors = device.query_sensors()
+            for sensor in sensors:
+                profiles = sensor.get_stream_profiles()
+
+                for profile in profiles:
+                    if profile.is_video_stream_profile() and profile.is_default():
+                        vprofile = profile.as_video_stream_profile()
+                        stream_info = {
+                            "stream_type": vprofile.stream_name(),
+                            "format": vprofile.format().name,
+                            "width": vprofile.width(),
+                            "height": vprofile.height(),
+                            "fps": vprofile.fps(),
+                        }
+                        # Overwrites any default profile recorded earlier for this device.
+                        camera_info["default_stream_profile"] = stream_info
+
+            found_cameras_info.append(camera_info)
+
+        return found_cameras_info
+
+    def _find_serial_number_from_name(self, name: str) -> str:
+        """Resolves a unique camera name to its serial number.
+
+        Args:
+            name: Device name, compared exactly against the 'name' entry of every
+                camera returned by `find_cameras()`.
+
+        Returns:
+            The serial number (the 'id' entry) of the single matching camera.
+
+        Raises:
+            ValueError: If no camera, or more than one camera, has this name.
+        """
+        camera_infos = self.find_cameras()
+        found_devices = [cam for cam in camera_infos if str(cam["name"]) == name]
+
+        if not found_devices:
+            available_names = [cam["name"] for cam in camera_infos]
+            raise ValueError(
+                f"No RealSense camera found with name '{name}'. Available camera names: {available_names}"
+            )
+
+        if len(found_devices) > 1:
+            # `find_cameras()` stores the serial number under the "id" key.
+            serial_numbers = [dev["id"] for dev in found_devices]
+            raise ValueError(
+                f"Multiple RealSense cameras found with name '{name}'. "
+                f"Please use a unique serial number instead. Found SNs: {serial_numbers}"
+            )
+
+        return str(found_devices[0]["id"])
+
+    def _configure_rs_pipeline_config(self, rs_config):
+        """Selects this camera on `rs_config` and enables its color (and optional depth) stream.
+
+        When width, height and fps are all set, the streams are requested with the capture
+        resolution and frame rate (rgb8 for color, z16 for depth). Otherwise the streams are
+        enabled without explicit parameters.
+        """
+        rs.config.enable_device(rs_config, self.serial_number)
+
+        explicit_profile = self.width and self.height and self.fps
+        if not explicit_profile:
+            rs_config.enable_stream(rs.stream.color)
+            if self.use_depth:
+                rs_config.enable_stream(rs.stream.depth)
+            return
+
+        requested_streams = [(rs.stream.color, rs.format.rgb8)]
+        if self.use_depth:
+            requested_streams.append((rs.stream.depth, rs.format.z16))
+
+        for stream_kind, pixel_format in requested_streams:
+            rs_config.enable_stream(
+                stream_kind, self.capture_width, self.capture_height, pixel_format, self.fps
+            )
+
+    def _configure_capture_settings(self) -> None:
+        """Fills in fps, width and height from the started color stream when they are unset.
+
+        `fps` is taken from the stream only if it is None. If either `width` or `height`
+        is None, the capture dimensions are set to the stream's size and `width`/`height`
+        are set to that size, swapped when a 90° rotation is configured.
+
+        Raises:
+            DeviceNotConnectedError: If device is not connected.
+        """
+        if not self.is_connected:
+            raise DeviceNotConnectedError(f"Cannot validate settings for {self} as it is not connected.")
+
+        color_profile = self.rs_profile.get_stream(rs.stream.color).as_video_stream_profile()
+
+        if self.fps is None:
+            self.fps = color_profile.fps()
+
+        if self.width is not None and self.height is not None:
+            return
+
+        native_width = int(round(color_profile.width()))
+        native_height = int(round(color_profile.height()))
+        quarter_turn = self.rotation in (cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE)
+
+        if quarter_turn:
+            # Output frames are transposed relative to what the sensor delivers.
+            self.width, self.height = native_height, native_width
+        else:
+            self.width, self.height = native_width, native_height
+        self.capture_width, self.capture_height = native_width, native_height
+
+    def read_depth(self, timeout_ms: int = 200) -> np.ndarray:
+        """
+        Grabs one depth frame from the pipeline, blocking for at most `timeout_ms`.
+
+        Args:
+            timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 200ms.
+
+        Returns:
+            np.ndarray: The depth map as a 2-D NumPy array (height, width) after
+                `_postprocess_image`. Values are the raw depth data of the frame;
+                presumably millimetres with the default device depth scale (confirm
+                against the device's depth units).
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected.
+            RuntimeError: If the depth stream is not enabled, or if no frame could be
+                read within the timeout.
+        """
+
+        if not self.is_connected:
+            raise DeviceNotConnectedError(f"{self} is not connected.")
+        if not self.use_depth:
+            raise RuntimeError(
+                f"Failed to capture depth frame '.read_depth()'. Depth stream is not enabled for {self}."
+            )
+
+        start_time = time.perf_counter()
+
+        ret, frame = self.rs_pipeline.try_wait_for_frames(timeout_ms=timeout_ms)
+        if not (ret and frame is not None):
+            raise RuntimeError(f"{self} read_depth failed (status={ret}).")
+
+        raw_depth = np.asanyarray(frame.get_depth_frame().get_data())
+        output_depth = self._postprocess_image(raw_depth, depth_frame=True)
+
+        read_duration_ms = 1e3 * (time.perf_counter() - start_time)
+        logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")
+
+        return output_depth
+
+    def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np.ndarray:
+        """
+        Grabs one color frame from the pipeline, blocking for at most `timeout_ms`.
+
+        Args:
+            color_mode (ColorMode | None): Optional color mode forwarded to
+                `_postprocess_image`; must be RGB or BGR when given.
+            timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 200ms.
+
+        Returns:
+            np.ndarray: The captured color frame as a NumPy array
+              (height, width, channels) after `_postprocess_image`.
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected.
+            RuntimeError: If no frame could be read within the timeout or the frame
+                fails validation.
+            ValueError: If an invalid `color_mode` is requested.
+        """
+
+        if not self.is_connected:
+            raise DeviceNotConnectedError(f"{self} is not connected.")
+
+        start_time = time.perf_counter()
+
+        ret, frame = self.rs_pipeline.try_wait_for_frames(timeout_ms=timeout_ms)
+        if not (ret and frame is not None):
+            raise RuntimeError(f"{self} read failed (status={ret}).")
+
+        raw_color = np.asanyarray(frame.get_color_frame().get_data())
+        output_image = self._postprocess_image(raw_color, color_mode)
+
+        read_duration_ms = 1e3 * (time.perf_counter() - start_time)
+        logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")
+
+        return output_image
+
+    def _postprocess_image(
+        self, image: np.ndarray, color_mode: ColorMode | None = None, depth_frame: bool = False
+    ) -> np.ndarray:
+        """
+        Validates a raw frame, then applies color conversion and rotation.
+
+        Args:
+            image (np.ndarray): The raw frame. Color frames are unpacked as
+                (height, width, channels) and treated as RGB; depth frames are
+                unpacked as (height, width).
+            color_mode (Optional[ColorMode]): The target color mode (RGB or BGR). If None,
+                uses the instance's default `self.color_mode`.
+            depth_frame (bool): True when `image` is a depth map. Depth maps skip the
+                channel check and the color conversion.
+
+        Returns:
+            np.ndarray: The frame converted to the target color mode (color frames only)
+                and rotated according to `self.rotation`.
+
+        Raises:
+            ValueError: If the requested `color_mode` is invalid.
+            RuntimeError: If a color frame does not have 3 channels, or if the frame
+                dimensions do not match the capture width and height.
+        """
+
+        if color_mode and color_mode not in (ColorMode.RGB, ColorMode.BGR):
+            raise ValueError(
+                f"Invalid requested color mode '{color_mode}'. Expected {ColorMode.RGB} or {ColorMode.BGR}."
+            )
+
+        # A per-call request takes precedence over the instance default.
+        target_color_mode = color_mode or self.color_mode
+
+        if depth_frame:
+            h, w = image.shape
+        else:
+            h, w, c = image.shape
+
+            if c != 3:
+                raise RuntimeError(f"{self} frame channels={c} do not match expected 3 channels (RGB/BGR).")
+
+        if h != self.capture_height or w != self.capture_width:
+            raise RuntimeError(
+                f"{self} frame width={w} or height={h} do not match configured width={self.capture_width} or height={self.capture_height}."
+            )
+
+        processed_image = image
+        # Depth maps are single-channel, so the RGB->BGR swap only applies to color frames.
+        if not depth_frame and target_color_mode == ColorMode.BGR:
+            processed_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+        # 180° keeps the frame size but still has to be applied, like the two 90° cases.
+        # `self.rotation` is None when no rotation is configured.
+        if self.rotation in (cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, cv2.ROTATE_180):
+            processed_image = cv2.rotate(processed_image, self.rotation)
+
+        return processed_image
+
+    def _read_loop(self):
+        """
+        Internal loop run by the background thread for asynchronous reading.
+
+        On each iteration:
+        1. Reads a color frame with 500ms timeout
+        2. Stores result in latest_frame (thread-safe)
+        3. Sets new_frame_event to notify listeners
+
+        The loop re-reads `self.stop_event` on every iteration and runs until that event
+        is set. It stops on DeviceNotConnectedError; any other error is logged as a
+        warning and the loop continues.
+        """
+        while not self.stop_event.is_set():
+            try:
+                color_image = self.read(timeout_ms=500)
+
+                # Publish the frame under the lock, then wake up any `async_read()` waiter.
+                with self.frame_lock:
+                    self.latest_frame = color_image
+                self.new_frame_event.set()
+
+            except DeviceNotConnectedError:
+                # The camera was disconnected; there is nothing left to read.
+                break
+            except Exception as e:
+                # Best effort: a failed read (e.g. a timeout) must not kill the thread.
+                logger.warning(f"Error reading frame in background thread for {self}: {e}")
+
+    def _start_read_thread(self) -> None:
+        """Starts or restarts the background read thread.
+
+        Any previous stop event is set before the previous thread is joined (for at most
+        0.1 s), so that the join has a chance to succeed. A fresh stop event and a new
+        daemon thread running `_read_loop` are then created.
+        """
+        # Signal first: joining a loop that was never asked to stop can only time out.
+        if self.stop_event is not None:
+            self.stop_event.set()
+        if self.thread is not None and self.thread.is_alive():
+            self.thread.join(timeout=0.1)
+
+        self.stop_event = Event()
+        self.thread = Thread(target=self._read_loop, args=(), name=f"{self}_read_loop")
+        self.thread.daemon = True
+        self.thread.start()
+
+    def _stop_read_thread(self):
+        """Asks the background read thread to stop, waits up to 2 s for it, then drops both handles."""
+        stop_signal, worker = self.stop_event, self.thread
+
+        if stop_signal is not None:
+            stop_signal.set()
+
+        if worker is not None and worker.is_alive():
+            worker.join(timeout=2.0)
+
+        self.thread = None
+        self.stop_event = None
+
+    # NOTE(Steven): Missing implementation for depth for now
+    def async_read(self, timeout_ms: float = 200) -> np.ndarray:
+        """
+        Reads the latest available frame data (color) asynchronously.
+
+        This method retrieves the most recent color frame captured by the background
+        read thread. It does not block waiting for the camera hardware directly,
+        but may wait up to timeout_ms for the background thread to provide a frame.
+        The background thread is started on demand if it is not running.
+
+        Args:
+            timeout_ms (float): Maximum time in milliseconds to wait for a frame
+                to become available. Defaults to 200ms (0.2 seconds).
+
+        Returns:
+            np.ndarray:
+            The latest captured frame data (color image), processed according to configuration.
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected.
+            TimeoutError: If no frame data becomes available within the specified timeout.
+            RuntimeError: If the new-frame event was set but no frame is stored.
+        """
+        if not self.is_connected:
+            raise DeviceNotConnectedError(f"{self} is not connected.")
+
+        # Lazily start (or restart) the reader the first time a frame is requested.
+        if self.thread is None or not self.thread.is_alive():
+            self._start_read_thread()
+
+        # `Event.wait` takes seconds, hence the conversion from milliseconds.
+        if not self.new_frame_event.wait(timeout=timeout_ms / 1000.0):
+            thread_alive = self.thread is not None and self.thread.is_alive()
+            raise TimeoutError(
+                f"Timed out waiting for frame from camera {self} after {timeout_ms} ms. "
+                f"Read thread alive: {thread_alive}."
+            )
+
+        # Take the frame and reset the event under the same lock the reader uses to
+        # store frames, so the next call waits for a newer frame.
+        with self.frame_lock:
+            frame = self.latest_frame
+            self.new_frame_event.clear()
+
+        if frame is None:
+            raise RuntimeError(f"Internal error: Event set but no frame available for {self}.")
+
+        return frame
+
+    def disconnect(self):
+        """
+        Stops the background read thread (if any) and the RealSense pipeline.
+
+        After this call both `rs_pipeline` and `rs_profile` are None, so
+        `is_connected` reports False.
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected and no read thread exists.
+        """
+
+        if not (self.is_connected or self.thread is not None):
+            raise DeviceNotConnectedError(
+                f"Attempted to disconnect {self}, but it appears already disconnected."
+            )
+
+        # Stop the reader first so it no longer polls a pipeline that is about to stop.
+        if self.thread is not None:
+            self._stop_read_thread()
+
+        pipeline = self.rs_pipeline
+        if pipeline is not None:
+            pipeline.stop()
+            self.rs_pipeline = None
+            self.rs_profile = None
+
+        logger.info(f"{self} disconnected.")
--- a/lerobot/common/cameras/realsense/configuration_realsense.py
+++ b/lerobot/common/cameras/realsense/configuration_realsense.py
@@ -0,0 +1,82 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from ..configs import CameraConfig, ColorMode, Cv2Rotation
+
+
+@CameraConfig.register_subclass("intelrealsense")
+@dataclass
+class RealSenseCameraConfig(CameraConfig):
+    """Settings for one Intel RealSense camera.
+
+    Registered under the type name "intelrealsense". The camera is identified by
+    `serial_number_or_name`, and a depth stream can be requested with `use_depth`.
+
+    Example configurations for Intel RealSense D405:
+    ```python
+    # Basic configurations
+    RealSenseCameraConfig("0123456789", 30, 1280, 720)   # 1280x720 @ 30FPS
+    RealSenseCameraConfig("0123456789", 60, 640, 480)   # 640x480 @ 60FPS
+
+    # Advanced configurations
+    RealSenseCameraConfig("0123456789", 30, 640, 480, use_depth=True)  # With depth sensing
+    RealSenseCameraConfig("0123456789", 30, 640, 480, rotation=Cv2Rotation.ROTATE_90)     # With 90° rotation
+    ```
+
+    Attributes:
+        fps: Requested frames per second for the color stream (not declared in this
+            class; presumably inherited from `CameraConfig`).
+        width: Requested frame width in pixels for the color stream (see `fps`).
+        height: Requested frame height in pixels for the color stream (see `fps`).
+        serial_number_or_name: Unique serial number or human-readable name to identify the camera.
+        color_mode: Color mode for image output (RGB or BGR). Defaults to RGB.
+        use_depth: Whether to enable depth stream. Defaults to False.
+        rotation: Image rotation setting (0°, 90°, 180°, or 270°). Defaults to no rotation.
+        warmup_s: Time reading frames before returning from connect (in seconds)
+
+    Note:
+        - `serial_number_or_name` is required.
+        - For `fps`, `width` and `height`, either all of them need to be set, or none of them.
+        - NOTE(review): the camera may adjust resolution and FPS to the nearest supported
+          mode, and depth is expected to share the color FPS; neither is enforced here.
+    """
+
+    serial_number_or_name: str
+    color_mode: ColorMode = ColorMode.RGB
+    use_depth: bool = False
+    rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION
+    warmup_s: int = 1
+
+    def __post_init__(self):
+        """Rejects unsupported color modes and rotations, and partially set stream settings.
+
+        Raises:
+            ValueError: If `color_mode` or `rotation` is unsupported, or if only some of
+                `fps`, `width` and `height` are set.
+        """
+        color_mode_supported = self.color_mode in (ColorMode.RGB, ColorMode.BGR)
+        if not color_mode_supported:
+            raise ValueError(
+                f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided."
+            )
+
+        rotation_supported = self.rotation in (
+            Cv2Rotation.NO_ROTATION,
+            Cv2Rotation.ROTATE_90,
+            Cv2Rotation.ROTATE_180,
+            Cv2Rotation.ROTATE_270,
+        )
+        if not rotation_supported:
+            raise ValueError(
+                f"`rotation` is expected to be in {(Cv2Rotation.NO_ROTATION, Cv2Rotation.ROTATE_90, Cv2Rotation.ROTATE_180, Cv2Rotation.ROTATE_270)}, but {self.rotation} is provided."
+            )
+
+        # All-or-nothing: a mix of set and unset stream settings is rejected.
+        stream_settings = (self.fps, self.width, self.height)
+        unset = [setting for setting in stream_settings if setting is None]
+        if unset and len(unset) != len(stream_settings):
+            raise ValueError(
+                "For `fps`, `width` and `height`, either all of them need to be set, or none of them."
+            )
--- a/lerobot/common/cameras/utils.py
+++ b/lerobot/common/cameras/utils.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import platform
+from pathlib import Path
+from typing import TypeAlias
+
+from .camera import Camera
+from .configs import CameraConfig, Cv2Rotation
+
+IndexOrPath: TypeAlias = int | Path
+
+
+def make_cameras_from_configs(camera_configs: dict[str, CameraConfig]) -> dict[str, Camera]:
+    """Builds one camera instance per configuration, keyed like `camera_configs`.
+
+    Args:
+        camera_configs: Mapping from a camera key to its configuration. The `type`
+            attribute of each configuration selects the camera class.
+
+    Returns:
+        A dict with the same keys whose values are the constructed cameras
+        ("opencv" -> `OpenCVCamera`, "intelrealsense" -> `RealSenseCamera`).
+
+    Raises:
+        ValueError: If a configuration has an unsupported `type`.
+    """
+    cameras = {}
+
+    for key, cfg in camera_configs.items():
+        # Backends are imported inside the branch, presumably so that an optional
+        # dependency is only loaded when that camera type is requested.
+        if cfg.type == "opencv":
+            from .opencv import OpenCVCamera
+
+            cameras[key] = OpenCVCamera(cfg)
+
+        elif cfg.type == "intelrealsense":
+            from .realsense.camera_realsense import RealSenseCamera
+
+            cameras[key] = RealSenseCamera(cfg)
+        else:
+            raise ValueError(f"The camera type '{cfg.type}' is not valid.")
+
+    return cameras
+
+
+def get_cv2_rotation(rotation: Cv2Rotation) -> int | None:
+    """Maps a `Cv2Rotation` member to the matching `cv2` rotation flag.
+
+    Returns:
+        `cv2.ROTATE_90_CLOCKWISE`, `cv2.ROTATE_180` or `cv2.ROTATE_90_COUNTERCLOCKWISE`
+        for 90°, 180° and 270° respectively, and None for any other value.
+    """
+    import cv2
+
+    flag_by_rotation = (
+        (Cv2Rotation.ROTATE_90, cv2.ROTATE_90_CLOCKWISE),
+        (Cv2Rotation.ROTATE_180, cv2.ROTATE_180),
+        (Cv2Rotation.ROTATE_270, cv2.ROTATE_90_COUNTERCLOCKWISE),
+    )
+    for member, cv2_flag in flag_by_rotation:
+        if rotation == member:
+            return cv2_flag
+    return None
+
+
+def get_cv2_backend() -> int:
+    """Returns the OpenCV capture backend flag to use on the current platform.
+
+    Returns:
+        `cv2.CAP_MSMF` on Windows, `cv2.CAP_ANY` on every other platform.
+    """
+    import cv2
+
+    # AVFoundation is the macOS capture framework and does not exist on Windows;
+    # Media Foundation (MSMF) is the native Windows backend.
+    if platform.system() == "Windows":
+        return cv2.CAP_MSMF
+    return cv2.CAP_ANY
--- a/lerobot/common/constants.py
+++ b/lerobot/common/constants.py
@@ -0,0 +1,52 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# keys
+import os
+from pathlib import Path
+
+from huggingface_hub.constants import HF_HOME
+
+# Observation and action key names.
+OBS_ENV_STATE = "observation.environment_state"
+OBS_STATE = "observation.state"
+OBS_IMAGE = "observation.image"
+OBS_IMAGES = "observation.images"
+ACTION = "action"
+
+ROBOTS = "robots"
+TELEOPERATORS = "teleoperators"
+
+# files & directories
+CHECKPOINTS_DIR = "checkpoints"
+LAST_CHECKPOINT_LINK = "last"
+PRETRAINED_MODEL_DIR = "pretrained_model"
+TRAINING_STATE_DIR = "training_state"
+RNG_STATE = "rng_state.safetensors"
+TRAINING_STEP = "training_step.json"
+OPTIMIZER_STATE = "optimizer_state.safetensors"
+OPTIMIZER_PARAM_GROUPS = "optimizer_param_groups.json"
+SCHEDULER_STATE = "scheduler_state.json"
+
+# Fail at import time while the deprecated variable is still set, so that a stale
+# environment is noticed instead of being silently ignored.
+if "LEROBOT_HOME" in os.environ:
+    raise ValueError(
+        f"You have a 'LEROBOT_HOME' environment variable set to '{os.getenv('LEROBOT_HOME')}'.\n"
+        "'LEROBOT_HOME' is deprecated, please use 'HF_LEROBOT_HOME' instead."
+    )
+
+# cache dir: `HF_LEROBOT_HOME` from the environment if set, otherwise `<HF_HOME>/lerobot`.
+default_cache_path = Path(HF_HOME) / "lerobot"
+HF_LEROBOT_HOME = Path(os.getenv("HF_LEROBOT_HOME", default_cache_path)).expanduser()
+
+# calibration dir: `HF_LEROBOT_CALIBRATION` from the environment if set, otherwise
+# `<HF_LEROBOT_HOME>/calibration`.
+default_calibration_path = HF_LEROBOT_HOME / "calibration"
+HF_LEROBOT_CALIBRATION = Path(os.getenv("HF_LEROBOT_CALIBRATION", default_calibration_path)).expanduser()
--- a/lerobot/common/datasets/_video_benchmark/README.md
+++ b/lerobot/common/datasets/_video_benchmark/README.md
@@ -1,334 +0,0 @@
-# Video benchmark
-
-
-## Questions
-
-What is the optimal trade-off between:
- maximizing loading time with random access,
- minimizing memory space on disk,
- maximizing success rate of policies?
-
-How to encode videos?
- How much compression (`-crf`)? Low compression with `0`, normal compression with `20` or extreme with `56`?
- What pixel format to use (`-pix_fmt`)? `yuv444p` or `yuv420p`?
- How many key frames (`-g`)? A key frame every `10` frames?
-
-How to decode videos?
- Which `decoder`? `torchvision`, `torchaudio`, `ffmpegio`, `decord`, or `nvc`?
-
-## Metrics
-
-**Percentage of data compression (higher is better)**
-`compression_factor` is the ratio of the memory space on disk taken by the original images to encode, to the memory space taken by the encoded video. For instance, `compression_factor=4` means that the video takes 4 times less memory space on disk compared to the original images.
-
-**Percentage of loading time (higher is better)**
-`load_time_factor` is the ratio of the time it takes to load original images at given timestamps, to the time it takes to decode the exact same frames from the video. Higher is better. For instance, `load_time_factor=0.5` means that decoding from video is 2 times slower than loading the original images.
-
-**Average L2 error per pixel (lower is better)**
-`avg_per_pixel_l2_error` is the average L2 error between each decoded frame and its corresponding original image over all requested timestamps, and also divided by the number of pixels in the image to be comparable when switching to different image sizes.
-
-**Loss of a pretrained policy (higher is better)** (not available)
-`loss_pretrained` is the result of evaluating with the selected encoding/decoding settings a policy pretrained on original images. It is easier to understand than `avg_l2_error`.
-
-**Success rate after retraining (higher is better)** (not available)
-`success_rate` is the result of training and evaluating a policy with the selected encoding/decoding settings. It is the most difficult metric to get but also the very best.
-
-
-## Variables
-
-**Image content**
-We don't expect the same optimal settings for a dataset of images from a simulation, or from real-world in an appartment, or in a factory, or outdoor, etc. Hence, we run this benchmark on two datasets: `pusht` (simulation) and `umi` (real-world outdoor).
-
-**Requested timestamps**
-In this benchmark, we focus on the loading time of random access, so we are not interested in sequentially loading all frames of a video like in a movie. However, the number of consecutive timestamps requested and their spacing can greatly affect the `load_time_factor`. In fact, it is expected to get faster loading time by decoding a large number of consecutive frames from a video, than to load the same data from individual images. To reflect our robotics use case, we consider a few settings:
- `single_frame`: 1 frame,
- `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
- `2_frames_4_space`: 2 consecutive frames with 4 frames of spacing (e.g `[t, t + 4 / fps]`),
-
-**Data augmentations**
-We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
-
-
-## Results
-
-**`decoder`**
-| repo_id | decoder | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- |
-| lerobot/pusht | <span style="color: #32CD32;">torchvision</span> | 0.166 | 0.0000119 |
-| lerobot/pusht | ffmpegio | 0.009 | 0.0001182 |
-| lerobot/pusht | torchaudio | 0.138 | 0.0000359 |
-| lerobot/umi_cup_in_the_wild | <span style="color: #32CD32;">torchvision</span> | 0.174 | 0.0000174 |
-| lerobot/umi_cup_in_the_wild | ffmpegio | 0.010 | 0.0000735 |
-| lerobot/umi_cup_in_the_wild | torchaudio | 0.154 | 0.0000340 |
-
-### `1_frame`
-
-**`pix_fmt`**
-| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | yuv420p | 3.788 | 0.224 | 0.0000760 |
-| lerobot/pusht | yuv444p | 3.646 | 0.185 | 0.0000443 |
-| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 0.388 | 0.0000469 |
-| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.329 | 0.0000397 |
-
-**`g`**
-| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 1 | 2.543 | 0.204 | 0.0000556 |
-| lerobot/pusht | 2 | 3.646 | 0.182 | 0.0000443 |
-| lerobot/pusht | 3 | 4.431 | 0.174 | 0.0000450 |
-| lerobot/pusht | 4 | 5.103 | 0.163 | 0.0000448 |
-| lerobot/pusht | 5 | 5.625 | 0.163 | 0.0000436 |
-| lerobot/pusht | 6 | 5.974 | 0.155 | 0.0000427 |
-| lerobot/pusht | 10 | 6.814 | 0.130 | 0.0000410 |
-| lerobot/pusht | 15 | 7.431 | 0.105 | 0.0000406 |
-| lerobot/pusht | 20 | 7.662 | 0.097 | 0.0000400 |
-| lerobot/pusht | 40 | 8.163 | 0.061 | 0.0000405 |
-| lerobot/pusht | 100 | 8.761 | 0.039 | 0.0000422 |
-| lerobot/pusht | None | 8.909 | 0.024 | 0.0000431 |
-| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.444 | 0.0000601 |
-| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.345 | 0.0000397 |
-| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.282 | 0.0000416 |
-| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.271 | 0.0000415 |
-| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.260 | 0.0000415 |
-| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.249 | 0.0000415 |
-| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.195 | 0.0000399 |
-| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.169 | 0.0000394 |
-| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.140 | 0.0000390 |
-| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.096 | 0.0000384 |
-| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.046 | 0.0000390 |
-| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.022 | 0.0000400 |
-
-**`crf`**
-| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 0 | 1.699 | 0.175 | 0.0000035 |
-| lerobot/pusht | 5 | 1.409 | 0.181 | 0.0000080 |
-| lerobot/pusht | 10 | 1.842 | 0.172 | 0.0000123 |
-| lerobot/pusht | 15 | 2.322 | 0.187 | 0.0000211 |
-| lerobot/pusht | 20 | 3.050 | 0.181 | 0.0000346 |
-| lerobot/pusht | None | 3.646 | 0.189 | 0.0000443 |
-| lerobot/pusht | 25 | 3.969 | 0.186 | 0.0000521 |
-| lerobot/pusht | 30 | 5.687 | 0.184 | 0.0000850 |
-| lerobot/pusht | 40 | 10.818 | 0.193 | 0.0001726 |
-| lerobot/pusht | 50 | 18.185 | 0.183 | 0.0002606 |
-| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.165 | 0.0000056 |
-| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.171 | 0.0000111 |
-| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.212 | 0.0000153 |
-| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.261 | 0.0000218 |
-| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.312 | 0.0000317 |
-| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.339 | 0.0000397 |
-| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.297 | 0.0000452 |
-| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 0.406 | 0.0000629 |
-| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 0.468 | 0.0001184 |
-| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 0.515 | 0.0001879 |
-
-**best**
-| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- |
-| lerobot/pusht | 3.646 | 0.188 | 0.0000443 |
-| lerobot/umi_cup_in_the_wild | 14.932 | 0.339 | 0.0000397 |
-
-### `2_frames`
-
-**`pix_fmt`**
-| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | yuv420p | 3.788 | 0.314 | 0.0000799 |
-| lerobot/pusht | yuv444p | 3.646 | 0.303 | 0.0000496 |
-| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 0.642 | 0.0000503 |
-| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.529 | 0.0000436 |
-
-**`g`**
-| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 1 | 2.543 | 0.308 | 0.0000599 |
-| lerobot/pusht | 2 | 3.646 | 0.279 | 0.0000496 |
-| lerobot/pusht | 3 | 4.431 | 0.259 | 0.0000498 |
-| lerobot/pusht | 4 | 5.103 | 0.243 | 0.0000501 |
-| lerobot/pusht | 5 | 5.625 | 0.235 | 0.0000492 |
-| lerobot/pusht | 6 | 5.974 | 0.230 | 0.0000481 |
-| lerobot/pusht | 10 | 6.814 | 0.194 | 0.0000468 |
-| lerobot/pusht | 15 | 7.431 | 0.152 | 0.0000460 |
-| lerobot/pusht | 20 | 7.662 | 0.151 | 0.0000455 |
-| lerobot/pusht | 40 | 8.163 | 0.095 | 0.0000454 |
-| lerobot/pusht | 100 | 8.761 | 0.062 | 0.0000472 |
-| lerobot/pusht | None | 8.909 | 0.037 | 0.0000479 |
-| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.638 | 0.0000625 |
-| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.537 | 0.0000436 |
-| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.493 | 0.0000437 |
-| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.458 | 0.0000446 |
-| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.438 | 0.0000445 |
-| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.424 | 0.0000444 |
-| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.345 | 0.0000435 |
-| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.313 | 0.0000417 |
-| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.264 | 0.0000421 |
-| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.185 | 0.0000414 |
-| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.090 | 0.0000420 |
-| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.042 | 0.0000424 |
-
-**`crf`**
-| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 0 | 1.699 | 0.302 | 0.0000097 |
-| lerobot/pusht | 5 | 1.409 | 0.287 | 0.0000142 |
-| lerobot/pusht | 10 | 1.842 | 0.283 | 0.0000184 |
-| lerobot/pusht | 15 | 2.322 | 0.305 | 0.0000268 |
-| lerobot/pusht | 20 | 3.050 | 0.285 | 0.0000402 |
-| lerobot/pusht | None | 3.646 | 0.285 | 0.0000496 |
-| lerobot/pusht | 25 | 3.969 | 0.293 | 0.0000572 |
-| lerobot/pusht | 30 | 5.687 | 0.293 | 0.0000893 |
-| lerobot/pusht | 40 | 10.818 | 0.319 | 0.0001762 |
-| lerobot/pusht | 50 | 18.185 | 0.304 | 0.0002626 |
-| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.235 | 0.0000112 |
-| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.261 | 0.0000166 |
-| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.333 | 0.0000207 |
-| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.406 | 0.0000267 |
-| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.489 | 0.0000361 |
-| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.537 | 0.0000436 |
-| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.578 | 0.0000487 |
-| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 0.453 | 0.0000655 |
-| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 0.767 | 0.0001192 |
-| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 0.816 | 0.0001881 |
-
-**best**
-| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- |
-| lerobot/pusht | 3.646 | 0.283 | 0.0000496 |
-| lerobot/umi_cup_in_the_wild | 14.932 | 0.543 | 0.0000436 |
-
-### `2_frames_4_space`
-
-**`pix_fmt`**
-| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | yuv420p | 3.788 | 0.257 | 0.0000855 |
-| lerobot/pusht | yuv444p | 3.646 | 0.261 | 0.0000556 |
-| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 0.493 | 0.0000476 |
-| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.371 | 0.0000404 |
-
-**`g`**
-| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 1 | 2.543 | 0.226 | 0.0000670 |
-| lerobot/pusht | 2 | 3.646 | 0.222 | 0.0000556 |
-| lerobot/pusht | 3 | 4.431 | 0.217 | 0.0000567 |
-| lerobot/pusht | 4 | 5.103 | 0.204 | 0.0000555 |
-| lerobot/pusht | 5 | 5.625 | 0.179 | 0.0000556 |
-| lerobot/pusht | 6 | 5.974 | 0.188 | 0.0000544 |
-| lerobot/pusht | 10 | 6.814 | 0.160 | 0.0000531 |
-| lerobot/pusht | 15 | 7.431 | 0.150 | 0.0000521 |
-| lerobot/pusht | 20 | 7.662 | 0.123 | 0.0000519 |
-| lerobot/pusht | 40 | 8.163 | 0.092 | 0.0000519 |
-| lerobot/pusht | 100 | 8.761 | 0.053 | 0.0000533 |
-| lerobot/pusht | None | 8.909 | 0.034 | 0.0000541 |
-| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.409 | 0.0000607 |
-| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.381 | 0.0000404 |
-| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.355 | 0.0000418 |
-| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.346 | 0.0000425 |
-| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.354 | 0.0000419 |
-| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.336 | 0.0000419 |
-| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.314 | 0.0000402 |
-| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.269 | 0.0000397 |
-| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.246 | 0.0000395 |
-| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.171 | 0.0000390 |
-| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.091 | 0.0000399 |
-| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.043 | 0.0000409 |
-
-**`crf`**
-| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 0 | 1.699 | 0.212 | 0.0000193 |
-| lerobot/pusht | 5 | 1.409 | 0.211 | 0.0000232 |
-| lerobot/pusht | 10 | 1.842 | 0.199 | 0.0000270 |
-| lerobot/pusht | 15 | 2.322 | 0.198 | 0.0000347 |
-| lerobot/pusht | 20 | 3.050 | 0.211 | 0.0000469 |
-| lerobot/pusht | None | 3.646 | 0.206 | 0.0000556 |
-| lerobot/pusht | 25 | 3.969 | 0.210 | 0.0000626 |
-| lerobot/pusht | 30 | 5.687 | 0.223 | 0.0000927 |
-| lerobot/pusht | 40 | 10.818 | 0.227 | 0.0001763 |
-| lerobot/pusht | 50 | 18.185 | 0.223 | 0.0002625 |
-| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.147 | 0.0000071 |
-| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.182 | 0.0000125 |
-| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.222 | 0.0000166 |
-| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.270 | 0.0000229 |
-| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.325 | 0.0000326 |
-| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.362 | 0.0000404 |
-| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.390 | 0.0000459 |
-| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 0.437 | 0.0000633 |
-| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 0.499 | 0.0001186 |
-| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 0.564 | 0.0001879 |
-
-**best**
-| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- |
-| lerobot/pusht | 3.646 | 0.224 | 0.0000556 |
-| lerobot/umi_cup_in_the_wild | 14.932 | 0.368 | 0.0000404 |
-
-### `6_frames`
-
-**`pix_fmt`**
-| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | yuv420p | 3.788 | 0.660 | 0.0000839 |
-| lerobot/pusht | yuv444p | 3.646 | 0.546 | 0.0000542 |
-| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 1.225 | 0.0000497 |
-| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.908 | 0.0000428 |
-
-**`g`**
-| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 1 | 2.543 | 0.552 | 0.0000646 |
-| lerobot/pusht | 2 | 3.646 | 0.534 | 0.0000542 |
-| lerobot/pusht | 3 | 4.431 | 0.563 | 0.0000546 |
-| lerobot/pusht | 4 | 5.103 | 0.537 | 0.0000545 |
-| lerobot/pusht | 5 | 5.625 | 0.477 | 0.0000532 |
-| lerobot/pusht | 6 | 5.974 | 0.515 | 0.0000530 |
-| lerobot/pusht | 10 | 6.814 | 0.410 | 0.0000512 |
-| lerobot/pusht | 15 | 7.431 | 0.405 | 0.0000503 |
-| lerobot/pusht | 20 | 7.662 | 0.345 | 0.0000500 |
-| lerobot/pusht | 40 | 8.163 | 0.247 | 0.0000496 |
-| lerobot/pusht | 100 | 8.761 | 0.147 | 0.0000510 |
-| lerobot/pusht | None | 8.909 | 0.100 | 0.0000519 |
-| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.997 | 0.0000620 |
-| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.911 | 0.0000428 |
-| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.869 | 0.0000433 |
-| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.874 | 0.0000438 |
-| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.864 | 0.0000439 |
-| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.834 | 0.0000440 |
-| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.781 | 0.0000421 |
-| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.679 | 0.0000411 |
-| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.652 | 0.0000410 |
-| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.465 | 0.0000404 |
-| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.245 | 0.0000413 |
-| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.116 | 0.0000417 |
-
-**`crf`**
-| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- | --- |
-| lerobot/pusht | 0 | 1.699 | 0.534 | 0.0000163 |
-| lerobot/pusht | 5 | 1.409 | 0.524 | 0.0000205 |
-| lerobot/pusht | 10 | 1.842 | 0.510 | 0.0000245 |
-| lerobot/pusht | 15 | 2.322 | 0.512 | 0.0000324 |
-| lerobot/pusht | 20 | 3.050 | 0.508 | 0.0000452 |
-| lerobot/pusht | None | 3.646 | 0.518 | 0.0000542 |
-| lerobot/pusht | 25 | 3.969 | 0.534 | 0.0000616 |
-| lerobot/pusht | 30 | 5.687 | 0.530 | 0.0000927 |
-| lerobot/pusht | 40 | 10.818 | 0.552 | 0.0001777 |
-| lerobot/pusht | 50 | 18.185 | 0.564 | 0.0002644 |
-| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.401 | 0.0000101 |
-| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.499 | 0.0000156 |
-| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.599 | 0.0000197 |
-| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.704 | 0.0000258 |
-| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.834 | 0.0000352 |
-| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.925 | 0.0000428 |
-| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.978 | 0.0000480 |
-| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 1.088 | 0.0000648 |
-| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 1.324 | 0.0001190 |
-| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 1.436 | 0.0001880 |
-
-**best**
-| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
-| --- | --- | --- | --- |
-| lerobot/pusht | 3.646 | 0.546 | 0.0000542 |
-| lerobot/umi_cup_in_the_wild | 14.932 | 0.934 | 0.0000428 |
--- a/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
+++ b/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
@@ -1,372 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import random
-import shutil
-import subprocess
-import time
-from pathlib import Path
-
-import einops
-import numpy
-import PIL
-import torch
-
-from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
-from lerobot.common.datasets.video_utils import (
-    decode_video_frames_torchvision,
-)
-
-
-def get_directory_size(directory):
-    total_size = 0
-    # Iterate over all files and subdirectories recursively
-    for item in directory.rglob("*"):
-        if item.is_file():
-            # Add the file size to the total
-            total_size += item.stat().st_size
-    return total_size
-
-
-def run_video_benchmark(
-    output_dir,
-    cfg,
-    timestamps_mode,
-    seed=1337,
-):
-    output_dir = Path(output_dir)
-    if output_dir.exists():
-        shutil.rmtree(output_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    repo_id = cfg["repo_id"]
-
-    # TODO(rcadene): rewrite with hardcoding of original images and episodes
-    dataset = LeRobotDataset(repo_id)
-
-    # Get fps
-    fps = dataset.fps
-
-    # we only load first episode
-    ep_num_images = dataset.episode_data_index["to"][0].item()
-
-    # Save/Load image directory for the first episode
-    imgs_dir = Path(f"tmp/data/images/{repo_id}/observation.image_episode_000000")
-    if not imgs_dir.exists():
-        imgs_dir.mkdir(parents=True, exist_ok=True)
-        hf_dataset = dataset.hf_dataset.with_format(None)
-        imgs_dataset = hf_dataset.select_columns("observation.image")
-
-        for i, item in enumerate(imgs_dataset):
-            img = item["observation.image"]
-            img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)
-
-            if i >= ep_num_images - 1:
-                break
-
-    sum_original_frames_size_bytes = get_directory_size(imgs_dir)
-
-    # Encode images into video
-    video_path = output_dir / "episode_0.mp4"
-
-    g = cfg.get("g")
-    crf = cfg.get("crf")
-    pix_fmt = cfg["pix_fmt"]
-
-    cmd = f"ffmpeg -r {fps} "
-    cmd += "-f image2 "
-    cmd += "-loglevel error "
-    cmd += f"-i {str(imgs_dir / 'frame_%06d.png')} "
-    cmd += "-vcodec libx264 "
-    if g is not None:
-        cmd += f"-g {g} "  # ensures at least 1 keyframe every 10 frames
-    # cmd += "-keyint_min 10 " set a minimum of 10 frames between 2 key frames
-    # cmd += "-sc_threshold 0 " disable scene change detection to lower the number of key frames
-    if crf is not None:
-        cmd += f"-crf {crf} "
-    cmd += f"-pix_fmt {pix_fmt} "
-    cmd += f"{str(video_path)}"
-    subprocess.run(cmd.split(" "), check=True)
-
-    video_size_bytes = video_path.stat().st_size
-
-    # Set decoder
-
-    decoder = cfg["decoder"]
-    decoder_kwgs = cfg["decoder_kwgs"]
-    device = cfg["device"]
-
-    if decoder == "torchvision":
-        decode_frames_fn = decode_video_frames_torchvision
-    else:
-        raise ValueError(decoder)
-
-    # Estimate average loading time
-
-    def load_original_frames(imgs_dir, timestamps):
-        frames = []
-        for ts in timestamps:
-            idx = int(ts * fps)
-            frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
-            frame = torch.from_numpy(numpy.array(frame))
-            frame = frame.type(torch.float32) / 255
-            frame = einops.rearrange(frame, "h w c -> c h w")
-            frames.append(frame)
-        return frames
-
-    list_avg_load_time = []
-    list_avg_load_time_from_images = []
-    per_pixel_l2_errors = []
-
-    random.seed(seed)
-
-    for t in range(50):
-        # test loading 2 frames that are 4 frames appart, which might be a common setting
-        ts = random.randint(fps, ep_num_images - fps) / fps
-
-        if timestamps_mode == "1_frame":
-            timestamps = [ts]
-        elif timestamps_mode == "2_frames":
-            timestamps = [ts - 1 / fps, ts]
-        elif timestamps_mode == "2_frames_4_space":
-            timestamps = [ts - 4 / fps, ts]
-        elif timestamps_mode == "6_frames":
-            timestamps = [ts - i / fps for i in range(6)][::-1]
-        else:
-            raise ValueError(timestamps_mode)
-
-        num_frames = len(timestamps)
-
-        start_time_s = time.monotonic()
-        frames = decode_frames_fn(
-            video_path, timestamps=timestamps, tolerance_s=1e-4, device=device, **decoder_kwgs
-        )
-        avg_load_time = (time.monotonic() - start_time_s) / num_frames
-        list_avg_load_time.append(avg_load_time)
-
-        start_time_s = time.monotonic()
-        original_frames = load_original_frames(imgs_dir, timestamps)
-        avg_load_time_from_images = (time.monotonic() - start_time_s) / num_frames
-        list_avg_load_time_from_images.append(avg_load_time_from_images)
-
-        # Estimate average L2 error between original frames and decoded frames
-        for i, ts in enumerate(timestamps):
-            # are_close = torch.allclose(frames[i], original_frames[i], atol=0.02)
-            num_pixels = original_frames[i].numel()
-            per_pixel_l2_error = torch.norm(frames[i] - original_frames[i], p=2).item() / num_pixels
-
-            # save decoded frames
-            if t == 0:
-                frame_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy()
-                PIL.Image.fromarray(frame_hwc).save(output_dir / f"frame_{i:06d}.png")
-
-            # save original_frames
-            idx = int(ts * fps)
-            if t == 0:
-                original_frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
-                original_frame.save(output_dir / f"original_frame_{i:06d}.png")
-
-            per_pixel_l2_errors.append(per_pixel_l2_error)
-
-    avg_load_time = float(numpy.array(list_avg_load_time).mean())
-    avg_load_time_from_images = float(numpy.array(list_avg_load_time_from_images).mean())
-    avg_per_pixel_l2_error = float(numpy.array(per_pixel_l2_errors).mean())
-
-    # Save benchmark info
-
-    info = {
-        "sum_original_frames_size_bytes": sum_original_frames_size_bytes,
-        "video_size_bytes": video_size_bytes,
-        "avg_load_time_from_images": avg_load_time_from_images,
-        "avg_load_time": avg_load_time,
-        "compression_factor": sum_original_frames_size_bytes / video_size_bytes,
-        "load_time_factor": avg_load_time_from_images / avg_load_time,
-        "avg_per_pixel_l2_error": avg_per_pixel_l2_error,
-    }
-
-    with open(output_dir / "info.json", "w") as f:
-        json.dump(info, f)
-
-    return info
-
-
-def display_markdown_table(headers, rows):
-    for i, row in enumerate(rows):
-        new_row = []
-        for col in row:
-            if col is None:
-                new_col = "None"
-            elif isinstance(col, float):
-                new_col = f"{col:.3f}"
-                if new_col == "0.000":
-                    new_col = f"{col:.7f}"
-            elif isinstance(col, int):
-                new_col = f"{col}"
-            else:
-                new_col = col
-            new_row.append(new_col)
-        rows[i] = new_row
-
-    header_line = "| " + " | ".join(headers) + " |"
-    separator_line = "| " + " | ".join(["---" for _ in headers]) + " |"
-    body_lines = ["| " + " | ".join(row) + " |" for row in rows]
-    markdown_table = "\n".join([header_line, separator_line] + body_lines)
-    print(markdown_table)
-    print()
-
-
-def load_info(out_dir):
-    with open(out_dir / "info.json") as f:
-        info = json.load(f)
-    return info
-
-
-def main():
-    out_dir = Path("tmp/run_video_benchmark")
-    dry_run = False
-    repo_ids = ["lerobot/pusht", "lerobot/umi_cup_in_the_wild"]
-    timestamps_modes = [
-        "1_frame",
-        "2_frames",
-        "2_frames_4_space",
-        "6_frames",
-    ]
-    for timestamps_mode in timestamps_modes:
-        bench_dir = out_dir / timestamps_mode
-
-        print(f"### `{timestamps_mode}`")
-        print()
-
-        print("**`pix_fmt`**")
-        headers = ["repo_id", "pix_fmt", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            for pix_fmt in ["yuv420p", "yuv444p"]:
-                cfg = {
-                    "repo_id": repo_id,
-                    # video encoding
-                    "g": 2,
-                    "crf": None,
-                    "pix_fmt": pix_fmt,
-                    # video decoding
-                    "device": "cpu",
-                    "decoder": "torchvision",
-                    "decoder_kwgs": {},
-                }
-                if not dry_run:
-                    run_video_benchmark(bench_dir / repo_id / f"torchvision_{pix_fmt}", cfg, timestamps_mode)
-                info = load_info(bench_dir / repo_id / f"torchvision_{pix_fmt}")
-                rows.append(
-                    [
-                        repo_id,
-                        pix_fmt,
-                        info["compression_factor"],
-                        info["load_time_factor"],
-                        info["avg_per_pixel_l2_error"],
-                    ]
-                )
-        display_markdown_table(headers, rows)
-
-        print("**`g`**")
-        headers = ["repo_id", "g", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            for g in [1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None]:
-                cfg = {
-                    "repo_id": repo_id,
-                    # video encoding
-                    "g": g,
-                    "pix_fmt": "yuv444p",
-                    # video decoding
-                    "device": "cpu",
-                    "decoder": "torchvision",
-                    "decoder_kwgs": {},
-                }
-                if not dry_run:
-                    run_video_benchmark(bench_dir / repo_id / f"torchvision_g_{g}", cfg, timestamps_mode)
-                info = load_info(bench_dir / repo_id / f"torchvision_g_{g}")
-                rows.append(
-                    [
-                        repo_id,
-                        g,
-                        info["compression_factor"],
-                        info["load_time_factor"],
-                        info["avg_per_pixel_l2_error"],
-                    ]
-                )
-        display_markdown_table(headers, rows)
-
-        print("**`crf`**")
-        headers = ["repo_id", "crf", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            for crf in [0, 5, 10, 15, 20, None, 25, 30, 40, 50]:
-                cfg = {
-                    "repo_id": repo_id,
-                    # video encoding
-                    "g": 2,
-                    "crf": crf,
-                    "pix_fmt": "yuv444p",
-                    # video decoding
-                    "device": "cpu",
-                    "decoder": "torchvision",
-                    "decoder_kwgs": {},
-                }
-                if not dry_run:
-                    run_video_benchmark(bench_dir / repo_id / f"torchvision_crf_{crf}", cfg, timestamps_mode)
-                info = load_info(bench_dir / repo_id / f"torchvision_crf_{crf}")
-                rows.append(
-                    [
-                        repo_id,
-                        crf,
-                        info["compression_factor"],
-                        info["load_time_factor"],
-                        info["avg_per_pixel_l2_error"],
-                    ]
-                )
-        display_markdown_table(headers, rows)
-
-        print("**best**")
-        headers = ["repo_id", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            cfg = {
-                "repo_id": repo_id,
-                # video encoding
-                "g": 2,
-                "crf": None,
-                "pix_fmt": "yuv444p",
-                # video decoding
-                "device": "cpu",
-                "decoder": "torchvision",
-                "decoder_kwgs": {},
-            }
-            if not dry_run:
-                run_video_benchmark(bench_dir / repo_id / "torchvision_best", cfg, timestamps_mode)
-            info = load_info(bench_dir / repo_id / "torchvision_best")
-            rows.append(
-                [
-                    repo_id,
-                    info["compression_factor"],
-                    info["load_time_factor"],
-                    info["avg_per_pixel_l2_error"],
-                ]
-            )
-        display_markdown_table(headers, rows)
-
-
-if __name__ == "__main__":
-    main()
--- a/lerobot/common/datasets/backward_compatibility.py
+++ b/lerobot/common/datasets/backward_compatibility.py
@@ -0,0 +1,68 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import packaging.version
+
+# Message template for datasets still stored in the pre-v2.0 format. It carries the
+# conversion command to run. Placeholders filled with str.format: {repo_id}, {version}.
+V2_MESSAGE = """
+The dataset you requested ({repo_id}) is in {version} format.
+
+We introduced a new format since v2.0 which is not backward compatible with v1.x.
+Please, use our conversion script. Modify the following command with your own task description:
+```
+python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \\
+    --repo-id {repo_id} \\
+    --single-task "TASK DESCRIPTION."  # <---- /!\\ Replace TASK DESCRIPTION /!\\
+```
+
+A few examples to replace TASK DESCRIPTION: "Pick up the blue cube and place it into the bin.", "Insert the
+peg into the socket.", "Slide open the ziploc bag.", "Take the elevator to the 1st floor.", "Open the top
+cabinet, store the pot inside it then close the cabinet.", "Push the T-shaped block onto the T-shaped
+target.", "Grab the spray paint on the shelf and place it in the bin on top of the robot dog.", "Fold the
+sweatshirt.", ...
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+"""
+
+# Message template for datasets that still use global stats instead of per-episode stats.
+# Placeholders: {repo_id}, {version}. Not referenced by the exception classes of this module.
+V21_MESSAGE = """
+The dataset you requested ({repo_id}) is in {version} format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id={repo_id}
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+"""
+
+# Message template for datasets only available in a format newer than this code handles.
+# Placeholders filled with str.format: {repo_id}, {version}.
+FUTURE_MESSAGE = """
+The dataset you requested ({repo_id}) is only available in {version} format.
+As we cannot ensure forward compatibility with it, please update your current version of lerobot.
+"""
+
+
+class CompatibilityError(Exception):
+    """Common base class for the dataset-format compatibility errors below."""
+
+
+class BackwardCompatibilityError(CompatibilityError):
+    """Error whose message is ``V2_MESSAGE`` filled in with the dataset id and version."""
+
+    def __init__(self, repo_id: str, version: packaging.version.Version):
+        super().__init__(V2_MESSAGE.format(repo_id=repo_id, version=version))
+
+
+class ForwardCompatibilityError(CompatibilityError):
+    """Error whose message is ``FUTURE_MESSAGE`` filled in with the dataset id and version."""
+
+    def __init__(self, repo_id: str, version: packaging.version.Version):
+        super().__init__(FUTURE_MESSAGE.format(repo_id=repo_id, version=version))
--- a/lerobot/common/datasets/card_template.md
+++ b/lerobot/common/datasets/card_template.md
@@ -0,0 +1,27 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+{{ card_data }}
+---
+
+This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).
+
+## Dataset Description
+
+{{ dataset_description | default("", true) }}
+
+- **Homepage:** {{ url | default("[More Information Needed]", true)}}
+- **Paper:** {{ paper | default("[More Information Needed]", true)}}
+- **License:** {{ license | default("[More Information Needed]", true)}}
+
+## Dataset Structure
+
+{{ dataset_structure | default("[More Information Needed]", true)}}
+
+## Citation
+
+**BibTeX:**
+
+```bibtex
+{{ citation_bibtex | default("[More Information Needed]", true)}}
+```
--- a/lerobot/common/datasets/compute_stats.py
+++ b/lerobot/common/datasets/compute_stats.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+from lerobot.common.datasets.utils import load_image_as_numpy
+
+
+def estimate_num_samples(
+    dataset_len: int, min_num_samples: int = 100, max_num_samples: int = 10_000, power: float = 0.75
+) -> int:
+    """Heuristically choose how many samples to draw from a dataset of ``dataset_len`` items.
+
+    The count grows like ``dataset_len**power`` (truncated to an int), is capped at
+    ``max_num_samples`` and floored at ``min_num_samples``. The floor is itself limited to
+    ``dataset_len``, so a dataset smaller than the floor is sampled entirely.
+    Use a lower ``power`` to get fewer samples.
+
+    With the default arguments:
+    - below 100, num_samples=dataset_len
+    - from 100 to ~460, num_samples=100
+    - at 1000, num_samples=177
+    - at 2000, num_samples=299
+    - at 5000, num_samples=594
+    - at 10000, num_samples=1000
+    - at 20000, num_samples=1681
+    """
+    floor = min(min_num_samples, dataset_len)
+    scaled = int(dataset_len**power)
+    capped = min(scaled, max_num_samples)
+    return max(floor, capped)
+
+
+def sample_indices(data_len: int) -> list[int]:
+    """Return evenly spaced integer indices spanning ``0`` to ``data_len - 1``.
+
+    The number of indices is given by ``estimate_num_samples(data_len)``.
+    """
+    count = estimate_num_samples(data_len)
+    evenly_spaced = np.linspace(0, data_len - 1, count)
+    return np.round(evenly_spaced).astype(int).tolist()
+
+
+def auto_downsample_height_width(img: np.ndarray, target_size: int = 150, max_size_threshold: int = 300):
+    """Subsample the two trailing axes of a 3-D image array by a common integer stride.
+
+    An image whose larger trailing dimension is below ``max_size_threshold`` is returned
+    unchanged. Otherwise the stride is that larger dimension divided by ``target_size``,
+    truncated to an int, and every ``stride``-th row and column is kept.
+    """
+    _, height, width = img.shape
+    longest_side = max(width, height)
+
+    if longest_side < max_size_threshold:
+        # small enough already
+        return img
+
+    stride = int(longest_side / target_size)
+    return img[:, ::stride, ::stride]
+
+
+def sample_images(image_paths: list[str]) -> np.ndarray:
+    """Load a subsample of the images at ``image_paths`` into a single uint8 array.
+
+    The indices come from ``sample_indices``; each image is loaded channel-first and passed
+    through ``auto_downsample_height_width``. The result has one leading entry per sampled
+    image. When no index is sampled the function returns ``None``.
+    """
+    chosen = sample_indices(len(image_paths))
+
+    batch = None
+    for slot, path_idx in enumerate(chosen):
+        # uint8 keeps the memory footprint low
+        frame = load_image_as_numpy(image_paths[path_idx], dtype=np.uint8, channel_first=True)
+        frame = auto_downsample_height_width(frame)
+
+        if batch is None:
+            # Allocated lazily: the per-image shape is only known once the first image is loaded.
+            batch = np.empty((len(chosen), *frame.shape), dtype=np.uint8)
+
+        batch[slot] = frame
+
+    return batch
+
+
+def get_feature_stats(array: np.ndarray, axis: tuple, keepdims: bool) -> dict[str, np.ndarray]:
+    """Compute min, max, mean and std of ``array`` over ``axis``, plus its length.
+
+    ``axis`` and ``keepdims`` are forwarded to the numpy reductions. ``count`` is
+    ``len(array)`` wrapped in a one-element array.
+    """
+    reducers = {"min": np.min, "max": np.max, "mean": np.mean, "std": np.std}
+    stats = {name: reduce(array, axis=axis, keepdims=keepdims) for name, reduce in reducers.items()}
+    stats["count"] = np.array([len(array)])
+    return stats
+
+
+def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], features: dict) -> dict:
+    """Compute per-feature statistics for one episode.
+
+    Features whose dtype is "string" are skipped. For "image" and "video" features the data
+    is a list of image paths: a subsample is loaded, reduced over every axis except the
+    channel axis, divided by 255 and stripped of its leading axis ("count" is left as is).
+    Every other feature is expected to be a numpy array and is reduced over its first axis.
+    """
+    visual_dtypes = ["image", "video"]
+    ep_stats = {}
+    for key, data in episode_data.items():
+        dtype = features[key]["dtype"]
+        if dtype == "string":
+            continue  # HACK: we should receive np.arrays of strings
+
+        if dtype in visual_dtypes:
+            sampled = sample_images(data)
+            # axes (0, 2, 3): reduce over samples, height and width; keep the channel dim
+            raw_stats = get_feature_stats(sampled, axis=(0, 2, 3), keepdims=True)
+            # normalize and remove the batch dim
+            ep_stats[key] = {
+                name: stat if name == "count" else np.squeeze(stat / 255.0, axis=0)
+                for name, stat in raw_stats.items()
+            }
+        else:
+            # 1-D data keeps its dims so the resulting stats stay arrays
+            ep_stats[key] = get_feature_stats(data, axis=0, keepdims=data.ndim == 1)
+
+    return ep_stats
+
+
+def _assert_type_and_shape(stats_list: list[dict[str, dict]]):
+    """Validate the structure of every stats dict in ``stats_list``.
+
+    Each entry maps a feature key to a dict of named statistics.
+
+    Raises:
+        ValueError: if a statistic is not a numpy array, is 0-dimensional, is a 'count'
+            whose shape is not (1,), or belongs to a feature whose key contains "image"
+            and has a shape other than (3, 1, 1) ('count' excepted).
+    """
+    for stats in stats_list:
+        for fkey, feature_stats in stats.items():
+            for k, v in feature_stats.items():
+                if not isinstance(v, np.ndarray):
+                    raise ValueError(
+                        f"Stats must be composed of numpy array, but key '{k}' of feature '{fkey}' is of type '{type(v)}' instead."
+                    )
+                if v.ndim == 0:
+                    raise ValueError("Number of dimensions must be at least 1, and is 0 instead.")
+                if k == "count" and v.shape != (1,):
+                    raise ValueError(f"Shape of 'count' must be (1), but is {v.shape} instead.")
+                # Image features are recognised by their key only.
+                if "image" in fkey and k != "count" and v.shape != (3, 1, 1):
+                    raise ValueError(f"Shape of '{k}' must be (3,1,1), but is {v.shape} instead.")
+
+
+def aggregate_feature_stats(stats_ft_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
+    """Merge several stats dicts of one feature into a single stats dict.
+
+    Min and max are taken element-wise, counts are summed, the mean is the count-weighted
+    mean and the std is rebuilt from the count-weighted variances.
+    """
+
+    def stacked(name: str) -> np.ndarray:
+        return np.stack([s[name] for s in stats_ft_list])
+
+    means = stacked("mean")
+    variances = stacked("std") ** 2
+    counts = stacked("count")
+    total_count = counts.sum(axis=0)
+
+    # Append trailing singleton axes so that counts broadcast against the stacked means.
+    missing_axes = means.ndim - counts.ndim
+    if missing_axes > 0:
+        counts = counts.reshape(counts.shape + (1,) * missing_axes)
+
+    total_mean = (means * counts).sum(axis=0) / total_count
+
+    # Parallel algorithm: each part contributes its own variance plus the squared offset
+    # of its mean from the overall mean, weighted by its count.
+    spread = variances + (means - total_mean) ** 2
+    total_variance = (spread * counts).sum(axis=0) / total_count
+
+    return {
+        "min": np.min(stacked("min"), axis=0),
+        "max": np.max(stacked("max"), axis=0),
+        "mean": total_mean,
+        "std": np.sqrt(total_variance),
+        "count": total_count,
+    }
+
+
+def aggregate_stats(stats_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
+    """Merge several stats dicts into one covering the union of their feature keys.
+
+    The input is validated with ``_assert_type_and_shape`` first. Each feature is then
+    aggregated over the stats dicts that contain it:
+    - new_min = min(min_dataset_0, min_dataset_1, ...)
+    - new_max = max(max_dataset_0, max_dataset_1, ...)
+    - new_mean = (mean of all data, weighted by counts)
+    - new_std = (std of all data)
+    """
+    _assert_type_and_shape(stats_list)
+
+    data_keys = {key for stats in stats_list for key in stats}
+    return {
+        key: aggregate_feature_stats([stats[key] for stats in stats_list if key in stats])
+        for key in data_keys
+    }
--- a/lerobot/common/datasets/factory.py
+++ b/lerobot/common/datasets/factory.py
@@ -14,52 +14,105 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+from pprint import pformat

 import torch
-from omegaconf import OmegaConf

-from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.lerobot_dataset import (
+    LeRobotDataset,
+    LeRobotDatasetMetadata,
+    MultiLeRobotDataset,
+)
+from lerobot.common.datasets.transforms import ImageTransforms
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.train import TrainPipelineConfig
+
+IMAGENET_STATS = {
+    "mean": [[[0.485]], [[0.456]], [[0.406]]],  # (c,1,1)
+    "std": [[[0.229]], [[0.224]], [[0.225]]],  # (c,1,1)
+}


-def resolve_delta_timestamps(cfg):
-    """Resolves delta_timestamps config key (in-place) by using `eval`.
+def resolve_delta_timestamps(
+    cfg: PreTrainedConfig, ds_meta: LeRobotDatasetMetadata
+) -> dict[str, list] | None:
+    """Resolves delta_timestamps by reading from the 'delta_indices' properties of the PreTrainedConfig.

-    Doesn't do anything if delta_timestamps is not specified or has already been resolve (as evidenced by
-    the data type of its values).
+    Args:
+        cfg (PreTrainedConfig): The PreTrainedConfig to read delta_indices from.
+        ds_meta (LeRobotDatasetMetadata): The dataset from which features and fps are used to build
+            delta_timestamps against.
+
+    Returns:
+        dict[str, list] | None: A dictionary of delta_timestamps, e.g.:
+            {
+                "observation.state": [-0.04, -0.02, 0]
+                "observation.action": [-0.02, 0, 0.02]
+            }
+            returns `None` if the resulting dict is empty.
    """
-    delta_timestamps = cfg.training.get("delta_timestamps")
-    if delta_timestamps is not None:
-        for key in delta_timestamps:
-            if isinstance(delta_timestamps[key], str):
-                # TODO(rcadene, alexander-soare): remove `eval` to avoid exploit
-                cfg.training.delta_timestamps[key] = eval(delta_timestamps[key])
+    delta_timestamps = {}
+    for key in ds_meta.features:
+        if key == "next.reward" and cfg.reward_delta_indices is not None:
+            delta_timestamps[key] = [i / ds_meta.fps for i in cfg.reward_delta_indices]
+        if key == "action" and cfg.action_delta_indices is not None:
+            delta_timestamps[key] = [i / ds_meta.fps for i in cfg.action_delta_indices]
+        if key.startswith("observation.") and cfg.observation_delta_indices is not None:
+            delta_timestamps[key] = [i / ds_meta.fps for i in cfg.observation_delta_indices]
+
+    if len(delta_timestamps) == 0:
+        delta_timestamps = None
+
+    return delta_timestamps


-def make_dataset(
-    cfg,
-    split="train",
-):
-    if cfg.env.name not in cfg.dataset_repo_id:
-        logging.warning(
-            f"There might be a mismatch between your training dataset ({cfg.dataset_repo_id=}) and your "
-            f"environment ({cfg.env.name=})."
-        )
+def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDataset:
+    """Handles the logic of setting up delta timestamps and image transforms before creating a dataset.

-    resolve_delta_timestamps(cfg)
+    Args:
+        cfg (TrainPipelineConfig): A TrainPipelineConfig config which contains a DatasetConfig and a PreTrainedConfig.

-    # TODO(rcadene): add data augmentations
+    Raises:
+        NotImplementedError: The MultiLeRobotDataset is currently deactivated.

-    dataset = LeRobotDataset(
-        cfg.dataset_repo_id,
-        split=split,
-        delta_timestamps=cfg.training.get("delta_timestamps"),
+    Returns:
+        LeRobotDataset | MultiLeRobotDataset
+    """
+    image_transforms = (
+        ImageTransforms(cfg.dataset.image_transforms) if cfg.dataset.image_transforms.enable else None
    )

-    if cfg.get("override_dataset_stats"):
-        for key, stats_dict in cfg.override_dataset_stats.items():
-            for stats_type, listconfig in stats_dict.items():
-                # example of stats_type: min, max, mean, std
-                stats = OmegaConf.to_container(listconfig, resolve=True)
-                dataset.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32)
+    if isinstance(cfg.dataset.repo_id, str):
+        ds_meta = LeRobotDatasetMetadata(
+            cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
+        )
+        delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
+        dataset = LeRobotDataset(
+            cfg.dataset.repo_id,
+            root=cfg.dataset.root,
+            episodes=cfg.dataset.episodes,
+            delta_timestamps=delta_timestamps,
+            image_transforms=image_transforms,
+            revision=cfg.dataset.revision,
+            video_backend=cfg.dataset.video_backend,
+        )
+    else:
+        raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
+        dataset = MultiLeRobotDataset(
+            cfg.dataset.repo_id,
+            # TODO(aliberts): add proper support for multi dataset
+            # delta_timestamps=delta_timestamps,
+            image_transforms=image_transforms,
+            video_backend=cfg.dataset.video_backend,
+        )
+        logging.info(
+            "Multiple datasets were provided. Applied the following index mapping to the provided datasets: "
+            f"{pformat(dataset.repo_id_to_index, indent=2)}"
+        )
+
+    if cfg.dataset.use_imagenet_stats:
+        for key in dataset.meta.camera_keys:
+            for stats_type, stats in IMAGENET_STATS.items():
+                dataset.meta.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32)

    return dataset
--- a/lerobot/common/datasets/image_writer.py
+++ b/lerobot/common/datasets/image_writer.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import multiprocessing
+import queue
+import threading
+from pathlib import Path
+
+import numpy as np
+import PIL.Image
+import torch
+
+
+def safe_stop_image_writer(func):
+    """Decorator that stops the dataset's image writer when ``func`` raises.
+
+    The dataset is looked up in the keyword arguments only (``dataset=...``). If it has a
+    non-None ``image_writer`` attribute, that writer's ``stop()`` is called before the
+    original exception is re-raised.
+    """
+    import functools  # local import so the module-level import block stays untouched
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception:
+            # getattr with a default also covers a missing (None) dataset. A truthiness
+            # test is avoided on purpose: a dataset defining __len__ is falsy while empty,
+            # and its writer must still be stopped.
+            image_writer = getattr(kwargs.get("dataset"), "image_writer", None)
+            if image_writer is not None:
+                print("Waiting for image writer to terminate...")
+                image_writer.stop()
+            raise
+
+    return wrapper
+
+
+def image_array_to_pil_image(image_array: np.ndarray, range_check: bool = True) -> PIL.Image.Image:
+    """Convert a 3-channel image array into a PIL image.
+
+    The array must be 3-dimensional. A leading dimension of size 3 is treated as
+    channel-first and transposed to channel-last; otherwise the last dimension must be 3.
+    Arrays that are not uint8 are multiplied by 255 and cast to uint8; with ``range_check``
+    enabled their values must lie in [0.0, 1.0].
+
+    Raises:
+        ValueError: if the array is not 3-dimensional, or fails the range check.
+        NotImplementedError: if neither the first nor the last dimension has size 3.
+    """
+    # TODO(aliberts): handle 1 channel and 4 for depth images
+    if image_array.ndim != 3:
+        raise ValueError(f"The array has {image_array.ndim} dimensions, but 3 is expected for an image.")
+
+    channel_first = image_array.shape[0] == 3
+    if channel_first:
+        # pytorch convention (C, H, W) -> (H, W, C)
+        image_array = image_array.transpose(1, 2, 0)
+    elif image_array.shape[-1] != 3:
+        raise NotImplementedError(
+            f"The image has {image_array.shape[-1]} channels, but 3 is required for now."
+        )
+
+    if image_array.dtype == np.uint8:
+        return PIL.Image.fromarray(image_array)
+
+    if range_check:
+        max_ = image_array.max().item()
+        min_ = image_array.min().item()
+        out_of_range = max_ > 1.0 or min_ < 0.0
+        if out_of_range:
+            raise ValueError(
+                "The image data type is float, which requires values in the range [0.0, 1.0]. "
+                f"However, the provided range is [{min_}, {max_}]. Please adjust the range or "
+                "provide a uint8 image with values in the range [0, 255]."
+            )
+
+    return PIL.Image.fromarray((image_array * 255).astype(np.uint8))
+
+
+def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path):
+    """Save ``image`` to ``fpath``, converting numpy arrays to PIL images first.
+
+    Best effort: any failure, including an unsupported image type, is printed
+    rather than raised.
+    """
+    try:
+        if isinstance(image, PIL.Image.Image):
+            pil_image = image
+        elif isinstance(image, np.ndarray):
+            pil_image = image_array_to_pil_image(image)
+        else:
+            raise TypeError(f"Unsupported image type: {type(image)}")
+        pil_image.save(fpath)
+    except Exception as e:
+        print(f"Error writing image {fpath}: {e}")
+
+
+def worker_thread_loop(queue: queue.Queue):
+    """Write ``(image, path)`` items taken from ``queue`` until a ``None`` sentinel arrives.
+
+    Every item taken, the sentinel included, is acknowledged with ``task_done()``.
+    """
+    while (item := queue.get()) is not None:
+        image_array, fpath = item
+        write_image(image_array, fpath)
+        queue.task_done()
+
+    # The sentinel was taken from the queue as well, so it needs its own acknowledgement.
+    queue.task_done()
+
+
+def worker_process(queue: queue.Queue, num_threads: int):
+    """Run ``num_threads`` daemon threads executing ``worker_thread_loop`` on ``queue``.
+
+    Blocks until every thread has returned.
+    """
+    workers = [
+        threading.Thread(target=worker_thread_loop, args=(queue,), daemon=True)
+        for _ in range(num_threads)
+    ]
+    for worker in workers:
+        worker.start()
+    for worker in workers:
+        worker.join()
+
+
+class AsyncImageWriter:
+    """
+    This class abstracts away the initialisation of processes and/or threads to
+    save images on disk asynchronously, which is critical to control a robot and record data
+    at a high frame rate.
+
+    When `num_processes=0`, it creates a threads pool of size `num_threads`.
+    When `num_processes>0`, it creates processes pool of size `num_processes`, where each subprocess starts
+    their own threads pool of size `num_threads`.
+
+    The optimal number of processes and threads depends on your computer capabilities.
+    We advise to use 4 threads per camera with 0 processes. If the fps is not stable, try to increase or lower
+    the number of threads. If it is still not stable, try to use 1 subprocess, or more.
+    """
+
+    def __init__(self, num_processes: int = 0, num_threads: int = 1):
+        """Start the worker threads or processes.
+
+        Args:
+            num_processes: Number of subprocesses; 0 means threads of the current process only.
+            num_threads: Number of writer threads (per subprocess when `num_processes>0`).
+
+        Raises:
+            ValueError: If `num_threads` is not positive or `num_processes` is negative.
+        """
+        # Validate before spawning anything. Threads are the only consumers of the queue in
+        # both modes, so without at least one of them queued images are never written and
+        # `wait_until_done` blocks forever.
+        if num_threads <= 0 or num_processes < 0:
+            raise ValueError(
+                "Number of threads must be greater than zero and number of processes must not be negative."
+            )
+
+        self.num_processes = num_processes
+        self.num_threads = num_threads
+        self.queue = None
+        self.threads = []
+        self.processes = []
+        self._stopped = False
+
+        if self.num_processes == 0:
+            # Use threading
+            self.queue = queue.Queue()
+            for _ in range(self.num_threads):
+                t = threading.Thread(target=worker_thread_loop, args=(self.queue,))
+                t.daemon = True
+                t.start()
+                self.threads.append(t)
+        else:
+            # Use multiprocessing
+            self.queue = multiprocessing.JoinableQueue()
+            for _ in range(self.num_processes):
+                p = multiprocessing.Process(target=worker_process, args=(self.queue, self.num_threads))
+                p.daemon = True
+                p.start()
+                self.processes.append(p)
+
+    def save_image(self, image: torch.Tensor | np.ndarray | PIL.Image.Image, fpath: Path):
+        """Queue `image` to be written to `fpath` by a worker."""
+        if isinstance(image, torch.Tensor):
+            # Convert tensor to numpy array to minimize main process time
+            image = image.cpu().numpy()
+        self.queue.put((image, fpath))
+
+    def wait_until_done(self):
+        """Block until every queued item has been acknowledged by a worker."""
+        self.queue.join()
+
+    def stop(self):
+        """Send one stop sentinel per worker thread and wait for the workers to exit.
+
+        Calling it again after a successful stop does nothing.
+        """
+        if self._stopped:
+            return
+
+        if self.num_processes == 0:
+            for _ in self.threads:
+                self.queue.put(None)
+            for t in self.threads:
+                t.join()
+        else:
+            # Each subprocess runs `num_threads` loops and each loop consumes one sentinel.
+            num_nones = self.num_processes * self.num_threads
+            for _ in range(num_nones):
+                self.queue.put(None)
+            for p in self.processes:
+                p.join()
+                if p.is_alive():
+                    p.terminate()
+            self.queue.close()
+            self.queue.join_thread()
+
+        self._stopped = True
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
--- a/lerobot/common/datasets/online_buffer.py
+++ b/lerobot/common/datasets/online_buffer.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""An online buffer for the online training loop in train.py
+
+Note to maintainers: This duplicates some logic from LeRobotDataset and EpisodeAwareSampler. We should
+consider converging to one approach. Here we have opted to use numpy.memmap to back the data buffer. It's much
+faster than using HuggingFace Datasets as there's no conversion to an intermediate non-python object. Also it
+supports in-place slicing and mutation which is very handy for a dynamic buffer.
+"""
+
+import os
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+
+
+def _make_memmap_safe(**kwargs) -> np.memmap:
+    """Make a numpy memmap with checks on available disk space first.
+
+    Expected kwargs are: "filename", "dtype" (must be np.dtype), "mode" and "shape"
+
+    The disk space check only runs for modes starting with "w". It refuses to create the
+    file when it would take 80% or more of the space available next to "filename".
+
+    For information on dtypes:
+    https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes-constructing
+
+    Raises:
+        RuntimeError: If the required space is at least 80% of the available space.
+    """
+    if kwargs["mode"].startswith("w"):
+        import shutil  # local import so the module-level import block stays untouched
+
+        required_space = kwargs["dtype"].itemsize * np.prod(kwargs["shape"])  # bytes
+        # shutil.disk_usage works on every platform, whereas os.statvfs is Unix only.
+        available_space = shutil.disk_usage(Path(kwargs["filename"]).parent).free  # bytes
+        if required_space >= available_space * 0.8:
+            raise RuntimeError(
+                f"You're about to take up {required_space} of {available_space} bytes available."
+            )
+    return np.memmap(**kwargs)
+
+
+class OnlineBuffer(torch.utils.data.Dataset):
+    """FIFO data buffer for the online training loop in train.py.
+
+    Follows the protocol of LeRobotDataset as much as is required to have it be used by the online training
+    loop in the same way that a LeRobotDataset would be used.
+
+    The underlying data structure will have data inserted in a circular fashion. Always insert after the
+    last index, and when you reach the end, wrap around to the start.
+
+    The data is stored in a numpy memmap.
+    """
+
+    NEXT_INDEX_KEY = "_next_index"
+    OCCUPANCY_MASK_KEY = "_occupancy_mask"
+    INDEX_KEY = "index"
+    FRAME_INDEX_KEY = "frame_index"
+    EPISODE_INDEX_KEY = "episode_index"
+    TIMESTAMP_KEY = "timestamp"
+    IS_PAD_POSTFIX = "_is_pad"
+
+    def __init__(
+        self,
+        write_dir: str | Path,
+        data_spec: dict[str, Any] | None,
+        buffer_capacity: int | None,
+        fps: float | None = None,
+        delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None,
+    ):
+        """
+        Create a new online buffer, or reopen an existing one by pointing `write_dir` at the
+        directory of a previously created buffer.
+
+        Args:
+            write_dir: Directory holding the numpy memmap files, one file per data key. Files that
+                already exist are opened in read-write mode (used for training resumption); missing
+                ones are created.
+            data_spec: A mapping from data key to data specification, like {data_key: {"shape": tuple[int],
+                "dtype": np.dtype}}. It should list all the data you wish to record into the buffer,
+                except "index", "frame_index" and "episode_index", which this class handles itself.
+            buffer_capacity: Maximum number of frames stored in the buffer. Be aware of your system's
+                available disk space when choosing this.
+            fps: Same as the fps concept in LeRobot dataset. It is needed for the delta_timestamps
+                logic; pass None if you are not using delta_timestamps.
+            delta_timestamps: Same as the delta_timestamps concept in LeRobotDataset. This is internally
+                converted to dict[str, np.ndarray] for optimization purposes.
+
+        """
+        self.set_delta_timestamps(delta_timestamps)
+        self._fps = fps
+        # Tolerance in seconds used to discard loaded frames when their timestamps are not close enough from
+        # the requested frames. It is only used when `delta_timestamps` is provided.
+        if fps is not None:
+            # minus 1e-4 to account for possible numerical error
+            self.tolerance_s = 1 / self.fps - 1e-4
+        else:
+            self.tolerance_s = None
+        self._buffer_capacity = buffer_capacity
+        data_spec = self._make_data_spec(data_spec, buffer_capacity)
+        write_dir = Path(write_dir)
+        write_dir.mkdir(parents=True, exist_ok=True)
+        self._data = {}
+        for key, spec in data_spec.items():
+            memmap_path = write_dir / key
+            self._data[key] = _make_memmap_safe(
+                filename=memmap_path,
+                dtype=spec["dtype"] if spec is not None else None,
+                # reopen an existing file in place, otherwise create it
+                mode="r+" if memmap_path.exists() else "w+",
+                shape=tuple(spec["shape"]) if spec is not None else None,
+            )
+
+    @property
+    def delta_timestamps(self) -> dict[str, np.ndarray] | None:
+        """The delta timestamps as stored by `set_delta_timestamps` (numpy arrays per key, or None)."""
+        return self._delta_timestamps
+
+    def set_delta_timestamps(self, value: dict[str, list[float]] | None):
+        """Store delta_timestamps with every value converted to a numpy array.
+
+        Converting once here is an optimization for __getitem__, whose loop is much slower
+        when the conversion has to happen there. Passing None stores None.
+        """
+        if value is None:
+            self._delta_timestamps = None
+            return
+        self._delta_timestamps = {key: np.array(deltas) for key, deltas in value.items()}
+
+    def _make_data_spec(self, data_spec: dict[str, Any], buffer_capacity: int) -> dict[str, dict[str, Any]]:
+        """Build the full np.memmap spec: the internal bookkeeping entries plus the user's keys.
+
+        Every user entry gets `buffer_capacity` prepended to its shape.
+
+        Raises:
+            ValueError: If a user key starts with '_' or is one of the keys handled internally.
+        """
+        for user_key in data_spec:
+            if user_key.startswith("_"):
+                raise ValueError(
+                    "data_spec keys should not start with '_'. This prefix is reserved for internal logic."
+                )
+        preset_keys = {
+            OnlineBuffer.INDEX_KEY,
+            OnlineBuffer.FRAME_INDEX_KEY,
+            OnlineBuffer.EPISODE_INDEX_KEY,
+            OnlineBuffer.TIMESTAMP_KEY,
+        }
+        intersection = set(data_spec).intersection(preset_keys)
+        if intersection:
+            raise ValueError(
+                f"data_spec should not contain any of {preset_keys} as these are handled internally. "
+                f"The provided data_spec has {intersection}."
+            )
+
+        per_frame = (buffer_capacity,)
+        int64 = np.dtype("int64")
+        complete_data_spec = {
+            # Pointer to the next index to start filling from when more data is added.
+            OnlineBuffer.NEXT_INDEX_KEY: {"dtype": int64, "shape": ()},
+            # The memmap starts out all-zeros, so this mask records which indices hold real data
+            # rather than the dummy initialization.
+            OnlineBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": per_frame},
+            OnlineBuffer.INDEX_KEY: {"dtype": int64, "shape": per_frame},
+            OnlineBuffer.FRAME_INDEX_KEY: {"dtype": int64, "shape": per_frame},
+            OnlineBuffer.EPISODE_INDEX_KEY: {"dtype": int64, "shape": per_frame},
+            OnlineBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": per_frame},
+        }
+        complete_data_spec.update(
+            (key, {"dtype": spec["dtype"], "shape": (buffer_capacity, *spec["shape"])})
+            for key, spec in data_spec.items()
+        )
+        return complete_data_spec
+
+    def add_data(self, data: dict[str, np.ndarray]):
+        """Add new data to the buffer, which could potentially mean shifting old data out.
+
+        The new data should contain all the frames (in order) of any number of episodes. The indices should
+        start from 0 (note to the developer: this can easily be generalized). See the `rollout` and
+        `eval_policy` functions in `eval.py` for more information on how the data is constructed.
+
+        Shift the incoming data index and episode_index to continue on from the last frame. Note that this
+        will be done in place!
+        """
+        if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0:
+            raise ValueError(f"Missing data keys: {missing_keys}")
+        new_data_length = len(data[self.data_keys[0]])
+        if not all(len(data[k]) == new_data_length for k in self.data_keys):
+            raise ValueError("All data items should have the same length")
+
+        next_index = self._data[OnlineBuffer.NEXT_INDEX_KEY]
+
+        # Sanity check to make sure that the new data indices start from 0.
+        assert data[OnlineBuffer.EPISODE_INDEX_KEY][0].item() == 0
+        assert data[OnlineBuffer.INDEX_KEY][0].item() == 0
+
+        # Shift the incoming indices if necessary.
+        if self.num_frames > 0:
+            last_episode_index = self._data[OnlineBuffer.EPISODE_INDEX_KEY][next_index - 1]
+            last_data_index = self._data[OnlineBuffer.INDEX_KEY][next_index - 1]
+            data[OnlineBuffer.EPISODE_INDEX_KEY] += last_episode_index + 1
+            data[OnlineBuffer.INDEX_KEY] += last_data_index + 1
+
+        # Insert the new data starting from next_index. It may be necessary to wrap around to the start.
+        n_surplus = max(0, new_data_length - (self._buffer_capacity - next_index))
+        for k in self.data_keys:
+            if n_surplus == 0:
+                slc = slice(next_index, next_index + new_data_length)
+                self._data[k][slc] = data[k]
+                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][slc] = True
+            else:
+                self._data[k][next_index:] = data[k][:-n_surplus]
+                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][next_index:] = True
+                self._data[k][:n_surplus] = data[k][-n_surplus:]
+        if n_surplus == 0:
+            self._data[OnlineBuffer.NEXT_INDEX_KEY] = next_index + new_data_length
+        else:
+            self._data[OnlineBuffer.NEXT_INDEX_KEY] = n_surplus
+
+    @property
+    def data_keys(self) -> list[str]:
+        keys = set(self._data)
+        keys.remove(OnlineBuffer.OCCUPANCY_MASK_KEY)
+        keys.remove(OnlineBuffer.NEXT_INDEX_KEY)
+        return sorted(keys)
+
+    @property
+    def fps(self) -> float | None:
+        return self._fps
+
+    @property
+    def num_episodes(self) -> int:
+        return len(
+            np.unique(self._data[OnlineBuffer.EPISODE_INDEX_KEY][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
+        )
+
+    @property
+    def num_frames(self) -> int:
+        return np.count_nonzero(self._data[OnlineBuffer.OCCUPANCY_MASK_KEY])
+
+    def __len__(self):
+        return self.num_frames
+
+    def _item_to_tensors(self, item: dict) -> dict:
+        item_ = {}
+        for k, v in item.items():
+            if isinstance(v, torch.Tensor):
+                item_[k] = v
+            elif isinstance(v, np.ndarray):
+                item_[k] = torch.from_numpy(v)
+            else:
+                item_[k] = torch.tensor(v)
+        return item_
+
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        if idx >= len(self) or idx < -len(self):
+            raise IndexError
+
+        item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")}
+
+        if self.delta_timestamps is None:
+            return self._item_to_tensors(item)
+
+        episode_index = item[OnlineBuffer.EPISODE_INDEX_KEY]
+        current_ts = item[OnlineBuffer.TIMESTAMP_KEY]
+        episode_data_indices = np.where(
+            np.bitwise_and(
+                self._data[OnlineBuffer.EPISODE_INDEX_KEY] == episode_index,
+                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY],
+            )
+        )[0]
+        episode_timestamps = self._data[OnlineBuffer.TIMESTAMP_KEY][episode_data_indices]
+
+        for data_key in self.delta_timestamps:
+            # Note: The logic in this loop is copied from `load_previous_and_future_frames`.
+            # Get timestamps used as query to retrieve data of previous/future frames.
+            query_ts = current_ts + self.delta_timestamps[data_key]
+
+            # Compute distances between each query timestamp and all timestamps of all the frames belonging to
+            # the episode.
+            dist = np.abs(query_ts[:, None] - episode_timestamps[None, :])
+            argmin_ = np.argmin(dist, axis=1)
+            min_ = dist[np.arange(dist.shape[0]), argmin_]
+
+            is_pad = min_ > self.tolerance_s
+
+            # Check violated query timestamps are all outside the episode range.
+            assert (
+                (query_ts[is_pad] < episode_timestamps[0]) | (episode_timestamps[-1] < query_ts[is_pad])
+            ).all(), (
+                f"One or several timestamps unexpectedly violate the tolerance ({min_} > {self.tolerance_s=}"
+                ") inside the episode range."
+            )
+
+            # Load frames for this data key.
+            item[data_key] = self._data[data_key][episode_data_indices[argmin_]]
+
+            item[f"{data_key}{OnlineBuffer.IS_PAD_POSTFIX}"] = is_pad
+
+        return self._item_to_tensors(item)
+
+    def get_data_by_key(self, key: str) -> torch.Tensor:
+        """Returns all data for a given data key as a Tensor."""
+        return torch.from_numpy(self._data[key][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
+
+
+def compute_sampler_weights(
+    offline_dataset: LeRobotDataset,
+    offline_drop_n_last_frames: int = 0,
+    online_dataset: OnlineBuffer | None = None,
+    online_sampling_ratio: float | None = None,
+    online_drop_n_last_frames: int = 0,
+) -> torch.Tensor:
+    """Compute the sampling weights for the online training dataloader in train.py.
+
+    Args:
+        offline_dataset: The LeRobotDataset used for offline pre-training.
+        offline_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode.
+        online_dataset: The OnlineBuffer used in online training.
+        online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an
+            online dataset is provided, this value must also be provided.
+        online_drop_n_last_frames: See `offline_drop_n_last_frames`. This is the same, but for the online
+            dataset.
+    Returns:
+        Tensor of weights for [offline_dataset; online_dataset], normalized to 1.
+
+    Notes to maintainers:
+        - This duplicates some logic from EpisodeAwareSampler. We should consider converging to one approach.
+        - When used with `torch.utils.data.WeightedRandomSampler`, it could completely replace
+          `EpisodeAwareSampler` as the online dataset related arguments are optional. The only missing feature
+          is the ability to turn shuffling off.
+        - Options `drop_first_n_frames` and `episode_indices_to_use` can be added easily. They were not
+          included here to avoid adding complexity.
+    """
+    if len(offline_dataset) == 0 and (online_dataset is None or len(online_dataset) == 0):
+        raise ValueError("At least one of `offline_dataset` or `online_dataset` should be contain data.")
+    if (online_dataset is None) ^ (online_sampling_ratio is None):
+        raise ValueError(
+            "`online_dataset` and `online_sampling_ratio` must be provided together or not at all."
+        )
+    offline_sampling_ratio = 0 if online_sampling_ratio is None else 1 - online_sampling_ratio
+
+    weights = []
+
+    if len(offline_dataset) > 0:
+        offline_data_mask_indices = []
+        for start_index, end_index in zip(
+            offline_dataset.episode_data_index["from"],
+            offline_dataset.episode_data_index["to"],
+            strict=True,
+        ):
+            offline_data_mask_indices.extend(
+                range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
+            )
+        offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
+        offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
+        weights.append(
+            torch.full(
+                size=(len(offline_dataset),),
+                fill_value=offline_sampling_ratio / offline_data_mask.sum(),
+            )
+            * offline_data_mask
+        )
+
+    if online_dataset is not None and len(online_dataset) > 0:
+        online_data_mask_indices = []
+        episode_indices = online_dataset.get_data_by_key("episode_index")
+        for episode_idx in torch.unique(episode_indices):
+            where_episode = torch.where(episode_indices == episode_idx)
+            start_index = where_episode[0][0]
+            end_index = where_episode[0][-1] + 1
+            online_data_mask_indices.extend(
+                range(start_index.item(), end_index.item() - online_drop_n_last_frames)
+            )
+        online_data_mask = torch.zeros(len(online_dataset), dtype=torch.bool)
+        online_data_mask[torch.tensor(online_data_mask_indices)] = True
+        weights.append(
+            torch.full(
+                size=(len(online_dataset),),
+                fill_value=online_sampling_ratio / online_data_mask.sum(),
+            )
+            * online_data_mask
+        )
+
+    weights = torch.cat(weights)
+
+    if weights.sum() == 0:
+        weights += 1 / len(weights)
+    else:
+        weights /= weights.sum()
+
+    return weights
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_cabinet.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_cabinet.txt
@@ -1,85 +0,0 @@
-https://drive.google.com/file/d/1_SOJkgfP5yZyVjMhTt3nwhvyUjcnlI51/view?usp=drive_link
-https://drive.google.com/file/d/1rmgN8UUzph1qwJnzG1d-uOafodn-gLvb/view?usp=drive_link
-https://drive.google.com/file/d/1NYQ-XxsBVinB6dUoZmVWweT83367P3i2/view?usp=drive_link
-https://drive.google.com/file/d/1oAv_j74zxxCJieMG7r5Vl2BeHK1__3s3/view?usp=drive_link
-https://drive.google.com/file/d/1wFUJQROsrTJt64YRuIeExhFjr2wnK5uu/view?usp=drive_link
-https://drive.google.com/file/d/1KzL3Tt0Le7jVl58XVRUcmigmXjyiuhbK/view?usp=drive_link
-https://drive.google.com/file/d/1qy_YBladeHtianSSGtgAPSHtMin7msvf/view?usp=drive_link
-https://drive.google.com/file/d/1rA_F0V_qL_nyuC_0aBKCisF4-0TIkF2Y/view?usp=drive_link
-https://drive.google.com/file/d/1hw-8qMpz9VgSt62XoASqNRuPECpCwJQP/view?usp=drive_link
-https://drive.google.com/file/d/1BpHOl9rKMzdvNGka6js7C0s40hH6vnDA/view?usp=drive_link
-https://drive.google.com/file/d/1PazhkhiDnJ-OUMyDVDFxEZNKQQqHiNWS/view?usp=drive_link
-https://drive.google.com/file/d/1lZ665R6ATl57dypxH4dGJ2NSt6XYnbuz/view?usp=drive_link
-https://drive.google.com/file/d/1V9HzLaf-tlG15wUzT7KrTDCS_z1vi5NV/view?usp=drive_link
-https://drive.google.com/file/d/1aKauWiXoKqbNwn_2xs4MrmLlaNYlVNmO/view?usp=drive_link
-https://drive.google.com/file/d/1WVD5DFhriO1YmmOgiVHhacR6HWoTPxav/view?usp=drive_link
-https://drive.google.com/file/d/1_X43WgeBAsfkhH9EmpyPki8U9joMeAGC/view?usp=drive_link
-https://drive.google.com/file/d/1t8x0GqWoNKWtnBsB7_D40Z34nL9ak4kf/view?usp=drive_link
-https://drive.google.com/file/d/15V_f26WaKOXjKnq2T3HRWAmtQUi4lbu2/view?usp=drive_link
-https://drive.google.com/file/d/11VFIAsiSDsMOBANgrOcZBpKB9AFWnLy7/view?usp=drive_link
-https://drive.google.com/file/d/1M0NS7vVaxJv3FHnuRYtdwTFYF7We4LxP/view?usp=drive_link
-https://drive.google.com/file/d/1mR0OItTNqFnVLoczcyKYlm6drAy778lO/view?usp=drive_link
-https://drive.google.com/file/d/1NbVFWDQAh-z4JJ4D-Zw6Lps9kdvpqh2j/view?usp=drive_link
-https://drive.google.com/file/d/1JQoZGBzl4W3QG26-n39tefcGN0fDRMbB/view?usp=drive_link
-https://drive.google.com/file/d/1VBjHl-TvZpncopvasIP5G9gecbB2a5f6/view?usp=drive_link
-https://drive.google.com/file/d/1VzSf6zaB21nahm7MsPwroXbJ84NIwq0b/view?usp=drive_link
-https://drive.google.com/file/d/1OtNnfMEydNtZOcivs4k6E_uJSpf8PkGy/view?usp=drive_link
-https://drive.google.com/file/d/14nVvpvsrFr_03Pa_N7MKzwnRwibOUYM6/view?usp=drive_link
-https://drive.google.com/file/d/1M8li6duiO2r3lv_9HhF_XJn0oZUIEK5F/view?usp=drive_link
-https://drive.google.com/file/d/1Cpzea6fO14lxAaNfSBifqoa4ekhCiLD1/view?usp=drive_link
-https://drive.google.com/file/d/1mbxRTm5vlbsY9UJ0jfjM6j9D7kPJjBpG/view?usp=drive_link
-https://drive.google.com/file/d/1RXD1i6IfWsHRlCxVmG04h2h5Ycm_WwZN/view?usp=drive_link
-https://drive.google.com/file/d/1QFqFSwDGOk1BkgGmqgCcc2BRWnJ6R3MA/view?usp=drive_link
-https://drive.google.com/file/d/1bFqWR8DQM0ZUxxtS2bl-RANQvukeFLzp/view?usp=drive_link
-https://drive.google.com/file/d/1pR-rH3yNGoyPdD4hJ6-3lXQ-PstBx9du/view?usp=drive_link
-https://drive.google.com/file/d/107OAwLY-hva9HeQLIK7VCh-ytdDabVjr/view?usp=drive_link
-https://drive.google.com/file/d/1Tpl08QOaSZ37GTO4awFWSdD8wBR9xdlT/view?usp=drive_link
-https://drive.google.com/file/d/1MR164AOM-0S1T6RX8xKTV2IHyaCvpqAW/view?usp=drive_link
-https://drive.google.com/file/d/1_wknJfVnStIhJ82lU_QtcrwahsqYIsr8/view?usp=drive_link
-https://drive.google.com/file/d/1ZuEktWrbYkTx0l5pj3WiZ2CJrfbDOHNo/view?usp=drive_link
-https://drive.google.com/file/d/15G_10hkkkq6yxvyI5NGZirlF-RzduR2F/view?usp=drive_link
-https://drive.google.com/file/d/1DBKxg3ONqh7dhLuX6oh1Yyo2x383V1Hp/view?usp=drive_link
-https://drive.google.com/file/d/1B5iDBkTUr5vopDddV_fHud18SqAHhauS/view?usp=drive_link
-https://drive.google.com/file/d/1acwFV0eenRkki1QcjSKH5xqOtys-P3Pr/view?usp=drive_link
-https://drive.google.com/file/d/1S47BI83xyrh-FKXsvAQqer98Biu_p8XK/view?usp=drive_link
-https://drive.google.com/file/d/1JL6DmBZl3uyq9dyLfgSqtGF06e7E9JwM/view?usp=drive_link
-https://drive.google.com/file/d/16WvRS4Kjog8Pxgr0E3sGGnI01YwL9Uql/view?usp=drive_link
-https://drive.google.com/file/d/12ttGqL33IPWg0-s1SD44rr22M6LiSQBr/view?usp=drive_link
-https://drive.google.com/file/d/1OyZqqnldTU_DliRbr6x0C4a_iWPwIN7j/view?usp=drive_link
-https://drive.google.com/file/d/1oYk00IpLnR9fesLfD15Ebe7nVBffEbcS/view?usp=drive_link
-https://drive.google.com/file/d/1eyE2-MQduCEqCd-5_kl5zsoOEERAzpZD/view?usp=drive_link
-https://drive.google.com/file/d/1ir1Ya-vO0d97pfvbePlUeuKTTRc0qIMU/view?usp=drive_link
-https://drive.google.com/file/d/1hOi-JnqlMt47gVnLZHMTqeojyYVErohl/view?usp=drive_link
-https://drive.google.com/file/d/1NFFw5_PqigQ7xGqsL-MNq2B1r5yAscCf/view?usp=drive_link
-https://drive.google.com/file/d/1uftq1-Zlh8d2sNLWrlVcKYQUwZTD7o24/view?usp=drive_link
-https://drive.google.com/file/d/1-ax19dSLPacVgk000T-m3l4flPcg07pM/view?usp=drive_link
-https://drive.google.com/file/d/126y-lgn86-ZmCz8hooF1THKJGGObw3OB/view?usp=drive_link
-https://drive.google.com/file/d/1JiDniK0VmDIkk92AbBILb8J2Ba59PWML/view?usp=drive_link
-https://drive.google.com/file/d/1kr8nPIRljiU0R4J9SMgj80o1FPQxzu9z/view?usp=drive_link
-https://drive.google.com/file/d/1bbThWRij1pKBh_kFgV8FwK0sXtTHBoLX/view?usp=drive_link
-https://drive.google.com/file/d/1WenzDW6lxk1xkOFm-OiGFfc0ROskAuKU/view?usp=drive_link
-https://drive.google.com/file/d/1MiKRzuzUn1yN-k_6kPJJzIGy7dT-nnsD/view?usp=drive_link
-https://drive.google.com/file/d/17rRg2tcmB-gNhQ0KoZJQmNfyFeoij1jH/view?usp=drive_link
-https://drive.google.com/file/d/11mokBpvrY3ld6sY5WztREtJ1jgqfQV70/view?usp=drive_link
-https://drive.google.com/file/d/1Il_6IOx9NDp1bX_KHizJfBwzTufTmn86/view?usp=drive_link
-https://drive.google.com/file/d/1KswtJGsxJ7eeBDAmNA_aeLjOxcH6MIxa/view?usp=drive_link
-https://drive.google.com/file/d/1gzMhi5uWu4C3Y6WbQ3L-08V96GxTZrRR/view?usp=drive_link
-https://drive.google.com/file/d/1nRQFtaBxfUCYc2W90Qibh0kHCt6YQCfc/view?usp=drive_link
-https://drive.google.com/file/d/1vs-gyW-KheqHbUATwAhA2mmR9GOGw7f_/view?usp=drive_link
-https://drive.google.com/file/d/1MuxzGOA2fgLaHryq82KkQumtuRJGcUOC/view?usp=drive_link
-https://drive.google.com/file/d/1IIwxZnGlqrXLUXqG6yMO0r7uhCvhpk9e/view?usp=drive_link
-https://drive.google.com/file/d/1vE7XPyaFcXP4DtTY5Y9WKIt7zWgmX-Cr/view?usp=drive_link
-https://drive.google.com/file/d/1j-bIV09gr21RC3-x1N_pK4RPLV3fmWKz/view?usp=drive_link
-https://drive.google.com/file/d/1t3nW1rD3S-EL0Oymb5U7ZAj5UMkydkln/view?usp=drive_link
-https://drive.google.com/file/d/14hbfHCdMKtJZ41F9CQReMec2jeRFTOqR/view?usp=drive_link
-https://drive.google.com/file/d/1x-hUyOSne5BW0AzQ3W6_Pf4g5yXQWi9M/view?usp=drive_link
-https://drive.google.com/file/d/1sw9JqRg6E-3P84I3ZhzTrJMu0vuiaMmP/view?usp=drive_link
-https://drive.google.com/file/d/1LuqhQlL4MGZhB_6THmkovRxrlP26BbdC/view?usp=drive_link
-https://drive.google.com/file/d/15C5K6v_lkjnMSmUvVyqHQKwh2N166e7K/view?usp=drive_link
-https://drive.google.com/file/d/1ns_9eSsQeeoZ10nlbkLy8tu0GmJFSnkt/view?usp=drive_link
-https://drive.google.com/file/d/1NpzWJeK6CqjxzjIMYe6aYdX8xGsQwD4o/view?usp=drive_link
-https://drive.google.com/file/d/1NMLezwufKJ9_8xTc9KQThSzVVD71B9Ui/view?usp=drive_link
-https://drive.google.com/file/d/1aa71DCUqs6oXlIxX35jgsmsgm-NlDxPV/view?usp=drive_link
-https://drive.google.com/file/d/1UJzkIZzAL0j-D5YQBnoq7mHvttASy12O/view?usp=drive_link
-https://drive.google.com/file/d/1nPgx36HIJFb7oI94VbRzWjpPP2GANxzG/view?usp=drive_link
-https://drive.google.com/file/d/1NovAP-KVJjqcuvWy3d6G4ptGGAIDqcCx/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_chair.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_chair.txt
@@ -1,55 +0,0 @@
-https://drive.google.com/file/d/11M3Ye0r5agMaaicPbVGD0q2Hb3rGklbb/view?usp=drive_link
-https://drive.google.com/file/d/1-tx7SvYYgSvXCvnf_EI2OVdwK-CkFY6S/view?usp=drive_link
-https://drive.google.com/file/d/1EWJunmOpMHaU1hE106wwpbkGYcjQXYAF/view?usp=drive_link
-https://drive.google.com/file/d/1IDn95Z7FSiCckrSENtGV4u3RyFHNQSDY/view?usp=drive_link
-https://drive.google.com/file/d/1CwzvWj1i7QOtqrZvsCZ6BdZaKNDfpN32/view?usp=drive_link
-https://drive.google.com/file/d/1HvAvlhm77nAD3Td24QPSeq8lw-Rl_aOh/view?usp=drive_link
-https://drive.google.com/file/d/1t-suKYOPhXH666RpAYNRp2QU_DOy3AeM/view?usp=drive_link
-https://drive.google.com/file/d/18xpKgWh7RWyjMN5PkLTOo-AxsAadAuRw/view?usp=drive_link
-https://drive.google.com/file/d/1oci5Eto-ztv-AQNz8EnwZveBIhxvk-xJ/view?usp=drive_link
-https://drive.google.com/file/d/1Y-t_4vxdE6NpHO0DLJR8f3mD0Q-Wj5-c/view?usp=drive_link
-https://drive.google.com/file/d/1lylRqbbbB8bgtpsBWMPACmHJreuKmllv/view?usp=drive_link
-https://drive.google.com/file/d/1yliSyMig_NXShWfQx6qyW7Ijf2Y5lFK6/view?usp=drive_link
-https://drive.google.com/file/d/1XXhwJsJbeb7KXAooGvJapnm9bjnGUmxS/view?usp=drive_link
-https://drive.google.com/file/d/1_xs1f3hW2JArKyvfF7UWubWjyROGTLs6/view?usp=drive_link
-https://drive.google.com/file/d/1WVEHpr6EqKCZbkHapQSTXJq4xE4SWFT-/view?usp=drive_link
-https://drive.google.com/file/d/1RqOHv9pEQGvW8NUA7ynffFmG999TL_Az/view?usp=drive_link
-https://drive.google.com/file/d/1cu5AgD2gh-uA3PFJmzxxzNaF3qOSlYY1/view?usp=drive_link
-https://drive.google.com/file/d/1SsrXqiPclNrnYToPZ9Uq-k3y0C4qdHT1/view?usp=drive_link
-https://drive.google.com/file/d/1-J7EXf0vjkLIfSqT8ICEsP6CTjzSLBop/view?usp=drive_link
-https://drive.google.com/file/d/11O7ewUmoZXfyyKjy_6B5RW4DpjICxqBT/view?usp=drive_link
-https://drive.google.com/file/d/1iic44kZoCsjNsfAz2cMstZ9-WQvAhblF/view?usp=drive_link
-https://drive.google.com/file/d/1yLV1lVX-2WnWQldGlnQZ0x7QBuDiVkL3/view?usp=drive_link
-https://drive.google.com/file/d/1Tybp9ru98TTbGn4eyROpUQwDFuALWXmk/view?usp=drive_link
-https://drive.google.com/file/d/13E9OTMiipVJByDs5-J19oWwAz7l94LTN/view?usp=drive_link
-https://drive.google.com/file/d/1EeTpJQdMSliw4JzSMtJ6CyTvVdexjM4M/view?usp=drive_link
-https://drive.google.com/file/d/1NHyNwoFqzeAu-1_PSpq5JfxaiD_xbpn9/view?usp=drive_link
-https://drive.google.com/file/d/1fJcS0phDp4xm_FyGaJ5wr9Pe4KqtHaxD/view?usp=drive_link
-https://drive.google.com/file/d/12AqrLUaewDPEcFRqPZeZFb_TQ0Lfi3At/view?usp=drive_link
-https://drive.google.com/file/d/1x_hd4Qsq1oJS-aj2t3qM7WbbV7KZj05b/view?usp=drive_link
-https://drive.google.com/file/d/14OUSUArmsB068hs6BuEIXQhI1Cyz8Sf0/view?usp=drive_link
-https://drive.google.com/file/d/16zlzh1T5zeUJQnFf382NXkFEKEnDub4O/view?usp=drive_link
-https://drive.google.com/file/d/1IbDltmN-NEFCNtr1TO4ILxEgQ94rtjWv/view?usp=drive_link
-https://drive.google.com/file/d/15gmlf8Gx9455pZ1AlqcCSwh3nDPxMzSr/view?usp=drive_link
-https://drive.google.com/file/d/1qHpRL1oZfIMo_vxnm8qfwQ-7l0BZIVva/view?usp=drive_link
-https://drive.google.com/file/d/1H1xskIgiFZivkYn23rMzH3xePGOh3VTC/view?usp=drive_link
-https://drive.google.com/file/d/1avls6Pv0kYiCMNVknbc1zQsgy64MUDMM/view?usp=drive_link
-https://drive.google.com/file/d/1MmWVgCj5khc8KMIifmt3EzF1o-CtPyyn/view?usp=drive_link
-https://drive.google.com/file/d/1U0kCc_xqW0WNppf4sbnK14euWKdPZtzB/view?usp=drive_link
-https://drive.google.com/file/d/16CaEyQscOuhLj23PEGDTL9DeyNkohkMn/view?usp=drive_link
-https://drive.google.com/file/d/1Iu8uM6UUJ0zW8tvN-9UiOe_4oSNzEutg/view?usp=drive_link
-https://drive.google.com/file/d/1UImqiBaIxCR-1DNJaZhHqeHhaySOtVIr/view?usp=drive_link
-https://drive.google.com/file/d/1VpU2V_leIoRIyv_lAvE7eLHBG8DxCTnp/view?usp=drive_link
-https://drive.google.com/file/d/1_Q8J27OT3Xby7QY6yHvIJauFRWEMxkRm/view?usp=drive_link
-https://drive.google.com/file/d/1bantmVo1L9Xz4tbiNw_a1UC2Z_HPO1wT/view?usp=drive_link
-https://drive.google.com/file/d/1IRIXMJMCBDkBjbaHvAlEiBogSvZ1jK_3/view?usp=drive_link
-https://drive.google.com/file/d/1mAHXKjiFbjwydypW2t5Lv8_H5x6nHegl/view?usp=drive_link
-https://drive.google.com/file/d/1SfyY796fLrBCMY39OcyuxZafqSCRZPZk/view?usp=drive_link
-https://drive.google.com/file/d/1X-44sZ8CcfzIskc0dvSx882o1yFhHaZB/view?usp=drive_link
-https://drive.google.com/file/d/1BOIWCCCk6DLD4Bmvc75ZbbLi9AQm-1ao/view?usp=drive_link
-https://drive.google.com/file/d/1RuyDtRE1kk76sw-wP8vx5SgLoPF3PA_H/view?usp=drive_link
-https://drive.google.com/file/d/1c4eoQiBbGuy3CTAQDUSkd84Ponh1roAQ/view?usp=drive_link
-https://drive.google.com/file/d/19PXB9z4Ljq6dsbf9TqcOrrP5SRbw2Tc_/view?usp=drive_link
-https://drive.google.com/file/d/1nn1VVZVoIXWdYDozR7XHXE4mPLQG80PQ/view?usp=drive_link
-https://drive.google.com/file/d/1MBdFGOKPV8GUhwoSsJ_Ky3qAMLM2Bv3K/view?usp=drive_link
-https://drive.google.com/file/d/1of3k_M-7Nh3I1TndcWedxK4ca9dn8Sc5/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_elevator.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_elevator.txt
@@ -1,20 +0,0 @@
-https://drive.google.com/file/d/12ctkOAdkCNGN1JLbZb5ww3XTBn2LFpGI/view?usp=drive_link
-https://drive.google.com/file/d/1G_Vd46_4fq6O64gHHjUbJX5Ld44ZZx0y/view?usp=drive_link
-https://drive.google.com/file/d/1uKgUy73B3xBogQAOUhfZjO0X5qZGsi2c/view?usp=drive_link
-https://drive.google.com/file/d/1fu9cIrfI-fE2LhdGUxbx7-8Ci_PF8Ypm/view?usp=drive_link
-https://drive.google.com/file/d/1Ygk9ZPJzx8xw2A9JF3NHbJ44TqnvSTQR/view?usp=drive_link
-https://drive.google.com/file/d/18m5xPuccNsEB20WPshm3zhxmXc6k63ED/view?usp=drive_link
-https://drive.google.com/file/d/1DiqqxC44rriviRQpqogcv0-EB-Y6nr9g/view?usp=drive_link
-https://drive.google.com/file/d/1qPdaoTVDizJXkfXLioWU7iJ8hqCXSyOQ/view?usp=drive_link
-https://drive.google.com/file/d/1Fj9kIA_mG7f67WFfACJEaZ7izcHG7vUm/view?usp=drive_link
-https://drive.google.com/file/d/1WpYehZnI2P7dUdJPfkE-ij1rqCnjZEbB/view?usp=drive_link
-https://drive.google.com/file/d/1_zwWkT4jPyzB38STWb6whlzsPzXmfA9r/view?usp=drive_link
-https://drive.google.com/file/d/1U6-J4I_fPlSFFGfhZPxS5_YzKXwXIZYp/view?usp=drive_link
-https://drive.google.com/file/d/1pRhxxcTfZp5tQo_EScvJUwfc3amiS6Vk/view?usp=drive_link
-https://drive.google.com/file/d/1lWLntqra83RlYU_gN7Vostnfydf6gutd/view?usp=drive_link
-https://drive.google.com/file/d/1vIBKo0x-NYEHV1FvRpco1lQMpRdAWAIL/view?usp=drive_link
-https://drive.google.com/file/d/1pdrLV3JTQou_XH0Aap61Ssf60iVKm1jJ/view?usp=drive_link
-https://drive.google.com/file/d/1QTsLoQ7SwmKdQHjBGVDaR2uTwfFwtrOf/view?usp=drive_link
-https://drive.google.com/file/d/1Gytai8M_12J36GY6L_TulEcOC-035jwS/view?usp=drive_link
-https://drive.google.com/file/d/14LJudNc629NT-i8xreXtzl27ce_DxOFJ/view?usp=drive_link
-https://drive.google.com/file/d/1sBvPCODbzxGAI0S3lgN5cSG9Go3lRi00/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_shrimp.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_shrimp.txt
@@ -1,18 +0,0 @@
-https://drive.google.com/file/d/1MJn9GbC8p9lN4gC9KDMLEkTkP_gGpXj0/view?usp=drive_link
-https://drive.google.com/file/d/1-4LXgjl7ZCOgp-8GCJmFRD8OeqN5Jf7-/view?usp=drive_link
-https://drive.google.com/file/d/1Ho06Ce0SPbqU3juaMxNUwAt3zCRLGC8W/view?usp=drive_link
-https://drive.google.com/file/d/1ivHoj7_7olBSxH-Y8kqXEW7ttITK-45j/view?usp=drive_link
-https://drive.google.com/file/d/1qjY4hM_IvZ8cq2II_n9MeJbvyeuN4oBP/view?usp=drive_link
-https://drive.google.com/file/d/1rKVhO_f92-7sw13T8hTVrza3B9oAVgoy/view?usp=drive_link
-https://drive.google.com/file/d/1pcLPHO8fBkc1-CRa88tyQtEueE4xiXNi/view?usp=drive_link
-https://drive.google.com/file/d/1Vev_chCsIeEdvQ8poEYNsOJFGy_QU8kZ/view?usp=drive_link
-https://drive.google.com/file/d/1l5G4zpRkxSLCQjvGPYSN4zfCvVRQuzMz/view?usp=drive_link
-https://drive.google.com/file/d/14vgthE1eoakXkr2-DRw50E6lAqYOiUuE/view?usp=drive_link
-https://drive.google.com/file/d/17nPSmKKmgQ2B7zkzWrZYiLM3RBuFod82/view?usp=drive_link
-https://drive.google.com/file/d/1QcDsxplVvb_ID9BVrihl5FvlC-j7waXi/view?usp=drive_link
-https://drive.google.com/file/d/18pEejBpI-eEVaWAAjBCyC0vgbX3T1Esj/view?usp=drive_link
-https://drive.google.com/file/d/1H8eH6_IRODtEFT6WoM77ltR5OoOrqXmI/view?usp=drive_link
-https://drive.google.com/file/d/1IWlpFRZhoxyG4nS13CWK4leZVk5wbNx4/view?usp=drive_link
-https://drive.google.com/file/d/1PbZA8_OCGmMLxNP9xbkLRSChniL4uGxl/view?usp=drive_link
-https://drive.google.com/file/d/1p9XAdmG2f_WeflNO4DIJ_tr1rK6M9B4B/view?usp=drive_link
-https://drive.google.com/file/d/1nS59Et1cNAvKo3Y4SeSGRuZD5TvBbCF3/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_wash_pan.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_wash_pan.txt
@@ -1 +0,0 @@
-https://drive.google.com/drive/folders/1S8eFg98IaGAIKVZ8QFWG1bx4mHa-O204
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_wipe_wine.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/mobile_wipe_wine.txt
@@ -1,4 +0,0 @@
-https://drive.google.com/drive/folders/1tC_g1AJ8lglBLY-fjsQrG6DMBa3Ucp-0
-https://drive.google.com/file/d/1fG_Yi2MJrFjiUVN3XoiWXLtTxHlwwaDv/view?usp=drive_link
-https://drive.google.com/file/d/1WX32VWfzzX3Blmd06DRxLwFbMJfVe7P4/view?usp=drive_link
-https://drive.google.com/file/d/18onsX3vXg3xkFwP5bVUCjdV4n9TRn0C9/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_insertion_human.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_insertion_human.txt
@@ -1,3 +0,0 @@
-https://drive.google.com/drive/folders/1RgyD0JgTX30H4IM5XZn8I3zSV_mr8pyF
-https://drive.google.com/file/d/18Cudl6nikDtgRolea7je8iF_gGKzynOP/view?usp=drive_link
-https://drive.google.com/file/d/1C1kZYyROzs-PrLc0SkDgUgMi4-L3lauE/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_insertion_scripted.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_insertion_scripted.txt
@@ -1,3 +0,0 @@
-https://drive.google.com/drive/folders/1TsojQQSXtHEoGnqgJ3gmpPQR2DPLtS2N
-https://drive.google.com/file/d/1wfMSZ24oOh5KR_0aaP3Cnu_c4ZCveduB/view?usp=drive_link
-https://drive.google.com/file/d/17EuCUWS6uCCr6yyNzpXdcdE-_TTNCKtf/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_transfer_cube_human.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_transfer_cube_human.txt
@@ -1,3 +0,0 @@
-https://drive.google.com/drive/folders/1sc-E4QYW7A0o23m1u2VWNGVq5smAsfCo
-https://drive.google.com/file/d/18smMymtr8tIxaNUQ61gW6dG50pt3MvGq/view?usp=drive_link
-https://drive.google.com/file/d/1Nk7l53d9sJoGDBKAOnNrExX5nLacATc6/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_transfer_cube_scripted.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/sim_transfer_cube_scripted.txt
@@ -1,3 +0,0 @@
-https://drive.google.com/drive/folders/1aRyoOhQwxhyt1J8XgEig4s6kzaw__LXj
-https://drive.google.com/file/d/1pnGIOd-E4-rhz2P3VxpknMKRZCoKt6eI/view?usp=drive_link
-https://drive.google.com/file/d/1GKReZHrXU73NMiC5zKCq_UtqPVtYq8eo/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_battery.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_battery.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/19qS_n7vKgDcPeTMnvDHQ5-n73xEbJz5D
-https://drive.google.com/file/d/1oC31By0A2bsBeHyUwBdQw1z4ng6yi9Za/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_candy.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_candy.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1m5rQ6UVH8Q9RQp_6c0CxkQ88-L-ScO7q
-https://drive.google.com/file/d/1wHz2qcmwcVG0C0CZ9MjQDQcmj4OY9_a3/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_coffee.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_coffee.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1seQGay470nGQ-knBI5TjsTr8iL9Qws5q
-https://drive.google.com/file/d/1T89hSX5U99wLGvGTE7yUBaQPOpyj6Sai/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_coffee_new.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_coffee_new.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1t3eDc5Rg0DveyRe8oTm6Dia_FYU5mXyf
-https://drive.google.com/file/d/1TXFaduTakvS0ZWJqKCX-HIvYglum_5CY/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_cups_open.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_cups_open.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1Z9X3DNzd6LS0FFjQemNUMoMA5yk5VQOh
-https://drive.google.com/file/d/1Wlyc0vTkjXuWB6zbaVOWhEfD7BmPgUV_/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_fork_pick_up.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_fork_pick_up.txt
@@ -1,53 +0,0 @@
-https://drive.google.com/drive/folders/1DYgB4ifX4uIid9m9jnC0Zdz8Nf7ZC0fc
-https://drive.google.com/file/d/1Eb-NRNk_FmVleCbU_Ng5Y4dfcjTKN7Rv/view?usp=drive_link
-https://drive.google.com/file/d/1dkhjEADakT-44l9jf-nK4x89kr4yG_qb/view?usp=drive_link
-https://drive.google.com/file/d/14hDhgcZkVqNExGb4tIXpSjMshhqZETch/view?usp=drive_link
-https://drive.google.com/file/d/1zVMEHpHbuNyP5A_lYU7RPSLB-4V0yfZw/view?usp=drive_link
-https://drive.google.com/file/d/1JtgDjBvy7FnRpFzrx_foC3quorYQFAR-/view?usp=drive_link
-https://drive.google.com/file/d/1EHdneB6F-PP0dQlX8qPaXbxmKoBy_YwO/view?usp=drive_link
-https://drive.google.com/file/d/17Z0jjVBy1OPKREPu77_n_rQzorDiapji/view?usp=drive_link
-https://drive.google.com/file/d/1F4i23qPJ_qTf5jWjfLo4ARGJChznYWt3/view?usp=drive_link
-https://drive.google.com/file/d/1kZtXWM3uS0-rLblydBfJ0mMcVnMMXw9w/view?usp=drive_link
-https://drive.google.com/file/d/1mNODox87xFfY5Z_o5mcLsr8SHb39jDik/view?usp=drive_link
-https://drive.google.com/file/d/1Ob44VdmEUA93FKDECiRb5Ogz2xQg5IWp/view?usp=drive_link
-https://drive.google.com/file/d/1fdQLdjj3Cwv33R1wZhfrLz9Del8mqgHb/view?usp=drive_link
-https://drive.google.com/file/d/1Yu3L3ft21zP__XL8pCfhb788ZleuW1n5/view?usp=drive_link
-https://drive.google.com/file/d/1ozBBWXVZ9hXDh9ooHUNroHdYm8UDqnhJ/view?usp=drive_link
-https://drive.google.com/file/d/1o0TGqvfWw_Lunxb5ubKDS21Lr_WC0h75/view?usp=drive_link
-https://drive.google.com/file/d/1jZnd5eP5L6BH5l98BPN6OnoQx3fu8e9n/view?usp=drive_link
-https://drive.google.com/file/d/1S5sYbz8wcLYp0V67v13i4PRcBxodn4Hg/view?usp=drive_link
-https://drive.google.com/file/d/1rFeg_x6ftJYwPtBv34D3h2L2cpDLeR4G/view?usp=drive_link
-https://drive.google.com/file/d/1GvS3lcm4o6nm_scUk0XxKeVFNmzjucDZ/view?usp=drive_link
-https://drive.google.com/file/d/1-9i0riphC7NhhDahcQfD1QoBXP5gF90A/view?usp=drive_link
-https://drive.google.com/file/d/15p_IqGsMbKuvzMS872THAZr-3SBtb1Fr/view?usp=drive_link
-https://drive.google.com/file/d/1ToyYcBfJL8gbQn0q_59zPLsFmm7dmMJo/view?usp=drive_link
-https://drive.google.com/file/d/1e_7PNH7CYafE4pAebP7ZdI7XFbmEcy_i/view?usp=drive_link
-https://drive.google.com/file/d/1JoabvGVsIQdug2xOhUIhetEIyDM91y_Y/view?usp=drive_link
-https://drive.google.com/file/d/1kOMw1y0lmnVaCjwZICfzCsx6e0Z8MNGR/view?usp=drive_link
-https://drive.google.com/file/d/16it_wd1JOevUQTK2_CvF_pBACTgpIPgM/view?usp=drive_link
-https://drive.google.com/file/d/1IRcCj9HnJSfbyMgr5XEERGlEnWeZQwOc/view?usp=drive_link
-https://drive.google.com/file/d/1Z2dIJfq_S3liGmPN9Rphvkmucnmw7tlb/view?usp=drive_link
-https://drive.google.com/file/d/1J3NoAjzndGx9yNyaBOJHdNny1epzUoBt/view?usp=drive_link
-https://drive.google.com/file/d/18nOvxV1k8FSmBrhT4TPo2sKKSZXougyx/view?usp=drive_link
-https://drive.google.com/file/d/1CT8FxclafFMjSd7gCWVw3VSeryeiF04i/view?usp=drive_link
-https://drive.google.com/file/d/16M9KVqQMFfSsXfypK0bocFft8Nz3j2Rt/view?usp=drive_link
-https://drive.google.com/file/d/18QPVkw6bj6HW8LTPrQLWrrUX4R6RcF42/view?usp=drive_link
-https://drive.google.com/file/d/1hQTVtA5hBTE_StXpJafTZJ3tgt2VQQ_t/view?usp=drive_link
-https://drive.google.com/file/d/1Dn-d5g69H6EgAWgsFdrcbJKtz7ySsCQ8/view?usp=drive_link
-https://drive.google.com/file/d/13hMr16483P7ALYv73yMRUN37fJdVQM62/view?usp=drive_link
-https://drive.google.com/file/d/1848yN3XMN5zJMEgApt6KzrWgfRPfimtv/view?usp=drive_link
-https://drive.google.com/file/d/1oAD9kSnS0fTgj-CjD4u9VdZ5X67IOIMa/view?usp=drive_link
-https://drive.google.com/file/d/1ilzIWLCCG5b_KgF5s0wdN2I5-lFNpwC1/view?usp=drive_link
-https://drive.google.com/file/d/1rjsT2YBjnidxod1s9s-myAYz8boHr-WB/view?usp=drive_link
-https://drive.google.com/file/d/18Gg48HTub15bd8qzbhiCUufbVy0fbN5G/view?usp=drive_link
-https://drive.google.com/file/d/1WsSnQSqmMTVSRwrhT1Y-v782My2zcjLm/view?usp=drive_link
-https://drive.google.com/file/d/1ea9ZCvoyc-xqiFXgeDcA_mOWsw7VUuoi/view?usp=drive_link
-https://drive.google.com/file/d/1wv1v3-XhPgbNzp62BXbJTDzMPu2tlDUc/view?usp=drive_link
-https://drive.google.com/file/d/18-ikzt8LoZ83Gi3goKCELs4U4z8hrRoF/view?usp=drive_link
-https://drive.google.com/file/d/16Bjhp7JNCXkGuLvyNcZowAx3W-Y-15DV/view?usp=drive_link
-https://drive.google.com/file/d/1Gc-KRI-xwcp1fMR55ugbrLg_5y3SPde-/view?usp=drive_link
-https://drive.google.com/file/d/1oP72Q386Z4Sy5MMm-t5yNogIe5Van_9k/view?usp=drive_link
-https://drive.google.com/file/d/112T90eDUDVH-SyOV7UnZl5bscAH2hcfq/view?usp=drive_link
-https://drive.google.com/file/d/1y-uKOesRRhjgDtFbG_j65f4SGg0v8XDg/view?usp=drive_link
-https://drive.google.com/file/d/1LOP05OagoI3km-ZKQBrS204A85UVk7Ok/view?usp=drive_link
-https://drive.google.com/file/d/1QkHQKgasVzWsmdPvkXgGhWyQ84d93_Az/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_pingpong_test.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_pingpong_test.txt
@@ -1 +0,0 @@
-https://drive.google.com/drive/folders/1Ut2cv6o6Pkfgg46DgwVUM7Z5PkNG8eJ-
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_pro_pencil.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_pro_pencil.txt
@@ -1 +0,0 @@
-https://drive.google.com/drive/folders/1FqxPV0PgvgIu8XFjtvZSPSExuNcxVVAY
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_screw_driver.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_screw_driver.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1SKtG0ct9q0nVdYssJNMWSOjikcXliT58
-https://drive.google.com/file/d/1nchD21O30B3i3LDoqramo1zgW5YvpJIN/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_tape.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_tape.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1_4DHf2cma0xsChLQFghwigX6Ukti5-zQ
-https://drive.google.com/file/d/1_8vS4hDNDgUQY-SmekrNaa7dF67QJYU-/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_thread_velcro.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_thread_velcro.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1_4DHf2cma0xsChLQFghwigX6Ukti5-zQ
-https://drive.google.com/file/d/1_8vS4hDNDgUQY-SmekrNaa7dF67QJYU-/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_towel.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_towel.txt
@@ -1,2 +0,0 @@
-https://drive.google.com/drive/folders/1fAD7vkyTGTFB_nGXIKofCU1U05oE3MFv
-https://drive.google.com/file/d/1XzyQ2B6LLvcurIonOpEu4nij2qwNWshH/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_vinh_cup.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_vinh_cup.txt
@@ -1,53 +0,0 @@
-https://drive.google.com/drive/folders/13EQsVsnxT86K20QAoyE_YpsFbQ7fZQdu
-https://drive.google.com/file/d/1-W_JHghZG65FNTVhw1SXhtQrazdLL3Ue/view?usp=drive_link
-https://drive.google.com/file/d/1VwRJgdWUo-2nQaNM7Bs77-fsm8iwUxEo/view?usp=drive_link
-https://drive.google.com/file/d/1wFzGRo5iYA13WLi6IV1ry64RyahQBFio/view?usp=drive_link
-https://drive.google.com/file/d/1IKtQzQ-n-UTv64hYpReu2R4cqUvmNQqD/view?usp=drive_link
-https://drive.google.com/file/d/1GicVci9OiuuZZH79i5Mg7AtWod94MzwT/view?usp=drive_link
-https://drive.google.com/file/d/1JVnIoR7EIQp70T4eAf9RX65JcTrzsjQc/view?usp=drive_link
-https://drive.google.com/file/d/1W2xr4h23ucjPrc-mBEeqnACsfaImpc0p/view?usp=drive_link
-https://drive.google.com/file/d/10xj_0V7A07o3uCa7v5omUrTC0YlPW8H3/view?usp=drive_link
-https://drive.google.com/file/d/1FOc3EMaCy8Mb0_a7PuXLAwKwvxkbKmwU/view?usp=drive_link
-https://drive.google.com/file/d/143PgDXBcf2GQ0Q07ZPMVMfBgZDd5sLJG/view?usp=drive_link
-https://drive.google.com/file/d/1pE5Tyj0LlGbGWvUzuhixp86Ibu55Ez3I/view?usp=drive_link
-https://drive.google.com/file/d/141668b1VzX80ncrVJPzhkoAeIFB4MEK9/view?usp=drive_link
-https://drive.google.com/file/d/1bw12lo37p1ZvRvErHsll7cEYi2OxscvZ/view?usp=drive_link
-https://drive.google.com/file/d/1zfnMFvbgBjl6SzYhksbaOzfbwLrCN6tb/view?usp=drive_link
-https://drive.google.com/file/d/1-GIszA6mUJMaNB-tdh9r9skc77SWA0VX/view?usp=drive_link
-https://drive.google.com/file/d/1fTB0zWFYU6zh4IIUFT2zX_OkwYqmElwY/view?usp=drive_link
-https://drive.google.com/file/d/1gPIPNKGmrO9c7gKF7SP0SuUYbIBBq8z1/view?usp=drive_link
-https://drive.google.com/file/d/12JeJ-dQd5lYyn6PlDOGdE-ChVeiZ-Uv0/view?usp=drive_link
-https://drive.google.com/file/d/100_20cgCqerU6qoh3TfTbwLy9mlDAFEG/view?usp=drive_link
-https://drive.google.com/file/d/111oAGJ76ku_pYgbBoIdZAC1_XEQcPI__/view?usp=drive_link
-https://drive.google.com/file/d/1UhC8L-354ZQ2gblPFGI35EMsVwfpuKa0/view?usp=drive_link
-https://drive.google.com/file/d/1sIXQSgUR_xdrNtGrL6QGBnkLMKErsIp1/view?usp=drive_link
-https://drive.google.com/file/d/16Ax77bDSIXnsn4GFL8XYKKT1P6bPpfMd/view?usp=drive_link
-https://drive.google.com/file/d/1pgRVYwwVIsWq_qsWqZpe1UBzZfF5Fa9D/view?usp=drive_link
-https://drive.google.com/file/d/1jtimaZkWsY1P5gC2bbS64H_WCUU7HXN2/view?usp=drive_link
-https://drive.google.com/file/d/1N6Bh02P-RiTEgtx1YH1Db_X3TGpP-X_r/view?usp=drive_link
-https://drive.google.com/file/d/14Fy8EwJ8d9Vh97Yt1VOvUChSCrfIjBij/view?usp=drive_link
-https://drive.google.com/file/d/1IRuv42dvIMPuKhcMZmuXaBjJ-lPFOmQd/view?usp=drive_link
-https://drive.google.com/file/d/16XWzNY2D8ucVVn5geBgsVdhm3ppO4que/view?usp=drive_link
-https://drive.google.com/file/d/1xsVOoQgthK_L_SDrmq_JvQgUpAvPEAY8/view?usp=drive_link
-https://drive.google.com/file/d/1bZbw66DyEMvnJnzkdUUNbKjvNKg8KFYM/view?usp=drive_link
-https://drive.google.com/file/d/1CyTVkdrNGGpouCXr4CfhKbMzE6Ah3oo3/view?usp=drive_link
-https://drive.google.com/file/d/1hDRyeM-XEDpHXpptbT8LvNnlQUR3PWOh/view?usp=drive_link
-https://drive.google.com/file/d/1XhHWxbra8Iy5irQZ83IvxwaJqHq9x4s1/view?usp=drive_link
-https://drive.google.com/file/d/1haZcn6aM1o4JlmP9tJj3x2enrxiPaDSD/view?usp=drive_link
-https://drive.google.com/file/d/1ypDyuUTbljaBZ34f-t7lj3O_0bRmyX2n/view?usp=drive_link
-https://drive.google.com/file/d/1ILEEZo_tA9_ChIAprr2mPaNVKZi5vXsO/view?usp=drive_link
-https://drive.google.com/file/d/1U7nVYFaGE8vVTfLCW33D74xOjDcqfgyJ/view?usp=drive_link
-https://drive.google.com/file/d/1rZ93_rmCov5SMDxPkfM3qthcRELZrQX6/view?usp=drive_link
-https://drive.google.com/file/d/1mYO1b_csddtyE3qT6cwLiw-m2w2_1Lxh/view?usp=drive_link
-https://drive.google.com/file/d/1xz7Q5x2jikY8wJQjMRQpRws6AnfWlHm5/view?usp=drive_link
-https://drive.google.com/file/d/1OO8GaO-0FrSZRd1kxMYwBmubyiLOWnbl/view?usp=drive_link
-https://drive.google.com/file/d/1EXn4NVDmf-4_HCy34mYwT-vwK2CFI9ev/view?usp=drive_link
-https://drive.google.com/file/d/10hH70XhXRL9C5SnAG4toHtfHqfJUJo4H/view?usp=drive_link
-https://drive.google.com/file/d/18tiBcxea0guUai4lwsXQvt0q2LZ8ZnnJ/view?usp=drive_link
-https://drive.google.com/file/d/1Q8R8qv37vk5PQ5kQ2ibx6BFLOySD0VpX/view?usp=drive_link
-https://drive.google.com/file/d/17aNriHzjhdibCyuUjQoMFZqjybJZtggG/view?usp=drive_link
-https://drive.google.com/file/d/1LVjEYHSdeKm6CotU1QguIeNEPaIaFl_1/view?usp=drive_link
-https://drive.google.com/file/d/1ufAhE_EkgJ85slg2EW8aW_grOzE_Lmxd/view?usp=drive_link
-https://drive.google.com/file/d/1wtzLtXrkw9eXRGESTPIOlpl1tInu-b2m/view?usp=drive_link
-https://drive.google.com/file/d/1Mk5qvVtD_QHwGOUApRq76TUw2T5THu6f/view?usp=drive_link
-https://drive.google.com/file/d/1y1WQ3hboWVJ68KEYQQ3OhreGuaUpSgwc/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_vinh_cup_left.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_vinh_cup_left.txt
@@ -1,52 +0,0 @@
-https://drive.google.com/drive/folders/1dxWh6YFZUDt6qXIoxgD9bla3CiFjZ11C
-https://drive.google.com/file/d/1hNBJN00SCAlOl0ZEgm7RRGbAGDjyBs0p/view?usp=drive_link
-https://drive.google.com/file/d/17He0CVwXGeoMmXg4SHKo-osNn7YPKVL7/view?usp=drive_link
-https://drive.google.com/file/d/1laNKUVID1x2CV6a2O2WQjwFewKu4lidL/view?usp=drive_link
-https://drive.google.com/file/d/1pNf36xbZJGRArYLmNAvRj5y6CoqdC6kB/view?usp=drive_link
-https://drive.google.com/file/d/1_4E1-y3JXk5I0ebycLYM70YDPK9g52gZ/view?usp=drive_link
-https://drive.google.com/file/d/1PHfzhGPdbolKyOpS3FnR2w7Q8zUlJXSk/view?usp=drive_link
-https://drive.google.com/file/d/17ls2PPN-Pi3tEuK059cwV2_iDT8aGhOO/view?usp=drive_link
-https://drive.google.com/file/d/1LWsg6PmCT00Kv_N_slrmcwKmQPGoBT3k/view?usp=drive_link
-https://drive.google.com/file/d/12LckrchoHTUVH7rxi8J7zD9dA19GXvoW/view?usp=drive_link
-https://drive.google.com/file/d/1VqrJKjAIkj5gtFXL69grdSeu9CyaqnSw/view?usp=drive_link
-https://drive.google.com/file/d/1g5rQYDBZvW-kUtYPeyF3qmd53v6k7kXu/view?usp=drive_link
-https://drive.google.com/file/d/10kUgaSJ0TS7teaG83G3Rf_DG4XGrBt6A/view?usp=drive_link
-https://drive.google.com/file/d/1je9XmneZQZvTma5adMJICUPDovW3ppei/view?usp=drive_link
-https://drive.google.com/file/d/1v28r6bedwZGbUPVVTVImXhK-42XdtGfj/view?usp=drive_link
-https://drive.google.com/file/d/1-TEEx9sGVvzMMaNXYfQMtY2JJ6cvl0dT/view?usp=drive_link
-https://drive.google.com/file/d/1YdBKdJFP9rJWBUX7qrOYL_gfUA8o6J9M/view?usp=drive_link
-https://drive.google.com/file/d/1X9vffwQHNUSKLXr2RlYNtbWDIFCIDfdF/view?usp=drive_link
-https://drive.google.com/file/d/11hqesqa5kvEe5FABUnZRcvmOhR373cYM/view?usp=drive_link
-https://drive.google.com/file/d/1ltTTECjEcbQPgS3UPRgMzaE2x9n6H7dC/view?usp=drive_link
-https://drive.google.com/file/d/1Zxqfa29JdwT-bfMpivi6IG2vz34d21dD/view?usp=drive_link
-https://drive.google.com/file/d/11LQlVxS5hz494dYUJ_PNRPx2NHIJbQns/view?usp=drive_link
-https://drive.google.com/file/d/1i1JhNtnZpO_E8rAv8gxBP3ZTZRvcvsZi/view?usp=drive_link
-https://drive.google.com/file/d/11jOXAr2EULUO4Qkm748634lg4UUFho5U/view?usp=drive_link
-https://drive.google.com/file/d/1rj67wur8DdB_Pipwx24bY43xu4X1eQ5e/view?usp=drive_link
-https://drive.google.com/file/d/15ZTm6lO6f_JQy_4SNfrOu3iPYn1Ro8mh/view?usp=drive_link
-https://drive.google.com/file/d/1q4gBtqWPJtCwXEvknGgN0WHGp7Vfn1b9/view?usp=drive_link
-https://drive.google.com/file/d/1t17keyre47AYqm8GgXiQ7EcvcUkeSiDQ/view?usp=drive_link
-https://drive.google.com/file/d/1OYUPGxtZgOF86Ng_BEOTXm_XOYpuQPsO/view?usp=drive_link
-https://drive.google.com/file/d/1cBjbGHi3dwWHtx6r9EQJi0JT_CE3LuHt/view?usp=drive_link
-https://drive.google.com/file/d/14qaMyF0mcbCB-fCYKNyo5_2NahSC6D5u/view?usp=drive_link
-https://drive.google.com/file/d/12FgX86eA7Y5co9ULBVK80XMsiKQSs-Ri/view?usp=drive_link
-https://drive.google.com/file/d/1yvoHWidf-jdBVw6qCCXOFfkVwKj_2hPk/view?usp=drive_link
-https://drive.google.com/file/d/1a2SugsSDlC8UtUrFzp-_KAwyZckQOvdQ/view?usp=drive_link
-https://drive.google.com/file/d/1l8pILBFSAosypWJMza2K09Vm7rug9axm/view?usp=drive_link
-https://drive.google.com/file/d/1hfPQ8dBCk97PnOhq6_MIISm3IEzcOxJG/view?usp=drive_link
-https://drive.google.com/file/d/1PPAUwlJCFKpms8cqF_k1v2_fCgDBOc3S/view?usp=drive_link
-https://drive.google.com/file/d/1lVKQZeqFfK3amEmLuFhYLUFQ2eyE8rOW/view?usp=drive_link
-https://drive.google.com/file/d/1K9iPMLfDowcIFoyzpvgn88dQ6x6kVwNG/view?usp=drive_link
-https://drive.google.com/file/d/1PNvMqG9tL7QxeLaYBGHiWYR6SYb5iIct/view?usp=drive_link
-https://drive.google.com/file/d/1xkRtzbvIkUsylx9hrFLGQsJn0h1EYu-5/view?usp=drive_link
-https://drive.google.com/file/d/1nxMRrJlSayjDIfr5CmHO1NzAw3COhsLi/view?usp=drive_link
-https://drive.google.com/file/d/1Qs3WEyMGrmagiHIkkFEueWNnJhkUeR1s/view?usp=drive_link
-https://drive.google.com/file/d/1D-G2_Q0SS3M8zyJbg_XzkF2ANPw1HTuX/view?usp=drive_link
-https://drive.google.com/file/d/1mdmJsDGO-YtJAOF_yPKl6lq4PJOIbQhT/view?usp=drive_link
-https://drive.google.com/file/d/11m9bwfop_sPmnQr_8amB6EEsrbAeG_z5/view?usp=drive_link
-https://drive.google.com/file/d/19tyYt5FMn5kru0g9o2nMJhKPnsDqkIZv/view?usp=drive_link
-https://drive.google.com/file/d/1XvTpUdsVTZ-vydvdYYmynbma--HfUGSl/view?usp=drive_link
-https://drive.google.com/file/d/1MO3hFu68J6NohTzr9aB_fY02VA6QSOqj/view?usp=drive_link
-https://drive.google.com/file/d/1Lh-UjwAk__04YOTWINF_QGVU8SjetVaY/view?usp=drive_link
-https://drive.google.com/file/d/1jkSOUwZV5GJ7rZlVeErjcu0DBQs8Np0d/view?usp=drive_link
-https://drive.google.com/file/d/1VIN1eLI-93WrVQwCjsv6XQr353DqqBYA/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_ziploc_slide.txt
+++ b/lerobot/common/datasets/push_dataset_to_hub/_aloha_raw_urls/static_ziploc_slide.txt
@@ -1,8 +0,0 @@
-https://drive.google.com/drive/folders/1EgKar7rWBmTIRmeJYZciSwjZx3uP2mHO
-https://drive.google.com/file/d/12eYWQO15atK2hBjXhynPJd9MKAj_42pz/view?usp=drive_link
-https://drive.google.com/file/d/1Ul4oEeICJDjgfYTl4H1uaisTzVYIM6wd/view?usp=drive_link
-https://drive.google.com/file/d/1WSF-OG8lKSe2wVYCv5D1aJNipxpgddk-/view?usp=drive_link
-https://drive.google.com/file/d/1_ppD5j5sFh26aWW0JmhLzJMeNB-lCArk/view?usp=drive_link
-https://drive.google.com/file/d/1WUp846dgWXYhu4oJfhHxiU6YL_7N6s4W/view?usp=drive_link
-https://drive.google.com/file/d/1HRZNAIoAQw_uYiPwnBvtBioQoqiqoXdA/view?usp=drive_link
-https://drive.google.com/file/d/1hedGq-QDMnIn8GlXXBC3GiEJ_Y-LTxyt/view?usp=drive_link
--- a/lerobot/common/datasets/push_dataset_to_hub/_diffusion_policy_replay_buffer.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_diffusion_policy_replay_buffer.py
@@ -1,634 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Helper code for loading PushT dataset from Diffusion Policy (https://diffusion-policy.cs.columbia.edu/)
-
-Copied from the original Diffusion Policy repository and used in our `download_and_upload_dataset.py` script.
-"""
-
-from __future__ import annotations
-
-import math
-import numbers
-import os
-from functools import cached_property
-
-import numcodecs
-import numpy as np
-import zarr
-
-
-def check_chunks_compatible(chunks: tuple, shape: tuple):
-    assert len(shape) == len(chunks)
-    for c in chunks:
-        assert isinstance(c, numbers.Integral)
-        assert c > 0
-
-
-def rechunk_recompress_array(group, name, chunks=None, chunk_length=None, compressor=None, tmp_key="_temp"):
-    old_arr = group[name]
-    if chunks is None:
-        chunks = (chunk_length,) + old_arr.chunks[1:] if chunk_length is not None else old_arr.chunks
-    check_chunks_compatible(chunks, old_arr.shape)
-
-    if compressor is None:
-        compressor = old_arr.compressor
-
-    if (chunks == old_arr.chunks) and (compressor == old_arr.compressor):
-        # no change
-        return old_arr
-
-    # rechunk recompress
-    group.move(name, tmp_key)
-    old_arr = group[tmp_key]
-    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-        source=old_arr,
-        dest=group,
-        name=name,
-        chunks=chunks,
-        compressor=compressor,
-    )
-    del group[tmp_key]
-    arr = group[name]
-    return arr
-
-
-def get_optimal_chunks(shape, dtype, target_chunk_bytes=2e6, max_chunk_length=None):
-    """
-    Common shapes
-    T,D
-    T,N,D
-    T,H,W,C
-    T,N,H,W,C
-    """
-    itemsize = np.dtype(dtype).itemsize
-    # reversed
-    rshape = list(shape[::-1])
-    if max_chunk_length is not None:
-        rshape[-1] = int(max_chunk_length)
-    split_idx = len(shape) - 1
-    for i in range(len(shape) - 1):
-        this_chunk_bytes = itemsize * np.prod(rshape[:i])
-        next_chunk_bytes = itemsize * np.prod(rshape[: i + 1])
-        if this_chunk_bytes <= target_chunk_bytes and next_chunk_bytes > target_chunk_bytes:
-            split_idx = i
-
-    rchunks = rshape[:split_idx]
-    item_chunk_bytes = itemsize * np.prod(rshape[:split_idx])
-    this_max_chunk_length = rshape[split_idx]
-    next_chunk_length = min(this_max_chunk_length, math.ceil(target_chunk_bytes / item_chunk_bytes))
-    rchunks.append(next_chunk_length)
-    len_diff = len(shape) - len(rchunks)
-    rchunks.extend([1] * len_diff)
-    chunks = tuple(rchunks[::-1])
-    # print(np.prod(chunks) * itemsize / target_chunk_bytes)
-    return chunks
-
-
-class ReplayBuffer:
-    """
-    Zarr-based temporal datastructure.
-    Assumes first dimension to be time. Only chunk in time dimension.
-    """
-
-    def __init__(self, root: zarr.Group | dict[str, dict]):
-        """
-        Dummy constructor. Use copy_from* and create_from* class methods instead.
-        """
-        assert "data" in root
-        assert "meta" in root
-        assert "episode_ends" in root["meta"]
-        for value in root["data"].values():
-            assert value.shape[0] == root["meta"]["episode_ends"][-1]
-        self.root = root
-
-    # ============= create constructors ===============
-    @classmethod
-    def create_empty_zarr(cls, storage=None, root=None):
-        if root is None:
-            if storage is None:
-                storage = zarr.MemoryStore()
-            root = zarr.group(store=storage)
-        root.require_group("data", overwrite=False)
-        meta = root.require_group("meta", overwrite=False)
-        if "episode_ends" not in meta:
-            meta.zeros("episode_ends", shape=(0,), dtype=np.int64, compressor=None, overwrite=False)
-        return cls(root=root)
-
-    @classmethod
-    def create_empty_numpy(cls):
-        root = {"data": {}, "meta": {"episode_ends": np.zeros((0,), dtype=np.int64)}}
-        return cls(root=root)
-
-    @classmethod
-    def create_from_group(cls, group, **kwargs):
-        if "data" not in group:
-            # create from stratch
-            buffer = cls.create_empty_zarr(root=group, **kwargs)
-        else:
-            # already exist
-            buffer = cls(root=group, **kwargs)
-        return buffer
-
-    @classmethod
-    def create_from_path(cls, zarr_path, mode="r", **kwargs):
-        """
-        Open a on-disk zarr directly (for dataset larger than memory).
-        Slower.
-        """
-        group = zarr.open(os.path.expanduser(zarr_path), mode)
-        return cls.create_from_group(group, **kwargs)
-
-    # ============= copy constructors ===============
-    @classmethod
-    def copy_from_store(
-        cls,
-        src_store,
-        store=None,
-        keys=None,
-        chunks: dict[str, tuple] | None = None,
-        compressors: dict | str | numcodecs.abc.Codec | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        """
-        Load to memory.
-        """
-        src_root = zarr.group(src_store)
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        root = None
-        if store is None:
-            # numpy backend
-            meta = {}
-            for key, value in src_root["meta"].items():
-                if len(value.shape) == 0:
-                    meta[key] = np.array(value)
-                else:
-                    meta[key] = value[:]
-
-            if keys is None:
-                keys = src_root["data"].keys()
-            data = {}
-            for key in keys:
-                arr = src_root["data"][key]
-                data[key] = arr[:]
-
-            root = {"meta": meta, "data": data}
-        else:
-            root = zarr.group(store=store)
-            # copy without recompression
-            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                source=src_store, dest=store, source_path="/meta", dest_path="/meta", if_exists=if_exists
-            )
-            data_group = root.create_group("data", overwrite=True)
-            if keys is None:
-                keys = src_root["data"].keys()
-            for key in keys:
-                value = src_root["data"][key]
-                cks = cls._resolve_array_chunks(chunks=chunks, key=key, array=value)
-                cpr = cls._resolve_array_compressor(compressors=compressors, key=key, array=value)
-                if cks == value.chunks and cpr == value.compressor:
-                    # copy without recompression
-                    this_path = "/data/" + key
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                        source=src_store,
-                        dest=store,
-                        source_path=this_path,
-                        dest_path=this_path,
-                        if_exists=if_exists,
-                    )
-                else:
-                    # copy with recompression
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-                        source=value,
-                        dest=data_group,
-                        name=key,
-                        chunks=cks,
-                        compressor=cpr,
-                        if_exists=if_exists,
-                    )
-        buffer = cls(root=root)
-        return buffer
-
-    @classmethod
-    def copy_from_path(
-        cls,
-        zarr_path,
-        backend=None,
-        store=None,
-        keys=None,
-        chunks: dict[str, tuple] | None = None,
-        compressors: dict | str | numcodecs.abc.Codec | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        """
-        Copy a on-disk zarr to in-memory compressed.
-        Recommended
-        """
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        if backend == "numpy":
-            print("backend argument is deprecated!")
-            store = None
-        group = zarr.open(os.path.expanduser(zarr_path), "r")
-        return cls.copy_from_store(
-            src_store=group.store,
-            store=store,
-            keys=keys,
-            chunks=chunks,
-            compressors=compressors,
-            if_exists=if_exists,
-            **kwargs,
-        )
-
-    # ============= save methods ===============
-    def save_to_store(
-        self,
-        store,
-        chunks: dict[str, tuple] | None = None,
-        compressors: str | numcodecs.abc.Codec | dict | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        root = zarr.group(store)
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        if self.backend == "zarr":
-            # recompression free copy
-            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                source=self.root.store,
-                dest=store,
-                source_path="/meta",
-                dest_path="/meta",
-                if_exists=if_exists,
-            )
-        else:
-            meta_group = root.create_group("meta", overwrite=True)
-            # save meta, no chunking
-            for key, value in self.root["meta"].items():
-                _ = meta_group.array(name=key, data=value, shape=value.shape, chunks=value.shape)
-
-        # save data, chunk
-        data_group = root.create_group("data", overwrite=True)
-        for key, value in self.root["data"].items():
-            cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
-            cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
-            if isinstance(value, zarr.Array):
-                if cks == value.chunks and cpr == value.compressor:
-                    # copy without recompression
-                    this_path = "/data/" + key
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                        source=self.root.store,
-                        dest=store,
-                        source_path=this_path,
-                        dest_path=this_path,
-                        if_exists=if_exists,
-                    )
-                else:
-                    # copy with recompression
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-                        source=value,
-                        dest=data_group,
-                        name=key,
-                        chunks=cks,
-                        compressor=cpr,
-                        if_exists=if_exists,
-                    )
-            else:
-                # numpy
-                _ = data_group.array(name=key, data=value, chunks=cks, compressor=cpr)
-        return store
-
-    def save_to_path(
-        self,
-        zarr_path,
-        chunks: dict[str, tuple] | None = None,
-        compressors: str | numcodecs.abc.Codec | dict | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        store = zarr.DirectoryStore(os.path.expanduser(zarr_path))
-        return self.save_to_store(
-            store, chunks=chunks, compressors=compressors, if_exists=if_exists, **kwargs
-        )
-
-    @staticmethod
-    def resolve_compressor(compressor="default"):
-        if compressor == "default":
-            compressor = numcodecs.Blosc(cname="lz4", clevel=5, shuffle=numcodecs.Blosc.NOSHUFFLE)
-        elif compressor == "disk":
-            compressor = numcodecs.Blosc("zstd", clevel=5, shuffle=numcodecs.Blosc.BITSHUFFLE)
-        return compressor
-
-    @classmethod
-    def _resolve_array_compressor(cls, compressors: dict | str | numcodecs.abc.Codec, key, array):
-        # allows compressor to be explicitly set to None
-        cpr = "nil"
-        if isinstance(compressors, dict):
-            if key in compressors:
-                cpr = cls.resolve_compressor(compressors[key])
-            elif isinstance(array, zarr.Array):
-                cpr = array.compressor
-        else:
-            cpr = cls.resolve_compressor(compressors)
-        # backup default
-        if cpr == "nil":
-            cpr = cls.resolve_compressor("default")
-        return cpr
-
-    @classmethod
-    def _resolve_array_chunks(cls, chunks: dict | tuple, key, array):
-        cks = None
-        if isinstance(chunks, dict):
-            if key in chunks:
-                cks = chunks[key]
-            elif isinstance(array, zarr.Array):
-                cks = array.chunks
-        elif isinstance(chunks, tuple):
-            cks = chunks
-        else:
-            raise TypeError(f"Unsupported chunks type {type(chunks)}")
-        # backup default
-        if cks is None:
-            cks = get_optimal_chunks(shape=array.shape, dtype=array.dtype)
-        # check
-        check_chunks_compatible(chunks=cks, shape=array.shape)
-        return cks
-
-    # ============= properties =================
-    @cached_property
-    def data(self):
-        return self.root["data"]
-
-    @cached_property
-    def meta(self):
-        return self.root["meta"]
-
-    def update_meta(self, data):
-        # sanitize data
-        np_data = {}
-        for key, value in data.items():
-            if isinstance(value, np.ndarray):
-                np_data[key] = value
-            else:
-                arr = np.array(value)
-                if arr.dtype == object:
-                    raise TypeError(f"Invalid value type {type(value)}")
-                np_data[key] = arr
-
-        meta_group = self.meta
-        if self.backend == "zarr":
-            for key, value in np_data.items():
-                _ = meta_group.array(
-                    name=key, data=value, shape=value.shape, chunks=value.shape, overwrite=True
-                )
-        else:
-            meta_group.update(np_data)
-
-        return meta_group
-
-    @property
-    def episode_ends(self):
-        return self.meta["episode_ends"]
-
-    def get_episode_idxs(self):
-        import numba
-
-        numba.jit(nopython=True)
-
-        def _get_episode_idxs(episode_ends):
-            result = np.zeros((episode_ends[-1],), dtype=np.int64)
-            for i in range(len(episode_ends)):
-                start = 0
-                if i > 0:
-                    start = episode_ends[i - 1]
-                end = episode_ends[i]
-                for idx in range(start, end):
-                    result[idx] = i
-            return result
-
-        return _get_episode_idxs(self.episode_ends)
-
-    @property
-    def backend(self):
-        backend = "numpy"
-        if isinstance(self.root, zarr.Group):
-            backend = "zarr"
-        return backend
-
-    # =========== dict-like API ==============
-    def __repr__(self) -> str:
-        if self.backend == "zarr":
-            return str(self.root.tree())
-        else:
-            return super().__repr__()
-
-    def keys(self):
-        return self.data.keys()
-
-    def values(self):
-        return self.data.values()
-
-    def items(self):
-        return self.data.items()
-
-    def __getitem__(self, key):
-        return self.data[key]
-
-    def __contains__(self, key):
-        return key in self.data
-
-    # =========== our API ==============
-    @property
-    def n_steps(self):
-        if len(self.episode_ends) == 0:
-            return 0
-        return self.episode_ends[-1]
-
-    @property
-    def n_episodes(self):
-        return len(self.episode_ends)
-
-    @property
-    def chunk_size(self):
-        if self.backend == "zarr":
-            return next(iter(self.data.arrays()))[-1].chunks[0]
-        return None
-
-    @property
-    def episode_lengths(self):
-        ends = self.episode_ends[:]
-        ends = np.insert(ends, 0, 0)
-        lengths = np.diff(ends)
-        return lengths
-
-    def add_episode(
-        self,
-        data: dict[str, np.ndarray],
-        chunks: dict[str, tuple] | None = None,
-        compressors: str | numcodecs.abc.Codec | dict | None = None,
-    ):
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        assert len(data) > 0
-        is_zarr = self.backend == "zarr"
-
-        curr_len = self.n_steps
-        episode_length = None
-        for value in data.values():
-            assert len(value.shape) >= 1
-            if episode_length is None:
-                episode_length = len(value)
-            else:
-                assert episode_length == len(value)
-        new_len = curr_len + episode_length
-
-        for key, value in data.items():
-            new_shape = (new_len,) + value.shape[1:]
-            # create array
-            if key not in self.data:
-                if is_zarr:
-                    cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
-                    cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
-                    arr = self.data.zeros(
-                        name=key, shape=new_shape, chunks=cks, dtype=value.dtype, compressor=cpr
-                    )
-                else:
-                    # copy data to prevent modify
-                    arr = np.zeros(shape=new_shape, dtype=value.dtype)
-                    self.data[key] = arr
-            else:
-                arr = self.data[key]
-                assert value.shape[1:] == arr.shape[1:]
-                # same method for both zarr and numpy
-                if is_zarr:
-                    arr.resize(new_shape)
-                else:
-                    arr.resize(new_shape, refcheck=False)
-            # copy data
-            arr[-value.shape[0] :] = value
-
-        # append to episode ends
-        episode_ends = self.episode_ends
-        if is_zarr:
-            episode_ends.resize(episode_ends.shape[0] + 1)
-        else:
-            episode_ends.resize(episode_ends.shape[0] + 1, refcheck=False)
-        episode_ends[-1] = new_len
-
-        # rechunk
-        if is_zarr and episode_ends.chunks[0] < episode_ends.shape[0]:
-            rechunk_recompress_array(self.meta, "episode_ends", chunk_length=int(episode_ends.shape[0] * 1.5))
-
-    def drop_episode(self):
-        is_zarr = self.backend == "zarr"
-        episode_ends = self.episode_ends[:].copy()
-        assert len(episode_ends) > 0
-        start_idx = 0
-        if len(episode_ends) > 1:
-            start_idx = episode_ends[-2]
-        for value in self.data.values():
-            new_shape = (start_idx,) + value.shape[1:]
-            if is_zarr:
-                value.resize(new_shape)
-            else:
-                value.resize(new_shape, refcheck=False)
-        if is_zarr:
-            self.episode_ends.resize(len(episode_ends) - 1)
-        else:
-            self.episode_ends.resize(len(episode_ends) - 1, refcheck=False)
-
-    def pop_episode(self):
-        assert self.n_episodes > 0
-        episode = self.get_episode(self.n_episodes - 1, copy=True)
-        self.drop_episode()
-        return episode
-
-    def extend(self, data):
-        self.add_episode(data)
-
-    def get_episode(self, idx, copy=False):
-        idx = list(range(len(self.episode_ends)))[idx]
-        start_idx = 0
-        if idx > 0:
-            start_idx = self.episode_ends[idx - 1]
-        end_idx = self.episode_ends[idx]
-        result = self.get_steps_slice(start_idx, end_idx, copy=copy)
-        return result
-
-    def get_episode_slice(self, idx):
-        start_idx = 0
-        if idx > 0:
-            start_idx = self.episode_ends[idx - 1]
-        end_idx = self.episode_ends[idx]
-        return slice(start_idx, end_idx)
-
-    def get_steps_slice(self, start, stop, step=None, copy=False):
-        _slice = slice(start, stop, step)
-
-        result = {}
-        for key, value in self.data.items():
-            x = value[_slice]
-            if copy and isinstance(value, np.ndarray):
-                x = x.copy()
-            result[key] = x
-        return result
-
-    # =========== chunking =============
-    def get_chunks(self) -> dict:
-        assert self.backend == "zarr"
-        chunks = {}
-        for key, value in self.data.items():
-            chunks[key] = value.chunks
-        return chunks
-
-    def set_chunks(self, chunks: dict):
-        assert self.backend == "zarr"
-        for key, value in chunks.items():
-            if key in self.data:
-                arr = self.data[key]
-                if value != arr.chunks:
-                    check_chunks_compatible(chunks=value, shape=arr.shape)
-                    rechunk_recompress_array(self.data, key, chunks=value)
-
-    def get_compressors(self) -> dict:
-        assert self.backend == "zarr"
-        compressors = {}
-        for key, value in self.data.items():
-            compressors[key] = value.compressor
-        return compressors
-
-    def set_compressors(self, compressors: dict):
-        assert self.backend == "zarr"
-        for key, value in compressors.items():
-            if key in self.data:
-                arr = self.data[key]
-                compressor = self.resolve_compressor(value)
-                if compressor != arr.compressor:
-                    rechunk_recompress_array(self.data, key, compressor=compressor)
--- a/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
@@ -1,169 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This file contains all obsolete download scripts. They are centralized here to not have to load
-useless dependencies when using datasets.
-"""
-
-import io
-import logging
-import shutil
-from pathlib import Path
-
-import tqdm
-from huggingface_hub import snapshot_download
-
-
-def download_raw(raw_dir, dataset_id):
-    if "aloha" in dataset_id or "image" in dataset_id:
-        download_hub(raw_dir, dataset_id)
-    elif "pusht" in dataset_id:
-        download_pusht(raw_dir)
-    elif "xarm" in dataset_id:
-        download_xarm(raw_dir)
-    elif "umi" in dataset_id:
-        download_umi(raw_dir)
-    else:
-        raise ValueError(dataset_id)
-
-
-def download_and_extract_zip(url: str, destination_folder: Path) -> bool:
-    import zipfile
-
-    import requests
-
-    print(f"downloading from {url}")
-    response = requests.get(url, stream=True)
-    if response.status_code == 200:
-        total_size = int(response.headers.get("content-length", 0))
-        progress_bar = tqdm.tqdm(total=total_size, unit="B", unit_scale=True)
-
-        zip_file = io.BytesIO()
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                zip_file.write(chunk)
-                progress_bar.update(len(chunk))
-
-        progress_bar.close()
-
-        zip_file.seek(0)
-
-        with zipfile.ZipFile(zip_file, "r") as zip_ref:
-            zip_ref.extractall(destination_folder)
-
-
-def download_pusht(raw_dir: str):
-    pusht_url = "https://diffusion-policy.cs.columbia.edu/data/training/pusht.zip"
-
-    raw_dir = Path(raw_dir)
-    raw_dir.mkdir(parents=True, exist_ok=True)
-    download_and_extract_zip(pusht_url, raw_dir)
-    # file is created inside a useful "pusht" directory, so we move it out and delete the dir
-    zarr_path = raw_dir / "pusht_cchi_v7_replay.zarr"
-    shutil.move(raw_dir / "pusht" / "pusht_cchi_v7_replay.zarr", zarr_path)
-    shutil.rmtree(raw_dir / "pusht")
-
-
-def download_xarm(raw_dir: Path):
-    """Download all xarm datasets at once"""
-    import zipfile
-
-    import gdown
-
-    raw_dir = Path(raw_dir)
-    raw_dir.mkdir(parents=True, exist_ok=True)
-    # from https://github.com/fyhMer/fowm/blob/main/scripts/download_datasets.py
-    url = "https://drive.google.com/uc?id=1nhxpykGtPDhmQKm-_B8zBSywVRdgeVya"
-    zip_path = raw_dir / "data.zip"
-    gdown.download(url, str(zip_path), quiet=False)
-    print("Extracting...")
-    with zipfile.ZipFile(str(zip_path), "r") as zip_f:
-        for pkl_path in zip_f.namelist():
-            if pkl_path.startswith("data/xarm") and pkl_path.endswith(".pkl"):
-                zip_f.extract(member=pkl_path)
-                # move to corresponding raw directory
-                extract_dir = pkl_path.replace("/buffer.pkl", "")
-                raw_pkl_path = raw_dir / "buffer.pkl"
-                shutil.move(pkl_path, raw_pkl_path)
-                shutil.rmtree(extract_dir)
-    zip_path.unlink()
-
-
-def download_hub(raw_dir: Path, dataset_id: str):
-    raw_dir = Path(raw_dir)
-    raw_dir.mkdir(parents=True, exist_ok=True)
-
-    logging.info(f"Start downloading from huggingface.co/cadene for {dataset_id}")
-    snapshot_download(f"cadene/{dataset_id}_raw", repo_type="dataset", local_dir=raw_dir)
-    logging.info(f"Finish downloading from huggingface.co/cadene for {dataset_id}")
-
-
-def download_umi(raw_dir: Path):
-    url_cup_in_the_wild = "https://real.stanford.edu/umi/data/zarr_datasets/cup_in_the_wild.zarr.zip"
-    zarr_path = raw_dir / "cup_in_the_wild.zarr"
-
-    raw_dir = Path(raw_dir)
-    raw_dir.mkdir(parents=True, exist_ok=True)
-    download_and_extract_zip(url_cup_in_the_wild, zarr_path)
-
-
-if __name__ == "__main__":
-    data_dir = Path("data")
-    dataset_ids = [
-        "pusht_image",
-        "xarm_lift_medium_image",
-        "xarm_lift_medium_replay_image",
-        "xarm_push_medium_image",
-        "xarm_push_medium_replay_image",
-        "aloha_sim_insertion_human_image",
-        "aloha_sim_insertion_scripted_image",
-        "aloha_sim_transfer_cube_human_image",
-        "aloha_sim_transfer_cube_scripted_image",
-        "pusht",
-        "xarm_lift_medium",
-        "xarm_lift_medium_replay",
-        "xarm_push_medium",
-        "xarm_push_medium_replay",
-        "aloha_sim_insertion_human",
-        "aloha_sim_insertion_scripted",
-        "aloha_sim_transfer_cube_human",
-        "aloha_sim_transfer_cube_scripted",
-        "aloha_mobile_cabinet",
-        "aloha_mobile_chair",
-        "aloha_mobile_elevator",
-        "aloha_mobile_shrimp",
-        "aloha_mobile_wash_pan",
-        "aloha_mobile_wipe_wine",
-        "aloha_static_battery",
-        "aloha_static_candy",
-        "aloha_static_coffee",
-        "aloha_static_coffee_new",
-        "aloha_static_cups_open",
-        "aloha_static_fork_pick_up",
-        "aloha_static_pingpong_test",
-        "aloha_static_pro_pencil",
-        "aloha_static_screw_driver",
-        "aloha_static_tape",
-        "aloha_static_thread_velcro",
-        "aloha_static_towel",
-        "aloha_static_vinh_cup",
-        "aloha_static_vinh_cup_left",
-        "aloha_static_ziploc_slide",
-        "umi_cup_in_the_wild",
-    ]
-    for dataset_id in dataset_ids:
-        raw_dir = data_dir / f"{dataset_id}_raw"
-        download_raw(raw_dir, dataset_id)
--- a/lerobot/common/datasets/push_dataset_to_hub/_umi_imagecodecs_numcodecs.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_umi_imagecodecs_numcodecs.py
@@ -1,326 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# imagecodecs/numcodecs.py
-
-# Copyright (c) 2021-2022, Christoph Gohlke
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice,
-#    this list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-
-# Copied from: https://github.com/real-stanford/universal_manipulation_interface/blob/298776ce251f33b6b3185a98d6e7d1f9ad49168b/diffusion_policy/codecs/imagecodecs_numcodecs.py#L1
-"""Additional numcodecs implemented using imagecodecs."""
-
-__version__ = "2022.9.26"
-
-__all__ = ("register_codecs",)
-
-import imagecodecs
-import numpy
-from numcodecs.abc import Codec
-from numcodecs.registry import get_codec, register_codec
-
-# TODO (azouitine): Remove useless codecs
-
-
-def protective_squeeze(x: numpy.ndarray):
-    """
-    Squeeze dim only if it's not the last dim.
-    Image dim expected to be *, H, W, C
-    """
-    img_shape = x.shape[-3:]
-    if len(x.shape) > 3:
-        n_imgs = numpy.prod(x.shape[:-3])
-        if n_imgs > 1:
-            img_shape = (-1,) + img_shape
-    return x.reshape(img_shape)
-
-
-def get_default_image_compressor(**kwargs):
-    if imagecodecs.JPEGXL:
-        # has JPEGXL
-        this_kwargs = {
-            "effort": 3,
-            "distance": 0.3,
-            # bug in libjxl, invalid codestream for non-lossless
-            # when decoding speed > 1
-            "decodingspeed": 1,
-        }
-        this_kwargs.update(kwargs)
-        return JpegXl(**this_kwargs)
-    else:
-        this_kwargs = {"level": 50}
-        this_kwargs.update(kwargs)
-        return Jpeg2k(**this_kwargs)
-
-
-class Jpeg2k(Codec):
-    """JPEG 2000 codec for numcodecs."""
-
-    codec_id = "imagecodecs_jpeg2k"
-
-    def __init__(
-        self,
-        level=None,
-        codecformat=None,
-        colorspace=None,
-        tile=None,
-        reversible=None,
-        bitspersample=None,
-        resolutions=None,
-        numthreads=None,
-        verbose=0,
-    ):
-        self.level = level
-        self.codecformat = codecformat
-        self.colorspace = colorspace
-        self.tile = None if tile is None else tuple(tile)
-        self.reversible = reversible
-        self.bitspersample = bitspersample
-        self.resolutions = resolutions
-        self.numthreads = numthreads
-        self.verbose = verbose
-
-    def encode(self, buf):
-        buf = protective_squeeze(numpy.asarray(buf))
-        return imagecodecs.jpeg2k_encode(
-            buf,
-            level=self.level,
-            codecformat=self.codecformat,
-            colorspace=self.colorspace,
-            tile=self.tile,
-            reversible=self.reversible,
-            bitspersample=self.bitspersample,
-            resolutions=self.resolutions,
-            numthreads=self.numthreads,
-            verbose=self.verbose,
-        )
-
-    def decode(self, buf, out=None):
-        return imagecodecs.jpeg2k_decode(buf, verbose=self.verbose, numthreads=self.numthreads, out=out)
-
-
-class JpegXl(Codec):
-    """JPEG XL codec for numcodecs."""
-
-    codec_id = "imagecodecs_jpegxl"
-
-    def __init__(
-        self,
-        # encode
-        level=None,
-        effort=None,
-        distance=None,
-        lossless=None,
-        decodingspeed=None,
-        photometric=None,
-        planar=None,
-        usecontainer=None,
-        # decode
-        index=None,
-        keeporientation=None,
-        # both
-        numthreads=None,
-    ):
-        """
-        Return JPEG XL image from numpy array.
-        Float must be in nominal range 0..1.
-
-        Currently L, LA, RGB, RGBA images are supported in contig mode.
-        Extra channels are only supported for grayscale images in planar mode.
-
-        Parameters
-        ----------
-        level : Default to None, i.e. not overwriting lossess and decodingspeed options.
-            When < 0: Use lossless compression
-            When in [0,1,2,3,4]: Sets the decoding speed tier for the provided options.
-                Minimum is 0 (slowest to decode, best quality/density), and maximum
-                is 4 (fastest to decode, at the cost of some quality/density).
-        effort : Default to 3.
-            Sets encoder effort/speed level without affecting decoding speed.
-            Valid values are, from faster to slower speed: 1:lightning 2:thunder
-                3:falcon 4:cheetah 5:hare 6:wombat 7:squirrel 8:kitten 9:tortoise.
-            Speed: lightning, thunder, falcon, cheetah, hare, wombat, squirrel, kitten, tortoise
-            control the encoder effort in ascending order.
-            This also affects memory usage: using lower effort will typically reduce memory
-            consumption during encoding.
-            lightning and thunder are fast modes useful for lossless mode (modular).
-            falcon disables all of the following tools.
-            cheetah enables coefficient reordering, context clustering, and heuristics for selecting DCT sizes and quantization steps.
-            hare enables Gaborish filtering, chroma from luma, and an initial estimate of quantization steps.
-            wombat enables error diffusion quantization and full DCT size selection heuristics.
-            squirrel (default) enables dots, patches, and spline detection, and full context clustering.
-            kitten optimizes the adaptive quantization for a psychovisual metric.
-            tortoise enables a more thorough adaptive quantization search.
-        distance : Default to 1.0
-            Sets the distance level for lossy compression: target max butteraugli distance,
-            lower = higher quality. Range: 0 .. 15. 0.0 = mathematically lossless
-            (however, use JxlEncoderSetFrameLossless instead to use true lossless,
-            as setting distance to 0 alone is not the only requirement).
-            1.0 = visually lossless. Recommended range: 0.5 .. 3.0.
-        lossess : Default to False.
-            Use lossess encoding.
-        decodingspeed : Default to 0.
-            Duplicate to level. [0,4]
-        photometric : Return JxlColorSpace value.
-            Default logic is quite complicated but works most of the time.
-            Accepted value:
-                int: [-1,3]
-                str: ['RGB',
-                    'WHITEISZERO', 'MINISWHITE',
-                    'BLACKISZERO', 'MINISBLACK', 'GRAY',
-                    'XYB', 'KNOWN']
-        planar : Enable multi-channel mode.
-            Default to false.
-        usecontainer :
-            Forces the encoder to use the box-based container format (BMFF)
-            even when not necessary.
-            When using JxlEncoderUseBoxes, JxlEncoderStoreJPEGMetadata or
-            JxlEncoderSetCodestreamLevel with level 10, the encoder will
-            automatically also use the container format, it is not necessary
-            to use JxlEncoderUseContainer for those use cases.
-            By default this setting is disabled.
-        index : Selectively decode frames for animation.
-            Default to 0, decode all frames.
-            When set to > 0, decode that frame index only.
-        keeporientation :
-            Enables or disables preserving of as-in-bitstream pixeldata orientation.
-            Some images are encoded with an Orientation tag indicating that the
-            decoder must perform a rotation and/or mirroring to the encoded image data.
-
-            If skip_reorientation is JXL_FALSE (the default): the decoder will apply
-            the transformation from the orientation setting, hence rendering the image
-            according to its specified intent. When producing a JxlBasicInfo, the decoder
-            will always set the orientation field to JXL_ORIENT_IDENTITY (matching the
-            returned pixel data) and also align xsize and ysize so that they correspond
-            to the width and the height of the returned pixel data.
-
-            If skip_reorientation is JXL_TRUE: the decoder will skip applying the
-            transformation from the orientation setting, returning the image in
-            the as-in-bitstream pixeldata orientation. This may be faster to decode
-            since the decoder doesnt have to apply the transformation, but can
-            cause wrong display of the image if the orientation tag is not correctly
-            taken into account by the user.
-
-            By default, this option is disabled, and the returned pixel data is
-            re-oriented according to the images Orientation setting.
-        threads : Default to 1.
-            If <= 0, use all cores.
-            If > 32, clipped to 32.
-        """
-
-        self.level = level
-        self.effort = effort
-        self.distance = distance
-        self.lossless = bool(lossless)
-        self.decodingspeed = decodingspeed
-        self.photometric = photometric
-        self.planar = planar
-        self.usecontainer = usecontainer
-        self.index = index
-        self.keeporientation = keeporientation
-        self.numthreads = numthreads
-
-    def encode(self, buf):
-        # TODO: only squeeze all but last dim
-        buf = protective_squeeze(numpy.asarray(buf))
-        return imagecodecs.jpegxl_encode(
-            buf,
-            level=self.level,
-            effort=self.effort,
-            distance=self.distance,
-            lossless=self.lossless,
-            decodingspeed=self.decodingspeed,
-            photometric=self.photometric,
-            planar=self.planar,
-            usecontainer=self.usecontainer,
-            numthreads=self.numthreads,
-        )
-
-    def decode(self, buf, out=None):
-        return imagecodecs.jpegxl_decode(
-            buf,
-            index=self.index,
-            keeporientation=self.keeporientation,
-            numthreads=self.numthreads,
-            out=out,
-        )
-
-
-def _flat(out):
-    """Return numpy array as contiguous view of bytes if possible."""
-    if out is None:
-        return None
-    view = memoryview(out)
-    if view.readonly or not view.contiguous:
-        return None
-    return view.cast("B")
-
-
-def register_codecs(codecs=None, force=False, verbose=True):
-    """Register codecs in this module with numcodecs."""
-    for name, cls in globals().items():
-        if not hasattr(cls, "codec_id") or name == "Codec":
-            continue
-        if codecs is not None and cls.codec_id not in codecs:
-            continue
-        try:
-            try:  # noqa: SIM105
-                get_codec({"id": cls.codec_id})
-            except TypeError:
-                # registered, but failed
-                pass
-        except ValueError:
-            # not registered yet
-            pass
-        else:
-            if not force:
-                if verbose:
-                    log_warning(f"numcodec {cls.codec_id!r} already registered")
-                continue
-            if verbose:
-                log_warning(f"replacing registered numcodec {cls.codec_id!r}")
-        register_codec(cls)
-
-
-def log_warning(msg, *args, **kwargs):
-    """Log message with level WARNING."""
-    import logging
-
-    logging.getLogger(__name__).warning(msg, *args, **kwargs)
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_dora_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_dora_format.py
@@ -1,230 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Contains utilities to process raw data format from dora-record
-"""
-
-import logging
-import re
-from pathlib import Path
-
-import pandas as pd
-import torch
-from datasets import Dataset, Features, Image, Sequence, Value
-
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-from lerobot.common.datasets.video_utils import VideoFrame
-from lerobot.common.utils.utils import init_logging
-
-
-def check_format(raw_dir) -> bool:
-    assert raw_dir.exists()
-
-    leader_file = list(raw_dir.glob("*.parquet"))
-    if len(leader_file) == 0:
-        raise ValueError(f"Missing parquet files in '{raw_dir}'")
-    return True
-
-
-def load_from_raw(raw_dir: Path, out_dir: Path, fps: int):
-    # Load data stream that will be used as reference for the timestamps synchronization
-    reference_files = list(raw_dir.glob("observation.images.cam_*.parquet"))
-    if len(reference_files) == 0:
-        raise ValueError(f"Missing reference files for camera, starting with  in '{raw_dir}'")
-    # select first camera in alphanumeric order
-    reference_key = sorted(reference_files)[0].stem
-    reference_df = pd.read_parquet(raw_dir / f"{reference_key}.parquet")
-    reference_df = reference_df[["timestamp_utc", reference_key]]
-
-    # Merge all data stream using nearest backward strategy
-    df = reference_df
-    for path in raw_dir.glob("*.parquet"):
-        key = path.stem  # action or observation.state or ...
-        if key == reference_key:
-            continue
-        if "failed_episode_index" in key:
-            # TODO(rcadene): add support for removing episodes that are tagged as "failed"
-            continue
-        modality_df = pd.read_parquet(path)
-        modality_df = modality_df[["timestamp_utc", key]]
-        df = pd.merge_asof(
-            df,
-            modality_df,
-            on="timestamp_utc",
-            # "nearest" is the best option over "backward", since the latter can desynchronizes camera timestamps by
-            # matching timestamps that are too far appart, in order to fit the backward constraints. It's not the case for "nearest".
-            # However, note that "nearest" might synchronize the reference camera with other cameras on slightly future timestamps.
-            # are too far appart.
-            direction="nearest",
-            tolerance=pd.Timedelta(f"{1/fps} seconds"),
-        )
-    # Remove rows with episode_index -1 which indicates data that correspond to in-between episodes
-    df = df[df["episode_index"] != -1]
-
-    image_keys = [key for key in df if "observation.images." in key]
-
-    def get_episode_index(row):
-        episode_index_per_cam = {}
-        for key in image_keys:
-            path = row[key][0]["path"]
-            match = re.search(r"_(\d{6}).mp4", path)
-            if not match:
-                raise ValueError(path)
-            episode_index = int(match.group(1))
-            episode_index_per_cam[key] = episode_index
-        if len(set(episode_index_per_cam.values())) != 1:
-            raise ValueError(
-                f"All cameras are expected to belong to the same episode, but getting {episode_index_per_cam}"
-            )
-        return episode_index
-
-    df["episode_index"] = df.apply(get_episode_index, axis=1)
-
-    # dora only use arrays, so single values are encapsulated into a list
-    df["frame_index"] = df.groupby("episode_index").cumcount()
-    df = df.reset_index()
-    df["index"] = df.index
-
-    # set 'next.done' to True for the last frame of each episode
-    df["next.done"] = False
-    df.loc[df.groupby("episode_index").tail(1).index, "next.done"] = True
-
-    df["timestamp"] = df["timestamp_utc"].map(lambda x: x.timestamp())
-    # each episode starts with timestamp 0 to match the ones from the video
-    df["timestamp"] = df.groupby("episode_index")["timestamp"].transform(lambda x: x - x.iloc[0])
-
-    del df["timestamp_utc"]
-
-    # sanity check
-    has_nan = df.isna().any().any()
-    if has_nan:
-        raise ValueError("Dataset contains Nan values.")
-
-    # sanity check episode indices go from 0 to n-1
-    ep_ids = [ep_idx for ep_idx, _ in df.groupby("episode_index")]
-    expected_ep_ids = list(range(df["episode_index"].max() + 1))
-    if ep_ids != expected_ep_ids:
-        raise ValueError(f"Episodes indices go from {ep_ids} instead of {expected_ep_ids}")
-
-    # Create symlink to raw videos directory (that needs to be absolute not relative)
-    out_dir.mkdir(parents=True, exist_ok=True)
-    videos_dir = out_dir / "videos"
-    videos_dir.symlink_to((raw_dir / "videos").absolute())
-
-    # sanity check the video paths are well formated
-    for key in df:
-        if "observation.images." not in key:
-            continue
-        for ep_idx in ep_ids:
-            video_path = videos_dir / f"{key}_episode_{ep_idx:06d}.mp4"
-            if not video_path.exists():
-                raise ValueError(f"Video file not found in {video_path}")
-
-    data_dict = {}
-    for key in df:
-        # is video frame
-        if "observation.images." in key:
-            # we need `[0] because dora only use arrays, so single values are encapsulated into a list.
-            # it is the case for video_frame dictionary = [{"path": ..., "timestamp": ...}]
-            data_dict[key] = [video_frame[0] for video_frame in df[key].values]
-
-            # sanity check the video path is well formated
-            video_path = videos_dir.parent / data_dict[key][0]["path"]
-            if not video_path.exists():
-                raise ValueError(f"Video file not found in {video_path}")
-        # is number
-        elif df[key].iloc[0].ndim == 0 or df[key].iloc[0].shape[0] == 1:
-            data_dict[key] = torch.from_numpy(df[key].values)
-        # is vector
-        elif df[key].iloc[0].shape[0] > 1:
-            data_dict[key] = torch.stack([torch.from_numpy(x.copy()) for x in df[key].values])
-        else:
-            raise ValueError(key)
-
-    # Get the episode index containing for each unique episode index
-    first_ep_index_df = df.groupby("episode_index").agg(start_index=("index", "first")).reset_index()
-    from_ = first_ep_index_df["start_index"].tolist()
-    to_ = from_[1:] + [len(df)]
-    episode_data_index = {
-        "from": from_,
-        "to": to_,
-    }
-
-    return data_dict, episode_data_index
-
-
-def to_hf_dataset(data_dict, video) -> Dataset:
-    features = {}
-
-    keys = [key for key in data_dict if "observation.images." in key]
-    for key in keys:
-        if video:
-            features[key] = VideoFrame()
-        else:
-            features[key] = Image()
-
-    features["observation.state"] = Sequence(
-        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
-    )
-    if "observation.velocity" in data_dict:
-        features["observation.velocity"] = Sequence(
-            length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None)
-        )
-    if "observation.effort" in data_dict:
-        features["observation.effort"] = Sequence(
-            length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None)
-        )
-    features["action"] = Sequence(
-        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
-    )
-    features["episode_index"] = Value(dtype="int64", id=None)
-    features["frame_index"] = Value(dtype="int64", id=None)
-    features["timestamp"] = Value(dtype="float32", id=None)
-    features["next.done"] = Value(dtype="bool", id=None)
-    features["index"] = Value(dtype="int64", id=None)
-
-    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
-    hf_dataset.set_transform(hf_transform_to_torch)
-    return hf_dataset
-
-
-def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
-    init_logging()
-
-    if debug:
-        logging.warning("debug=True not implemented. Falling back to debug=False.")
-
-    # sanity check
-    check_format(raw_dir)
-
-    if fps is None:
-        fps = 30
-    else:
-        raise NotImplementedError()
-
-    if not video:
-        raise NotImplementedError()
-
-    data_df, episode_data_index = load_from_raw(raw_dir, out_dir, fps)
-    hf_dataset = to_hf_dataset(data_df, video)
-
-    info = {
-        "fps": fps,
-        "video": video,
-    }
-    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
@@ -1,214 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Contains utilities to process raw data format of HDF5 files like in: https://github.com/tonyzhaozh/act
-"""
-
-import gc
-import shutil
-from pathlib import Path
-
-import h5py
-import numpy as np
-import torch
-import tqdm
-from datasets import Dataset, Features, Image, Sequence, Value
-from PIL import Image as PILImage
-
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames
-
-
-def get_cameras(hdf5_data):
-    # ignore depth channel, not currently handled
-    # TODO(rcadene): add depth
-    rgb_cameras = [key for key in hdf5_data["/observations/images"].keys() if "depth" not in key]  # noqa: SIM118
-    return rgb_cameras
-
-
-def check_format(raw_dir) -> bool:
-    # only frames from simulation are uncompressed
-    compressed_images = "sim" not in raw_dir.name
-
-    hdf5_paths = list(raw_dir.glob("episode_*.hdf5"))
-    assert len(hdf5_paths) != 0
-    for hdf5_path in hdf5_paths:
-        with h5py.File(hdf5_path, "r") as data:
-            assert "/action" in data
-            assert "/observations/qpos" in data
-
-            assert data["/action"].ndim == 2
-            assert data["/observations/qpos"].ndim == 2
-
-            num_frames = data["/action"].shape[0]
-            assert num_frames == data["/observations/qpos"].shape[0]
-
-            for camera in get_cameras(data):
-                assert num_frames == data[f"/observations/images/{camera}"].shape[0]
-
-                if compressed_images:
-                    assert data[f"/observations/images/{camera}"].ndim == 2
-                else:
-                    assert data[f"/observations/images/{camera}"].ndim == 4
-                    b, h, w, c = data[f"/observations/images/{camera}"].shape
-                    assert c < h and c < w, f"Expect (h,w,c) image format but ({h=},{w=},{c=}) provided."
-
-
-def load_from_raw(raw_dir, out_dir, fps, video, debug):
-    # only frames from simulation are uncompressed
-    compressed_images = "sim" not in raw_dir.name
-
-    hdf5_files = list(raw_dir.glob("*.hdf5"))
-    ep_dicts = []
-    episode_data_index = {"from": [], "to": []}
-
-    id_from = 0
-    for ep_idx, ep_path in tqdm.tqdm(enumerate(hdf5_files), total=len(hdf5_files)):
-        with h5py.File(ep_path, "r") as ep:
-            num_frames = ep["/action"].shape[0]
-
-            # last step of demonstration is considered done
-            done = torch.zeros(num_frames, dtype=torch.bool)
-            done[-1] = True
-
-            state = torch.from_numpy(ep["/observations/qpos"][:])
-            action = torch.from_numpy(ep["/action"][:])
-            if "/observations/qvel" in ep:
-                velocity = torch.from_numpy(ep["/observations/qvel"][:])
-            if "/observations/effort" in ep:
-                effort = torch.from_numpy(ep["/observations/effort"][:])
-
-            ep_dict = {}
-
-            for camera in get_cameras(ep):
-                img_key = f"observation.images.{camera}"
-
-                if compressed_images:
-                    import cv2
-
-                    # load one compressed image after the other in RAM and uncompress
-                    imgs_array = []
-                    for data in ep[f"/observations/images/{camera}"]:
-                        imgs_array.append(cv2.imdecode(data, 1))
-                    imgs_array = np.array(imgs_array)
-
-                else:
-                    # load all images in RAM
-                    imgs_array = ep[f"/observations/images/{camera}"][:]
-
-                if video:
-                    # save png images in temporary directory
-                    tmp_imgs_dir = out_dir / "tmp_images"
-                    save_images_concurrently(imgs_array, tmp_imgs_dir)
-
-                    # encode images to a mp4 video
-                    fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
-                    video_path = out_dir / "videos" / fname
-                    encode_video_frames(tmp_imgs_dir, video_path, fps)
-
-                    # clean temporary images directory
-                    shutil.rmtree(tmp_imgs_dir)
-
-                    # store the reference to the video frame
-                    ep_dict[img_key] = [
-                        {"path": f"videos/{fname}", "timestamp": i / fps} for i in range(num_frames)
-                    ]
-                else:
-                    ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
-
-            ep_dict["observation.state"] = state
-            if "/observations/velocity" in ep:
-                ep_dict["observation.velocity"] = velocity
-            if "/observations/effort" in ep:
-                ep_dict["observation.effort"] = effort
-            ep_dict["action"] = action
-            ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
-            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
-            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
-            ep_dict["next.done"] = done
-            # TODO(rcadene): add reward and success by computing them in sim
-
-            assert isinstance(ep_idx, int)
-            ep_dicts.append(ep_dict)
-
-            episode_data_index["from"].append(id_from)
-            episode_data_index["to"].append(id_from + num_frames)
-
-        id_from += num_frames
-
-        gc.collect()
-
-        # process first episode only
-        if debug:
-            break
-
-    data_dict = concatenate_episodes(ep_dicts)
-    return data_dict, episode_data_index
-
-
-def to_hf_dataset(data_dict, video) -> Dataset:
-    features = {}
-
-    keys = [key for key in data_dict if "observation.images." in key]
-    for key in keys:
-        if video:
-            features[key] = VideoFrame()
-        else:
-            features[key] = Image()
-
-    features["observation.state"] = Sequence(
-        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
-    )
-    if "observation.velocity" in data_dict:
-        features["observation.velocity"] = Sequence(
-            length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None)
-        )
-    if "observation.effort" in data_dict:
-        features["observation.effort"] = Sequence(
-            length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None)
-        )
-    features["action"] = Sequence(
-        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
-    )
-    features["episode_index"] = Value(dtype="int64", id=None)
-    features["frame_index"] = Value(dtype="int64", id=None)
-    features["timestamp"] = Value(dtype="float32", id=None)
-    features["next.done"] = Value(dtype="bool", id=None)
-    features["index"] = Value(dtype="int64", id=None)
-
-    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
-    hf_dataset.set_transform(hf_transform_to_torch)
-    return hf_dataset
-
-
-def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
-    # sanity check
-    check_format(raw_dir)
-
-    if fps is None:
-        fps = 50
-
-    data_dir, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
-    hf_dataset = to_hf_dataset(data_dir, video)
-
-    info = {
-        "fps": fps,
-        "video": video,
-    }
-    return hf_dataset, episode_data_index, info
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`../../lerobot/common/robots/koch_follower/koch.mdx`
				`@@ -0,0 +1 @@`
				`../../lerobot/common/robots/lekiwi/lekiwi.mdx`
				`@@ -0,0 +1 @@`
				`../../lerobot/common/robots/so100_follower/so100.mdx`
				`@@ -0,0 +1 @@`
				`../../lerobot/common/robots/so101_follower/so101.mdx`
				`@@ -1 +0,0 @@`
				`https://drive.google.com/drive/folders/1S8eFg98IaGAIKVZ8QFWG1bx4mHa-O204`