use Path type instead of str

Merge pull request #28 from Cadene/user/rcadene/2024_03_14_hf_dataset
Download datasets from hugging face
2024-03-15 13:15:34 +00:00 · 2024-03-15 13:52:13 +01:00 · 2024-03-15 12:44:52 +00:00 · 2024-03-15 10:56:46 +00:00 · 2024-03-15 00:36:55 +00:00 · 2024-03-15 00:30:11 +00:00
170 changed files with 11166 additions and 431 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.memmap filter=lfs diff=lfs merge=lfs -text
--- a/.github/poetry/cpu/poetry.lock
+++ b/.github/poetry/cpu/poetry.lock
--- a/.github/poetry/cpu/pyproject.toml
+++ b/.github/poetry/cpu/pyproject.toml
@@ -0,0 +1,107 @@
+[tool.poetry]
+name = "lerobot"
+version = "0.1.0"
+description = "Le robot is learning"
+authors = [
+    "Rémi Cadène <re.cadene@gmail.com>",
+    "Simon Alibert <alibert.sim@gmail.com>",
+]
+repository = "https://github.com/Cadene/lerobot"
+readme = "README.md"
+license = "MIT"
+classifiers=[
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Topic :: Software Development :: Build Tools",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.10",
+]
+packages = [{include = "lerobot"}]
+
+
+[tool.poetry.dependencies]
+python = "^3.10"
+cython = "^3.0.8"
+termcolor = "^2.4.0"
+omegaconf = "^2.3.0"
+dm-env = "^1.6"
+pandas = "^2.2.1"
+wandb = "^0.16.3"
+moviepy = "^1.0.3"
+imageio = {extras = ["pyav"], version = "^2.34.0"}
+gdown = "^5.1.0"
+hydra-core = "^1.3.2"
+einops = "^0.7.0"
+pygame = "^2.5.2"
+pymunk = "^6.6.0"
+zarr = "^2.17.0"
+shapely = "^2.0.3"
+scikit-image = "^0.22.0"
+numba = "^0.59.0"
+mpmath = "^1.3.0"
+torch = {version = "^2.2.1", source = "torch-cpu"}
+tensordict = {git = "https://github.com/pytorch/tensordict"}
+torchrl = {git = "https://github.com/pytorch/rl", rev = "13bef426dcfa5887c6e5034a6e9697993fa92c37"}
+mujoco = "^3.1.2"
+mujoco-py = "^2.1.2.14"
+gym = "^0.26.2"
+opencv-python = "^4.9.0.80"
+diffusers = "^0.26.3"
+torchvision = {version = "^0.17.1", source = "torch-cpu"}
+h5py = "^3.10.0"
+dm = "^1.3"
+dm-control = "^1.0.16"
+huggingface-hub = "^0.21.4"
+
+
+[tool.poetry.group.dev.dependencies]
+pre-commit = "^3.6.2"
+debugpy = "^1.8.1"
+pytest = "^8.1.0"
+
+
+[[tool.poetry.source]]
+name = "torch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+priority = "supplemental"
+
+
+[tool.ruff]
+line-length = 110
+target-version = "py310"
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+]
+
+
+[tool.ruff.lint]
+select = ["E4", "E7", "E9", "F", "I", "N", "B", "C4", "SIM"]
+
+
+[tool.poetry-dynamic-versioning]
+enable = true
+
+
+[build-system]
+requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
+build-backend = "poetry_dynamic_versioning.backend"
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -17,17 +17,28 @@ jobs:
    runs-on: ubuntu-latest
    env:
      POETRY_VERSION: 1.8.1
+      DATA_DIR: tests/data
+      TMPDIR: ~/tmp
+      TEMP: ~/tmp
+      TMP: ~/tmp
+      PYOPENGL_PLATFORM: egl
+      MUJOCO_GL: egl
+      LEROBOT_TESTS_DEVICE: cpu
    steps:
      #----------------------------------------------
      #       check-out repo and set-up python
      #----------------------------------------------
      - name: Check out repository
        uses: actions/checkout@v4
+        with:
+          lfs: true
+
      - name: Set up python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
+
      #----------------------------------------------
      #         install & configure poetry
      #----------------------------------------------
@@ -35,8 +46,9 @@ jobs:
        id: restore-poetry-cache
        uses: actions/cache/restore@v3
        with:
-          path: ~/.local  # the path depends on the OS
-          key: poetry-${{ env.POETRY_VERSION }}  # increment to reset cache
+          path: ~/.local
+          key: poetry-${{ env.POETRY_VERSION }}
+
      - name: Install Poetry
        if: steps.restore-poetry-cache.outputs.cache-hit != 'true'
        uses: snok/install-poetry@v1
@@ -44,6 +56,7 @@ jobs:
          version: ${{ env.POETRY_VERSION }}
          virtualenvs-create: true
          installer-parallel: true
+
      - name: Save cached Poetry installation
        if: |
          steps.restore-poetry-cache.outputs.cache-hit != 'true' &&
@@ -51,25 +64,32 @@ jobs:
        id: save-poetry-cache
        uses: actions/cache/save@v3
        with:
-          path: ~/.local  # the path depends on the OS
-          key: poetry-${{ env.POETRY_VERSION }}  # increment to reset cache
+          path: ~/.local
+          key: poetry-${{ env.POETRY_VERSION }}
+
      - name: Configure Poetry
        run: poetry config virtualenvs.in-project true
+
      #----------------------------------------------
      #           install dependencies
      #----------------------------------------------
+      # TODO(aliberts): move to gpu runners
+      - name: Select cpu dependencies  # HACK
+        run: cp -t . .github/poetry/cpu/pyproject.toml .github/poetry/cpu/poetry.lock
+
      - name: Load cached venv
        id: restore-dependencies-cache
        uses: actions/cache/restore@v3
        with:
          path: .venv
          key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
+
      - name: Install dependencies
        if: steps.restore-dependencies-cache.outputs.cache-hit != 'true'
        run: |
+          mkdir ~/tmp
          poetry install --no-interaction --no-root
-          git clone https://github.com/real-stanford/diffusion_policy
-          cp -r diffusion_policy/diffusion_policy $(poetry env info -p)/lib/python3.10/site-packages/
+
      - name: Save cached venv
        if: |
            steps.restore-dependencies-cache.outputs.cache-hit != 'true' &&
@@ -79,11 +99,16 @@ jobs:
        with:
          path: .venv
          key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
+
+      - name: Install libegl1-mesa-dev (to use MUJOCO_GL=egl)
+        run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
+
      #----------------------------------------------
      #             install project
      #----------------------------------------------
      - name: Install project
        run: poetry install --no-interaction
+
      #----------------------------------------------
      #               run tests
      #----------------------------------------------
@@ -91,6 +116,7 @@ jobs:
        run: |
          source .venv/bin/activate
          pytest tests
+
      - name: Test train pusht end-to-end
        run: |
          source .venv/bin/activate
@@ -98,21 +124,21 @@ jobs:
            hydra.job.name=pusht \
            env=pusht \
            wandb.enable=False \
-            offline_steps=1 \
+            offline_steps=2 \
            online_steps=0 \
-            device=cpu
-      # TODO(rcadene, aliberts): Add end-to-end test of eval checkpoint post training
-      # - name: Test eval pusht end-to-end
-      #   run: |
-      #     source .venv/bin/activate
-      #     python lerobot/scripts/eval.py
-      #       hydra.job.name=pusht \
-      #       env=pusht \
-      #       wandb.enable=False \
-      #       eval_episodes=1 \
-      #       device=cpu
-      #----------------------------------------------
-      #              cleanup
-      #----------------------------------------------
-      - name: Cleanup
-        run: rm -rf diffusion_policy data
+            device=cpu \
+            save_model=true \
+            save_freq=1 \
+            hydra.run.dir=tests/outputs/
+
+      - name: Test eval pusht end-to-end
+        run: |
+          source .venv/bin/activate
+          python lerobot/scripts/eval.py \
+            hydra.job.name=pusht \
+            env=pusht \
+            wandb.enable=False \
+            eval_episodes=1 \
+            env.episode_length=8 \
+            device=cpu \
+            policy.pretrained_model_path=tests/outputs/models/1.pt
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,3 @@
-# Custom
-diffusion_policy
-
 # Logging
 logs
 tmp
@@ -54,6 +51,7 @@ pip-log.txt
 pip-delete-this-directory.txt

 # Unit test / coverage reports
+!tests/data
 htmlcov/
 .tox/
 .nox/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-exclude: ^(data/|tests/|diffusion_policy/)
+exclude: ^(data/|tests/)
 default_language_version:
    python: python3.10
 repos:
--- a/278
+++ b/278
@@ -0,0 +1,278 @@
+Copyright 2024 The Hugging Face team. All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+## Some of lerobot's code is derived from Diffusion Policy, which is subject to the following copyright notice:
+
+MIT License
+
+Copyright (c) 2023 Columbia Artificial Intelligence and Robotics Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+## Some of lerobot's code is derived from FOWM, which is subject to the following copyright notice:
+
+MIT License
+
+Copyright (c) 2023 Yunhai Feng
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+## Some of lerobot's code is derived from ALOHA, which is subject to the following copyright notice:
+
+MIT License
+
+Copyright (c) 2023 Tony Z. Zhao
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 ## Installation

-Create a virtual environment with python 3.10, e.g. using `conda`:
+Create a virtual environment with Python 3.10, e.g. using `conda`:
 ```
 conda create -y -n lerobot python=3.10
 conda activate lerobot
@@ -24,11 +24,9 @@ mkdir ~/tmp
 export TMPDIR='~/tmp'
 ```

-Install `diffusion_policy` #HACK
+To use [Weights and Biases](https://docs.wandb.ai/quickstart) for experiments tracking, log in with
 ```
-# from this directory
-git clone https://github.com/real-stanford/diffusion_policy
-cp -r diffusion_policy/diffusion_policy $(poetry env info -p)/lib/python3.10/site-packages/
+wandb login
 ```

 ## Usage
@@ -61,19 +59,10 @@ env=pusht

 ## TODO

- [x] priority update doesnt match FOWM or original paper
- [x] self.step=100000 should be updated at every step to adjust to horizon of planner
- [ ] prefetch replay buffer to speedup training
- [ ] parallelize env to speedup eval
- [ ] clean checkpointing / loading
- [ ] clean logging
- [ ] clean config
- [ ] clean hyperparameter tuning
- [ ] add pusht
- [ ] add aloha
- [ ] add act
- [ ] add diffusion
- [ ] add aloha 2
+If you are not sure how to contribute or want to know the next features we working on, look on this project page: [LeRobot TODO](https://github.com/users/Cadene/projects/1)
+
+Ask [Remi Cadene](re.cadene@gmail.com) for access if needed.
+

 ## Profile

@@ -114,7 +103,69 @@ pre-commit install
 pre-commit run -a
 ```

+**Adding dependencies (temporary)**
+
+Right now, for the CI to work, whenever a new dependency is added it needs to be also added to the cpu env, eg:
+
+```
+# Run in this directory, adds the package to the main env with cuda
+poetry add some-package
+
+# Adds the same package to the cpu env
+cd .github/poetry/cpu && poetry add some-package
+```
+
 **Tests**
+
+Install [git lfs](https://git-lfs.com/) to retrieve test artifacts (if you don't have it already).
+
+On Mac:
 ```
-pytest -sx tests
+brew install git-lfs
+git lfs install
 ```
+
+On Ubuntu:
+```
+sudo apt-get install git-lfs
+git lfs install
+```
+
+Pull artifacts if they're not in [tests/data](tests/data)
+```
+git lfs pull
+```
+
+When adding a new dataset, mock it with
+```
+python tests/scripts/mock_dataset.py --in-data-dir data/<dataset_id> --out-data-dir tests/data/<dataset_id>
+```
+
+Run tests
+```
+DATA_DIR="tests/data" pytest -sx tests
+```
+
+**Datasets**
+
+To add a pytorch rl dataset to the hub, first login and use a token generated from [huggingface settings](https://huggingface.co/settings/tokens) with write access:
+```
+huggingface-cli login --token $HUGGINGFACE_TOKEN --add-to-git-credential
+```
+
+Then you can upload it to the hub with:
+```
+HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload --repo-type dataset $HF_USER/$DATASET data/$DATASET
+```
+
+For instance, for [cadene/pusht](https://huggingface.co/datasets/cadene/pusht), we used:
+```
+HF_USER=cadene
+DATASET=pusht
+```
+
+
+## Acknowledgment
+- Our Diffusion policy and Pusht environment are adapted from [Diffusion Policy](https://diffusion-policy.cs.columbia.edu/)
+- Our TDMPC policy and Simxarm environment are adapted from [FOWM](https://www.yunhaifeng.com/FOWM/)
+- Our ACT policy and ALOHA environment are adapted from [ALOHA](https://tonyzhaozh.github.io/aloha/)
--- a/lerobot/init.py
+++ b/lerobot/init.py
@@ -0,0 +1 @@
+from lerobot.__version__ import __version__  # noqa: F401
--- a/lerobot/version.py
+++ b/lerobot/version.py
@@ -1 +1,8 @@
-__version__ = "0.0.0"
+""" To enable `lerobot.__version__` """
+
+from importlib.metadata import PackageNotFoundError, version
+
+try:
+    __version__ = version("lerobot")
+except PackageNotFoundError:
+    __version__ = "unknown"
--- a/lerobot/common/init.py
+++ b/lerobot/common/init.py
--- a/lerobot/common/datasets/init.py
+++ b/lerobot/common/datasets/init.py
--- a/lerobot/common/datasets/abstract.py
+++ b/lerobot/common/datasets/abstract.py
@@ -1,4 +1,3 @@
-import abc
 import logging
 from pathlib import Path
 from typing import Callable
@@ -7,12 +6,13 @@ import einops
 import torch
 import torchrl
 import tqdm
+from huggingface_hub import snapshot_download
 from tensordict import TensorDict
-from torchrl.data.datasets.utils import _get_root_dir
 from torchrl.data.replay_buffers.replay_buffers import TensorDictReplayBuffer
 from torchrl.data.replay_buffers.samplers import SliceSampler
 from torchrl.data.replay_buffers.storages import TensorStorage, _collate_id
 from torchrl.data.replay_buffers.writers import ImmutableDatasetWriter, Writer
+from torchrl.envs.transforms.transforms import Compose


 class AbstractExperienceReplay(TensorDictReplayBuffer):
@@ -22,7 +22,7 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path = None,
+        root: Path | None = None,
        pin_memory: bool = False,
        prefetch: int = None,
        sampler: SliceSampler = None,
@@ -32,11 +32,8 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
    ):
        self.dataset_id = dataset_id
        self.shuffle = shuffle
-        self.root = _get_root_dir(self.dataset_id) if root is None else root
-        self.root = Path(self.root)
-        self.data_dir = self.root / self.dataset_id
-
-        storage = self._download_or_load_storage()
+        self.root = root
+        storage = self._download_or_load_dataset()

        super().__init__(
            storage=storage,
@@ -54,7 +51,7 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
        return {
            ("observation", "state"): "b c -> 1 c",
            ("observation", "image"): "b c h w -> 1 c 1 1",
-            ("action"): "b c -> 1 c",
+            ("action",): "b c -> 1 c",
        }

    @property
@@ -73,11 +70,22 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
    def num_episodes(self) -> int:
        return len(self._storage._storage["episode"].unique())

+    @property
+    def transform(self):
+        return self._transform
+
    def set_transform(self, transform):
-        self.transform = transform
+        if not isinstance(transform, Compose):
+            # required since torchrl calls `len(self._transform)` downstream
+            if isinstance(transform, list):
+                self._transform = Compose(*transform)
+            else:
+                self._transform = Compose(transform)
+        else:
+            self._transform = transform

    def compute_or_load_stats(self, num_batch=100, batch_size=32) -> TensorDict:
-        stats_path = self.data_dir / "stats.pth"
+        stats_path = Path(self.data_dir) / "stats.pth"
        if stats_path.exists():
            stats = torch.load(stats_path)
        else:
@@ -86,19 +94,12 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
            torch.save(stats, stats_path)
        return stats

-    @abc.abstractmethod
-    def _download_and_preproc(self) -> torch.StorageBase:
-        raise NotImplementedError()
-
-    def _download_or_load_storage(self):
-        if not self._is_downloaded():
-            storage = self._download_and_preproc()
+    def _download_or_load_dataset(self) -> torch.StorageBase:
+        if self.root is None:
+            self.data_dir = snapshot_download(repo_id=f"cadene/{self.dataset_id}", repo_type="dataset")
        else:
-            storage = TensorStorage(TensorDict.load_memmap(self.data_dir))
-        return storage
-
-    def _is_downloaded(self) -> bool:
-        return self.data_dir.is_dir()
+            self.data_dir = self.root / self.dataset_id
+        return TensorStorage(TensorDict.load_memmap(self.data_dir))

    def _compute_stats(self, num_batch=100, batch_size=32):
        rb = TensorDictReplayBuffer(
--- a/lerobot/common/datasets/aloha.py
+++ b/lerobot/common/datasets/aloha.py
@@ -73,11 +73,11 @@ def download(data_dir, dataset_id):

    data_dir.mkdir(parents=True, exist_ok=True)

-    gdown.download_folder(FOLDER_URLS[dataset_id], output=data_dir)
+    gdown.download_folder(FOLDER_URLS[dataset_id], output=str(data_dir))

    # because of the 50 files limit per directory, two files episode 48 and 49 were missing
-    gdown.download(EP48_URLS[dataset_id], output=data_dir / "episode_48.hdf5", fuzzy=True)
-    gdown.download(EP49_URLS[dataset_id], output=data_dir / "episode_49.hdf5", fuzzy=True)
+    gdown.download(EP48_URLS[dataset_id], output=str(data_dir / "episode_48.hdf5"), fuzzy=True)
+    gdown.download(EP49_URLS[dataset_id], output=str(data_dir / "episode_49.hdf5"), fuzzy=True)


 class AlohaExperienceReplay(AbstractExperienceReplay):
@@ -87,7 +87,7 @@ class AlohaExperienceReplay(AbstractExperienceReplay):
        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path = None,
+        root: Path | None = None,
        pin_memory: bool = False,
        prefetch: int = None,
        sampler: SliceSampler = None,
@@ -114,7 +114,7 @@ class AlohaExperienceReplay(AbstractExperienceReplay):
    def stats_patterns(self) -> dict:
        d = {
            ("observation", "state"): "b c -> 1 c",
-            ("action"): "b c -> 1 c",
+            ("action",): "b c -> 1 c",
        }
        for cam in CAMERAS[self.dataset_id]:
            d[("observation", "image", cam)] = "b c h w -> 1 c 1 1"
@@ -124,11 +124,9 @@ class AlohaExperienceReplay(AbstractExperienceReplay):
    def image_keys(self) -> list:
        return [("observation", "image", cam) for cam in CAMERAS[self.dataset_id]]

-    # def _is_downloaded(self) -> bool:
-    #     return False
-
-    def _download_and_preproc(self):
-        raw_dir = self.data_dir.parent / f"{self.data_dir.name}_raw"
+    def _download_and_preproc_obsolete(self):
+        assert self.root is not None
+        raw_dir = self.root / f"{self.dataset_id}_raw"
        if not raw_dir.is_dir():
            download(raw_dir, self.dataset_id)

@@ -177,7 +175,7 @@ class AlohaExperienceReplay(AbstractExperienceReplay):

                if ep_id == 0:
                    # hack to initialize tensordict data structure to store episodes
-                    td_data = ep_td[0].expand(total_num_frames).memmap_like(self.data_dir)
+                    td_data = ep_td[0].expand(total_num_frames).memmap_like(self.root / f"{self.dataset_id}")

                td_data[idxtd : idxtd + len(ep_td)] = ep_td
                idxtd = idxtd + len(ep_td)
--- a/lerobot/common/datasets/factory.py
+++ b/lerobot/common/datasets/factory.py
@@ -5,9 +5,12 @@ from pathlib import Path
 import torch
 from torchrl.data.replay_buffers import PrioritizedSliceSampler, SliceSampler

-from lerobot.common.envs.transforms import NormalizeTransform
+from lerobot.common.envs.transforms import NormalizeTransform, Prod

-DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
+# DATA_DIR specifies to location where datasets are loaded. By default, DATA_DIR is None and
+# we load from `$HOME/.cache/huggingface/hub/datasets`. For our unit tests, we set `DATA_DIR=tests/data`
+# to load a subset of our datasets for faster continuous integration.
+DATA_DIR = Path(os.environ["DATA_DIR"]) if "DATA_DIR" in os.environ else None


 def make_offline_buffer(
@@ -77,35 +80,49 @@ def make_offline_buffer(

    offline_buffer = clsfunc(
        dataset_id=dataset_id,
-        root=DATA_DIR,
        sampler=sampler,
        batch_size=batch_size,
+        root=DATA_DIR,
        pin_memory=pin_memory,
        prefetch=prefetch if isinstance(prefetch, int) else None,
    )

+    if cfg.policy.name == "tdmpc":
+        img_keys = []
+        for key in offline_buffer.image_keys:
+            img_keys.append(("next", *key))
+        img_keys += offline_buffer.image_keys
+    else:
+        img_keys = offline_buffer.image_keys
+
+    transforms = [Prod(in_keys=img_keys, prod=1 / 255)]
+
    if normalize:
        # TODO(rcadene): make normalization strategy configurable between mean_std, min_max, manual_min_max, min_max_from_spec
        stats = offline_buffer.compute_or_load_stats()
+
+        # we only normalize the state and action, since the images are usually normalized inside the model for now (except for tdmpc: see the following)
        in_keys = [("observation", "state"), ("action")]

-        if cfg.policy == "tdmpc":
-            for key in offline_buffer.image_keys:
-                # TODO(rcadene): imagenet normalization is applied inside diffusion policy, but no normalization inside tdmpc
-                in_keys.append(key)
-                # since we use next observations in tdmpc
-                in_keys.append(("next", *key))
+        if cfg.policy.name == "tdmpc":
+            # TODO(rcadene): we add img_keys to the keys to normalize for tdmpc only, since diffusion and act policies normalize the image inside the model for now
+            in_keys += img_keys
+            # TODO(racdene): since we use next observations in tdmpc, we also add them to the normalization. We are wasting a bit of compute on this for now.
+            in_keys += [("next", *key) for key in img_keys]
            in_keys.append(("next", "observation", "state"))

-        if cfg.policy == "diffusion" and cfg.env.name == "pusht":
+        if cfg.policy.name == "diffusion" and cfg.env.name == "pusht":
            # TODO(rcadene): we overwrite stats to have the same as pretrained model, but we should remove this
            stats["observation", "state", "min"] = torch.tensor([13.456424, 32.938293], dtype=torch.float32)
            stats["observation", "state", "max"] = torch.tensor([496.14618, 510.9579], dtype=torch.float32)
            stats["action", "min"] = torch.tensor([12.0, 25.0], dtype=torch.float32)
            stats["action", "max"] = torch.tensor([511.0, 511.0], dtype=torch.float32)

-        transform = NormalizeTransform(stats, in_keys, mode="min_max")
-        offline_buffer.set_transform(transform)
+        # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std
+        normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max"
+        transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode))
+
+    offline_buffer.set_transform(transforms)

    if not overwrite_sampler:
        index = torch.arange(0, offline_buffer.num_samples, 1)
--- a/lerobot/common/datasets/pusht.py
+++ b/lerobot/common/datasets/pusht.py
@@ -8,8 +8,6 @@ import pymunk
 import torch
 import torchrl
 import tqdm
-from diffusion_policy.common.replay_buffer import ReplayBuffer as DiffusionPolicyReplayBuffer
-from diffusion_policy.env.pusht.pusht_env import pymunk_to_shapely
 from tensordict import TensorDict
 from torchrl.data.replay_buffers.samplers import SliceSampler
 from torchrl.data.replay_buffers.storages import TensorStorage
@@ -17,11 +15,12 @@ from torchrl.data.replay_buffers.writers import Writer

 from lerobot.common.datasets.abstract import AbstractExperienceReplay
 from lerobot.common.datasets.utils import download_and_extract_zip
+from lerobot.common.envs.pusht.pusht_env import pymunk_to_shapely
+from lerobot.common.policies.diffusion.replay_buffer import ReplayBuffer as DiffusionPolicyReplayBuffer

 # as define in env
 SUCCESS_THRESHOLD = 0.95  # 95% coverage,

-DEFAULT_TEE_MASK = pymunk.ShapeFilter.ALL_MASKS()
 PUSHT_URL = "https://diffusion-policy.cs.columbia.edu/data/training/pusht.zip"
 PUSHT_ZARR = Path("pusht/pusht_cchi_v7_replay.zarr")

@@ -49,8 +48,10 @@ def add_tee(
    angle,
    scale=30,
    color="LightSlateGray",
-    mask=DEFAULT_TEE_MASK,
+    mask=None,
 ):
+    if mask is None:
+        mask = pymunk.ShapeFilter.ALL_MASKS()
    mass = 1
    length = 4
    vertices1 = [
@@ -89,7 +90,7 @@ class PushtExperienceReplay(AbstractExperienceReplay):
        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path = None,
+        root: Path | None = None,
        pin_memory: bool = False,
        prefetch: int = None,
        sampler: SliceSampler = None,
@@ -110,8 +111,9 @@ class PushtExperienceReplay(AbstractExperienceReplay):
            transform=transform,
        )

-    def _download_and_preproc(self):
-        raw_dir = self.data_dir.parent / f"{self.data_dir.name}_raw"
+    def _download_and_preproc_obsolete(self):
+        assert self.root is not None
+        raw_dir = self.root / f"{self.dataset_id}_raw"
        zarr_path = (raw_dir / PUSHT_ZARR).resolve()
        if not zarr_path.is_dir():
            raw_dir.mkdir(parents=True, exist_ok=True)
@@ -125,6 +127,9 @@ class PushtExperienceReplay(AbstractExperienceReplay):
        episode_ids = torch.from_numpy(dataset_dict.get_episode_idxs())
        num_episodes = dataset_dict.meta["episode_ends"].shape[0]
        total_frames = dataset_dict["action"].shape[0]
+        # to create test artifact
+        # num_episodes = 1
+        # total_frames = 50
        assert len(
            {dataset_dict[key].shape[0] for key in dataset_dict.keys()}  # noqa: SIM118
        ), "Some data type dont have the same number of total frames."
@@ -142,6 +147,8 @@ class PushtExperienceReplay(AbstractExperienceReplay):
        idxtd = 0
        for episode_id in tqdm.tqdm(range(num_episodes)):
            idx1 = dataset_dict.meta["episode_ends"][episode_id]
+            # to create test artifact
+            # idx1 = 51

            num_frames = idx1 - idx0

@@ -202,7 +209,7 @@ class PushtExperienceReplay(AbstractExperienceReplay):

            if episode_id == 0:
                # hack to initialize tensordict data structure to store episodes
-                td_data = ep_td[0].expand(total_frames).memmap_like(self.data_dir)
+                td_data = ep_td[0].expand(total_frames).memmap_like(self.root / f"{self.dataset_id}")

            td_data[idxtd : idxtd + len(ep_td)] = ep_td

--- a/lerobot/common/datasets/simxarm.py
+++ b/lerobot/common/datasets/simxarm.py
@@ -1,4 +1,5 @@
 import pickle
+import zipfile
 from pathlib import Path
 from typing import Callable

@@ -15,6 +16,22 @@ from torchrl.data.replay_buffers.writers import Writer
 from lerobot.common.datasets.abstract import AbstractExperienceReplay


+def download():
+    raise NotImplementedError()
+    import gdown
+
+    url = "https://drive.google.com/uc?id=1nhxpykGtPDhmQKm-_B8zBSywVRdgeVya"
+    download_path = "data.zip"
+    gdown.download(url, download_path, quiet=False)
+    print("Extracting...")
+    with zipfile.ZipFile(download_path, "r") as zip_f:
+        for member in zip_f.namelist():
+            if member.startswith("data/xarm") and member.endswith(".pkl"):
+                print(member)
+                zip_f.extract(member=member)
+    Path(download_path).unlink()
+
+
 class SimxarmExperienceReplay(AbstractExperienceReplay):
    available_datasets = [
        "xarm_lift_medium",
@@ -26,7 +43,7 @@ class SimxarmExperienceReplay(AbstractExperienceReplay):
        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path = None,
+        root: Path | None = None,
        pin_memory: bool = False,
        prefetch: int = None,
        sampler: SliceSampler = None,
@@ -47,11 +64,12 @@ class SimxarmExperienceReplay(AbstractExperienceReplay):
            transform=transform,
        )

-    def _download_and_preproc(self):
-        # download
-        # TODO(rcadene)
+    def _download_and_preproc_obsolete(self):
+        assert self.root is not None
+        # TODO(rcadene): finish download
+        download()

-        dataset_path = self.data_dir / "buffer.pkl"
+        dataset_path = self.root / f"{self.dataset_id}_raw" / "buffer.pkl"
        print(f"Using offline dataset '{dataset_path}'")
        with open(dataset_path, "rb") as f:
            dataset_dict = pickle.load(f)
@@ -93,7 +111,7 @@ class SimxarmExperienceReplay(AbstractExperienceReplay):

            if episode_id == 0:
                # hack to initialize tensordict data structure to store episodes
-                td_data = episode[0].expand(total_frames).memmap_like(self.data_dir)
+                td_data = episode[0].expand(total_frames).memmap_like(self.root / f"{self.dataset_id}")

            td_data[idx0:idx1] = episode

--- a/lerobot/common/envs/init.py
+++ b/lerobot/common/envs/init.py
--- a/lerobot/common/envs/abstract.py
+++ b/lerobot/common/envs/abstract.py
@@ -0,0 +1,80 @@
+import abc
+from collections import deque
+from typing import Optional
+
+from tensordict import TensorDict
+from torchrl.envs import EnvBase
+
+
+class AbstractEnv(EnvBase):
+    def __init__(
+        self,
+        task,
+        frame_skip: int = 1,
+        from_pixels: bool = False,
+        pixels_only: bool = False,
+        image_size=None,
+        seed=1337,
+        device="cpu",
+        num_prev_obs=1,
+        num_prev_action=0,
+    ):
+        super().__init__(device=device, batch_size=[])
+        self.task = task
+        self.frame_skip = frame_skip
+        self.from_pixels = from_pixels
+        self.pixels_only = pixels_only
+        self.image_size = image_size
+        self.num_prev_obs = num_prev_obs
+        self.num_prev_action = num_prev_action
+        self._rendering_hooks = []
+
+        if pixels_only:
+            assert from_pixels
+        if from_pixels:
+            assert image_size
+
+        self._make_env()
+        self._make_spec()
+        self._current_seed = self.set_seed(seed)
+
+        if self.num_prev_obs > 0:
+            self._prev_obs_image_queue = deque(maxlen=self.num_prev_obs)
+            self._prev_obs_state_queue = deque(maxlen=self.num_prev_obs)
+        if self.num_prev_action > 0:
+            raise NotImplementedError()
+            # self._prev_action_queue = deque(maxlen=self.num_prev_action)
+
+    def register_rendering_hook(self, func):
+        self._rendering_hooks.append(func)
+
+    def call_rendering_hooks(self):
+        for func in self._rendering_hooks:
+            func(self)
+
+    def reset_rendering_hooks(self):
+        self._rendering_hooks = []
+
+    @abc.abstractmethod
+    def render(self, mode="rgb_array", width=640, height=480):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _reset(self, tensordict: Optional[TensorDict] = None):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _step(self, tensordict: TensorDict):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _make_env(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _make_spec(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _set_seed(self, seed: Optional[int]):
+        raise NotImplementedError()
--- a/lerobot/common/envs/aloha/assets/bimanual_viperx_end_effector_insertion.xml
+++ b/lerobot/common/envs/aloha/assets/bimanual_viperx_end_effector_insertion.xml
@@ -0,0 +1,59 @@
+<mujoco>
+    <include file="scene.xml"/>
+    <include file="vx300s_dependencies.xml"/>
+
+    <equality>
+        <weld body1="mocap_left" body2="vx300s_left/gripper_link" solref="0.01 1" solimp=".25 .25 0.001" />
+        <weld body1="mocap_right" body2="vx300s_right/gripper_link" solref="0.01 1" solimp=".25 .25 0.001" />
+    </equality>
+
+
+    <worldbody>
+        <include file="vx300s_left.xml" />
+        <include file="vx300s_right.xml" />
+
+        <body mocap="true" name="mocap_left" pos="0.095 0.50 0.425">
+            <site pos="0 0 0" size="0.003 0.003 0.03" type="box" name="mocap_left_site1" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.003 0.03 0.003" type="box" name="mocap_left_site2" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.03 0.003 0.003" type="box" name="mocap_left_site3" rgba="1 0 0 1"/>
+        </body>
+        <body mocap="true" name="mocap_right" pos="-0.095 0.50 0.425">
+            <site pos="0 0 0" size="0.003 0.003 0.03" type="box" name="mocap_right_site1" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.003 0.03 0.003" type="box" name="mocap_right_site2" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.03 0.003 0.003" type="box" name="mocap_right_site3" rgba="1 0 0 1"/>
+        </body>
+
+        <body name="peg" pos="0.2 0.5 0.05">
+            <joint name="red_peg_joint" type="free" frictionloss="0.01" />
+            <inertial pos="0 0 0" mass="0.05" diaginertia="0.002 0.002 0.002" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.06 0.01 0.01" type="box" name="red_peg" rgba="1 0 0 1" />
+        </body>
+
+        <body name="socket" pos="-0.2 0.5 0.05">
+            <joint name="blue_socket_joint" type="free" frictionloss="0.01" />
+            <inertial pos="0 0 0" mass="0.05" diaginertia="0.002 0.002 0.002" />
+<!--            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.06 0.01 0.01" type="box" name="red_peg_ref" rgba="1 0 0 1" />-->
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 0 -0.02" size="0.06 0.018 0.002" type="box" name="socket-1" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 0 0.02" size="0.06 0.018 0.002" type="box" name="socket-2" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 0.02 0" size="0.06 0.002 0.018" type="box" name="socket-3" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 -0.02 0" size="0.06 0.002 0.018" type="box" name="socket-4" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.04 0.01 0.01" type="box" name="pin" rgba="1 0 0 1" />
+        </body>
+
+    </worldbody>
+
+    <actuator>
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_left/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_left/right_finger" kp="200"  user="1"/>
+
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_right/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_right/right_finger" kp="200"  user="1"/>
+
+    </actuator>
+
+    <keyframe>
+        <key qpos="0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0.2 0.5 0.05 1 0 0 0  -0.2 0.5 0.05 1 0 0 0"/>
+    </keyframe>
+
+
+</mujoco>
--- a/lerobot/common/envs/aloha/assets/bimanual_viperx_end_effector_transfer_cube.xml
+++ b/lerobot/common/envs/aloha/assets/bimanual_viperx_end_effector_transfer_cube.xml
@@ -0,0 +1,48 @@
+<mujoco>
+    <include file="scene.xml"/>
+    <include file="vx300s_dependencies.xml"/>
+
+    <equality>
+        <weld body1="mocap_left" body2="vx300s_left/gripper_link" solref="0.01 1" solimp=".25 .25 0.001" />
+        <weld body1="mocap_right" body2="vx300s_right/gripper_link" solref="0.01 1" solimp=".25 .25 0.001" />
+    </equality>
+
+
+    <worldbody>
+        <include file="vx300s_left.xml" />
+        <include file="vx300s_right.xml" />
+
+        <body mocap="true" name="mocap_left" pos="0.095 0.50 0.425">
+            <site pos="0 0 0" size="0.003 0.003 0.03" type="box" name="mocap_left_site1" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.003 0.03 0.003" type="box" name="mocap_left_site2" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.03 0.003 0.003" type="box" name="mocap_left_site3" rgba="1 0 0 1"/>
+        </body>
+        <body mocap="true" name="mocap_right" pos="-0.095 0.50 0.425">
+            <site pos="0 0 0" size="0.003 0.003 0.03" type="box" name="mocap_right_site1" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.003 0.03 0.003" type="box" name="mocap_right_site2" rgba="1 0 0 1"/>
+            <site pos="0 0 0" size="0.03 0.003 0.003" type="box" name="mocap_right_site3" rgba="1 0 0 1"/>
+        </body>
+
+        <body name="box" pos="0.2 0.5 0.05">
+            <joint name="red_box_joint" type="free" frictionloss="0.01" />
+            <inertial pos="0 0 0" mass="0.05" diaginertia="0.002 0.002 0.002" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.02 0.02 0.02" type="box" name="red_box" rgba="1 0 0 1" />
+        </body>
+
+    </worldbody>
+
+    <actuator>
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_left/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_left/right_finger" kp="200"  user="1"/>
+
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_right/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_right/right_finger" kp="200"  user="1"/>
+
+    </actuator>
+
+    <keyframe>
+        <key qpos="0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0.2 0.5 0.05 1 0 0 0"/>
+    </keyframe>
+
+
+</mujoco>
--- a/lerobot/common/envs/aloha/assets/bimanual_viperx_insertion.xml
+++ b/lerobot/common/envs/aloha/assets/bimanual_viperx_insertion.xml
@@ -0,0 +1,53 @@
+<mujoco>
+    <include file="scene.xml"/>
+    <include file="vx300s_dependencies.xml"/>
+    <worldbody>
+        <include file="vx300s_left.xml" />
+        <include file="vx300s_right.xml" />
+
+        <body name="peg" pos="0.2 0.5 0.05">
+            <joint name="red_peg_joint" type="free" frictionloss="0.01" />
+            <inertial pos="0 0 0" mass="0.05" diaginertia="0.002 0.002 0.002" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.06 0.01 0.01" type="box" name="red_peg" rgba="1 0 0 1" />
+        </body>
+
+        <body name="socket" pos="-0.2 0.5 0.05">
+            <joint name="blue_socket_joint" type="free" frictionloss="0.01" />
+            <inertial pos="0 0 0" mass="0.05" diaginertia="0.002 0.002 0.002" />
+<!--            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.06 0.01 0.01" type="box" name="red_peg_ref" rgba="1 0 0 1" />-->
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 0 -0.02" size="0.06 0.018 0.002" type="box" name="socket-1" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 0 0.02" size="0.06 0.018 0.002" type="box" name="socket-2" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 0.02 0" size="0.06 0.002 0.018" type="box" name="socket-3" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.05 0.001" pos="0 -0.02 0" size="0.06 0.002 0.018" type="box" name="socket-4" rgba="0 0 1 1" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.04 0.01 0.01" type="box" name="pin" rgba="1 0 0 1" />
+        </body>
+
+    </worldbody>
+
+    <actuator>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_left/waist" kp="800"  user="1" forcelimited="true" forcerange="-150 150"/>
+        <position ctrllimited="true" ctrlrange="-1.85005 1.25664" joint="vx300s_left/shoulder" kp="1600"  user="1" forcelimited="true" forcerange="-300 300"/>
+        <position ctrllimited="true" ctrlrange="-1.76278 1.6057" joint="vx300s_left/elbow" kp="800"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_left/forearm_roll" kp="10"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-1.8675 2.23402" joint="vx300s_left/wrist_angle" kp="50"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_left/wrist_rotate" kp="20"  user="1"/>
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_left/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_left/right_finger" kp="200"  user="1"/>
+
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_right/waist" kp="800"  user="1" forcelimited="true" forcerange="-150 150"/>
+        <position ctrllimited="true" ctrlrange="-1.85005 1.25664" joint="vx300s_right/shoulder" kp="1600"  user="1" forcelimited="true" forcerange="-300 300"/>
+        <position ctrllimited="true" ctrlrange="-1.76278 1.6057" joint="vx300s_right/elbow" kp="800"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_right/forearm_roll" kp="10"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-1.8675 2.23402" joint="vx300s_right/wrist_angle" kp="50"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_right/wrist_rotate" kp="20"  user="1"/>
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_right/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_right/right_finger" kp="200"  user="1"/>
+
+    </actuator>
+
+    <keyframe>
+        <key qpos="0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0.2 0.5 0.05 1 0 0 0  -0.2 0.5 0.05 1 0 0 0"/>
+    </keyframe>
+
+
+</mujoco>
--- a/lerobot/common/envs/aloha/assets/bimanual_viperx_transfer_cube.xml
+++ b/lerobot/common/envs/aloha/assets/bimanual_viperx_transfer_cube.xml
@@ -0,0 +1,42 @@
+<mujoco>
+    <include file="scene.xml"/>
+    <include file="vx300s_dependencies.xml"/>
+    <worldbody>
+        <include file="vx300s_left.xml" />
+        <include file="vx300s_right.xml" />
+
+        <body name="box" pos="0.2 0.5 0.05">
+            <joint name="red_box_joint" type="free" frictionloss="0.01" />
+            <inertial pos="0 0 0" mass="0.05" diaginertia="0.002 0.002 0.002" />
+            <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0 0 0" size="0.02 0.02 0.02" type="box" name="red_box" rgba="1 0 0 1" />
+        </body>
+
+    </worldbody>
+
+    <actuator>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_left/waist" kp="800"  user="1" forcelimited="true" forcerange="-150 150"/>
+        <position ctrllimited="true" ctrlrange="-1.85005 1.25664" joint="vx300s_left/shoulder" kp="1600"  user="1" forcelimited="true" forcerange="-300 300"/>
+        <position ctrllimited="true" ctrlrange="-1.76278 1.6057" joint="vx300s_left/elbow" kp="800"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_left/forearm_roll" kp="10"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-1.8675 2.23402" joint="vx300s_left/wrist_angle" kp="50"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_left/wrist_rotate" kp="20"  user="1"/>
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_left/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_left/right_finger" kp="200"  user="1"/>
+
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_right/waist" kp="800"  user="1" forcelimited="true" forcerange="-150 150"/>
+        <position ctrllimited="true" ctrlrange="-1.85005 1.25664" joint="vx300s_right/shoulder" kp="1600"  user="1" forcelimited="true" forcerange="-300 300"/>
+        <position ctrllimited="true" ctrlrange="-1.76278 1.6057" joint="vx300s_right/elbow" kp="800"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_right/forearm_roll" kp="10"  user="1" forcelimited="true" forcerange="-100 100"/>
+        <position ctrllimited="true" ctrlrange="-1.8675 2.23402" joint="vx300s_right/wrist_angle" kp="50"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-3.14158 3.14158" joint="vx300s_right/wrist_rotate" kp="20"  user="1"/>
+        <position ctrllimited="true" ctrlrange="0.021 0.057" joint="vx300s_right/left_finger" kp="200"  user="1"/>
+        <position ctrllimited="true" ctrlrange="-0.057 -0.021" joint="vx300s_right/right_finger" kp="200"  user="1"/>
+
+    </actuator>
+
+    <keyframe>
+        <key qpos="0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0 -0.96 1.16 0 -0.3 0 0.024 -0.024  0.2 0.5 0.05 1 0 0 0"/>
+    </keyframe>
+
+
+</mujoco>
--- a/lerobot/common/envs/aloha/assets/scene.xml
+++ b/lerobot/common/envs/aloha/assets/scene.xml
@@ -0,0 +1,38 @@
+<mujocoinclude>
+<!--    <option timestep='0.0025' iterations="50" tolerance="1e-10" solver="Newton" jacobian="dense" cone="elliptic"/>-->
+
+    <asset>
+        <mesh file="tabletop.stl" name="tabletop" scale="0.001 0.001 0.001"/>
+    </asset>
+
+    <visual>
+        <map fogstart="1.5" fogend="5" force="0.1" znear="0.1"/>
+        <quality shadowsize="4096" offsamples="4"/>
+        <headlight ambient="0.4 0.4 0.4"/>
+    </visual>
+
+    <worldbody>
+        <light castshadow="false" directional='true' diffuse='.3 .3 .3' specular='0.3 0.3 0.3' pos='-1 -1 1'
+               dir='1 1 -1'/>
+        <light directional='true' diffuse='.3 .3 .3' specular='0.3 0.3 0.3' pos='1 -1 1' dir='-1 1 -1'/>
+        <light castshadow="false" directional='true' diffuse='.3 .3 .3' specular='0.3 0.3 0.3' pos='0 1 1'
+               dir='0 -1 -1'/>
+
+        <body name="table" pos="0 .6 0">
+            <geom group="1" mesh="tabletop" pos="0 0 0" type="mesh" conaffinity="1" contype="1" name="table" rgba="0.2 0.2 0.2 1" />
+        </body>
+        <body name="midair" pos="0 .6 0.2">
+            <site pos="0 0 0" size="0.01" type="sphere" name="midair" rgba="1 0 0 0"/>
+        </body>
+
+        <camera name="left_pillar" pos="-0.5 0.2 0.6" fovy="78" mode="targetbody" target="table"/>
+        <camera name="right_pillar" pos="0.5 0.2 0.6" fovy="78" mode="targetbody" target="table"/>
+        <camera name="top" pos="0 0.6 0.8" fovy="78" mode="targetbody" target="table"/>
+        <camera name="angle" pos="0 0 0.6" fovy="78" mode="targetbody" target="table"/>
+        <camera name="front_close" pos="0 0.2 0.4" fovy="78" mode="targetbody" target="vx300s_left/camera_focus"/>
+
+    </worldbody>
+
+
+
+</mujocoinclude>
--- a/lerobot/common/envs/aloha/assets/tabletop.stl
+++ b/lerobot/common/envs/aloha/assets/tabletop.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_left.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_left.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_right.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_right.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_10_gripper_finger.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_10_gripper_finger.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_11_ar_tag.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_11_ar_tag.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_1_base.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_1_base.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_2_shoulder.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_2_shoulder.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_3_upper_arm.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_3_upper_arm.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_4_upper_forearm.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_4_upper_forearm.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_5_lower_forearm.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_5_lower_forearm.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_6_wrist.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_6_wrist.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_7_gripper.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_7_gripper.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_8_gripper_prop.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_8_gripper_prop.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_9_gripper_bar.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_9_gripper_bar.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_dependencies.xml
+++ b/lerobot/common/envs/aloha/assets/vx300s_dependencies.xml
@@ -0,0 +1,17 @@
+<mujocoinclude>
+    <compiler angle="radian" inertiafromgeom="auto" inertiagrouprange="4 5"/>
+    <asset>
+        <mesh name="vx300s_1_base" file="vx300s_1_base.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_2_shoulder" file="vx300s_2_shoulder.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_3_upper_arm" file="vx300s_3_upper_arm.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_4_upper_forearm" file="vx300s_4_upper_forearm.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_5_lower_forearm" file="vx300s_5_lower_forearm.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_6_wrist" file="vx300s_6_wrist.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_7_gripper" file="vx300s_7_gripper.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_8_gripper_prop" file="vx300s_8_gripper_prop.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_9_gripper_bar" file="vx300s_9_gripper_bar.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_10_gripper_finger_left" file="vx300s_10_custom_finger_left.stl" scale="0.001 0.001 0.001" />
+        <mesh name="vx300s_10_gripper_finger_right" file="vx300s_10_custom_finger_right.stl" scale="0.001 0.001 0.001" />
+    </asset>
+
+</mujocoinclude>
--- a/lerobot/common/envs/aloha/assets/vx300s_left.xml
+++ b/lerobot/common/envs/aloha/assets/vx300s_left.xml
@@ -0,0 +1,59 @@
+
+<mujocoinclude>
+    <body name="vx300s_left" pos="-0.469 0.5 0">
+        <geom quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_1_base" name="vx300s_left/1_base" contype="0" conaffinity="0"/>
+        <body name="vx300s_left/shoulder_link" pos="0 0 0.079">
+            <inertial pos="0.000259233 -3.3552e-06 0.0116129" quat="-0.476119 0.476083 0.52279 0.522826" mass="0.798614" diaginertia="0.00120156 0.00113744 0.0009388" />
+            <joint name="vx300s_left/waist" pos="0 0 0" axis="0 0 1" limited="true" range="-3.14158 3.14158" frictionloss="50" />
+            <geom pos="0 0 -0.003" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_2_shoulder" name="vx300s_left/2_shoulder" />
+            <body name="vx300s_left/upper_arm_link" pos="0 0 0.04805">
+                <inertial pos="0.0206949 4e-10 0.226459" quat="0 0.0728458 0 0.997343" mass="0.792592" diaginertia="0.00911338 0.008925 0.000759317" />
+                <joint name="vx300s_left/shoulder" pos="0 0 0" axis="0 1 0" limited="true" range="-1.85005 1.25664" frictionloss="60" />
+                <geom quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_3_upper_arm" name="vx300s_left/3_upper_arm"/>
+                <body name="vx300s_left/upper_forearm_link" pos="0.05955 0 0.3">
+                    <inertial pos="0.105723 0 0" quat="-0.000621631 0.704724 0.0105292 0.709403" mass="0.322228" diaginertia="0.00144107 0.00134228 0.000152047" />
+                    <joint name="vx300s_left/elbow" pos="0 0 0" axis="0 1 0" limited="true" range="-1.76278 1.6057" frictionloss="60" />
+                    <geom type="mesh" mesh="vx300s_4_upper_forearm" name="vx300s_left/4_upper_forearm" />
+                    <body name="vx300s_left/lower_forearm_link" pos="0.2 0 0">
+                        <inertial pos="0.0513477 0.00680462 0" quat="-0.702604 -0.0796724 -0.702604 0.0796724" mass="0.414823" diaginertia="0.0005911 0.000546493 0.000155707" />
+                        <joint name="vx300s_left/forearm_roll" pos="0 0 0" axis="1 0 0" limited="true" range="-3.14158 3.14158" frictionloss="30" />
+                        <geom quat="0 1 0 0" type="mesh" mesh="vx300s_5_lower_forearm" name="vx300s_left/5_lower_forearm"/>
+                        <body name="vx300s_left/wrist_link" pos="0.1 0 0">
+                            <inertial pos="0.046743 -7.6652e-06 0.010565" quat="-0.00100191 0.544586 0.0026583 0.8387" mass="0.115395" diaginertia="5.45707e-05 4.63101e-05 4.32692e-05" />
+                            <joint name="vx300s_left/wrist_angle" pos="0 0 0" axis="0 1 0" limited="true" range="-1.8675 2.23402" frictionloss="30" />
+                            <geom quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_6_wrist" name="vx300s_left/6_wrist" />
+                            <body name="vx300s_left/gripper_link" pos="0.069744 0 0">
+                                <body name="vx300s_left/camera_focus" pos="0.15 0 0.01">
+                                    <site pos="0 0 0" size="0.01" type="sphere" name="left_cam_focus" rgba="0 0 1 0"/>
+                                </body>
+                                <site pos="0.15 0 0" size="0.003 0.003 0.03" type="box" name="cali_left_site1" rgba="0 0 1 0"/>
+                                <site pos="0.15 0 0" size="0.003 0.03 0.003" type="box" name="cali_left_site2" rgba="0 0 1 0"/>
+                                <site pos="0.15 0 0" size="0.03 0.003 0.003" type="box" name="cali_left_site3" rgba="0 0 1 0"/>
+                                <camera name="left_wrist" pos="-0.1 0 0.16" fovy="20" mode="targetbody" target="vx300s_left/camera_focus"/>
+                                <inertial pos="0.0395662 -2.56311e-07 0.00400649" quat="0.62033 0.619916 -0.339682 0.339869" mass="0.251652" diaginertia="0.000689546 0.000650316 0.000468142" />
+                                <joint name="vx300s_left/wrist_rotate" pos="0 0 0" axis="1 0 0" limited="true" range="-3.14158 3.14158" frictionloss="30" />
+                                <geom pos="-0.02 0 0" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_7_gripper" name="vx300s_left/7_gripper" />
+                                <geom pos="-0.020175 0 0" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_9_gripper_bar" name="vx300s_left/9_gripper_bar" />
+                                <body name="vx300s_left/gripper_prop_link" pos="0.0485 0 0">
+                                    <inertial pos="0.002378 2.85e-08 0" quat="0 0 0.897698 0.440611" mass="0.008009" diaginertia="4.2979e-06 2.8868e-06 1.5314e-06" />
+<!--                                    <joint name="vx300s_left/gripper" pos="0 0 0" axis="1 0 0" frictionloss="30" />-->
+                                    <geom pos="-0.0685 0 0" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_8_gripper_prop" name="vx300s_left/8_gripper_prop" />
+                                </body>
+                                <body name="vx300s_left/left_finger_link" pos="0.0687 0 0">
+                                    <inertial pos="0.017344 -0.0060692 0" quat="0.449364 0.449364 -0.54596 -0.54596" mass="0.034796" diaginertia="2.48003e-05 1.417e-05 1.20797e-05" />
+                                    <joint name="vx300s_left/left_finger" pos="0 0 0" axis="0 1 0" type="slide" limited="true" range="0.021 0.057" frictionloss="30" />
+                                    <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0.005 -0.052 0" euler="3.14 1.57 0" type="mesh" mesh="vx300s_10_gripper_finger_left" name="vx300s_left/10_left_gripper_finger"/>
+                                </body>
+                                <body name="vx300s_left/right_finger_link" pos="0.0687 0 0">
+                                    <inertial pos="0.017344 0.0060692 0" quat="0.44937 -0.44937 0.545955 -0.545955" mass="0.034796" diaginertia="2.48002e-05 1.417e-05 1.20798e-05" />
+                                    <joint name="vx300s_left/right_finger" pos="0 0 0" axis="0 1 0" type="slide" limited="true" range="-0.057 -0.021" frictionloss="30" />
+                                    <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0.005 0.052 0" euler="3.14 1.57 0" type="mesh" mesh="vx300s_10_gripper_finger_right" name="vx300s_left/10_right_gripper_finger"/>
+                                </body>
+                            </body>
+                        </body>
+                    </body>
+                </body>
+            </body>
+        </body>
+    </body>
+</mujocoinclude>
--- a/lerobot/common/envs/aloha/assets/vx300s_right.xml
+++ b/lerobot/common/envs/aloha/assets/vx300s_right.xml
@@ -0,0 +1,59 @@
+
+<mujocoinclude>
+    <body name="vx300s_right" pos="0.469 0.5 0" euler="0 0 3.1416">
+        <geom quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_1_base" name="vx300s_right/1_base" contype="0" conaffinity="0"/>
+        <body name="vx300s_right/shoulder_link" pos="0 0 0.079">
+            <inertial pos="0.000259233 -3.3552e-06 0.0116129" quat="-0.476119 0.476083 0.52279 0.522826" mass="0.798614" diaginertia="0.00120156 0.00113744 0.0009388" />
+            <joint name="vx300s_right/waist" pos="0 0 0" axis="0 0 1" limited="true" range="-3.14158 3.14158" frictionloss="50" />
+            <geom pos="0 0 -0.003" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_2_shoulder" name="vx300s_right/2_shoulder" />
+            <body name="vx300s_right/upper_arm_link" pos="0 0 0.04805">
+                <inertial pos="0.0206949 4e-10 0.226459" quat="0 0.0728458 0 0.997343" mass="0.792592" diaginertia="0.00911338 0.008925 0.000759317" />
+                <joint name="vx300s_right/shoulder" pos="0 0 0" axis="0 1 0" limited="true" range="-1.85005 1.25664" frictionloss="60" />
+                <geom quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_3_upper_arm" name="vx300s_right/3_upper_arm"/>
+                <body name="vx300s_right/upper_forearm_link" pos="0.05955 0 0.3">
+                    <inertial pos="0.105723 0 0" quat="-0.000621631 0.704724 0.0105292 0.709403" mass="0.322228" diaginertia="0.00144107 0.00134228 0.000152047" />
+                    <joint name="vx300s_right/elbow" pos="0 0 0" axis="0 1 0" limited="true" range="-1.76278 1.6057" frictionloss="60" />
+                    <geom type="mesh" mesh="vx300s_4_upper_forearm" name="vx300s_right/4_upper_forearm" />
+                    <body name="vx300s_right/lower_forearm_link" pos="0.2 0 0">
+                        <inertial pos="0.0513477 0.00680462 0" quat="-0.702604 -0.0796724 -0.702604 0.0796724" mass="0.414823" diaginertia="0.0005911 0.000546493 0.000155707" />
+                        <joint name="vx300s_right/forearm_roll" pos="0 0 0" axis="1 0 0" limited="true" range="-3.14158 3.14158" frictionloss="30" />
+                        <geom quat="0 1 0 0" type="mesh" mesh="vx300s_5_lower_forearm" name="vx300s_right/5_lower_forearm"/>
+                        <body name="vx300s_right/wrist_link" pos="0.1 0 0">
+                            <inertial pos="0.046743 -7.6652e-06 0.010565" quat="-0.00100191 0.544586 0.0026583 0.8387" mass="0.115395" diaginertia="5.45707e-05 4.63101e-05 4.32692e-05" />
+                            <joint name="vx300s_right/wrist_angle" pos="0 0 0" axis="0 1 0" limited="true" range="-1.8675 2.23402" frictionloss="30" />
+                            <geom quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_6_wrist" name="vx300s_right/6_wrist" />
+                            <body name="vx300s_right/gripper_link" pos="0.069744 0 0">
+                                <body name="vx300s_right/camera_focus" pos="0.15 0 0.01">
+                                    <site pos="0 0 0" size="0.01" type="sphere" name="right_cam_focus" rgba="0 0 1 0"/>
+                                </body>
+                                <site pos="0.15 0 0" size="0.003 0.003 0.03" type="box" name="cali_right_site1" rgba="0 0 1 0"/>
+                                <site pos="0.15 0 0" size="0.003 0.03 0.003" type="box" name="cali_right_site2" rgba="0 0 1 0"/>
+                                <site pos="0.15 0 0" size="0.03 0.003 0.003" type="box" name="cali_right_site3" rgba="0 0 1 0"/>
+                                <camera name="right_wrist" pos="-0.1 0 0.16" fovy="20" mode="targetbody" target="vx300s_right/camera_focus"/>
+                                <inertial pos="0.0395662 -2.56311e-07 0.00400649" quat="0.62033 0.619916 -0.339682 0.339869" mass="0.251652" diaginertia="0.000689546 0.000650316 0.000468142" />
+                                <joint name="vx300s_right/wrist_rotate" pos="0 0 0" axis="1 0 0" limited="true" range="-3.14158 3.14158" frictionloss="30" />
+                                <geom pos="-0.02 0 0" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_7_gripper" name="vx300s_right/7_gripper" />
+                                <geom pos="-0.020175 0 0" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_9_gripper_bar" name="vx300s_right/9_gripper_bar" />
+                                <body name="vx300s_right/gripper_prop_link" pos="0.0485 0 0">
+                                    <inertial pos="0.002378 2.85e-08 0" quat="0 0 0.897698 0.440611" mass="0.008009" diaginertia="4.2979e-06 2.8868e-06 1.5314e-06" />
+<!--                                    <joint name="vx300s_right/gripper" pos="0 0 0" axis="1 0 0" frictionloss="30" />-->
+                                    <geom pos="-0.0685 0 0" quat="0.707107 0 0 0.707107" type="mesh" mesh="vx300s_8_gripper_prop" name="vx300s_right/8_gripper_prop" />
+                                </body>
+                                <body name="vx300s_right/left_finger_link" pos="0.0687 0 0">
+                                    <inertial pos="0.017344 -0.0060692 0" quat="0.449364 0.449364 -0.54596 -0.54596" mass="0.034796" diaginertia="2.48003e-05 1.417e-05 1.20797e-05" />
+                                    <joint name="vx300s_right/left_finger" pos="0 0 0" axis="0 1 0" type="slide" limited="true" range="0.021 0.057" frictionloss="30" />
+                                    <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0.005 -0.052 0" euler="3.14 1.57 0" type="mesh" mesh="vx300s_10_gripper_finger_left" name="vx300s_right/10_left_gripper_finger"/>
+                                </body>
+                                <body name="vx300s_right/right_finger_link" pos="0.0687 0 0">
+                                    <inertial pos="0.017344 0.0060692 0" quat="0.44937 -0.44937 0.545955 -0.545955" mass="0.034796" diaginertia="2.48002e-05 1.417e-05 1.20798e-05" />
+                                    <joint name="vx300s_right/right_finger" pos="0 0 0" axis="0 1 0" type="slide" limited="true" range="-0.057 -0.021" frictionloss="30" />
+                                    <geom condim="4" solimp="2 1 0.01" solref="0.01 1" friction="1 0.005 0.0001" pos="0.005 0.052 0" euler="3.14 1.57 0" type="mesh" mesh="vx300s_10_gripper_finger_right" name="vx300s_right/10_right_gripper_finger"/>
+                                </body>
+                            </body>
+                        </body>
+                    </body>
+                </body>
+            </body>
+        </body>
+    </body>
+</mujocoinclude>
--- a/lerobot/common/envs/aloha/constants.py
+++ b/lerobot/common/envs/aloha/constants.py
@@ -0,0 +1,163 @@
+from pathlib import Path
+
+### Simulation envs fixed constants
+DT = 0.02  # 0.02 ms -> 1/0.2 = 50 hz
+FPS = 50
+
+
+JOINTS = [
+    # absolute joint position
+    "left_arm_waist",
+    "left_arm_shoulder",
+    "left_arm_elbow",
+    "left_arm_forearm_roll",
+    "left_arm_wrist_angle",
+    "left_arm_wrist_rotate",
+    # normalized gripper position 0: close, 1: open
+    "left_arm_gripper",
+    # absolute joint position
+    "right_arm_waist",
+    "right_arm_shoulder",
+    "right_arm_elbow",
+    "right_arm_forearm_roll",
+    "right_arm_wrist_angle",
+    "right_arm_wrist_rotate",
+    # normalized gripper position 0: close, 1: open
+    "right_arm_gripper",
+]
+
+ACTIONS = [
+    # position and quaternion for end effector
+    "left_arm_waist",
+    "left_arm_shoulder",
+    "left_arm_elbow",
+    "left_arm_forearm_roll",
+    "left_arm_wrist_angle",
+    "left_arm_wrist_rotate",
+    # normalized gripper position (0: close, 1: open)
+    "left_arm_gripper",
+    "right_arm_waist",
+    "right_arm_shoulder",
+    "right_arm_elbow",
+    "right_arm_forearm_roll",
+    "right_arm_wrist_angle",
+    "right_arm_wrist_rotate",
+    # normalized gripper position (0: close, 1: open)
+    "right_arm_gripper",
+]
+
+
+START_ARM_POSE = [
+    0,
+    -0.96,
+    1.16,
+    0,
+    -0.3,
+    0,
+    0.02239,
+    -0.02239,
+    0,
+    -0.96,
+    1.16,
+    0,
+    -0.3,
+    0,
+    0.02239,
+    -0.02239,
+]
+
+ASSETS_DIR = Path(__file__).parent.resolve() / "assets"  # note: absolute path
+
+# Left finger position limits (qpos[7]), right_finger = -1 * left_finger
+MASTER_GRIPPER_POSITION_OPEN = 0.02417
+MASTER_GRIPPER_POSITION_CLOSE = 0.01244
+PUPPET_GRIPPER_POSITION_OPEN = 0.05800
+PUPPET_GRIPPER_POSITION_CLOSE = 0.01844
+
+# Gripper joint limits (qpos[6])
+MASTER_GRIPPER_JOINT_OPEN = 0.3083
+MASTER_GRIPPER_JOINT_CLOSE = -0.6842
+PUPPET_GRIPPER_JOINT_OPEN = 1.4910
+PUPPET_GRIPPER_JOINT_CLOSE = -0.6213
+
+MASTER_GRIPPER_JOINT_MID = (MASTER_GRIPPER_JOINT_OPEN + MASTER_GRIPPER_JOINT_CLOSE) / 2
+
+############################ Helper functions ############################
+
+
+def normalize_master_gripper_position(x):
+    return (x - MASTER_GRIPPER_POSITION_CLOSE) / (
+        MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE
+    )
+
+
+def normalize_puppet_gripper_position(x):
+    return (x - PUPPET_GRIPPER_POSITION_CLOSE) / (
+        PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE
+    )
+
+
+def unnormalize_master_gripper_position(x):
+    return x * (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) + MASTER_GRIPPER_POSITION_CLOSE
+
+
+def unnormalize_puppet_gripper_position(x):
+    return x * (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) + PUPPET_GRIPPER_POSITION_CLOSE
+
+
+def convert_position_from_master_to_puppet(x):
+    return unnormalize_puppet_gripper_position(normalize_master_gripper_position(x))
+
+
+def normalizer_master_gripper_joint(x):
+    return (x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
+
+
+def normalize_puppet_gripper_joint(x):
+    return (x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
+
+
+def unnormalize_master_gripper_joint(x):
+    return x * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE
+
+
+def unnormalize_puppet_gripper_joint(x):
+    return x * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE
+
+
+def convert_join_from_master_to_puppet(x):
+    return unnormalize_puppet_gripper_joint(normalizer_master_gripper_joint(x))
+
+
+def normalize_master_gripper_velocity(x):
+    return x / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE)
+
+
+def normalize_puppet_gripper_velocity(x):
+    return x / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE)
+
+
+def convert_master_from_position_to_joint(x):
+    return (
+        normalize_master_gripper_position(x) * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
+        + MASTER_GRIPPER_JOINT_CLOSE
+    )
+
+
+def convert_master_from_joint_to_position(x):
+    return unnormalize_master_gripper_position(
+        (x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
+    )
+
+
+def convert_puppet_from_position_to_join(x):
+    return (
+        normalize_puppet_gripper_position(x) * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
+        + PUPPET_GRIPPER_JOINT_CLOSE
+    )
+
+
+def convert_puppet_from_joint_to_position(x):
+    return unnormalize_puppet_gripper_position(
+        (x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
+    )
--- a/lerobot/common/envs/aloha/env.py
+++ b/lerobot/common/envs/aloha/env.py
@@ -0,0 +1,311 @@
+import importlib
+import logging
+from collections import deque
+from typing import Optional
+
+import einops
+import numpy as np
+import torch
+from dm_control import mujoco
+from dm_control.rl import control
+from tensordict import TensorDict
+from torchrl.data.tensor_specs import (
+    BoundedTensorSpec,
+    CompositeSpec,
+    DiscreteTensorSpec,
+    UnboundedContinuousTensorSpec,
+)
+
+from lerobot.common.envs.abstract import AbstractEnv
+from lerobot.common.envs.aloha.constants import (
+    ACTIONS,
+    ASSETS_DIR,
+    DT,
+    JOINTS,
+)
+from lerobot.common.envs.aloha.tasks.sim import BOX_POSE, InsertionTask, TransferCubeTask
+from lerobot.common.envs.aloha.tasks.sim_end_effector import (
+    InsertionEndEffectorTask,
+    TransferCubeEndEffectorTask,
+)
+from lerobot.common.envs.aloha.utils import sample_box_pose, sample_insertion_pose
+from lerobot.common.utils import set_seed
+
+_has_gym = importlib.util.find_spec("gym") is not None
+
+
+class AlohaEnv(AbstractEnv):
+    def __init__(
+        self,
+        task,
+        frame_skip: int = 1,
+        from_pixels: bool = False,
+        pixels_only: bool = False,
+        image_size=None,
+        seed=1337,
+        device="cpu",
+        num_prev_obs=1,
+        num_prev_action=0,
+    ):
+        super().__init__(
+            task=task,
+            frame_skip=frame_skip,
+            from_pixels=from_pixels,
+            pixels_only=pixels_only,
+            image_size=image_size,
+            seed=seed,
+            device=device,
+            num_prev_obs=num_prev_obs,
+            num_prev_action=num_prev_action,
+        )
+
+    def _make_env(self):
+        if not _has_gym:
+            raise ImportError("Cannot import gym.")
+
+        if not self.from_pixels:
+            raise NotImplementedError()
+
+        self._env = self._make_env_task(self.task)
+
+    def render(self, mode="rgb_array", width=640, height=480):
+        # TODO(rcadene): render and visualizer several cameras (e.g. angle, front_close)
+        image = self._env.physics.render(height=height, width=width, camera_id="top")
+        return image
+
+    def _make_env_task(self, task_name):
+        # time limit is controlled by StepCounter in env factory
+        time_limit = float("inf")
+
+        if "sim_transfer_cube" in task_name:
+            xml_path = ASSETS_DIR / "bimanual_viperx_transfer_cube.xml"
+            physics = mujoco.Physics.from_xml_path(str(xml_path))
+            task = TransferCubeTask(random=False)
+        elif "sim_insertion" in task_name:
+            xml_path = ASSETS_DIR / "bimanual_viperx_insertion.xml"
+            physics = mujoco.Physics.from_xml_path(str(xml_path))
+            task = InsertionTask(random=False)
+        elif "sim_end_effector_transfer_cube" in task_name:
+            raise NotImplementedError()
+            xml_path = ASSETS_DIR / "bimanual_viperx_end_effector_transfer_cube.xml"
+            physics = mujoco.Physics.from_xml_path(str(xml_path))
+            task = TransferCubeEndEffectorTask(random=False)
+        elif "sim_end_effector_insertion" in task_name:
+            raise NotImplementedError()
+            xml_path = ASSETS_DIR / "bimanual_viperx_end_effector_insertion.xml"
+            physics = mujoco.Physics.from_xml_path(str(xml_path))
+            task = InsertionEndEffectorTask(random=False)
+        else:
+            raise NotImplementedError(task_name)
+
+        env = control.Environment(
+            physics, task, time_limit, control_timestep=DT, n_sub_steps=None, flat_observation=False
+        )
+        return env
+
+    def _format_raw_obs(self, raw_obs):
+        if self.from_pixels:
+            image = torch.from_numpy(raw_obs["images"]["top"].copy())
+            image = einops.rearrange(image, "h w c -> c h w")
+            assert image.dtype == torch.uint8
+            obs = {"image": {"top": image}}
+
+            if not self.pixels_only:
+                obs["state"] = torch.from_numpy(raw_obs["qpos"]).type(torch.float32)
+        else:
+            # TODO(rcadene):
+            raise NotImplementedError()
+            # obs = {"state": torch.from_numpy(raw_obs["observation"]).type(torch.float32)}
+
+        return obs
+
+    def _reset(self, tensordict: Optional[TensorDict] = None):
+        td = tensordict
+        if td is None or td.is_empty():
+            # we need to handle seed iteration, since self._env.reset() rely an internal _seed.
+            self._current_seed += 1
+            self.set_seed(self._current_seed)
+
+            # TODO(rcadene): do not use global variable for this
+            if "sim_transfer_cube" in self.task:
+                BOX_POSE[0] = sample_box_pose()  # used in sim reset
+            elif "sim_insertion" in self.task:
+                BOX_POSE[0] = np.concatenate(sample_insertion_pose())  # used in sim reset
+
+            raw_obs = self._env.reset()
+            # TODO(rcadene): add assert
+            # assert self._current_seed == self._env._seed
+
+            obs = self._format_raw_obs(raw_obs.observation)
+
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue = deque(
+                        [obs["image"]["top"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["image"] = {"top": torch.stack(list(self._prev_obs_image_queue))}
+                if "state" in obs:
+                    self._prev_obs_state_queue = deque(
+                        [obs["state"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs
+
+            td = TensorDict(
+                {
+                    "observation": TensorDict(obs, batch_size=[]),
+                    "done": torch.tensor([False], dtype=torch.bool),
+                },
+                batch_size=[],
+            )
+        else:
+            raise NotImplementedError()
+
+        self.call_rendering_hooks()
+        return td
+
+    def _step(self, tensordict: TensorDict):
+        td = tensordict
+        action = td["action"].numpy()
+        # step expects shape=(4,) so we pad if necessary
+        # TODO(rcadene): add info["is_success"] and info["success"] ?
+        sum_reward = 0
+
+        if action.ndim == 1:
+            action = einops.repeat(action, "c -> t c", t=self.frame_skip)
+        else:
+            if self.frame_skip > 1:
+                raise NotImplementedError()
+
+        num_action_steps = action.shape[0]
+        for i in range(num_action_steps):
+            _, reward, discount, raw_obs = self._env.step(action[i])
+            del discount  # not used
+
+            # TOOD(rcadene): add an enum
+            success = done = reward == 4
+            sum_reward += reward
+            obs = self._format_raw_obs(raw_obs)
+
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue.append(obs["image"]["top"])
+                    stacked_obs["image"] = {"top": torch.stack(list(self._prev_obs_image_queue))}
+                if "state" in obs:
+                    self._prev_obs_state_queue.append(obs["state"])
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs
+
+            self.call_rendering_hooks()
+
+        td = TensorDict(
+            {
+                "observation": TensorDict(obs, batch_size=[]),
+                "reward": torch.tensor([sum_reward], dtype=torch.float32),
+                # succes and done are true when coverage > self.success_threshold in env
+                "done": torch.tensor([done], dtype=torch.bool),
+                "success": torch.tensor([success], dtype=torch.bool),
+            },
+            batch_size=[],
+        )
+        return td
+
+    def _make_spec(self):
+        obs = {}
+        from omegaconf import OmegaConf
+
+        if self.from_pixels:
+            if isinstance(self.image_size, int):
+                image_shape = (3, self.image_size, self.image_size)
+            elif OmegaConf.is_list(self.image_size):
+                assert len(self.image_size) == 3  # c h w
+                assert self.image_size[0] == 3  # c is RGB
+                image_shape = tuple(self.image_size)
+            else:
+                raise ValueError(self.image_size)
+            if self.num_prev_obs > 0:
+                image_shape = (self.num_prev_obs + 1, *image_shape)
+
+            obs["image"] = {
+                "top": BoundedTensorSpec(
+                    low=0,
+                    high=255,
+                    shape=image_shape,
+                    dtype=torch.uint8,
+                    device=self.device,
+                )
+            }
+            if not self.pixels_only:
+                state_shape = (len(JOINTS),)
+                if self.num_prev_obs > 0:
+                    state_shape = (self.num_prev_obs + 1, *state_shape)
+
+                obs["state"] = UnboundedContinuousTensorSpec(
+                    # TODO: add low and high bounds
+                    shape=state_shape,
+                    dtype=torch.float32,
+                    device=self.device,
+                )
+        else:
+            # TODO(rcadene): add observation_space achieved_goal and desired_goal?
+            state_shape = (len(JOINTS),)
+            if self.num_prev_obs > 0:
+                state_shape = (self.num_prev_obs + 1, *state_shape)
+
+            obs["state"] = UnboundedContinuousTensorSpec(
+                # TODO: add low and high bounds
+                shape=state_shape,
+                dtype=torch.float32,
+                device=self.device,
+            )
+        self.observation_spec = CompositeSpec({"observation": obs})
+
+        # TODO(rcadene): valid when controling end effector?
+        # action_space = self._env.action_spec()
+        # self.action_spec = BoundedTensorSpec(
+        #     low=action_space.minimum,
+        #     high=action_space.maximum,
+        #     shape=action_space.shape,
+        #     dtype=torch.float32,
+        #     device=self.device,
+        # )
+
+        # TODO(rcaene): add bounds (where are they????)
+        self.action_spec = BoundedTensorSpec(
+            shape=(len(ACTIONS)),
+            low=-1,
+            high=1,
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+        self.reward_spec = UnboundedContinuousTensorSpec(
+            shape=(1,),
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+        self.done_spec = CompositeSpec(
+            {
+                "done": DiscreteTensorSpec(
+                    2,
+                    shape=(1,),
+                    dtype=torch.bool,
+                    device=self.device,
+                ),
+                "success": DiscreteTensorSpec(
+                    2,
+                    shape=(1,),
+                    dtype=torch.bool,
+                    device=self.device,
+                ),
+            }
+        )
+
+    def _set_seed(self, seed: Optional[int]):
+        set_seed(seed)
+        # TODO(rcadene): seed the env
+        # self._env.seed(seed)
+        logging.warning("Aloha env is not seeded")
--- a/lerobot/common/envs/aloha/tasks/sim.py
+++ b/lerobot/common/envs/aloha/tasks/sim.py
@@ -0,0 +1,219 @@
+import collections
+
+import numpy as np
+from dm_control.suite import base
+
+from lerobot.common.envs.aloha.constants import (
+    START_ARM_POSE,
+    normalize_puppet_gripper_position,
+    normalize_puppet_gripper_velocity,
+    unnormalize_puppet_gripper_position,
+)
+
+BOX_POSE = [None]  # to be changed from outside
+
+"""
+Environment for simulated robot bi-manual manipulation, with joint position control
+Action space:      [left_arm_qpos (6),             # absolute joint position
+                    left_gripper_positions (1),    # normalized gripper position (0: close, 1: open)
+                    right_arm_qpos (6),            # absolute joint position
+                    right_gripper_positions (1),]  # normalized gripper position (0: close, 1: open)
+
+Observation space: {"qpos": Concat[ left_arm_qpos (6),         # absolute joint position
+                                    left_gripper_position (1),  # normalized gripper position (0: close, 1: open)
+                                    right_arm_qpos (6),         # absolute joint position
+                                    right_gripper_qpos (1)]     # normalized gripper position (0: close, 1: open)
+                    "qvel": Concat[ left_arm_qvel (6),         # absolute joint velocity (rad)
+                                    left_gripper_velocity (1),  # normalized gripper velocity (pos: opening, neg: closing)
+                                    right_arm_qvel (6),         # absolute joint velocity (rad)
+                                    right_gripper_qvel (1)]     # normalized gripper velocity (pos: opening, neg: closing)
+                    "images": {"main": (480x640x3)}        # h, w, c, dtype='uint8'
+"""
+
+
+class BimanualViperXTask(base.Task):
+    def __init__(self, random=None):
+        super().__init__(random=random)
+
+    def before_step(self, action, physics):
+        left_arm_action = action[:6]
+        right_arm_action = action[7 : 7 + 6]
+        normalized_left_gripper_action = action[6]
+        normalized_right_gripper_action = action[7 + 6]
+
+        left_gripper_action = unnormalize_puppet_gripper_position(normalized_left_gripper_action)
+        right_gripper_action = unnormalize_puppet_gripper_position(normalized_right_gripper_action)
+
+        full_left_gripper_action = [left_gripper_action, -left_gripper_action]
+        full_right_gripper_action = [right_gripper_action, -right_gripper_action]
+
+        env_action = np.concatenate(
+            [left_arm_action, full_left_gripper_action, right_arm_action, full_right_gripper_action]
+        )
+        super().before_step(env_action, physics)
+        return
+
+    def initialize_episode(self, physics):
+        """Sets the state of the environment at the start of each episode."""
+        super().initialize_episode(physics)
+
+    @staticmethod
+    def get_qpos(physics):
+        qpos_raw = physics.data.qpos.copy()
+        left_qpos_raw = qpos_raw[:8]
+        right_qpos_raw = qpos_raw[8:16]
+        left_arm_qpos = left_qpos_raw[:6]
+        right_arm_qpos = right_qpos_raw[:6]
+        left_gripper_qpos = [normalize_puppet_gripper_position(left_qpos_raw[6])]
+        right_gripper_qpos = [normalize_puppet_gripper_position(right_qpos_raw[6])]
+        return np.concatenate([left_arm_qpos, left_gripper_qpos, right_arm_qpos, right_gripper_qpos])
+
+    @staticmethod
+    def get_qvel(physics):
+        qvel_raw = physics.data.qvel.copy()
+        left_qvel_raw = qvel_raw[:8]
+        right_qvel_raw = qvel_raw[8:16]
+        left_arm_qvel = left_qvel_raw[:6]
+        right_arm_qvel = right_qvel_raw[:6]
+        left_gripper_qvel = [normalize_puppet_gripper_velocity(left_qvel_raw[6])]
+        right_gripper_qvel = [normalize_puppet_gripper_velocity(right_qvel_raw[6])]
+        return np.concatenate([left_arm_qvel, left_gripper_qvel, right_arm_qvel, right_gripper_qvel])
+
+    @staticmethod
+    def get_env_state(physics):
+        raise NotImplementedError
+
+    def get_observation(self, physics):
+        obs = collections.OrderedDict()
+        obs["qpos"] = self.get_qpos(physics)
+        obs["qvel"] = self.get_qvel(physics)
+        obs["env_state"] = self.get_env_state(physics)
+        obs["images"] = {}
+        obs["images"]["top"] = physics.render(height=480, width=640, camera_id="top")
+        obs["images"]["angle"] = physics.render(height=480, width=640, camera_id="angle")
+        obs["images"]["vis"] = physics.render(height=480, width=640, camera_id="front_close")
+
+        return obs
+
+    def get_reward(self, physics):
+        # return whether left gripper is holding the box
+        raise NotImplementedError
+
+
+class TransferCubeTask(BimanualViperXTask):
+    def __init__(self, random=None):
+        super().__init__(random=random)
+        self.max_reward = 4
+
+    def initialize_episode(self, physics):
+        """Sets the state of the environment at the start of each episode."""
+        # TODO Notice: this function does not randomize the env configuration. Instead, set BOX_POSE from outside
+        # reset qpos, control and box position
+        with physics.reset_context():
+            physics.named.data.qpos[:16] = START_ARM_POSE
+            np.copyto(physics.data.ctrl, START_ARM_POSE)
+            assert BOX_POSE[0] is not None
+            physics.named.data.qpos[-7:] = BOX_POSE[0]
+            # print(f"{BOX_POSE=}")
+        super().initialize_episode(physics)
+
+    @staticmethod
+    def get_env_state(physics):
+        env_state = physics.data.qpos.copy()[16:]
+        return env_state
+
+    def get_reward(self, physics):
+        # return whether left gripper is holding the box
+        all_contact_pairs = []
+        for i_contact in range(physics.data.ncon):
+            id_geom_1 = physics.data.contact[i_contact].geom1
+            id_geom_2 = physics.data.contact[i_contact].geom2
+            name_geom_1 = physics.model.id2name(id_geom_1, "geom")
+            name_geom_2 = physics.model.id2name(id_geom_2, "geom")
+            contact_pair = (name_geom_1, name_geom_2)
+            all_contact_pairs.append(contact_pair)
+
+        touch_left_gripper = ("red_box", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+        touch_right_gripper = ("red_box", "vx300s_right/10_right_gripper_finger") in all_contact_pairs
+        touch_table = ("red_box", "table") in all_contact_pairs
+
+        reward = 0
+        if touch_right_gripper:
+            reward = 1
+        if touch_right_gripper and not touch_table:  # lifted
+            reward = 2
+        if touch_left_gripper:  # attempted transfer
+            reward = 3
+        if touch_left_gripper and not touch_table:  # successful transfer
+            reward = 4
+        return reward
+
+
+class InsertionTask(BimanualViperXTask):
+    def __init__(self, random=None):
+        super().__init__(random=random)
+        self.max_reward = 4
+
+    def initialize_episode(self, physics):
+        """Sets the state of the environment at the start of each episode."""
+        # TODO Notice: this function does not randomize the env configuration. Instead, set BOX_POSE from outside
+        # reset qpos, control and box position
+        with physics.reset_context():
+            physics.named.data.qpos[:16] = START_ARM_POSE
+            np.copyto(physics.data.ctrl, START_ARM_POSE)
+            assert BOX_POSE[0] is not None
+            physics.named.data.qpos[-7 * 2 :] = BOX_POSE[0]  # two objects
+            # print(f"{BOX_POSE=}")
+        super().initialize_episode(physics)
+
+    @staticmethod
+    def get_env_state(physics):
+        env_state = physics.data.qpos.copy()[16:]
+        return env_state
+
+    def get_reward(self, physics):
+        # return whether peg touches the pin
+        all_contact_pairs = []
+        for i_contact in range(physics.data.ncon):
+            id_geom_1 = physics.data.contact[i_contact].geom1
+            id_geom_2 = physics.data.contact[i_contact].geom2
+            name_geom_1 = physics.model.id2name(id_geom_1, "geom")
+            name_geom_2 = physics.model.id2name(id_geom_2, "geom")
+            contact_pair = (name_geom_1, name_geom_2)
+            all_contact_pairs.append(contact_pair)
+
+        touch_right_gripper = ("red_peg", "vx300s_right/10_right_gripper_finger") in all_contact_pairs
+        touch_left_gripper = (
+            ("socket-1", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+            or ("socket-2", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+            or ("socket-3", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+            or ("socket-4", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+        )
+
+        peg_touch_table = ("red_peg", "table") in all_contact_pairs
+        socket_touch_table = (
+            ("socket-1", "table") in all_contact_pairs
+            or ("socket-2", "table") in all_contact_pairs
+            or ("socket-3", "table") in all_contact_pairs
+            or ("socket-4", "table") in all_contact_pairs
+        )
+        peg_touch_socket = (
+            ("red_peg", "socket-1") in all_contact_pairs
+            or ("red_peg", "socket-2") in all_contact_pairs
+            or ("red_peg", "socket-3") in all_contact_pairs
+            or ("red_peg", "socket-4") in all_contact_pairs
+        )
+        pin_touched = ("red_peg", "pin") in all_contact_pairs
+
+        reward = 0
+        if touch_left_gripper and touch_right_gripper:  # touch both
+            reward = 1
+        if (
+            touch_left_gripper and touch_right_gripper and (not peg_touch_table) and (not socket_touch_table)
+        ):  # grasp both
+            reward = 2
+        if peg_touch_socket and (not peg_touch_table) and (not socket_touch_table):  # peg and socket touching
+            reward = 3
+        if pin_touched:  # successful insertion
+            reward = 4
+        return reward
--- a/lerobot/common/envs/aloha/tasks/sim_end_effector.py
+++ b/lerobot/common/envs/aloha/tasks/sim_end_effector.py
@@ -0,0 +1,263 @@
+import collections
+
+import numpy as np
+from dm_control.suite import base
+
+from lerobot.common.envs.aloha.constants import (
+    PUPPET_GRIPPER_POSITION_CLOSE,
+    START_ARM_POSE,
+    normalize_puppet_gripper_position,
+    normalize_puppet_gripper_velocity,
+    unnormalize_puppet_gripper_position,
+)
+from lerobot.common.envs.aloha.utils import sample_box_pose, sample_insertion_pose
+
+"""
+Environment for simulated robot bi-manual manipulation, with end-effector control.
+Action space:      [left_arm_pose (7),             # position and quaternion for end effector
+                    left_gripper_positions (1),    # normalized gripper position (0: close, 1: open)
+                    right_arm_pose (7),            # position and quaternion for end effector
+                    right_gripper_positions (1),]  # normalized gripper position (0: close, 1: open)
+
+Observation space: {"qpos": Concat[ left_arm_qpos (6),         # absolute joint position
+                                    left_gripper_position (1),  # normalized gripper position (0: close, 1: open)
+                                    right_arm_qpos (6),         # absolute joint position
+                                    right_gripper_qpos (1)]     # normalized gripper position (0: close, 1: open)
+                    "qvel": Concat[ left_arm_qvel (6),         # absolute joint velocity (rad)
+                                    left_gripper_velocity (1),  # normalized gripper velocity (pos: opening, neg: closing)
+                                    right_arm_qvel (6),         # absolute joint velocity (rad)
+                                    right_gripper_qvel (1)]     # normalized gripper velocity (pos: opening, neg: closing)
+                    "images": {"main": (480x640x3)}        # h, w, c, dtype='uint8'
+"""
+
+
+class BimanualViperXEndEffectorTask(base.Task):
+    def __init__(self, random=None):
+        super().__init__(random=random)
+
+    def before_step(self, action, physics):
+        a_len = len(action) // 2
+        action_left = action[:a_len]
+        action_right = action[a_len:]
+
+        # set mocap position and quat
+        # left
+        np.copyto(physics.data.mocap_pos[0], action_left[:3])
+        np.copyto(physics.data.mocap_quat[0], action_left[3:7])
+        # right
+        np.copyto(physics.data.mocap_pos[1], action_right[:3])
+        np.copyto(physics.data.mocap_quat[1], action_right[3:7])
+
+        # set gripper
+        g_left_ctrl = unnormalize_puppet_gripper_position(action_left[7])
+        g_right_ctrl = unnormalize_puppet_gripper_position(action_right[7])
+        np.copyto(physics.data.ctrl, np.array([g_left_ctrl, -g_left_ctrl, g_right_ctrl, -g_right_ctrl]))
+
+    def initialize_robots(self, physics):
+        # reset joint position
+        physics.named.data.qpos[:16] = START_ARM_POSE
+
+        # reset mocap to align with end effector
+        # to obtain these numbers:
+        # (1) make an ee_sim env and reset to the same start_pose
+        # (2) get env._physics.named.data.xpos['vx300s_left/gripper_link']
+        #     get env._physics.named.data.xquat['vx300s_left/gripper_link']
+        #     repeat the same for right side
+        np.copyto(physics.data.mocap_pos[0], [-0.31718881, 0.5, 0.29525084])
+        np.copyto(physics.data.mocap_quat[0], [1, 0, 0, 0])
+        # right
+        np.copyto(physics.data.mocap_pos[1], np.array([0.31718881, 0.49999888, 0.29525084]))
+        np.copyto(physics.data.mocap_quat[1], [1, 0, 0, 0])
+
+        # reset gripper control
+        close_gripper_control = np.array(
+            [
+                PUPPET_GRIPPER_POSITION_CLOSE,
+                -PUPPET_GRIPPER_POSITION_CLOSE,
+                PUPPET_GRIPPER_POSITION_CLOSE,
+                -PUPPET_GRIPPER_POSITION_CLOSE,
+            ]
+        )
+        np.copyto(physics.data.ctrl, close_gripper_control)
+
+    def initialize_episode(self, physics):
+        """Sets the state of the environment at the start of each episode."""
+        super().initialize_episode(physics)
+
+    @staticmethod
+    def get_qpos(physics):
+        qpos_raw = physics.data.qpos.copy()
+        left_qpos_raw = qpos_raw[:8]
+        right_qpos_raw = qpos_raw[8:16]
+        left_arm_qpos = left_qpos_raw[:6]
+        right_arm_qpos = right_qpos_raw[:6]
+        left_gripper_qpos = [normalize_puppet_gripper_position(left_qpos_raw[6])]
+        right_gripper_qpos = [normalize_puppet_gripper_position(right_qpos_raw[6])]
+        return np.concatenate([left_arm_qpos, left_gripper_qpos, right_arm_qpos, right_gripper_qpos])
+
+    @staticmethod
+    def get_qvel(physics):
+        qvel_raw = physics.data.qvel.copy()
+        left_qvel_raw = qvel_raw[:8]
+        right_qvel_raw = qvel_raw[8:16]
+        left_arm_qvel = left_qvel_raw[:6]
+        right_arm_qvel = right_qvel_raw[:6]
+        left_gripper_qvel = [normalize_puppet_gripper_velocity(left_qvel_raw[6])]
+        right_gripper_qvel = [normalize_puppet_gripper_velocity(right_qvel_raw[6])]
+        return np.concatenate([left_arm_qvel, left_gripper_qvel, right_arm_qvel, right_gripper_qvel])
+
+    @staticmethod
+    def get_env_state(physics):
+        raise NotImplementedError
+
+    def get_observation(self, physics):
+        # note: it is important to do .copy()
+        obs = collections.OrderedDict()
+        obs["qpos"] = self.get_qpos(physics)
+        obs["qvel"] = self.get_qvel(physics)
+        obs["env_state"] = self.get_env_state(physics)
+        obs["images"] = {}
+        obs["images"]["top"] = physics.render(height=480, width=640, camera_id="top")
+        obs["images"]["angle"] = physics.render(height=480, width=640, camera_id="angle")
+        obs["images"]["vis"] = physics.render(height=480, width=640, camera_id="front_close")
+        # used in scripted policy to obtain starting pose
+        obs["mocap_pose_left"] = np.concatenate(
+            [physics.data.mocap_pos[0], physics.data.mocap_quat[0]]
+        ).copy()
+        obs["mocap_pose_right"] = np.concatenate(
+            [physics.data.mocap_pos[1], physics.data.mocap_quat[1]]
+        ).copy()
+
+        # used when replaying joint trajectory
+        obs["gripper_ctrl"] = physics.data.ctrl.copy()
+        return obs
+
+    def get_reward(self, physics):
+        raise NotImplementedError
+
+
+class TransferCubeEndEffectorTask(BimanualViperXEndEffectorTask):
+    def __init__(self, random=None):
+        super().__init__(random=random)
+        self.max_reward = 4
+
+    def initialize_episode(self, physics):
+        """Sets the state of the environment at the start of each episode."""
+        self.initialize_robots(physics)
+        # randomize box position
+        cube_pose = sample_box_pose()
+        box_start_idx = physics.model.name2id("red_box_joint", "joint")
+        np.copyto(physics.data.qpos[box_start_idx : box_start_idx + 7], cube_pose)
+        # print(f"randomized cube position to {cube_position}")
+
+        super().initialize_episode(physics)
+
+    @staticmethod
+    def get_env_state(physics):
+        env_state = physics.data.qpos.copy()[16:]
+        return env_state
+
+    def get_reward(self, physics):
+        # return whether left gripper is holding the box
+        all_contact_pairs = []
+        for i_contact in range(physics.data.ncon):
+            id_geom_1 = physics.data.contact[i_contact].geom1
+            id_geom_2 = physics.data.contact[i_contact].geom2
+            name_geom_1 = physics.model.id2name(id_geom_1, "geom")
+            name_geom_2 = physics.model.id2name(id_geom_2, "geom")
+            contact_pair = (name_geom_1, name_geom_2)
+            all_contact_pairs.append(contact_pair)
+
+        touch_left_gripper = ("red_box", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+        touch_right_gripper = ("red_box", "vx300s_right/10_right_gripper_finger") in all_contact_pairs
+        touch_table = ("red_box", "table") in all_contact_pairs
+
+        reward = 0
+        if touch_right_gripper:
+            reward = 1
+        if touch_right_gripper and not touch_table:  # lifted
+            reward = 2
+        if touch_left_gripper:  # attempted transfer
+            reward = 3
+        if touch_left_gripper and not touch_table:  # successful transfer
+            reward = 4
+        return reward
+
+
+class InsertionEndEffectorTask(BimanualViperXEndEffectorTask):
+    def __init__(self, random=None):
+        super().__init__(random=random)
+        self.max_reward = 4
+
+    def initialize_episode(self, physics):
+        """Sets the state of the environment at the start of each episode."""
+        self.initialize_robots(physics)
+        # randomize peg and socket position
+        peg_pose, socket_pose = sample_insertion_pose()
+
+        def id2index(j_id):
+            return 16 + (j_id - 16) * 7  # first 16 is robot qpos, 7 is pose dim # hacky
+
+        peg_start_id = physics.model.name2id("red_peg_joint", "joint")
+        peg_start_idx = id2index(peg_start_id)
+        np.copyto(physics.data.qpos[peg_start_idx : peg_start_idx + 7], peg_pose)
+        # print(f"randomized cube position to {cube_position}")
+
+        socket_start_id = physics.model.name2id("blue_socket_joint", "joint")
+        socket_start_idx = id2index(socket_start_id)
+        np.copyto(physics.data.qpos[socket_start_idx : socket_start_idx + 7], socket_pose)
+        # print(f"randomized cube position to {cube_position}")
+
+        super().initialize_episode(physics)
+
+    @staticmethod
+    def get_env_state(physics):
+        env_state = physics.data.qpos.copy()[16:]
+        return env_state
+
+    def get_reward(self, physics):
+        # return whether peg touches the pin
+        all_contact_pairs = []
+        for i_contact in range(physics.data.ncon):
+            id_geom_1 = physics.data.contact[i_contact].geom1
+            id_geom_2 = physics.data.contact[i_contact].geom2
+            name_geom_1 = physics.model.id2name(id_geom_1, "geom")
+            name_geom_2 = physics.model.id2name(id_geom_2, "geom")
+            contact_pair = (name_geom_1, name_geom_2)
+            all_contact_pairs.append(contact_pair)
+
+        touch_right_gripper = ("red_peg", "vx300s_right/10_right_gripper_finger") in all_contact_pairs
+        touch_left_gripper = (
+            ("socket-1", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+            or ("socket-2", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+            or ("socket-3", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+            or ("socket-4", "vx300s_left/10_left_gripper_finger") in all_contact_pairs
+        )
+
+        peg_touch_table = ("red_peg", "table") in all_contact_pairs
+        socket_touch_table = (
+            ("socket-1", "table") in all_contact_pairs
+            or ("socket-2", "table") in all_contact_pairs
+            or ("socket-3", "table") in all_contact_pairs
+            or ("socket-4", "table") in all_contact_pairs
+        )
+        peg_touch_socket = (
+            ("red_peg", "socket-1") in all_contact_pairs
+            or ("red_peg", "socket-2") in all_contact_pairs
+            or ("red_peg", "socket-3") in all_contact_pairs
+            or ("red_peg", "socket-4") in all_contact_pairs
+        )
+        pin_touched = ("red_peg", "pin") in all_contact_pairs
+
+        reward = 0
+        if touch_left_gripper and touch_right_gripper:  # touch both
+            reward = 1
+        if (
+            touch_left_gripper and touch_right_gripper and (not peg_touch_table) and (not socket_touch_table)
+        ):  # grasp both
+            reward = 2
+        if peg_touch_socket and (not peg_touch_table) and (not socket_touch_table):  # peg and socket touching
+            reward = 3
+        if pin_touched:  # successful insertion
+            reward = 4
+        return reward
--- a/lerobot/common/envs/aloha/utils.py
+++ b/lerobot/common/envs/aloha/utils.py
@@ -0,0 +1,39 @@
+import numpy as np
+
+
+def sample_box_pose():
+    x_range = [0.0, 0.2]
+    y_range = [0.4, 0.6]
+    z_range = [0.05, 0.05]
+
+    ranges = np.vstack([x_range, y_range, z_range])
+    cube_position = np.random.uniform(ranges[:, 0], ranges[:, 1])
+
+    cube_quat = np.array([1, 0, 0, 0])
+    return np.concatenate([cube_position, cube_quat])
+
+
+def sample_insertion_pose():
+    # Peg
+    x_range = [0.1, 0.2]
+    y_range = [0.4, 0.6]
+    z_range = [0.05, 0.05]
+
+    ranges = np.vstack([x_range, y_range, z_range])
+    peg_position = np.random.uniform(ranges[:, 0], ranges[:, 1])
+
+    peg_quat = np.array([1, 0, 0, 0])
+    peg_pose = np.concatenate([peg_position, peg_quat])
+
+    # Socket
+    x_range = [-0.2, -0.1]
+    y_range = [0.4, 0.6]
+    z_range = [0.05, 0.05]
+
+    ranges = np.vstack([x_range, y_range, z_range])
+    socket_position = np.random.uniform(ranges[:, 0], ranges[:, 1])
+
+    socket_quat = np.array([1, 0, 0, 0])
+    socket_pose = np.concatenate([socket_position, socket_quat])
+
+    return peg_pose, socket_pose
--- a/lerobot/common/envs/factory.py
+++ b/lerobot/common/envs/factory.py
@@ -1,4 +1,4 @@
-from torchrl.envs.transforms import StepCounter, TransformedEnv
+from torchrl.envs.transforms import Compose, StepCounter, Transform, TransformedEnv


 def make_env(cfg, transform=None):
@@ -18,11 +18,16 @@ def make_env(cfg, transform=None):
        kwargs["task"] = cfg.env.task
        clsfunc = SimxarmEnv
    elif cfg.env.name == "pusht":
-        from lerobot.common.envs.pusht import PushtEnv
+        from lerobot.common.envs.pusht.env import PushtEnv

        # assert kwargs["seed"] > 200, "Seed 0-200 are used for the demonstration dataset, so we don't want to seed the eval env with this range."

        clsfunc = PushtEnv
+    elif cfg.env.name == "aloha":
+        from lerobot.common.envs.aloha.env import AlohaEnv
+
+        kwargs["task"] = cfg.env.task
+        clsfunc = AlohaEnv
    else:
        raise ValueError(cfg.env.name)

@@ -33,7 +38,13 @@ def make_env(cfg, transform=None):

    if transform is not None:
        # useful to add normalization
-        env.append_transform(transform)
+        if isinstance(transform, Compose):
+            for tf in transform:
+                env.append_transform(tf.clone())
+        elif isinstance(transform, Transform):
+            env.append_transform(transform.clone())
+        else:
+            raise NotImplementedError()

    return env

--- a/lerobot/common/envs/pusht/env.py
+++ b/lerobot/common/envs/pusht/env.py
@@ -11,64 +11,52 @@ from torchrl.data.tensor_specs import (
    DiscreteTensorSpec,
    UnboundedContinuousTensorSpec,
 )
-from torchrl.envs import EnvBase
 from torchrl.envs.libs.gym import _gym_to_torchrl_spec_transform

+from lerobot.common.envs.abstract import AbstractEnv
 from lerobot.common.utils import set_seed

 _has_gym = importlib.util.find_spec("gym") is not None
-_has_diffpolicy = importlib.util.find_spec("diffusion_policy") is not None and _has_gym


-class PushtEnv(EnvBase):
+class PushtEnv(AbstractEnv):
    def __init__(
        self,
+        task="pusht",
        frame_skip: int = 1,
        from_pixels: bool = False,
        pixels_only: bool = False,
        image_size=None,
        seed=1337,
        device="cpu",
-        num_prev_obs=0,
+        num_prev_obs=1,
        num_prev_action=0,
    ):
-        super().__init__(device=device, batch_size=[])
-        self.frame_skip = frame_skip
-        self.from_pixels = from_pixels
-        self.pixels_only = pixels_only
-        self.image_size = image_size
-        self.num_prev_obs = num_prev_obs
-        self.num_prev_action = num_prev_action
+        super().__init__(
+            task=task,
+            frame_skip=frame_skip,
+            from_pixels=from_pixels,
+            pixels_only=pixels_only,
+            image_size=image_size,
+            seed=seed,
+            device=device,
+            num_prev_obs=num_prev_obs,
+            num_prev_action=num_prev_action,
+        )

-        if pixels_only:
-            assert from_pixels
-        if from_pixels:
-            assert image_size
-
-        if not _has_diffpolicy:
-            raise ImportError("Cannot import diffusion_policy.")
+    def _make_env(self):
        if not _has_gym:
            raise ImportError("Cannot import gym.")

        # TODO(rcadene) (PushTEnv is similar to PushTImageEnv, but without the image rendering, it's faster to iterate on)
-        # from diffusion_policy.env.pusht.pusht_env import PushTEnv
+        # from lerobot.common.envs.pusht.pusht_env import PushTEnv

-        if not from_pixels:
+        if not self.from_pixels:
            raise NotImplementedError("Use PushTEnv, instead of PushTImageEnv")
-        from diffusion_policy.env.pusht.pusht_image_env import PushTImageEnv
+        from lerobot.common.envs.pusht.pusht_image_env import PushTImageEnv

        self._env = PushTImageEnv(render_size=self.image_size)

-        self._make_spec()
-        self._current_seed = self.set_seed(seed)
-
-        if self.num_prev_obs > 0:
-            self._prev_obs_image_queue = deque(maxlen=self.num_prev_obs)
-            self._prev_obs_state_queue = deque(maxlen=self.num_prev_obs)
-        if self.num_prev_action > 0:
-            raise NotImplementedError()
-            # self._prev_action_queue = deque(maxlen=self.num_prev_action)
-
    def render(self, mode="rgb_array", width=384, height=384):
        if width != height:
            raise NotImplementedError()
@@ -125,6 +113,8 @@ class PushtEnv(EnvBase):
            )
        else:
            raise NotImplementedError()
+
+        self.call_rendering_hooks()
        return td

    def _step(self, tensordict: TensorDict):
@@ -157,6 +147,8 @@ class PushtEnv(EnvBase):
                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
                obs = stacked_obs

+            self.call_rendering_hooks()
+
        td = TensorDict(
            {
                "observation": TensorDict(obs, batch_size=[]),
@@ -178,9 +170,9 @@ class PushtEnv(EnvBase):

            obs["image"] = BoundedTensorSpec(
                low=0,
-                high=1,
+                high=255,
                shape=image_shape,
-                dtype=torch.float32,
+                dtype=torch.uint8,
                device=self.device,
            )
            if not self.pixels_only:
--- a/lerobot/common/envs/pusht/pusht_env.py
+++ b/lerobot/common/envs/pusht/pusht_env.py
@@ -0,0 +1,378 @@
+import collections
+
+import cv2
+import gym
+import numpy as np
+import pygame
+import pymunk
+import pymunk.pygame_util
+import shapely.geometry as sg
+import skimage.transform as st
+from gym import spaces
+from pymunk.vec2d import Vec2d
+
+from lerobot.common.envs.pusht.pymunk_override import DrawOptions
+
+
+def pymunk_to_shapely(body, shapes):
+    geoms = []
+    for shape in shapes:
+        if isinstance(shape, pymunk.shapes.Poly):
+            verts = [body.local_to_world(v) for v in shape.get_vertices()]
+            verts += [verts[0]]
+            geoms.append(sg.Polygon(verts))
+        else:
+            raise RuntimeError(f"Unsupported shape type {type(shape)}")
+    geom = sg.MultiPolygon(geoms)
+    return geom
+
+
+class PushTEnv(gym.Env):
+    metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 10}
+    reward_range = (0.0, 1.0)
+
+    def __init__(
+        self,
+        legacy=False,
+        block_cog=None,
+        damping=None,
+        render_action=True,
+        render_size=96,
+        reset_to_state=None,
+    ):
+        self._seed = None
+        self.seed()
+        self.window_size = ws = 512  # The size of the PyGame window
+        self.render_size = render_size
+        self.sim_hz = 100
+        # Local controller params.
+        self.k_p, self.k_v = 100, 20  # PD control.z
+        self.control_hz = self.metadata["video.frames_per_second"]
+        # legcay set_state for data compatibility
+        self.legacy = legacy
+
+        # agent_pos, block_pos, block_angle
+        self.observation_space = spaces.Box(
+            low=np.array([0, 0, 0, 0, 0], dtype=np.float64),
+            high=np.array([ws, ws, ws, ws, np.pi * 2], dtype=np.float64),
+            shape=(5,),
+            dtype=np.float64,
+        )
+
+        # positional goal for agent
+        self.action_space = spaces.Box(
+            low=np.array([0, 0], dtype=np.float64),
+            high=np.array([ws, ws], dtype=np.float64),
+            shape=(2,),
+            dtype=np.float64,
+        )
+
+        self.block_cog = block_cog
+        self.damping = damping
+        self.render_action = render_action
+
+        """
+        If human-rendering is used, `self.window` will be a reference
+        to the window that we draw to. `self.clock` will be a clock that is used
+        to ensure that the environment is rendered at the correct framerate in
+        human-mode. They will remain `None` until human-mode is used for the
+        first time.
+        """
+        self.window = None
+        self.clock = None
+        self.screen = None
+
+        self.space = None
+        self.teleop = None
+        self.render_buffer = None
+        self.latest_action = None
+        self.reset_to_state = reset_to_state
+
+    def reset(self):
+        seed = self._seed
+        self._setup()
+        if self.block_cog is not None:
+            self.block.center_of_gravity = self.block_cog
+        if self.damping is not None:
+            self.space.damping = self.damping
+
+        # use legacy RandomState for compatibility
+        state = self.reset_to_state
+        if state is None:
+            rs = np.random.RandomState(seed=seed)
+            state = np.array(
+                [
+                    rs.randint(50, 450),
+                    rs.randint(50, 450),
+                    rs.randint(100, 400),
+                    rs.randint(100, 400),
+                    rs.randn() * 2 * np.pi - np.pi,
+                ]
+            )
+        self._set_state(state)
+
+        observation = self._get_obs()
+        return observation
+
+    def step(self, action):
+        dt = 1.0 / self.sim_hz
+        self.n_contact_points = 0
+        n_steps = self.sim_hz // self.control_hz
+        if action is not None:
+            self.latest_action = action
+            for _ in range(n_steps):
+                # Step PD control.
+                # self.agent.velocity = self.k_p * (act - self.agent.position)    # P control works too.
+                acceleration = self.k_p * (action - self.agent.position) + self.k_v * (
+                    Vec2d(0, 0) - self.agent.velocity
+                )
+                self.agent.velocity += acceleration * dt
+
+                # Step physics.
+                self.space.step(dt)
+
+        # compute reward
+        goal_body = self._get_goal_pose_body(self.goal_pose)
+        goal_geom = pymunk_to_shapely(goal_body, self.block.shapes)
+        block_geom = pymunk_to_shapely(self.block, self.block.shapes)
+
+        intersection_area = goal_geom.intersection(block_geom).area
+        goal_area = goal_geom.area
+        coverage = intersection_area / goal_area
+        reward = np.clip(coverage / self.success_threshold, 0, 1)
+        done = coverage > self.success_threshold
+
+        observation = self._get_obs()
+        info = self._get_info()
+
+        return observation, reward, done, info
+
+    def render(self, mode):
+        return self._render_frame(mode)
+
+    def teleop_agent(self):
+        TeleopAgent = collections.namedtuple("TeleopAgent", ["act"])
+
+        def act(obs):
+            act = None
+            mouse_position = pymunk.pygame_util.from_pygame(Vec2d(*pygame.mouse.get_pos()), self.screen)
+            if self.teleop or (mouse_position - self.agent.position).length < 30:
+                self.teleop = True
+                act = mouse_position
+            return act
+
+        return TeleopAgent(act)
+
+    def _get_obs(self):
+        obs = np.array(
+            tuple(self.agent.position) + tuple(self.block.position) + (self.block.angle % (2 * np.pi),)
+        )
+        return obs
+
+    def _get_goal_pose_body(self, pose):
+        mass = 1
+        inertia = pymunk.moment_for_box(mass, (50, 100))
+        body = pymunk.Body(mass, inertia)
+        # preserving the legacy assignment order for compatibility
+        # the order here doesn't matter somehow, maybe because CoM is aligned with body origin
+        body.position = pose[:2].tolist()
+        body.angle = pose[2]
+        return body
+
+    def _get_info(self):
+        n_steps = self.sim_hz // self.control_hz
+        n_contact_points_per_step = int(np.ceil(self.n_contact_points / n_steps))
+        info = {
+            "pos_agent": np.array(self.agent.position),
+            "vel_agent": np.array(self.agent.velocity),
+            "block_pose": np.array(list(self.block.position) + [self.block.angle]),
+            "goal_pose": self.goal_pose,
+            "n_contacts": n_contact_points_per_step,
+        }
+        return info
+
+    def _render_frame(self, mode):
+        if self.window is None and mode == "human":
+            pygame.init()
+            pygame.display.init()
+            self.window = pygame.display.set_mode((self.window_size, self.window_size))
+        if self.clock is None and mode == "human":
+            self.clock = pygame.time.Clock()
+
+        canvas = pygame.Surface((self.window_size, self.window_size))
+        canvas.fill((255, 255, 255))
+        self.screen = canvas
+
+        draw_options = DrawOptions(canvas)
+
+        # Draw goal pose.
+        goal_body = self._get_goal_pose_body(self.goal_pose)
+        for shape in self.block.shapes:
+            goal_points = [
+                pymunk.pygame_util.to_pygame(goal_body.local_to_world(v), draw_options.surface)
+                for v in shape.get_vertices()
+            ]
+            goal_points += [goal_points[0]]
+            pygame.draw.polygon(canvas, self.goal_color, goal_points)
+
+        # Draw agent and block.
+        self.space.debug_draw(draw_options)
+
+        if mode == "human":
+            # The following line copies our drawings from `canvas` to the visible window
+            self.window.blit(canvas, canvas.get_rect())
+            pygame.event.pump()
+            pygame.display.update()
+
+            # the clock is already ticked during in step for "human"
+
+        img = np.transpose(np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2))
+        img = cv2.resize(img, (self.render_size, self.render_size))
+        if self.render_action and self.latest_action is not None:
+            action = np.array(self.latest_action)
+            coord = (action / 512 * 96).astype(np.int32)
+            marker_size = int(8 / 96 * self.render_size)
+            thickness = int(1 / 96 * self.render_size)
+            cv2.drawMarker(
+                img,
+                coord,
+                color=(255, 0, 0),
+                markerType=cv2.MARKER_CROSS,
+                markerSize=marker_size,
+                thickness=thickness,
+            )
+        return img
+
+    def close(self):
+        if self.window is not None:
+            pygame.display.quit()
+            pygame.quit()
+
+    def seed(self, seed=None):
+        if seed is None:
+            seed = np.random.randint(0, 25536)
+        self._seed = seed
+        self.np_random = np.random.default_rng(seed)
+
+    def _handle_collision(self, arbiter, space, data):
+        self.n_contact_points += len(arbiter.contact_point_set.points)
+
+    def _set_state(self, state):
+        if isinstance(state, np.ndarray):
+            state = state.tolist()
+        pos_agent = state[:2]
+        pos_block = state[2:4]
+        rot_block = state[4]
+        self.agent.position = pos_agent
+        # setting angle rotates with respect to center of mass
+        # therefore will modify the geometric position
+        # if not the same as CoM
+        # therefore should be modified first.
+        if self.legacy:
+            # for compatibility with legacy data
+            self.block.position = pos_block
+            self.block.angle = rot_block
+        else:
+            self.block.angle = rot_block
+            self.block.position = pos_block
+
+        # Run physics to take effect
+        self.space.step(1.0 / self.sim_hz)
+
+    def _set_state_local(self, state_local):
+        agent_pos_local = state_local[:2]
+        block_pose_local = state_local[2:]
+        tf_img_obj = st.AffineTransform(translation=self.goal_pose[:2], rotation=self.goal_pose[2])
+        tf_obj_new = st.AffineTransform(translation=block_pose_local[:2], rotation=block_pose_local[2])
+        tf_img_new = st.AffineTransform(matrix=tf_img_obj.params @ tf_obj_new.params)
+        agent_pos_new = tf_img_new(agent_pos_local)
+        new_state = np.array(list(agent_pos_new[0]) + list(tf_img_new.translation) + [tf_img_new.rotation])
+        self._set_state(new_state)
+        return new_state
+
+    def _setup(self):
+        self.space = pymunk.Space()
+        self.space.gravity = 0, 0
+        self.space.damping = 0
+        self.teleop = False
+        self.render_buffer = []
+
+        # Add walls.
+        walls = [
+            self._add_segment((5, 506), (5, 5), 2),
+            self._add_segment((5, 5), (506, 5), 2),
+            self._add_segment((506, 5), (506, 506), 2),
+            self._add_segment((5, 506), (506, 506), 2),
+        ]
+        self.space.add(*walls)
+
+        # Add agent, block, and goal zone.
+        self.agent = self.add_circle((256, 400), 15)
+        self.block = self.add_tee((256, 300), 0)
+        self.goal_color = pygame.Color("LightGreen")
+        self.goal_pose = np.array([256, 256, np.pi / 4])  # x, y, theta (in radians)
+
+        # Add collision handling
+        self.collision_handeler = self.space.add_collision_handler(0, 0)
+        self.collision_handeler.post_solve = self._handle_collision
+        self.n_contact_points = 0
+
+        self.max_score = 50 * 100
+        self.success_threshold = 0.95  # 95% coverage.
+
+    def _add_segment(self, a, b, radius):
+        shape = pymunk.Segment(self.space.static_body, a, b, radius)
+        shape.color = pygame.Color("LightGray")  # https://htmlcolorcodes.com/color-names
+        return shape
+
+    def add_circle(self, position, radius):
+        body = pymunk.Body(body_type=pymunk.Body.KINEMATIC)
+        body.position = position
+        body.friction = 1
+        shape = pymunk.Circle(body, radius)
+        shape.color = pygame.Color("RoyalBlue")
+        self.space.add(body, shape)
+        return body
+
+    def add_box(self, position, height, width):
+        mass = 1
+        inertia = pymunk.moment_for_box(mass, (height, width))
+        body = pymunk.Body(mass, inertia)
+        body.position = position
+        shape = pymunk.Poly.create_box(body, (height, width))
+        shape.color = pygame.Color("LightSlateGray")
+        self.space.add(body, shape)
+        return body
+
+    def add_tee(self, position, angle, scale=30, color="LightSlateGray", mask=None):
+        if mask is None:
+            mask = pymunk.ShapeFilter.ALL_MASKS()
+        mass = 1
+        length = 4
+        vertices1 = [
+            (-length * scale / 2, scale),
+            (length * scale / 2, scale),
+            (length * scale / 2, 0),
+            (-length * scale / 2, 0),
+        ]
+        inertia1 = pymunk.moment_for_poly(mass, vertices=vertices1)
+        vertices2 = [
+            (-scale / 2, scale),
+            (-scale / 2, length * scale),
+            (scale / 2, length * scale),
+            (scale / 2, scale),
+        ]
+        inertia2 = pymunk.moment_for_poly(mass, vertices=vertices1)
+        body = pymunk.Body(mass, inertia1 + inertia2)
+        shape1 = pymunk.Poly(body, vertices1)
+        shape2 = pymunk.Poly(body, vertices2)
+        shape1.color = pygame.Color(color)
+        shape2.color = pygame.Color(color)
+        shape1.filter = pymunk.ShapeFilter(mask=mask)
+        shape2.filter = pymunk.ShapeFilter(mask=mask)
+        body.center_of_gravity = (shape1.center_of_gravity + shape2.center_of_gravity) / 2
+        body.position = position
+        body.angle = angle
+        body.friction = 1
+        self.space.add(body, shape1, shape2)
+        return body
--- a/lerobot/common/envs/pusht/pusht_image_env.py
+++ b/lerobot/common/envs/pusht/pusht_image_env.py
@@ -0,0 +1,55 @@
+import cv2
+import numpy as np
+from gym import spaces
+
+from lerobot.common.envs.pusht.pusht_env import PushTEnv
+
+
+class PushTImageEnv(PushTEnv):
+    metadata = {"render.modes": ["rgb_array"], "video.frames_per_second": 10}
+
+    def __init__(self, legacy=False, block_cog=None, damping=None, render_size=96):
+        super().__init__(
+            legacy=legacy, block_cog=block_cog, damping=damping, render_size=render_size, render_action=False
+        )
+        ws = self.window_size
+        self.observation_space = spaces.Dict(
+            {
+                "image": spaces.Box(low=0, high=1, shape=(3, render_size, render_size), dtype=np.float32),
+                "agent_pos": spaces.Box(low=0, high=ws, shape=(2,), dtype=np.float32),
+            }
+        )
+        self.render_cache = None
+
+    def _get_obs(self):
+        img = super()._render_frame(mode="rgb_array")
+
+        agent_pos = np.array(self.agent.position)
+        img_obs = np.moveaxis(img, -1, 0)
+        obs = {"image": img_obs, "agent_pos": agent_pos}
+
+        # draw action
+        if self.latest_action is not None:
+            action = np.array(self.latest_action)
+            coord = (action / 512 * 96).astype(np.int32)
+            marker_size = int(8 / 96 * self.render_size)
+            thickness = int(1 / 96 * self.render_size)
+            cv2.drawMarker(
+                img,
+                coord,
+                color=(255, 0, 0),
+                markerType=cv2.MARKER_CROSS,
+                markerSize=marker_size,
+                thickness=thickness,
+            )
+        self.render_cache = img
+
+        return obs
+
+    def render(self, mode):
+        assert mode == "rgb_array"
+
+        if self.render_cache is None:
+            self._get_obs()
+
+        return self.render_cache
--- a/lerobot/common/envs/pusht/pymunk_override.py
+++ b/lerobot/common/envs/pusht/pymunk_override.py
@@ -0,0 +1,244 @@
+# ----------------------------------------------------------------------------
+# pymunk
+# Copyright (c) 2007-2016 Victor Blomqvist
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ----------------------------------------------------------------------------
+
+"""This submodule contains helper functions to help with quick prototyping
+using pymunk together with pygame.
+
+Intended to help with debugging and prototyping, not for actual production use
+in a full application. The methods contained in this module is opinionated
+about your coordinate system and not in any way optimized.
+"""
+
+__docformat__ = "reStructuredText"
+
+__all__ = [
+    "DrawOptions",
+    "get_mouse_pos",
+    "to_pygame",
+    "from_pygame",
+    # "lighten",
+    "positive_y_is_up",
+]
+
+from typing import Sequence, Tuple
+
+import numpy as np
+import pygame
+import pymunk
+from pymunk.space_debug_draw_options import SpaceDebugColor
+from pymunk.vec2d import Vec2d
+
+positive_y_is_up: bool = False
+"""Make increasing values of y point upwards.
+
+When True::
+
+    y
+    ^
+    |      . (3, 3)
+    |
+    |   . (2, 2)
+    |
+    +------ > x
+
+When False::
+
+    +------ > x
+    |
+    |   . (2, 2)
+    |
+    |      . (3, 3)
+    v
+    y
+
+"""
+
+
+class DrawOptions(pymunk.SpaceDebugDrawOptions):
+    def __init__(self, surface: pygame.Surface) -> None:
+        """Draw a pymunk.Space on a pygame.Surface object.
+
+        Typical usage::
+
+        >>> import pymunk
+        >>> surface = pygame.Surface((10,10))
+        >>> space = pymunk.Space()
+        >>> options = pymunk.pygame_util.DrawOptions(surface)
+        >>> space.debug_draw(options)
+
+        You can control the color of a shape by setting shape.color to the color
+        you want it drawn in::
+
+        >>> c = pymunk.Circle(None, 10)
+        >>> c.color = pygame.Color("pink")
+
+        See pygame_util.demo.py for a full example
+
+        Since pygame uses a coordinate system where y points down (in contrast
+        to many other cases), you either have to make the physics simulation
+        with Pymunk also behave in that way, or flip everything when you draw.
+
+        The easiest is probably to just make the simulation behave the same
+        way as Pygame does. In that way all coordinates used are in the same
+        orientation and easy to reason about::
+
+        >>> space = pymunk.Space()
+        >>> space.gravity = (0, -1000)
+        >>> body = pymunk.Body()
+        >>> body.position = (0, 0) # will be positioned in the top left corner
+        >>> space.debug_draw(options)
+
+        To flip the drawing its possible to set the module property
+        :py:data:`positive_y_is_up` to True. Then the pygame drawing will flip
+        the simulation upside down before drawing::
+
+        >>> positive_y_is_up = True
+        >>> body = pymunk.Body()
+        >>> body.position = (0, 0)
+        >>> # Body will be position in bottom left corner
+
+        :Parameters:
+                surface : pygame.Surface
+                    Surface that the objects will be drawn on
+        """
+        self.surface = surface
+        super().__init__()
+
+    def draw_circle(
+        self,
+        pos: Vec2d,
+        angle: float,
+        radius: float,
+        outline_color: SpaceDebugColor,
+        fill_color: SpaceDebugColor,
+    ) -> None:
+        p = to_pygame(pos, self.surface)
+
+        pygame.draw.circle(self.surface, fill_color.as_int(), p, round(radius), 0)
+        pygame.draw.circle(self.surface, light_color(fill_color).as_int(), p, round(radius - 4), 0)
+
+        # circle_edge = pos + Vec2d(radius, 0).rotated(angle)
+        # p2 = to_pygame(circle_edge, self.surface)
+        # line_r = 2 if radius > 20 else 1
+        # pygame.draw.lines(self.surface, outline_color.as_int(), False, [p, p2], line_r)
+
+    def draw_segment(self, a: Vec2d, b: Vec2d, color: SpaceDebugColor) -> None:
+        p1 = to_pygame(a, self.surface)
+        p2 = to_pygame(b, self.surface)
+
+        pygame.draw.aalines(self.surface, color.as_int(), False, [p1, p2])
+
+    def draw_fat_segment(
+        self,
+        a: Tuple[float, float],
+        b: Tuple[float, float],
+        radius: float,
+        outline_color: SpaceDebugColor,
+        fill_color: SpaceDebugColor,
+    ) -> None:
+        p1 = to_pygame(a, self.surface)
+        p2 = to_pygame(b, self.surface)
+
+        r = round(max(1, radius * 2))
+        pygame.draw.lines(self.surface, fill_color.as_int(), False, [p1, p2], r)
+        if r > 2:
+            orthog = [abs(p2[1] - p1[1]), abs(p2[0] - p1[0])]
+            if orthog[0] == 0 and orthog[1] == 0:
+                return
+            scale = radius / (orthog[0] * orthog[0] + orthog[1] * orthog[1]) ** 0.5
+            orthog[0] = round(orthog[0] * scale)
+            orthog[1] = round(orthog[1] * scale)
+            points = [
+                (p1[0] - orthog[0], p1[1] - orthog[1]),
+                (p1[0] + orthog[0], p1[1] + orthog[1]),
+                (p2[0] + orthog[0], p2[1] + orthog[1]),
+                (p2[0] - orthog[0], p2[1] - orthog[1]),
+            ]
+            pygame.draw.polygon(self.surface, fill_color.as_int(), points)
+            pygame.draw.circle(
+                self.surface,
+                fill_color.as_int(),
+                (round(p1[0]), round(p1[1])),
+                round(radius),
+            )
+            pygame.draw.circle(
+                self.surface,
+                fill_color.as_int(),
+                (round(p2[0]), round(p2[1])),
+                round(radius),
+            )
+
+    def draw_polygon(
+        self,
+        verts: Sequence[Tuple[float, float]],
+        radius: float,
+        outline_color: SpaceDebugColor,
+        fill_color: SpaceDebugColor,
+    ) -> None:
+        ps = [to_pygame(v, self.surface) for v in verts]
+        ps += [ps[0]]
+
+        radius = 2
+        pygame.draw.polygon(self.surface, light_color(fill_color).as_int(), ps)
+
+        if radius > 0:
+            for i in range(len(verts)):
+                a = verts[i]
+                b = verts[(i + 1) % len(verts)]
+                self.draw_fat_segment(a, b, radius, fill_color, fill_color)
+
+    def draw_dot(self, size: float, pos: Tuple[float, float], color: SpaceDebugColor) -> None:
+        p = to_pygame(pos, self.surface)
+        pygame.draw.circle(self.surface, color.as_int(), p, round(size), 0)
+
+
+def get_mouse_pos(surface: pygame.Surface) -> Tuple[int, int]:
+    """Get position of the mouse pointer in pymunk coordinates."""
+    p = pygame.mouse.get_pos()
+    return from_pygame(p, surface)
+
+
+def to_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]:
+    """Convenience method to convert pymunk coordinates to pygame surface
+    local coordinates.
+
+    Note that in case positive_y_is_up is False, this function won't actually do
+    anything except converting the point to integers.
+    """
+    if positive_y_is_up:
+        return round(p[0]), surface.get_height() - round(p[1])
+    else:
+        return round(p[0]), round(p[1])
+
+
+def from_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]:
+    """Convenience method to convert pygame surface local coordinates to
+    pymunk coordinates
+    """
+    return to_pygame(p, surface)
+
+
+def light_color(color: SpaceDebugColor):
+    color = np.minimum(1.2 * np.float32([color.r, color.g, color.b, color.a]), np.float32([255]))
+    color = SpaceDebugColor(r=color[0], g=color[1], b=color[2], a=color[3])
+    return color
--- a/lerobot/common/envs/simxarm.py
+++ b/lerobot/common/envs/simxarm.py
@@ -1,6 +1,8 @@
 import importlib
+from collections import deque
 from typing import Optional

+import einops
 import numpy as np
 import torch
 from tensordict import TensorDict
@@ -10,9 +12,9 @@ from torchrl.data.tensor_specs import (
    DiscreteTensorSpec,
    UnboundedContinuousTensorSpec,
 )
-from torchrl.envs import EnvBase
 from torchrl.envs.libs.gym import _gym_to_torchrl_spec_transform

+from lerobot.common.envs.abstract import AbstractEnv
 from lerobot.common.utils import set_seed

 MAX_NUM_ACTIONS = 4
@@ -21,7 +23,7 @@ _has_gym = importlib.util.find_spec("gym") is not None
 _has_simxarm = importlib.util.find_spec("simxarm") is not None and _has_gym


-class SimxarmEnv(EnvBase):
+class SimxarmEnv(AbstractEnv):
    def __init__(
        self,
        task,
@@ -31,19 +33,22 @@ class SimxarmEnv(EnvBase):
        image_size=None,
        seed=1337,
        device="cpu",
+        num_prev_obs=0,
+        num_prev_action=0,
    ):
-        super().__init__(device=device, batch_size=[])
-        self.task = task
-        self.frame_skip = frame_skip
-        self.from_pixels = from_pixels
-        self.pixels_only = pixels_only
-        self.image_size = image_size
-
-        if pixels_only:
-            assert from_pixels
-        if from_pixels:
-            assert image_size
+        super().__init__(
+            task=task,
+            frame_skip=frame_skip,
+            from_pixels=from_pixels,
+            pixels_only=pixels_only,
+            image_size=image_size,
+            seed=seed,
+            device=device,
+            num_prev_obs=num_prev_obs,
+            num_prev_action=num_prev_action,
+        )

+    def _make_env(self):
        if not _has_simxarm:
            raise ImportError("Cannot import simxarm.")
        if not _has_gym:
@@ -63,9 +68,6 @@ class SimxarmEnv(EnvBase):
        if "w" not in TASKS[self.task]["action_space"]:
            self._action_padding[-1] = 1.0

-        self._make_spec()
-        self.set_seed(seed)
-
    def render(self, mode="rgb_array", width=384, height=384):
        return self._env.render(mode, width=width, height=height)

@@ -90,15 +92,33 @@ class SimxarmEnv(EnvBase):
        if td is None or td.is_empty():
            raw_obs = self._env.reset()

+            obs = self._format_raw_obs(raw_obs)
+
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue = deque(
+                        [obs["image"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
+                if "state" in obs:
+                    self._prev_obs_state_queue = deque(
+                        [obs["state"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs
+
            td = TensorDict(
                {
-                    "observation": self._format_raw_obs(raw_obs),
+                    "observation": TensorDict(obs, batch_size=[]),
                    "done": torch.tensor([False], dtype=torch.bool),
                },
                batch_size=[],
            )
        else:
            raise NotImplementedError()
+
+        self.call_rendering_hooks()
        return td

    def _step(self, tensordict: TensorDict):
@@ -108,10 +128,32 @@ class SimxarmEnv(EnvBase):
        action = np.concatenate([action, self._action_padding])
        # TODO(rcadene): add info["is_success"] and info["success"] ?
        sum_reward = 0
-        for _ in range(self.frame_skip):
-            raw_obs, reward, done, info = self._env.step(action)
+
+        if action.ndim == 1:
+            action = einops.repeat(action, "c -> t c", t=self.frame_skip)
+        else:
+            if self.frame_skip > 1:
+                raise NotImplementedError()
+
+        num_action_steps = action.shape[0]
+        for i in range(num_action_steps):
+            raw_obs, reward, done, info = self._env.step(action[i])
            sum_reward += reward

+            obs = self._format_raw_obs(raw_obs)
+
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue.append(obs["image"])
+                    stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
+                if "state" in obs:
+                    self._prev_obs_state_queue.append(obs["state"])
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs
+
+            self.call_rendering_hooks()
+
        td = TensorDict(
            {
                "observation": self._format_raw_obs(raw_obs),
@@ -126,23 +168,36 @@ class SimxarmEnv(EnvBase):
    def _make_spec(self):
        obs = {}
        if self.from_pixels:
+            image_shape = (3, self.image_size, self.image_size)
+            if self.num_prev_obs > 0:
+                image_shape = (self.num_prev_obs + 1, *image_shape)
+
            obs["image"] = BoundedTensorSpec(
                low=0,
                high=255,
-                shape=(3, self.image_size, self.image_size),
+                shape=image_shape,
                dtype=torch.uint8,
                device=self.device,
            )
            if not self.pixels_only:
+                state_shape = (len(self._env.robot_state),)
+                if self.num_prev_obs > 0:
+                    state_shape = (self.num_prev_obs + 1, *state_shape)
+
                obs["state"] = UnboundedContinuousTensorSpec(
-                    shape=(len(self._env.robot_state),),
+                    shape=state_shape,
                    dtype=torch.float32,
                    device=self.device,
                )
        else:
            # TODO(rcadene): add observation_space achieved_goal and desired_goal?
+            state_shape = self._env.observation_space["observation"].shape
+            if self.num_prev_obs > 0:
+                state_shape = (self.num_prev_obs + 1, *state_shape)
+
            obs["state"] = UnboundedContinuousTensorSpec(
-                shape=self._env.observation_space["observation"].shape,
+                # TODO:
+                shape=state_shape,
                dtype=torch.float32,
                device=self.device,
            )
--- a/lerobot/common/envs/transforms.py
+++ b/lerobot/common/envs/transforms.py
@@ -1,5 +1,6 @@
 from typing import Sequence

+import torch
 from tensordict import TensorDictBase
 from tensordict.nn import dispatch
 from tensordict.utils import NestedKey
@@ -7,19 +8,45 @@ from torchrl.envs.transforms import ObservationTransform, Transform


 class Prod(ObservationTransform):
+    invertible = True
+
    def __init__(self, in_keys: Sequence[NestedKey], prod: float):
        super().__init__()
        self.in_keys = in_keys
        self.prod = prod
+        self.original_dtypes = {}
+
+    def _reset(self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase) -> TensorDictBase:
+        # _reset is called once when the environment reset to normalize the first observation
+        tensordict_reset = self._call(tensordict_reset)
+        return tensordict_reset
+
+    @dispatch(source="in_keys", dest="out_keys")
+    def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
+        return self._call(tensordict)

    def _call(self, td):
        for key in self.in_keys:
-            td[key] *= self.prod
+            if td.get(key, None) is None:
+                continue
+            self.original_dtypes[key] = td[key].dtype
+            td[key] = td[key].type(torch.float32) * self.prod
+        return td
+
+    def _inv_call(self, td: TensorDictBase) -> TensorDictBase:
+        for key in self.in_keys:
+            if td.get(key, None) is None:
+                continue
+            td[key] = (td[key] / self.prod).type(self.original_dtypes[key])
        return td

    def transform_observation_spec(self, obs_spec):
        for key in self.in_keys:
-            obs_spec[key].space.high *= self.prod
+            if obs_spec.get(key, None) is None:
+                continue
+            obs_spec[key].space.high = obs_spec[key].space.high.type(torch.float32) * self.prod
+            obs_spec[key].space.low = obs_spec[key].space.low.type(torch.float32) * self.prod
+            obs_spec[key].dtype = torch.float32
        return obs_spec


--- a/lerobot/common/logger.py
+++ b/lerobot/common/logger.py
@@ -38,7 +38,7 @@ class Logger:
        project = cfg.get("wandb", {}).get("project")
        entity = cfg.get("wandb", {}).get("entity")
        enable_wandb = cfg.get("wandb", {}).get("enable", False)
-        run_offline = not enable_wandb or not project or not entity
+        run_offline = not enable_wandb or not project
        if run_offline:
            logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
            self._wandb = None
@@ -63,6 +63,7 @@ class Logger:
                resume=None,
            )
            print(colored("Logs will be synced with wandb.", "blue", attrs=["bold"]))
+            logging.info(f"Track this run --> {colored(wandb.run.get_url(), 'yellow', attrs=['bold'])}")
            self._wandb = wandb

    def save_model(self, policy, identifier):
--- a/lerobot/common/policies/init.py
+++ b/lerobot/common/policies/init.py
--- a/lerobot/common/policies/act/backbone.py
+++ b/lerobot/common/policies/act/backbone.py
@@ -0,0 +1,115 @@
+from typing import List
+
+import torch
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+
+from .position_encoding import build_position_encoding
+from .utils import NestedTensor, is_main_process
+
+
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt,
+    without which any other policy_models than torchvision.policy_models.resnet[18,34,50,101]
+    produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        eps = 1e-5
+        scale = w * (rv + eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+
+
+class BackboneBase(nn.Module):
+    def __init__(
+        self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool
+    ):
+        super().__init__()
+        # for name, parameter in backbone.named_parameters(): # only train later layers # TODO do we want this?
+        #     if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+        #         parameter.requires_grad_(False)
+        if return_interm_layers:
+            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
+        else:
+            return_layers = {"layer4": "0"}
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.num_channels = num_channels
+
+    def forward(self, tensor):
+        xs = self.body(tensor)
+        return xs
+        # out: Dict[str, NestedTensor] = {}
+        # for name, x in xs.items():
+        #     m = tensor_list.mask
+        #     assert m is not None
+        #     mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
+        #     out[name] = NestedTensor(x, mask)
+        # return out
+
+
+class Backbone(BackboneBase):
+    """ResNet backbone with frozen BatchNorm."""
+
+    def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, dilation: bool):
+        backbone = getattr(torchvision.models, name)(
+            replace_stride_with_dilation=[False, False, dilation],
+            pretrained=is_main_process(),
+            norm_layer=FrozenBatchNorm2d,
+        )  # pretrained # TODO do we want frozen batch_norm??
+        num_channels = 512 if name in ("resnet18", "resnet34") else 2048
+        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)
+
+
+class Joiner(nn.Sequential):
+    def __init__(self, backbone, position_embedding):
+        super().__init__(backbone, position_embedding)
+
+    def forward(self, tensor_list: NestedTensor):
+        xs = self[0](tensor_list)
+        out: List[NestedTensor] = []
+        pos = []
+        for _, x in xs.items():
+            out.append(x)
+            # position encoding
+            pos.append(self[1](x).to(x.dtype))
+
+        return out, pos
+
+
+def build_backbone(args):
+    position_embedding = build_position_encoding(args)
+    train_backbone = args.lr_backbone > 0
+    return_interm_layers = args.masks
+    backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
+    model = Joiner(backbone, position_embedding)
+    model.num_channels = backbone.num_channels
+    return model
--- a/lerobot/common/policies/act/detr_vae.py
+++ b/lerobot/common/policies/act/detr_vae.py
@@ -0,0 +1,212 @@
+import numpy as np
+import torch
+from torch import nn
+from torch.autograd import Variable
+
+from .backbone import build_backbone
+from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer
+
+
+def reparametrize(mu, logvar):
+    std = logvar.div(2).exp()
+    eps = Variable(std.data.new(std.size()).normal_())
+    return mu + std * eps
+
+
+def get_sinusoid_encoding_table(n_position, d_hid):
+    def get_position_angle_vec(position):
+        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+
+    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+
+    return torch.FloatTensor(sinusoid_table).unsqueeze(0)
+
+
+class DETRVAE(nn.Module):
+    """This is the DETR module that performs object detection"""
+
+    def __init__(
+        self, backbones, transformer, encoder, state_dim, action_dim, num_queries, camera_names, vae
+    ):
+        """Initializes the model.
+        Parameters:
+            backbones: torch module of the backbone to be used. See backbone.py
+            transformer: torch module of the transformer architecture. See transformer.py
+            state_dim: robot state dimension of the environment
+            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
+                         DETR can detect in a single image. For COCO, we recommend 100 queries.
+            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
+        """
+        super().__init__()
+        self.num_queries = num_queries
+        self.camera_names = camera_names
+        self.transformer = transformer
+        self.encoder = encoder
+        self.vae = vae
+        hidden_dim = transformer.d_model
+        self.action_head = nn.Linear(hidden_dim, action_dim)
+        self.is_pad_head = nn.Linear(hidden_dim, 1)
+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
+        if backbones is not None:
+            self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1)
+            self.backbones = nn.ModuleList(backbones)
+            self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim)
+        else:
+            # input_dim = 14 + 7 # robot_state + env_state
+            self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim)
+            # TODO(rcadene): understand what is env_state, and why it needs to be 7
+            self.input_proj_env_state = nn.Linear(state_dim // 2, hidden_dim)
+            self.pos = torch.nn.Embedding(2, hidden_dim)
+            self.backbones = None
+
+        # encoder extra parameters
+        self.latent_dim = 32  # final size of latent z # TODO tune
+        self.cls_embed = nn.Embedding(1, hidden_dim)  # extra cls token embedding
+        self.encoder_action_proj = nn.Linear(14, hidden_dim)  # project action to embedding
+        self.encoder_joint_proj = nn.Linear(14, hidden_dim)  # project qpos to embedding
+        self.latent_proj = nn.Linear(
+            hidden_dim, self.latent_dim * 2
+        )  # project hidden state to latent std, var
+        self.register_buffer(
+            "pos_table", get_sinusoid_encoding_table(1 + 1 + num_queries, hidden_dim)
+        )  # [CLS], qpos, a_seq
+
+        # decoder extra parameters
+        self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim)  # project latent sample to embedding
+        self.additional_pos_embed = nn.Embedding(
+            2, hidden_dim
+        )  # learned position embedding for proprio and latent
+
+    def forward(self, qpos, image, env_state, actions=None, is_pad=None):
+        """
+        qpos: batch, qpos_dim
+        image: batch, num_cam, channel, height, width
+        env_state: None
+        actions: batch, seq, action_dim
+        """
+        is_training = actions is not None  # train or val
+        bs, _ = qpos.shape
+        ### Obtain latent z from action sequence
+        if self.vae and is_training:
+            # project action sequence to embedding dim, and concat with a CLS token
+            action_embed = self.encoder_action_proj(actions)  # (bs, seq, hidden_dim)
+            qpos_embed = self.encoder_joint_proj(qpos)  # (bs, hidden_dim)
+            qpos_embed = torch.unsqueeze(qpos_embed, axis=1)  # (bs, 1, hidden_dim)
+            cls_embed = self.cls_embed.weight  # (1, hidden_dim)
+            cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1)  # (bs, 1, hidden_dim)
+            encoder_input = torch.cat(
+                [cls_embed, qpos_embed, action_embed], axis=1
+            )  # (bs, seq+1, hidden_dim)
+            encoder_input = encoder_input.permute(1, 0, 2)  # (seq+1, bs, hidden_dim)
+            # do not mask cls token
+            # cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device)  # False: not a padding
+            # is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1)  # (bs, seq+1)
+            # obtain position embedding
+            pos_embed = self.pos_table.clone().detach()
+            pos_embed = pos_embed.permute(1, 0, 2)  # (seq+1, 1, hidden_dim)
+            # query model
+            encoder_output = self.encoder(encoder_input, pos=pos_embed)  # , src_key_padding_mask=is_pad)
+            encoder_output = encoder_output[0]  # take cls output only
+            latent_info = self.latent_proj(encoder_output)
+            mu = latent_info[:, : self.latent_dim]
+            logvar = latent_info[:, self.latent_dim :]
+            latent_sample = reparametrize(mu, logvar)
+            latent_input = self.latent_out_proj(latent_sample)
+        else:
+            mu = logvar = None
+            latent_sample = torch.zeros([bs, self.latent_dim], dtype=torch.float32).to(qpos.device)
+            latent_input = self.latent_out_proj(latent_sample)
+
+        if self.backbones is not None:
+            # Image observation features and position embeddings
+            all_cam_features = []
+            all_cam_pos = []
+            for cam_id, _ in enumerate(self.camera_names):
+                features, pos = self.backbones[0](image[:, cam_id])  # HARDCODED
+                features = features[0]  # take the last layer feature
+                pos = pos[0]
+                all_cam_features.append(self.input_proj(features))
+                all_cam_pos.append(pos)
+            # proprioception features
+            proprio_input = self.input_proj_robot_state(qpos)
+            # fold camera dimension into width dimension
+            src = torch.cat(all_cam_features, axis=3)
+            pos = torch.cat(all_cam_pos, axis=3)
+            hs = self.transformer(
+                src,
+                None,
+                self.query_embed.weight,
+                pos,
+                latent_input,
+                proprio_input,
+                self.additional_pos_embed.weight,
+            )[0]
+        else:
+            qpos = self.input_proj_robot_state(qpos)
+            env_state = self.input_proj_env_state(env_state)
+            transformer_input = torch.cat([qpos, env_state], axis=1)  # seq length = 2
+            hs = self.transformer(transformer_input, None, self.query_embed.weight, self.pos.weight)[0]
+        a_hat = self.action_head(hs)
+        is_pad_hat = self.is_pad_head(hs)
+        return a_hat, is_pad_hat, [mu, logvar]
+
+
+def mlp(input_dim, hidden_dim, output_dim, hidden_depth):
+    if hidden_depth == 0:
+        mods = [nn.Linear(input_dim, output_dim)]
+    else:
+        mods = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)]
+        for _ in range(hidden_depth - 1):
+            mods += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)]
+        mods.append(nn.Linear(hidden_dim, output_dim))
+    trunk = nn.Sequential(*mods)
+    return trunk
+
+
+def build_encoder(args):
+    d_model = args.hidden_dim  # 256
+    dropout = args.dropout  # 0.1
+    nhead = args.nheads  # 8
+    dim_feedforward = args.dim_feedforward  # 2048
+    num_encoder_layers = args.enc_layers  # 4 # TODO shared with VAE decoder
+    normalize_before = args.pre_norm  # False
+    activation = "relu"
+
+    encoder_layer = TransformerEncoderLayer(
+        d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+    )
+    encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+    encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+
+    return encoder
+
+
+def build(args):
+    # From state
+    # backbone = None # from state for now, no need for conv nets
+    # From image
+    backbones = []
+    backbone = build_backbone(args)
+    backbones.append(backbone)
+
+    transformer = build_transformer(args)
+
+    encoder = build_encoder(args)
+
+    model = DETRVAE(
+        backbones,
+        transformer,
+        encoder,
+        state_dim=args.state_dim,
+        action_dim=args.action_dim,
+        num_queries=args.num_queries,
+        camera_names=args.camera_names,
+        vae=args.vae,
+    )
+
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print("number of parameters: {:.2f}M".format(n_parameters / 1e6))
+
+    return model
--- a/lerobot/common/policies/act/policy.py
+++ b/lerobot/common/policies/act/policy.py
@@ -0,0 +1,217 @@
+import logging
+import time
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F  # noqa: N812
+import torchvision.transforms as transforms
+
+from lerobot.common.policies.act.detr_vae import build
+
+
+def build_act_model_and_optimizer(cfg):
+    model = build(cfg)
+
+    param_dicts = [
+        {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]},
+        {
+            "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad],
+            "lr": cfg.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(param_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay)
+
+    return model, optimizer
+
+
+def kl_divergence(mu, logvar):
+    batch_size = mu.size(0)
+    assert batch_size != 0
+    if mu.data.ndimension() == 4:
+        mu = mu.view(mu.size(0), mu.size(1))
+    if logvar.data.ndimension() == 4:
+        logvar = logvar.view(logvar.size(0), logvar.size(1))
+
+    klds = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp())
+    total_kld = klds.sum(1).mean(0, True)
+    dimension_wise_kld = klds.mean(0)
+    mean_kld = klds.mean(1).mean(0, True)
+
+    return total_kld, dimension_wise_kld, mean_kld
+
+
+class ActionChunkingTransformerPolicy(nn.Module):
+    def __init__(self, cfg, device, n_action_steps=1):
+        super().__init__()
+        self.cfg = cfg
+        self.n_action_steps = n_action_steps
+        self.device = device
+        self.model, self.optimizer = build_act_model_and_optimizer(cfg)
+        self.kl_weight = self.cfg.kl_weight
+        logging.info(f"KL Weight {self.kl_weight}")
+        self.to(self.device)
+
+    def update(self, replay_buffer, step):
+        del step
+
+        start_time = time.time()
+
+        self.train()
+
+        num_slices = self.cfg.batch_size
+        batch_size = self.cfg.horizon * num_slices
+
+        assert batch_size % self.cfg.horizon == 0
+        assert batch_size % num_slices == 0
+
+        def process_batch(batch, horizon, num_slices):
+            # trajectory t = 64, horizon h = 16
+            # (t h) ... -> t h ...
+            batch = batch.reshape(num_slices, horizon)
+
+            image = batch["observation", "image", "top"]
+            image = image[:, 0]  # first observation t=0
+            # batch, num_cam, channel, height, width
+            image = image.unsqueeze(1)
+            assert image.ndim == 5
+            image = image.float()
+
+            state = batch["observation", "state"]
+            state = state[:, 0]  # first observation t=0
+            # batch, qpos_dim
+            assert state.ndim == 2
+
+            action = batch["action"]
+            # batch, seq, action_dim
+            assert action.ndim == 3
+            assert action.shape[1] == horizon
+
+            if self.cfg.n_obs_steps > 1:
+                raise NotImplementedError()
+                # # keep first n observations of the slice corresponding to t=[-1,0]
+                # image = image[:, : self.cfg.n_obs_steps]
+                # state = state[:, : self.cfg.n_obs_steps]
+
+            out = {
+                "obs": {
+                    "image": image.to(self.device, non_blocking=True),
+                    "agent_pos": state.to(self.device, non_blocking=True),
+                },
+                "action": action.to(self.device, non_blocking=True),
+            }
+            return out
+
+        batch = replay_buffer.sample(batch_size)
+        batch = process_batch(batch, self.cfg.horizon, num_slices)
+
+        data_s = time.time() - start_time
+
+        loss = self.compute_loss(batch)
+        loss.backward()
+
+        grad_norm = torch.nn.utils.clip_grad_norm_(
+            self.model.parameters(),
+            self.cfg.grad_clip_norm,
+            error_if_nonfinite=False,
+        )
+
+        self.optimizer.step()
+        self.optimizer.zero_grad()
+        # self.lr_scheduler.step()
+
+        info = {
+            "loss": loss.item(),
+            "grad_norm": float(grad_norm),
+            # "lr": self.lr_scheduler.get_last_lr()[0],
+            "lr": self.cfg.lr,
+            "data_s": data_s,
+            "update_s": time.time() - start_time,
+        }
+
+        return info
+
+    def save(self, fp):
+        torch.save(self.state_dict(), fp)
+
+    def load(self, fp):
+        d = torch.load(fp)
+        self.load_state_dict(d)
+
+    def compute_loss(self, batch):
+        loss_dict = self._forward(
+            qpos=batch["obs"]["agent_pos"],
+            image=batch["obs"]["image"],
+            actions=batch["action"],
+        )
+        loss = loss_dict["loss"]
+        return loss
+
+    @torch.no_grad()
+    def forward(self, observation, step_count):
+        # TODO(rcadene): remove unused step_count
+        del step_count
+
+        self.eval()
+
+        # TODO(rcadene): remove unsqueeze hack to add bsize=1
+        observation["image", "top"] = observation["image", "top"].unsqueeze(0)
+        # observation["state"] = observation["state"].unsqueeze(0)
+
+        # TODO(rcadene): remove hack
+        # add 1 camera dimension
+        observation["image", "top"] = observation["image", "top"].unsqueeze(1)
+
+        obs_dict = {
+            "image": observation["image", "top"],
+            "agent_pos": observation["state"],
+        }
+        action = self._forward(qpos=obs_dict["agent_pos"], image=obs_dict["image"])
+
+        if self.cfg.temporal_agg:
+            # TODO(rcadene): implement temporal aggregation
+            raise NotImplementedError()
+            # all_time_actions[[t], t:t+num_queries] = action
+            # actions_for_curr_step = all_time_actions[:, t]
+            # actions_populated = torch.all(actions_for_curr_step != 0, axis=1)
+            # actions_for_curr_step = actions_for_curr_step[actions_populated]
+            # k = 0.01
+            # exp_weights = np.exp(-k * np.arange(len(actions_for_curr_step)))
+            # exp_weights = exp_weights / exp_weights.sum()
+            # exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1)
+            # raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True)
+
+        # remove bsize=1
+        action = action.squeeze(0)
+
+        # take first predicted action or n first actions
+        action = action[0] if self.n_action_steps == 1 else action[: self.n_action_steps]
+        return action
+
+    def _forward(self, qpos, image, actions=None, is_pad=None):
+        env_state = None
+        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        image = normalize(image)
+
+        is_training = actions is not None
+        if is_training:  # training time
+            actions = actions[:, : self.model.num_queries]
+            if is_pad is not None:
+                is_pad = is_pad[:, : self.model.num_queries]
+
+            a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad)
+
+            all_l1 = F.l1_loss(actions, a_hat, reduction="none")
+            l1 = all_l1.mean() if is_pad is None else (all_l1 * ~is_pad.unsqueeze(-1)).mean()
+
+            loss_dict = {}
+            loss_dict["l1"] = l1
+            if self.cfg.vae:
+                total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar)
+                loss_dict["kl"] = total_kld[0]
+                loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.kl_weight
+            else:
+                loss_dict["loss"] = loss_dict["l1"]
+            return loss_dict
+        else:
+            action, _, (_, _) = self.model(qpos, image, env_state)  # no action, sample from prior
+            return action
--- a/lerobot/common/policies/act/position_encoding.py
+++ b/lerobot/common/policies/act/position_encoding.py
@@ -0,0 +1,101 @@
+"""
+Various positional encodings for the transformer.
+"""
+import math
+
+import torch
+from torch import nn
+
+from .utils import NestedTensor
+
+
+class PositionEmbeddingSine(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+
+    def forward(self, tensor):
+        x = tensor
+        # mask = tensor_list.mask
+        # assert mask is not None
+        # not_mask = ~mask
+
+        not_mask = torch.ones_like(x[0, [0]])
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+
+
+class PositionEmbeddingLearned(nn.Module):
+    """
+    Absolute pos embedding, learned.
+    """
+
+    def __init__(self, num_pos_feats=256):
+        super().__init__()
+        self.row_embed = nn.Embedding(50, num_pos_feats)
+        self.col_embed = nn.Embedding(50, num_pos_feats)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.uniform_(self.row_embed.weight)
+        nn.init.uniform_(self.col_embed.weight)
+
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        h, w = x.shape[-2:]
+        i = torch.arange(w, device=x.device)
+        j = torch.arange(h, device=x.device)
+        x_emb = self.col_embed(i)
+        y_emb = self.row_embed(j)
+        pos = (
+            torch.cat(
+                [
+                    x_emb.unsqueeze(0).repeat(h, 1, 1),
+                    y_emb.unsqueeze(1).repeat(1, w, 1),
+                ],
+                dim=-1,
+            )
+            .permute(2, 0, 1)
+            .unsqueeze(0)
+            .repeat(x.shape[0], 1, 1, 1)
+        )
+        return pos
+
+
+def build_position_encoding(args):
+    n_steps = args.hidden_dim // 2
+    if args.position_embedding in ("v2", "sine"):
+        # TODO find a better way of exposing other arguments
+        position_embedding = PositionEmbeddingSine(n_steps, normalize=True)
+    elif args.position_embedding in ("v3", "learned"):
+        position_embedding = PositionEmbeddingLearned(n_steps)
+    else:
+        raise ValueError(f"not supported {args.position_embedding}")
+
+    return position_embedding
--- a/lerobot/common/policies/act/transformer.py
+++ b/lerobot/common/policies/act/transformer.py
@@ -0,0 +1,370 @@
+"""
+DETR Transformer class.
+
+Copy-paste from torch.nn.Transformer with modifications:
+    * positional encodings are passed in MHattention
+    * extra LN at the end of encoder is removed
+    * decoder returns a stack of activations from all decoding layers
+"""
+import copy
+from typing import Optional
+
+import torch
+import torch.nn.functional as F  # noqa: N812
+from torch import Tensor, nn
+
+
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        d_model=512,
+        nhead=8,
+        num_encoder_layers=6,
+        num_decoder_layers=6,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+        return_intermediate_dec=False,
+    ):
+        super().__init__()
+
+        encoder_layer = TransformerEncoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+
+        decoder_layer = TransformerDecoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
+        decoder_norm = nn.LayerNorm(d_model)
+        self.decoder = TransformerDecoder(
+            decoder_layer, num_decoder_layers, decoder_norm, return_intermediate=return_intermediate_dec
+        )
+
+        self._reset_parameters()
+
+        self.d_model = d_model
+        self.nhead = nhead
+
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+
+    def forward(
+        self,
+        src,
+        mask,
+        query_embed,
+        pos_embed,
+        latent_input=None,
+        proprio_input=None,
+        additional_pos_embed=None,
+    ):
+        # TODO flatten only when input has H and W
+        if len(src.shape) == 4:  # has H and W
+            # flatten NxCxHxW to HWxNxC
+            bs, c, h, w = src.shape
+            src = src.flatten(2).permute(2, 0, 1)
+            pos_embed = pos_embed.flatten(2).permute(2, 0, 1).repeat(1, bs, 1)
+            query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
+            # mask = mask.flatten(1)
+
+            additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1)  # seq, bs, dim
+            pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0)
+
+            addition_input = torch.stack([latent_input, proprio_input], axis=0)
+            src = torch.cat([addition_input, src], axis=0)
+        else:
+            assert len(src.shape) == 3
+            # flatten NxHWxC to HWxNxC
+            bs, hw, c = src.shape
+            src = src.permute(1, 0, 2)
+            pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1)
+            query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
+
+        tgt = torch.zeros_like(query_embed)
+        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
+        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed)
+        hs = hs.transpose(1, 2)
+        return hs
+
+
+class TransformerEncoder(nn.Module):
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(
+        self,
+        src,
+        mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        output = src
+
+        for layer in self.layers:
+            output = layer(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output
+
+
+class TransformerDecoder(nn.Module):
+    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
+
+    def forward(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        output = tgt
+
+        intermediate = []
+
+        for layer in self.layers:
+            output = layer(
+                output,
+                memory,
+                tgt_mask=tgt_mask,
+                memory_mask=memory_mask,
+                tgt_key_padding_mask=tgt_key_padding_mask,
+                memory_key_padding_mask=memory_key_padding_mask,
+                pos=pos,
+                query_pos=query_pos,
+            )
+            if self.return_intermediate:
+                intermediate.append(self.norm(output))
+
+        if self.norm is not None:
+            output = self.norm(output)
+            if self.return_intermediate:
+                intermediate.pop()
+                intermediate.append(output)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate)
+
+        return output.unsqueeze(0)
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False
+    ):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+
+    def forward_post(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        q = k = self.with_pos_embed(src, pos)
+        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
+
+    def forward_pre(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        src2 = self.norm1(src)
+        q = k = self.with_pos_embed(src2, pos)
+        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        src2 = self.norm2(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
+        src = src + self.dropout2(src2)
+        return src
+
+    def forward(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        if self.normalize_before:
+            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
+        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
+
+
+class TransformerDecoderLayer(nn.Module):
+    def __init__(
+        self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False
+    ):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.dropout3 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+
+    def forward_post(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        q = k = self.with_pos_embed(tgt, query_pos)
+        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+        tgt2 = self.multihead_attn(
+            query=self.with_pos_embed(tgt, query_pos),
+            key=self.with_pos_embed(memory, pos),
+            value=memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )[0]
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout3(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+
+    def forward_pre(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        tgt2 = self.norm1(tgt)
+        q = k = self.with_pos_embed(tgt2, query_pos)
+        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
+        tgt = tgt + self.dropout1(tgt2)
+        tgt2 = self.norm2(tgt)
+        tgt2 = self.multihead_attn(
+            query=self.with_pos_embed(tgt2, query_pos),
+            key=self.with_pos_embed(memory, pos),
+            value=memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )[0]
+        tgt = tgt + self.dropout2(tgt2)
+        tgt2 = self.norm3(tgt)
+        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
+        tgt = tgt + self.dropout3(tgt2)
+        return tgt
+
+    def forward(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        if self.normalize_before:
+            return self.forward_pre(
+                tgt,
+                memory,
+                tgt_mask,
+                memory_mask,
+                tgt_key_padding_mask,
+                memory_key_padding_mask,
+                pos,
+                query_pos,
+            )
+        return self.forward_post(
+            tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos
+        )
+
+
+def _get_clones(module, n):
+    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
+
+
+def build_transformer(args):
+    return Transformer(
+        d_model=args.hidden_dim,
+        dropout=args.dropout,
+        nhead=args.nheads,
+        dim_feedforward=args.dim_feedforward,
+        num_encoder_layers=args.enc_layers,
+        num_decoder_layers=args.dec_layers,
+        normalize_before=args.pre_norm,
+        return_intermediate_dec=True,
+    )
+
+
+def _get_activation_fn(activation):
+    """Return an activation function given a string"""
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
--- a/lerobot/common/policies/act/utils.py
+++ b/lerobot/common/policies/act/utils.py
@@ -0,0 +1,477 @@
+"""
+Misc functions, including distributed helpers.
+
+Mostly copy-paste from torchvision references.
+"""
+import datetime
+import os
+import pickle
+import subprocess
+import time
+from collections import defaultdict, deque
+from typing import List, Optional
+
+import torch
+import torch.distributed as dist
+
+# needed due to empty tensor bug in pytorch and torchvision 0.5
+import torchvision
+from packaging import version
+from torch import Tensor
+
+if version.parse(torchvision.__version__) < version.parse("0.7"):
+    from torchvision.ops import _new_empty_tensor
+    from torchvision.ops.misc import _output_size
+
+
+class SmoothedValue:
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+
+    def update(self, value, n=1):
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_avail_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+        dist.barrier()
+        dist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+
+    @property
+    def median(self):
+        d = torch.tensor(list(self.deque))
+        return d.median().item()
+
+    @property
+    def avg(self):
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+
+    @property
+    def global_avg(self):
+        return self.total / self.count
+
+    @property
+    def max(self):
+        return max(self.deque)
+
+    @property
+    def value(self):
+        return self.deque[-1]
+
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
+        )
+
+
+def all_gather(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors)
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    world_size = get_world_size()
+    if world_size == 1:
+        return [data]
+
+    # serialized to a Tensor
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")
+
+    # obtain Tensor size of each rank
+    local_size = torch.tensor([tensor.numel()], device="cuda")
+    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
+    dist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
+    if local_size != max_size:
+        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
+        tensor = torch.cat((tensor, padding), dim=0)
+    dist.all_gather(tensor_list, tensor)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list, strict=False):
+        buffer = tensor.cpu().numpy().tobytes()[:size]
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    Args:
+        input_dict (dict): all the values will be reduced
+        average (bool): whether to do average or sum
+    Reduce the values in the dictionary from all processes so that all processes
+    have the averaged results. Returns a dict with the same fields as
+    input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        names = []
+        values = []
+        # sort the keys so that they are consistent across processes
+        for k in sorted(input_dict.keys()):
+            names.append(k)
+            values.append(input_dict[k])
+        values = torch.stack(values, dim=0)
+        dist.all_reduce(values)
+        if average:
+            values /= world_size
+        reduced_dict = {k: v for k, v in zip(names, values, strict=False)}  # noqa: C416
+    return reduced_dict
+
+
+class MetricLogger:
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
+
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append("{}: {}".format(name, str(meter)))
+        return self.delimiter.join(loss_str)
+
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+
+    def log_every(self, iterable, print_freq, header=None):
+        if not header:
+            header = ""
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt="{avg:.4f}")
+        data_time = SmoothedValue(fmt="{avg:.4f}")
+        space_fmt = ":" + str(len(str(len(iterable)))) + "d"
+        if torch.cuda.is_available():
+            log_msg = self.delimiter.join(
+                [
+                    header,
+                    "[{0" + space_fmt + "}/{1}]",
+                    "eta: {eta}",
+                    "{meters}",
+                    "time: {time}",
+                    "data: {data}",
+                    "max mem: {memory:.0f}",
+                ]
+            )
+        else:
+            log_msg = self.delimiter.join(
+                [
+                    header,
+                    "[{0" + space_fmt + "}/{1}]",
+                    "eta: {eta}",
+                    "{meters}",
+                    "time: {time}",
+                    "data: {data}",
+                ]
+            )
+        mega_b = 1024.0 * 1024.0
+        for i, obj in enumerate(iterable):
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                            memory=torch.cuda.max_memory_allocated() / mega_b,
+                        )
+                    )
+                else:
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                        )
+                    )
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable)))
+
+
+def get_sha():
+    cwd = os.path.dirname(os.path.abspath(__file__))
+
+    def _run(command):
+        return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()
+
+    sha = "N/A"
+    diff = "clean"
+    branch = "N/A"
+    try:
+        sha = _run(["git", "rev-parse", "HEAD"])
+        subprocess.check_output(["git", "diff"], cwd=cwd)
+        diff = _run(["git", "diff-index", "HEAD"])
+        diff = "has uncommited changes" if diff else "clean"
+        branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
+    except Exception:
+        pass
+    message = f"sha: {sha}, status: {diff}, branch: {branch}"
+    return message
+
+
+def collate_fn(batch):
+    batch = list(zip(*batch, strict=False))
+    batch[0] = nested_tensor_from_tensor_list(batch[0])
+    return tuple(batch)
+
+
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    maxes = the_list[0]
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+
+class NestedTensor:
+    def __init__(self, tensors, mask: Optional[Tensor]):
+        self.tensors = tensors
+        self.mask = mask
+
+    def to(self, device):
+        # type: (Device) -> NestedTensor # noqa
+        cast_tensor = self.tensors.to(device)
+        mask = self.mask
+        if mask is not None:
+            assert mask is not None
+            cast_mask = mask.to(device)
+        else:
+            cast_mask = None
+        return NestedTensor(cast_tensor, cast_mask)
+
+    def decompose(self):
+        return self.tensors, self.mask
+
+    def __repr__(self):
+        return str(self.tensors)
+
+
+def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
+    # TODO make this more general
+    if tensor_list[0].ndim == 3:
+        if torchvision._is_tracing():
+            # nested_tensor_from_tensor_list() does not export well to ONNX
+            # call _onnx_nested_tensor_from_tensor_list() instead
+            return _onnx_nested_tensor_from_tensor_list(tensor_list)
+
+        # TODO make it support different-sized images
+        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
+        batch_shape = [len(tensor_list)] + max_size
+        b, c, h, w = batch_shape
+        dtype = tensor_list[0].dtype
+        device = tensor_list[0].device
+        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
+        for img, pad_img, m in zip(tensor_list, tensor, mask, strict=False):
+            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+            m[: img.shape[1], : img.shape[2]] = False
+    else:
+        raise ValueError("not supported")
+    return NestedTensor(tensor, mask)
+
+
+# _onnx_nested_tensor_from_tensor_list() is an implementation of
+# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
+@torch.jit.unused
+def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
+    max_size = []
+    for i in range(tensor_list[0].dim()):
+        max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(
+            torch.int64
+        )
+        max_size.append(max_size_i)
+    max_size = tuple(max_size)
+
+    # work around for
+    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+    # m[: img.shape[1], :img.shape[2]] = False
+    # which is not yet supported in onnx
+    padded_imgs = []
+    padded_masks = []
+    for img in tensor_list:
+        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape), strict=False)]
+        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
+        padded_imgs.append(padded_img)
+
+        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
+        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
+        padded_masks.append(padded_mask.to(torch.bool))
+
+    tensor = torch.stack(padded_imgs)
+    mask = torch.stack(padded_masks)
+
+    return NestedTensor(tensor, mask=mask)
+
+
+def setup_for_distributed(is_master):
+    """
+    This function disables printing when not in master process
+    """
+    import builtins as __builtin__
+
+    builtin_print = __builtin__.print
+
+    def print(*args, **kwargs):
+        force = kwargs.pop("force", False)
+        if is_master or force:
+            builtin_print(*args, **kwargs)
+
+    __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+
+
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def is_main_process():
+    return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):
+    if is_main_process():
+        torch.save(*args, **kwargs)
+
+
+def init_distributed_mode(args):
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ["WORLD_SIZE"])
+        args.gpu = int(os.environ["LOCAL_RANK"])
+    elif "SLURM_PROCID" in os.environ:
+        args.rank = int(os.environ["SLURM_PROCID"])
+        args.gpu = args.rank % torch.cuda.device_count()
+    else:
+        print("Not using distributed mode")
+        args.distributed = False
+        return
+
+    args.distributed = True
+
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = "nccl"
+    print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)
+    torch.distributed.init_process_group(
+        backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
+    )
+    torch.distributed.barrier()
+    setup_for_distributed(args.rank == 0)
+
+
+@torch.no_grad()
+def accuracy(output, target, topk=(1,)):
+    """Computes the precision@k for the specified values of k"""
+    if target.numel() == 0:
+        return [torch.zeros([], device=output.device)]
+    maxk = max(topk)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        correct_k = correct[:k].view(-1).float().sum(0)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
+    """
+    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
+    This will eventually be supported natively by PyTorch, and this
+    class can go away.
+    """
+    if version.parse(torchvision.__version__) < version.parse("0.7"):
+        if input.numel() > 0:
+            return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners)
+
+        output_shape = _output_size(2, input, size, scale_factor)
+        output_shape = list(input.shape[:-2]) + list(output_shape)
+        return _new_empty_tensor(input, output_shape)
+    else:
+        return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
--- a/lerobot/common/policies/diffusion/diffusion_unet_image_policy.py
+++ b/lerobot/common/policies/diffusion/diffusion_unet_image_policy.py
@@ -5,11 +5,33 @@ import torch.nn.functional as F  # noqa: N812
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
 from einops import reduce

-from diffusion_policy.common.pytorch_util import dict_apply
-from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D
-from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
+from lerobot.common.policies.diffusion.model.conditional_unet1d import ConditionalUnet1D
+from lerobot.common.policies.diffusion.model.mask_generator import LowdimMaskGenerator
+from lerobot.common.policies.diffusion.model.module_attr_mixin import ModuleAttrMixin
+from lerobot.common.policies.diffusion.model.multi_image_obs_encoder import MultiImageObsEncoder
+from lerobot.common.policies.diffusion.model.normalizer import LinearNormalizer
+from lerobot.common.policies.diffusion.pytorch_utils import dict_apply
+
+
+class BaseImagePolicy(ModuleAttrMixin):
+    # init accepts keyword argument shape_meta, see config/task/*_image.yaml
+
+    def predict_action(self, obs_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """
+        obs_dict:
+            str: B,To,*
+        return: B,Ta,Da
+        """
+        raise NotImplementedError()
+
+    # reset state for stateful policies
+    def reset(self):
+        pass
+
+    # ========== training ===========
+    # no standard training interface except setting normalizer
+    def set_normalizer(self, normalizer: LinearNormalizer):
+        raise NotImplementedError()


 class DiffusionUnetImagePolicy(BaseImagePolicy):
--- a/lerobot/common/policies/diffusion/model/conditional_unet1d.py
+++ b/lerobot/common/policies/diffusion/model/conditional_unet1d.py
@@ -0,0 +1,286 @@
+import logging
+from typing import Union
+
+import einops
+import torch
+import torch.nn as nn
+from einops.layers.torch import Rearrange
+
+from lerobot.common.policies.diffusion.model.conv1d_components import Conv1dBlock, Downsample1d, Upsample1d
+from lerobot.common.policies.diffusion.model.positional_embedding import SinusoidalPosEmb
+
+logger = logging.getLogger(__name__)
+
+
+class ConditionalResidualBlock1D(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, cond_dim, kernel_size=3, n_groups=8, cond_predict_scale=False
+    ):
+        super().__init__()
+
+        self.blocks = nn.ModuleList(
+            [
+                Conv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups),
+                Conv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups),
+            ]
+        )
+
+        # FiLM modulation https://arxiv.org/abs/1709.07871
+        # predicts per-channel scale and bias
+        cond_channels = out_channels
+        if cond_predict_scale:
+            cond_channels = out_channels * 2
+        self.cond_predict_scale = cond_predict_scale
+        self.out_channels = out_channels
+        self.cond_encoder = nn.Sequential(
+            nn.Mish(),
+            nn.Linear(cond_dim, cond_channels),
+            Rearrange("batch t -> batch t 1"),
+        )
+
+        # make sure dimensions compatible
+        self.residual_conv = (
+            nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()
+        )
+
+    def forward(self, x, cond):
+        """
+        x : [ batch_size x in_channels x horizon ]
+        cond : [ batch_size x cond_dim]
+
+        returns:
+        out : [ batch_size x out_channels x horizon ]
+        """
+        out = self.blocks[0](x)
+        embed = self.cond_encoder(cond)
+        if self.cond_predict_scale:
+            embed = embed.reshape(embed.shape[0], 2, self.out_channels, 1)
+            scale = embed[:, 0, ...]
+            bias = embed[:, 1, ...]
+            out = scale * out + bias
+        else:
+            out = out + embed
+        out = self.blocks[1](out)
+        out = out + self.residual_conv(x)
+        return out
+
+
+class ConditionalUnet1D(nn.Module):
+    def __init__(
+        self,
+        input_dim,
+        local_cond_dim=None,
+        global_cond_dim=None,
+        diffusion_step_embed_dim=256,
+        down_dims=None,
+        kernel_size=3,
+        n_groups=8,
+        cond_predict_scale=False,
+    ):
+        super().__init__()
+        if down_dims is None:
+            down_dims = [256, 512, 1024]
+
+        all_dims = [input_dim] + list(down_dims)
+        start_dim = down_dims[0]
+
+        dsed = diffusion_step_embed_dim
+        diffusion_step_encoder = nn.Sequential(
+            SinusoidalPosEmb(dsed),
+            nn.Linear(dsed, dsed * 4),
+            nn.Mish(),
+            nn.Linear(dsed * 4, dsed),
+        )
+        cond_dim = dsed
+        if global_cond_dim is not None:
+            cond_dim += global_cond_dim
+
+        in_out = list(zip(all_dims[:-1], all_dims[1:], strict=False))
+
+        local_cond_encoder = None
+        if local_cond_dim is not None:
+            _, dim_out = in_out[0]
+            dim_in = local_cond_dim
+            local_cond_encoder = nn.ModuleList(
+                [
+                    # down encoder
+                    ConditionalResidualBlock1D(
+                        dim_in,
+                        dim_out,
+                        cond_dim=cond_dim,
+                        kernel_size=kernel_size,
+                        n_groups=n_groups,
+                        cond_predict_scale=cond_predict_scale,
+                    ),
+                    # up encoder
+                    ConditionalResidualBlock1D(
+                        dim_in,
+                        dim_out,
+                        cond_dim=cond_dim,
+                        kernel_size=kernel_size,
+                        n_groups=n_groups,
+                        cond_predict_scale=cond_predict_scale,
+                    ),
+                ]
+            )
+
+        mid_dim = all_dims[-1]
+        self.mid_modules = nn.ModuleList(
+            [
+                ConditionalResidualBlock1D(
+                    mid_dim,
+                    mid_dim,
+                    cond_dim=cond_dim,
+                    kernel_size=kernel_size,
+                    n_groups=n_groups,
+                    cond_predict_scale=cond_predict_scale,
+                ),
+                ConditionalResidualBlock1D(
+                    mid_dim,
+                    mid_dim,
+                    cond_dim=cond_dim,
+                    kernel_size=kernel_size,
+                    n_groups=n_groups,
+                    cond_predict_scale=cond_predict_scale,
+                ),
+            ]
+        )
+
+        down_modules = nn.ModuleList([])
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (len(in_out) - 1)
+            down_modules.append(
+                nn.ModuleList(
+                    [
+                        ConditionalResidualBlock1D(
+                            dim_in,
+                            dim_out,
+                            cond_dim=cond_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                        ),
+                        ConditionalResidualBlock1D(
+                            dim_out,
+                            dim_out,
+                            cond_dim=cond_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                        ),
+                        Downsample1d(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        up_modules = nn.ModuleList([])
+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+            is_last = ind >= (len(in_out) - 1)
+            up_modules.append(
+                nn.ModuleList(
+                    [
+                        ConditionalResidualBlock1D(
+                            dim_out * 2,
+                            dim_in,
+                            cond_dim=cond_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                        ),
+                        ConditionalResidualBlock1D(
+                            dim_in,
+                            dim_in,
+                            cond_dim=cond_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                        ),
+                        Upsample1d(dim_in) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        final_conv = nn.Sequential(
+            Conv1dBlock(start_dim, start_dim, kernel_size=kernel_size),
+            nn.Conv1d(start_dim, input_dim, 1),
+        )
+
+        self.diffusion_step_encoder = diffusion_step_encoder
+        self.local_cond_encoder = local_cond_encoder
+        self.up_modules = up_modules
+        self.down_modules = down_modules
+        self.final_conv = final_conv
+
+        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        local_cond=None,
+        global_cond=None,
+        **kwargs,
+    ):
+        """
+        x: (B,T,input_dim)
+        timestep: (B,) or int, diffusion step
+        local_cond: (B,T,local_cond_dim)
+        global_cond: (B,global_cond_dim)
+        output: (B,T,input_dim)
+        """
+        sample = einops.rearrange(sample, "b h t -> b t h")
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        global_feature = self.diffusion_step_encoder(timesteps)
+
+        if global_cond is not None:
+            global_feature = torch.cat([global_feature, global_cond], axis=-1)
+
+        # encode local features
+        h_local = []
+        if local_cond is not None:
+            local_cond = einops.rearrange(local_cond, "b h t -> b t h")
+            resnet, resnet2 = self.local_cond_encoder
+            x = resnet(local_cond, global_feature)
+            h_local.append(x)
+            x = resnet2(local_cond, global_feature)
+            h_local.append(x)
+
+        x = sample
+        h = []
+        for idx, (resnet, resnet2, downsample) in enumerate(self.down_modules):
+            x = resnet(x, global_feature)
+            if idx == 0 and len(h_local) > 0:
+                x = x + h_local[0]
+            x = resnet2(x, global_feature)
+            h.append(x)
+            x = downsample(x)
+
+        for mid_module in self.mid_modules:
+            x = mid_module(x, global_feature)
+
+        for idx, (resnet, resnet2, upsample) in enumerate(self.up_modules):
+            x = torch.cat((x, h.pop()), dim=1)
+            x = resnet(x, global_feature)
+            # The correct condition should be:
+            # if idx == (len(self.up_modules)-1) and len(h_local) > 0:
+            # However this change will break compatibility with published checkpoints.
+            # Therefore it is left as a comment.
+            if idx == len(self.up_modules) and len(h_local) > 0:
+                x = x + h_local[1]
+            x = resnet2(x, global_feature)
+            x = upsample(x)
+
+        x = self.final_conv(x)
+
+        x = einops.rearrange(x, "b t h -> b h t")
+        return x
--- a/lerobot/common/policies/diffusion/model/conv1d_components.py
+++ b/lerobot/common/policies/diffusion/model/conv1d_components.py
@@ -0,0 +1,47 @@
+import torch.nn as nn
+
+# from einops.layers.torch import Rearrange
+
+
+class Downsample1d(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.conv = nn.Conv1d(dim, dim, 3, 2, 1)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class Upsample1d(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.conv = nn.ConvTranspose1d(dim, dim, 4, 2, 1)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class Conv1dBlock(nn.Module):
+    """
+    Conv1d --> GroupNorm --> Mish
+    """
+
+    def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
+        super().__init__()
+
+        self.block = nn.Sequential(
+            nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2),
+            # Rearrange('batch channels horizon -> batch channels 1 horizon'),
+            nn.GroupNorm(n_groups, out_channels),
+            # Rearrange('batch channels 1 horizon -> batch channels horizon'),
+            nn.Mish(),
+        )
+
+    def forward(self, x):
+        return self.block(x)
+
+
+# def test():
+#     cb = Conv1dBlock(256, 128, kernel_size=3)
+#     x = torch.zeros((1,256,16))
+#     o = cb(x)
--- a/lerobot/common/policies/diffusion/model/crop_randomizer.py
+++ b/lerobot/common/policies/diffusion/model/crop_randomizer.py
@@ -0,0 +1,294 @@
+import torch
+import torch.nn as nn
+import torchvision.transforms.functional as ttf
+
+import lerobot.common.policies.diffusion.model.tensor_utils as tu
+
+
+class CropRandomizer(nn.Module):
+    """
+    Randomly sample crops at input, and then average across crop features at output.
+    """
+
+    def __init__(
+        self,
+        input_shape,
+        crop_height,
+        crop_width,
+        num_crops=1,
+        pos_enc=False,
+    ):
+        """
+        Args:
+            input_shape (tuple, list): shape of input (not including batch dimension)
+            crop_height (int): crop height
+            crop_width (int): crop width
+            num_crops (int): number of random crops to take
+            pos_enc (bool): if True, add 2 channels to the output to encode the spatial
+                location of the cropped pixels in the source image
+        """
+        super().__init__()
+
+        assert len(input_shape) == 3  # (C, H, W)
+        assert crop_height < input_shape[1]
+        assert crop_width < input_shape[2]
+
+        self.input_shape = input_shape
+        self.crop_height = crop_height
+        self.crop_width = crop_width
+        self.num_crops = num_crops
+        self.pos_enc = pos_enc
+
+    def output_shape_in(self, input_shape=None):
+        """
+        Function to compute output shape from inputs to this module. Corresponds to
+        the @forward_in operation, where raw inputs (usually observation modalities)
+        are passed in.
+
+        Args:
+            input_shape (iterable of int): shape of input. Does not include batch dimension.
+                Some modules may not need this argument, if their output does not depend
+                on the size of the input, or if they assume fixed size input.
+
+        Returns:
+            out_shape ([int]): list of integers corresponding to output shape
+        """
+
+        # outputs are shape (C, CH, CW), or maybe C + 2 if using position encoding, because
+        # the number of crops are reshaped into the batch dimension, increasing the batch
+        # size from B to B * N
+        out_c = self.input_shape[0] + 2 if self.pos_enc else self.input_shape[0]
+        return [out_c, self.crop_height, self.crop_width]
+
+    def output_shape_out(self, input_shape=None):
+        """
+        Function to compute output shape from inputs to this module. Corresponds to
+        the @forward_out operation, where processed inputs (usually encoded observation
+        modalities) are passed in.
+
+        Args:
+            input_shape (iterable of int): shape of input. Does not include batch dimension.
+                Some modules may not need this argument, if their output does not depend
+                on the size of the input, or if they assume fixed size input.
+
+        Returns:
+            out_shape ([int]): list of integers corresponding to output shape
+        """
+
+        # since the forward_out operation splits [B * N, ...] -> [B, N, ...]
+        # and then pools to result in [B, ...], only the batch dimension changes,
+        # and so the other dimensions retain their shape.
+        return list(input_shape)
+
+    def forward_in(self, inputs):
+        """
+        Samples N random crops for each input in the batch, and then reshapes
+        inputs to [B * N, ...].
+        """
+        assert len(inputs.shape) >= 3  # must have at least (C, H, W) dimensions
+        if self.training:
+            # generate random crops
+            out, _ = sample_random_image_crops(
+                images=inputs,
+                crop_height=self.crop_height,
+                crop_width=self.crop_width,
+                num_crops=self.num_crops,
+                pos_enc=self.pos_enc,
+            )
+            # [B, N, ...] -> [B * N, ...]
+            return tu.join_dimensions(out, 0, 1)
+        else:
+            # take center crop during eval
+            out = ttf.center_crop(img=inputs, output_size=(self.crop_height, self.crop_width))
+            if self.num_crops > 1:
+                B, C, H, W = out.shape  # noqa: N806
+                out = out.unsqueeze(1).expand(B, self.num_crops, C, H, W).reshape(-1, C, H, W)
+                # [B * N, ...]
+            return out
+
+    def forward_out(self, inputs):
+        """
+        Splits the outputs from shape [B * N, ...] -> [B, N, ...] and then average across N
+        to result in shape [B, ...] to make sure the network output is consistent with
+        what would have happened if there were no randomization.
+        """
+        if self.num_crops <= 1:
+            return inputs
+        else:
+            batch_size = inputs.shape[0] // self.num_crops
+            out = tu.reshape_dimensions(
+                inputs, begin_axis=0, end_axis=0, target_dims=(batch_size, self.num_crops)
+            )
+            return out.mean(dim=1)
+
+    def forward(self, inputs):
+        return self.forward_in(inputs)
+
+    def __repr__(self):
+        """Pretty print network."""
+        header = "{}".format(str(self.__class__.__name__))
+        msg = header + "(input_shape={}, crop_size=[{}, {}], num_crops={})".format(
+            self.input_shape, self.crop_height, self.crop_width, self.num_crops
+        )
+        return msg
+
+
+def crop_image_from_indices(images, crop_indices, crop_height, crop_width):
+    """
+    Crops images at the locations specified by @crop_indices. Crops will be
+    taken across all channels.
+
+    Args:
+        images (torch.Tensor): batch of images of shape [..., C, H, W]
+
+        crop_indices (torch.Tensor): batch of indices of shape [..., N, 2] where
+            N is the number of crops to take per image and each entry corresponds
+            to the pixel height and width of where to take the crop. Note that
+            the indices can also be of shape [..., 2] if only 1 crop should
+            be taken per image. Leading dimensions must be consistent with
+            @images argument. Each index specifies the top left of the crop.
+            Values must be in range [0, H - CH - 1] x [0, W - CW - 1] where
+            H and W are the height and width of @images and CH and CW are
+            @crop_height and @crop_width.
+
+        crop_height (int): height of crop to take
+
+        crop_width (int): width of crop to take
+
+    Returns:
+        crops (torch.Tesnor): cropped images of shape [..., C, @crop_height, @crop_width]
+    """
+
+    # make sure length of input shapes is consistent
+    assert crop_indices.shape[-1] == 2
+    ndim_im_shape = len(images.shape)
+    ndim_indices_shape = len(crop_indices.shape)
+    assert (ndim_im_shape == ndim_indices_shape + 1) or (ndim_im_shape == ndim_indices_shape + 2)
+
+    # maybe pad so that @crop_indices is shape [..., N, 2]
+    is_padded = False
+    if ndim_im_shape == ndim_indices_shape + 2:
+        crop_indices = crop_indices.unsqueeze(-2)
+        is_padded = True
+
+    # make sure leading dimensions between images and indices are consistent
+    assert images.shape[:-3] == crop_indices.shape[:-2]
+
+    device = images.device
+    image_c, image_h, image_w = images.shape[-3:]
+    num_crops = crop_indices.shape[-2]
+
+    # make sure @crop_indices are in valid range
+    assert (crop_indices[..., 0] >= 0).all().item()
+    assert (crop_indices[..., 0] < (image_h - crop_height)).all().item()
+    assert (crop_indices[..., 1] >= 0).all().item()
+    assert (crop_indices[..., 1] < (image_w - crop_width)).all().item()
+
+    # convert each crop index (ch, cw) into a list of pixel indices that correspond to the entire window.
+
+    # 2D index array with columns [0, 1, ..., CH - 1] and shape [CH, CW]
+    crop_ind_grid_h = torch.arange(crop_height).to(device)
+    crop_ind_grid_h = tu.unsqueeze_expand_at(crop_ind_grid_h, size=crop_width, dim=-1)
+    # 2D index array with rows [0, 1, ..., CW - 1] and shape [CH, CW]
+    crop_ind_grid_w = torch.arange(crop_width).to(device)
+    crop_ind_grid_w = tu.unsqueeze_expand_at(crop_ind_grid_w, size=crop_height, dim=0)
+    # combine into shape [CH, CW, 2]
+    crop_in_grid = torch.cat((crop_ind_grid_h.unsqueeze(-1), crop_ind_grid_w.unsqueeze(-1)), dim=-1)
+
+    # Add above grid with the offset index of each sampled crop to get 2d indices for each crop.
+    # After broadcasting, this will be shape [..., N, CH, CW, 2] and each crop has a [CH, CW, 2]
+    # shape array that tells us which pixels from the corresponding source image to grab.
+    grid_reshape = [1] * len(crop_indices.shape[:-1]) + [crop_height, crop_width, 2]
+    all_crop_inds = crop_indices.unsqueeze(-2).unsqueeze(-2) + crop_in_grid.reshape(grid_reshape)
+
+    # For using @torch.gather, convert to flat indices from 2D indices, and also
+    # repeat across the channel dimension. To get flat index of each pixel to grab for
+    # each sampled crop, we just use the mapping: ind = h_ind * @image_w + w_ind
+    all_crop_inds = all_crop_inds[..., 0] * image_w + all_crop_inds[..., 1]  # shape [..., N, CH, CW]
+    all_crop_inds = tu.unsqueeze_expand_at(all_crop_inds, size=image_c, dim=-3)  # shape [..., N, C, CH, CW]
+    all_crop_inds = tu.flatten(all_crop_inds, begin_axis=-2)  # shape [..., N, C, CH * CW]
+
+    # Repeat and flatten the source images -> [..., N, C, H * W] and then use gather to index with crop pixel inds
+    images_to_crop = tu.unsqueeze_expand_at(images, size=num_crops, dim=-4)
+    images_to_crop = tu.flatten(images_to_crop, begin_axis=-2)
+    crops = torch.gather(images_to_crop, dim=-1, index=all_crop_inds)
+    # [..., N, C, CH * CW] -> [..., N, C, CH, CW]
+    reshape_axis = len(crops.shape) - 1
+    crops = tu.reshape_dimensions(
+        crops, begin_axis=reshape_axis, end_axis=reshape_axis, target_dims=(crop_height, crop_width)
+    )
+
+    if is_padded:
+        # undo padding -> [..., C, CH, CW]
+        crops = crops.squeeze(-4)
+    return crops
+
+
+def sample_random_image_crops(images, crop_height, crop_width, num_crops, pos_enc=False):
+    """
+    For each image, randomly sample @num_crops crops of size (@crop_height, @crop_width), from
+    @images.
+
+    Args:
+        images (torch.Tensor): batch of images of shape [..., C, H, W]
+
+        crop_height (int): height of crop to take
+
+        crop_width (int): width of crop to take
+
+        num_crops (n): number of crops to sample
+
+        pos_enc (bool): if True, also add 2 channels to the outputs that gives a spatial
+            encoding of the original source pixel locations. This means that the
+            output crops will contain information about where in the source image
+            it was sampled from.
+
+    Returns:
+        crops (torch.Tensor): crops of shape (..., @num_crops, C, @crop_height, @crop_width)
+            if @pos_enc is False, otherwise (..., @num_crops, C + 2, @crop_height, @crop_width)
+
+        crop_inds (torch.Tensor): sampled crop indices of shape (..., N, 2)
+    """
+    device = images.device
+
+    # maybe add 2 channels of spatial encoding to the source image
+    source_im = images
+    if pos_enc:
+        # spatial encoding [y, x] in [0, 1]
+        h, w = source_im.shape[-2:]
+        pos_y, pos_x = torch.meshgrid(torch.arange(h), torch.arange(w))
+        pos_y = pos_y.float().to(device) / float(h)
+        pos_x = pos_x.float().to(device) / float(w)
+        position_enc = torch.stack((pos_y, pos_x))  # shape [C, H, W]
+
+        # unsqueeze and expand to match leading dimensions -> shape [..., C, H, W]
+        leading_shape = source_im.shape[:-3]
+        position_enc = position_enc[(None,) * len(leading_shape)]
+        position_enc = position_enc.expand(*leading_shape, -1, -1, -1)
+
+        # concat across channel dimension with input
+        source_im = torch.cat((source_im, position_enc), dim=-3)
+
+    # make sure sample boundaries ensure crops are fully within the images
+    image_c, image_h, image_w = source_im.shape[-3:]
+    max_sample_h = image_h - crop_height
+    max_sample_w = image_w - crop_width
+
+    # Sample crop locations for all tensor dimensions up to the last 3, which are [C, H, W].
+    # Each gets @num_crops samples - typically this will just be the batch dimension (B), so
+    # we will sample [B, N] indices, but this supports having more than one leading dimension,
+    # or possibly no leading dimension.
+    #
+    # Trick: sample in [0, 1) with rand, then re-scale to [0, M) and convert to long to get sampled ints
+    crop_inds_h = (max_sample_h * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
+    crop_inds_w = (max_sample_w * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
+    crop_inds = torch.cat((crop_inds_h.unsqueeze(-1), crop_inds_w.unsqueeze(-1)), dim=-1)  # shape [..., N, 2]
+
+    crops = crop_image_from_indices(
+        images=source_im,
+        crop_indices=crop_inds,
+        crop_height=crop_height,
+        crop_width=crop_width,
+    )
+
+    return crops, crop_inds
--- a/lerobot/common/policies/diffusion/model/dict_of_tensor_mixin.py
+++ b/lerobot/common/policies/diffusion/model/dict_of_tensor_mixin.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+
+class DictOfTensorMixin(nn.Module):
+    def __init__(self, params_dict=None):
+        super().__init__()
+        if params_dict is None:
+            params_dict = nn.ParameterDict()
+        self.params_dict = params_dict
+
+    @property
+    def device(self):
+        return next(iter(self.parameters())).device
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        def dfs_add(dest, keys, value: torch.Tensor):
+            if len(keys) == 1:
+                dest[keys[0]] = value
+                return
+
+            if keys[0] not in dest:
+                dest[keys[0]] = nn.ParameterDict()
+            dfs_add(dest[keys[0]], keys[1:], value)
+
+        def load_dict(state_dict, prefix):
+            out_dict = nn.ParameterDict()
+            for key, value in state_dict.items():
+                value: torch.Tensor
+                if key.startswith(prefix):
+                    param_keys = key[len(prefix) :].split(".")[1:]
+                    # if len(param_keys) == 0:
+                    #     import pdb; pdb.set_trace()
+                    dfs_add(out_dict, param_keys, value.clone())
+            return out_dict
+
+        self.params_dict = load_dict(state_dict, prefix + "params_dict")
+        self.params_dict.requires_grad_(False)
+        return
--- a/lerobot/common/policies/diffusion/model/ema_model.py
+++ b/lerobot/common/policies/diffusion/model/ema_model.py
@@ -0,0 +1,84 @@
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+
+
+class EMAModel:
+    """
+    Exponential Moving Average of models weights
+    """
+
+    def __init__(
+        self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999
+    ):
+        """
+        @crowsonkb's notes on EMA Warmup:
+            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
+            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
+            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
+            at 215.4k steps).
+        Args:
+            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
+            power (float): Exponential factor of EMA warmup. Default: 2/3.
+            min_value (float): The minimum EMA decay rate. Default: 0.
+        """
+
+        self.averaged_model = model
+        self.averaged_model.eval()
+        self.averaged_model.requires_grad_(False)
+
+        self.update_after_step = update_after_step
+        self.inv_gamma = inv_gamma
+        self.power = power
+        self.min_value = min_value
+        self.max_value = max_value
+
+        self.decay = 0.0
+        self.optimization_step = 0
+
+    def get_decay(self, optimization_step):
+        """
+        Compute the decay factor for the exponential moving average.
+        """
+        step = max(0, optimization_step - self.update_after_step - 1)
+        value = 1 - (1 + step / self.inv_gamma) ** -self.power
+
+        if step <= 0:
+            return 0.0
+
+        return max(self.min_value, min(value, self.max_value))
+
+    @torch.no_grad()
+    def step(self, new_model):
+        self.decay = self.get_decay(self.optimization_step)
+
+        # old_all_dataptrs = set()
+        # for param in new_model.parameters():
+        #     data_ptr = param.data_ptr()
+        #     if data_ptr != 0:
+        #         old_all_dataptrs.add(data_ptr)
+
+        # all_dataptrs = set()
+        for module, ema_module in zip(new_model.modules(), self.averaged_model.modules(), strict=False):
+            for param, ema_param in zip(
+                module.parameters(recurse=False), ema_module.parameters(recurse=False), strict=False
+            ):
+                # iterative over immediate parameters only.
+                if isinstance(param, dict):
+                    raise RuntimeError("Dict parameter not supported")
+
+                # data_ptr = param.data_ptr()
+                # if data_ptr != 0:
+                #     all_dataptrs.add(data_ptr)
+
+                if isinstance(module, _BatchNorm):
+                    # skip batchnorms
+                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
+                elif not param.requires_grad:
+                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
+                else:
+                    ema_param.mul_(self.decay)
+                    ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
+
+        # verify that iterating over module and then parameters is identical to parameters recursively.
+        # assert old_all_dataptrs == all_dataptrs
+        self.optimization_step += 1
--- a/lerobot/common/policies/diffusion/model/lr_scheduler.py
+++ b/lerobot/common/policies/diffusion/model/lr_scheduler.py
@@ -0,0 +1,46 @@
+from diffusers.optimization import TYPE_TO_SCHEDULER_FUNCTION, Optimizer, Optional, SchedulerType, Union
+
+
+def get_scheduler(
+    name: Union[str, SchedulerType],
+    optimizer: Optimizer,
+    num_warmup_steps: Optional[int] = None,
+    num_training_steps: Optional[int] = None,
+    **kwargs,
+):
+    """
+    Added kwargs vs diffuser's original implementation
+
+    Unified API to get any scheduler from its name.
+
+    Args:
+        name (`str` or `SchedulerType`):
+            The name of the scheduler to use.
+        optimizer (`torch.optim.Optimizer`):
+            The optimizer that will be used during training.
+        num_warmup_steps (`int`, *optional*):
+            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+        num_training_steps (`int``, *optional*):
+            The number of training steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+    """
+    name = SchedulerType(name)
+    schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
+    if name == SchedulerType.CONSTANT:
+        return schedule_func(optimizer, **kwargs)
+
+    # All other schedulers require `num_warmup_steps`
+    if num_warmup_steps is None:
+        raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
+
+    if name == SchedulerType.CONSTANT_WITH_WARMUP:
+        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **kwargs)
+
+    # All other schedulers require `num_training_steps`
+    if num_training_steps is None:
+        raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
+
+    return schedule_func(
+        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, **kwargs
+    )
--- a/lerobot/common/policies/diffusion/model/mask_generator.py
+++ b/lerobot/common/policies/diffusion/model/mask_generator.py
@@ -0,0 +1,65 @@
+import torch
+
+from lerobot.common.policies.diffusion.model.module_attr_mixin import ModuleAttrMixin
+
+
+class LowdimMaskGenerator(ModuleAttrMixin):
+    def __init__(
+        self,
+        action_dim,
+        obs_dim,
+        # obs mask setup
+        max_n_obs_steps=2,
+        fix_obs_steps=True,
+        # action mask
+        action_visible=False,
+    ):
+        super().__init__()
+        self.action_dim = action_dim
+        self.obs_dim = obs_dim
+        self.max_n_obs_steps = max_n_obs_steps
+        self.fix_obs_steps = fix_obs_steps
+        self.action_visible = action_visible
+
+    @torch.no_grad()
+    def forward(self, shape, seed=None):
+        device = self.device
+        B, T, D = shape  # noqa: N806
+        assert (self.action_dim + self.obs_dim) == D
+
+        # create all tensors on this device
+        rng = torch.Generator(device=device)
+        if seed is not None:
+            rng = rng.manual_seed(seed)
+
+        # generate dim mask
+        dim_mask = torch.zeros(size=shape, dtype=torch.bool, device=device)
+        is_action_dim = dim_mask.clone()
+        is_action_dim[..., : self.action_dim] = True
+        is_obs_dim = ~is_action_dim
+
+        # generate obs mask
+        if self.fix_obs_steps:
+            obs_steps = torch.full((B,), fill_value=self.max_n_obs_steps, device=device)
+        else:
+            obs_steps = torch.randint(
+                low=1, high=self.max_n_obs_steps + 1, size=(B,), generator=rng, device=device
+            )
+
+        steps = torch.arange(0, T, device=device).reshape(1, T).expand(B, T)
+        obs_mask = (obs_steps > steps.T).T.reshape(B, T, 1).expand(B, T, D)
+        obs_mask = obs_mask & is_obs_dim
+
+        # generate action mask
+        if self.action_visible:
+            action_steps = torch.maximum(
+                obs_steps - 1, torch.tensor(0, dtype=obs_steps.dtype, device=obs_steps.device)
+            )
+            action_mask = (action_steps > steps.T).T.reshape(B, T, 1).expand(B, T, D)
+            action_mask = action_mask & is_action_dim
+
+        mask = obs_mask
+        if self.action_visible:
+            mask = mask | action_mask
+
+        return mask
--- a/lerobot/common/policies/diffusion/model/module_attr_mixin.py
+++ b/lerobot/common/policies/diffusion/model/module_attr_mixin.py
@@ -0,0 +1,15 @@
+import torch.nn as nn
+
+
+class ModuleAttrMixin(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._dummy_variable = nn.Parameter()
+
+    @property
+    def device(self):
+        return next(iter(self.parameters())).device
+
+    @property
+    def dtype(self):
+        return next(iter(self.parameters())).dtype
--- a/lerobot/common/policies/diffusion/model/multi_image_obs_encoder.py
+++ b/lerobot/common/policies/diffusion/model/multi_image_obs_encoder.py
@@ -5,9 +5,9 @@ import torch
 import torch.nn as nn
 import torchvision

-from diffusion_policy.common.pytorch_util import replace_submodules
-from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
-from diffusion_policy.model.vision.crop_randomizer import CropRandomizer
+from lerobot.common.policies.diffusion.model.crop_randomizer import CropRandomizer
+from lerobot.common.policies.diffusion.model.module_attr_mixin import ModuleAttrMixin
+from lerobot.common.policies.diffusion.pytorch_utils import replace_submodules


 class MultiImageObsEncoder(ModuleAttrMixin):
--- a/lerobot/common/policies/diffusion/model/normalizer.py
+++ b/lerobot/common/policies/diffusion/model/normalizer.py
@@ -0,0 +1,358 @@
+from typing import Dict, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import zarr
+
+from lerobot.common.policies.diffusion.model.dict_of_tensor_mixin import DictOfTensorMixin
+from lerobot.common.policies.diffusion.pytorch_utils import dict_apply
+
+
+class LinearNormalizer(DictOfTensorMixin):
+    avaliable_modes = ["limits", "gaussian"]
+
+    @torch.no_grad()
+    def fit(
+        self,
+        data: Union[Dict, torch.Tensor, np.ndarray, zarr.Array],
+        last_n_dims=1,
+        dtype=torch.float32,
+        mode="limits",
+        output_max=1.0,
+        output_min=-1.0,
+        range_eps=1e-4,
+        fit_offset=True,
+    ):
+        if isinstance(data, dict):
+            for key, value in data.items():
+                self.params_dict[key] = _fit(
+                    value,
+                    last_n_dims=last_n_dims,
+                    dtype=dtype,
+                    mode=mode,
+                    output_max=output_max,
+                    output_min=output_min,
+                    range_eps=range_eps,
+                    fit_offset=fit_offset,
+                )
+        else:
+            self.params_dict["_default"] = _fit(
+                data,
+                last_n_dims=last_n_dims,
+                dtype=dtype,
+                mode=mode,
+                output_max=output_max,
+                output_min=output_min,
+                range_eps=range_eps,
+                fit_offset=fit_offset,
+            )
+
+    def __call__(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
+        return self.normalize(x)
+
+    def __getitem__(self, key: str):
+        return SingleFieldLinearNormalizer(self.params_dict[key])
+
+    def __setitem__(self, key: str, value: "SingleFieldLinearNormalizer"):
+        self.params_dict[key] = value.params_dict
+
+    def _normalize_impl(self, x, forward=True):
+        if isinstance(x, dict):
+            result = {}
+            for key, value in x.items():
+                params = self.params_dict[key]
+                result[key] = _normalize(value, params, forward=forward)
+            return result
+        else:
+            if "_default" not in self.params_dict:
+                raise RuntimeError("Not initialized")
+            params = self.params_dict["_default"]
+            return _normalize(x, params, forward=forward)
+
+    def normalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
+        return self._normalize_impl(x, forward=True)
+
+    def unnormalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
+        return self._normalize_impl(x, forward=False)
+
+    def get_input_stats(self) -> Dict:
+        if len(self.params_dict) == 0:
+            raise RuntimeError("Not initialized")
+        if len(self.params_dict) == 1 and "_default" in self.params_dict:
+            return self.params_dict["_default"]["input_stats"]
+
+        result = {}
+        for key, value in self.params_dict.items():
+            if key != "_default":
+                result[key] = value["input_stats"]
+        return result
+
+    def get_output_stats(self, key="_default"):
+        input_stats = self.get_input_stats()
+        if "min" in input_stats:
+            # no dict
+            return dict_apply(input_stats, self.normalize)
+
+        result = {}
+        for key, group in input_stats.items():
+            this_dict = {}
+            for name, value in group.items():
+                this_dict[name] = self.normalize({key: value})[key]
+            result[key] = this_dict
+        return result
+
+
+class SingleFieldLinearNormalizer(DictOfTensorMixin):
+    avaliable_modes = ["limits", "gaussian"]
+
+    @torch.no_grad()
+    def fit(
+        self,
+        data: Union[torch.Tensor, np.ndarray, zarr.Array],
+        last_n_dims=1,
+        dtype=torch.float32,
+        mode="limits",
+        output_max=1.0,
+        output_min=-1.0,
+        range_eps=1e-4,
+        fit_offset=True,
+    ):
+        self.params_dict = _fit(
+            data,
+            last_n_dims=last_n_dims,
+            dtype=dtype,
+            mode=mode,
+            output_max=output_max,
+            output_min=output_min,
+            range_eps=range_eps,
+            fit_offset=fit_offset,
+        )
+
+    @classmethod
+    def create_fit(cls, data: Union[torch.Tensor, np.ndarray, zarr.Array], **kwargs):
+        obj = cls()
+        obj.fit(data, **kwargs)
+        return obj
+
+    @classmethod
+    def create_manual(
+        cls,
+        scale: Union[torch.Tensor, np.ndarray],
+        offset: Union[torch.Tensor, np.ndarray],
+        input_stats_dict: Dict[str, Union[torch.Tensor, np.ndarray]],
+    ):
+        def to_tensor(x):
+            if not isinstance(x, torch.Tensor):
+                x = torch.from_numpy(x)
+            x = x.flatten()
+            return x
+
+        # check
+        for x in [offset] + list(input_stats_dict.values()):
+            assert x.shape == scale.shape
+            assert x.dtype == scale.dtype
+
+        params_dict = nn.ParameterDict(
+            {
+                "scale": to_tensor(scale),
+                "offset": to_tensor(offset),
+                "input_stats": nn.ParameterDict(dict_apply(input_stats_dict, to_tensor)),
+            }
+        )
+        return cls(params_dict)
+
+    @classmethod
+    def create_identity(cls, dtype=torch.float32):
+        scale = torch.tensor([1], dtype=dtype)
+        offset = torch.tensor([0], dtype=dtype)
+        input_stats_dict = {
+            "min": torch.tensor([-1], dtype=dtype),
+            "max": torch.tensor([1], dtype=dtype),
+            "mean": torch.tensor([0], dtype=dtype),
+            "std": torch.tensor([1], dtype=dtype),
+        }
+        return cls.create_manual(scale, offset, input_stats_dict)
+
+    def normalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
+        return _normalize(x, self.params_dict, forward=True)
+
+    def unnormalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
+        return _normalize(x, self.params_dict, forward=False)
+
+    def get_input_stats(self):
+        return self.params_dict["input_stats"]
+
+    def get_output_stats(self):
+        return dict_apply(self.params_dict["input_stats"], self.normalize)
+
+    def __call__(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
+        return self.normalize(x)
+
+
+def _fit(
+    data: Union[torch.Tensor, np.ndarray, zarr.Array],
+    last_n_dims=1,
+    dtype=torch.float32,
+    mode="limits",
+    output_max=1.0,
+    output_min=-1.0,
+    range_eps=1e-4,
+    fit_offset=True,
+):
+    assert mode in ["limits", "gaussian"]
+    assert last_n_dims >= 0
+    assert output_max > output_min
+
+    # convert data to torch and type
+    if isinstance(data, zarr.Array):
+        data = data[:]
+    if isinstance(data, np.ndarray):
+        data = torch.from_numpy(data)
+    if dtype is not None:
+        data = data.type(dtype)
+
+    # convert shape
+    dim = 1
+    if last_n_dims > 0:
+        dim = np.prod(data.shape[-last_n_dims:])
+    data = data.reshape(-1, dim)
+
+    # compute input stats min max mean std
+    input_min, _ = data.min(axis=0)
+    input_max, _ = data.max(axis=0)
+    input_mean = data.mean(axis=0)
+    input_std = data.std(axis=0)
+
+    # compute scale and offset
+    if mode == "limits":
+        if fit_offset:
+            # unit scale
+            input_range = input_max - input_min
+            ignore_dim = input_range < range_eps
+            input_range[ignore_dim] = output_max - output_min
+            scale = (output_max - output_min) / input_range
+            offset = output_min - scale * input_min
+            offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim]
+            # ignore dims scaled to mean of output max and min
+        else:
+            # use this when data is pre-zero-centered.
+            assert output_max > 0
+            assert output_min < 0
+            # unit abs
+            output_abs = min(abs(output_min), abs(output_max))
+            input_abs = torch.maximum(torch.abs(input_min), torch.abs(input_max))
+            ignore_dim = input_abs < range_eps
+            input_abs[ignore_dim] = output_abs
+            # don't scale constant channels
+            scale = output_abs / input_abs
+            offset = torch.zeros_like(input_mean)
+    elif mode == "gaussian":
+        ignore_dim = input_std < range_eps
+        scale = input_std.clone()
+        scale[ignore_dim] = 1
+        scale = 1 / scale
+
+        offset = -input_mean * scale if fit_offset else torch.zeros_like(input_mean)
+
+    # save
+    this_params = nn.ParameterDict(
+        {
+            "scale": scale,
+            "offset": offset,
+            "input_stats": nn.ParameterDict(
+                {"min": input_min, "max": input_max, "mean": input_mean, "std": input_std}
+            ),
+        }
+    )
+    for p in this_params.parameters():
+        p.requires_grad_(False)
+    return this_params
+
+
+def _normalize(x, params, forward=True):
+    assert "scale" in params
+    if isinstance(x, np.ndarray):
+        x = torch.from_numpy(x)
+    scale = params["scale"]
+    offset = params["offset"]
+    x = x.to(device=scale.device, dtype=scale.dtype)
+    src_shape = x.shape
+    x = x.reshape(-1, scale.shape[0])
+    x = x * scale + offset if forward else (x - offset) / scale
+    x = x.reshape(src_shape)
+    return x
+
+
+def test():
+    data = torch.zeros((100, 10, 9, 2)).uniform_()
+    data[..., 0, 0] = 0
+
+    normalizer = SingleFieldLinearNormalizer()
+    normalizer.fit(data, mode="limits", last_n_dims=2)
+    datan = normalizer.normalize(data)
+    assert datan.shape == data.shape
+    assert np.allclose(datan.max(), 1.0)
+    assert np.allclose(datan.min(), -1.0)
+    dataun = normalizer.unnormalize(datan)
+    assert torch.allclose(data, dataun, atol=1e-7)
+
+    _ = normalizer.get_input_stats()
+    _ = normalizer.get_output_stats()
+
+    normalizer = SingleFieldLinearNormalizer()
+    normalizer.fit(data, mode="limits", last_n_dims=1, fit_offset=False)
+    datan = normalizer.normalize(data)
+    assert datan.shape == data.shape
+    assert np.allclose(datan.max(), 1.0, atol=1e-3)
+    assert np.allclose(datan.min(), 0.0, atol=1e-3)
+    dataun = normalizer.unnormalize(datan)
+    assert torch.allclose(data, dataun, atol=1e-7)
+
+    data = torch.zeros((100, 10, 9, 2)).uniform_()
+    normalizer = SingleFieldLinearNormalizer()
+    normalizer.fit(data, mode="gaussian", last_n_dims=0)
+    datan = normalizer.normalize(data)
+    assert datan.shape == data.shape
+    assert np.allclose(datan.mean(), 0.0, atol=1e-3)
+    assert np.allclose(datan.std(), 1.0, atol=1e-3)
+    dataun = normalizer.unnormalize(datan)
+    assert torch.allclose(data, dataun, atol=1e-7)
+
+    # dict
+    data = torch.zeros((100, 10, 9, 2)).uniform_()
+    data[..., 0, 0] = 0
+
+    normalizer = LinearNormalizer()
+    normalizer.fit(data, mode="limits", last_n_dims=2)
+    datan = normalizer.normalize(data)
+    assert datan.shape == data.shape
+    assert np.allclose(datan.max(), 1.0)
+    assert np.allclose(datan.min(), -1.0)
+    dataun = normalizer.unnormalize(datan)
+    assert torch.allclose(data, dataun, atol=1e-7)
+
+    _ = normalizer.get_input_stats()
+    _ = normalizer.get_output_stats()
+
+    data = {
+        "obs": torch.zeros((1000, 128, 9, 2)).uniform_() * 512,
+        "action": torch.zeros((1000, 128, 2)).uniform_() * 512,
+    }
+    normalizer = LinearNormalizer()
+    normalizer.fit(data)
+    datan = normalizer.normalize(data)
+    dataun = normalizer.unnormalize(datan)
+    for key in data:
+        assert torch.allclose(data[key], dataun[key], atol=1e-4)
+
+    _ = normalizer.get_input_stats()
+    _ = normalizer.get_output_stats()
+
+    state_dict = normalizer.state_dict()
+    n = LinearNormalizer()
+    n.load_state_dict(state_dict)
+    datan = n.normalize(data)
+    dataun = n.unnormalize(datan)
+    for key in data:
+        assert torch.allclose(data[key], dataun[key], atol=1e-4)
--- a/lerobot/common/policies/diffusion/model/positional_embedding.py
+++ b/lerobot/common/policies/diffusion/model/positional_embedding.py
@@ -0,0 +1,19 @@
+import math
+
+import torch
+import torch.nn as nn
+
+
+class SinusoidalPosEmb(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = x[:, None] * emb[None, :]
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
--- a/lerobot/common/policies/diffusion/model/tensor_utils.py
+++ b/lerobot/common/policies/diffusion/model/tensor_utils.py
@@ -0,0 +1,971 @@
+"""
+A collection of utilities for working with nested tensor structures consisting
+of numpy arrays and torch tensors.
+"""
+import collections
+
+import numpy as np
+import torch
+
+
+def recursive_dict_list_tuple_apply(x, type_func_dict):
+    """
+    Recursively apply functions to a nested dictionary or list or tuple, given a dictionary of
+    {data_type: function_to_apply}.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        type_func_dict (dict): a mapping from data types to the functions to be
+            applied for each data type.
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    assert list not in type_func_dict
+    assert tuple not in type_func_dict
+    assert dict not in type_func_dict
+
+    if isinstance(x, (dict, collections.OrderedDict)):
+        new_x = collections.OrderedDict() if isinstance(x, collections.OrderedDict) else {}
+        for k, v in x.items():
+            new_x[k] = recursive_dict_list_tuple_apply(v, type_func_dict)
+        return new_x
+    elif isinstance(x, (list, tuple)):
+        ret = [recursive_dict_list_tuple_apply(v, type_func_dict) for v in x]
+        if isinstance(x, tuple):
+            ret = tuple(ret)
+        return ret
+    else:
+        for t, f in type_func_dict.items():
+            if isinstance(x, t):
+                return f(x)
+        else:
+            raise NotImplementedError("Cannot handle data type %s" % str(type(x)))
+
+
+def map_tensor(x, func):
+    """
+    Apply function @func to torch.Tensor objects in a nested dictionary or
+    list or tuple.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        func (function): function to apply to each tensor
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: func,
+            type(None): lambda x: x,
+        },
+    )
+
+
+def map_ndarray(x, func):
+    """
+    Apply function @func to np.ndarray objects in a nested dictionary or
+    list or tuple.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        func (function): function to apply to each array
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            np.ndarray: func,
+            type(None): lambda x: x,
+        },
+    )
+
+
+def map_tensor_ndarray(x, tensor_func, ndarray_func):
+    """
+    Apply function @tensor_func to torch.Tensor objects and @ndarray_func to
+    np.ndarray objects in a nested dictionary or list or tuple.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        tensor_func (function): function to apply to each tensor
+        ndarray_Func (function): function to apply to each array
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: tensor_func,
+            np.ndarray: ndarray_func,
+            type(None): lambda x: x,
+        },
+    )
+
+
+def clone(x):
+    """
+    Clones all torch tensors and numpy arrays in nested dictionary or list
+    or tuple and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x.clone(),
+            np.ndarray: lambda x: x.copy(),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def detach(x):
+    """
+    Detaches all torch tensors in nested dictionary or list
+    or tuple and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x.detach(),
+        },
+    )
+
+
+def to_batch(x):
+    """
+    Introduces a leading batch dimension of 1 for all torch tensors and numpy
+    arrays in nested dictionary or list or tuple and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x[None, ...],
+            np.ndarray: lambda x: x[None, ...],
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_sequence(x):
+    """
+    Introduces a time dimension of 1 at dimension 1 for all torch tensors and numpy
+    arrays in nested dictionary or list or tuple and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x[:, None, ...],
+            np.ndarray: lambda x: x[:, None, ...],
+            type(None): lambda x: x,
+        },
+    )
+
+
+def index_at_time(x, ind):
+    """
+    Indexes all torch tensors and numpy arrays in dimension 1 with index @ind in
+    nested dictionary or list or tuple and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        ind (int): index
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x[:, ind, ...],
+            np.ndarray: lambda x: x[:, ind, ...],
+            type(None): lambda x: x,
+        },
+    )
+
+
+def unsqueeze(x, dim):
+    """
+    Adds dimension of size 1 at dimension @dim in all torch tensors and numpy arrays
+    in nested dictionary or list or tuple and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        dim (int): dimension
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x.unsqueeze(dim=dim),
+            np.ndarray: lambda x: np.expand_dims(x, axis=dim),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def contiguous(x):
+    """
+    Makes all torch tensors and numpy arrays contiguous in nested dictionary or
+    list or tuple and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x.contiguous(),
+            np.ndarray: lambda x: np.ascontiguousarray(x),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_device(x, device):
+    """
+    Sends all torch tensors in nested dictionary or list or tuple to device
+    @device, and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        device (torch.Device): device to send tensors to
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x, d=device: x.to(d),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_tensor(x):
+    """
+    Converts all numpy arrays in nested dictionary or list or tuple to
+    torch tensors (and leaves existing torch Tensors as-is), and returns
+    a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x,
+            np.ndarray: lambda x: torch.from_numpy(x),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_numpy(x):
+    """
+    Converts all torch tensors in nested dictionary or list or tuple to
+    numpy (and leaves existing numpy arrays as-is), and returns
+    a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+
+    def f(tensor):
+        if tensor.is_cuda:
+            return tensor.detach().cpu().numpy()
+        else:
+            return tensor.detach().numpy()
+
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: f,
+            np.ndarray: lambda x: x,
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_list(x):
+    """
+    Converts all torch tensors and numpy arrays in nested dictionary or list
+    or tuple to a list, and returns a new nested structure. Useful for
+    json encoding.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+
+    def f(tensor):
+        if tensor.is_cuda:
+            return tensor.detach().cpu().numpy().tolist()
+        else:
+            return tensor.detach().numpy().tolist()
+
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: f,
+            np.ndarray: lambda x: x.tolist(),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_float(x):
+    """
+    Converts all torch tensors and numpy arrays in nested dictionary or list
+    or tuple to float type entries, and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x.float(),
+            np.ndarray: lambda x: x.astype(np.float32),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_uint8(x):
+    """
+    Converts all torch tensors and numpy arrays in nested dictionary or list
+    or tuple to uint8 type entries, and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x.byte(),
+            np.ndarray: lambda x: x.astype(np.uint8),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def to_torch(x, device):
+    """
+    Converts all numpy arrays and torch tensors in nested dictionary or list or tuple to
+    torch tensors on device @device and returns a new nested structure.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        device (torch.Device): device to send tensors to
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return to_device(to_float(to_tensor(x)), device)
+
+
+def to_one_hot_single(tensor, num_class):
+    """
+    Convert tensor to one-hot representation, assuming a certain number of total class labels.
+
+    Args:
+        tensor (torch.Tensor): tensor containing integer labels
+        num_class (int): number of classes
+
+    Returns:
+        x (torch.Tensor): tensor containing one-hot representation of labels
+    """
+    x = torch.zeros(tensor.size() + (num_class,)).to(tensor.device)
+    x.scatter_(-1, tensor.unsqueeze(-1), 1)
+    return x
+
+
+def to_one_hot(tensor, num_class):
+    """
+    Convert all tensors in nested dictionary or list or tuple to one-hot representation,
+    assuming a certain number of total class labels.
+
+    Args:
+        tensor (dict or list or tuple): a possibly nested dictionary or list or tuple
+        num_class (int): number of classes
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return map_tensor(tensor, func=lambda x, nc=num_class: to_one_hot_single(x, nc))
+
+
+def flatten_single(x, begin_axis=1):
+    """
+    Flatten a tensor in all dimensions from @begin_axis onwards.
+
+    Args:
+        x (torch.Tensor): tensor to flatten
+        begin_axis (int): which axis to flatten from
+
+    Returns:
+        y (torch.Tensor): flattened tensor
+    """
+    fixed_size = x.size()[:begin_axis]
+    _s = list(fixed_size) + [-1]
+    return x.reshape(*_s)
+
+
+def flatten(x, begin_axis=1):
+    """
+    Flatten all tensors in nested dictionary or list or tuple, from @begin_axis onwards.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        begin_axis (int): which axis to flatten from
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x, b=begin_axis: flatten_single(x, begin_axis=b),
+        },
+    )
+
+
+def reshape_dimensions_single(x, begin_axis, end_axis, target_dims):
+    """
+    Reshape selected dimensions in a tensor to a target dimension.
+
+    Args:
+        x (torch.Tensor): tensor to reshape
+        begin_axis (int): begin dimension
+        end_axis (int): end dimension
+        target_dims (tuple or list): target shape for the range of dimensions
+            (@begin_axis, @end_axis)
+
+    Returns:
+        y (torch.Tensor): reshaped tensor
+    """
+    assert begin_axis <= end_axis
+    assert begin_axis >= 0
+    assert end_axis < len(x.shape)
+    assert isinstance(target_dims, (tuple, list))
+    s = x.shape
+    final_s = []
+    for i in range(len(s)):
+        if i == begin_axis:
+            final_s.extend(target_dims)
+        elif i < begin_axis or i > end_axis:
+            final_s.append(s[i])
+    return x.reshape(*final_s)
+
+
+def reshape_dimensions(x, begin_axis, end_axis, target_dims):
+    """
+    Reshape selected dimensions for all tensors in nested dictionary or list or tuple
+    to a target dimension.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        begin_axis (int): begin dimension
+        end_axis (int): end dimension
+        target_dims (tuple or list): target shape for the range of dimensions
+            (@begin_axis, @end_axis)
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
+                x, begin_axis=b, end_axis=e, target_dims=t
+            ),
+            np.ndarray: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
+                x, begin_axis=b, end_axis=e, target_dims=t
+            ),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def join_dimensions(x, begin_axis, end_axis):
+    """
+    Joins all dimensions between dimensions (@begin_axis, @end_axis) into a flat dimension, for
+    all tensors in nested dictionary or list or tuple.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        begin_axis (int): begin dimension
+        end_axis (int): end dimension
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
+                x, begin_axis=b, end_axis=e, target_dims=[-1]
+            ),
+            np.ndarray: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
+                x, begin_axis=b, end_axis=e, target_dims=[-1]
+            ),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def expand_at_single(x, size, dim):
+    """
+    Expand a tensor at a single dimension @dim by @size
+
+    Args:
+        x (torch.Tensor): input tensor
+        size (int): size to expand
+        dim (int): dimension to expand
+
+    Returns:
+        y (torch.Tensor): expanded tensor
+    """
+    assert dim < x.ndimension()
+    assert x.shape[dim] == 1
+    expand_dims = [-1] * x.ndimension()
+    expand_dims[dim] = size
+    return x.expand(*expand_dims)
+
+
+def expand_at(x, size, dim):
+    """
+    Expand all tensors in nested dictionary or list or tuple at a single
+    dimension @dim by @size.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        size (int): size to expand
+        dim (int): dimension to expand
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return map_tensor(x, lambda t, s=size, d=dim: expand_at_single(t, s, d))
+
+
+def unsqueeze_expand_at(x, size, dim):
+    """
+    Unsqueeze and expand a tensor at a dimension @dim by @size.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        size (int): size to expand
+        dim (int): dimension to unsqueeze and expand
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    x = unsqueeze(x, dim)
+    return expand_at(x, size, dim)
+
+
+def repeat_by_expand_at(x, repeats, dim):
+    """
+    Repeat a dimension by combining expand and reshape operations.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        repeats (int): number of times to repeat the target dimension
+        dim (int): dimension to repeat on
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    x = unsqueeze_expand_at(x, repeats, dim + 1)
+    return join_dimensions(x, dim, dim + 1)
+
+
+def named_reduce_single(x, reduction, dim):
+    """
+    Reduce tensor at a dimension by named reduction functions.
+
+    Args:
+        x (torch.Tensor): tensor to be reduced
+        reduction (str): one of ["sum", "max", "mean", "flatten"]
+        dim (int): dimension to be reduced (or begin axis for flatten)
+
+    Returns:
+        y (torch.Tensor): reduced tensor
+    """
+    assert x.ndimension() > dim
+    assert reduction in ["sum", "max", "mean", "flatten"]
+    if reduction == "flatten":
+        x = flatten(x, begin_axis=dim)
+    elif reduction == "max":
+        x = torch.max(x, dim=dim)[0]  # [B, D]
+    elif reduction == "sum":
+        x = torch.sum(x, dim=dim)
+    else:
+        x = torch.mean(x, dim=dim)
+    return x
+
+
+def named_reduce(x, reduction, dim):
+    """
+    Reduces all tensors in nested dictionary or list or tuple at a dimension
+    using a named reduction function.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        reduction (str): one of ["sum", "max", "mean", "flatten"]
+        dim (int): dimension to be reduced (or begin axis for flatten)
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return map_tensor(x, func=lambda t, r=reduction, d=dim: named_reduce_single(t, r, d))
+
+
+def gather_along_dim_with_dim_single(x, target_dim, source_dim, indices):
+    """
+    This function indexes out a target dimension of a tensor in a structured way,
+    by allowing a different value to be selected for each member of a flat index
+    tensor (@indices) corresponding to a source dimension. This can be interpreted
+    as moving along the source dimension, using the corresponding index value
+    in @indices to select values for all other dimensions outside of the
+    source and target dimensions. A common use case is to gather values
+    in target dimension 1 for each batch member (target dimension 0).
+
+    Args:
+        x (torch.Tensor): tensor to gather values for
+        target_dim (int): dimension to gather values along
+        source_dim (int): dimension to hold constant and use for gathering values
+            from the other dimensions
+        indices (torch.Tensor): flat index tensor with same shape as tensor @x along
+            @source_dim
+
+    Returns:
+        y (torch.Tensor): gathered tensor, with dimension @target_dim indexed out
+    """
+    assert len(indices.shape) == 1
+    assert x.shape[source_dim] == indices.shape[0]
+
+    # unsqueeze in all dimensions except the source dimension
+    new_shape = [1] * x.ndimension()
+    new_shape[source_dim] = -1
+    indices = indices.reshape(*new_shape)
+
+    # repeat in all dimensions - but preserve shape of source dimension,
+    # and make sure target_dimension has singleton dimension
+    expand_shape = list(x.shape)
+    expand_shape[source_dim] = -1
+    expand_shape[target_dim] = 1
+    indices = indices.expand(*expand_shape)
+
+    out = x.gather(dim=target_dim, index=indices)
+    return out.squeeze(target_dim)
+
+
+def gather_along_dim_with_dim(x, target_dim, source_dim, indices):
+    """
+    Apply @gather_along_dim_with_dim_single to all tensors in a nested
+    dictionary or list or tuple.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        target_dim (int): dimension to gather values along
+        source_dim (int): dimension to hold constant and use for gathering values
+            from the other dimensions
+        indices (torch.Tensor): flat index tensor with same shape as tensor @x along
+            @source_dim
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple
+    """
+    return map_tensor(
+        x, lambda y, t=target_dim, s=source_dim, i=indices: gather_along_dim_with_dim_single(y, t, s, i)
+    )
+
+
+def gather_sequence_single(seq, indices):
+    """
+    Given a tensor with leading dimensions [B, T, ...], gather an element from each sequence in
+    the batch given an index for each sequence.
+
+    Args:
+        seq (torch.Tensor): tensor with leading dimensions [B, T, ...]
+        indices (torch.Tensor): tensor indices of shape [B]
+
+    Return:
+        y (torch.Tensor): indexed tensor of shape [B, ....]
+    """
+    return gather_along_dim_with_dim_single(seq, target_dim=1, source_dim=0, indices=indices)
+
+
+def gather_sequence(seq, indices):
+    """
+    Given a nested dictionary or list or tuple, gathers an element from each sequence of the batch
+    for tensors with leading dimensions [B, T, ...].
+
+    Args:
+        seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
+            of leading dimensions [B, T, ...]
+        indices (torch.Tensor): tensor indices of shape [B]
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple with tensors of shape [B, ...]
+    """
+    return gather_along_dim_with_dim(seq, target_dim=1, source_dim=0, indices=indices)
+
+
+def pad_sequence_single(seq, padding, batched=False, pad_same=True, pad_values=None):
+    """
+    Pad input tensor or array @seq in the time dimension (dimension 1).
+
+    Args:
+        seq (np.ndarray or torch.Tensor): sequence to be padded
+        padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
+        batched (bool): if sequence has the batch dimension
+        pad_same (bool): if pad by duplicating
+        pad_values (scalar or (ndarray, Tensor)): values to be padded if not pad_same
+
+    Returns:
+        padded sequence (np.ndarray or torch.Tensor)
+    """
+    assert isinstance(seq, (np.ndarray, torch.Tensor))
+    assert pad_same or pad_values is not None
+    if pad_values is not None:
+        assert isinstance(pad_values, float)
+    repeat_func = np.repeat if isinstance(seq, np.ndarray) else torch.repeat_interleave
+    concat_func = np.concatenate if isinstance(seq, np.ndarray) else torch.cat
+    ones_like_func = np.ones_like if isinstance(seq, np.ndarray) else torch.ones_like
+    seq_dim = 1 if batched else 0
+
+    begin_pad = []
+    end_pad = []
+
+    if padding[0] > 0:
+        pad = seq[[0]] if pad_same else ones_like_func(seq[[0]]) * pad_values
+        begin_pad.append(repeat_func(pad, padding[0], seq_dim))
+    if padding[1] > 0:
+        pad = seq[[-1]] if pad_same else ones_like_func(seq[[-1]]) * pad_values
+        end_pad.append(repeat_func(pad, padding[1], seq_dim))
+
+    return concat_func(begin_pad + [seq] + end_pad, seq_dim)
+
+
+def pad_sequence(seq, padding, batched=False, pad_same=True, pad_values=None):
+    """
+    Pad a nested dictionary or list or tuple of sequence tensors in the time dimension (dimension 1).
+
+    Args:
+        seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
+            of leading dimensions [B, T, ...]
+        padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
+        batched (bool): if sequence has the batch dimension
+        pad_same (bool): if pad by duplicating
+        pad_values (scalar or (ndarray, Tensor)): values to be padded if not pad_same
+
+    Returns:
+        padded sequence (dict or list or tuple)
+    """
+    return recursive_dict_list_tuple_apply(
+        seq,
+        {
+            torch.Tensor: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(
+                x, p, b, ps, pv
+            ),
+            np.ndarray: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(
+                x, p, b, ps, pv
+            ),
+            type(None): lambda x: x,
+        },
+    )
+
+
+def assert_size_at_dim_single(x, size, dim, msg):
+    """
+    Ensure that array or tensor @x has size @size in dim @dim.
+
+    Args:
+        x (np.ndarray or torch.Tensor): input array or tensor
+        size (int): size that tensors should have at @dim
+        dim (int): dimension to check
+        msg (str): text to display if assertion fails
+    """
+    assert x.shape[dim] == size, msg
+
+
+def assert_size_at_dim(x, size, dim, msg):
+    """
+    Ensure that arrays and tensors in nested dictionary or list or tuple have
+    size @size in dim @dim.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+        size (int): size that tensors should have at @dim
+        dim (int): dimension to check
+    """
+    map_tensor(x, lambda t, s=size, d=dim, m=msg: assert_size_at_dim_single(t, s, d, m))
+
+
+def get_shape(x):
+    """
+    Get all shapes of arrays and tensors in nested dictionary or list or tuple.
+
+    Args:
+        x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+    Returns:
+        y (dict or list or tuple): new nested dict-list-tuple that contains each array or
+            tensor's shape
+    """
+    return recursive_dict_list_tuple_apply(
+        x,
+        {
+            torch.Tensor: lambda x: x.shape,
+            np.ndarray: lambda x: x.shape,
+            type(None): lambda x: x,
+        },
+    )
+
+
+def list_of_flat_dict_to_dict_of_list(list_of_dict):
+    """
+    Helper function to go from a list of flat dictionaries to a dictionary of lists.
+    By "flat" we mean that none of the values are dictionaries, but are numpy arrays,
+    floats, etc.
+
+    Args:
+        list_of_dict (list): list of flat dictionaries
+
+    Returns:
+        dict_of_list (dict): dictionary of lists
+    """
+    assert isinstance(list_of_dict, list)
+    dic = collections.OrderedDict()
+    for i in range(len(list_of_dict)):
+        for k in list_of_dict[i]:
+            if k not in dic:
+                dic[k] = []
+            dic[k].append(list_of_dict[i][k])
+    return dic
+
+
+def flatten_nested_dict_list(d, parent_key="", sep="_", item_key=""):
+    """
+    Flatten a nested dict or list to a list.
+
+    For example, given a dict
+    {
+        a: 1
+        b: {
+            c: 2
+        }
+        c: 3
+    }
+
+    the function would return [(a, 1), (b_c, 2), (c, 3)]
+
+    Args:
+        d (dict, list): a nested dict or list to be flattened
+        parent_key (str): recursion helper
+        sep (str): separator for nesting keys
+        item_key (str): recursion helper
+    Returns:
+        list: a list of (key, value) tuples
+    """
+    items = []
+    if isinstance(d, (tuple, list)):
+        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
+        for i, v in enumerate(d):
+            items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=str(i)))
+        return items
+    elif isinstance(d, dict):
+        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
+        for k, v in d.items():
+            assert isinstance(k, str)
+            items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=k))
+        return items
+    else:
+        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
+        return [(new_key, d)]
+
+
+def time_distributed(inputs, op, activation=None, inputs_as_kwargs=False, inputs_as_args=False, **kwargs):
+    """
+    Apply function @op to all tensors in nested dictionary or list or tuple @inputs in both the
+    batch (B) and time (T) dimension, where the tensors are expected to have shape [B, T, ...].
+    Will do this by reshaping tensors to [B * T, ...], passing through the op, and then reshaping
+    outputs to [B, T, ...].
+
+    Args:
+        inputs (list or tuple or dict): a possibly nested dictionary or list or tuple with tensors
+            of leading dimensions [B, T, ...]
+        op: a layer op that accepts inputs
+        activation: activation to apply at the output
+        inputs_as_kwargs (bool): whether to feed input as a kwargs dict to the op
+        inputs_as_args (bool) whether to feed input as a args list to the op
+        kwargs (dict): other kwargs to supply to the op
+
+    Returns:
+        outputs (dict or list or tuple): new nested dict-list-tuple with tensors of leading dimension [B, T].
+    """
+    batch_size, seq_len = flatten_nested_dict_list(inputs)[0][1].shape[:2]
+    inputs = join_dimensions(inputs, 0, 1)
+    if inputs_as_kwargs:
+        outputs = op(**inputs, **kwargs)
+    elif inputs_as_args:
+        outputs = op(*inputs, **kwargs)
+    else:
+        outputs = op(inputs, **kwargs)
+
+    if activation is not None:
+        outputs = map_tensor(outputs, activation)
+    outputs = reshape_dimensions(outputs, begin_axis=0, end_axis=0, target_dims=(batch_size, seq_len))
+    return outputs
--- a/lerobot/common/policies/diffusion/policy.py
+++ b/lerobot/common/policies/diffusion/policy.py
@@ -4,10 +4,10 @@ import time
 import hydra
 import torch
 import torch.nn as nn
-from diffusion_policy.model.common.lr_scheduler import get_scheduler

-from .diffusion_unet_image_policy import DiffusionUnetImagePolicy
-from .multi_image_obs_encoder import MultiImageObsEncoder
+from lerobot.common.policies.diffusion.diffusion_unet_image_policy import DiffusionUnetImagePolicy
+from lerobot.common.policies.diffusion.model.lr_scheduler import get_scheduler
+from lerobot.common.policies.diffusion.model.multi_image_obs_encoder import MultiImageObsEncoder


 class DiffusionPolicy(nn.Module):
--- a/lerobot/common/policies/diffusion/pytorch_utils.py
+++ b/lerobot/common/policies/diffusion/pytorch_utils.py
@@ -0,0 +1,76 @@
+from typing import Callable, Dict
+
+import torch
+import torch.nn as nn
+import torchvision
+
+
+def get_resnet(name, weights=None, **kwargs):
+    """
+    name: resnet18, resnet34, resnet50
+    weights: "IMAGENET1K_V1", "r3m"
+    """
+    # load r3m weights
+    if (weights == "r3m") or (weights == "R3M"):
+        return get_r3m(name=name, **kwargs)
+
+    func = getattr(torchvision.models, name)
+    resnet = func(weights=weights, **kwargs)
+    resnet.fc = torch.nn.Identity()
+    return resnet
+
+
+def get_r3m(name, **kwargs):
+    """
+    name: resnet18, resnet34, resnet50
+    """
+    import r3m
+
+    r3m.device = "cpu"
+    model = r3m.load_r3m(name)
+    r3m_model = model.module
+    resnet_model = r3m_model.convnet
+    resnet_model = resnet_model.to("cpu")
+    return resnet_model
+
+
+def dict_apply(
+    x: Dict[str, torch.Tensor], func: Callable[[torch.Tensor], torch.Tensor]
+) -> Dict[str, torch.Tensor]:
+    result = {}
+    for key, value in x.items():
+        if isinstance(value, dict):
+            result[key] = dict_apply(value, func)
+        else:
+            result[key] = func(value)
+    return result
+
+
+def replace_submodules(
+    root_module: nn.Module, predicate: Callable[[nn.Module], bool], func: Callable[[nn.Module], nn.Module]
+) -> nn.Module:
+    """
+    predicate: Return true if the module is to be replaced.
+    func: Return new module to use.
+    """
+    if predicate(root_module):
+        return func(root_module)
+
+    bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
+    for *parent, k in bn_list:
+        parent_module = root_module
+        if len(parent) > 0:
+            parent_module = root_module.get_submodule(".".join(parent))
+        if isinstance(parent_module, nn.Sequential):
+            src_module = parent_module[int(k)]
+        else:
+            src_module = getattr(parent_module, k)
+        tgt_module = func(src_module)
+        if isinstance(parent_module, nn.Sequential):
+            parent_module[int(k)] = tgt_module
+        else:
+            setattr(parent_module, k, tgt_module)
+    # verify that all BN are replaced
+    bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
+    assert len(bn_list) == 0
+    return root_module
--- a/lerobot/common/policies/diffusion/replay_buffer.py
+++ b/lerobot/common/policies/diffusion/replay_buffer.py
@@ -0,0 +1,614 @@
+from __future__ import annotations
+
+import math
+import numbers
+import os
+from functools import cached_property
+
+import numcodecs
+import numpy as np
+import zarr
+
+
+def check_chunks_compatible(chunks: tuple, shape: tuple):
+    assert len(shape) == len(chunks)
+    for c in chunks:
+        assert isinstance(c, numbers.Integral)
+        assert c > 0
+
+
+def rechunk_recompress_array(group, name, chunks=None, chunk_length=None, compressor=None, tmp_key="_temp"):
+    old_arr = group[name]
+    if chunks is None:
+        chunks = (chunk_length,) + old_arr.chunks[1:] if chunk_length is not None else old_arr.chunks
+    check_chunks_compatible(chunks, old_arr.shape)
+
+    if compressor is None:
+        compressor = old_arr.compressor
+
+    if (chunks == old_arr.chunks) and (compressor == old_arr.compressor):
+        # no change
+        return old_arr
+
+    # rechunk recompress
+    group.move(name, tmp_key)
+    old_arr = group[tmp_key]
+    n_copied, n_skipped, n_bytes_copied = zarr.copy(
+        source=old_arr,
+        dest=group,
+        name=name,
+        chunks=chunks,
+        compressor=compressor,
+    )
+    del group[tmp_key]
+    arr = group[name]
+    return arr
+
+
+def get_optimal_chunks(shape, dtype, target_chunk_bytes=2e6, max_chunk_length=None):
+    """
+    Common shapes
+    T,D
+    T,N,D
+    T,H,W,C
+    T,N,H,W,C
+    """
+    itemsize = np.dtype(dtype).itemsize
+    # reversed
+    rshape = list(shape[::-1])
+    if max_chunk_length is not None:
+        rshape[-1] = int(max_chunk_length)
+    split_idx = len(shape) - 1
+    for i in range(len(shape) - 1):
+        this_chunk_bytes = itemsize * np.prod(rshape[:i])
+        next_chunk_bytes = itemsize * np.prod(rshape[: i + 1])
+        if this_chunk_bytes <= target_chunk_bytes and next_chunk_bytes > target_chunk_bytes:
+            split_idx = i
+
+    rchunks = rshape[:split_idx]
+    item_chunk_bytes = itemsize * np.prod(rshape[:split_idx])
+    this_max_chunk_length = rshape[split_idx]
+    next_chunk_length = min(this_max_chunk_length, math.ceil(target_chunk_bytes / item_chunk_bytes))
+    rchunks.append(next_chunk_length)
+    len_diff = len(shape) - len(rchunks)
+    rchunks.extend([1] * len_diff)
+    chunks = tuple(rchunks[::-1])
+    # print(np.prod(chunks) * itemsize / target_chunk_bytes)
+    return chunks
+
+
+class ReplayBuffer:
+    """
+    Zarr-based temporal datastructure.
+    Assumes first dimension to be time. Only chunk in time dimension.
+    """
+
+    def __init__(self, root: zarr.Group | dict[str, dict]):
+        """
+        Dummy constructor. Use copy_from* and create_from* class methods instead.
+        """
+        assert "data" in root
+        assert "meta" in root
+        assert "episode_ends" in root["meta"]
+        for value in root["data"].values():
+            assert value.shape[0] == root["meta"]["episode_ends"][-1]
+        self.root = root
+
+    # ============= create constructors ===============
+    @classmethod
+    def create_empty_zarr(cls, storage=None, root=None):
+        if root is None:
+            if storage is None:
+                storage = zarr.MemoryStore()
+            root = zarr.group(store=storage)
+        root.require_group("data", overwrite=False)
+        meta = root.require_group("meta", overwrite=False)
+        if "episode_ends" not in meta:
+            meta.zeros("episode_ends", shape=(0,), dtype=np.int64, compressor=None, overwrite=False)
+        return cls(root=root)
+
+    @classmethod
+    def create_empty_numpy(cls):
+        root = {"data": {}, "meta": {"episode_ends": np.zeros((0,), dtype=np.int64)}}
+        return cls(root=root)
+
+    @classmethod
+    def create_from_group(cls, group, **kwargs):
+        if "data" not in group:
+            # create from stratch
+            buffer = cls.create_empty_zarr(root=group, **kwargs)
+        else:
+            # already exist
+            buffer = cls(root=group, **kwargs)
+        return buffer
+
+    @classmethod
+    def create_from_path(cls, zarr_path, mode="r", **kwargs):
+        """
+        Open a on-disk zarr directly (for dataset larger than memory).
+        Slower.
+        """
+        group = zarr.open(os.path.expanduser(zarr_path), mode)
+        return cls.create_from_group(group, **kwargs)
+
+    # ============= copy constructors ===============
+    @classmethod
+    def copy_from_store(
+        cls,
+        src_store,
+        store=None,
+        keys=None,
+        chunks: dict[str, tuple] | None = None,
+        compressors: dict | str | numcodecs.abc.Codec | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        """
+        Load to memory.
+        """
+        src_root = zarr.group(src_store)
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        root = None
+        if store is None:
+            # numpy backend
+            meta = {}
+            for key, value in src_root["meta"].items():
+                if len(value.shape) == 0:
+                    meta[key] = np.array(value)
+                else:
+                    meta[key] = value[:]
+
+            if keys is None:
+                keys = src_root["data"].keys()
+            data = {}
+            for key in keys:
+                arr = src_root["data"][key]
+                data[key] = arr[:]
+
+            root = {"meta": meta, "data": data}
+        else:
+            root = zarr.group(store=store)
+            # copy without recompression
+            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
+                source=src_store, dest=store, source_path="/meta", dest_path="/meta", if_exists=if_exists
+            )
+            data_group = root.create_group("data", overwrite=True)
+            if keys is None:
+                keys = src_root["data"].keys()
+            for key in keys:
+                value = src_root["data"][key]
+                cks = cls._resolve_array_chunks(chunks=chunks, key=key, array=value)
+                cpr = cls._resolve_array_compressor(compressors=compressors, key=key, array=value)
+                if cks == value.chunks and cpr == value.compressor:
+                    # copy without recompression
+                    this_path = "/data/" + key
+                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
+                        source=src_store,
+                        dest=store,
+                        source_path=this_path,
+                        dest_path=this_path,
+                        if_exists=if_exists,
+                    )
+                else:
+                    # copy with recompression
+                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
+                        source=value,
+                        dest=data_group,
+                        name=key,
+                        chunks=cks,
+                        compressor=cpr,
+                        if_exists=if_exists,
+                    )
+        buffer = cls(root=root)
+        return buffer
+
+    @classmethod
+    def copy_from_path(
+        cls,
+        zarr_path,
+        backend=None,
+        store=None,
+        keys=None,
+        chunks: dict[str, tuple] | None = None,
+        compressors: dict | str | numcodecs.abc.Codec | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        """
+        Copy a on-disk zarr to in-memory compressed.
+        Recommended
+        """
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        if backend == "numpy":
+            print("backend argument is deprecated!")
+            store = None
+        group = zarr.open(os.path.expanduser(zarr_path), "r")
+        return cls.copy_from_store(
+            src_store=group.store,
+            store=store,
+            keys=keys,
+            chunks=chunks,
+            compressors=compressors,
+            if_exists=if_exists,
+            **kwargs,
+        )
+
+    # ============= save methods ===============
+    def save_to_store(
+        self,
+        store,
+        chunks: dict[str, tuple] | None = None,
+        compressors: str | numcodecs.abc.Codec | dict | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        root = zarr.group(store)
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        if self.backend == "zarr":
+            # recompression free copy
+            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
+                source=self.root.store,
+                dest=store,
+                source_path="/meta",
+                dest_path="/meta",
+                if_exists=if_exists,
+            )
+        else:
+            meta_group = root.create_group("meta", overwrite=True)
+            # save meta, no chunking
+            for key, value in self.root["meta"].items():
+                _ = meta_group.array(name=key, data=value, shape=value.shape, chunks=value.shape)
+
+        # save data, chunk
+        data_group = root.create_group("data", overwrite=True)
+        for key, value in self.root["data"].items():
+            cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
+            cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
+            if isinstance(value, zarr.Array):
+                if cks == value.chunks and cpr == value.compressor:
+                    # copy without recompression
+                    this_path = "/data/" + key
+                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
+                        source=self.root.store,
+                        dest=store,
+                        source_path=this_path,
+                        dest_path=this_path,
+                        if_exists=if_exists,
+                    )
+                else:
+                    # copy with recompression
+                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
+                        source=value,
+                        dest=data_group,
+                        name=key,
+                        chunks=cks,
+                        compressor=cpr,
+                        if_exists=if_exists,
+                    )
+            else:
+                # numpy
+                _ = data_group.array(name=key, data=value, chunks=cks, compressor=cpr)
+        return store
+
+    def save_to_path(
+        self,
+        zarr_path,
+        chunks: dict[str, tuple] | None = None,
+        compressors: str | numcodecs.abc.Codec | dict | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        store = zarr.DirectoryStore(os.path.expanduser(zarr_path))
+        return self.save_to_store(
+            store, chunks=chunks, compressors=compressors, if_exists=if_exists, **kwargs
+        )
+
+    @staticmethod
+    def resolve_compressor(compressor="default"):
+        if compressor == "default":
+            compressor = numcodecs.Blosc(cname="lz4", clevel=5, shuffle=numcodecs.Blosc.NOSHUFFLE)
+        elif compressor == "disk":
+            compressor = numcodecs.Blosc("zstd", clevel=5, shuffle=numcodecs.Blosc.BITSHUFFLE)
+        return compressor
+
+    @classmethod
+    def _resolve_array_compressor(cls, compressors: dict | str | numcodecs.abc.Codec, key, array):
+        # allows compressor to be explicitly set to None
+        cpr = "nil"
+        if isinstance(compressors, dict):
+            if key in compressors:
+                cpr = cls.resolve_compressor(compressors[key])
+            elif isinstance(array, zarr.Array):
+                cpr = array.compressor
+        else:
+            cpr = cls.resolve_compressor(compressors)
+        # backup default
+        if cpr == "nil":
+            cpr = cls.resolve_compressor("default")
+        return cpr
+
+    @classmethod
+    def _resolve_array_chunks(cls, chunks: dict | tuple, key, array):
+        cks = None
+        if isinstance(chunks, dict):
+            if key in chunks:
+                cks = chunks[key]
+            elif isinstance(array, zarr.Array):
+                cks = array.chunks
+        elif isinstance(chunks, tuple):
+            cks = chunks
+        else:
+            raise TypeError(f"Unsupported chunks type {type(chunks)}")
+        # backup default
+        if cks is None:
+            cks = get_optimal_chunks(shape=array.shape, dtype=array.dtype)
+        # check
+        check_chunks_compatible(chunks=cks, shape=array.shape)
+        return cks
+
+    # ============= properties =================
+    @cached_property
+    def data(self):
+        return self.root["data"]
+
+    @cached_property
+    def meta(self):
+        return self.root["meta"]
+
+    def update_meta(self, data):
+        # sanitize data
+        np_data = {}
+        for key, value in data.items():
+            if isinstance(value, np.ndarray):
+                np_data[key] = value
+            else:
+                arr = np.array(value)
+                if arr.dtype == object:
+                    raise TypeError(f"Invalid value type {type(value)}")
+                np_data[key] = arr
+
+        meta_group = self.meta
+        if self.backend == "zarr":
+            for key, value in np_data.items():
+                _ = meta_group.array(
+                    name=key, data=value, shape=value.shape, chunks=value.shape, overwrite=True
+                )
+        else:
+            meta_group.update(np_data)
+
+        return meta_group
+
+    @property
+    def episode_ends(self):
+        return self.meta["episode_ends"]
+
+    def get_episode_idxs(self):
+        import numba
+
+        numba.jit(nopython=True)
+
+        def _get_episode_idxs(episode_ends):
+            result = np.zeros((episode_ends[-1],), dtype=np.int64)
+            for i in range(len(episode_ends)):
+                start = 0
+                if i > 0:
+                    start = episode_ends[i - 1]
+                end = episode_ends[i]
+                for idx in range(start, end):
+                    result[idx] = i
+            return result
+
+        return _get_episode_idxs(self.episode_ends)
+
+    @property
+    def backend(self):
+        backend = "numpy"
+        if isinstance(self.root, zarr.Group):
+            backend = "zarr"
+        return backend
+
+    # =========== dict-like API ==============
+    def __repr__(self) -> str:
+        if self.backend == "zarr":
+            return str(self.root.tree())
+        else:
+            return super().__repr__()
+
+    def keys(self):
+        return self.data.keys()
+
+    def values(self):
+        return self.data.values()
+
+    def items(self):
+        return self.data.items()
+
+    def __getitem__(self, key):
+        return self.data[key]
+
+    def __contains__(self, key):
+        return key in self.data
+
+    # =========== our API ==============
+    @property
+    def n_steps(self):
+        if len(self.episode_ends) == 0:
+            return 0
+        return self.episode_ends[-1]
+
+    @property
+    def n_episodes(self):
+        return len(self.episode_ends)
+
+    @property
+    def chunk_size(self):
+        if self.backend == "zarr":
+            return next(iter(self.data.arrays()))[-1].chunks[0]
+        return None
+
+    @property
+    def episode_lengths(self):
+        ends = self.episode_ends[:]
+        ends = np.insert(ends, 0, 0)
+        lengths = np.diff(ends)
+        return lengths
+
+    def add_episode(
+        self,
+        data: dict[str, np.ndarray],
+        chunks: dict[str, tuple] | None = None,
+        compressors: str | numcodecs.abc.Codec | dict | None = None,
+    ):
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        assert len(data) > 0
+        is_zarr = self.backend == "zarr"
+
+        curr_len = self.n_steps
+        episode_length = None
+        for value in data.values():
+            assert len(value.shape) >= 1
+            if episode_length is None:
+                episode_length = len(value)
+            else:
+                assert episode_length == len(value)
+        new_len = curr_len + episode_length
+
+        for key, value in data.items():
+            new_shape = (new_len,) + value.shape[1:]
+            # create array
+            if key not in self.data:
+                if is_zarr:
+                    cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
+                    cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
+                    arr = self.data.zeros(
+                        name=key, shape=new_shape, chunks=cks, dtype=value.dtype, compressor=cpr
+                    )
+                else:
+                    # copy data to prevent modify
+                    arr = np.zeros(shape=new_shape, dtype=value.dtype)
+                    self.data[key] = arr
+            else:
+                arr = self.data[key]
+                assert value.shape[1:] == arr.shape[1:]
+                # same method for both zarr and numpy
+                if is_zarr:
+                    arr.resize(new_shape)
+                else:
+                    arr.resize(new_shape, refcheck=False)
+            # copy data
+            arr[-value.shape[0] :] = value
+
+        # append to episode ends
+        episode_ends = self.episode_ends
+        if is_zarr:
+            episode_ends.resize(episode_ends.shape[0] + 1)
+        else:
+            episode_ends.resize(episode_ends.shape[0] + 1, refcheck=False)
+        episode_ends[-1] = new_len
+
+        # rechunk
+        if is_zarr and episode_ends.chunks[0] < episode_ends.shape[0]:
+            rechunk_recompress_array(self.meta, "episode_ends", chunk_length=int(episode_ends.shape[0] * 1.5))
+
+    def drop_episode(self):
+        is_zarr = self.backend == "zarr"
+        episode_ends = self.episode_ends[:].copy()
+        assert len(episode_ends) > 0
+        start_idx = 0
+        if len(episode_ends) > 1:
+            start_idx = episode_ends[-2]
+        for value in self.data.values():
+            new_shape = (start_idx,) + value.shape[1:]
+            if is_zarr:
+                value.resize(new_shape)
+            else:
+                value.resize(new_shape, refcheck=False)
+        if is_zarr:
+            self.episode_ends.resize(len(episode_ends) - 1)
+        else:
+            self.episode_ends.resize(len(episode_ends) - 1, refcheck=False)
+
+    def pop_episode(self):
+        assert self.n_episodes > 0
+        episode = self.get_episode(self.n_episodes - 1, copy=True)
+        self.drop_episode()
+        return episode
+
+    def extend(self, data):
+        self.add_episode(data)
+
+    def get_episode(self, idx, copy=False):
+        idx = list(range(len(self.episode_ends)))[idx]
+        start_idx = 0
+        if idx > 0:
+            start_idx = self.episode_ends[idx - 1]
+        end_idx = self.episode_ends[idx]
+        result = self.get_steps_slice(start_idx, end_idx, copy=copy)
+        return result
+
+    def get_episode_slice(self, idx):
+        start_idx = 0
+        if idx > 0:
+            start_idx = self.episode_ends[idx - 1]
+        end_idx = self.episode_ends[idx]
+        return slice(start_idx, end_idx)
+
+    def get_steps_slice(self, start, stop, step=None, copy=False):
+        _slice = slice(start, stop, step)
+
+        result = {}
+        for key, value in self.data.items():
+            x = value[_slice]
+            if copy and isinstance(value, np.ndarray):
+                x = x.copy()
+            result[key] = x
+        return result
+
+    # =========== chunking =============
+    def get_chunks(self) -> dict:
+        assert self.backend == "zarr"
+        chunks = {}
+        for key, value in self.data.items():
+            chunks[key] = value.chunks
+        return chunks
+
+    def set_chunks(self, chunks: dict):
+        assert self.backend == "zarr"
+        for key, value in chunks.items():
+            if key in self.data:
+                arr = self.data[key]
+                if value != arr.chunks:
+                    check_chunks_compatible(chunks=value, shape=arr.shape)
+                    rechunk_recompress_array(self.data, key, chunks=value)
+
+    def get_compressors(self) -> dict:
+        assert self.backend == "zarr"
+        compressors = {}
+        for key, value in self.data.items():
+            compressors[key] = value.compressor
+        return compressors
+
+    def set_compressors(self, compressors: dict):
+        assert self.backend == "zarr"
+        for key, value in compressors.items():
+            if key in self.data:
+                arr = self.data[key]
+                compressor = self.resolve_compressor(value)
+                if compressor != arr.compressor:
+                    rechunk_recompress_array(self.data, key, compressor=compressor)
--- a/lerobot/common/policies/factory.py
+++ b/lerobot/common/policies/factory.py
@@ -1,6 +1,6 @@
 def make_policy(cfg):
    if cfg.policy.name == "tdmpc":
-        from lerobot.common.policies.tdmpc import TDMPC
+        from lerobot.common.policies.tdmpc.policy import TDMPC

        policy = TDMPC(cfg.policy, cfg.device)
    elif cfg.policy.name == "diffusion":
@@ -17,6 +17,12 @@ def make_policy(cfg):
            n_action_steps=cfg.n_action_steps + cfg.n_latency_steps,
            **cfg.policy,
        )
+    elif cfg.policy.name == "act":
+        from lerobot.common.policies.act.policy import ActionChunkingTransformerPolicy
+
+        policy = ActionChunkingTransformerPolicy(
+            cfg.policy, cfg.device, n_action_steps=cfg.n_action_steps + cfg.n_latency_steps
+        )
    else:
        raise ValueError(cfg.policy.name)

--- a/lerobot/common/policies/tdmpc/helper.py
+++ b/lerobot/common/policies/tdmpc/helper.py
--- a/lerobot/common/policies/tdmpc/policy.py
+++ b/lerobot/common/policies/tdmpc/policy.py
@@ -8,7 +8,7 @@ import numpy as np
 import torch
 import torch.nn as nn

-import lerobot.common.policies.tdmpc_helper as h
+import lerobot.common.policies.tdmpc.helper as h

 FIRST_FRAME = 0

--- a/lerobot/configs/default.yaml
+++ b/lerobot/configs/default.yaml
@@ -31,5 +31,4 @@ policy: ???
 wandb:
  enable: true
  project: lerobot
-  entity: rcadene  # insert your own
  notes: ""
--- a/lerobot/configs/env/aloha.yaml
+++ b/lerobot/configs/env/aloha.yaml
@@ -15,11 +15,11 @@ env:
  task: sim_insertion_human
  from_pixels: True
  pixels_only: False
-  image_size: 96
+  image_size: [3, 480, 640]
  action_repeat: 1
-  episode_length: 300
+  episode_length: 400
  fps: ${fps}

 policy:
-  state_dim: 2
-  action_dim: 2
+  state_dim: 14
+  action_dim: 14
--- a/lerobot/configs/policy/act.yaml
+++ b/lerobot/configs/policy/act.yaml
@@ -0,0 +1,58 @@
+# @package _global_
+
+offline_steps: 1344000
+online_steps: 0
+
+eval_episodes: 1
+eval_freq: 10000
+save_freq: 100000
+log_freq: 250
+
+horizon: 100
+n_obs_steps: 1
+n_latency_steps: 0
+# when temporal_agg=False, n_action_steps=horizon
+n_action_steps: ${horizon}
+
+policy:
+  name: act
+
+  pretrained_model_path:
+
+  lr: 1e-5
+  lr_backbone: 1e-5
+  weight_decay: 1e-4
+  grad_clip_norm: 10
+  backbone: resnet18
+  num_queries: ${horizon} # chunk_size
+  horizon: ${horizon} # chunk_size
+  kl_weight: 10
+  hidden_dim: 512
+  dim_feedforward: 3200
+  enc_layers: 4
+  dec_layers: 7
+  nheads: 8
+  #camera_names: [top, front_close, left_pillar, right_pillar]
+  camera_names: [top]
+  position_embedding: sine
+  masks: false
+  dilation: false
+  dropout: 0.1
+  pre_norm: false
+
+  vae: true
+
+  batch_size: 8
+
+  per_alpha: 0.6
+  per_beta: 0.4
+
+  balanced_sampling: false
+  utd: 1
+
+  n_obs_steps: ${n_obs_steps}
+
+  temporal_agg: false
+
+  state_dim: ???
+  action_dim: ???
--- a/lerobot/configs/policy/diffusion.yaml
+++ b/lerobot/configs/policy/diffusion.yaml
@@ -74,7 +74,6 @@ noise_scheduler:
  prediction_type: epsilon # or sample

 obs_encoder:
-  # _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
  shape_meta: ${shape_meta}
  # resize_shape: null
  # crop_shape: [76, 76]
@@ -85,12 +84,12 @@ obs_encoder:
  imagenet_norm: True

 rgb_model:
-  _target_: diffusion_policy.model.vision.model_getter.get_resnet
+  _target_: lerobot.common.policies.diffusion.pytorch_utils.get_resnet
  name: resnet18
  weights: null

 ema:
-  _target_: diffusion_policy.model.diffusion.ema_model.EMAModel
+  _target_: lerobot.common.policies.diffusion.model.ema_model.EMAModel
  update_after_step: 0
  inv_gamma: 1.0
  power: 0.75
--- a/lerobot/scripts/download.py
+++ b/lerobot/scripts/download.py
@@ -1,22 +0,0 @@
-# TODO(rcadene): obsolete remove
-import os
-import zipfile
-
-import gdown
-
-
-def download():
-    url = "https://drive.google.com/uc?id=1nhxpykGtPDhmQKm-_B8zBSywVRdgeVya"
-    download_path = "data.zip"
-    gdown.download(url, download_path, quiet=False)
-    print("Extracting...")
-    with zipfile.ZipFile(download_path, "r") as zip_f:
-        for member in zip_f.namelist():
-            if member.startswith("data/xarm") and member.endswith(".pkl"):
-                print(member)
-                zip_f.extract(member=member)
-    os.remove(download_path)
-
-
-if __name__ == "__main__":
-    download()
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -38,27 +38,18 @@ def eval_policy(
    successes = []
    threads = []
    for i in tqdm.tqdm(range(num_episodes)):
-        tensordict = env.reset()
-
        ep_frames = []
-
        if save_video or (return_first_video and i == 0):

-            def rendering_callback(env, td=None):
+            def render_frame(env):
                ep_frames.append(env.render())  # noqa: B023

-            # render first frame before rollout
-            rendering_callback(env)
-        else:
-            rendering_callback = None
+            env.register_rendering_hook(render_frame)

        with torch.inference_mode():
            rollout = env.rollout(
                max_steps=max_steps,
                policy=policy,
-                callback=rendering_callback,
-                auto_reset=False,
-                tensordict=tensordict,
                auto_cast_to_device=True,
            )
        # print(", ".join([f"{x:.3f}" for x in rollout["next", "reward"][:,0].tolist()]))
@@ -85,6 +76,8 @@ def eval_policy(
            if return_first_video and i == 0:
                first_video = stacked_frames.transpose(0, 3, 1, 2)

+    env.reset_rendering_hooks()
+
    for thread in threads:
        thread.join()

@@ -126,7 +119,7 @@ def eval(cfg: dict, out_dir=None):
    offline_buffer = make_offline_buffer(cfg)

    logging.info("make_env")
-    env = make_env(cfg, transform=offline_buffer._transform)
+    env = make_env(cfg, transform=offline_buffer.transform)

    if cfg.policy.pretrained_model_path:
        policy = make_policy(cfg)
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path

 import hydra
 import numpy as np
@@ -142,11 +143,11 @@ def train(cfg: dict, out_dir=None, job_name=None):
        online_buffer = TensorDictReplayBuffer(
            storage=LazyMemmapStorage(100_000),
            sampler=online_sampler,
-            transform=offline_buffer._transform,
+            transform=offline_buffer.transform,
        )

    logging.info("make_env")
-    env = make_env(cfg, transform=offline_buffer._transform)
+    env = make_env(cfg, transform=offline_buffer.transform)

    logging.info("make_policy")
    policy = make_policy(cfg)
@@ -192,6 +193,8 @@ def train(cfg: dict, out_dir=None, job_name=None):
                num_episodes=cfg.eval_episodes,
                max_steps=cfg.env.episode_length // cfg.n_action_steps,
                return_first_video=True,
+                video_dir=Path(out_dir) / "eval",
+                save_video=True,
            )
            log_eval_info(logger, eval_info, step, cfg, offline_buffer, is_offline)
            if cfg.wandb.enable:
--- a/poetry.lock
+++ b/poetry.lock
@@ -340,69 +340,69 @@ files = [

 [[package]]
 name = "cython"
-version = "3.0.8"
+version = "3.0.9"
 description = "The Cython compiler for writing C extensions in the Python language."
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 files = [
-    {file = "Cython-3.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a846e0a38e2b24e9a5c5dc74b0e54c6e29420d88d1dafabc99e0fc0f3e338636"},
-    {file = "Cython-3.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45523fdc2b78d79b32834cc1cc12dc2ca8967af87e22a3ee1bff20e77c7f5520"},
-    {file = "Cython-3.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa0b7f3f841fe087410cab66778e2d3fb20ae2d2078a2be3dffe66c6574be39"},
-    {file = "Cython-3.0.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e87294e33e40c289c77a135f491cd721bd089f193f956f7b8ed5aa2d0b8c558f"},
-    {file = "Cython-3.0.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a1df7a129344b1215c20096d33c00193437df1a8fcca25b71f17c23b1a44f782"},
-    {file = "Cython-3.0.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:13c2a5e57a0358da467d97667297bf820b62a1a87ae47c5f87938b9bb593acbd"},
-    {file = "Cython-3.0.8-cp310-cp310-win32.whl", hash = "sha256:96b028f044f5880e3cb18ecdcfc6c8d3ce9d0af28418d5ab464509f26d8adf12"},
-    {file = "Cython-3.0.8-cp310-cp310-win_amd64.whl", hash = "sha256:8140597a8b5cc4f119a1190f5a2228a84f5ca6d8d9ec386cfce24663f48b2539"},
-    {file = "Cython-3.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aae26f9663e50caf9657148403d9874eea41770ecdd6caf381d177c2b1bb82ba"},
-    {file = "Cython-3.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:547eb3cdb2f8c6f48e6865d5a741d9dd051c25b3ce076fbca571727977b28ac3"},
-    {file = "Cython-3.0.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a567d4b9ba70b26db89d75b243529de9e649a2f56384287533cf91512705bee"},
-    {file = "Cython-3.0.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51d1426263b0e82fb22bda8ea60dc77a428581cc19e97741011b938445d383f1"},
-    {file = "Cython-3.0.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c26daaeccda072459b48d211415fd1e5507c06bcd976fa0d5b8b9f1063467d7b"},
-    {file = "Cython-3.0.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:289ce7838208211cd166e975865fd73b0649bf118170b6cebaedfbdaf4a37795"},
-    {file = "Cython-3.0.8-cp311-cp311-win32.whl", hash = "sha256:c8aa05f5e17f8042a3be052c24f2edc013fb8af874b0bf76907d16c51b4e7871"},
-    {file = "Cython-3.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:000dc9e135d0eec6ecb2b40a5b02d0868a2f8d2e027a41b0fe16a908a9e6de02"},
-    {file = "Cython-3.0.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:90d3fe31db55685d8cb97d43b0ec39ef614fcf660f83c77ed06aa670cb0e164f"},
-    {file = "Cython-3.0.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e24791ddae2324e88e3c902a765595c738f19ae34ee66bfb1a6dac54b1833419"},
-    {file = "Cython-3.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f020fa1c0552052e0660790b8153b79e3fc9a15dbd8f1d0b841fe5d204a6ae6"},
-    {file = "Cython-3.0.8-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18bfa387d7a7f77d7b2526af69a65dbd0b731b8d941aaff5becff8e21f6d7717"},
-    {file = "Cython-3.0.8-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fe81b339cffd87c0069c6049b4d33e28bdd1874625ee515785bf42c9fdff3658"},
-    {file = "Cython-3.0.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:80fd94c076e1e1b1ee40a309be03080b75f413e8997cddcf401a118879863388"},
-    {file = "Cython-3.0.8-cp312-cp312-win32.whl", hash = "sha256:85077915a93e359a9b920280d214dc0cf8a62773e1f3d7d30fab8ea4daed670c"},
-    {file = "Cython-3.0.8-cp312-cp312-win_amd64.whl", hash = "sha256:0cb2dcc565c7851f75d496f724a384a790fab12d1b82461b663e66605bec429a"},
-    {file = "Cython-3.0.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:870d2a0a7e3cbd5efa65aecdb38d715ea337a904ea7bb22324036e78fb7068e7"},
-    {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e8f2454128974905258d86534f4fd4f91d2f1343605657ecab779d80c9d6d5e"},
-    {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1949d6aa7bc792554bee2b67a9fe41008acbfe22f4f8df7b6ec7b799613a4b3"},
-    {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9f2c6e1b8f3bcd6cb230bac1843f85114780bb8be8614855b1628b36bb510e0"},
-    {file = "Cython-3.0.8-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:05d7eddc668ae7993643f32c7661f25544e791edb745758672ea5b1a82ecffa6"},
-    {file = "Cython-3.0.8-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bfabe115deef4ada5d23c87bddb11289123336dcc14347011832c07db616dd93"},
-    {file = "Cython-3.0.8-cp36-cp36m-win32.whl", hash = "sha256:0c38c9f0bcce2df0c3347285863621be904ac6b64c5792d871130569d893efd7"},
-    {file = "Cython-3.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:6c46939c3983217d140999de7c238c3141f56b1ea349e47ca49cae899969aa2c"},
-    {file = "Cython-3.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:115f0a50f752da6c99941b103b5cb090da63eb206abbc7c2ad33856ffc73f064"},
-    {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9c0f29246734561c90f36e70ed0506b61aa3d044e4cc4cba559065a2a741fae"},
-    {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ab75242869ff71e5665fe5c96f3378e79e792fa3c11762641b6c5afbbbbe026"},
-    {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6717c06e9cfc6c1df18543cd31a21f5d8e378a40f70c851fa2d34f0597037abc"},
-    {file = "Cython-3.0.8-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9d3f74388db378a3c6fd06e79a809ed98df3f56484d317b81ee762dbf3c263e0"},
-    {file = "Cython-3.0.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ae7ac561fd8253a9ae96311e91d12af5f701383564edc11d6338a7b60b285a6f"},
-    {file = "Cython-3.0.8-cp37-cp37m-win32.whl", hash = "sha256:97b2a45845b993304f1799664fa88da676ee19442b15fdcaa31f9da7e1acc434"},
-    {file = "Cython-3.0.8-cp37-cp37m-win_amd64.whl", hash = "sha256:9e2be2b340fea46fb849d378f9b80d3c08ff2e81e2bfbcdb656e2e3cd8c6b2dc"},
-    {file = "Cython-3.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2cde23c555470db3f149ede78b518e8274853745289c956a0e06ad8d982e4db9"},
-    {file = "Cython-3.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7990ca127e1f1beedaf8fc8bf66541d066ef4723ad7d8d47a7cbf842e0f47580"},
-    {file = "Cython-3.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b983c8e6803f016146c26854d9150ddad5662960c804ea7f0c752c9266752f0"},
-    {file = "Cython-3.0.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a973268d7ca1a2bdf78575e459a94a78e1a0a9bb62a7db0c50041949a73b02ff"},
-    {file = "Cython-3.0.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:61a237bc9dd23c7faef0fcfce88c11c65d0c9bb73c74ccfa408b3a012073c20e"},
-    {file = "Cython-3.0.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3a3d67f079598af49e90ff9655bf85bd358f093d727eb21ca2708f467c489cae"},
-    {file = "Cython-3.0.8-cp38-cp38-win32.whl", hash = "sha256:17a642bb01a693e34c914106566f59844b4461665066613913463a719e0dd15d"},
-    {file = "Cython-3.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:2cdfc32252f3b6dc7c94032ab744dcedb45286733443c294d8f909a4854e7f83"},
-    {file = "Cython-3.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa97893d99385386925d00074654aeae3a98867f298d1e12ceaf38a9054a9bae"},
-    {file = "Cython-3.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f05c0bf9d085c031df8f583f0d506aa3be1692023de18c45d0aaf78685bbb944"},
-    {file = "Cython-3.0.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de892422582f5758bd8de187e98ac829330ec1007bc42c661f687792999988a7"},
-    {file = "Cython-3.0.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:314f2355a1f1d06e3c431eaad4708cf10037b5e91e4b231d89c913989d0bdafd"},
-    {file = "Cython-3.0.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:78825a3774211e7d5089730f00cdf7f473042acc9ceb8b9eeebe13ed3a5541de"},
-    {file = "Cython-3.0.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:df8093deabc55f37028190cf5e575c26aad23fc673f34b85d5f45076bc37ce39"},
-    {file = "Cython-3.0.8-cp39-cp39-win32.whl", hash = "sha256:1aca1b97e0095b3a9a6c33eada3f661a4ed0d499067d121239b193e5ba3bb4f0"},
-    {file = "Cython-3.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:16873d78be63bd38ffb759da7ab82814b36f56c769ee02b1d5859560e4c3ac3c"},
-    {file = "Cython-3.0.8-py2.py3-none-any.whl", hash = "sha256:171b27051253d3f9108e9759e504ba59ff06e7f7ba944457f94deaf9c21bf0b6"},
-    {file = "Cython-3.0.8.tar.gz", hash = "sha256:8333423d8fd5765e7cceea3a9985dd1e0a5dfeb2734629e1a2ed2d6233d39de6"},
+    {file = "Cython-3.0.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:296bd30d4445ac61b66c9d766567f6e81a6e262835d261e903c60c891a6729d3"},
+    {file = "Cython-3.0.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f496b52845cb45568a69d6359a2c335135233003e708ea02155c10ce3548aa89"},
+    {file = "Cython-3.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:858c3766b9aa3ab8a413392c72bbab1c144a9766b7c7bfdef64e2e414363fa0c"},
+    {file = "Cython-3.0.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c0eb1e6ef036028a52525fd9a012a556f6dd4788a0e8755fe864ba0e70cde2ff"},
+    {file = "Cython-3.0.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c8191941073ea5896321de3c8c958fd66e5f304b0cd1f22c59edd0b86c4dd90d"},
+    {file = "Cython-3.0.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e32b016030bc72a8a22a1f21f470a2f57573761a4f00fbfe8347263f4fbdb9f1"},
+    {file = "Cython-3.0.9-cp310-cp310-win32.whl", hash = "sha256:d6f3ff1cd6123973fe03e0fb8ee936622f976c0c41138969975824d08886572b"},
+    {file = "Cython-3.0.9-cp310-cp310-win_amd64.whl", hash = "sha256:56f3b643dbe14449248bbeb9a63fe3878a24256664bc8c8ef6efd45d102596d8"},
+    {file = "Cython-3.0.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:35e6665a20d6b8a152d72b7fd87dbb2af6bb6b18a235b71add68122d594dbd41"},
+    {file = "Cython-3.0.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92f4960c40ad027bd8c364c50db11104eadc59ffeb9e5b7f605ca2f05946e20"},
+    {file = "Cython-3.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38df37d0e732fbd9a2fef898788492e82b770c33d1e4ed12444bbc8a3b3f89c0"},
+    {file = "Cython-3.0.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad7fd88ebaeaf2e76fd729a8919fae80dab3d6ac0005e28494261d52ff347a8f"},
+    {file = "Cython-3.0.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1365d5f76bf4d19df3d19ce932584c9bb76e9fb096185168918ef9b36e06bfa4"},
+    {file = "Cython-3.0.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c232e7f279388ac9625c3e5a5a9f0078a9334959c5d6458052c65bbbba895e1e"},
+    {file = "Cython-3.0.9-cp311-cp311-win32.whl", hash = "sha256:357e2fad46a25030b0c0496487e01a9dc0fdd0c09df0897f554d8ba3c1bc4872"},
+    {file = "Cython-3.0.9-cp311-cp311-win_amd64.whl", hash = "sha256:1315aee506506e8d69cf6631d8769e6b10131fdcc0eb66df2698f2a3ddaeeff2"},
+    {file = "Cython-3.0.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:157973807c2796addbed5fbc4d9c882ab34bbc60dc297ca729504901479d5df7"},
+    {file = "Cython-3.0.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00b105b5d050645dd59e6767bc0f18b48a4aa11c85f42ec7dd8181606f4059e3"},
+    {file = "Cython-3.0.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac5536d09bef240cae0416d5a703d298b74c7bbc397da803ac9d344e732d4369"},
+    {file = "Cython-3.0.9-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09c44501d476d16aaa4cbc29c87f8c0f54fc20e69b650d59cbfa4863426fc70c"},
+    {file = "Cython-3.0.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:cc9c3b9f20d8e298618e5ccd32083ca386e785b08f9893fbec4c50b6b85be772"},
+    {file = "Cython-3.0.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a30d96938c633e3ec37000ac3796525da71254ef109e66bdfd78f29891af6454"},
+    {file = "Cython-3.0.9-cp312-cp312-win32.whl", hash = "sha256:757ca93bdd80702546df4d610d2494ef2e74249cac4d5ba9464589fb464bd8a3"},
+    {file = "Cython-3.0.9-cp312-cp312-win_amd64.whl", hash = "sha256:1dc320a9905ab95414013f6de805efbff9e17bb5fb3b90bbac533f017bec8136"},
+    {file = "Cython-3.0.9-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4ae349960ebe0da0d33724eaa7f1eb866688fe5434cc67ce4dbc06d6a719fbfc"},
+    {file = "Cython-3.0.9-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63d2537bf688247f76ded6dee28ebd26274f019309aef1eb4f2f9c5c482fde2d"},
+    {file = "Cython-3.0.9-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36f5a2dfc724bea1f710b649f02d802d80fc18320c8e6396684ba4a48412445a"},
+    {file = "Cython-3.0.9-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:deaf4197d4b0bcd5714a497158ea96a2bd6d0f9636095437448f7e06453cc83d"},
+    {file = "Cython-3.0.9-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:000af6deb7412eb7ac0c635ff5e637fb8725dd0a7b88cc58dfc2b3de14e701c4"},
+    {file = "Cython-3.0.9-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:15c7f5c2d35bed9aa5f2a51eaac0df23ae72f2dbacf62fc672dd6bfaa75d2d6f"},
+    {file = "Cython-3.0.9-cp36-cp36m-win32.whl", hash = "sha256:f49aa4970cd3bec66ac22e701def16dca2a49c59cceba519898dd7526e0be2c0"},
+    {file = "Cython-3.0.9-cp36-cp36m-win_amd64.whl", hash = "sha256:4558814fa025b193058d42eeee498a53d6b04b2980d01339fc2444b23fd98e58"},
+    {file = "Cython-3.0.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:539cd1d74fd61f6cfc310fa6bbbad5adc144627f2b7486a07075d4e002fd6aad"},
+    {file = "Cython-3.0.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3232926cd406ee02eabb732206f6e882c3aed9d58f0fea764013d9240405bcf"},
+    {file = "Cython-3.0.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33b6ac376538a7fc8c567b85d3c71504308a9318702ec0485dd66c059f3165cb"},
+    {file = "Cython-3.0.9-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2cc92504b5d22ac66031ffb827bd3a967fc75a5f0f76ab48bce62df19be6fdfd"},
+    {file = "Cython-3.0.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:22b8fae756c5c0d8968691bed520876de452f216c28ec896a00739a12dba3bd9"},
+    {file = "Cython-3.0.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9cda0d92a09f3520f29bd91009f1194ba9600777c02c30c6d2d4ac65fb63e40d"},
+    {file = "Cython-3.0.9-cp37-cp37m-win32.whl", hash = "sha256:ec612418490941ed16c50c8d3784c7bdc4c4b2a10c361259871790b02ec8c1db"},
+    {file = "Cython-3.0.9-cp37-cp37m-win_amd64.whl", hash = "sha256:976c8d2bedc91ff6493fc973d38b2dc01020324039e2af0e049704a8e1b22936"},
+    {file = "Cython-3.0.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5055988b007c92256b6e9896441c3055556038c3497fcbf8c921a6c1fce90719"},
+    {file = "Cython-3.0.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9360606d964c2d0492a866464efcf9d0a92715644eede3f6a2aa696de54a137"},
+    {file = "Cython-3.0.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02c6e809f060bed073dc7cba1648077fe3b68208863d517c8b39f3920eecf9dd"},
+    {file = "Cython-3.0.9-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:95ed792c966f969cea7489c32ff90150b415c1f3567db8d5a9d489c7c1602dac"},
+    {file = "Cython-3.0.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8edd59d22950b400b03ca78d27dc694d2836a92ef0cac4f64cb4b2ff902f7e25"},
+    {file = "Cython-3.0.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4cf0ed273bf60e97922fcbbdd380c39693922a597760160b4b4355e6078ca188"},
+    {file = "Cython-3.0.9-cp38-cp38-win32.whl", hash = "sha256:5eb9bd4ae12ebb2bc79a193d95aacf090fbd8d7013e11ed5412711650cb34934"},
+    {file = "Cython-3.0.9-cp38-cp38-win_amd64.whl", hash = "sha256:44457279da56e0f829bb1fc5a5dc0836e5d498dbcf9b2324f32f7cc9d2ec6569"},
+    {file = "Cython-3.0.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4b419a1adc2af43f4660e2f6eaf1e4fac2dbac59490771eb8ac3d6063f22356"},
+    {file = "Cython-3.0.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f836192140f033b2319a0128936367c295c2b32e23df05b03b672a6015757ea"},
+    {file = "Cython-3.0.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fd198c1a7f8e9382904d622cc0efa3c184605881fd5262c64cbb7168c4c1ec5"},
+    {file = "Cython-3.0.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a274fe9ca5c53fafbcf5c8f262f8ad6896206a466f0eeb40aaf36a7951e957c0"},
+    {file = "Cython-3.0.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:158c38360bbc5063341b1e78d3737f1251050f89f58a3df0d10fb171c44262be"},
+    {file = "Cython-3.0.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8bf30b045f7deda0014b042c1b41c1d272facc762ab657529e3b05505888e878"},
+    {file = "Cython-3.0.9-cp39-cp39-win32.whl", hash = "sha256:9a001fd95c140c94d934078544ff60a3c46aca2dc86e75a76e4121d3cd1f4b33"},
+    {file = "Cython-3.0.9-cp39-cp39-win_amd64.whl", hash = "sha256:530c01c4aebba709c0ec9c7ecefe07177d0b9fd7ffee29450a118d92192ccbdf"},
+    {file = "Cython-3.0.9-py2.py3-none-any.whl", hash = "sha256:bf96417714353c5454c2e3238fca9338599330cf51625cdc1ca698684465646f"},
+    {file = "Cython-3.0.9.tar.gz", hash = "sha256:a2d354f059d1f055d34cfaa62c5b68bc78ac2ceab6407148d47fb508cf3ba4f3"},
 ]

 [[package]]
@@ -477,21 +477,6 @@ test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisi
 torch = ["accelerate (>=0.11.0)", "torch (>=1.4,<2.2.0)"]
 training = ["Jinja2", "accelerate (>=0.11.0)", "datasets", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "tensorboard"]

-[[package]]
-name = "diffusion_policy"
-version = "0.0.0"
-description = ""
-optional = false
-python-versions = "*"
-files = []
-develop = false
-
-[package.source]
-type = "git"
-url = "https://github.com/real-stanford/diffusion_policy"
-reference = "HEAD"
-resolved_reference = "548a52bbb105518058e27bf34dcf90bf6f73681a"
-
 [[package]]
 name = "distlib"
 version = "0.3.8"
@@ -503,6 +488,37 @@ files = [
    {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
 ]

+[[package]]
+name = "dm-control"
+version = "1.0.14"
+description = "Continuous control environments and MuJoCo Python bindings."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "dm_control-1.0.14-py3-none-any.whl", hash = "sha256:883c63244a7ebf598700a97564ed19fffd3479ca79efd090aed881609cdb9fc6"},
+    {file = "dm_control-1.0.14.tar.gz", hash = "sha256:def1ece747b6f175c581150826b50f1a6134086dab34f8f3fd2d088ea035cf3d"},
+]
+
+[package.dependencies]
+absl-py = ">=0.7.0"
+dm-env = "*"
+dm-tree = "!=0.1.2"
+glfw = "*"
+labmaze = "*"
+lxml = "*"
+mujoco = ">=2.3.7"
+numpy = ">=1.9.0"
+protobuf = ">=3.19.4"
+pyopengl = ">=3.1.4"
+pyparsing = ">=3.0.0"
+requests = "*"
+scipy = "*"
+setuptools = "!=50.0.0"
+tqdm = "*"
+
+[package.extras]
+hdf5 = ["h5py"]
+
 [[package]]
 name = "dm-env"
 version = "1.6"
@@ -599,43 +615,6 @@ files = [
    {file = "einops-0.7.0.tar.gz", hash = "sha256:b2b04ad6081a3b227080c9bf5e3ace7160357ff03043cd66cc5b2319eb7031d1"},
 ]

-[[package]]
-name = "etils"
-version = "1.7.0"
-description = "Collection of common python utils"
-optional = false
-python-versions = ">=3.10"
-files = [
-    {file = "etils-1.7.0-py3-none-any.whl", hash = "sha256:61af8f7c242171de15e22e5da02d527cb9e677d11f8bcafe18fcc3548eee3e60"},
-    {file = "etils-1.7.0.tar.gz", hash = "sha256:97b68fd25e185683215286ef3a54e38199b6245f5fe8be6bedc1189be4256350"},
-]
-
-[package.dependencies]
-fsspec = {version = "*", optional = true, markers = "extra == \"epath\""}
-importlib_resources = {version = "*", optional = true, markers = "extra == \"epath\""}
-typing_extensions = {version = "*", optional = true, markers = "extra == \"epy\""}
-zipp = {version = "*", optional = true, markers = "extra == \"epath\""}
-
-[package.extras]
-all = ["etils[array-types]", "etils[eapp]", "etils[ecolab]", "etils[edc]", "etils[enp]", "etils[epath-gcs]", "etils[epath-s3]", "etils[epath]", "etils[epy]", "etils[etqdm]", "etils[etree-dm]", "etils[etree-jax]", "etils[etree-tf]", "etils[etree]"]
-array-types = ["etils[enp]"]
-dev = ["chex", "dataclass_array", "optree", "pyink", "pylint (>=2.6.0)", "pytest", "pytest-subtests", "pytest-xdist", "torch"]
-docs = ["etils[all,dev]", "sphinx-apitree[ext]"]
-eapp = ["absl-py", "etils[epy]", "simple_parsing"]
-ecolab = ["etils[enp]", "etils[epy]", "etils[etree]", "jupyter", "mediapy", "numpy", "packaging", "protobuf"]
-edc = ["etils[epy]"]
-enp = ["etils[epy]", "numpy"]
-epath = ["etils[epy]", "fsspec", "importlib_resources", "typing_extensions", "zipp"]
-epath-gcs = ["etils[epath]", "gcsfs"]
-epath-s3 = ["etils[epath]", "s3fs"]
-epy = ["typing_extensions"]
-etqdm = ["absl-py", "etils[epy]", "tqdm"]
-etree = ["etils[array-types]", "etils[enp]", "etils[epy]", "etils[etqdm]"]
-etree-dm = ["dm-tree", "etils[etree]"]
-etree-jax = ["etils[etree]", "jax[cpu]"]
-etree-tf = ["etils[etree]", "tensorflow"]
-lazy-imports = ["etils[ecolab]"]
-
 [[package]]
 name = "exceptiongroup"
 version = "1.2.0"
@@ -859,20 +838,93 @@ files = [
 [package.dependencies]
 numpy = ">=1.17.3"

+[[package]]
+name = "hf-transfer"
+version = "0.1.6"
+description = ""
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "hf_transfer-0.1.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:6fd3d61f9229d27def007e53540412507b74ac2fdb1a29985ae0b6a5137749a2"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b043bb78df1225de043eb041de9d97783fcca14a0bdc1b1d560fc172fc21b648"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7db60dd18eae4fa6ea157235fb82196cde5313995b396d1b591aad3b790a7f8f"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:30d31dbab9b5a558cce407b8728e39d87d7af1ef8745ddb90187e9ae0b9e1e90"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f6b368bddd757efc7af3126ba81f9ac8f9435e2cc00902cb3d64f2be28d8f719"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa2086d8aefaaa3e144e167324574882004c0cec49bf2d0638ec4b74732d8da0"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:45d8985a0940bfe1535cb4ca781f5c11e47c83798ef3373ee1f5d57bbe527a9c"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f42b89735f1cde22f2a795d1f0915741023235666be7de45879e533c7d6010c"},
+    {file = "hf_transfer-0.1.6-cp310-none-win32.whl", hash = "sha256:2d2c4c4613f3ad45b6ce6291e347b2d3ba1b86816635681436567e461cb3c961"},
+    {file = "hf_transfer-0.1.6-cp310-none-win_amd64.whl", hash = "sha256:78b0eed8d8dce60168a46e584b9742b816af127d7e410a713e12c31249195342"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f1d8c172153f9a6cdaecf137612c42796076f61f6bea1072c90ac2e17c1ab6fa"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2c601996351f90c514a75a0eeb02bf700b1ad1db2d946cbfe4b60b79e29f0b2f"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e585c808405557d3f5488f385706abb696997bbae262ea04520757e30836d9d"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec51af1e8cf4268c268bd88932ade3d7ca895a3c661b42493503f02610ae906b"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d106fdf996332f6df3ed3fab6d6332df82e8c1fb4b20fd81a491ca4d2ab5616a"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e9c2ee9e9fde5a0319cc0e8ddfea10897482bc06d5709b10a238f1bc2ebcbc0b"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f394ea32bc7802b061e549d3133efc523b4ae4fd19bf4b74b183ca6066eef94e"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4282f09902114cd67fca98a1a1bad569a44521a8395fedf327e966714f68b977"},
+    {file = "hf_transfer-0.1.6-cp311-none-win32.whl", hash = "sha256:276dbf307d5ab6f1bcbf57b5918bfcf9c59d6848ccb28242349e1bb5985f983b"},
+    {file = "hf_transfer-0.1.6-cp311-none-win_amd64.whl", hash = "sha256:fa475175c51451186bea804471995fa8e7b2a48a61dcca55534911dc25955527"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:23d157a67acfa00007799323a1c441b2bbacc7dee625b016b7946fe0e25e6c89"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6067342a2864b988f861cd2d31bd78eb1e84d153a3f6df38485b6696d9ad3013"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91cfcb3070e205b58fa8dc8bcb6a62ccc40913fcdb9cd1ff7c364c8e3aa85345"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb76064ac5165d5eeaaf8d0903e8bf55477221ecc2a4a4d69f0baca065ab905b"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dabd3a177d83028f164984cf4dd859f77ec1e20c97a6f307ff8fcada0785ef1"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d0bf4254e44f64a26e0a5b73b5d7e8d91bb36870718fb4f8e126ec943ff4c805"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d32c1b106f38f336ceb21531f4db9b57d777b9a33017dafdb6a5316388ebe50"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff05aba3c83921e5c7635ba9f07c693cc893350c447644824043aeac27b285f5"},
+    {file = "hf_transfer-0.1.6-cp312-none-win32.whl", hash = "sha256:051ef0c55607652cb5974f59638da035773254b9a07d7ee5b574fe062de4c9d1"},
+    {file = "hf_transfer-0.1.6-cp312-none-win_amd64.whl", hash = "sha256:716fb5c574fcbdd8092ce73f9b6c66f42e3544337490f77c60ec07df02bd081b"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c0c981134a55965e279cb7be778c1ccaf93f902fc9ebe31da4f30caf824cc4d"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ef1f145f04c5b573915bcb1eb5db4039c74f6b46fce73fc473c4287e613b623"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0a7609b004db3347dbb7796df45403eceb171238210d054d93897d6d84c63a4"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60f0864bf5996773dbd5f8ae4d1649041f773fe9d5769f4c0eeb5553100acef3"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d01e55d630ffe70a4f5d0ed576a04c6a48d7c65ca9a7d18f2fca385f20685a9"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d855946c5062b665190de15b2bdbd4c8eddfee35350bfb7564592e23d36fbbd3"},
+    {file = "hf_transfer-0.1.6-cp37-none-win32.whl", hash = "sha256:fd40b2409cfaf3e8aba20169ee09552f69140e029adeec261b988903ff0c8f6f"},
+    {file = "hf_transfer-0.1.6-cp37-none-win_amd64.whl", hash = "sha256:0e0eba49d46d3b5481919aea0794aec625fbc6ecdf13fe7e0e9f3fc5d5ad5971"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e669fecb29fc454449739f9f53ed9253197e7c19e6a6eaa0f08334207af4287"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:89f701802892e5eb84f89f402686861f87dc227d6082b05f4e9d9b4e8015a3c3"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6f2b0c8b95b01409275d789a9b74d5f2e146346f985d384bf50ec727caf1ccc"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa855a2fa262792a230f9efcdb5da6d431b747d1861d2a69fe7834b19aea077e"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa8ca349afb2f0713475426946261eb2035e4efb50ebd2c1d5ad04f395f4217"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01255f043996bc7d1bae62d8afc5033a90c7e36ce308b988eeb84afe0a69562f"},
+    {file = "hf_transfer-0.1.6-cp38-none-win32.whl", hash = "sha256:60b1db183e8a7540cd4f8b2160ff4de55f77cb0c3fc6a10be1e7c30eb1b2bdeb"},
+    {file = "hf_transfer-0.1.6-cp38-none-win_amd64.whl", hash = "sha256:fb8be3cba6aaa50ab2e9dffbd25c8eb2046785eeff642cf0cdd0dd9ae6be3539"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d09af35e3e3f09b664e6429e9a0dc200f29c5bdfd88bdd9666de51183b1fe202"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a4505bd707cc14d85c800f961fad8ca76f804a8ad22fbb7b1a217d8d0c15e6a5"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c453fd8b0be9740faa23cecd1f28ee9ead7d900cefa64ff836960c503a744c9"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:13cb8884e718a78c3b81a8cdec9c7ac196dd42961fce55c3ccff3dd783e5ad7a"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39cd39df171a2b5404de69c4e6cd14eee47f6fe91c1692f939bfb9e59a0110d8"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ff0629ee9f98df57a783599602eb498f9ec3619dc69348b12e4d9d754abf0e9"},
+    {file = "hf_transfer-0.1.6-cp39-none-win32.whl", hash = "sha256:164a6ce445eb0cc7c645f5b6e1042c003d33292520c90052b6325f30c98e4c5f"},
+    {file = "hf_transfer-0.1.6-cp39-none-win_amd64.whl", hash = "sha256:11b8b4b73bf455f13218c5f827698a30ae10998ca31b8264b51052868c7a9f11"},
+    {file = "hf_transfer-0.1.6-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16957ba057376a99ea361074ce1094f61b58e769defa6be2422ae59c0b6a6530"},
+    {file = "hf_transfer-0.1.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7db952112e3b8ee1a5cbf500d2443e9ce4fb893281c5310a3e31469898628005"},
+    {file = "hf_transfer-0.1.6-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d39d826a7344f5e39f438d62632acd00467aa54a083b66496f61ef67a9885a56"},
+    {file = "hf_transfer-0.1.6-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4e2653fbfa92e7651db73d99b697c8684e7345c479bd6857da80bed6138abb2"},
+    {file = "hf_transfer-0.1.6-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:144277e6a86add10b90ec3b583253aec777130312256bfc8d5ade5377e253807"},
+    {file = "hf_transfer-0.1.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb53bcd16365313b2aa0dbdc28206f577d70770f31249cdabc387ac5841edcc"},
+    {file = "hf_transfer-0.1.6-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:990d73a5a68d8261980f146c51f4c5f9995314011cb225222021ad7c39f3af2d"},
+    {file = "hf_transfer-0.1.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:652406037029ab9b4097b4c5f29321bad5f64c2b46fbff142509d918aec87c29"},
+    {file = "hf_transfer-0.1.6.tar.gz", hash = "sha256:deb505a7d417d7055fd7b3549eadb91dfe782941261f3344025c486c16d1d2f9"},
+]
+
 [[package]]
 name = "huggingface-hub"
-version = "0.21.3"
+version = "0.21.4"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.21.3-py3-none-any.whl", hash = "sha256:b183144336fdf2810a8c109822e0bb6ef1fd61c65da6fb60e8c3f658b7144016"},
-    {file = "huggingface_hub-0.21.3.tar.gz", hash = "sha256:26a15b604e4fc7bad37c467b76456543ec849386cbca9cd7e1e135f53e500423"},
+    {file = "huggingface_hub-0.21.4-py3-none-any.whl", hash = "sha256:df37c2c37fc6c82163cdd8a67ede261687d80d1e262526d6c0ce73b6b3630a7b"},
+    {file = "huggingface_hub-0.21.4.tar.gz", hash = "sha256:e1f4968c93726565a80edf6dc309763c7b546d0cfe79aa221206034d50155531"},
 ]

 [package.dependencies]
 filelock = "*"
 fsspec = ">=2023.5.0"
+hf-transfer = {version = ">=0.1.4", optional = true, markers = "extra == \"hf_transfer\""}
 packaging = ">=20.9"
 pyyaml = ">=5.1"
 requests = "*"
@@ -986,37 +1038,22 @@ setuptools = "*"

 [[package]]
 name = "importlib-metadata"
-version = "7.0.1"
+version = "7.0.2"
 description = "Read metadata from Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "importlib_metadata-7.0.1-py3-none-any.whl", hash = "sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e"},
-    {file = "importlib_metadata-7.0.1.tar.gz", hash = "sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc"},
+    {file = "importlib_metadata-7.0.2-py3-none-any.whl", hash = "sha256:f4bc4c0c070c490abf4ce96d715f68e95923320370efb66143df00199bb6c100"},
+    {file = "importlib_metadata-7.0.2.tar.gz", hash = "sha256:198f568f3230878cb1b44fbd7975f87906c22336dba2e4a7f05278c281fbd792"},
 ]

 [package.dependencies]
 zipp = ">=0.5"

 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
 perf = ["ipython"]
-testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"]
-
-[[package]]
-name = "importlib-resources"
-version = "6.1.2"
-description = "Read resources from Python packages"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "importlib_resources-6.1.2-py3-none-any.whl", hash = "sha256:9a0a862501dc38b68adebc82970140c9e4209fc99601782925178f8386339938"},
-    {file = "importlib_resources-6.1.2.tar.gz", hash = "sha256:308abf8474e2dba5f867d279237cd4076482c3de7104a40b41426370e891549b"},
-]
-
-[package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"]
-testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"]
+testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"]

 [[package]]
 name = "iniconfig"
@@ -1046,6 +1083,50 @@ MarkupSafe = ">=2.0"
 [package.extras]
 i18n = ["Babel (>=2.7)"]

+[[package]]
+name = "labmaze"
+version = "1.0.6"
+description = "LabMaze: DeepMind Lab's text maze generator."
+optional = false
+python-versions = "*"
+files = [
+    {file = "labmaze-1.0.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b2ddef976dfd8d992b19cfa6c633f2eba7576d759c2082da534e3f727479a84a"},
+    {file = "labmaze-1.0.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:157efaa93228c8ccce5cae337902dd652093e0fba9d3a0f6506e4bee272bb66f"},
+    {file = "labmaze-1.0.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3ce98b9541c5fe6a306e411e7d018121dd646f2c9978d763fad86f9f30c5f57"},
+    {file = "labmaze-1.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e6433bd49bc541791de8191040526fddfebb77151620eb04203453f43ee486a"},
+    {file = "labmaze-1.0.6-cp310-cp310-win_amd64.whl", hash = "sha256:6a507fc35961f1b1479708e2716f65e0d0611cefb55f31a77be29ce2339b6fef"},
+    {file = "labmaze-1.0.6-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a0c2cb9dec971814ea9c5d7150af15fa3964482131fa969e0afb94bd224348af"},
+    {file = "labmaze-1.0.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2c6ba9538d819543f4be448d36b4926a3881e53646a2b331ebb5a1f353047d05"},
+    {file = "labmaze-1.0.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70635d1cdb0147a02efb6b3f607a52cdc51723bc3dcc42717a0d4ef55fa0a987"},
+    {file = "labmaze-1.0.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff472793238bd9b6dabea8094594d6074ad3c111455de3afcae72f6c40c6817e"},
+    {file = "labmaze-1.0.6-cp311-cp311-win_amd64.whl", hash = "sha256:2317e65e12fa3d1abecda7e0488dab15456cee8a2e717a586bfc8f02a91579e7"},
+    {file = "labmaze-1.0.6-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:e36b6fadcd78f22057b597c1c77823e806a0987b3bdfbf850e14b6b5b502075e"},
+    {file = "labmaze-1.0.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d1a4f8de29c2c3d7f14163759b69cd3f237093b85334c983619c1db5403a223b"},
+    {file = "labmaze-1.0.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a394f8bb857fcaa2884b809d63e750841c2662a106cfe8c045f2112d201ac7d5"},
+    {file = "labmaze-1.0.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d17abb69d4dfc56183afb5c317e8b2eaca0587abb3aabd2326efd3143c81f4e"},
+    {file = "labmaze-1.0.6-cp312-cp312-win_amd64.whl", hash = "sha256:5af997598cc46b1929d1c5a1febc32fd56c75874fe481a2a5982c65cee8450c9"},
+    {file = "labmaze-1.0.6-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:a4c5bc6e56baa55ce63b97569afec2f80cab0f6b952752a131e1f83eed190a53"},
+    {file = "labmaze-1.0.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3955f24fe5f708e1e97495b4cfe284b70ae4fd51be5e17b75a6fc04ffbd67bca"},
+    {file = "labmaze-1.0.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed96ddc0bb8d66df36428c94db83949fd84a15867e8250763a4c5e3d82104c54"},
+    {file = "labmaze-1.0.6-cp37-cp37m-win_amd64.whl", hash = "sha256:3bd0458a29e55aa09f146e28a168d2e00b8ccf19e2259a3f71154cfff3536b1d"},
+    {file = "labmaze-1.0.6-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:33f5154edc83dff55a150e54b60c8582fdafc7ec45195049809cbcc01f5e8f34"},
+    {file = "labmaze-1.0.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0971055ef2a5f7d8517fdc42b67c057093698f1eb911f46faa7018867b73fcc9"},
+    {file = "labmaze-1.0.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de18d09680007302abf49111f3fe822d8435e4fbc4468b9ec07d50a78e267865"},
+    {file = "labmaze-1.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f18126066db2218a52853c7dd490b4c3d8129fc22eb3a47eb23007524b911d53"},
+    {file = "labmaze-1.0.6-cp38-cp38-win_amd64.whl", hash = "sha256:f9aef09a76877342bb4d634b7e05f43b038a49c4f34adfb8f1b8ac57c29472f2"},
+    {file = "labmaze-1.0.6-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5dd28899418f1b8b1c7d1e1b40a4593150a7cfa95ca91e23860b9785b82cc0ee"},
+    {file = "labmaze-1.0.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:965569f37ee33090b4d4b3aa5aa7c9dcc4f62e2ae5d761e7f73ec76fc9d8aa96"},
+    {file = "labmaze-1.0.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05eccfa98c0e781bc9f939076ae600b2e25ca736e123f2a530606aedec3b531c"},
+    {file = "labmaze-1.0.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bee8c94e0fb3fc2d8180214947245c1d74a3489349a9da90b868296e77a521e9"},
+    {file = "labmaze-1.0.6-cp39-cp39-win_amd64.whl", hash = "sha256:d486e9ca3a335ad628e3bd48a09c42f1aa5f51040952ef0fe32507afedcd694b"},
+    {file = "labmaze-1.0.6.tar.gz", hash = "sha256:2e8de7094042a77d6972f1965cf5c9e8f971f1b34d225752f343190a825ebe73"},
+]
+
+[package.dependencies]
+absl-py = "*"
+numpy = ">=1.8.0"
+setuptools = "!=50.0.0"
+
 [[package]]
 name = "lazy-loader"
 version = "0.3"
@@ -1091,6 +1172,99 @@ files = [
    {file = "llvmlite-0.42.0.tar.gz", hash = "sha256:f92b09243c0cc3f457da8b983f67bd8e1295d0f5b3746c7a1861d7a99403854a"},
 ]

+[[package]]
+name = "lxml"
+version = "5.1.0"
+description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"},
+    {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9d3c0f8567ffe7502d969c2c1b809892dc793b5d0665f602aad19895f8d508da"},
+    {file = "lxml-5.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5fcfbebdb0c5d8d18b84118842f31965d59ee3e66996ac842e21f957eb76138c"},
+    {file = "lxml-5.1.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2f37c6d7106a9d6f0708d4e164b707037b7380fcd0b04c5bd9cae1fb46a856fb"},
+    {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2befa20a13f1a75c751f47e00929fb3433d67eb9923c2c0b364de449121f447c"},
+    {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22b7ee4c35f374e2c20337a95502057964d7e35b996b1c667b5c65c567d2252a"},
+    {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf8443781533b8d37b295016a4b53c1494fa9a03573c09ca5104550c138d5c05"},
+    {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:82bddf0e72cb2af3cbba7cec1d2fd11fda0de6be8f4492223d4a268713ef2147"},
+    {file = "lxml-5.1.0-cp310-cp310-win32.whl", hash = "sha256:b66aa6357b265670bb574f050ffceefb98549c721cf28351b748be1ef9577d93"},
+    {file = "lxml-5.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:4946e7f59b7b6a9e27bef34422f645e9a368cb2be11bf1ef3cafc39a1f6ba68d"},
+    {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:14deca1460b4b0f6b01f1ddc9557704e8b365f55c63070463f6c18619ebf964f"},
+    {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed8c3d2cd329bf779b7ed38db176738f3f8be637bb395ce9629fc76f78afe3d4"},
+    {file = "lxml-5.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:436a943c2900bb98123b06437cdd30580a61340fbdb7b28aaf345a459c19046a"},
+    {file = "lxml-5.1.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:acb6b2f96f60f70e7f34efe0c3ea34ca63f19ca63ce90019c6cbca6b676e81fa"},
+    {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af8920ce4a55ff41167ddbc20077f5698c2e710ad3353d32a07d3264f3a2021e"},
+    {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cfced4a069003d8913408e10ca8ed092c49a7f6cefee9bb74b6b3e860683b45"},
+    {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9e5ac3437746189a9b4121db2a7b86056ac8786b12e88838696899328fc44bb2"},
+    {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f4c9bda132ad108b387c33fabfea47866af87f4ea6ffb79418004f0521e63204"},
+    {file = "lxml-5.1.0-cp311-cp311-win32.whl", hash = "sha256:bc64d1b1dab08f679fb89c368f4c05693f58a9faf744c4d390d7ed1d8223869b"},
+    {file = "lxml-5.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5ab722ae5a873d8dcee1f5f45ddd93c34210aed44ff2dc643b5025981908cda"},
+    {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9aa543980ab1fbf1720969af1d99095a548ea42e00361e727c58a40832439114"},
+    {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6f11b77ec0979f7e4dc5ae081325a2946f1fe424148d3945f943ceaede98adb8"},
+    {file = "lxml-5.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a36c506e5f8aeb40680491d39ed94670487ce6614b9d27cabe45d94cd5d63e1e"},
+    {file = "lxml-5.1.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f643ffd2669ffd4b5a3e9b41c909b72b2a1d5e4915da90a77e119b8d48ce867a"},
+    {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16dd953fb719f0ffc5bc067428fc9e88f599e15723a85618c45847c96f11f431"},
+    {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16018f7099245157564d7148165132c70adb272fb5a17c048ba70d9cc542a1a1"},
+    {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:82cd34f1081ae4ea2ede3d52f71b7be313756e99b4b5f829f89b12da552d3aa3"},
+    {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:19a1bc898ae9f06bccb7c3e1dfd73897ecbbd2c96afe9095a6026016e5ca97b8"},
+    {file = "lxml-5.1.0-cp312-cp312-win32.whl", hash = "sha256:13521a321a25c641b9ea127ef478b580b5ec82aa2e9fc076c86169d161798b01"},
+    {file = "lxml-5.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:1ad17c20e3666c035db502c78b86e58ff6b5991906e55bdbef94977700c72623"},
+    {file = "lxml-5.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:24ef5a4631c0b6cceaf2dbca21687e29725b7c4e171f33a8f8ce23c12558ded1"},
+    {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d2900b7f5318bc7ad8631d3d40190b95ef2aa8cc59473b73b294e4a55e9f30f"},
+    {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:601f4a75797d7a770daed8b42b97cd1bb1ba18bd51a9382077a6a247a12aa38d"},
+    {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4b68c961b5cc402cbd99cca5eb2547e46ce77260eb705f4d117fd9c3f932b95"},
+    {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:afd825e30f8d1f521713a5669b63657bcfe5980a916c95855060048b88e1adb7"},
+    {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:262bc5f512a66b527d026518507e78c2f9c2bd9eb5c8aeeb9f0eb43fcb69dc67"},
+    {file = "lxml-5.1.0-cp36-cp36m-win32.whl", hash = "sha256:e856c1c7255c739434489ec9c8aa9cdf5179785d10ff20add308b5d673bed5cd"},
+    {file = "lxml-5.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:c7257171bb8d4432fe9d6fdde4d55fdbe663a63636a17f7f9aaba9bcb3153ad7"},
+    {file = "lxml-5.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b9e240ae0ba96477682aa87899d94ddec1cc7926f9df29b1dd57b39e797d5ab5"},
+    {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a96f02ba1bcd330807fc060ed91d1f7a20853da6dd449e5da4b09bfcc08fdcf5"},
+    {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3898ae2b58eeafedfe99e542a17859017d72d7f6a63de0f04f99c2cb125936"},
+    {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61c5a7edbd7c695e54fca029ceb351fc45cd8860119a0f83e48be44e1c464862"},
+    {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3aeca824b38ca78d9ee2ab82bd9883083d0492d9d17df065ba3b94e88e4d7ee6"},
+    {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8f52fe6859b9db71ee609b0c0a70fea5f1e71c3462ecf144ca800d3f434f0764"},
+    {file = "lxml-5.1.0-cp37-cp37m-win32.whl", hash = "sha256:d42e3a3fc18acc88b838efded0e6ec3edf3e328a58c68fbd36a7263a874906c8"},
+    {file = "lxml-5.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eac68f96539b32fce2c9b47eb7c25bb2582bdaf1bbb360d25f564ee9e04c542b"},
+    {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ae15347a88cf8af0949a9872b57a320d2605ae069bcdf047677318bc0bba45b1"},
+    {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c26aab6ea9c54d3bed716b8851c8bfc40cb249b8e9880e250d1eddde9f709bf5"},
+    {file = "lxml-5.1.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:342e95bddec3a698ac24378d61996b3ee5ba9acfeb253986002ac53c9a5f6f84"},
+    {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:725e171e0b99a66ec8605ac77fa12239dbe061482ac854d25720e2294652eeaa"},
+    {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d184e0d5c918cff04cdde9dbdf9600e960161d773666958c9d7b565ccc60c45"},
+    {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:98f3f020a2b736566c707c8e034945c02aa94e124c24f77ca097c446f81b01f1"},
+    {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6d48fc57e7c1e3df57be5ae8614bab6d4e7b60f65c5457915c26892c41afc59e"},
+    {file = "lxml-5.1.0-cp38-cp38-win32.whl", hash = "sha256:7ec465e6549ed97e9f1e5ed51c657c9ede767bc1c11552f7f4d022c4df4a977a"},
+    {file = "lxml-5.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:b21b4031b53d25b0858d4e124f2f9131ffc1530431c6d1321805c90da78388d1"},
+    {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:52427a7eadc98f9e62cb1368a5079ae826f94f05755d2d567d93ee1bc3ceb354"},
+    {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6a2a2c724d97c1eb8cf966b16ca2915566a4904b9aad2ed9a09c748ffe14f969"},
+    {file = "lxml-5.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843b9c835580d52828d8f69ea4302537337a21e6b4f1ec711a52241ba4a824f3"},
+    {file = "lxml-5.1.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b99f564659cfa704a2dd82d0684207b1aadf7d02d33e54845f9fc78e06b7581"},
+    {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f8b0c78e7aac24979ef09b7f50da871c2de2def043d468c4b41f512d831e912"},
+    {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bcf86dfc8ff3e992fed847c077bd875d9e0ba2fa25d859c3a0f0f76f07f0c8d"},
+    {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:49a9b4af45e8b925e1cd6f3b15bbba2c81e7dba6dce170c677c9cda547411e14"},
+    {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:280f3edf15c2a967d923bcfb1f8f15337ad36f93525828b40a0f9d6c2ad24890"},
+    {file = "lxml-5.1.0-cp39-cp39-win32.whl", hash = "sha256:ed7326563024b6e91fef6b6c7a1a2ff0a71b97793ac33dbbcf38f6005e51ff6e"},
+    {file = "lxml-5.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:8d7b4beebb178e9183138f552238f7e6613162a42164233e2bda00cb3afac58f"},
+    {file = "lxml-5.1.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9bd0ae7cc2b85320abd5e0abad5ccee5564ed5f0cc90245d2f9a8ef330a8deae"},
+    {file = "lxml-5.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8c1d679df4361408b628f42b26a5d62bd3e9ba7f0c0e7969f925021554755aa"},
+    {file = "lxml-5.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2ad3a8ce9e8a767131061a22cd28fdffa3cd2dc193f399ff7b81777f3520e372"},
+    {file = "lxml-5.1.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:304128394c9c22b6569eba2a6d98392b56fbdfbad58f83ea702530be80d0f9df"},
+    {file = "lxml-5.1.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d74fcaf87132ffc0447b3c685a9f862ffb5b43e70ea6beec2fb8057d5d2a1fea"},
+    {file = "lxml-5.1.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:8cf5877f7ed384dabfdcc37922c3191bf27e55b498fecece9fd5c2c7aaa34c33"},
+    {file = "lxml-5.1.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:877efb968c3d7eb2dad540b6cabf2f1d3c0fbf4b2d309a3c141f79c7e0061324"},
+    {file = "lxml-5.1.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f14a4fb1c1c402a22e6a341a24c1341b4a3def81b41cd354386dcb795f83897"},
+    {file = "lxml-5.1.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:25663d6e99659544ee8fe1b89b1a8c0aaa5e34b103fab124b17fa958c4a324a6"},
+    {file = "lxml-5.1.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8b9f19df998761babaa7f09e6bc169294eefafd6149aaa272081cbddc7ba4ca3"},
+    {file = "lxml-5.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e53d7e6a98b64fe54775d23a7c669763451340c3d44ad5e3a3b48a1efbdc96f"},
+    {file = "lxml-5.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c3cd1fc1dc7c376c54440aeaaa0dcc803d2126732ff5c6b68ccd619f2e64be4f"},
+    {file = "lxml-5.1.0.tar.gz", hash = "sha256:3eea6ed6e6c918e468e693c41ef07f3c3acc310b70ddd9cc72d9ef84bc9564ca"},
+]
+
+[package.extras]
+cssselect = ["cssselect (>=0.7)"]
+html5 = ["html5lib"]
+htmlsoup = ["BeautifulSoup4"]
+source = ["Cython (>=3.0.7)"]
+
 [[package]]
 name = "markupsafe"
 version = "2.1.5"
@@ -1203,42 +1377,40 @@ tests = ["pytest (>=4.6)"]

 [[package]]
 name = "mujoco"
-version = "3.1.2"
+version = "2.3.7"
 description = "MuJoCo Physics Simulator"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mujoco-3.1.2-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:fe6b3542695a5363f348ee45625b3492734f29cdc9f493ca25eae719f974370e"},
-    {file = "mujoco-3.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f07e2d1f01f1401f1a503187016f8c017d9402618c659e1482243640a1e11288"},
-    {file = "mujoco-3.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93863eccc9d77d96ce62dda2a6f61cbd880379e8d774f802568d64b9613fce39"},
-    {file = "mujoco-3.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3586c642390c16fef58b01a86071cec6814c471586e2f4115c3733c4aec64fb7"},
-    {file = "mujoco-3.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:0da77394c664945b78f199c627b609fe091ec0c4641b9d8f713637344a17821a"},
-    {file = "mujoco-3.1.2-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:b6f12904d0478c191e4770ecf9006e20953f0488a2411a8ddc62592721c136dc"},
-    {file = "mujoco-3.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f69b8d42b50c10f8d12df4948fc9d4dd6706841e7b163c1d7ce83448965acb1c"},
-    {file = "mujoco-3.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10119e39b1f45fb76b18bea242fea1d6ccf4b2285f8bd5e2cb1e2cbdeb69bdcd"},
-    {file = "mujoco-3.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a65868506dd45dddfe7be84857e57b49bc102334fc0439aa848a4d4d285d89b"},
-    {file = "mujoco-3.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:92bc73972e39539f23a05bb411c45f9be17191fe01171ac15ffafed381ee4366"},
-    {file = "mujoco-3.1.2-cp312-cp312-macosx_10_16_x86_64.whl", hash = "sha256:835d6b64ca4dc2f6a83291275fd48bd83edc888039d247958bf5b2c759db4340"},
-    {file = "mujoco-3.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ce94ca3cf14fc519981674c5b85f1055356dcdcd63bbc0ec6c340084438f27f"},
-    {file = "mujoco-3.1.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:250d9de4bd0d31fa4165faf01a1f838c429434f1263faacd95b977580f24eae7"},
-    {file = "mujoco-3.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ea009d10bbf0aba9bc835f051d25f07a2c3edbaa06627ac2348766a1f3760b9"},
-    {file = "mujoco-3.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:a0460d2ebdad4926f48b8c774da473e011c3b3afd0ccb6b6be1087b788c34011"},
-    {file = "mujoco-3.1.2-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:4ca7cae89e258a338e02229edcf8f177b459ac5e9f859ffffa07fc2c9fcfb6aa"},
-    {file = "mujoco-3.1.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:33b4fe9b5f891b29ef0fc2b0b975bc3a8a4b87774eecaf4364a83ddc6a7762ba"},
-    {file = "mujoco-3.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ed230980f33bafaf1fa8b32ef25b82b069a245de15ee6ce7127e7e054cfad16"},
-    {file = "mujoco-3.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41cc610ac40f325c9d49d9885ac6cb61822ed938f6c23cb183b261a7a28472ca"},
-    {file = "mujoco-3.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:90a172b904a6ca8e6a1be80ab7c393aaff7592843a2a6853a4f97a9204031c41"},
-    {file = "mujoco-3.1.2-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:93201291a0c5b573b4cbb19a6b08c99673f9fba167f402174eae5ffa23066d24"},
-    {file = "mujoco-3.1.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0398985bb28c2686cdeeaf4ded46e602a49ec12115ac77474144ca940e5261c5"},
-    {file = "mujoco-3.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e76b5cb07ab3088c81966ac774d573df027fa5f4e78c20953a547528a2a698"},
-    {file = "mujoco-3.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd5c3f4ae858e812cb3f03332693bcdc343b2bce55b164523acf52dea2736c9e"},
-    {file = "mujoco-3.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:ca25ff2646b06609526ef8681c0e123cd854a53c9ff23cb91dd5058a2794dab4"},
-    {file = "mujoco-3.1.2.tar.gz", hash = "sha256:53530bc1a91903f3fd4b1e99818cc38fbd9911700db29b2c9fc839f23bfacbb8"},
+    {file = "mujoco-2.3.7-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:e8714a5ff6a1561b364b7b4648d4c0c8d13e751874cf7401c309b9d23fa9598b"},
+    {file = "mujoco-2.3.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a934315f858a4e0c4b90a682fde519471cfdd7baa64435179da8cd20d4ae3f99"},
+    {file = "mujoco-2.3.7-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:36513024330f88b5f9a43558efef5692b33599bffd5141029b690a27918ffcbe"},
+    {file = "mujoco-2.3.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d4eede8ba8210fbd3d3cd1dbf69e24dd1541aa74c5af5b8adbbbf65504b6dba"},
+    {file = "mujoco-2.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab85fafc9d5a091c712947573b7e694512d283876bf7f33ae3f8daad3a20c0db"},
+    {file = "mujoco-2.3.7-cp310-cp310-win_amd64.whl", hash = "sha256:f8b7e13fef8c813d91b78f975ed0815157692777907ffa4b4be53a4edb75019b"},
+    {file = "mujoco-2.3.7-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:779520216f72a8e370e3f0cdd71b45c3b7384c63331a3189194c930a3e7cff5c"},
+    {file = "mujoco-2.3.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9d4018053879016282d27ab7a91e292c72d44efb5a88553feacfe5b843dde103"},
+    {file = "mujoco-2.3.7-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:3149b16b8122ee62642474bfd2871064e8edc40235471cf5d84be3569afc0312"},
+    {file = "mujoco-2.3.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c08660a8d52ef3efde76095f0991e807703a950c1e882d2bcd984b9a846626f7"},
+    {file = "mujoco-2.3.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:426af8965f8636d94a0f75740c3024a62b3e585020ee817ef5208ec844a1ad94"},
+    {file = "mujoco-2.3.7-cp311-cp311-win_amd64.whl", hash = "sha256:215415a8e98a4b50625beae859079d5e0810b2039e50420f0ba81763c34abb59"},
+    {file = "mujoco-2.3.7-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:8b78d14f4c60cea3c58e046bd4de453fb5b9b33aca6a25fc91d39a53f3a5342a"},
+    {file = "mujoco-2.3.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5c6f5a51d6f537a4bf294cf73816f3a6384573f8f10a5452b044df2771412a96"},
+    {file = "mujoco-2.3.7-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:ea8911e6047f92d7d775701f37e4c093971b6def3160f01d0b6926e29a7e962e"},
+    {file = "mujoco-2.3.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7473a3de4dd1a8762d569ffb139196b4c5e7eca27d256df97b6cd4c66d2a09b2"},
+    {file = "mujoco-2.3.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40e7e2d8f93d2495ec74efec84e5118ecc6e1d85157a844789c73c9ac9a4e28e"},
+    {file = "mujoco-2.3.7-cp38-cp38-win_amd64.whl", hash = "sha256:720bc228a2023b3b0ed6af78f5b0f8ea36867be321d473321555c57dbf6e4e5b"},
+    {file = "mujoco-2.3.7-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:855e79686366442aa410246043b44f7d842d3900d68fe7e37feb42147db9d707"},
+    {file = "mujoco-2.3.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:98947f4a742d34d36f3c3f83e9167025bb0414bbaa4bd859b0673bdab9959963"},
+    {file = "mujoco-2.3.7-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:d42818f2ee5d1632dbce31d136ed5ff868db54b04e4e9aca0c5a3ac329f8a90f"},
+    {file = "mujoco-2.3.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9237e1ba14bced9449c31199e6d5be49547f3a4c99bc83b196af7ca45fd73b83"},
+    {file = "mujoco-2.3.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b728ea638245b150e2650c5433e6952e0ed3798c63e47e264574270caea2a3"},
+    {file = "mujoco-2.3.7-cp39-cp39-win_amd64.whl", hash = "sha256:9c721a5042b99d948d5f0296a534bcce3f142c777c4d7642f503a539513f3912"},
+    {file = "mujoco-2.3.7.tar.gz", hash = "sha256:422041f1ce37c6d151fbced1048df626837e94fe3cd9f813585907046336a7d0"},
 ]

 [package.dependencies]
 absl-py = "*"
-etils = {version = "*", extras = ["epath"]}
 glfw = "*"
 numpy = "*"
 pyopengl = "*"
@@ -1534,13 +1706,13 @@ files = [

 [[package]]
 name = "nvidia-nvjitlink-cu12"
-version = "12.3.101"
+version = "12.4.99"
 description = "Nvidia JIT LTO Library"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_nvjitlink_cu12-12.3.101-py3-none-manylinux1_x86_64.whl", hash = "sha256:64335a8088e2b9d196ae8665430bc6a2b7e6ef2eb877a9c735c804bd4ff6467c"},
-    {file = "nvidia_nvjitlink_cu12-12.3.101-py3-none-win_amd64.whl", hash = "sha256:1b2e317e437433753530792f13eece58f0aec21a2b05903be7bffe58a606cbd1"},
+    {file = "nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl", hash = "sha256:c6428836d20fe7e327191c175791d38570e10762edc588fb46749217cd444c74"},
+    {file = "nvidia_nvjitlink_cu12-12.4.99-py3-none-win_amd64.whl", hash = "sha256:991905ffa2144cb603d8ca7962d75c35334ae82bf92820b6ba78157277da1ad2"},
 ]

 [[package]]
@@ -1595,13 +1767,13 @@ numpy = [

 [[package]]
 name = "packaging"
-version = "23.2"
+version = "24.0"
 description = "Core utilities for Python packages"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"},
-    {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"},
+    {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"},
+    {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"},
 ]

 [[package]]
@@ -2031,6 +2203,20 @@ files = [
    {file = "PyOpenGL-3.1.7.tar.gz", hash = "sha256:eef31a3888e6984fd4d8e6c9961b184c9813ca82604d37fe3da80eb000a76c86"},
 ]

+[[package]]
+name = "pyparsing"
+version = "3.1.2"
+description = "pyparsing module - Classes and methods to define and execute parsing grammars"
+optional = false
+python-versions = ">=3.6.8"
+files = [
+    {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"},
+    {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"},
+]
+
+[package.extras]
+diagrams = ["jinja2", "railroad-diagrams"]
+
 [[package]]
 name = "pysocks"
 version = "1.7.1"
@@ -2045,13 +2231,13 @@ files = [

 [[package]]
 name = "pytest"
-version = "8.1.0"
+version = "8.1.1"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pytest-8.1.0-py3-none-any.whl", hash = "sha256:ee32db7af8de4629a455806befa90559f307424c07b8413ccfc30bf5b221dd7e"},
-    {file = "pytest-8.1.0.tar.gz", hash = "sha256:f8fa04ab8f98d185113ae60ea6d79c22f8143b14bc1caeced44a0ab844928323"},
+    {file = "pytest-8.1.1-py3-none-any.whl", hash = "sha256:2a8386cfc11fa9d2c50ee7b2a57e7d898ef90470a7a34c4b949ff59662bb78b7"},
+    {file = "pytest-8.1.1.tar.gz", hash = "sha256:ac978141a75948948817d360297b7aae0fcb9d6ff6bc9ec6d514b85d5a65c044"},
 ]

 [package.dependencies]
@@ -2067,13 +2253,13 @@ testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygm

 [[package]]
 name = "python-dateutil"
-version = "2.8.2"
+version = "2.9.0.post0"
 description = "Extensions to the standard Python datetime module"
 optional = false
 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
 files = [
-    {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
-    {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
+    {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
+    {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
 ]

 [package.dependencies]
@@ -2498,13 +2684,13 @@ test = ["asv", "gmpy2", "hypothesis", "mpmath", "pooch", "pytest", "pytest-cov",

 [[package]]
 name = "sentry-sdk"
-version = "1.40.6"
+version = "1.41.0"
 description = "Python client for Sentry (https://sentry.io)"
 optional = false
 python-versions = "*"
 files = [
-    {file = "sentry-sdk-1.40.6.tar.gz", hash = "sha256:f143f3fb4bb57c90abef6e2ad06b5f6f02b2ca13e4060ec5c0549c7a9ccce3fa"},
-    {file = "sentry_sdk-1.40.6-py2.py3-none-any.whl", hash = "sha256:becda09660df63e55f307570e9817c664392655a7328bbc414b507e9cb874c67"},
+    {file = "sentry-sdk-1.41.0.tar.gz", hash = "sha256:4f2d6c43c07925d8cd10dfbd0970ea7cb784f70e79523cca9dbcd72df38e5a46"},
+    {file = "sentry_sdk-1.41.0-py2.py3-none-any.whl", hash = "sha256:be4f8f4b29a80b6a3b71f0f31487beb9e296391da20af8504498a328befed53f"},
 ]

 [package.dependencies]
@@ -2530,7 +2716,7 @@ huey = ["huey (>=2)"]
 loguru = ["loguru (>=0.5)"]
 opentelemetry = ["opentelemetry-distro (>=0.35b0)"]
 opentelemetry-experimental = ["opentelemetry-distro (>=0.40b0,<1.0)", "opentelemetry-instrumentation-aiohttp-client (>=0.40b0,<1.0)", "opentelemetry-instrumentation-django (>=0.40b0,<1.0)", "opentelemetry-instrumentation-fastapi (>=0.40b0,<1.0)", "opentelemetry-instrumentation-flask (>=0.40b0,<1.0)", "opentelemetry-instrumentation-requests (>=0.40b0,<1.0)", "opentelemetry-instrumentation-sqlite3 (>=0.40b0,<1.0)", "opentelemetry-instrumentation-urllib (>=0.40b0,<1.0)"]
-pure-eval = ["asttokens", "executing", "pure_eval"]
+pure-eval = ["asttokens", "executing", "pure-eval"]
 pymongo = ["pymongo (>=3.1)"]
 pyspark = ["pyspark (>=2.4.4)"]
 quart = ["blinker (>=1.1)", "quart (>=0.16.1)"]
@@ -2784,7 +2970,7 @@ tests = ["pytest", "pytest-benchmark", "pytest-instafail", "pytest-rerunfailures
 type = "git"
 url = "https://github.com/pytorch/tensordict"
 reference = "HEAD"
-resolved_reference = "551331d83e2979dd4505db1f49895740e6e5c95f"
+resolved_reference = "ed22554d6860731610df784b2f5d09f31d3dbc7a"

 [[package]]
 name = "termcolor"
@@ -2903,13 +3089,13 @@ tensordict = ">=0.4.0"
 torch = ">=2.1.0"

 [package.extras]
-all = ["ale-py", "atari-py", "dm-control", "git", "gym", "gym[accept-rom-license]", "gymnasium", "h5py", "huggingface-hub", "hydra-core (>=1.1)", "hydra-submitit-launcher", "minari", "moviepy", "mujoco", "pandas", "pettingzoo (>=1.24.1)", "pillow", "pygame", "pytest", "pytest-instafail", "pyyaml", "requests", "scikit-learn", "scipy", "tensorboard", "torchsnapshot", "torchvision", "tqdm", "vmas (>=1.2.10)", "wandb"]
+all = ["ale-py", "atari-py", "dm_control", "git", "gym", "gym[accept-rom-license]", "gymnasium", "h5py", "huggingface_hub", "hydra-core (>=1.1)", "hydra-submitit-launcher", "minari", "moviepy", "mujoco", "pandas", "pettingzoo (>=1.24.1)", "pillow", "pygame", "pytest", "pytest-instafail", "pyyaml", "requests", "scikit-learn", "scipy", "tensorboard", "torchsnapshot", "torchvision", "tqdm", "vmas (>=1.2.10)", "wandb"]
 atari = ["ale-py", "atari-py", "gym", "gym[accept-rom-license]", "pygame"]
 checkpointing = ["torchsnapshot"]
-dm-control = ["dm-control"]
+dm-control = ["dm_control"]
 gym-continuous = ["gymnasium", "mujoco"]
 marl = ["pettingzoo (>=1.24.1)", "vmas (>=1.2.10)"]
-offline-data = ["h5py", "huggingface-hub", "minari", "pandas", "pillow", "requests", "scikit-learn", "torchvision", "tqdm"]
+offline-data = ["h5py", "huggingface_hub", "minari", "pandas", "pillow", "requests", "scikit-learn", "torchvision", "tqdm"]
 rendering = ["moviepy"]
 tests = ["pytest", "pytest-instafail", "pyyaml", "scipy"]
 utils = ["git", "hydra-core (>=1.1)", "hydra-submitit-launcher", "tensorboard", "tqdm", "wandb"]
@@ -3066,13 +3252,13 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess

 [[package]]
 name = "wandb"
-version = "0.16.3"
+version = "0.16.4"
 description = "A CLI and library for interacting with the Weights & Biases API."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "wandb-0.16.3-py3-none-any.whl", hash = "sha256:b8907ddd775c27dc6c12687386a86b5d6acf291060f9ae680bbc61cc8fc03237"},
-    {file = "wandb-0.16.3.tar.gz", hash = "sha256:d789acda32053b18b7a160d0595837e45a3c8a79d25e1fe1f051875303f480ec"},
+    {file = "wandb-0.16.4-py3-none-any.whl", hash = "sha256:bb9eb5aa2c2c85e11c76040c4271366f54d4975167aa6320ba86c3f2d97fe5fa"},
+    {file = "wandb-0.16.4.tar.gz", hash = "sha256:8752c67d1347a4c29777e64dc1e1a742a66c5ecde03aebadf2b0d62183fa307c"},
 ]

 [package.dependencies]
@@ -3093,8 +3279,9 @@ async = ["httpx (>=0.23.0)"]
 aws = ["boto3"]
 azure = ["azure-identity", "azure-storage-blob"]
 gcp = ["google-cloud-storage"]
+importers = ["filelock", "mlflow", "polars", "rich", "tenacity"]
 kubeflow = ["google-cloud-storage", "kubernetes", "minio", "sh"]
-launch = ["PyYAML (>=6.0.0)", "awscli", "azure-containerregistry", "azure-identity", "azure-storage-blob", "boto3", "botocore", "chardet", "google-auth", "google-cloud-aiplatform", "google-cloud-artifact-registry", "google-cloud-compute", "google-cloud-storage", "iso8601", "kubernetes", "kubernetes-asyncio", "nbconvert", "nbformat", "optuna", "pydantic", "typing-extensions"]
+launch = ["PyYAML (>=6.0.0)", "awscli", "azure-containerregistry", "azure-identity", "azure-storage-blob", "boto3", "botocore", "chardet", "google-auth", "google-cloud-aiplatform", "google-cloud-artifact-registry", "google-cloud-compute", "google-cloud-storage", "iso8601", "kubernetes", "kubernetes-asyncio", "nbconvert", "nbformat", "optuna", "pydantic", "tomli", "typing-extensions"]
 media = ["bokeh", "moviepy", "numpy", "pillow", "plotly (>=5.18.0)", "rdkit-pypi", "soundfile"]
 models = ["cloudpickle"]
 perf = ["orjson"]
@@ -3103,13 +3290,13 @@ sweeps = ["sweeps (>=0.2.0)"]

 [[package]]
 name = "zarr"
-version = "2.17.0"
+version = "2.17.1"
 description = "An implementation of chunked, compressed, N-dimensional arrays for Python"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "zarr-2.17.0-py3-none-any.whl", hash = "sha256:d287cb61019c4a0a0f386f76eeaa7f0b1160b1cb90cf96173a4b6cbc135df6e1"},
-    {file = "zarr-2.17.0.tar.gz", hash = "sha256:6390a2b8af31babaab4c963efc45bf1da7f9500c9aafac193f84cf019a7c66b0"},
+    {file = "zarr-2.17.1-py3-none-any.whl", hash = "sha256:e25df2741a6e92645f3890f30f3136d5b57a0f8f831094b024bbcab5f2797bc7"},
+    {file = "zarr-2.17.1.tar.gz", hash = "sha256:564b3aa072122546fe69a0fa21736f466b20fad41754334b62619f088ce46261"},
 ]

 [package.dependencies]
@@ -3140,4 +3327,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "9c3e86956dd11bc8d7823e5e6c5e74a073051b495f71f96179113d99791f7ca0"
+content-hash = "ee86b84a795e6a3e9c2d79f244a87b55589adbe46d549ac38adf48be27c04cf9"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ classifiers=[
 ]
 packages = [{include = "lerobot"}]

+
 [tool.poetry.dependencies]
 python = "^3.10"
 cython = "^3.0.8"
@@ -41,14 +42,15 @@ mpmath = "^1.3.0"
 torch = "^2.2.1"
 tensordict = {git = "https://github.com/pytorch/tensordict"}
 torchrl = {git = "https://github.com/pytorch/rl", rev = "13bef426dcfa5887c6e5034a6e9697993fa92c37"}
-mujoco = "^3.1.2"
+mujoco = "2.3.7"
 mujoco-py = "^2.1.2.14"
 gym = "^0.26.2"
 opencv-python = "^4.9.0.80"
-diffusion-policy = {git = "https://github.com/real-stanford/diffusion_policy"}
 diffusers = "^0.26.3"
 torchvision = "^0.17.1"
 h5py = "^3.10.0"
+dm-control = "1.0.14"
+huggingface-hub = {extras = ["hf-transfer"], version = "^0.21.4"}


 [tool.poetry.group.dev.dependencies]
@@ -56,9 +58,6 @@ pre-commit = "^3.6.2"
 debugpy = "^1.8.1"
 pytest = "^8.1.0"

-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"

 [tool.ruff]
 line-length = 110
@@ -87,5 +86,15 @@ exclude = [
    "venv",
 ]

+
 [tool.ruff.lint]
 select = ["E4", "E7", "E9", "F", "I", "N", "B", "C4", "SIM"]
+
+
+[tool.poetry-dynamic-versioning]
+enable = true
+
+
+[build-system]
+requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
+build-backend = "poetry_dynamic_versioning.backend"
--- a/sbatch.sh
+++ b/sbatch.sh
@@ -17,6 +17,7 @@ apptainer exec --nv \
 ~/apptainer/nvidia_cuda:12.2.2-devel-ubuntu22.04.sif $SHELL

 source ~/.bashrc
-conda activate fowm
+#conda activate fowm
+conda activate lerobot

 srun $CMD
--- a/tests/data/aloha_sim_insertion_human/action.memmap
+++ b/tests/data/aloha_sim_insertion_human/action.memmap
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d789deddb081a9f4b626342391de8f48949d38fb5fdead87b5c0737b46c0877a
+size 2800
--- a/tests/data/aloha_sim_insertion_human/episode.memmap
+++ b/tests/data/aloha_sim_insertion_human/episode.memmap
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
+size 400
--- a/tests/data/aloha_sim_insertion_human/frame_id.memmap
+++ b/tests/data/aloha_sim_insertion_human/frame_id.memmap
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c202d9cfc7858fd49d522047e16948359bbbb2eda2d3825d552e45a78d5f8585
+size 400
--- a/tests/data/aloha_sim_insertion_human/meta.json
+++ b/tests/data/aloha_sim_insertion_human/meta.json
@@ -0,0 +1 @@
+{"action": {"device": "cpu", "shape": [50, 14], "dtype": "torch.float32"}, "episode": {"device": "cpu", "shape": [50], "dtype": "torch.int64"}, "frame_id": {"device": "cpu", "shape": [50], "dtype": "torch.int64"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}
--- a/tests/data/aloha_sim_insertion_human/next/done.memmap
+++ b/tests/data/aloha_sim_insertion_human/next/done.memmap
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc2786e1f9910a9d811400edcddaf7075195f7a16b216dcbefba3bc7c4f2ae51
+size 50
--- a/tests/data/aloha_sim_insertion_human/next/meta.json
+++ b/tests/data/aloha_sim_insertion_human/next/meta.json
@@ -0,0 +1 @@
+{"done": {"device": "cpu", "shape": [50, 1], "dtype": "torch.bool"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}
--- a/tests/data/aloha_sim_insertion_human/next/observation/image/meta.json
+++ b/tests/data/aloha_sim_insertion_human/next/observation/image/meta.json
@@ -0,0 +1 @@
+{"top": {"device": "cpu", "shape": [50, 3, 480, 640], "dtype": "torch.uint8"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}
--- a/tests/data/aloha_sim_insertion_human/next/observation/image/top.memmap
+++ b/tests/data/aloha_sim_insertion_human/next/observation/image/top.memmap
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c632e3cb06be729e5d673e3ecca1d6f6527b0f48cfe3dc03d7eea4f9eb3bbd7
+size 46080000
--- a/tests/data/aloha_sim_insertion_human/next/observation/meta.json
+++ b/tests/data/aloha_sim_insertion_human/next/observation/meta.json
@@ -0,0 +1 @@
+{"state": {"device": "cpu", "shape": [50, 14], "dtype": "torch.float32"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}
--- a/tests/data/aloha_sim_insertion_human/next/observation/state.memmap
+++ b/tests/data/aloha_sim_insertion_human/next/observation/state.memmap
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e231f2e07e1cd030137ea2e938b570b112db2c694c6d21b37ceb8f8559e19088
+size 2800
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Alexander Soare	d374873849	use Path type instead of str	2024-03-15 13:15:34 +00:00
Remi	9c88071bc7	Merge pull request #28 from Cadene/user/rcadene/2024_03_14_hf_dataset Download datasets from hugging face	2024-03-15 13:52:13 +01:00
Cadene	5805a7ffb1	small fix in type + comments	2024-03-15 12:44:52 +00:00
Cadene	41521f7e96	self.root is Path or None + The following packages are already present in the pyproject.toml and will be skipped: - huggingface-hub If you want to update it to the latest compatible version, you can use `poetry update package`. If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`. Nothing to add.	2024-03-15 10:56:46 +00:00
Cadene	b10c9507d4	Small fix	2024-03-15 00:36:55 +00:00
Cadene	a311d38796	Add aloha + improve readme	2024-03-15 00:30:11 +00:00
Cadene	19730b3412	Add pusht on hf dataset (WIP)	2024-03-14 16:59:37 +00:00
Simon Alibert	95e84079ef	Merge pull request #25 from Cadene/user/aliberts/2024_03_13_ci_fix CI env fix	2024-03-14 15:24:56 +01:00
Simon Alibert	8e856f1bf7	Update readme	2024-03-14 15:24:38 +01:00
Simon Alibert	8c2b47752a	Remove cuda env copy	2024-03-14 13:55:35 +01:00
Simon Alibert	f515cb6efd	Add dm-control	2024-03-14 13:42:03 +01:00
Simon Alibert	c3f8d14fd8	CI env fix	2024-03-14 13:29:27 +01:00
Remi	8c56770318	Merge pull request #11 from Cadene/user/rcadene/2024_03_06_aloha_env [WIP] Add Aloha env and ACT policy	2024-03-12 16:02:48 +01:00
Cadene	998dd2b874	info	2024-03-12 14:45:35 +00:00
Cadene	7331df81d2	info	2024-03-12 14:44:23 +00:00
Cadene	2c5d49cad5	Add sudo apt-get install -y libegl1-mesa-dev	2024-03-12 14:30:28 +00:00
Cadene	5881eec376	Add DEVICE constant from LEROBOT_TESTS_DEVICE	2024-03-12 14:14:39 +00:00
Cadene	29c73844b1	Update test.yml	2024-03-12 13:52:53 +00:00
Simon Alibert	f9258898ff	CI fix attempt	2024-03-12 12:24:10 +01:00
Remi Cadene	9d002032d1	Add Aloha env and ACT policy WIP Aloha env tests pass Rendering works (fps look fast tho? TODO action bounding is too wide [-1,1]) Update README Copy past from act repo Remove download.py add a WIP for Simxarm Remove download.py add a WIP for Simxarm Add act yaml (TODO: try train.py) Training can runs (TODO: eval) Add tasks without end_effector that are compatible with dataset, Eval can run (TODO: training and pretrained model) Add AbstractEnv, Refactor AlohaEnv, Add rendering_hook in env, Minor modifications, (TODO: Refactor Pusht and Simxarm) poetry lock fix bug in compute_stats for action normalization fix more bugs in normalization fix training fix import PushtEnv inheriates AbstractEnv, Improve factory Normalization Add _make_env to EnvAbstract Add call_rendering_hooks to pusht env SimxarmEnv inherites from AbstractEnv (NOT TESTED) Add aloha tests artifacts + update pusht stats fix image normalization: before env was in [0,1] but dataset in [0,255], and now both in [0,255] Small fix on simxarm Add next to obs Add top camera to Aloha env (TODO: make it compatible with set of cameras) Add top camera to Aloha env (TODO: make it compatible with set of cameras)	2024-03-12 10:27:48 +00:00
Remi	060bac7672	Merge pull request #23 from Cadene/kashif-patch-1 Update README.md	2024-03-12 10:11:42 +01:00
Kashif Rasul	337208f28d	Update README.md fix some readme typos	2024-03-12 10:05:05 +01:00
Simon Alibert	48e70e044e	Merge pull request #18 from Cadene/user/aliberts/2024_03_11_dynamic_build_versioning Add dynamic build versioning	2024-03-11 19:06:51 +01:00
Simon Alibert	4449c06823	Add comment Co-authored-by: Remi <re.cadene@gmail.com>	2024-03-11 19:06:22 +01:00
Simon Alibert	a94800fc8a	Merge pull request #20 from Cadene/user/aliberts/2024_03_11_license Add license	2024-03-11 14:32:05 +01:00
Simon Alibert	a207b416b7	Typo fix	2024-03-11 14:20:05 +01:00
Simon Alibert	78690d197f	Merge pull request #19 from Cadene/user/aliberts/2024_03_11_wandb_config Configure wandb entity outside config	2024-03-11 14:17:44 +01:00
Simon Alibert	6d6c84b4a3	Remove entity from config Co-authored-by: Remi <re.cadene@gmail.com>	2024-03-11 14:14:17 +01:00
Simon Alibert	772a826bf2	Add FOWM & ALOHA copyright notices	2024-03-11 13:54:20 +01:00
Simon Alibert	2cb8ae5037	Add acknowledgements	2024-03-11 13:49:08 +01:00
Remi	fab2b3240b	Merge pull request #17 from Cadene/user/rcadene/2024_03_11_bugfix_compute_stats Fix bugs with normalization	2024-03-11 13:44:07 +01:00
Cadene	84a1647c01	fix import	2024-03-11 12:41:14 +00:00
Cadene	ccd5dc5a42	fix training	2024-03-11 12:33:15 +00:00
Simon Alibert	c1e9c13ade	Add license	2024-03-11 13:31:52 +01:00
Simon Alibert	00fe4f4f18	Configure wandb entity outside config	2024-03-11 13:09:46 +01:00
Simon Alibert	225eebde40	Add dynamic build versioning	2024-03-11 12:10:46 +01:00
Cadene	816b2e9d63	fix more bugs in normalization	2024-03-11 11:03:51 +00:00
Cadene	a7ef4a6a33	fix bug in compute_stats for action normalization	2024-03-11 09:47:54 +00:00
Remi	d4ea4f0ad1	Merge pull request #16 from Cadene/user/aliberts/2024_03_09_integrate_diffusion_policy Integrate diffusion policy	2024-03-10 17:02:16 +01:00
Simon Alibert	f54ee7cda0	Fix paths	2024-03-10 16:51:50 +01:00
Simon Alibert	134009f337	Remove init files	2024-03-10 16:38:49 +01:00
Simon Alibert	7982425670	Remove diffusion-policy dependency	2024-03-10 16:36:30 +01:00
Simon Alibert	6c867d78ef	Integrate pusht env from diffusion	2024-03-10 16:33:03 +01:00
Simon Alibert	302b78962c	Integrate diffusion policy	2024-03-10 15:31:17 +01:00
Simon Alibert	59397fb44a	Move tdmpc files	2024-03-09 18:44:36 +01:00
Simon Alibert	1cc621ec36	Merge pull request #15 from Cadene/user/aliberts/2024_03_09_add_ci_eval_test Add end-to-end pusht eval test	2024-03-09 16:54:50 +01:00
Simon Alibert	471ebfef62	Remove cleanup + reduce eval steps	2024-03-09 16:50:17 +01:00
Simon Alibert	30753d879c	Fix	2024-03-09 16:41:31 +01:00
Simon Alibert	c6fb40fb29	Add end-to-end pusht eval test	2024-03-09 16:37:34 +01:00
Simon Alibert	fa7a947acc	Merge pull request #12 from Cadene/user/aliberts/2024_03_08_test_data Add pusht test artifact	2024-03-09 16:00:39 +01:00
Simon Alibert	450e32e4b5	update readme	2024-03-09 15:57:29 +01:00
Simon Alibert	0da85b2cef	Removed debugging lines	2024-03-09 15:51:42 +01:00
Simon Alibert	f2c7ab5b3b	test ci	2024-03-09 15:41:23 +01:00
Cadene	cde866dac0	reupload	2024-03-09 15:36:20 +01:00
Cadene	a54a0feb63	remove mock	2024-03-09 15:36:20 +01:00
Cadene	f440a681ad	Add mock_dataset.py	2024-03-09 15:36:20 +01:00
Cadene	35bd577deb	Add mock_dataset.py	2024-03-09 15:36:20 +01:00
Simon Alibert	327f60e4be	Update readme	2024-03-09 15:36:20 +01:00
Simon Alibert	74ad9d5154	Change pusht artifacts path	2024-03-09 15:36:20 +01:00
Simon Alibert	89eaab140b	Add pusht test artifact	2024-03-09 15:36:20 +01:00
Simon Alibert	7dbdbb051c	Add git lfs in CI checkout	2024-03-09 15:35:20 +01:00
				`@@ -0,0 +1 @@`
				`*.memmap filter=lfs diff=lfs merge=lfs -text`
				`@@ -0,0 +1 @@`
				`from lerobot.__version__ import __version__ # noqa: F401`
				`@@ -0,0 +1 @@`
				`{"action": {"device": "cpu", "shape": [50, 14], "dtype": "torch.float32"}, "episode": {"device": "cpu", "shape": [50], "dtype": "torch.int64"}, "frame_id": {"device": "cpu", "shape": [50], "dtype": "torch.int64"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}`
				`@@ -0,0 +1 @@`
				`{"done": {"device": "cpu", "shape": [50, 1], "dtype": "torch.bool"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}`
				`@@ -0,0 +1 @@`
				`{"top": {"device": "cpu", "shape": [50, 3, 480, 640], "dtype": "torch.uint8"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}`
				`@@ -0,0 +1 @@`
				`{"state": {"device": "cpu", "shape": [50, 14], "dtype": "torch.float32"}, "shape": [50], "device": "cpu", "_type": "<class 'tensordict._td.TensorDict'>"}`