Add AbstractEnv, Refactor AlohaEnv, Add rendering_hook in env, Minor modifications, (TODO: Refactor Pusht and Simxarm)

Add tasks without end_effector that are compatible with dataset, Eval can run (TODO: training and pretrained model)
Training can run (TODO: eval)
2024-03-10 22:00:48 +00:00 · 2024-03-10 10:52:12 +00:00 · 2024-03-09 16:52:08 +00:00 · 2024-03-08 18:08:28 +00:00 · 2024-03-08 18:07:49 +00:00 · 2024-03-08 18:07:33 +00:00
212 changed files with 1125 additions and 11803 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +0,0 @@
-*.memmap filter=lfs diff=lfs merge=lfs -text
-*.stl filter=lfs diff=lfs merge=lfs -text
--- a/.github/poetry/cpu/poetry.lock
+++ b/.github/poetry/cpu/poetry.lock
--- a/.github/poetry/cpu/pyproject.toml
+++ b/.github/poetry/cpu/pyproject.toml
@@ -1,109 +0,0 @@
-[tool.poetry]
-name = "lerobot"
-version = "0.1.0"
-description = "Le robot is learning"
-authors = [
-    "Rémi Cadène <re.cadene@gmail.com>",
-    "Simon Alibert <alibert.sim@gmail.com>",
-]
-repository = "https://github.com/Cadene/lerobot"
-readme = "README.md"
-license = "MIT"
-classifiers=[
-    "Development Status :: 3 - Alpha",
-    "Intended Audience :: Developers",
-    "Topic :: Software Development :: Build Tools",
-    "License :: OSI Approved :: MIT License",
-    "Programming Language :: Python :: 3.10",
-]
-packages = [{include = "lerobot"}]
-
-
-[tool.poetry.dependencies]
-python = "^3.10"
-termcolor = "^2.4.0"
-omegaconf = "^2.3.0"
-dm-env = "^1.6"
-pandas = "^2.2.1"
-wandb = "^0.16.3"
-moviepy = "^1.0.3"
-imageio = {extras = ["pyav"], version = "^2.34.0"}
-gdown = "^5.1.0"
-hydra-core = "^1.3.2"
-einops = "^0.7.0"
-pygame = "^2.5.2"
-pymunk = "^6.6.0"
-zarr = "^2.17.0"
-shapely = "^2.0.3"
-scikit-image = "^0.22.0"
-numba = "^0.59.0"
-mpmath = "^1.3.0"
-torch = {version = "^2.2.1", source = "torch-cpu"}
-tensordict = {git = "https://github.com/pytorch/tensordict"}
-torchrl = {git = "https://github.com/pytorch/rl", rev = "13bef426dcfa5887c6e5034a6e9697993fa92c37"}
-mujoco = "^2.3.7"
-opencv-python = "^4.9.0.80"
-diffusers = "^0.26.3"
-torchvision = {version = "^0.17.1", source = "torch-cpu"}
-h5py = "^3.10.0"
-dm = "^1.3"
-dm-control = "1.0.14"
-robomimic = "0.2.0"
-huggingface-hub = "^0.21.4"
-gymnasium-robotics = "^1.2.4"
-gymnasium = "^0.29.1"
-cmake = "^3.29.0.1"
-
-
-[tool.poetry.group.dev.dependencies]
-pre-commit = "^3.6.2"
-debugpy = "^1.8.1"
-pytest = "^8.1.0"
-pytest-cov = "^5.0.0"
-
-
-[[tool.poetry.source]]
-name = "torch-cpu"
-url = "https://download.pytorch.org/whl/cpu"
-priority = "supplemental"
-
-
-[tool.ruff]
-line-length = 110
-target-version = "py310"
-exclude = [
-    ".bzr",
-    ".direnv",
-    ".eggs",
-    ".git",
-    ".git-rewrite",
-    ".hg",
-    ".mypy_cache",
-    ".nox",
-    ".pants.d",
-    ".pytype",
-    ".ruff_cache",
-    ".svn",
-    ".tox",
-    ".venv",
-    "__pypackages__",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "node_modules",
-    "venv",
-]
-
-
-[tool.ruff.lint]
-select = ["E4", "E7", "E9", "F", "I", "N", "B", "C4", "SIM"]
-
-
-[tool.poetry-dynamic-versioning]
-enable = true
-
-
-[build-system]
-requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
-build-backend = "poetry_dynamic_versioning.backend"
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,4 +1,4 @@
-name: Tests
+name: Test

 on:
  pull_request:
@@ -10,30 +10,24 @@ on:
      - main

 jobs:
-  tests:
+  test:
    if: |
      ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'CI') }} ||
      ${{ github.event_name == 'push' }}
    runs-on: ubuntu-latest
    env:
-      POETRY_VERSION: 1.8.2
-      DATA_DIR: tests/data
-      MUJOCO_GL: egl
+      POETRY_VERSION: 1.8.1
    steps:
      #----------------------------------------------
      #       check-out repo and set-up python
      #----------------------------------------------
      - name: Check out repository
        uses: actions/checkout@v4
-        with:
-          lfs: true
-
      - name: Set up python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
-
      #----------------------------------------------
      #         install & configure poetry
      #----------------------------------------------
@@ -41,9 +35,8 @@ jobs:
        id: restore-poetry-cache
        uses: actions/cache/restore@v3
        with:
-          path: ~/.local
-          key: poetry-${{ env.POETRY_VERSION }}
-
+          path: ~/.local  # the path depends on the OS
+          key: poetry-${{ env.POETRY_VERSION }}  # increment to reset cache
      - name: Install Poetry
        if: steps.restore-poetry-cache.outputs.cache-hit != 'true'
        uses: snok/install-poetry@v1
@@ -51,7 +44,6 @@ jobs:
          version: ${{ env.POETRY_VERSION }}
          virtualenvs-create: true
          installer-parallel: true
-
      - name: Save cached Poetry installation
        if: |
          steps.restore-poetry-cache.outputs.cache-hit != 'true' &&
@@ -59,36 +51,25 @@ jobs:
        id: save-poetry-cache
        uses: actions/cache/save@v3
        with:
-          path: ~/.local
-          key: poetry-${{ env.POETRY_VERSION }}
-
+          path: ~/.local  # the path depends on the OS
+          key: poetry-${{ env.POETRY_VERSION }}  # increment to reset cache
      - name: Configure Poetry
        run: poetry config virtualenvs.in-project true
-
      #----------------------------------------------
      #           install dependencies
      #----------------------------------------------
-      # TODO(aliberts): move to gpu runners
-      - name: Select cpu dependencies  # HACK
-        run: cp -t . .github/poetry/cpu/pyproject.toml .github/poetry/cpu/poetry.lock
-
      - name: Load cached venv
        id: restore-dependencies-cache
        uses: actions/cache/restore@v3
        with:
          path: .venv
          key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
-
      - name: Install dependencies
        if: steps.restore-dependencies-cache.outputs.cache-hit != 'true'
-        env:
-          TMPDIR: ~/tmp
-          TEMP: ~/tmp
-          TMP: ~/tmp
        run: |
-          mkdir ~/tmp
          poetry install --no-interaction --no-root
-
+          git clone https://github.com/real-stanford/diffusion_policy
+          cp -r diffusion_policy/diffusion_policy $(poetry env info -p)/lib/python3.10/site-packages/
      - name: Save cached venv
        if: |
            steps.restore-dependencies-cache.outputs.cache-hit != 'true' &&
@@ -98,137 +79,40 @@ jobs:
        with:
          path: .venv
          key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
-
-      - name: Install libegl1-mesa-dev (to use MUJOCO_GL=egl)
-        run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
-
      #----------------------------------------------
      #             install project
      #----------------------------------------------
      - name: Install project
        run: poetry install --no-interaction
-
      #----------------------------------------------
-      #            run tests & coverage
+      #               run tests
      #----------------------------------------------
      - name: Run tests
-        env:
-          LEROBOT_TESTS_DEVICE: cpu
        run: |
          source .venv/bin/activate
-          pytest --cov=./lerobot --cov-report=xml tests
-
-      #   TODO(aliberts): Link with HF Codecov account
-      # - name: Upload coverage reports to Codecov with GitHub Action
-      #   uses: codecov/codecov-action@v4
-      #   with:
-      #     files: ./coverage.xml
-      #     verbose: true
-
-      #----------------------------------------------
-      #            run end-to-end tests
-      #----------------------------------------------
-      - name: Test train ACT on ALOHA end-to-end
+          pytest tests
+      - name: Test train pusht end-to-end
        run: |
          source .venv/bin/activate
          python lerobot/scripts/train.py \
-            policy=act \
-            env=aloha \
-            wandb.enable=False \
-            offline_steps=2 \
-            online_steps=0 \
-            device=cpu \
-            save_model=true \
-            save_freq=2 \
-            horizon=20 \
-            policy.batch_size=2 \
-            hydra.run.dir=tests/outputs/act/
-
-      - name: Test eval ACT on ALOHA end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config tests/outputs/act/.hydra/config.yaml \
-            eval_episodes=1 \
-            env.episode_length=8 \
-            device=cpu \
-            policy.pretrained_model_path=tests/outputs/act/models/2.pt
-
-      # TODO(aliberts): This takes ~2mn to run, needs to be improved
-      # - name: Test eval ACT on ALOHA end-to-end (policy is None)
-      #   run: |
-      #     source .venv/bin/activate
-      #     python lerobot/scripts/eval.py \
-      #       --config lerobot/configs/default.yaml \
-      #       policy=act \
-      #       env=aloha \
-      #       eval_episodes=1 \
-      #       device=cpu
-
-      - name: Test train Diffusion on PushT end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/train.py \
-            policy=diffusion \
+            hydra.job.name=pusht \
            env=pusht \
            wandb.enable=False \
-            offline_steps=2 \
-            online_steps=0 \
-            device=cpu \
-            save_model=true \
-            save_freq=2 \
-            hydra.run.dir=tests/outputs/diffusion/
-
-      - name: Test eval Diffusion on PushT end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config tests/outputs/diffusion/.hydra/config.yaml \
-            eval_episodes=1 \
-            env.episode_length=8 \
-            device=cpu \
-            policy.pretrained_model_path=tests/outputs/diffusion/models/2.pt
-
-      - name: Test eval Diffusion on PushT end-to-end (policy is None)
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config lerobot/configs/default.yaml \
-            policy=diffusion  \
-            env=pusht \
-            eval_episodes=1 \
-            device=cpu
-
-      - name: Test train TDMPC on Simxarm end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/train.py \
-            policy=tdmpc \
-            env=simxarm \
-            wandb.enable=False \
            offline_steps=1 \
-            online_steps=1 \
-            device=cpu \
-            save_model=true \
-            save_freq=2 \
-            hydra.run.dir=tests/outputs/tdmpc/
-
-      - name: Test eval TDMPC on Simxarm end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config tests/outputs/tdmpc/.hydra/config.yaml \
-            eval_episodes=1 \
-            env.episode_length=8 \
-            device=cpu \
-            policy.pretrained_model_path=tests/outputs/tdmpc/models/2.pt
-
-      - name: Test eval TDPMC on Simxarm end-to-end (policy is None)
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config lerobot/configs/default.yaml \
-            policy=tdmpc  \
-            env=simxarm \
-            eval_episodes=1 \
+            online_steps=0 \
            device=cpu
+      # TODO(rcadene, aliberts): Add end-to-end test of eval checkpoint post training
+      # - name: Test eval pusht end-to-end
+      #   run: |
+      #     source .venv/bin/activate
+      #     python lerobot/scripts/eval.py \
+      #       hydra.job.name=pusht \
+      #       env=pusht \
+      #       wandb.enable=False \
+      #       eval_episodes=1 \
+      #       device=cpu
+      #----------------------------------------------
+      #              cleanup
+      #----------------------------------------------
+      - name: Cleanup
+        run: rm -rf diffusion_policy data
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Custom
+diffusion_policy
+
 # Logging
 logs
 tmp
@@ -51,7 +54,6 @@ pip-log.txt
 pip-delete-this-directory.txt

 # Unit test / coverage reports
-!tests/data
 htmlcov/
 .tox/
 .nox/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-exclude: ^(data/|tests/)
+exclude: ^(data/|tests/|diffusion_policy/)
 default_language_version:
    python: python3.10
 repos:
@@ -14,11 +14,11 @@ repos:
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/asottile/pyupgrade
-    rev: v3.15.2
+    rev: v3.15.1
    hooks:
    -   id: pyupgrade
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.4
+    rev: v0.2.2
    hooks:
      - id: ruff
        args: [--fix]
--- a/507
+++ b/507
@@ -1,507 +0,0 @@
-Copyright 2024 The Hugging Face team. All rights reserved.
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
-
-## Some of lerobot's code is derived from Diffusion Policy, which is subject to the following copyright notice:
-
-MIT License
-
-Copyright (c) 2023 Columbia Artificial Intelligence and Robotics Lab
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-
-## Some of lerobot's code is derived from FOWM, which is subject to the following copyright notice:
-
-MIT License
-
-Copyright (c) 2023 Yunhai Feng
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-
-## Some of lerobot's code is derived from simxarm, which is subject to the following copyright notice:
-
-MIT License
-
-Copyright (c) 2023 Nicklas Hansen & Yanjie Ze
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-
-## Some of lerobot's code is derived from ALOHA, which is subject to the following copyright notice:
-
-MIT License
-
-Copyright (c) 2023 Tony Z. Zhao
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-## Some of lerobot's code is derived from DETR, which is subject to the following copyright notice:
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2020 - present, Facebook, Inc
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -1,360 +1,74 @@
-<p align="center">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="media/lerobot-logo-thumbnail.png">
-    <source media="(prefers-color-scheme: light)" srcset="media/lerobot-logo-thumbnail.png">
-    <img alt="LeRobot, Hugging Face Robotics Library" src="media/lerobot-logo-thumbnail.png" style="max-width: 100%;">
-  </picture>
-  <br/>
-  <br/>
-</p>
-
-<div align="center">
-
-[![Tests](https://github.com/huggingface/lerobot/actions/workflows/test.yml/badge.svg?branch=main)](https://github.com/huggingface/lerobot/actions/workflows/test.yml?query=branch%3Amain)
-[![Coverage](https://codecov.io/gh/huggingface/lerobot/branch/main/graph/badge.svg?token=TODO)](https://codecov.io/gh/huggingface/lerobot)
-[![Python versions](https://img.shields.io/pypi/pyversions/lerobot)](https://www.python.org/downloads/)
-[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/huggingface/lerobot/blob/main/LICENSE)
-[![Status](https://img.shields.io/pypi/status/lerobot)](https://pypi.org/project/lerobot/)
-[![Version](https://img.shields.io/pypi/v/lerobot)](https://pypi.org/project/lerobot/)
-[![Examples](https://img.shields.io/badge/Examples-green.svg)](https://github.com/huggingface/lerobot/tree/main/examples)
-[![Discord](https://dcbadge.vercel.app/api/server/C5P34WJ68S?style=flat)](https://discord.gg/s3KuuzsPFb)
-
-</div>
-
-<h3 align="center">
-    <p>State-of-the-art Machine Learning for real-world robotics</p>
-</h3>
-
---
-
-
-🤗 LeRobot aims to provide models, datasets, and tools for real-world robotics in PyTorch. The goal is to lower the barrier for entry to robotics so that everyone can contribute and benefit from sharing datasets and pretrained models.
-
-🤗 LeRobot contains state-of-the-art approaches that have been shown to transfer to the real-world with a focus on imitation learning and reinforcement learning.
-
-🤗 LeRobot already provides a set of pretrained models, datasets with human collected demonstrations, and simulated environments so that everyone can get started. In the coming weeks, the plan is to add more and more support for real-world robotics on the most affordable and capable robots out there.
-
-🤗 LeRobot hosts pretrained models and datasets on this HuggingFace community page: [huggingface.co/lerobot](https://huggingface.co/lerobot)
-
-#### Examples of pretrained models and environments
-
-<table>
-  <tr>
-    <td><img src="http://remicadene.com/assets/gif/aloha_act.gif" width="100%" alt="ACT policy on ALOHA env"/></td>
-    <td><img src="http://remicadene.com/assets/gif/simxarm_tdmpc.gif" width="100%" alt="TDMPC policy on SimXArm env"/></td>
-    <td><img src="http://remicadene.com/assets/gif/pusht_diffusion.gif" width="100%" alt="Diffusion policy on PushT env"/></td>
-  </tr>
-  <tr>
-    <td align="center">ACT policy on ALOHA env</td>
-    <td align="center">TDMPC policy on SimXArm env</td>
-    <td align="center">Diffusion policy on PushT env</td>
-  </tr>
-</table>
-
-### Acknowledgment
-
- ACT policy and ALOHA environment are adapted from [ALOHA](https://tonyzhaozh.github.io/aloha/)
- Diffusion policy and Pusht environment are adapted from [Diffusion Policy](https://diffusion-policy.cs.columbia.edu/)
- TDMPC policy and Simxarm environment are adapted from [FOWM](https://www.yunhaifeng.com/FOWM/)
- Abstractions and utilities for Reinforcement Learning come from [TorchRL](https://github.com/pytorch/rl)
+# LeRobot

 ## Installation

-Download our source code:
-```bash
-git clone https://github.com/huggingface/lerobot.git
-cd lerobot
+Create a virtual environment with Python 3.10, e.g. using `conda`:
 ```
-
-Create a virtual environment with Python 3.10 and activate it, e.g. with [`miniconda`](https://docs.anaconda.com/free/miniconda/index.html):
-```bash
 conda create -y -n lerobot python=3.10
 conda activate lerobot
 ```

-Then, install 🤗 LeRobot:
-```bash
-python -m pip install .
+[Install `poetry`](https://python-poetry.org/docs/#installation) (if you don't have it already)
+```
+curl -sSL https://install.python-poetry.org | python -
 ```

-To use [Weights and Biases](https://docs.wandb.ai/quickstart) for experiments tracking, log in with
-```bash
-wandb login
+Install dependencies
 ```
-
-## Walkthrough
-
-```
-.
-├── lerobot
-|   ├── configs          # contains hydra yaml files with all options that you can override in the command line
-|   |   ├── default.yaml   # selected by default, it loads pusht environment and diffusion policy
-|   |   ├── env            # various sim environments and their datasets: aloha.yaml, pusht.yaml, simxarm.yaml
-|   |   └── policy         # various policies: act.yaml, diffusion.yaml, tdmpc.yaml
-|   ├── common           # contains classes and utilities
-|   |   ├── datasets       # various datasets of human demonstrations: aloha, pusht, simxarm
-|   |   ├── envs           # various sim environments: aloha, pusht, simxarm
-|   |   └── policies       # various policies: act, diffusion, tdmpc
-|   └── scripts                  # contains functions to execute via command line
-|       ├── visualize_dataset.py  # load a dataset and render its demonstrations
-|       ├── eval.py               # load policy and evaluate it on an environment
-|       └── train.py              # train a policy via imitation learning and/or reinforcement learning
-├── outputs               # contains results of scripts execution: logs, videos, model checkpoints
-├── .github
-|   └── workflows
-|       └── test.yml      # defines install settings for continuous integration and specifies end-to-end tests
-└── tests                 # contains pytest utilities for continuous integration
-
-```
-
-### Visualize datasets
-
-You can import our dataset class, download the data from the HuggingFace hub and use our rendering utilities:
-```python
-""" Copy pasted from `examples/1_visualize_dataset.py` """
-import lerobot
-from lerobot.common.datasets.aloha import AlohaDataset
-from torchrl.data.replay_buffers import SamplerWithoutReplacement
-from lerobot.scripts.visualize_dataset import render_dataset
-
-print(lerobot.available_datasets)
-# >>> ['aloha_sim_insertion_human', 'aloha_sim_insertion_scripted', 'aloha_sim_transfer_cube_human', 'aloha_sim_transfer_cube_scripted', 'pusht', 'xarm_lift_medium']
-
-# we use this sampler to sample 1 frame after the other
-sampler = SamplerWithoutReplacement(shuffle=False)
-
-dataset = AlohaDataset("aloha_sim_transfer_cube_human", sampler=sampler)
-
-video_paths = render_dataset(
-    dataset,
-    out_dir="outputs/visualize_dataset/example",
-    max_num_samples=300,
-    fps=50,
-)
-print(video_paths)
-# >>> ['outputs/visualize_dataset/example/episode_0.mp4']
-```
-
-Or you can achieve the same result by executing our script from the command line:
-```bash
-python lerobot/scripts/visualize_dataset.py \
-env=aloha \
-task=sim_sim_transfer_cube_human \
-hydra.run.dir=outputs/visualize_dataset/example
-# >>> ['outputs/visualize_dataset/example/episode_0.mp4']
-```
-
-### Evaluate a pretrained policy
-
-Check out [example 2](./examples/2_evaluate_pretrained_policy.py) to see how you can load a pretrained policy from HuggingFace hub, load up the corresponding environment and model, and run an evaluation.
-
-Or you can achieve the same result by executing our script from the command line:
-```bash
-python lerobot/scripts/eval.py \
--hub-id lerobot/diffusion_policy_pusht_image \
-eval_episodes=10 \
-hydra.run.dir=outputs/eval/example_hub
-```
-
-After training your own policy, you can also re-evaluate the checkpoints with:
-```bash
-python lerobot/scripts/eval.py \
--config PATH/TO/FOLDER/config.yaml \
-policy.pretrained_model_path=PATH/TO/FOLDER/weights.pth \
-eval_episodes=10 \
-hydra.run.dir=outputs/eval/example_dir
-```
-
-See `python lerobot/scripts/eval.py --help` for more instructions.
-
-### Train your own policy
-
-You can import our dataset, environment, policy classes, and use our training utilities (if some data is missing, it will be automatically downloaded from HuggingFace hub): check out [example 3](./examples/3_train_policy.py). After you run this, you may want to revisit [example 2](./examples/2_evaluate_pretrained_policy.py) to evaluate your training output!
-
-In general, you can use our training script to easily train any policy on any environment:
-```bash
-python lerobot/scripts/train.py \
-env=aloha \
-task=sim_insertion \
-dataset_id=aloha_sim_insertion_scripted \
-policy=act \
-hydra.run.dir=outputs/train/aloha_act
-```
-
-## Contribute
-
-Feel free to open issues and PRs, and to coordinate your efforts with the community on our [Discord Channel](https://discord.gg/VjFz58wn3R). For specific inquiries, reach out to [Remi Cadene](remi.cadene@huggingface.co).
-
-### TODO
-
-If you are not sure how to contribute or want to know the next features we working on, look on this project page: [LeRobot TODO](https://github.com/orgs/huggingface/projects/46)
-
-### Follow our style
-
-```bash
-# install if needed
-pre-commit install
-# apply style and linter checks before git commit
-pre-commit
-```
-
-### Add dependencies
-
-Instead of using `pip` directly, we use `poetry` for development purposes to easily track our dependencies.
-If you don't have it already, follow the [instructions](https://python-poetry.org/docs/#installation) to install it.
-
-Install the project with:
-```bash
 poetry install
 ```

-Then, the equivalent of `pip install some-package`, would just be:
-```bash
-poetry add some-package
+If you encounter a disk space error, try to change your tmp dir to a location where you have enough disk space, e.g.
+```
+mkdir -p ~/tmp
+export TMPDIR="$HOME/tmp"
 ```

-**NOTE:** Currently, to ensure the CI works properly, any new package must also be added in the CPU-only environment dedicated to the CI. To do this, you should create a separate environment and add the new package there as well. For example:
-```bash
-# Add the new package to your main poetry env
-poetry add some-package
-# Add the same package to the CPU-only env dedicated to CI
-conda create -y -n lerobot-ci python=3.10
-conda activate lerobot-ci
-cd .github/poetry/cpu
-poetry add some-package
+Install `diffusion_policy` #HACK
+```
+# from this directory
+git clone https://github.com/real-stanford/diffusion_policy
+cp -r diffusion_policy/diffusion_policy $(poetry env info -p)/lib/python3.10/site-packages/
 ```

-### Run tests locally
-
-Install [git lfs](https://git-lfs.com/) to retrieve test artifacts (if you don't have it already).
-
-On Mac:
-```bash
-brew install git-lfs
-git lfs install
-```
-
-On Ubuntu:
-```bash
-sudo apt-get install git-lfs
-git lfs install
-```
-
-Pull artifacts if they're not in [tests/data](tests/data)
-```bash
-git lfs pull
-```
-
-When adding a new dataset, mock it with
-```bash
-python tests/scripts/mock_dataset.py --in-data-dir data/$DATASET --out-data-dir tests/data/$DATASET
-```
-
-Run tests
-```bash
-DATA_DIR="tests/data" pytest -sx tests
-```
-
-### Add a new dataset
-
-To add a dataset to the hub, first login and use a token generated from [huggingface settings](https://huggingface.co/settings/tokens) with write access:
-```bash
-huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
-```
-
-Then you can upload it to the hub with:
-```bash
-HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload $HF_USER/$DATASET data/$DATASET \
--repo-type dataset  \
--revision v1.0
-```
-
-You will need to set the corresponding version as a default argument in your dataset class:
-```python
-  version: str | None = "v1.0",
-```
-See: [`lerobot/common/datasets/pusht.py`](https://github.com/Cadene/lerobot/blob/main/lerobot/common/datasets/pusht.py)
-
-For instance, for [lerobot/pusht](https://huggingface.co/datasets/lerobot/pusht), we used:
-```bash
-HF_USER=lerobot
-DATASET=pusht
-```
-
-If you want to improve an existing dataset, you can download it locally with:
-```bash
-mkdir -p data/$DATASET
-HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download ${HF_USER}/$DATASET \
--repo-type dataset \
--local-dir data/$DATASET \
--local-dir-use-symlinks=False \
--revision v1.0
-```
-
-Iterate on your code and dataset with:
-```bash
-DATA_DIR=data python train.py
-```
-
-Upload a new version (v2.0 or v1.1 if the changes are respectively more or less significant):
-```bash
-HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload $HF_USER/$DATASET data/$DATASET \
--repo-type dataset \
--revision v1.1 \
--delete "*"
-```
-
-Then you will need to set the corresponding version as a default argument in your dataset class:
-```python
-  version: str | None = "v1.1",
-```
-See: [`lerobot/common/datasets/pusht.py`](https://github.com/Cadene/lerobot/blob/main/lerobot/common/datasets/pusht.py)
+## Usage


-Finally, you might want to mock the dataset if you need to update the unit tests as well:
-```bash
-python tests/scripts/mock_dataset.py --in-data-dir data/$DATASET --out-data-dir tests/data/$DATASET
-```
-
-### Add a pretrained policy
-
-Once you have trained a policy you may upload it to the HuggingFace hub.
-
-Firstly, make sure you have a model repository set up on the hub. The hub ID looks like HF_USER/REPO_NAME.
-
-Secondly, assuming you have trained a policy, you need:
-
- `config.yaml` which you can get from the `.hydra` directory of your training output folder.
- `model.pt` which should be one of the saved models in the `models` directory of your training output folder (they won't be named `model.pt` but you will need to choose one).
- `stats.pth` which should point to the same file in the dataset directory (found in `data/{dataset_name}`).
-
-To upload these to the hub, prepare a folder with the following structure (you can use symlinks rather than copying):
+### Train

 ```
-to_upload
-    ├── config.yaml
-    ├── model.pt
-    └── stats.pth
+python lerobot/scripts/train.py \
+hydra.job.name=pusht \
+env=pusht
 ```

-With the folder prepared, run the following with a desired revision ID.
+### Visualize offline buffer

-```bash
-huggingface-cli upload $HUB_ID to_upload --revision $REVISION_ID
+```
+python lerobot/scripts/visualize_dataset.py \
+hydra.run.dir=tmp/$(date +"%Y_%m_%d") \
+env=pusht
 ```

-If you want this to be the default revision also run the following (don't worry, it won't upload the files again; it will just adjust the file pointers):
+### Visualize online buffer / Eval

-```bash
-huggingface-cli upload $HUB_ID to_upload
+```
+python lerobot/scripts/eval.py \
+hydra.run.dir=tmp/$(date +"%Y_%m_%d") \
+env=pusht
 ```

-See `eval.py` for an example of how a user may use your policy.
+
+## TODO
+
+If you don't know how to contribute or want to know which features we are working on next, take a look at this project page: [LeRobot TODO](https://github.com/users/Cadene/projects/1)
+
+Ask [Remi Cadene](mailto:re.cadene@gmail.com) for access if needed.


-### Improve your code with profiling
+## Profile

-An example of a code snippet to profile the evaluation of a policy:
+**Example**
 ```python
 from torch.profiler import profile, record_function, ProfilerActivity

@@ -373,12 +87,25 @@ with profile(
    with record_function("eval_policy"):
        for i in range(num_episodes):
            prof.step()
-            # insert code to profile, potentially whole body of eval_policy function
 ```

 ```bash
 python lerobot/scripts/eval.py \
--config outputs/pusht/.hydra/config.yaml \
-pretrained_model_path=outputs/pusht/model.pt \
+pretrained_model_path=/home/rcadene/code/fowm/logs/xarm_lift/all/default/2/models/final.pt \
 eval_episodes=7
 ```
+
+## Contribute
+
+**Style**
+```
+# install if needed
+pre-commit install
+# apply style and linter checks before git commit
+pre-commit run -a
+```
+
+**Tests**
+```
+pytest -sx tests
+```
--- a/examples/1_visualize_dataset.py
+++ b/examples/1_visualize_dataset.py
@@ -1,24 +0,0 @@
-import os
-
-from torchrl.data.replay_buffers import SamplerWithoutReplacement
-
-import lerobot
-from lerobot.common.datasets.aloha import AlohaDataset
-from lerobot.scripts.visualize_dataset import render_dataset
-
-print(lerobot.available_datasets)
-# >>> ['aloha_sim_insertion_human', 'aloha_sim_insertion_scripted', 'aloha_sim_transfer_cube_human', 'aloha_sim_transfer_cube_scripted', 'pusht', 'xarm_lift_medium']
-
-# we use this sampler to sample 1 frame after the other
-sampler = SamplerWithoutReplacement(shuffle=False)
-
-dataset = AlohaDataset("aloha_sim_transfer_cube_human", sampler=sampler, root=os.environ.get("DATA_DIR"))
-
-video_paths = render_dataset(
-    dataset,
-    out_dir="outputs/visualize_dataset/example",
-    max_num_samples=300,
-    fps=50,
-)
-print(video_paths)
-# ['outputs/visualize_dataset/example/episode_0.mp4']
--- a/examples/2_evaluate_pretrained_policy.py
+++ b/examples/2_evaluate_pretrained_policy.py
@@ -1,39 +0,0 @@
-"""
-This scripts demonstrates how to evaluate a pretrained policy from the HuggingFace Hub or from your local
-training outputs directory. In the latter case, you might want to run examples/3_train_policy.py first.
-"""
-
-from pathlib import Path
-
-from huggingface_hub import snapshot_download
-
-from lerobot.common.utils import init_hydra_config
-from lerobot.scripts.eval import eval
-
-# Get a pretrained policy from the hub.
-hub_id = "lerobot/diffusion_policy_pusht_image"
-folder = Path(snapshot_download(hub_id))
-# OR uncomment the following to evaluate a policy from the local outputs/train folder.
-# folder = Path("outputs/train/example_pusht_diffusion")
-
-config_path = folder / "config.yaml"
-weights_path = folder / "model.pt"
-stats_path = folder / "stats.pth"  # normalization stats
-
-# Override some config parameters to do with evaluation.
-overrides = [
-    f"policy.pretrained_model_path={weights_path}",
-    "eval_episodes=10",
-    "rollout_batch_size=10",
-    "device=cuda",
-]
-
-# Create a Hydra config.
-cfg = init_hydra_config(config_path, overrides)
-
-# Evaluate the policy and save the outputs including metrics and videos.
-eval(
-    cfg,
-    out_dir=f"outputs/eval/example_{cfg.env.name}_{cfg.policy.name}",
-    stats_path=stats_path,
-)
--- a/examples/3_train_policy.py
+++ b/examples/3_train_policy.py
@@ -1,55 +0,0 @@
-"""This scripts demonstrates how to train Diffusion Policy on the PushT environment.
-
-Once you have trained a model with this script, you can try to evaluate it on
-examples/2_evaluate_pretrained_policy.py
-"""
-
-import os
-from pathlib import Path
-
-import torch
-from omegaconf import OmegaConf
-from tqdm import trange
-
-from lerobot.common.datasets.factory import make_offline_buffer
-from lerobot.common.policies.diffusion.policy import DiffusionPolicy
-from lerobot.common.utils import init_hydra_config
-
-output_directory = Path("outputs/train/example_pusht_diffusion")
-os.makedirs(output_directory, exist_ok=True)
-
-overrides = [
-    "env=pusht",
-    "policy=diffusion",
-    # Adjust as you prefer. 5000 steps are needed to get something worth evaluating.
-    "offline_steps=5000",
-    "log_freq=250",
-    "device=cuda",
-]
-
-cfg = init_hydra_config("lerobot/configs/default.yaml", overrides)
-
-policy = DiffusionPolicy(
-    cfg=cfg.policy,
-    cfg_device=cfg.device,
-    cfg_noise_scheduler=cfg.noise_scheduler,
-    cfg_rgb_model=cfg.rgb_model,
-    cfg_obs_encoder=cfg.obs_encoder,
-    cfg_optimizer=cfg.optimizer,
-    cfg_ema=cfg.ema,
-    n_action_steps=cfg.n_action_steps + cfg.n_latency_steps,
-    **cfg.policy,
-)
-policy.train()
-
-offline_buffer = make_offline_buffer(cfg)
-
-for offline_step in trange(cfg.offline_steps):
-    train_info = policy.update(offline_buffer, offline_step)
-    if offline_step % cfg.log_freq == 0:
-        print(train_info)
-
-# Save the policy, configuration, and normalization stats for later use.
-policy.save_pretrained(output_directory / "model.pt")
-OmegaConf.save(cfg, output_directory / "config.yaml")
-torch.save(offline_buffer.transform[-1].stats, output_directory / "stats.pth")
--- a/lerobot/init.py
+++ b/lerobot/init.py
@@ -1,59 +0,0 @@
-"""
-This file contains lists of available environments, dataset and policies to reflect the current state of LeRobot library.
-We do not want to import all the dependencies, but instead we keep it lightweight to ensure fast access to these variables.
-
-Example:
-    ```python
-        import lerobot
-        print(lerobot.available_envs)
-        print(lerobot.available_tasks_per_env)
-        print(lerobot.available_datasets_per_env)
-        print(lerobot.available_datasets)
-        print(lerobot.available_policies)
-    ```
-
-Note:
-    When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to:
-        1. set the required class attributes:
-            - for classes inheriting from `AbstractDataset`: `available_datasets`
-            - for classes inheriting from `AbstractEnv`: `name`, `available_tasks`
-            - for classes inheriting from `AbstractPolicy`: `name`
-        2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`)
-        3. update variables in `tests/test_available.py` by importing your new class
-"""
-
-from lerobot.__version__ import __version__  # noqa: F401
-
-available_envs = [
-    "aloha",
-    "pusht",
-    "simxarm",
-]
-
-available_tasks_per_env = {
-    "aloha": [
-        "sim_insertion",
-        "sim_transfer_cube",
-    ],
-    "pusht": ["pusht"],
-    "simxarm": ["lift"],
-}
-
-available_datasets_per_env = {
-    "aloha": [
-        "aloha_sim_insertion_human",
-        "aloha_sim_insertion_scripted",
-        "aloha_sim_transfer_cube_human",
-        "aloha_sim_transfer_cube_scripted",
-    ],
-    "pusht": ["pusht"],
-    "simxarm": ["xarm_lift_medium"],
-}
-
-available_datasets = [dataset for env in available_envs for dataset in available_datasets_per_env[env]]
-
-available_policies = [
-    "act",
-    "diffusion",
-    "tdmpc",
-]
--- a/lerobot/version.py
+++ b/lerobot/version.py
@@ -1,8 +1 @@
-"""To enable `lerobot.__version__`"""
-
-from importlib.metadata import PackageNotFoundError, version
-
-try:
-    __version__ = version("lerobot")
-except PackageNotFoundError:
-    __version__ = "unknown"
+__version__ = "0.0.0"
--- a/lerobot/common/envs/simxarm/simxarm/tasks/init.py
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/init.py
--- a/lerobot/common/datasets/init.py
+++ b/lerobot/common/datasets/init.py
--- a/lerobot/common/datasets/abstract.py
+++ b/lerobot/common/datasets/abstract.py
@@ -1,3 +1,4 @@
+import abc
 import logging
 from pathlib import Path
 from typing import Callable
@@ -6,78 +7,36 @@ import einops
 import torch
 import torchrl
 import tqdm
-from huggingface_hub import snapshot_download
 from tensordict import TensorDict
+from torchrl.data.datasets.utils import _get_root_dir
 from torchrl.data.replay_buffers.replay_buffers import TensorDictReplayBuffer
-from torchrl.data.replay_buffers.samplers import Sampler
+from torchrl.data.replay_buffers.samplers import SliceSampler
 from torchrl.data.replay_buffers.storages import TensorStorage, _collate_id
 from torchrl.data.replay_buffers.writers import ImmutableDatasetWriter, Writer
-from torchrl.envs.transforms.transforms import Compose
-
-HF_USER = "lerobot"


-class AbstractDataset(TensorDictReplayBuffer):
-    """
-    AbstractDataset represents a dataset in the context of imitation learning or reinforcement learning.
-    This class is designed to be subclassed by concrete implementations that specify particular types of datasets.
-    These implementations can vary based on the source of the data, the environment the data pertains to,
-    or the specific kind of data manipulation applied.
-
-    Note:
-        - `TensorDictReplayBuffer` is the base class from which `AbstractDataset` inherits. It provides the foundational
-           functionality for storing and retrieving `TensorDict`-like data.
-        - `available_datasets` should be overridden by concrete subclasses to list the specific dataset variants supported.
-           It is expected that these variants correspond to a HuggingFace dataset on the hub.
-           For instance, the `AlohaDataset` which inherites from `AbstractDataset` has 4 available dataset variants:
-            - [aloha_sim_transfer_cube_scripted](https://huggingface.co/datasets/lerobot/aloha_sim_transfer_cube_scripted)
-            - [aloha_sim_insertion_scripted](https://huggingface.co/datasets/lerobot/aloha_sim_insertion_scripted)
-            - [aloha_sim_transfer_cube_human](https://huggingface.co/datasets/lerobot/aloha_sim_transfer_cube_human)
-            - [aloha_sim_insertion_human](https://huggingface.co/datasets/lerobot/aloha_sim_insertion_human)
-        - When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to:
-            1. set the required class attributes:
-                - for classes inheriting from `AbstractDataset`: `available_datasets`
-                - for classes inheriting from `AbstractEnv`: `name`, `available_tasks`
-                - for classes inheriting from `AbstractPolicy`: `name`
-            2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`)
-            3. update variables in `tests/test_available.py` by importing your new class
-    """
-
-    available_datasets: list[str] | None = None
-
+class AbstractExperienceReplay(TensorDictReplayBuffer):
    def __init__(
        self,
        dataset_id: str,
-        version: str | None = None,
-        batch_size: int | None = None,
+        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path | None = None,
+        root: Path = None,
        pin_memory: bool = False,
        prefetch: int = None,
-        sampler: Sampler | None = None,
-        collate_fn: Callable | None = None,
-        writer: Writer | None = None,
+        sampler: SliceSampler = None,
+        collate_fn: Callable = None,
+        writer: Writer = None,
        transform: "torchrl.envs.Transform" = None,
    ):
-        assert (
-            self.available_datasets is not None
-        ), "Subclasses of `AbstractDataset` should set the `available_datasets` class attribute."
-        assert (
-            dataset_id in self.available_datasets
-        ), f"The provided dataset ({dataset_id}) is not on the list of available datasets {self.available_datasets}."
-
        self.dataset_id = dataset_id
-        self.version = version
        self.shuffle = shuffle
-        self.root = root if root is None else Path(root)
+        self.root = _get_root_dir(self.dataset_id) if root is None else root
+        self.root = Path(self.root)
+        self.data_dir = self.root / self.dataset_id

-        if self.root is not None and self.version is not None:
-            logging.warning(
-                f"The version of the dataset ({self.version}) is not enforced when root is provided ({self.root})."
-            )
-
-        storage = self._download_or_load_dataset()
+        storage = self._download_or_load_storage()

        super().__init__(
            storage=storage,
@@ -93,9 +52,9 @@ class AbstractDataset(TensorDictReplayBuffer):
    @property
    def stats_patterns(self) -> dict:
        return {
-            ("observation", "state"): "b c -> c",
-            ("observation", "image"): "b c h w -> c 1 1",
-            ("action",): "b c -> c",
+            ("observation", "state"): "b c -> 1 c",
+            ("observation", "image"): "b c h w -> 1 c 1 1",
+            ("action"): "b c -> 1 c",
        }

    @property
@@ -114,19 +73,8 @@ class AbstractDataset(TensorDictReplayBuffer):
    def num_episodes(self) -> int:
        return len(self._storage._storage["episode"].unique())

-    @property
-    def transform(self):
-        return self._transform
-
    def set_transform(self, transform):
-        if not isinstance(transform, Compose):
-            # required since torchrl calls `len(self._transform)` downstream
-            if isinstance(transform, list):
-                self._transform = Compose(*transform)
-            else:
-                self._transform = Compose(transform)
-        else:
-            self._transform = transform
+        self.transform = transform

    def compute_or_load_stats(self, num_batch=100, batch_size=32) -> TensorDict:
        stats_path = self.data_dir / "stats.pth"
@@ -138,16 +86,19 @@ class AbstractDataset(TensorDictReplayBuffer):
            torch.save(stats, stats_path)
        return stats

-    def _download_or_load_dataset(self) -> torch.StorageBase:
-        if self.root is None:
-            self.data_dir = Path(
-                snapshot_download(
-                    repo_id=f"{HF_USER}/{self.dataset_id}", repo_type="dataset", revision=self.version
-                )
-            )
+    @abc.abstractmethod
+    def _download_and_preproc(self) -> torch.StorageBase:
+        raise NotImplementedError()
+
+    def _download_or_load_storage(self):
+        if not self._is_downloaded():
+            storage = self._download_and_preproc()
        else:
-            self.data_dir = self.root / self.dataset_id
-        return TensorStorage(TensorDict.load_memmap(self.data_dir / "replay_buffer"))
+            storage = TensorStorage(TensorDict.load_memmap(self.data_dir))
+        return storage
+
+    def _is_downloaded(self) -> bool:
+        return self.data_dir.is_dir()

    def _compute_stats(self, num_batch=100, batch_size=32):
        rb = TensorDictReplayBuffer(
--- a/lerobot/common/datasets/aloha.py
+++ b/lerobot/common/datasets/aloha.py
@@ -9,11 +9,11 @@ import torch
 import torchrl
 import tqdm
 from tensordict import TensorDict
-from torchrl.data.replay_buffers.samplers import Sampler
+from torchrl.data.replay_buffers.samplers import SliceSampler
 from torchrl.data.replay_buffers.storages import TensorStorage
 from torchrl.data.replay_buffers.writers import Writer

-from lerobot.common.datasets.abstract import AbstractDataset
+from lerobot.common.datasets.abstract import AbstractExperienceReplay

 DATASET_IDS = [
    "aloha_sim_insertion_human",
@@ -80,27 +80,25 @@ def download(data_dir, dataset_id):
    gdown.download(EP49_URLS[dataset_id], output=str(data_dir / "episode_49.hdf5"), fuzzy=True)


-class AlohaDataset(AbstractDataset):
-    available_datasets = DATASET_IDS
-
+class AlohaExperienceReplay(AbstractExperienceReplay):
    def __init__(
        self,
        dataset_id: str,
-        version: str | None = "v1.2",
-        batch_size: int | None = None,
+        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path | None = None,
+        root: Path = None,
        pin_memory: bool = False,
        prefetch: int = None,
-        sampler: Sampler | None = None,
-        collate_fn: Callable | None = None,
-        writer: Writer | None = None,
+        sampler: SliceSampler = None,
+        collate_fn: Callable = None,
+        writer: Writer = None,
        transform: "torchrl.envs.Transform" = None,
    ):
+        assert dataset_id in DATASET_IDS
+
        super().__init__(
            dataset_id,
-            version,
            batch_size,
            shuffle=shuffle,
            root=root,
@@ -115,20 +113,19 @@ class AlohaDataset(AbstractDataset):
    @property
    def stats_patterns(self) -> dict:
        d = {
-            ("observation", "state"): "b c -> c",
-            ("action",): "b c -> c",
+            ("observation", "state"): "b c -> 1 c",
+            ("action"): "b c -> 1 c",
        }
        for cam in CAMERAS[self.dataset_id]:
-            d[("observation", "image", cam)] = "b c h w -> c 1 1"
+            d[("observation", "image", cam)] = "b c h w -> 1 c 1 1"
        return d

    @property
    def image_keys(self) -> list:
        return [("observation", "image", cam) for cam in CAMERAS[self.dataset_id]]

-    def _download_and_preproc_obsolete(self):
-        assert self.root is not None
-        raw_dir = self.root / f"{self.dataset_id}_raw"
+    def _download_and_preproc(self):
+        raw_dir = self.data_dir.parent / f"{self.data_dir.name}_raw"
        if not raw_dir.is_dir():
            download(raw_dir, self.dataset_id)

@@ -177,7 +174,7 @@ class AlohaDataset(AbstractDataset):

                if ep_id == 0:
                    # hack to initialize tensordict data structure to store episodes
-                    td_data = ep_td[0].expand(total_num_frames).memmap_like(self.root / f"{self.dataset_id}")
+                    td_data = ep_td[0].expand(total_num_frames).memmap_like(self.data_dir)

                td_data[idxtd : idxtd + len(ep_td)] = ep_td
                idxtd = idxtd + len(ep_td)
--- a/lerobot/common/datasets/factory.py
+++ b/lerobot/common/datasets/factory.py
@@ -5,22 +5,13 @@ from pathlib import Path
 import torch
 from torchrl.data.replay_buffers import PrioritizedSliceSampler, SliceSampler

-from lerobot.common.transforms import NormalizeTransform, Prod
+from lerobot.common.envs.transforms import NormalizeTransform

-# DATA_DIR specifies to location where datasets are loaded. By default, DATA_DIR is None and
-# we load from `$HOME/.cache/huggingface/hub/datasets`. For our unit tests, we set `DATA_DIR=tests/data`
-# to load a subset of our datasets for faster continuous integration.
-DATA_DIR = Path(os.environ["DATA_DIR"]) if "DATA_DIR" in os.environ else None
+DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))


 def make_offline_buffer(
-    cfg,
-    overwrite_sampler=None,
-    # set normalize=False to remove all transformations and keep images unnormalized in [0,255]
-    normalize=True,
-    overwrite_batch_size=None,
-    overwrite_prefetch=None,
-    stats_path=None,
+    cfg, overwrite_sampler=None, normalize=True, overwrite_batch_size=None, overwrite_prefetch=None
 ):
    if cfg.policy.balanced_sampling:
        assert cfg.online_steps > 0
@@ -65,69 +56,56 @@ def make_offline_buffer(
        sampler = overwrite_sampler

    if cfg.env.name == "simxarm":
-        from lerobot.common.datasets.simxarm import SimxarmDataset
+        from lerobot.common.datasets.simxarm import SimxarmExperienceReplay

-        clsfunc = SimxarmDataset
+        clsfunc = SimxarmExperienceReplay
+        dataset_id = f"xarm_{cfg.env.task}_medium"

    elif cfg.env.name == "pusht":
-        from lerobot.common.datasets.pusht import PushtDataset
+        from lerobot.common.datasets.pusht import PushtExperienceReplay

-        clsfunc = PushtDataset
+        clsfunc = PushtExperienceReplay
+        dataset_id = "pusht"

    elif cfg.env.name == "aloha":
-        from lerobot.common.datasets.aloha import AlohaDataset
+        from lerobot.common.datasets.aloha import AlohaExperienceReplay

-        clsfunc = AlohaDataset
+        clsfunc = AlohaExperienceReplay
+        dataset_id = f"aloha_{cfg.env.task}"
    else:
        raise ValueError(cfg.env.name)

    offline_buffer = clsfunc(
-        dataset_id=cfg.dataset_id,
+        dataset_id=dataset_id,
+        root=DATA_DIR,
        sampler=sampler,
        batch_size=batch_size,
-        root=DATA_DIR,
        pin_memory=pin_memory,
        prefetch=prefetch if isinstance(prefetch, int) else None,
    )

-    if cfg.policy.name == "tdmpc":
-        img_keys = []
-        for key in offline_buffer.image_keys:
-            img_keys.append(("next", *key))
-        img_keys += offline_buffer.image_keys
-    else:
-        img_keys = offline_buffer.image_keys
-
    if normalize:
-        transforms = [Prod(in_keys=img_keys, prod=1 / 255)]
-
-        # TODO(rcadene): make normalization strategy configurable between mean_std, min_max, manual_min_max,
-        # min_max_from_spec
-        stats = offline_buffer.compute_or_load_stats() if stats_path is None else torch.load(stats_path)
-
-        # we only normalize the state and action, since the images are usually normalized inside the model for
-        # now (except for tdmpc: see the following)
+        # TODO(rcadene): make normalization strategy configurable between mean_std, min_max, manual_min_max, min_max_from_spec
+        stats = offline_buffer.compute_or_load_stats()
        in_keys = [("observation", "state"), ("action")]

-        if cfg.policy.name == "tdmpc":
-            # TODO(rcadene): we add img_keys to the keys to normalize for tdmpc only, since diffusion and act policies normalize the image inside the model for now
-            in_keys += img_keys
-            # TODO(racdene): since we use next observations in tdmpc, we also add them to the normalization. We are wasting a bit of compute on this for now.
-            in_keys += [("next", *key) for key in img_keys]
+        if cfg.policy == "tdmpc":
+            for key in offline_buffer.image_keys:
+                # TODO(rcadene): imagenet normalization is applied inside diffusion policy, but no normalization inside tdmpc
+                in_keys.append(key)
+                # since we use next observations in tdmpc
+                in_keys.append(("next", *key))
            in_keys.append(("next", "observation", "state"))

-        if cfg.policy.name == "diffusion" and cfg.env.name == "pusht":
+        if cfg.policy == "diffusion" and cfg.env.name == "pusht":
            # TODO(rcadene): we overwrite stats to have the same as pretrained model, but we should remove this
            stats["observation", "state", "min"] = torch.tensor([13.456424, 32.938293], dtype=torch.float32)
            stats["observation", "state", "max"] = torch.tensor([496.14618, 510.9579], dtype=torch.float32)
            stats["action", "min"] = torch.tensor([12.0, 25.0], dtype=torch.float32)
            stats["action", "max"] = torch.tensor([511.0, 511.0], dtype=torch.float32)

-        # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std
-        normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max"
-        transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode))
-
-        offline_buffer.set_transform(transforms)
+        transform = NormalizeTransform(stats, in_keys, mode="min_max")
+        offline_buffer.set_transform(transform)

    if not overwrite_sampler:
        index = torch.arange(0, offline_buffer.num_samples, 1)
--- a/lerobot/common/datasets/pusht.py
+++ b/lerobot/common/datasets/pusht.py
@@ -8,19 +8,20 @@ import pymunk
 import torch
 import torchrl
 import tqdm
+from diffusion_policy.common.replay_buffer import ReplayBuffer as DiffusionPolicyReplayBuffer
+from diffusion_policy.env.pusht.pusht_env import pymunk_to_shapely
 from tensordict import TensorDict
-from torchrl.data.replay_buffers.samplers import Sampler
+from torchrl.data.replay_buffers.samplers import SliceSampler
 from torchrl.data.replay_buffers.storages import TensorStorage
 from torchrl.data.replay_buffers.writers import Writer

-from lerobot.common.datasets.abstract import AbstractDataset
+from lerobot.common.datasets.abstract import AbstractExperienceReplay
 from lerobot.common.datasets.utils import download_and_extract_zip
-from lerobot.common.envs.pusht.pusht_env import pymunk_to_shapely
-from lerobot.common.policies.diffusion.replay_buffer import ReplayBuffer as DiffusionPolicyReplayBuffer

 # as define in env
 SUCCESS_THRESHOLD = 0.95  # 95% coverage,

+DEFAULT_TEE_MASK = pymunk.ShapeFilter.ALL_MASKS()
 PUSHT_URL = "https://diffusion-policy.cs.columbia.edu/data/training/pusht.zip"
 PUSHT_ZARR = Path("pusht/pusht_cchi_v7_replay.zarr")

@@ -48,10 +49,8 @@ def add_tee(
    angle,
    scale=30,
    color="LightSlateGray",
-    mask=None,
+    mask=DEFAULT_TEE_MASK,
 ):
-    if mask is None:
-        mask = pymunk.ShapeFilter.ALL_MASKS()
    mass = 1
    length = 4
    vertices1 = [
@@ -83,27 +82,23 @@ def add_tee(
    return body


-class PushtDataset(AbstractDataset):
-    available_datasets = ["pusht"]
-
+class PushtExperienceReplay(AbstractExperienceReplay):
    def __init__(
        self,
        dataset_id: str,
-        version: str | None = "v1.2",
-        batch_size: int | None = None,
+        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path | None = None,
+        root: Path = None,
        pin_memory: bool = False,
        prefetch: int = None,
-        sampler: Sampler | None = None,
-        collate_fn: Callable | None = None,
-        writer: Writer | None = None,
+        sampler: SliceSampler = None,
+        collate_fn: Callable = None,
+        writer: Writer = None,
        transform: "torchrl.envs.Transform" = None,
    ):
        super().__init__(
            dataset_id,
-            version,
            batch_size,
            shuffle=shuffle,
            root=root,
@@ -115,9 +110,8 @@ class PushtDataset(AbstractDataset):
            transform=transform,
        )

-    def _download_and_preproc_obsolete(self):
-        assert self.root is not None
-        raw_dir = self.root / f"{self.dataset_id}_raw"
+    def _download_and_preproc(self):
+        raw_dir = self.data_dir.parent / f"{self.data_dir.name}_raw"
        zarr_path = (raw_dir / PUSHT_ZARR).resolve()
        if not zarr_path.is_dir():
            raw_dir.mkdir(parents=True, exist_ok=True)
@@ -131,9 +125,6 @@ class PushtDataset(AbstractDataset):
        episode_ids = torch.from_numpy(dataset_dict.get_episode_idxs())
        num_episodes = dataset_dict.meta["episode_ends"].shape[0]
        total_frames = dataset_dict["action"].shape[0]
-        # to create test artifact
-        # num_episodes = 1
-        # total_frames = 50
        assert len(
            {dataset_dict[key].shape[0] for key in dataset_dict.keys()}  # noqa: SIM118
        ), "Some data type dont have the same number of total frames."
@@ -151,8 +142,6 @@ class PushtDataset(AbstractDataset):
        idxtd = 0
        for episode_id in tqdm.tqdm(range(num_episodes)):
            idx1 = dataset_dict.meta["episode_ends"][episode_id]
-            # to create test artifact
-            # idx1 = 51

            num_frames = idx1 - idx0

@@ -213,7 +202,7 @@ class PushtDataset(AbstractDataset):

            if episode_id == 0:
                # hack to initialize tensordict data structure to store episodes
-                td_data = ep_td[0].expand(total_frames).memmap_like(self.root / f"{self.dataset_id}")
+                td_data = ep_td[0].expand(total_frames).memmap_like(self.data_dir)

            td_data[idxtd : idxtd + len(ep_td)] = ep_td

--- a/lerobot/common/datasets/simxarm.py
+++ b/lerobot/common/datasets/simxarm.py
@@ -8,12 +8,12 @@ import torchrl
 import tqdm
 from tensordict import TensorDict
 from torchrl.data.replay_buffers.samplers import (
-    Sampler,
+    SliceSampler,
 )
 from torchrl.data.replay_buffers.storages import TensorStorage
 from torchrl.data.replay_buffers.writers import Writer

-from lerobot.common.datasets.abstract import AbstractDataset
+from lerobot.common.datasets.abstract import AbstractExperienceReplay


 def download():
@@ -32,7 +32,7 @@ def download():
    Path(download_path).unlink()


-class SimxarmDataset(AbstractDataset):
+class SimxarmExperienceReplay(AbstractExperienceReplay):
    available_datasets = [
        "xarm_lift_medium",
    ]
@@ -40,21 +40,19 @@ class SimxarmDataset(AbstractDataset):
    def __init__(
        self,
        dataset_id: str,
-        version: str | None = "v1.1",
-        batch_size: int | None = None,
+        batch_size: int = None,
        *,
        shuffle: bool = True,
-        root: Path | None = None,
+        root: Path = None,
        pin_memory: bool = False,
        prefetch: int = None,
-        sampler: Sampler | None = None,
-        collate_fn: Callable | None = None,
-        writer: Writer | None = None,
+        sampler: SliceSampler = None,
+        collate_fn: Callable = None,
+        writer: Writer = None,
        transform: "torchrl.envs.Transform" = None,
    ):
        super().__init__(
            dataset_id,
-            version,
            batch_size,
            shuffle=shuffle,
            root=root,
@@ -66,12 +64,11 @@ class SimxarmDataset(AbstractDataset):
            transform=transform,
        )

-    def _download_and_preproc_obsolete(self):
-        # assert self.root is not None
+    def _download_and_preproc(self):
        # TODO(rcadene): finish download
-        # download()
+        download()

-        dataset_path = self.root / f"{self.dataset_id}" / "buffer.pkl"
+        dataset_path = self.data_dir / "buffer.pkl"
        print(f"Using offline dataset '{dataset_path}'")
        with open(dataset_path, "rb") as f:
            dataset_dict = pickle.load(f)
@@ -105,19 +102,15 @@ class SimxarmDataset(AbstractDataset):
                    "frame_id": torch.arange(0, num_frames, 1),
                    ("next", "observation", "image"): next_image,
                    ("next", "observation", "state"): next_state,
-                    ("next", "reward"): next_reward,
-                    ("next", "done"): next_done,
+                    ("next", "observation", "reward"): next_reward,
+                    ("next", "observation", "done"): next_done,
                },
                batch_size=num_frames,
            )

            if episode_id == 0:
                # hack to initialize tensordict data structure to store episodes
-                td_data = (
-                    episode[0]
-                    .expand(total_frames)
-                    .memmap_like(self.root / f"{self.dataset_id}" / "replay_buffer")
-                )
+                td_data = episode[0].expand(total_frames).memmap_like(self.data_dir)

            td_data[idx0:idx1] = episode

--- a/lerobot/common/envs/init.py
+++ b/lerobot/common/envs/init.py
--- a/lerobot/common/envs/abstract.py
+++ b/lerobot/common/envs/abstract.py
@@ -1,27 +1,12 @@
+import abc
 from collections import deque
 from typing import Optional

 from tensordict import TensorDict
 from torchrl.envs import EnvBase

-from lerobot.common.utils import set_global_seed
-

 class AbstractEnv(EnvBase):
-    """
-    Note:
-        When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to:
-            1. set the required class attributes:
-                - for classes inheriting from `AbstractDataset`: `available_datasets`
-                - for classes inheriting from `AbstractEnv`: `name`, `available_tasks`
-                - for classes inheriting from `AbstractPolicy`: `name`
-            2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`)
-            3. update variables in `tests/test_available.py` by importing your new class
-    """
-
-    name: str | None = None  # same name should be used to instantiate the environment in factory.py
-    available_tasks: list[str] | None = None  # for instance: sim_insertion, sim_transfer_cube, pusht, lift
-
    def __init__(
        self,
        task,
@@ -35,14 +20,6 @@ class AbstractEnv(EnvBase):
        num_prev_action=0,
    ):
        super().__init__(device=device, batch_size=[])
-        assert self.name is not None, "Subclasses of `AbstractEnv` should set the `name` class attribute."
-        assert (
-            self.available_tasks is not None
-        ), "Subclasses of `AbstractEnv` should set the `available_tasks` class attribute."
-        assert (
-            task in self.available_tasks
-        ), f"The provided task ({task}) is not on the list of available tasks {self.available_tasks}."
-
        self.task = task
        self.frame_skip = frame_skip
        self.from_pixels = from_pixels
@@ -50,21 +27,15 @@ class AbstractEnv(EnvBase):
        self.image_size = image_size
        self.num_prev_obs = num_prev_obs
        self.num_prev_action = num_prev_action
+        self._rendering_hooks = []

        if pixels_only:
            assert from_pixels
        if from_pixels:
            assert image_size

-        self._make_env()
        self._make_spec()
-
-        # self._next_seed will be used for the next reset. It is recommended that when self.set_seed is called
-        # you store the return value in self._next_seed (it will be a new randomly generated seed).
-        self._next_seed = seed
-        # Don't store the result of this in self._next_seed, as we want to make sure that the first time
-        # self._reset is called, we use seed.
-        self.set_seed(seed)
+        self._current_seed = self.set_seed(seed)

        if self.num_prev_obs > 0:
            self._prev_obs_image_queue = deque(maxlen=self.num_prev_obs)
@@ -73,20 +44,32 @@ class AbstractEnv(EnvBase):
            raise NotImplementedError()
            # self._prev_action_queue = deque(maxlen=self.num_prev_action)

+    def register_rendering_hook(self, func):
+        self._rendering_hooks.append(func)
+
+    def call_rendering_hooks(self):
+        for func in self._rendering_hooks:
+            func(self)
+
+    def reset_rendering_hooks(self):
+        self._rendering_hooks = []
+
+    @abc.abstractmethod
    def render(self, mode="rgb_array", width=640, height=480):
-        raise NotImplementedError("Abstract method")
+        raise NotImplementedError()

+    @abc.abstractmethod
    def _reset(self, tensordict: Optional[TensorDict] = None):
-        raise NotImplementedError("Abstract method")
+        raise NotImplementedError()

+    @abc.abstractmethod
    def _step(self, tensordict: TensorDict):
-        raise NotImplementedError("Abstract method")
-
-    def _make_env(self):
-        raise NotImplementedError("Abstract method")
+        raise NotImplementedError()

+    @abc.abstractmethod
    def _make_spec(self):
-        raise NotImplementedError("Abstract method")
+        raise NotImplementedError()

+    @abc.abstractmethod
    def _set_seed(self, seed: Optional[int]):
-        set_global_seed(seed)
+        raise NotImplementedError()
--- a/lerobot/common/envs/aloha/assets/tabletop.stl
+++ b/lerobot/common/envs/aloha/assets/tabletop.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_left.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_left.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_right.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_10_custom_finger_right.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_10_gripper_finger.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_10_gripper_finger.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_11_ar_tag.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_11_ar_tag.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_1_base.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_1_base.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_2_shoulder.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_2_shoulder.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_3_upper_arm.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_3_upper_arm.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_4_upper_forearm.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_4_upper_forearm.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_5_lower_forearm.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_5_lower_forearm.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_6_wrist.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_6_wrist.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_7_gripper.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_7_gripper.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_8_gripper_prop.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_8_gripper_prop.stl
--- a/lerobot/common/envs/aloha/assets/vx300s_9_gripper_bar.stl
+++ b/lerobot/common/envs/aloha/assets/vx300s_9_gripper_bar.stl
--- a/lerobot/common/envs/aloha/env.py
+++ b/lerobot/common/envs/aloha/env.py
@@ -29,16 +29,12 @@ from lerobot.common.envs.aloha.tasks.sim_end_effector import (
    TransferCubeEndEffectorTask,
 )
 from lerobot.common.envs.aloha.utils import sample_box_pose, sample_insertion_pose
-from lerobot.common.utils import set_global_seed
+from lerobot.common.utils import set_seed

-_has_gym = importlib.util.find_spec("gymnasium") is not None
+_has_gym = importlib.util.find_spec("gym") is not None


 class AlohaEnv(AbstractEnv):
-    name = "aloha"
-    available_tasks = ["sim_insertion", "sim_transfer_cube"]
-    _reset_warning_issued = False
-
    def __init__(
        self,
        task,
@@ -62,15 +58,13 @@ class AlohaEnv(AbstractEnv):
            num_prev_obs=num_prev_obs,
            num_prev_action=num_prev_action,
        )
-
-    def _make_env(self):
        if not _has_gym:
-            raise ImportError("Cannot import gymnasium.")
+            raise ImportError("Cannot import gym.")

-        if not self.from_pixels:
+        if not from_pixels:
            raise NotImplementedError()

-        self._env = self._make_env_task(self.task)
+        self._env = self._make_env_task(task)

    def render(self, mode="rgb_array", width=640, height=480):
        # TODO(rcadene): render and visualizer several cameras (e.g. angle, front_close)
@@ -111,8 +105,7 @@ class AlohaEnv(AbstractEnv):
        if self.from_pixels:
            image = torch.from_numpy(raw_obs["images"]["top"].copy())
            image = einops.rearrange(image, "h w c -> c h w")
-            assert image.dtype == torch.uint8
-            obs = {"image": {"top": image}}
+            obs = {"image": image.type(torch.float32) / 255.0}

            if not self.pixels_only:
                obs["state"] = torch.from_numpy(raw_obs["qpos"]).type(torch.float32)
@@ -124,74 +117,91 @@ class AlohaEnv(AbstractEnv):
        return obs

    def _reset(self, tensordict: Optional[TensorDict] = None):
-        if tensordict is not None and not AlohaEnv._reset_warning_issued:
-            logging.warning(f"{self.__class__.__name__}._reset ignores the provided tensordict.")
-            AlohaEnv._reset_warning_issued = True
+        td = tensordict
+        if td is None or td.is_empty():
+            # we need to handle seed iteration, since self._env.reset() relies on an internal _seed.
+            self._current_seed += 1
+            self.set_seed(self._current_seed)

-        # Seed the environment and update the seed to be used for the next reset.
-        self._next_seed = self.set_seed(self._next_seed)
+            # TODO(rcadene): do not use global variable for this
+            if "sim_transfer_cube" in self.task:
+                BOX_POSE[0] = sample_box_pose()  # used in sim reset
+            elif "sim_insertion" in self.task:
+                BOX_POSE[0] = np.concatenate(sample_insertion_pose())  # used in sim reset

-        # TODO(rcadene): do not use global variable for this
-        if "sim_transfer_cube" in self.task:
-            BOX_POSE[0] = sample_box_pose()  # used in sim reset
-        elif "sim_insertion" in self.task:
-            BOX_POSE[0] = np.concatenate(sample_insertion_pose())  # used in sim reset
+            raw_obs = self._env.reset()
+            # TODO(rcadene): add assert
+            # assert self._current_seed == self._env._seed

-        raw_obs = self._env.reset()
+            obs = self._format_raw_obs(raw_obs.observation)

-        obs = self._format_raw_obs(raw_obs.observation)
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue = deque(
+                        [obs["image"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
+                if "state" in obs:
+                    self._prev_obs_state_queue = deque(
+                        [obs["state"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs

-        if self.num_prev_obs > 0:
-            stacked_obs = {}
-            if "image" in obs:
-                self._prev_obs_image_queue = deque(
-                    [obs["image"]["top"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
-                )
-                stacked_obs["image"] = {"top": torch.stack(list(self._prev_obs_image_queue))}
-            if "state" in obs:
-                self._prev_obs_state_queue = deque(
-                    [obs["state"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
-                )
-                stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
-            obs = stacked_obs
-
-        td = TensorDict(
-            {
-                "observation": TensorDict(obs, batch_size=[]),
-                "done": torch.tensor([False], dtype=torch.bool),
-            },
-            batch_size=[],
-        )
+            td = TensorDict(
+                {
+                    "observation": TensorDict(obs, batch_size=[]),
+                    "done": torch.tensor([False], dtype=torch.bool),
+                },
+                batch_size=[],
+            )
+        else:
+            raise NotImplementedError()

+        self.call_rendering_hooks()
        return td

    def _step(self, tensordict: TensorDict):
        td = tensordict
        action = td["action"].numpy()
-        assert action.ndim == 1
+        # step expects shape=(4,) so we pad if necessary
        # TODO(rcadene): add info["is_success"] and info["success"] ?
+        sum_reward = 0

-        _, reward, _, raw_obs = self._env.step(action)
+        if action.ndim == 1:
+            action = einops.repeat(action, "c -> t c", t=self.frame_skip)
+        else:
+            if self.frame_skip > 1:
+                raise NotImplementedError()

-        # TODO(rcadene): add an enum
-        success = done = reward == 4
-        obs = self._format_raw_obs(raw_obs)
+        num_action_steps = action.shape[0]
+        for i in range(num_action_steps):
+            _, reward, discount, raw_obs = self._env.step(action[i])
+            del discount  # not used

-        if self.num_prev_obs > 0:
-            stacked_obs = {}
-            if "image" in obs:
-                self._prev_obs_image_queue.append(obs["image"]["top"])
-                stacked_obs["image"] = {"top": torch.stack(list(self._prev_obs_image_queue))}
-            if "state" in obs:
-                self._prev_obs_state_queue.append(obs["state"])
-                stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
-            obs = stacked_obs
+            # TODO(rcadene): add an enum
+            success = done = reward == 4
+            sum_reward += reward
+            obs = self._format_raw_obs(raw_obs)
+
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue.append(obs["image"])
+                    stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
+                if "state" in obs:
+                    self._prev_obs_state_queue.append(obs["state"])
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs
+
+            self.call_rendering_hooks()

        td = TensorDict(
            {
                "observation": TensorDict(obs, batch_size=[]),
-                "reward": torch.tensor([reward], dtype=torch.float32),
-                # success and done are true when coverage > self.success_threshold in env
+                "reward": torch.tensor([sum_reward], dtype=torch.float32),
+                # success and done are true when coverage > self.success_threshold in env
                "done": torch.tensor([done], dtype=torch.bool),
                "success": torch.tensor([success], dtype=torch.bool),
            },
@@ -206,7 +216,7 @@ class AlohaEnv(AbstractEnv):
        if self.from_pixels:
            if isinstance(self.image_size, int):
                image_shape = (3, self.image_size, self.image_size)
-            elif OmegaConf.is_list(self.image_size) or isinstance(self.image_size, list):
+            elif OmegaConf.is_list(self.image_size):
                assert len(self.image_size) == 3  # c h w
                assert self.image_size[0] == 3  # c is RGB
                image_shape = tuple(self.image_size)
@@ -215,15 +225,13 @@ class AlohaEnv(AbstractEnv):
            if self.num_prev_obs > 0:
                image_shape = (self.num_prev_obs + 1, *image_shape)

-            obs["image"] = {
-                "top": BoundedTensorSpec(
-                    low=0,
-                    high=255,
-                    shape=image_shape,
-                    dtype=torch.uint8,
-                    device=self.device,
-                )
-            }
+            obs["image"] = BoundedTensorSpec(
+                low=0,
+                high=1,
+                shape=image_shape,
+                dtype=torch.float32,
+                device=self.device,
+            )
            if not self.pixels_only:
                state_shape = (len(JOINTS),)
                if self.num_prev_obs > 0:
@@ -292,7 +300,7 @@ class AlohaEnv(AbstractEnv):
        )

    def _set_seed(self, seed: Optional[int]):
-        set_global_seed(seed)
+        set_seed(seed)
        # TODO(rcadene): seed the env
        # self._env.seed(seed)
        logging.warning("Aloha env is not seeded")
--- a/lerobot/common/envs/factory.py
+++ b/lerobot/common/envs/factory.py
@@ -1,28 +1,24 @@
-from torchrl.envs import SerialEnv
-from torchrl.envs.transforms import Compose, StepCounter, Transform, TransformedEnv
+from torchrl.envs.transforms import StepCounter, TransformedEnv


 def make_env(cfg, transform=None):
-    """
-    Note: The returned environment is wrapped in a torchrl.SerialEnv with cfg.rollout_batch_size underlying
-    environments. The env therefore returns batches.`
-    """
-
    kwargs = {
        "frame_skip": cfg.env.action_repeat,
        "from_pixels": cfg.env.from_pixels,
        "pixels_only": cfg.env.pixels_only,
        "image_size": cfg.env.image_size,
+        # TODO(rcadene): do we want a specific eval_env_seed?
+        "seed": cfg.seed,
        "num_prev_obs": cfg.n_obs_steps - 1,
    }

    if cfg.env.name == "simxarm":
-        from lerobot.common.envs.simxarm.env import SimxarmEnv
+        from lerobot.common.envs.simxarm import SimxarmEnv

        kwargs["task"] = cfg.env.task
        clsfunc = SimxarmEnv
    elif cfg.env.name == "pusht":
-        from lerobot.common.envs.pusht.env import PushtEnv
+        from lerobot.common.envs.pusht import PushtEnv

        # assert kwargs["seed"] > 200, "Seed 0-200 are used for the demonstration dataset, so we don't want to seed the eval env with this range."

@@ -35,30 +31,37 @@ def make_env(cfg, transform=None):
    else:
        raise ValueError(cfg.env.name)

-    def _make_env(seed):
-        nonlocal kwargs
-        kwargs["seed"] = seed
-        env = clsfunc(**kwargs)
+    env = clsfunc(**kwargs)

-        # limit rollout to max_steps
-        env = TransformedEnv(env, StepCounter(max_steps=cfg.env.episode_length))
+    # limit rollout to max_steps
+    env = TransformedEnv(env, StepCounter(max_steps=cfg.env.episode_length))

-        if transform is not None:
-            # useful to add normalization
-            if isinstance(transform, Compose):
-                for tf in transform:
-                    env.append_transform(tf.clone())
-            elif isinstance(transform, Transform):
-                env.append_transform(transform.clone())
-            else:
-                raise NotImplementedError()
+    if transform is not None:
+        # useful to add normalization
+        env.append_transform(transform)

-        return env
+    return env

-    return SerialEnv(
-        cfg.rollout_batch_size,
-        create_env_fn=_make_env,
-        create_env_kwargs=[
-            {"seed": env_seed} for env_seed in range(cfg.seed, cfg.seed + cfg.rollout_batch_size)
-        ],
-    )
+
+# def make_env(env_name, frame_skip, device, is_test=False):
+#     env = GymEnv(
+#         env_name,
+#         frame_skip=frame_skip,
+#         from_pixels=True,
+#         pixels_only=False,
+#         device=device,
+#     )
+#     env = TransformedEnv(env)
+#     env.append_transform(NoopResetEnv(noops=30, random=True))
+#     if not is_test:
+#         env.append_transform(EndOfLifeTransform())
+#         env.append_transform(RewardClipping(-1, 1))
+#     env.append_transform(ToTensorImage())
+#     env.append_transform(GrayScale())
+#     env.append_transform(Resize(84, 84))
+#     env.append_transform(CatFrames(N=4, dim=-3))
+#     env.append_transform(RewardSum())
+#     env.append_transform(StepCounter(max_steps=4500))
+#     env.append_transform(DoubleToFloat())
+#     env.append_transform(VecNorm(in_keys=["pixels"]))
+#     return env
--- a/lerobot/common/envs/simxarm/env.py
+++ b/lerobot/common/envs/simxarm/env.py
@@ -1,10 +1,8 @@
 import importlib
-import logging
 from collections import deque
 from typing import Optional

 import einops
-import numpy as np
 import torch
 from tensordict import TensorDict
 from torchrl.data.tensor_specs import (
@@ -13,23 +11,18 @@ from torchrl.data.tensor_specs import (
    DiscreteTensorSpec,
    UnboundedContinuousTensorSpec,
 )
+from torchrl.envs import EnvBase
 from torchrl.envs.libs.gym import _gym_to_torchrl_spec_transform

-from lerobot.common.envs.abstract import AbstractEnv
-from lerobot.common.utils import set_global_seed
+from lerobot.common.utils import set_seed

-MAX_NUM_ACTIONS = 4
-
-_has_gym = importlib.util.find_spec("gymnasium") is not None
+_has_gym = importlib.util.find_spec("gym") is not None
+_has_diffpolicy = importlib.util.find_spec("diffusion_policy") is not None and _has_gym


-class SimxarmEnv(AbstractEnv):
-    name = "simxarm"
-    available_tasks = ["lift"]
-
+class PushtEnv(EnvBase):
    def __init__(
        self,
-        task,
        frame_skip: int = 1,
        from_pixels: bool = False,
        pixels_only: bool = False,
@@ -39,60 +32,73 @@ class SimxarmEnv(AbstractEnv):
        num_prev_obs=0,
        num_prev_action=0,
    ):
-        super().__init__(
-            task=task,
-            frame_skip=frame_skip,
-            from_pixels=from_pixels,
-            pixels_only=pixels_only,
-            image_size=image_size,
-            seed=seed,
-            device=device,
-            num_prev_obs=num_prev_obs,
-            num_prev_action=num_prev_action,
-        )
+        super().__init__(device=device, batch_size=[])
+        self.frame_skip = frame_skip
+        self.from_pixels = from_pixels
+        self.pixels_only = pixels_only
+        self.image_size = image_size
+        self.num_prev_obs = num_prev_obs
+        self.num_prev_action = num_prev_action

-    def _make_env(self):
+        if pixels_only:
+            assert from_pixels
+        if from_pixels:
+            assert image_size
+
+        if not _has_diffpolicy:
+            raise ImportError("Cannot import diffusion_policy.")
        if not _has_gym:
-            raise ImportError("Cannot import gymnasium.")
+            raise ImportError("Cannot import gym.")

-        import gymnasium
+        # TODO(rcadene) (PushTEnv is similar to PushTImageEnv, but without the image rendering, it's faster to iterate on)
+        # from diffusion_policy.env.pusht.pusht_env import PushTEnv

-        from lerobot.common.envs.simxarm.simxarm import TASKS
+        if not from_pixels:
+            raise NotImplementedError("Use PushTEnv, instead of PushTImageEnv")
+        from diffusion_policy.env.pusht.pusht_image_env import PushTImageEnv

-        if self.task not in TASKS:
-            raise ValueError(f"Unknown task {self.task}. Must be one of {list(TASKS.keys())}")
+        self._env = PushTImageEnv(render_size=self.image_size)

-        self._env = TASKS[self.task]["env"]()
+        self._make_spec()
+        self._current_seed = self.set_seed(seed)

-        num_actions = len(TASKS[self.task]["action_space"])
-        self._action_space = gymnasium.spaces.Box(low=-1.0, high=1.0, shape=(num_actions,))
-        self._action_padding = np.zeros((MAX_NUM_ACTIONS - num_actions), dtype=np.float32)
-        if "w" not in TASKS[self.task]["action_space"]:
-            self._action_padding[-1] = 1.0
+        if self.num_prev_obs > 0:
+            self._prev_obs_image_queue = deque(maxlen=self.num_prev_obs)
+            self._prev_obs_state_queue = deque(maxlen=self.num_prev_obs)
+        if self.num_prev_action > 0:
+            raise NotImplementedError()
+            # self._prev_action_queue = deque(maxlen=self.num_prev_action)

    def render(self, mode="rgb_array", width=384, height=384):
-        return self._env.render(mode, width=width, height=height)
+        if width != height:
+            raise NotImplementedError()
+        tmp = self._env.render_size
+        self._env.render_size = width
+        out = self._env.render(mode)
+        self._env.render_size = tmp
+        return out

    def _format_raw_obs(self, raw_obs):
        if self.from_pixels:
-            image = self.render(mode="rgb_array", width=self.image_size, height=self.image_size)
-            image = image.transpose(2, 0, 1)  # (H, W, C) -> (C, H, W)
-            image = torch.tensor(image.copy(), dtype=torch.uint8)
-
+            image = torch.from_numpy(raw_obs["image"])
            obs = {"image": image}

            if not self.pixels_only:
-                obs["state"] = torch.tensor(self._env.robot_state, dtype=torch.float32)
+                obs["state"] = torch.from_numpy(raw_obs["agent_pos"]).type(torch.float32)
        else:
-            obs = {"state": torch.tensor(raw_obs["observation"], dtype=torch.float32)}
+            # TODO:
+            obs = {"state": torch.from_numpy(raw_obs["observation"]).type(torch.float32)}

-        # obs = TensorDict(obs, batch_size=[])
        return obs

    def _reset(self, tensordict: Optional[TensorDict] = None):
        td = tensordict
        if td is None or td.is_empty():
+            # we need to handle seed iteration, since self._env.reset() relies on an internal _seed.
+            self._current_seed += 1
+            self.set_seed(self._current_seed)
            raw_obs = self._env.reset()
+            assert self._current_seed == self._env._seed

            obs = self._format_raw_obs(raw_obs)

@@ -119,14 +125,12 @@ class SimxarmEnv(AbstractEnv):
            )
        else:
            raise NotImplementedError()
-
        return td

    def _step(self, tensordict: TensorDict):
        td = tensordict
        action = td["action"].numpy()
        # step expects shape=(4,) so we pad if necessary
-        action = np.concatenate([action, self._action_padding])
        # TODO(rcadene): add info["is_success"] and info["success"] ?
        sum_reward = 0

@@ -155,10 +159,11 @@ class SimxarmEnv(AbstractEnv):

        td = TensorDict(
            {
-                "observation": self._format_raw_obs(raw_obs),
+                "observation": TensorDict(obs, batch_size=[]),
                "reward": torch.tensor([sum_reward], dtype=torch.float32),
+                # success and done are true when coverage > self.success_threshold in env
                "done": torch.tensor([done], dtype=torch.bool),
-                "success": torch.tensor([info["success"]], dtype=torch.bool),
+                "success": torch.tensor([done], dtype=torch.bool),
            },
            batch_size=[],
        )
@@ -173,17 +178,19 @@ class SimxarmEnv(AbstractEnv):

            obs["image"] = BoundedTensorSpec(
                low=0,
-                high=255,
+                high=1,
                shape=image_shape,
-                dtype=torch.uint8,
+                dtype=torch.float32,
                device=self.device,
            )
            if not self.pixels_only:
-                state_shape = (len(self._env.robot_state),)
+                state_shape = self._env.observation_space["agent_pos"].shape
                if self.num_prev_obs > 0:
                    state_shape = (self.num_prev_obs + 1, *state_shape)

-                obs["state"] = UnboundedContinuousTensorSpec(
+                obs["state"] = BoundedTensorSpec(
+                    low=0,
+                    high=512,
                    shape=state_shape,
                    dtype=torch.float32,
                    device=self.device,
@@ -203,7 +210,7 @@ class SimxarmEnv(AbstractEnv):
        self.observation_spec = CompositeSpec({"observation": obs})

        self.action_spec = _gym_to_torchrl_spec_transform(
-            self._action_space,
+            self._env.action_space,
            device=self.device,
        )

@@ -231,7 +238,5 @@ class SimxarmEnv(AbstractEnv):
        )

    def _set_seed(self, seed: Optional[int]):
-        set_global_seed(seed)
-        self._seed = seed
-        # TODO(aliberts): change self._reset so that it takes in a seed value
-        logging.warning("simxarm env is not properly seeded")
+        set_seed(seed)
+        self._env.seed(seed)
--- a/lerobot/common/envs/pusht/env.py
+++ b/lerobot/common/envs/pusht/env.py
@@ -1,245 +0,0 @@
-import importlib
-import logging
-from collections import deque
-from typing import Optional
-
-import cv2
-import numpy as np
-import torch
-from tensordict import TensorDict
-from torchrl.data.tensor_specs import (
-    BoundedTensorSpec,
-    CompositeSpec,
-    DiscreteTensorSpec,
-    UnboundedContinuousTensorSpec,
-)
-from torchrl.envs.libs.gym import _gym_to_torchrl_spec_transform
-
-from lerobot.common.envs.abstract import AbstractEnv
-from lerobot.common.utils import set_global_seed
-
-_has_gym = importlib.util.find_spec("gymnasium") is not None
-
-
-class PushtEnv(AbstractEnv):
-    name = "pusht"
-    available_tasks = ["pusht"]
-    _reset_warning_issued = False
-
-    def __init__(
-        self,
-        task="pusht",
-        frame_skip: int = 1,
-        from_pixels: bool = False,
-        pixels_only: bool = False,
-        image_size=None,
-        seed=1337,
-        device="cpu",
-        num_prev_obs=1,
-        num_prev_action=0,
-    ):
-        super().__init__(
-            task=task,
-            frame_skip=frame_skip,
-            from_pixels=from_pixels,
-            pixels_only=pixels_only,
-            image_size=image_size,
-            seed=seed,
-            device=device,
-            num_prev_obs=num_prev_obs,
-            num_prev_action=num_prev_action,
-        )
-
-    def _make_env(self):
-        if not _has_gym:
-            raise ImportError("Cannot import gymnasium.")
-
-        # TODO(rcadene) (PushTEnv is similar to PushTImageEnv, but without the image rendering, it's faster to iterate on)
-        # from lerobot.common.envs.pusht.pusht_env import PushTEnv
-
-        if not self.from_pixels:
-            raise NotImplementedError("Use PushTEnv, instead of PushTImageEnv")
-        from lerobot.common.envs.pusht.pusht_image_env import PushTImageEnv
-
-        self._env = PushTImageEnv(render_size=self.image_size)
-
-    def render(self, mode="rgb_array", width=96, height=96, with_marker=True):
-        """
-        with_marker adds a cursor showing the targeted action for the controller.
-        """
-        if width != height:
-            raise NotImplementedError()
-        tmp = self._env.render_size
-        if width != self._env.render_size:
-            self._env.render_cache = None
-            self._env.render_size = width
-        out = self._env.render(mode).copy()
-        if with_marker and self._env.latest_action is not None:
-            action = np.array(self._env.latest_action)
-            coord = (action / 512 * self._env.render_size).astype(np.int32)
-            marker_size = int(8 / 96 * self._env.render_size)
-            thickness = int(1 / 96 * self._env.render_size)
-            cv2.drawMarker(
-                out,
-                coord,
-                color=(255, 0, 0),
-                markerType=cv2.MARKER_CROSS,
-                markerSize=marker_size,
-                thickness=thickness,
-            )
-        self._env.render_size = tmp
-        return out
-
-    def _format_raw_obs(self, raw_obs):
-        if self.from_pixels:
-            image = torch.from_numpy(raw_obs["image"])
-            obs = {"image": image}
-
-            if not self.pixels_only:
-                obs["state"] = torch.from_numpy(raw_obs["agent_pos"]).type(torch.float32)
-        else:
-            # TODO:
-            obs = {"state": torch.from_numpy(raw_obs["observation"]).type(torch.float32)}
-
-        return obs
-
-    def _reset(self, tensordict: Optional[TensorDict] = None):
-        if tensordict is not None and not PushtEnv._reset_warning_issued:
-            logging.warning(f"{self.__class__.__name__}._reset ignores the provided tensordict.")
-            PushtEnv._reset_warning_issued = True
-
-        # Seed the environment and update the seed to be used for the next reset.
-        self._next_seed = self.set_seed(self._next_seed)
-        raw_obs = self._env.reset()
-
-        obs = self._format_raw_obs(raw_obs)
-
-        if self.num_prev_obs > 0:
-            stacked_obs = {}
-            if "image" in obs:
-                self._prev_obs_image_queue = deque(
-                    [obs["image"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
-                )
-                stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
-            if "state" in obs:
-                self._prev_obs_state_queue = deque(
-                    [obs["state"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
-                )
-                stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
-            obs = stacked_obs
-
-        td = TensorDict(
-            {
-                "observation": TensorDict(obs, batch_size=[]),
-                "done": torch.tensor([False], dtype=torch.bool),
-            },
-            batch_size=[],
-        )
-
-        return td
-
-    def _step(self, tensordict: TensorDict):
-        td = tensordict
-        action = td["action"].numpy()
-        assert action.ndim == 1
-        # TODO(rcadene): add info["is_success"] and info["success"] ?
-
-        raw_obs, reward, done, info = self._env.step(action)
-
-        obs = self._format_raw_obs(raw_obs)
-
-        if self.num_prev_obs > 0:
-            stacked_obs = {}
-            if "image" in obs:
-                self._prev_obs_image_queue.append(obs["image"])
-                stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
-            if "state" in obs:
-                self._prev_obs_state_queue.append(obs["state"])
-                stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
-            obs = stacked_obs
-
-        td = TensorDict(
-            {
-                "observation": TensorDict(obs, batch_size=[]),
-                "reward": torch.tensor([reward], dtype=torch.float32),
-                # success and done are true when coverage > self.success_threshold in env
-                "done": torch.tensor([done], dtype=torch.bool),
-                "success": torch.tensor([done], dtype=torch.bool),
-            },
-            batch_size=[],
-        )
-        return td
-
-    def _make_spec(self):
-        obs = {}
-        if self.from_pixels:
-            image_shape = (3, self.image_size, self.image_size)
-            if self.num_prev_obs > 0:
-                image_shape = (self.num_prev_obs + 1, *image_shape)
-
-            obs["image"] = BoundedTensorSpec(
-                low=0,
-                high=255,
-                shape=image_shape,
-                dtype=torch.uint8,
-                device=self.device,
-            )
-            if not self.pixels_only:
-                state_shape = self._env.observation_space["agent_pos"].shape
-                if self.num_prev_obs > 0:
-                    state_shape = (self.num_prev_obs + 1, *state_shape)
-
-                obs["state"] = BoundedTensorSpec(
-                    low=0,
-                    high=512,
-                    shape=state_shape,
-                    dtype=torch.float32,
-                    device=self.device,
-                )
-        else:
-            # TODO(rcadene): add observation_space achieved_goal and desired_goal?
-            state_shape = self._env.observation_space["observation"].shape
-            if self.num_prev_obs > 0:
-                state_shape = (self.num_prev_obs + 1, *state_shape)
-
-            obs["state"] = UnboundedContinuousTensorSpec(
-                # TODO:
-                shape=state_shape,
-                dtype=torch.float32,
-                device=self.device,
-            )
-        self.observation_spec = CompositeSpec({"observation": obs})
-
-        self.action_spec = _gym_to_torchrl_spec_transform(
-            self._env.action_space,
-            device=self.device,
-        )
-
-        self.reward_spec = UnboundedContinuousTensorSpec(
-            shape=(1,),
-            dtype=torch.float32,
-            device=self.device,
-        )
-
-        self.done_spec = CompositeSpec(
-            {
-                "done": DiscreteTensorSpec(
-                    2,
-                    shape=(1,),
-                    dtype=torch.bool,
-                    device=self.device,
-                ),
-                "success": DiscreteTensorSpec(
-                    2,
-                    shape=(1,),
-                    dtype=torch.bool,
-                    device=self.device,
-                ),
-            }
-        )
-
-    def _set_seed(self, seed: Optional[int]):
-        # Set global seed.
-        set_global_seed(seed)
-        # Set PushTImageEnv seed as it relies on it's own internal _seed attribute.
-        self._env.seed(seed)
--- a/lerobot/common/envs/pusht/pusht_env.py
+++ b/lerobot/common/envs/pusht/pusht_env.py
@@ -1,378 +0,0 @@
-import collections
-
-import cv2
-import gymnasium as gym
-import numpy as np
-import pygame
-import pymunk
-import pymunk.pygame_util
-import shapely.geometry as sg
-import skimage.transform as st
-from gymnasium import spaces
-from pymunk.vec2d import Vec2d
-
-from lerobot.common.envs.pusht.pymunk_override import DrawOptions
-
-
-def pymunk_to_shapely(body, shapes):
-    geoms = []
-    for shape in shapes:
-        if isinstance(shape, pymunk.shapes.Poly):
-            verts = [body.local_to_world(v) for v in shape.get_vertices()]
-            verts += [verts[0]]
-            geoms.append(sg.Polygon(verts))
-        else:
-            raise RuntimeError(f"Unsupported shape type {type(shape)}")
-    geom = sg.MultiPolygon(geoms)
-    return geom
-
-
-class PushTEnv(gym.Env):
-    metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 10}
-    reward_range = (0.0, 1.0)
-
-    def __init__(
-        self,
-        legacy=True,  # compatibility with original
-        block_cog=None,
-        damping=None,
-        render_action=True,
-        render_size=96,
-        reset_to_state=None,
-    ):
-        self._seed = None
-        self.seed()
-        self.window_size = ws = 512  # The size of the PyGame window
-        self.render_size = render_size
-        self.sim_hz = 100
-        # Local controller params.
-        self.k_p, self.k_v = 100, 20  # PD control.z
-        self.control_hz = self.metadata["video.frames_per_second"]
-        # legcay set_state for data compatibility
-        self.legacy = legacy
-
-        # agent_pos, block_pos, block_angle
-        self.observation_space = spaces.Box(
-            low=np.array([0, 0, 0, 0, 0], dtype=np.float64),
-            high=np.array([ws, ws, ws, ws, np.pi * 2], dtype=np.float64),
-            shape=(5,),
-            dtype=np.float64,
-        )
-
-        # positional goal for agent
-        self.action_space = spaces.Box(
-            low=np.array([0, 0], dtype=np.float64),
-            high=np.array([ws, ws], dtype=np.float64),
-            shape=(2,),
-            dtype=np.float64,
-        )
-
-        self.block_cog = block_cog
-        self.damping = damping
-        self.render_action = render_action
-
-        """
-        If human-rendering is used, `self.window` will be a reference
-        to the window that we draw to. `self.clock` will be a clock that is used
-        to ensure that the environment is rendered at the correct framerate in
-        human-mode. They will remain `None` until human-mode is used for the
-        first time.
-        """
-        self.window = None
-        self.clock = None
-        self.screen = None
-
-        self.space = None
-        self.teleop = None
-        self.render_buffer = None
-        self.latest_action = None
-        self.reset_to_state = reset_to_state
-
-    def reset(self):
-        seed = self._seed
-        self._setup()
-        if self.block_cog is not None:
-            self.block.center_of_gravity = self.block_cog
-        if self.damping is not None:
-            self.space.damping = self.damping
-
-        # use legacy RandomState for compatibility
-        state = self.reset_to_state
-        if state is None:
-            rs = np.random.RandomState(seed=seed)
-            state = np.array(
-                [
-                    rs.randint(50, 450),
-                    rs.randint(50, 450),
-                    rs.randint(100, 400),
-                    rs.randint(100, 400),
-                    rs.randn() * 2 * np.pi - np.pi,
-                ]
-            )
-        self._set_state(state)
-
-        observation = self._get_obs()
-        return observation
-
-    def step(self, action):
-        dt = 1.0 / self.sim_hz
-        self.n_contact_points = 0
-        n_steps = self.sim_hz // self.control_hz
-        if action is not None:
-            self.latest_action = action
-            for _ in range(n_steps):
-                # Step PD control.
-                # self.agent.velocity = self.k_p * (act - self.agent.position)    # P control works too.
-                acceleration = self.k_p * (action - self.agent.position) + self.k_v * (
-                    Vec2d(0, 0) - self.agent.velocity
-                )
-                self.agent.velocity += acceleration * dt
-
-                # Step physics.
-                self.space.step(dt)
-
-        # compute reward
-        goal_body = self._get_goal_pose_body(self.goal_pose)
-        goal_geom = pymunk_to_shapely(goal_body, self.block.shapes)
-        block_geom = pymunk_to_shapely(self.block, self.block.shapes)
-
-        intersection_area = goal_geom.intersection(block_geom).area
-        goal_area = goal_geom.area
-        coverage = intersection_area / goal_area
-        reward = np.clip(coverage / self.success_threshold, 0, 1)
-        done = coverage > self.success_threshold
-
-        observation = self._get_obs()
-        info = self._get_info()
-
-        return observation, reward, done, info
-
-    def render(self, mode):
-        return self._render_frame(mode)
-
-    def teleop_agent(self):
-        TeleopAgent = collections.namedtuple("TeleopAgent", ["act"])
-
-        def act(obs):
-            act = None
-            mouse_position = pymunk.pygame_util.from_pygame(Vec2d(*pygame.mouse.get_pos()), self.screen)
-            if self.teleop or (mouse_position - self.agent.position).length < 30:
-                self.teleop = True
-                act = mouse_position
-            return act
-
-        return TeleopAgent(act)
-
-    def _get_obs(self):
-        obs = np.array(
-            tuple(self.agent.position) + tuple(self.block.position) + (self.block.angle % (2 * np.pi),)
-        )
-        return obs
-
-    def _get_goal_pose_body(self, pose):
-        mass = 1
-        inertia = pymunk.moment_for_box(mass, (50, 100))
-        body = pymunk.Body(mass, inertia)
-        # preserving the legacy assignment order for compatibility
-        # the order here doesn't matter somehow, maybe because CoM is aligned with body origin
-        body.position = pose[:2].tolist()
-        body.angle = pose[2]
-        return body
-
-    def _get_info(self):
-        n_steps = self.sim_hz // self.control_hz
-        n_contact_points_per_step = int(np.ceil(self.n_contact_points / n_steps))
-        info = {
-            "pos_agent": np.array(self.agent.position),
-            "vel_agent": np.array(self.agent.velocity),
-            "block_pose": np.array(list(self.block.position) + [self.block.angle]),
-            "goal_pose": self.goal_pose,
-            "n_contacts": n_contact_points_per_step,
-        }
-        return info
-
-    def _render_frame(self, mode):
-        if self.window is None and mode == "human":
-            pygame.init()
-            pygame.display.init()
-            self.window = pygame.display.set_mode((self.window_size, self.window_size))
-        if self.clock is None and mode == "human":
-            self.clock = pygame.time.Clock()
-
-        canvas = pygame.Surface((self.window_size, self.window_size))
-        canvas.fill((255, 255, 255))
-        self.screen = canvas
-
-        draw_options = DrawOptions(canvas)
-
-        # Draw goal pose.
-        goal_body = self._get_goal_pose_body(self.goal_pose)
-        for shape in self.block.shapes:
-            goal_points = [
-                pymunk.pygame_util.to_pygame(goal_body.local_to_world(v), draw_options.surface)
-                for v in shape.get_vertices()
-            ]
-            goal_points += [goal_points[0]]
-            pygame.draw.polygon(canvas, self.goal_color, goal_points)
-
-        # Draw agent and block.
-        self.space.debug_draw(draw_options)
-
-        if mode == "human":
-            # The following line copies our drawings from `canvas` to the visible window
-            self.window.blit(canvas, canvas.get_rect())
-            pygame.event.pump()
-            pygame.display.update()
-
-            # the clock is already ticked during in step for "human"
-
-        img = np.transpose(np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2))
-        img = cv2.resize(img, (self.render_size, self.render_size))
-        if self.render_action and self.latest_action is not None:
-            action = np.array(self.latest_action)
-            coord = (action / 512 * 96).astype(np.int32)
-            marker_size = int(8 / 96 * self.render_size)
-            thickness = int(1 / 96 * self.render_size)
-            cv2.drawMarker(
-                img,
-                coord,
-                color=(255, 0, 0),
-                markerType=cv2.MARKER_CROSS,
-                markerSize=marker_size,
-                thickness=thickness,
-            )
-        return img
-
-    def close(self):
-        if self.window is not None:
-            pygame.display.quit()
-            pygame.quit()
-
-    def seed(self, seed=None):
-        if seed is None:
-            seed = np.random.randint(0, 25536)
-        self._seed = seed
-        self.np_random = np.random.default_rng(seed)
-
-    def _handle_collision(self, arbiter, space, data):
-        self.n_contact_points += len(arbiter.contact_point_set.points)
-
-    def _set_state(self, state):
-        if isinstance(state, np.ndarray):
-            state = state.tolist()
-        pos_agent = state[:2]
-        pos_block = state[2:4]
-        rot_block = state[4]
-        self.agent.position = pos_agent
-        # setting angle rotates with respect to center of mass
-        # therefore will modify the geometric position
-        # if not the same as CoM
-        # therefore should be modified first.
-        if self.legacy:
-            # for compatibility with legacy data
-            self.block.position = pos_block
-            self.block.angle = rot_block
-        else:
-            self.block.angle = rot_block
-            self.block.position = pos_block
-
-        # Run physics to take effect
-        self.space.step(1.0 / self.sim_hz)
-
-    def _set_state_local(self, state_local):
-        agent_pos_local = state_local[:2]
-        block_pose_local = state_local[2:]
-        tf_img_obj = st.AffineTransform(translation=self.goal_pose[:2], rotation=self.goal_pose[2])
-        tf_obj_new = st.AffineTransform(translation=block_pose_local[:2], rotation=block_pose_local[2])
-        tf_img_new = st.AffineTransform(matrix=tf_img_obj.params @ tf_obj_new.params)
-        agent_pos_new = tf_img_new(agent_pos_local)
-        new_state = np.array(list(agent_pos_new[0]) + list(tf_img_new.translation) + [tf_img_new.rotation])
-        self._set_state(new_state)
-        return new_state
-
-    def _setup(self):
-        self.space = pymunk.Space()
-        self.space.gravity = 0, 0
-        self.space.damping = 0
-        self.teleop = False
-        self.render_buffer = []
-
-        # Add walls.
-        walls = [
-            self._add_segment((5, 506), (5, 5), 2),
-            self._add_segment((5, 5), (506, 5), 2),
-            self._add_segment((506, 5), (506, 506), 2),
-            self._add_segment((5, 506), (506, 506), 2),
-        ]
-        self.space.add(*walls)
-
-        # Add agent, block, and goal zone.
-        self.agent = self.add_circle((256, 400), 15)
-        self.block = self.add_tee((256, 300), 0)
-        self.goal_color = pygame.Color("LightGreen")
-        self.goal_pose = np.array([256, 256, np.pi / 4])  # x, y, theta (in radians)
-
-        # Add collision handling
-        self.collision_handeler = self.space.add_collision_handler(0, 0)
-        self.collision_handeler.post_solve = self._handle_collision
-        self.n_contact_points = 0
-
-        self.max_score = 50 * 100
-        self.success_threshold = 0.95  # 95% coverage.
-
-    def _add_segment(self, a, b, radius):
-        shape = pymunk.Segment(self.space.static_body, a, b, radius)
-        shape.color = pygame.Color("LightGray")  # https://htmlcolorcodes.com/color-names
-        return shape
-
-    def add_circle(self, position, radius):
-        body = pymunk.Body(body_type=pymunk.Body.KINEMATIC)
-        body.position = position
-        body.friction = 1
-        shape = pymunk.Circle(body, radius)
-        shape.color = pygame.Color("RoyalBlue")
-        self.space.add(body, shape)
-        return body
-
-    def add_box(self, position, height, width):
-        mass = 1
-        inertia = pymunk.moment_for_box(mass, (height, width))
-        body = pymunk.Body(mass, inertia)
-        body.position = position
-        shape = pymunk.Poly.create_box(body, (height, width))
-        shape.color = pygame.Color("LightSlateGray")
-        self.space.add(body, shape)
-        return body
-
-    def add_tee(self, position, angle, scale=30, color="LightSlateGray", mask=None):
-        if mask is None:
-            mask = pymunk.ShapeFilter.ALL_MASKS()
-        mass = 1
-        length = 4
-        vertices1 = [
-            (-length * scale / 2, scale),
-            (length * scale / 2, scale),
-            (length * scale / 2, 0),
-            (-length * scale / 2, 0),
-        ]
-        inertia1 = pymunk.moment_for_poly(mass, vertices=vertices1)
-        vertices2 = [
-            (-scale / 2, scale),
-            (-scale / 2, length * scale),
-            (scale / 2, length * scale),
-            (scale / 2, scale),
-        ]
-        inertia2 = pymunk.moment_for_poly(mass, vertices=vertices1)
-        body = pymunk.Body(mass, inertia1 + inertia2)
-        shape1 = pymunk.Poly(body, vertices1)
-        shape2 = pymunk.Poly(body, vertices2)
-        shape1.color = pygame.Color(color)
-        shape2.color = pygame.Color(color)
-        shape1.filter = pymunk.ShapeFilter(mask=mask)
-        shape2.filter = pymunk.ShapeFilter(mask=mask)
-        body.center_of_gravity = (shape1.center_of_gravity + shape2.center_of_gravity) / 2
-        body.position = position
-        body.angle = angle
-        body.friction = 1
-        self.space.add(body, shape1, shape2)
-        return body
--- a/lerobot/common/envs/pusht/pusht_image_env.py
+++ b/lerobot/common/envs/pusht/pusht_image_env.py
@@ -1,41 +0,0 @@
-import numpy as np
-from gymnasium import spaces
-
-from lerobot.common.envs.pusht.pusht_env import PushTEnv
-
-
-class PushTImageEnv(PushTEnv):
-    metadata = {"render.modes": ["rgb_array"], "video.frames_per_second": 10}
-
-    # Note: legacy defaults to True for compatibility with original
-    def __init__(self, legacy=True, block_cog=None, damping=None, render_size=96):
-        super().__init__(
-            legacy=legacy, block_cog=block_cog, damping=damping, render_size=render_size, render_action=False
-        )
-        ws = self.window_size
-        self.observation_space = spaces.Dict(
-            {
-                "image": spaces.Box(low=0, high=1, shape=(3, render_size, render_size), dtype=np.float32),
-                "agent_pos": spaces.Box(low=0, high=ws, shape=(2,), dtype=np.float32),
-            }
-        )
-        self.render_cache = None
-
-    def _get_obs(self):
-        img = super()._render_frame(mode="rgb_array")
-
-        agent_pos = np.array(self.agent.position)
-        img_obs = np.moveaxis(img, -1, 0)
-        obs = {"image": img_obs, "agent_pos": agent_pos}
-
-        self.render_cache = img
-
-        return obs
-
-    def render(self, mode):
-        assert mode == "rgb_array"
-
-        if self.render_cache is None:
-            self._get_obs()
-
-        return self.render_cache
--- a/lerobot/common/envs/pusht/pymunk_override.py
+++ b/lerobot/common/envs/pusht/pymunk_override.py
@@ -1,244 +0,0 @@
-# ----------------------------------------------------------------------------
-# pymunk
-# Copyright (c) 2007-2016 Victor Blomqvist
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-# ----------------------------------------------------------------------------
-
-"""This submodule contains helper functions to help with quick prototyping
-using pymunk together with pygame.
-
-Intended to help with debugging and prototyping, not for actual production use
-in a full application. The methods contained in this module is opinionated
-about your coordinate system and not in any way optimized.
-"""
-
-__docformat__ = "reStructuredText"
-
-__all__ = [
-    "DrawOptions",
-    "get_mouse_pos",
-    "to_pygame",
-    "from_pygame",
-    # "lighten",
-    "positive_y_is_up",
-]
-
-from typing import Sequence, Tuple
-
-import numpy as np
-import pygame
-import pymunk
-from pymunk.space_debug_draw_options import SpaceDebugColor
-from pymunk.vec2d import Vec2d
-
-positive_y_is_up: bool = False
-"""Make increasing values of y point upwards.
-
-When True::
-
-    y
-    ^
-    |      . (3, 3)
-    |
-    |   . (2, 2)
-    |
-    +------ > x
-
-When False::
-
-    +------ > x
-    |
-    |   . (2, 2)
-    |
-    |      . (3, 3)
-    v
-    y
-
-"""
-
-
-class DrawOptions(pymunk.SpaceDebugDrawOptions):
-    def __init__(self, surface: pygame.Surface) -> None:
-        """Draw a pymunk.Space on a pygame.Surface object.
-
-        Typical usage::
-
-        >>> import pymunk
-        >>> surface = pygame.Surface((10,10))
-        >>> space = pymunk.Space()
-        >>> options = pymunk.pygame_util.DrawOptions(surface)
-        >>> space.debug_draw(options)
-
-        You can control the color of a shape by setting shape.color to the color
-        you want it drawn in::
-
-        >>> c = pymunk.Circle(None, 10)
-        >>> c.color = pygame.Color("pink")
-
-        See pygame_util.demo.py for a full example
-
-        Since pygame uses a coordinate system where y points down (in contrast
-        to many other cases), you either have to make the physics simulation
-        with Pymunk also behave in that way, or flip everything when you draw.
-
-        The easiest is probably to just make the simulation behave the same
-        way as Pygame does. In that way all coordinates used are in the same
-        orientation and easy to reason about::
-
-        >>> space = pymunk.Space()
-        >>> space.gravity = (0, -1000)
-        >>> body = pymunk.Body()
-        >>> body.position = (0, 0) # will be positioned in the top left corner
-        >>> space.debug_draw(options)
-
-        To flip the drawing its possible to set the module property
-        :py:data:`positive_y_is_up` to True. Then the pygame drawing will flip
-        the simulation upside down before drawing::
-
-        >>> positive_y_is_up = True
-        >>> body = pymunk.Body()
-        >>> body.position = (0, 0)
-        >>> # Body will be position in bottom left corner
-
-        :Parameters:
-                surface : pygame.Surface
-                    Surface that the objects will be drawn on
-        """
-        self.surface = surface
-        super().__init__()
-
-    def draw_circle(
-        self,
-        pos: Vec2d,
-        angle: float,
-        radius: float,
-        outline_color: SpaceDebugColor,
-        fill_color: SpaceDebugColor,
-    ) -> None:
-        p = to_pygame(pos, self.surface)
-
-        pygame.draw.circle(self.surface, fill_color.as_int(), p, round(radius), 0)
-        pygame.draw.circle(self.surface, light_color(fill_color).as_int(), p, round(radius - 4), 0)
-
-        # circle_edge = pos + Vec2d(radius, 0).rotated(angle)
-        # p2 = to_pygame(circle_edge, self.surface)
-        # line_r = 2 if radius > 20 else 1
-        # pygame.draw.lines(self.surface, outline_color.as_int(), False, [p, p2], line_r)
-
-    def draw_segment(self, a: Vec2d, b: Vec2d, color: SpaceDebugColor) -> None:
-        p1 = to_pygame(a, self.surface)
-        p2 = to_pygame(b, self.surface)
-
-        pygame.draw.aalines(self.surface, color.as_int(), False, [p1, p2])
-
-    def draw_fat_segment(
-        self,
-        a: Tuple[float, float],
-        b: Tuple[float, float],
-        radius: float,
-        outline_color: SpaceDebugColor,
-        fill_color: SpaceDebugColor,
-    ) -> None:
-        p1 = to_pygame(a, self.surface)
-        p2 = to_pygame(b, self.surface)
-
-        r = round(max(1, radius * 2))
-        pygame.draw.lines(self.surface, fill_color.as_int(), False, [p1, p2], r)
-        if r > 2:
-            orthog = [abs(p2[1] - p1[1]), abs(p2[0] - p1[0])]
-            if orthog[0] == 0 and orthog[1] == 0:
-                return
-            scale = radius / (orthog[0] * orthog[0] + orthog[1] * orthog[1]) ** 0.5
-            orthog[0] = round(orthog[0] * scale)
-            orthog[1] = round(orthog[1] * scale)
-            points = [
-                (p1[0] - orthog[0], p1[1] - orthog[1]),
-                (p1[0] + orthog[0], p1[1] + orthog[1]),
-                (p2[0] + orthog[0], p2[1] + orthog[1]),
-                (p2[0] - orthog[0], p2[1] - orthog[1]),
-            ]
-            pygame.draw.polygon(self.surface, fill_color.as_int(), points)
-            pygame.draw.circle(
-                self.surface,
-                fill_color.as_int(),
-                (round(p1[0]), round(p1[1])),
-                round(radius),
-            )
-            pygame.draw.circle(
-                self.surface,
-                fill_color.as_int(),
-                (round(p2[0]), round(p2[1])),
-                round(radius),
-            )
-
-    def draw_polygon(
-        self,
-        verts: Sequence[Tuple[float, float]],
-        radius: float,
-        outline_color: SpaceDebugColor,
-        fill_color: SpaceDebugColor,
-    ) -> None:
-        ps = [to_pygame(v, self.surface) for v in verts]
-        ps += [ps[0]]
-
-        radius = 2
-        pygame.draw.polygon(self.surface, light_color(fill_color).as_int(), ps)
-
-        if radius > 0:
-            for i in range(len(verts)):
-                a = verts[i]
-                b = verts[(i + 1) % len(verts)]
-                self.draw_fat_segment(a, b, radius, fill_color, fill_color)
-
-    def draw_dot(self, size: float, pos: Tuple[float, float], color: SpaceDebugColor) -> None:
-        p = to_pygame(pos, self.surface)
-        pygame.draw.circle(self.surface, color.as_int(), p, round(size), 0)
-
-
-def get_mouse_pos(surface: pygame.Surface) -> Tuple[int, int]:
-    """Get position of the mouse pointer in pymunk coordinates."""
-    p = pygame.mouse.get_pos()
-    return from_pygame(p, surface)
-
-
-def to_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]:
-    """Convenience method to convert pymunk coordinates to pygame surface
-    local coordinates.
-
-    Note that in case positive_y_is_up is False, this function won't actually do
-    anything except converting the point to integers.
-    """
-    if positive_y_is_up:
-        return round(p[0]), surface.get_height() - round(p[1])
-    else:
-        return round(p[0]), round(p[1])
-
-
-def from_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]:
-    """Convenience method to convert pygame surface local coordinates to
-    pymunk coordinates
-    """
-    return to_pygame(p, surface)
-
-
-def light_color(color: SpaceDebugColor):
-    color = np.minimum(1.2 * np.float32([color.r, color.g, color.b, color.a]), np.float32([255]))
-    color = SpaceDebugColor(r=color[0], g=color[1], b=color[2], a=color[3])
-    return color
--- a/lerobot/common/envs/simxarm.py
+++ b/lerobot/common/envs/simxarm.py
@@ -0,0 +1,234 @@
+import importlib.util
+from typing import Optional
+
+import numpy as np
+import torch
+from tensordict import TensorDict
+from torchrl.data.tensor_specs import (
+    BoundedTensorSpec,
+    CompositeSpec,
+    DiscreteTensorSpec,
+    UnboundedContinuousTensorSpec,
+)
+from torchrl.envs import EnvBase
+from torchrl.envs.libs.gym import _gym_to_torchrl_spec_transform
+
+from lerobot.common.utils import set_seed
+
+# Length of the action vector handed to the wrapped env's `step`; shorter task
+# actions are padded up to it (see `_action_padding`).
+MAX_NUM_ACTIONS = 4
+
+_has_gym = importlib.util.find_spec("gym") is not None
+_has_simxarm = importlib.util.find_spec("simxarm") is not None and _has_gym
+
+
+class SimxarmEnv(EnvBase):
+    """TorchRL `EnvBase` wrapper around a task from the `simxarm` package.
+
+    Observations live under the "observation" key and hold an "image" entry
+    (uint8, channels first) and/or a "state" entry (float32), depending on
+    `from_pixels` and `pixels_only`.
+    """
+
+    def __init__(
+        self,
+        task,
+        frame_skip: int = 1,
+        from_pixels: bool = False,
+        pixels_only: bool = False,
+        image_size=None,
+        seed=1337,
+        device="cpu",
+    ):
+        """Instantiate the wrapped task, build the specs and seed the env.
+
+        Args:
+            task: key of `simxarm.TASKS` selecting the env to instantiate.
+            frame_skip: how many times `_step` repeats each action; rewards
+                are summed over the repeats. Must be >= 1.
+            from_pixels: if True, observations contain a rendered image.
+            pixels_only: if True, omit the "state" entry (needs `from_pixels`).
+            image_size: side of the square rendered image; required when
+                `from_pixels` is True.
+            seed: value forwarded to `self.set_seed`.
+            device: device given to `EnvBase` and to every spec.
+
+        Raises:
+            ValueError: if `frame_skip` < 1 or `task` is not in `simxarm.TASKS`.
+            ImportError: if `gym` or `simxarm` cannot be found.
+        """
+        if frame_skip < 1:
+            # `_step` must step the env at least once to have an observation to return.
+            raise ValueError(f"frame_skip must be >= 1, got {frame_skip}")
+
+        super().__init__(device=device, batch_size=[])
+        self.task = task
+        self.frame_skip = frame_skip
+        self.from_pixels = from_pixels
+        self.pixels_only = pixels_only
+        self.image_size = image_size
+
+        if pixels_only:
+            assert from_pixels
+        if from_pixels:
+            assert image_size
+
+        # `_has_simxarm` is also False when gym is missing, so test gym first to
+        # name the dependency that is actually absent.
+        if not _has_gym:
+            raise ImportError("Cannot import gym.")
+        if not _has_simxarm:
+            raise ImportError("Cannot import simxarm.")
+
+        import gym
+        from simxarm import TASKS
+
+        if self.task not in TASKS:
+            raise ValueError(f"Unknown task {self.task}. Must be one of {list(TASKS.keys())}")
+
+        self._env = TASKS[self.task]["env"]()
+
+        num_actions = len(TASKS[self.task]["action_space"])
+        self._action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(num_actions,))
+        # Constant tail appended to every action in `_step`.
+        self._action_padding = np.zeros((MAX_NUM_ACTIONS - num_actions), dtype=np.float32)
+        if "w" not in TASKS[self.task]["action_space"]:
+            # NOTE(review): presumably "w" is the gripper axis and 1.0 pins it — confirm.
+            self._action_padding[-1] = 1.0
+
+        self._make_spec()
+        self.set_seed(seed)
+
+    def render(self, mode="rgb_array", width=384, height=384):
+        """Forward the render call to the wrapped env."""
+        return self._env.render(mode, width=width, height=height)
+
+    def _format_raw_obs(self, raw_obs):
+        """Build the observation TensorDict for the current env state.
+
+        `raw_obs` is only read when `from_pixels` is False; otherwise the image
+        is rendered and the state is taken from `self._env.robot_state`.
+        """
+        if self.from_pixels:
+            image = self.render(mode="rgb_array", width=self.image_size, height=self.image_size)
+            image = image.transpose(2, 0, 1)  # (H, W, C) -> (C, H, W)
+            image = torch.tensor(image.copy(), dtype=torch.uint8)
+
+            obs = {"image": image}
+
+            if not self.pixels_only:
+                obs["state"] = torch.tensor(self._env.robot_state, dtype=torch.float32)
+        else:
+            obs = {"state": torch.tensor(raw_obs["observation"], dtype=torch.float32)}
+
+        obs = TensorDict(obs, batch_size=[])
+        return obs
+
+    def _reset(self, tensordict: Optional[TensorDict] = None):
+        """Reset the wrapped env and return the initial observation with `done=False`.
+
+        Raises NotImplementedError when given a non-empty tensordict.
+        """
+        td = tensordict
+        if td is None or td.is_empty():
+            raw_obs = self._env.reset()
+
+            td = TensorDict(
+                {
+                    "observation": self._format_raw_obs(raw_obs),
+                    "done": torch.tensor([False], dtype=torch.bool),
+                },
+                batch_size=[],
+            )
+        else:
+            raise NotImplementedError()
+        return td
+
+    def _step(self, tensordict: TensorDict):
+        """Apply `tensordict["action"]` to the wrapped env `frame_skip` times.
+
+        Returns a TensorDict with the observation, `done` and `success` of the
+        last env step and the reward summed over all repeats.
+        """
+        td = tensordict
+        # Detach and move to CPU first: `Tensor.numpy()` rejects tensors that
+        # require grad or live on another device.
+        action = td["action"].detach().cpu().numpy()
+        # step expects shape=(4,) so we pad if necessary
+        action = np.concatenate([action, self._action_padding])
+        # TODO(rcadene): add info["is_success"] and info["success"] ?
+        sum_reward = 0
+        for _ in range(self.frame_skip):
+            raw_obs, reward, done, info = self._env.step(action)
+            sum_reward += reward
+
+        td = TensorDict(
+            {
+                "observation": self._format_raw_obs(raw_obs),
+                "reward": torch.tensor([sum_reward], dtype=torch.float32),
+                "done": torch.tensor([done], dtype=torch.bool),
+                "success": torch.tensor([info["success"]], dtype=torch.bool),
+            },
+            batch_size=[],
+        )
+        return td
+
+    def _make_spec(self):
+        """Declare the observation, action, reward and done specs."""
+        obs = {}
+        if self.from_pixels:
+            obs["image"] = BoundedTensorSpec(
+                low=0,
+                high=255,
+                shape=(3, self.image_size, self.image_size),
+                dtype=torch.uint8,
+                device=self.device,
+            )
+            if not self.pixels_only:
+                obs["state"] = UnboundedContinuousTensorSpec(
+                    shape=(len(self._env.robot_state),),
+                    dtype=torch.float32,
+                    device=self.device,
+                )
+        else:
+            # TODO(rcadene): add observation_space achieved_goal and desired_goal?
+            obs["state"] = UnboundedContinuousTensorSpec(
+                shape=self._env.observation_space["observation"].shape,
+                dtype=torch.float32,
+                device=self.device,
+            )
+        self.observation_spec = CompositeSpec({"observation": obs})
+
+        self.action_spec = _gym_to_torchrl_spec_transform(
+            self._action_space,
+            device=self.device,
+        )
+
+        self.reward_spec = UnboundedContinuousTensorSpec(
+            shape=(1,),
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+        self.done_spec = CompositeSpec(
+            {
+                "done": DiscreteTensorSpec(
+                    2,
+                    shape=(1,),
+                    dtype=torch.bool,
+                    device=self.device,
+                ),
+                "success": DiscreteTensorSpec(
+                    2,
+                    shape=(1,),
+                    dtype=torch.bool,
+                    device=self.device,
+                ),
+            }
+        )
+
+    def _set_seed(self, seed: Optional[int]):
+        """Call `set_seed` and seed the wrapped env with the same value."""
+        set_seed(seed)
+        self._env.seed(seed)
--- a/lerobot/common/envs/simxarm/simxarm/init.py
+++ b/lerobot/common/envs/simxarm/simxarm/init.py
@@ -1,166 +0,0 @@
-from collections import OrderedDict, deque
-
-import gymnasium as gym
-import numpy as np
-from gymnasium.wrappers import TimeLimit
-
-from lerobot.common.envs.simxarm.simxarm.tasks.base import Base as Base
-from lerobot.common.envs.simxarm.simxarm.tasks.lift import Lift
-from lerobot.common.envs.simxarm.simxarm.tasks.peg_in_box import PegInBox
-from lerobot.common.envs.simxarm.simxarm.tasks.push import Push
-from lerobot.common.envs.simxarm.simxarm.tasks.reach import Reach
-
-TASKS = OrderedDict(
-    (
-        (
-            "reach",
-            {
-                "env": Reach,
-                "action_space": "xyz",
-                "episode_length": 50,
-                "description": "Reach a target location with the end effector",
-            },
-        ),
-        (
-            "push",
-            {
-                "env": Push,
-                "action_space": "xyz",
-                "episode_length": 50,
-                "description": "Push a cube to a target location",
-            },
-        ),
-        (
-            "peg_in_box",
-            {
-                "env": PegInBox,
-                "action_space": "xyz",
-                "episode_length": 50,
-                "description": "Insert a peg into a box",
-            },
-        ),
-        (
-            "lift",
-            {
-                "env": Lift,
-                "action_space": "xyzw",
-                "episode_length": 50,
-                "description": "Lift a cube above a height threshold",
-            },
-        ),
-    )
-)
-
-
-class SimXarmWrapper(gym.Wrapper):
-    """
-    A wrapper for the SimXarm environments. This wrapper is used to
-    convert the action and observation spaces to the correct format.
-    """
-
-    def __init__(self, env, task, obs_mode, image_size, action_repeat, frame_stack=1, channel_last=False):
-        super().__init__(env)
-        self._env = env
-        self.obs_mode = obs_mode
-        self.image_size = image_size
-        self.action_repeat = action_repeat
-        self.frame_stack = frame_stack
-        self._frames = deque([], maxlen=frame_stack)
-        self.channel_last = channel_last
-        self._max_episode_steps = task["episode_length"] // action_repeat
-
-        image_shape = (
-            (image_size, image_size, 3 * frame_stack)
-            if channel_last
-            else (3 * frame_stack, image_size, image_size)
-        )
-        if obs_mode == "state":
-            self.observation_space = env.observation_space["observation"]
-        elif obs_mode == "rgb":
-            self.observation_space = gym.spaces.Box(low=0, high=255, shape=image_shape, dtype=np.uint8)
-        elif obs_mode == "all":
-            self.observation_space = gym.spaces.Dict(
-                state=gym.spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32),
-                rgb=gym.spaces.Box(low=0, high=255, shape=image_shape, dtype=np.uint8),
-            )
-        else:
-            raise ValueError(f"Unknown obs_mode {obs_mode}. Must be one of [rgb, all, state]")
-        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(len(task["action_space"]),))
-        self.action_padding = np.zeros(4 - len(task["action_space"]), dtype=np.float32)
-        if "w" not in task["action_space"]:
-            self.action_padding[-1] = 1.0
-
-    def _render_obs(self):
-        obs = self.render(mode="rgb_array", width=self.image_size, height=self.image_size)
-        if not self.channel_last:
-            obs = obs.transpose(2, 0, 1)
-        return obs.copy()
-
-    def _update_frames(self, reset=False):
-        pixels = self._render_obs()
-        self._frames.append(pixels)
-        if reset:
-            for _ in range(1, self.frame_stack):
-                self._frames.append(pixels)
-        assert len(self._frames) == self.frame_stack
-
-    def transform_obs(self, obs, reset=False):
-        if self.obs_mode == "state":
-            return obs["observation"]
-        elif self.obs_mode == "rgb":
-            self._update_frames(reset=reset)
-            rgb_obs = np.concatenate(list(self._frames), axis=-1 if self.channel_last else 0)
-            return rgb_obs
-        elif self.obs_mode == "all":
-            self._update_frames(reset=reset)
-            rgb_obs = np.concatenate(list(self._frames), axis=-1 if self.channel_last else 0)
-            return OrderedDict((("rgb", rgb_obs), ("state", self.robot_state)))
-        else:
-            raise ValueError(f"Unknown obs_mode {self.obs_mode}. Must be one of [rgb, all, state]")
-
-    def reset(self):
-        return self.transform_obs(self._env.reset(), reset=True)
-
-    def step(self, action):
-        action = np.concatenate([action, self.action_padding])
-        reward = 0.0
-        for _ in range(self.action_repeat):
-            obs, r, done, info = self._env.step(action)
-            reward += r
-        return self.transform_obs(obs), reward, done, info
-
-    def render(self, mode="rgb_array", width=384, height=384, **kwargs):
-        return self._env.render(mode, width=width, height=height)
-
-    @property
-    def state(self):
-        return self._env.robot_state
-
-
-def make(task, obs_mode="state", image_size=84, action_repeat=1, frame_stack=1, channel_last=False, seed=0):
-    """
-    Create a new environment.
-    Args:
-            task (str): The task to create an environment for. Must be one of:
-                    - 'reach'
-                    - 'push'
-                    - 'peg-in-box'
-                    - 'lift'
-            obs_mode (str): The observation mode to use. Must be one of:
-                    - 'state': Only state observations
-                    - 'rgb': RGB images
-                    - 'all': RGB images and state observations
-            image_size (int): The size of the image observations
-            action_repeat (int): The number of times to repeat the action
-            seed (int): The random seed to use
-    Returns:
-            gym.Env: The environment
-    """
-    if task not in TASKS:
-        raise ValueError(f"Unknown task {task}. Must be one of {list(TASKS.keys())}")
-    env = TASKS[task]["env"]()
-    env = TimeLimit(env, TASKS[task]["episode_length"])
-    env = SimXarmWrapper(env, TASKS[task], obs_mode, image_size, action_repeat, frame_stack, channel_last)
-    env.seed(seed)
-
-    return env
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/lift.xml
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/lift.xml
@@ -1,53 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<mujoco>
-	<compiler angle="radian" coordinate="local" meshdir="mesh" texturedir="texture"></compiler>
-	<size nconmax="2000" njmax="500"/>
-
-	<option timestep="0.002">
-		<flag warmstart="enable"></flag>
-	</option>
-
-	<include file="shared.xml"></include>
-
-	<worldbody>
-		<body name="floor0" pos="0 0 0">
-			<geom name="floorgeom0" pos="1.2 -2.0 0" size="20.0 20.0 1" type="plane" condim="3" material="floor_mat"></geom>
-		</body>
-
-		<include file="xarm.xml"></include>
-
-		<body pos="0.75 0 0.6325" name="pedestal0">
-			<geom name="pedestalgeom0" size="0.1 0.1 0.01" pos="0.32 0.27 0" type="box" mass="2000" material="pedestal_mat"></geom>
-			<site pos="0.30 0.30 0" size="0.075 0.075 0.002" type="box" name="robotmountsite0" rgba="0.55 0.54 0.53 1" />
-		</body>
-
-		<body pos="1.5 0.075 0.3425" name="table0">
-			<geom name="tablegeom0" size="0.3 0.6 0.2" pos="0 0 0" type="box" material="table_mat" density="2000" friction="1 1 1"></geom>
-		</body>
-
-		<body name="object" pos="1.405 0.3 0.58625">
-			<joint name="object_joint0" type="free" limited="false"></joint>
-			<geom size="0.035 0.035 0.035" type="box" name="object0" material="block_mat" density="50000" condim="4" friction="1 1 1" solimp="1 1 1" solref="0.02 1"></geom>
-			<site name="object_site" pos="0 0 0" size="0.035 0.035 0.035" rgba="1 0 0 0" type="box"></site>
-		</body>
-
-		<light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="1.65 0 10" dir="-0.57 -0.57 -0.57" name="light0"></light>
-        <light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="0 -4 4" dir="0 1 -0.1" name="light1"></light>
-        <light directional="true" ambient="0.05 0.05 0.05" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="2.13 1.6 2.5" name="light2"></light>
-        <light pos="0 0 2" dir="0.2 0.2 -0.8" directional="true"  diffuse="0.3 0.3 0.3"  castshadow="false" name="light3"></light>
-
-		<camera fovy="50" name="camera0" pos="0.9559 1.0 1.1" euler="-1.1 -0.6 3.4" />
-	</worldbody>
-
-	<equality>
-    	<connect body2="left_finger" body1="left_inner_knuckle" anchor="0.0 0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-    	<connect body2="right_finger" body1="right_inner_knuckle" anchor="0.0 -0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-        <joint joint1="left_inner_knuckle_joint" joint2="right_inner_knuckle_joint"></joint>
-    </equality>
-
-    <actuator>
-		<motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="left_inner_knuckle_joint" gear="200.0"/>
-        <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="right_inner_knuckle_joint" gear="200.0"/>
-    </actuator>
-</mujoco>
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/base_link.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/base_link.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:21fb81ae7fba19e3c6b2d2ca60c8051712ba273357287eb5a397d92d61c7a736
-size 1211434
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/block_inner.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/block_inner.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be68ce180d11630a667a5f37f4dffcc3feebe4217d4bb3912c813b6d9ca3ec66
-size 3284
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/block_inner2.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/block_inner2.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2c6448552bf6b1c4f17334d686a5320ce051bcdfe31431edf69303d8a570d1de
-size 3284
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/block_outer.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/block_outer.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:748b9e197e6521914f18d1f6383a36f211136b3f33f2ad2a8c11b9f921c2cf86
-size 6284
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/left_finger.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/left_finger.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a44756eb72f9c214cb37e61dc209cd7073fdff3e4271a7423476ef6fd090d2d4
-size 242684
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/left_inner_knuckle.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/left_inner_knuckle.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e8e48692ad26837bb3d6a97582c89784d09948fc09bfe4e5a59017859ff04dac
-size 366284
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/left_outer_knuckle.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/left_outer_knuckle.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:501665812b08d67e764390db781e839adc6896a9540301d60adf606f57648921
-size 22284
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link1.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link1.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:34b541122df84d2ef5fcb91b715eb19659dc15ad8d44a191dde481f780265636
-size 184184
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link2.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link2.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:61e641cd47c169ecef779683332e00e4914db729bf02dfb61bfbe69351827455
-size 225584
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link3.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link3.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e2798e7946dd70046c95455d5ba96392d0b54a6069caba91dc4ca66e1379b42
-size 237084
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link4.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link4.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c757fee95f873191a0633c355c07a360032960771cabbd7593a6cdb0f1ffb089
-size 243684
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link5.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link5.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:715ad5787c5dab57589937fd47289882707b5e1eb997e340d567785b02f4ec90
-size 229084
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link6.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link6.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:85b320aa420497827223d16d492bba8de091173374e361396fc7a5dad7bdb0cb
-size 399384
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link7.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link7.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:97115d848fbf802cb770cd9be639ae2af993103b9d9bbb0c50c943c738a36f18
-size 231684
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link_base.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/link_base.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f6fcbc18258090eb56c21cfb17baa5ae43abc98b1958cd366f3a73b9898fc7f0
-size 2106184
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/right_finger.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/right_finger.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c5dee87c7f37baf554b8456ebfe0b3e8ed0b22b8938bd1add6505c2ad6d32c7d
-size 242684
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/right_inner_knuckle.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/right_inner_knuckle.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b41dd2c2c550281bf78d7cc6fa117b14786700e5c453560a0cb5fd6dfa0ffb3e
-size 366284
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/right_outer_knuckle.stl
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/mesh/right_outer_knuckle.stl
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:75ca1107d0a42a0f03802a9a49cab48419b31851ee8935f8f1ca06be1c1c91e8
-size 22284
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/peg_in_box.xml
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/peg_in_box.xml
@@ -1,74 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<mujoco>
-	<compiler angle="radian" coordinate="local" meshdir="mesh" texturedir="texture"></compiler>
-	<size nconmax="2000" njmax="500"/>
-
-	<option timestep="0.001">
-		<flag warmstart="enable"></flag>
-	</option>
-
-	<include file="shared.xml"></include>
-
-	<worldbody>
-		<body name="floor0" pos="0 0 0">
-			<geom name="floorgeom0" pos="1.2 -2.0 0" size="1.0 10.0 1" type="plane" condim="3" material="floor_mat"></geom>
-		</body>
-
-		<include file="xarm.xml"></include>
-
-		<body pos="0.75 0 0.6325" name="pedestal0">
-			<geom name="pedestalgeom0" size="0.1 0.1 0.01" pos="0.32 0.27 0" type="box" mass="2000" material="pedestal_mat"></geom>
-			<site pos="0.30 0.30 0" size="0.075 0.075 0.002" type="box" name="robotmountsite0" rgba="0.55 0.54 0.53 1" />
-		</body>
-
-		<body pos="1.5 0.075 0.3425" name="table0">
-			<geom name="tablegeom0" size="0.3 0.6 0.2" pos="0 0 0" type="box" material="table_mat" density="2000" friction="1 0.005 0.0002"></geom>
-		</body>
-
-		<body name="box0" pos="1.605 0.25 0.55">
-			<joint name="box_joint0" type="free" limited="false"></joint>
-			<site name="box_site" pos="0 0.075 -0.01" size="0.02" rgba="0 0 0 0" type="sphere"></site>
-			<geom name="box_side0" pos="0 0 0" size="0.065 0.002 0.04" type= "box" rgba="0.8 0.1 0.1 1" mass ="1" condim="4" />
-			<geom name="box_side1" pos="0 0.149 0" size="0.065 0.002 0.04" type="box" rgba="0.9 0.2 0.2 1" mass ="2" condim="4" />
-			<geom name="box_side2" pos="0.064 0.074 0" size="0.002 0.075 0.04" type="box" rgba="0.8 0.1 0.1 1" mass ="2" condim="4" />
-			<geom name="box_side3" pos="-0.064 0.074 0" size="0.002 0.075 0.04" type="box" rgba="0.9 0.2 0.2 1" mass ="2" condim="4" />
-			<geom name="box_side4" pos="-0 0.074 -0.038" size="0.065 0.075 0.002" type="box" rgba="0.5 0 0 1" mass ="2" condim="4"/>
-		</body>
-
-		<body name="object0" pos="1.4 0.25 0.65">
-			<joint name="object_joint0" type="free" limited="false"></joint>
-			<geom name="object_target0" type="cylinder" pos="0 0 -0.05" size="0.03 0.035" rgba="0.6 0.8 0.5 1" mass ="0.1" condim="3"  />
-			<site name="object_site" pos="0 0 -0.05" size="0.0325 0.0375" rgba="0 0 0 0" type="cylinder"></site>
-			<body name="B0" pos="0 0 0" euler="0 0 0 ">
-				<joint name="B0:joint" type="slide" limited="true" axis="0 0 1" damping="0.05" range="0.0001 0.0001001" solimpfriction="0.98 0.98 0.95" frictionloss="1"></joint>
-				<geom type="capsule" size="0.002 0.03" rgba="0 0 0 1" mass="0.001" condim="4"/>
-				<body name="B1" pos="0 0 0.04" euler="0 3.14 0 ">
-					<joint name="B1:joint1" type="hinge" axis="1 0 0" range="-0.1 0.1"  frictionloss="1"></joint>
-					<joint name="B1:joint2" type="hinge" axis="0 1 0" range="-0.1 0.1"  frictionloss="1"></joint>
-					<joint name="B1:joint3" type="hinge" axis="0 0 1" range="-0.1 0.1"  frictionloss="1"></joint>
-					<geom type="capsule" size="0.002 0.004" rgba="1 0 0 0" mass="0.001" condim="4"/>
-				</body>
-			</body>
-		</body>
-
-		<light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="1.65 0 10" dir="-0.57 -0.57 -0.57" name="light0"></light>
-		<light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="0 -4 4" dir="0 1 -0.1" name="light1"></light>
-		<light directional="true" ambient="0.05 0.05 0.05" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="2.13 1.6 2.5" name="light2"></light>
-		<light pos="0 0 2" dir="0.2 0.2 -0.8" directional="true"  diffuse="0.3 0.3 0.3"  castshadow="false" name="light3"></light>
-
-		<camera fovy="50" name="camera0" pos="0.9559 1.0 1.1" euler="-1.1 -0.6 3.4" />
-	</worldbody>
-
-	<equality>
-		<connect body2="left_finger" body1="left_inner_knuckle" anchor="0.0 0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-		<connect body2="right_finger" body1="right_inner_knuckle" anchor="0.0 -0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-		<weld body1="right_hand" body2="B1" solimp="0.99 0.99 0.99" solref="0.02 1"></weld>
-		<joint joint1="left_inner_knuckle_joint" joint2="right_inner_knuckle_joint"></joint>
-	</equality>
-
-	<actuator>
-		<motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="left_inner_knuckle_joint" gear="200.0"/>
-		<motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="right_inner_knuckle_joint" gear="200.0"/>
-	</actuator>
-</mujoco>
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/push.xml
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/push.xml
@@ -1,54 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<mujoco>
-	<compiler angle="radian" coordinate="local" meshdir="mesh" texturedir="texture"></compiler>
-	<size nconmax="2000" njmax="500"/>
-
-	<option timestep="0.002">
-		<flag warmstart="enable"></flag>
-	</option>
-
-	<include file="shared.xml"></include>
-
-	<worldbody>
-		<body name="floor0" pos="0 0 0">
-			<geom name="floorgeom0" pos="1.2 -2.0 0" size="1.0 10.0 1" type="plane" condim="3" material="floor_mat"></geom>
-			<site name="target0" pos="1.565 0.3 0.545" size="0.0475 0.001" rgba="1 0 0 1" type="cylinder"></site>
-		</body>
-
-		<include file="xarm.xml"></include>
-
-		<body pos="0.75 0 0.6325" name="pedestal0">
-			<geom name="pedestalgeom0" size="0.1 0.1 0.01" pos="0.32 0.27 0" type="box" mass="2000" material="pedestal_mat"></geom>
-			<site pos="0.30 0.30 0" size="0.075 0.075 0.002" type="box" name="robotmountsite0" rgba="0.55 0.54 0.53 1" />
-		</body>
-
-		<body pos="1.5 0.075 0.3425" name="table0">
-			<geom name="tablegeom0" size="0.3 0.6 0.2" pos="0 0 0" type="box" material="table_mat" density="2000" friction="1 0.005 0.0002"></geom>
-		</body>
-
-		<body name="object" pos="1.655 0.3 0.68">
-			<joint name="object_joint0" type="free" limited="false"></joint>
-			<geom size="0.024 0.024 0.024" type="box" name="object" material="block_mat" density="50000" condim="4" friction="1 1 1" solimp="1 1 1" solref="0.02 1"></geom>
-			<site name="object_site" pos="0 0 0" size="0.024 0.024 0.024" rgba="0 0 0 0" type="box"></site>
-		</body>
-
-		<light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="1.65 0 10" dir="-0.57 -0.57 -0.57" name="light0"></light>
-        <light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="0 -4 4" dir="0 1 -0.1" name="light1"></light>
-        <light directional="true" ambient="0.05 0.05 0.05" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="2.13 1.6 2.5" name="light2"></light>
-        <light pos="0 0 2" dir="0.2 0.2 -0.8" directional="true"  diffuse="0.3 0.3 0.3"  castshadow="false" name="light3"></light>
-
-		<camera fovy="50" name="camera0" pos="0.9559 1.0 1.1" euler="-1.1 -0.6 3.4" />
-	</worldbody>
-
-	<equality>
-    	<connect body2="left_finger" body1="left_inner_knuckle" anchor="0.0 0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-    	<connect body2="right_finger" body1="right_inner_knuckle" anchor="0.0 -0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-        <joint joint1="left_inner_knuckle_joint" joint2="right_inner_knuckle_joint"></joint>
-    </equality>
-
-    <actuator>
-    	<motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="left_inner_knuckle_joint" gear="200.0"/>
-        <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="right_inner_knuckle_joint" gear="200.0"/>
-    </actuator>
-</mujoco>
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/reach.xml
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/reach.xml
@@ -1,48 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<mujoco>
-	<compiler angle="radian" coordinate="local" meshdir="mesh" texturedir="texture"></compiler>
-	<size nconmax="2000" njmax="500"/>
-
-	<option timestep="0.002">
-		<flag warmstart="enable"></flag>
-	</option>
-
-	<include file="shared.xml"></include>
-
-	<worldbody>
-		<body name="floor0" pos="0 0 0">
-			<geom name="floorgeom0" pos="1.2 -2.0 0" size="1.0 10.0 1" type="plane" condim="3" material="floor_mat"></geom>
-			<site name="target0" pos="1.605 0.3 0.58" size="0.0475 0.001" rgba="1 0 0 1" type="cylinder"></site>
-		</body>
-
-		<include file="xarm.xml"></include>
-
-		<body pos="0.75 0 0.6325" name="pedestal0">
-			<geom name="pedestalgeom0" size="0.1 0.1 0.01" pos="0.32 0.27 0" type="box" mass="2000" material="pedestal_mat"></geom>
-			<site pos="0.30 0.30 0" size="0.075 0.075 0.002" type="box" name="robotmountsite0" rgba="0.55 0.54 0.53 1" />
-		</body>
-
-		<body pos="1.5 0.075 0.3425" name="table0">
-			<geom name="tablegeom0" size="0.3 0.6 0.2" pos="0 0 0" type="box" material="table_mat" density="2000" friction="1 0.005 0.0002"></geom>
-		</body>
-
-		<light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="1.65 0 10" dir="-0.57 -0.57 -0.57" name="light0"></light>
-        <light directional="true" ambient="0.1 0.1 0.1" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="0 -4 4" dir="0 1 -0.1" name="light1"></light>
-        <light directional="true" ambient="0.05 0.05 0.05" diffuse="0 0 0" specular="0 0 0" castshadow="false" pos="2.13 1.6 2.5" name="light2"></light>
-        <light pos="0 0 2" dir="0.2 0.2 -0.8" directional="true"  diffuse="0.3 0.3 0.3"  castshadow="false" name="light3"></light>
-
-		<camera fovy="50" name="camera0" pos="0.9559 1.0 1.1" euler="-1.1 -0.6 3.4" />
-	</worldbody>
-
-	<equality>
-    	<connect body2="left_finger" body1="left_inner_knuckle" anchor="0.0 0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-    	<connect body2="right_finger" body1="right_inner_knuckle" anchor="0.0 -0.035 0.042" solimp="0.9 0.95 0.001 0.5 2" solref="0.0002 1.0" ></connect>
-        <joint joint1="left_inner_knuckle_joint" joint2="right_inner_knuckle_joint"></joint>
-    </equality>
-
-    <actuator>
-    	<motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="left_inner_knuckle_joint" gear="200.0"/>
-        <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="right_inner_knuckle_joint" gear="200.0"/>
-    </actuator>
-</mujoco>
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/shared.xml
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/shared.xml
@@ -1,51 +0,0 @@
-<mujoco>
-    <asset>
-        <texture type="skybox" builtin="gradient" rgb1="0.0 0.0 0.0" rgb2="0.0 0.0 0.0" width="32" height="32"></texture>
-        <material name="floor_mat" specular="0" shininess="0.0" reflectance="0" rgba="0.043 0.055 0.051 1"></material>
-
-        <material name="table_mat" specular="0.2" shininess="0.2" reflectance="0" rgba="1 1 1 1"></material>
-        <material name="pedestal_mat" specular="0.35" shininess="0.5" reflectance="0" rgba="0.705 0.585 0.405 1"></material>
-        <material name="block_mat" specular="0.5" shininess="0.9" reflectance="0.05" rgba="0.373 0.678 0.627 1"></material>
-
-        <material name="robot0:geomMat" shininess="0.03" specular="0.4"></material>
-        <material name="robot0:gripper_finger_mat" shininess="0.03" specular="0.4" reflectance="0"></material>
-        <material name="robot0:gripper_mat" shininess="0.03" specular="0.4" reflectance="0"></material>
-        <material name="background:gripper_mat" shininess="0.03" specular="0.4" reflectance="0"></material>
-        <material name="robot0:arm_mat" shininess="0.03" specular="0.4" reflectance="0"></material>
-        <material name="robot0:head_mat" shininess="0.03" specular="0.4" reflectance="0"></material>
-        <material name="robot0:torso_mat" shininess="0.03" specular="0.4" reflectance="0"></material>
-        <material name="robot0:base_mat" shininess="0.03" specular="0.4" reflectance="0"></material>
-
-        <mesh name="link_base" file="link_base.stl" />
-        <mesh name="link1" file="link1.stl" />
-        <mesh name="link2" file="link2.stl" />
-        <mesh name="link3" file="link3.stl" />
-        <mesh name="link4" file="link4.stl" />
-        <mesh name="link5" file="link5.stl" />
-        <mesh name="link6" file="link6.stl" />
-        <mesh name="link7" file="link7.stl" />
-        <mesh name="base_link" file="base_link.stl" />
-        <mesh name="left_outer_knuckle" file="left_outer_knuckle.stl" />
-        <mesh name="left_finger" file="left_finger.stl" />
-        <mesh name="left_inner_knuckle" file="left_inner_knuckle.stl" />
-        <mesh name="right_outer_knuckle" file="right_outer_knuckle.stl" />
-        <mesh name="right_finger" file="right_finger.stl" />
-        <mesh name="right_inner_knuckle" file="right_inner_knuckle.stl" />
-    </asset>
-
-    <equality>
-        <weld body1="robot0:mocap2" body2="link7" solimp="0.9 0.95 0.001" solref="0.02 1"></weld>
-    </equality>
-
-    <default>
-        <joint armature="1" damping="0.1" limited="true"/>
-         <default class="robot0:blue">
-            <geom rgba="0.086 0.506 0.767 1.0"></geom>
-        </default>
-
-        <default class="robot0:grey">
-            <geom rgba="0.356 0.361 0.376 1.0"></geom>
-        </default>
-    </default>
-
-</mujoco>
--- a/lerobot/common/envs/simxarm/simxarm/tasks/assets/xarm.xml
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/assets/xarm.xml
@@ -1,88 +0,0 @@
-<mujoco model="xarm7">
-    <body mocap="true" name="robot0:mocap2" pos="0 0 0">
-        <geom conaffinity="0" contype="0" pos="0 0 0" rgba="0 0.5 0 0" size="0.005 0.005 0.005" type="box"></geom>
-        <geom conaffinity="0" contype="0" pos="0 0 0" rgba="0.5 0 0 0" size="1 0.005 0.005" type="box"></geom>
-        <geom conaffinity="0" contype="0" pos="0 0 0" rgba="0 0 0.5 0" size="0.005 1 0.001" type="box"></geom>
-        <geom conaffinity="0" contype="0" pos="0 0 0" rgba="0.5 0.5 0 0" size="0.005 0.005 1" type="box"></geom>
-    </body>
-
-    <body name="link0" pos="1.09 0.28 0.655">
-        <geom name="bb" type="mesh" mesh="link_base" material="robot0:base_mat" rgba="1 1 1 1"/>
-        <body name="link1" pos="0 0 0.267">
-            <inertial pos="-0.0042142 0.02821 -0.0087788" quat="0.917781 -0.277115 0.0606681 0.277858" mass="0.42603" diaginertia="0.00144551 0.00137757 0.000823511" />
-            <joint name="joint1" pos="0 0 0" axis="0 0 1" limited="true" range="-6.28319 6.28319" damping="10" frictionloss="1" />
-            <geom name="j1" type="mesh" mesh="link1" material="robot0:arm_mat" rgba="1 1 1 1"/>
-            <body name="link2" pos="0 0 0" quat="0.707105 -0.707108 0 0">
-                <inertial pos="-3.3178e-05 -0.12849 0.026337" quat="0.447793 0.894132 -0.00224061 0.00218314" mass="0.56095" diaginertia="0.00319151 0.00311598 0.000980804" />
-                <joint name="joint2" pos="0 0 0" axis="0 0 1" limited="true" range="-2.059 2.0944" damping="10" frictionloss="1" />
-                <geom name="j2"  type="mesh" mesh="link2" material="robot0:head_mat" rgba="1 1 1 1"/>
-                <body name="link3" pos="0 -0.293 0" quat="0.707105 0.707108 0 0">
-                    <inertial pos="0.04223 -0.023258 -0.0096674" quat="0.883205 0.339803 0.323238 0.000542237" mass="0.44463" diaginertia="0.00133227 0.00119126 0.000780475" />
-                    <joint name="joint3" pos="0 0 0" axis="0 0 1" limited="true" range="-6.28319 6.28319" damping="5" frictionloss="1" />
-                    <geom name="j3" type="mesh" mesh="link3" material="robot0:gripper_mat" rgba="1 1 1 1"/>
-                    <body name="link4" pos="0.0525 0 0" quat="0.707105 0.707108 0 0">
-                        <inertial pos="0.067148 -0.10732 0.024479" quat="0.0654142 0.483317 -0.738663 0.465298" mass="0.52387" diaginertia="0.00288984 0.00282705 0.000894409" />
-                        <joint name="joint4" pos="0 0 0" axis="0 0 1" limited="true" range="-0.19198 3.927" damping="5" frictionloss="1" />
-                        <geom name="j4" type="mesh" mesh="link4" material="robot0:arm_mat" rgba="1 1 1 1"/>
-                        <body name="link5" pos="0.0775 -0.3425 0" quat="0.707105 0.707108 0 0">
-                            <inertial pos="-0.00023397 0.036705 -0.080064" quat="0.981064 -0.19003 0.00637998 0.0369004" mass="0.18554" diaginertia="0.00099553 0.000988613 0.000247126" />
-                            <joint name="joint5" pos="0 0 0" axis="0 0 1" limited="true" range="-6.28319 6.28319" damping="5" frictionloss="1" />
-                            <geom name="j5" type="mesh"  material="robot0:gripper_mat" rgba="1 1 1 1" mesh="link5" />
-                            <body name="link6" pos="0 0 0" quat="0.707105 0.707108 0 0">
-                                <inertial pos="0.058911 0.028469 0.0068428" quat="-0.188705 0.793535 0.166088 0.554173" mass="0.31344" diaginertia="0.000827892 0.000768871 0.000386708" />
-                                <joint name="joint6" pos="0 0 0" axis="0 0 1" limited="true" range="-1.69297 3.14159" damping="2" frictionloss="1" />
-                                <geom name="j6" type="mesh" material="robot0:gripper_mat" rgba="1 1 1 1" mesh="link6" />
-                                <body name="link7" pos="0.076 0.097 0" quat="0.707105 -0.707108 0 0">
-                                    <inertial pos="-0.000420033 -0.00287433 0.0257078" quat="0.999372 -0.0349129 -0.00605634 0.000551744" mass="0.85624" diaginertia="0.00137671 0.00118744 0.000514968" />
-                                    <joint name="joint7" pos="0 0 0" axis="0 0 1" limited="true" range="-6.28319 6.28319" damping="2" frictionloss="1" />
-                                    <geom name="j8" material="robot0:gripper_mat" type="mesh" rgba="0.753 0.753 0.753 1" mesh="link7" />
-                                    <geom name="j9" material="robot0:gripper_mat" type="mesh" rgba="1 1 1 1" mesh="base_link" />
-                                    <site name="grasp" pos="0 0 0.16" rgba="1 0 0 0" type="sphere" size="0.01" group="1"/>
-                                    <body name="left_outer_knuckle" pos="0 0.035 0.059098">
-                                        <inertial pos="0 0.021559 0.015181" quat="0.47789 0.87842 0 0" mass="0.033618" diaginertia="1.9111e-05 1.79089e-05 1.90167e-06" />
-                                        <joint name="drive_joint" pos="0 0 0" axis="1 0 0" limited="true" range="0 0.85" />
-                                        <geom type="mesh" rgba="0 0 0 1" conaffinity="1" contype="0"  mesh="left_outer_knuckle" />
-                                        <body name="left_finger" pos="0 0.035465 0.042039">
-                                            <inertial pos="0 -0.016413 0.029258" quat="0.697634 0.115353 -0.115353 0.697634" mass="0.048304" diaginertia="1.88037e-05 1.7493e-05 3.56792e-06" />
-                                            <joint name="left_finger_joint" pos="0 0 0" axis="-1 0 0" limited="true" range="0 0.85" />
-                                            <geom name="j10" material="robot0:gripper_finger_mat" type="mesh" rgba="0 0 0 1" conaffinity="3" contype="2"  mesh="left_finger" friction='1.5 1.5 1.5' solref='0.01 1' solimp='0.99 0.99 0.01'/>
-                                            <body name="right_hand" pos="0 -0.03 0.05" quat="-0.7071 0 0 0.7071">
-                                                <site name="ee" pos="0 0 0" rgba="0 0 1 0" type="sphere" group="1"/>
-                                                <site name="ee_x" pos="0 0 0" size="0.005 .1"  quat="0.707105 0.707108 0 0 " rgba="1 0 0 0" type="cylinder" group="1"/>
-                                                <site name="ee_z" pos="0 0 0" size="0.005 .1" quat="0.707105 0 0 0.707108" rgba="0 0 1 0" type="cylinder" group="1"/>
-                                                <site name="ee_y" pos="0 0 0" size="0.005 .1" quat="0.707105 0 0.707108 0 " rgba="0 1 0 0" type="cylinder" group="1"/>
-                                            </body>
-                                        </body>
-                                    </body>
-                                    <body name="left_inner_knuckle" pos="0 0.02 0.074098">
-                                        <inertial pos="1.86601e-06 0.0220468 0.0261335" quat="0.664139 -0.242732 0.242713 0.664146" mass="0.0230126" diaginertia="8.34216e-06 6.0949e-06 2.75601e-06" />
-                                        <joint name="left_inner_knuckle_joint" pos="0 0 0" axis="1 0 0" limited="true" range="0 0.85" />
-                                        <geom type="mesh" rgba="0 0 0 1" conaffinity="1" contype="0"  mesh="left_inner_knuckle" friction='1.5 1.5 1.5' solref='0.01 1' solimp='0.99 0.99 0.01'/>
-                                    </body>
-                                    <body name="right_outer_knuckle" pos="0 -0.035 0.059098">
-                                        <inertial pos="0 -0.021559 0.015181" quat="0.87842 0.47789 0 0" mass="0.033618" diaginertia="1.9111e-05 1.79089e-05 1.90167e-06" />
-                                        <joint name="right_outer_knuckle_joint" pos="0 0 0" axis="-1 0 0" limited="true" range="0 0.85" />
-                                        <geom type="mesh" rgba="0 0 0 1" conaffinity="1" contype="0"  mesh="right_outer_knuckle" />
-                                        <body name="right_finger" pos="0 -0.035465 0.042039">
-                                            <inertial pos="0 0.016413 0.029258" quat="0.697634 -0.115356 0.115356 0.697634" mass="0.048304" diaginertia="1.88038e-05 1.7493e-05 3.56779e-06" />
-                                            <joint name="right_finger_joint" pos="0 0 0" axis="1 0 0" limited="true" range="0 0.85" />
-                                            <geom name="j11" material="robot0:gripper_finger_mat" type="mesh" rgba="0 0 0 1" conaffinity="3" contype="2" mesh="right_finger" friction='1.5 1.5 1.5' solref='0.01 1' solimp='0.99 0.99 0.01'/>
-                                            <body name="left_hand" pos="0 0.03 0.05" quat="-0.7071 0 0 0.7071">
-                                                <site name="ee_2" pos="0 0 0" rgba="1 0 0 0" type="sphere" size="0.01" group="1"/>
-                                            </body>
-                                        </body>
-                                    </body>
-                                    <body name="right_inner_knuckle" pos="0 -0.02 0.074098">
-                                        <inertial pos="1.866e-06 -0.022047 0.026133" quat="0.66415 0.242702 -0.242721 0.664144" mass="0.023013" diaginertia="8.34209e-06 6.0949e-06 2.75601e-06" />
-                                        <joint name="right_inner_knuckle_joint" pos="0 0 0" axis="-1 0 0" limited="true" range="0 0.85" />
-                                        <geom type="mesh" rgba="0 0 0 1" conaffinity="1" contype="0" mesh="right_inner_knuckle" friction='1.5 1.5 1.5' solref='0.01 1' solimp='0.99 0.99 0.01'/>
-                                    </body>
-                                </body>
-                            </body>
-                        </body>
-                    </body>
-                </body>
-            </body>
-        </body>
-    </body>
-</mujoco>
--- a/lerobot/common/envs/simxarm/simxarm/tasks/base.py
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/base.py
@@ -1,145 +0,0 @@
-import os
-
-import mujoco
-import numpy as np
-from gymnasium_robotics.envs import robot_env
-
-from lerobot.common.envs.simxarm.simxarm.tasks import mocap
-
-
-class Base(robot_env.MujocoRobotEnv):
-    """
-    Superclass for all simxarm environments.
-    Args:
-            xml_name (str): name of the xml environment file
-            gripper_rotation (list): initial rotation of the gripper (given as a quaternion)
-    """
-
-    def __init__(self, xml_name, gripper_rotation=None):
-        if gripper_rotation is None:
-            gripper_rotation = [0, 1, 0, 0]
-        self.gripper_rotation = np.array(gripper_rotation, dtype=np.float32)
-        self.center_of_table = np.array([1.655, 0.3, 0.63625])
-        self.max_z = 1.2
-        self.min_z = 0.2
-        super().__init__(
-            model_path=os.path.join(os.path.dirname(__file__), "assets", xml_name + ".xml"),
-            n_substeps=20,
-            n_actions=4,
-            initial_qpos={},
-        )
-
-    @property
-    def dt(self):
-        return self.n_substeps * self.model.opt.timestep
-
-    @property
-    def eef(self):
-        return self._utils.get_site_xpos(self.model, self.data, "grasp")
-
-    @property
-    def obj(self):
-        return self._utils.get_site_xpos(self.model, self.data, "object_site")
-
-    @property
-    def robot_state(self):
-        gripper_angle = self._utils.get_joint_qpos(self.model, self.data, "right_outer_knuckle_joint")
-        return np.concatenate([self.eef, gripper_angle])
-
-    def is_success(self):
-        return NotImplementedError()
-
-    def get_reward(self):
-        raise NotImplementedError()
-
-    def _sample_goal(self):
-        raise NotImplementedError()
-
-    def get_obs(self):
-        return self._get_obs()
-
-    def _step_callback(self):
-        self._mujoco.mj_forward(self.model, self.data)
-
-    def _limit_gripper(self, gripper_pos, pos_ctrl):
-        if gripper_pos[0] > self.center_of_table[0] - 0.105 + 0.15:
-            pos_ctrl[0] = min(pos_ctrl[0], 0)
-        if gripper_pos[0] < self.center_of_table[0] - 0.105 - 0.3:
-            pos_ctrl[0] = max(pos_ctrl[0], 0)
-        if gripper_pos[1] > self.center_of_table[1] + 0.3:
-            pos_ctrl[1] = min(pos_ctrl[1], 0)
-        if gripper_pos[1] < self.center_of_table[1] - 0.3:
-            pos_ctrl[1] = max(pos_ctrl[1], 0)
-        if gripper_pos[2] > self.max_z:
-            pos_ctrl[2] = min(pos_ctrl[2], 0)
-        if gripper_pos[2] < self.min_z:
-            pos_ctrl[2] = max(pos_ctrl[2], 0)
-        return pos_ctrl
-
-    def _apply_action(self, action):
-        assert action.shape == (4,)
-        action = action.copy()
-        pos_ctrl, gripper_ctrl = action[:3], action[3]
-        pos_ctrl = self._limit_gripper(
-            self._utils.get_site_xpos(self.model, self.data, "grasp"), pos_ctrl
-        ) * (1 / self.n_substeps)
-        gripper_ctrl = np.array([gripper_ctrl, gripper_ctrl])
-        mocap.apply_action(
-            self.model,
-            self._model_names,
-            self.data,
-            np.concatenate([pos_ctrl, self.gripper_rotation, gripper_ctrl]),
-        )
-
-    def _render_callback(self):
-        self._mujoco.mj_forward(self.model, self.data)
-
-    def _reset_sim(self):
-        self.data.time = self.initial_time
-        self.data.qpos[:] = np.copy(self.initial_qpos)
-        self.data.qvel[:] = np.copy(self.initial_qvel)
-        self._sample_goal()
-        self._mujoco.mj_step(self.model, self.data, nstep=10)
-        return True
-
-    def _set_gripper(self, gripper_pos, gripper_rotation):
-        self._utils.set_mocap_pos(self.model, self.data, "robot0:mocap", gripper_pos)
-        self._utils.set_mocap_quat(self.model, self.data, "robot0:mocap", gripper_rotation)
-        self._utils.set_joint_qpos(self.model, self.data, "right_outer_knuckle_joint", 0)
-        self.data.qpos[10] = 0.0
-        self.data.qpos[12] = 0.0
-
-    def _env_setup(self, initial_qpos):
-        for name, value in initial_qpos.items():
-            self.data.set_joint_qpos(name, value)
-        mocap.reset(self.model, self.data)
-        mujoco.mj_forward(self.model, self.data)
-        self._sample_goal()
-        mujoco.mj_forward(self.model, self.data)
-
-    def reset(self):
-        self._reset_sim()
-        return self._get_obs()
-
-    def step(self, action):
-        assert action.shape == (4,)
-        assert self.action_space.contains(action), "{!r} ({}) invalid".format(action, type(action))
-        self._apply_action(action)
-        self._mujoco.mj_step(self.model, self.data, nstep=2)
-        self._step_callback()
-        obs = self._get_obs()
-        reward = self.get_reward()
-        done = False
-        info = {"is_success": self.is_success(), "success": self.is_success()}
-        return obs, reward, done, info
-
-    def render(self, mode="rgb_array", width=384, height=384):
-        self._render_callback()
-        # HACK
-        self.model.vis.global_.offwidth = width
-        self.model.vis.global_.offheight = height
-        return self.mujoco_renderer.render(mode)
-
-    def close(self):
-        if self.mujoco_renderer is not None:
-            self.mujoco_renderer.close()
--- a/lerobot/common/envs/simxarm/simxarm/tasks/lift.py
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/lift.py
@@ -1,100 +0,0 @@
-import numpy as np
-
-from lerobot.common.envs.simxarm.simxarm import Base
-
-
-class Lift(Base):
-    def __init__(self):
-        self._z_threshold = 0.15
-        super().__init__("lift")
-
-    @property
-    def z_target(self):
-        return self._init_z + self._z_threshold
-
-    def is_success(self):
-        return self.obj[2] >= self.z_target
-
-    def get_reward(self):
-        reach_dist = np.linalg.norm(self.obj - self.eef)
-        reach_dist_xy = np.linalg.norm(self.obj[:-1] - self.eef[:-1])
-        pick_completed = self.obj[2] >= (self.z_target - 0.01)
-        obj_dropped = (self.obj[2] < (self._init_z + 0.005)) and (reach_dist > 0.02)
-
-        # Reach
-        if reach_dist < 0.05:
-            reach_reward = -reach_dist + max(self._action[-1], 0) / 50
-        elif reach_dist_xy < 0.05:
-            reach_reward = -reach_dist
-        else:
-            z_bonus = np.linalg.norm(np.linalg.norm(self.obj[-1] - self.eef[-1]))
-            reach_reward = -reach_dist - 2 * z_bonus
-
-        # Pick
-        if pick_completed and not obj_dropped:
-            pick_reward = self.z_target
-        elif (reach_dist < 0.1) and (self.obj[2] > (self._init_z + 0.005)):
-            pick_reward = min(self.z_target, self.obj[2])
-        else:
-            pick_reward = 0
-
-        return reach_reward / 100 + pick_reward
-
-    def _get_obs(self):
-        eef_velp = self._utils.get_site_xvelp(self.model, self.data, "grasp") * self.dt
-        gripper_angle = self._utils.get_joint_qpos(self.model, self.data, "right_outer_knuckle_joint")
-        eef = self.eef - self.center_of_table
-
-        obj = self.obj - self.center_of_table
-        obj_rot = self._utils.get_joint_qpos(self.model, self.data, "object_joint0")[-4:]
-        obj_velp = self._utils.get_site_xvelp(self.model, self.data, "object_site") * self.dt
-        obj_velr = self._utils.get_site_xvelr(self.model, self.data, "object_site") * self.dt
-
-        obs = np.concatenate(
-            [
-                eef,
-                eef_velp,
-                obj,
-                obj_rot,
-                obj_velp,
-                obj_velr,
-                eef - obj,
-                np.array(
-                    [
-                        np.linalg.norm(eef - obj),
-                        np.linalg.norm(eef[:-1] - obj[:-1]),
-                        self.z_target,
-                        self.z_target - obj[-1],
-                        self.z_target - eef[-1],
-                    ]
-                ),
-                gripper_angle,
-            ],
-            axis=0,
-        )
-        return {"observation": obs, "state": eef, "achieved_goal": eef, "desired_goal": eef}
-
-    def _sample_goal(self):
-        # Gripper
-        gripper_pos = np.array([1.280, 0.295, 0.735]) + self.np_random.uniform(-0.05, 0.05, size=3)
-        super()._set_gripper(gripper_pos, self.gripper_rotation)
-
-        # Object
-        object_pos = self.center_of_table - np.array([0.15, 0.10, 0.07])
-        object_pos[0] += self.np_random.uniform(-0.05, 0.05, size=1)
-        object_pos[1] += self.np_random.uniform(-0.05, 0.05, size=1)
-        object_qpos = self._utils.get_joint_qpos(self.model, self.data, "object_joint0")
-        object_qpos[:3] = object_pos
-        self._utils.set_joint_qpos(self.model, self.data, "object_joint0", object_qpos)
-        self._init_z = object_pos[2]
-
-        # Goal
-        return object_pos + np.array([0, 0, self._z_threshold])
-
-    def reset(self):
-        self._action = np.zeros(4)
-        return super().reset()
-
-    def step(self, action):
-        self._action = action.copy()
-        return super().step(action)
--- a/lerobot/common/envs/simxarm/simxarm/tasks/mocap.py
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/mocap.py
@@ -1,67 +0,0 @@
-# import mujoco_py
-import mujoco
-import numpy as np
-
-
-def apply_action(model, model_names, data, action):
-    if model.nmocap > 0:
-        pos_action, gripper_action = np.split(action, (model.nmocap * 7,))
-        if data.ctrl is not None:
-            for i in range(gripper_action.shape[0]):
-                data.ctrl[i] = gripper_action[i]
-        pos_action = pos_action.reshape(model.nmocap, 7)
-        pos_delta, quat_delta = pos_action[:, :3], pos_action[:, 3:]
-        reset_mocap2body_xpos(model, model_names, data)
-        data.mocap_pos[:] = data.mocap_pos + pos_delta
-        data.mocap_quat[:] = data.mocap_quat + quat_delta
-
-
-def reset(model, data):
-    if model.nmocap > 0 and model.eq_data is not None:
-        for i in range(model.eq_data.shape[0]):
-            # if sim.model.eq_type[i] == mujoco_py.const.EQ_WELD:
-            if model.eq_type[i] == mujoco.mjtEq.mjEQ_WELD:
-                # model.eq_data[i, :] = np.array([0., 0., 0., 1., 0., 0., 0.])
-                model.eq_data[i, :] = np.array(
-                    [
-                        0.0,
-                        0.0,
-                        0.0,
-                        1.0,
-                        0.0,
-                        0.0,
-                        0.0,
-                        0.0,
-                        0.0,
-                        0.0,
-                        0.0,
-                    ]
-                )
-    # sim.forward()
-    mujoco.mj_forward(model, data)
-
-
-def reset_mocap2body_xpos(model, model_names, data):
-    if model.eq_type is None or model.eq_obj1id is None or model.eq_obj2id is None:
-        return
-
-    # For all weld constraints
-    for eq_type, obj1_id, obj2_id in zip(model.eq_type, model.eq_obj1id, model.eq_obj2id, strict=False):
-        # if eq_type != mujoco_py.const.EQ_WELD:
-        if eq_type != mujoco.mjtEq.mjEQ_WELD:
-            continue
-        # body2 = model.body_id2name(obj2_id)
-        body2 = model_names.body_id2name[obj2_id]
-        if body2 == "B0" or body2 == "B9" or body2 == "B1":
-            continue
-        mocap_id = model.body_mocapid[obj1_id]
-        if mocap_id != -1:
-            # obj1 is the mocap, obj2 is the welded body
-            body_idx = obj2_id
-        else:
-            # obj2 is the mocap, obj1 is the welded body
-            mocap_id = model.body_mocapid[obj2_id]
-            body_idx = obj1_id
-        assert mocap_id != -1
-        data.mocap_pos[mocap_id][:] = data.xpos[body_idx]
-        data.mocap_quat[mocap_id][:] = data.xquat[body_idx]
--- a/lerobot/common/envs/simxarm/simxarm/tasks/peg_in_box.py
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/peg_in_box.py
@@ -1,86 +0,0 @@
-import numpy as np
-
-from lerobot.common.envs.simxarm.simxarm import Base
-
-
-class PegInBox(Base):
-    def __init__(self):
-        super().__init__("peg_in_box")
-
-    def _reset_sim(self):
-        self._act_magnitude = 0
-        super()._reset_sim()
-        for _ in range(10):
-            self._apply_action(np.array([0, 0, 0, 1], dtype=np.float32))
-            self.sim.step()
-
-    @property
-    def box(self):
-        return self.sim.data.get_site_xpos("box_site")
-
-    def is_success(self):
-        return np.linalg.norm(self.obj - self.box) <= 0.05
-
-    def get_reward(self):
-        dist_xy = np.linalg.norm(self.obj[:2] - self.box[:2])
-        dist_xyz = np.linalg.norm(self.obj - self.box)
-        return float(dist_xy <= 0.045) * (2 - 6 * dist_xyz) - 0.2 * np.square(self._act_magnitude) - dist_xy
-
-    def _get_obs(self):
-        eef_velp = self.sim.data.get_site_xvelp("grasp") * self.dt
-        gripper_angle = self.sim.data.get_joint_qpos("right_outer_knuckle_joint")
-        eef, box = self.eef - self.center_of_table, self.box - self.center_of_table
-
-        obj = self.obj - self.center_of_table
-        obj_rot = self.sim.data.get_joint_qpos("object_joint0")[-4:]
-        obj_velp = self.sim.data.get_site_xvelp("object_site") * self.dt
-        obj_velr = self.sim.data.get_site_xvelr("object_site") * self.dt
-
-        obs = np.concatenate(
-            [
-                eef,
-                eef_velp,
-                box,
-                obj,
-                obj_rot,
-                obj_velp,
-                obj_velr,
-                eef - box,
-                eef - obj,
-                obj - box,
-                np.array(
-                    [
-                        np.linalg.norm(eef - box),
-                        np.linalg.norm(eef - obj),
-                        np.linalg.norm(obj - box),
-                        gripper_angle,
-                    ]
-                ),
-            ],
-            axis=0,
-        )
-        return {"observation": obs, "state": eef, "achieved_goal": eef, "desired_goal": box}
-
-    def _sample_goal(self):
-        # Gripper
-        gripper_pos = np.array([1.280, 0.295, 0.9]) + self.np_random.uniform(-0.05, 0.05, size=3)
-        super()._set_gripper(gripper_pos, self.gripper_rotation)
-
-        # Object
-        object_pos = gripper_pos - np.array([0, 0, 0.06]) + self.np_random.uniform(-0.005, 0.005, size=3)
-        object_qpos = self.sim.data.get_joint_qpos("object_joint0")
-        object_qpos[:3] = object_pos
-        self.sim.data.set_joint_qpos("object_joint0", object_qpos)
-
-        # Box
-        box_pos = np.array([1.61, 0.18, 0.58])
-        box_pos[:2] += self.np_random.uniform(-0.11, 0.11, size=2)
-        box_qpos = self.sim.data.get_joint_qpos("box_joint0")
-        box_qpos[:3] = box_pos
-        self.sim.data.set_joint_qpos("box_joint0", box_qpos)
-
-        return self.box
-
-    def step(self, action):
-        self._act_magnitude = np.linalg.norm(action[:3])
-        return super().step(action)
--- a/lerobot/common/envs/simxarm/simxarm/tasks/push.py
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/push.py
@@ -1,78 +0,0 @@
-import numpy as np
-
-from lerobot.common.envs.simxarm.simxarm import Base
-
-
-class Push(Base):
-    def __init__(self):
-        super().__init__("push")
-
-    def _reset_sim(self):
-        self._act_magnitude = 0
-        super()._reset_sim()
-
-    def is_success(self):
-        return np.linalg.norm(self.obj - self.goal) <= 0.05
-
-    def get_reward(self):
-        dist = np.linalg.norm(self.obj - self.goal)
-        penalty = self._act_magnitude**2
-        return -(dist + 0.15 * penalty)
-
-    def _get_obs(self):
-        eef_velp = self.sim.data.get_site_xvelp("grasp") * self.dt
-        gripper_angle = self.sim.data.get_joint_qpos("right_outer_knuckle_joint")
-        eef, goal = self.eef - self.center_of_table, self.goal - self.center_of_table
-
-        obj = self.obj - self.center_of_table
-        obj_rot = self.sim.data.get_joint_qpos("object_joint0")[-4:]
-        obj_velp = self.sim.data.get_site_xvelp("object_site") * self.dt
-        obj_velr = self.sim.data.get_site_xvelr("object_site") * self.dt
-
-        obs = np.concatenate(
-            [
-                eef,
-                eef_velp,
-                goal,
-                obj,
-                obj_rot,
-                obj_velp,
-                obj_velr,
-                eef - goal,
-                eef - obj,
-                obj - goal,
-                np.array(
-                    [
-                        np.linalg.norm(eef - goal),
-                        np.linalg.norm(eef - obj),
-                        np.linalg.norm(obj - goal),
-                        gripper_angle,
-                    ]
-                ),
-            ],
-            axis=0,
-        )
-        return {"observation": obs, "state": eef, "achieved_goal": eef, "desired_goal": goal}
-
-    def _sample_goal(self):
-        # Gripper
-        gripper_pos = np.array([1.280, 0.295, 0.735]) + self.np_random.uniform(-0.05, 0.05, size=3)
-        super()._set_gripper(gripper_pos, self.gripper_rotation)
-
-        # Object
-        object_pos = self.center_of_table - np.array([0.25, 0, 0.07])
-        object_pos[0] += self.np_random.uniform(-0.08, 0.08, size=1)
-        object_pos[1] += self.np_random.uniform(-0.08, 0.08, size=1)
-        object_qpos = self.sim.data.get_joint_qpos("object_joint0")
-        object_qpos[:3] = object_pos
-        self.sim.data.set_joint_qpos("object_joint0", object_qpos)
-
-        # Goal
-        self.goal = np.array([1.600, 0.200, 0.545])
-        self.goal[:2] += self.np_random.uniform(-0.1, 0.1, size=2)
-        self.sim.model.site_pos[self.sim.model.site_name2id("target0")] = self.goal
-        return self.goal
-
-    def step(self, action):
-        self._act_magnitude = np.linalg.norm(action[:3])
-        return super().step(action)
--- a/lerobot/common/envs/simxarm/simxarm/tasks/reach.py
+++ b/lerobot/common/envs/simxarm/simxarm/tasks/reach.py
@@ -1,44 +0,0 @@
-import numpy as np
-
-from lerobot.common.envs.simxarm.simxarm import Base
-
-
-class Reach(Base):
-    def __init__(self):
-        super().__init__("reach")
-
-    def _reset_sim(self):
-        self._act_magnitude = 0
-        super()._reset_sim()
-
-    def is_success(self):
-        return np.linalg.norm(self.eef - self.goal) <= 0.05
-
-    def get_reward(self):
-        dist = np.linalg.norm(self.eef - self.goal)
-        penalty = self._act_magnitude**2
-        return -(dist + 0.15 * penalty)
-
-    def _get_obs(self):
-        eef_velp = self.sim.data.get_site_xvelp("grasp") * self.dt
-        gripper_angle = self.sim.data.get_joint_qpos("right_outer_knuckle_joint")
-        eef, goal = self.eef - self.center_of_table, self.goal - self.center_of_table
-        obs = np.concatenate(
-            [eef, eef_velp, goal, eef - goal, np.array([np.linalg.norm(eef - goal), gripper_angle])], axis=0
-        )
-        return {"observation": obs, "state": eef, "achieved_goal": eef, "desired_goal": goal}
-
-    def _sample_goal(self):
-        # Gripper
-        gripper_pos = np.array([1.280, 0.295, 0.735]) + self.np_random.uniform(-0.05, 0.05, size=3)
-        super()._set_gripper(gripper_pos, self.gripper_rotation)
-
-        # Goal
-        self.goal = np.array([1.550, 0.287, 0.580])
-        self.goal[:2] += self.np_random.uniform(-0.125, 0.125, size=2)
-        self.sim.model.site_pos[self.sim.model.site_name2id("target0")] = self.goal
-        return self.goal
-
-    def step(self, action):
-        self._act_magnitude = np.linalg.norm(action[:3])
-        return super().step(action)
--- a/lerobot/common/envs/transforms.py
+++ b/lerobot/common/envs/transforms.py
@@ -1,6 +1,5 @@
 from typing import Sequence

-import torch
 from tensordict import TensorDictBase
 from tensordict.nn import dispatch
 from tensordict.utils import NestedKey
@@ -8,45 +7,19 @@ from torchrl.envs.transforms import ObservationTransform, Transform


 class Prod(ObservationTransform):
-    invertible = True
-
    def __init__(self, in_keys: Sequence[NestedKey], prod: float):
        super().__init__()
        self.in_keys = in_keys
        self.prod = prod
-        self.original_dtypes = {}
-
-    def _reset(self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase) -> TensorDictBase:
-        # _reset is called once when the environment reset to normalize the first observation
-        tensordict_reset = self._call(tensordict_reset)
-        return tensordict_reset
-
-    @dispatch(source="in_keys", dest="out_keys")
-    def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
-        return self._call(tensordict)

    def _call(self, td):
        for key in self.in_keys:
-            if td.get(key, None) is None:
-                continue
-            self.original_dtypes[key] = td[key].dtype
-            td[key] = td[key].type(torch.float32) * self.prod
-        return td
-
-    def _inv_call(self, td: TensorDictBase) -> TensorDictBase:
-        for key in self.in_keys:
-            if td.get(key, None) is None:
-                continue
-            td[key] = (td[key] / self.prod).type(self.original_dtypes[key])
+            td[key] *= self.prod
        return td

    def transform_observation_spec(self, obs_spec):
        for key in self.in_keys:
-            if obs_spec.get(key, None) is None:
-                continue
-            obs_spec[key].space.high = obs_spec[key].space.high.type(torch.float32) * self.prod
-            obs_spec[key].space.low = obs_spec[key].space.low.type(torch.float32) * self.prod
-            obs_spec[key].dtype = torch.float32
+            obs_spec[key].space.high *= self.prod
        return obs_spec


--- a/lerobot/common/logger.py
+++ b/lerobot/common/logger.py
@@ -5,7 +5,6 @@ from pathlib import Path
 from omegaconf import OmegaConf
 from termcolor import colored

-from lerobot.common.policies.abstract import AbstractPolicy

 def log_output_dir(out_dir):
    logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {out_dir}")
@@ -31,7 +30,6 @@ class Logger:
        self._model_dir = self._log_dir / "models"
        self._buffer_dir = self._log_dir / "buffers"
        self._save_model = cfg.save_model
-        self._disable_wandb_artifact = cfg.wandb.disable_artifact
        self._save_buffer = cfg.save_buffer
        self._group = cfg_to_group(cfg)
        self._seed = cfg.seed
@@ -40,7 +38,7 @@ class Logger:
        project = cfg.get("wandb", {}).get("project")
        entity = cfg.get("wandb", {}).get("entity")
        enable_wandb = cfg.get("wandb", {}).get("enable", False)
-        run_offline = not enable_wandb or not project
+        run_offline = not enable_wandb or not project or not entity
        if run_offline:
            logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
            self._wandb = None
@@ -65,18 +63,16 @@ class Logger:
                resume=None,
            )
            print(colored("Logs will be synced with wandb.", "blue", attrs=["bold"]))
-            logging.info(f"Track this run --> {colored(wandb.run.get_url(), 'yellow', attrs=['bold'])}")
            self._wandb = wandb

-    def save_model(self, policy: AbstractPolicy, identifier):
+    def save_model(self, policy, identifier):
        if self._save_model:
            self._model_dir.mkdir(parents=True, exist_ok=True)
            fp = self._model_dir / f"{str(identifier)}.pt"
-            policy.save_pretrained(fp)
-            if self._wandb and not self._disable_wandb_artifact:
-                # note wandb artifact does not accept ":" in its name
+            policy.save(fp)
+            if self._wandb:
                artifact = self._wandb.Artifact(
-                    self._group.replace(":", "_") + "-" + str(self._seed) + "-" + str(identifier),
+                    self._group + "-" + str(self._seed) + "-" + str(identifier),
                    type="model",
                )
                artifact.add_file(fp)
--- a/lerobot/common/policies/init.py
+++ b/lerobot/common/policies/init.py
--- a/lerobot/common/policies/abstract.py
+++ b/lerobot/common/policies/abstract.py
@@ -1,93 +0,0 @@
-from collections import deque
-
-import torch
-from torch import Tensor, nn
-from huggingface_hub import PyTorchModelHubMixin
-
-
-class AbstractPolicy(nn.Module, PyTorchModelHubMixin):
-    """Base policy which all policies should be derived from.
-
-    The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its
-    documentation for more information.
-
-    The policy is a PyTorchModelHubMixin, which means that it can be saved and loaded from the Hugging Face Hub and/or to a local directory.
-    # Save policy weights to local directory
-    >>> policy.save_pretrained("my-awesome-policy")
-
-    # Push policy weights to the Hub
-    >>> policy.push_to_hub("my-awesome-policy")
-
-    # Download and initialize policy from the Hub
-    >>> policy = MyPolicy.from_pretrained("username/my-awesome-policy")
-
-    Note:
-        When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to:
-            1. set the required class attributes:
-                - for classes inheriting from `AbstractDataset`: `available_datasets`
-                - for classes inheriting from `AbstractEnv`: `name`, `available_tasks`
-                - for classes inheriting from `AbstractPolicy`: `name`
-            2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`)
-            3. update variables in `tests/test_available.py` by importing your new class
-    """
-
-    name: str | None = None  # same name should be used to instantiate the policy in factory.py
-
-    def __init__(self, n_action_steps: int | None = None):
-        """
-        n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single
-            action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then
-            adds that dimension.
-        """
-        super().__init__()
-        assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute."
-        self.n_action_steps = n_action_steps
-        self.clear_action_queue()
-
-    def update(self, replay_buffer, step):
-        """One step of the policy's learning algorithm."""
-        raise NotImplementedError("Abstract method")
-
-    def save(self, fp):  # TODO: remove this method since we are using PyTorchModelHubMixin
-        torch.save(self.state_dict(), fp)
-
-    def load(self, fp):  # TODO: remove this method since we are using PyTorchModelHubMixin
-        d = torch.load(fp)
-        self.load_state_dict(d)
-
-    def select_actions(self, observation) -> Tensor:
-        """Select an action (or trajectory of actions) based on an observation during rollout.
-
-        If n_action_steps was provided at initialization, this should return a (batch_size, n_action_steps, *) tensor of
-        actions. Otherwise if n_actions_steps is None, this should return a (batch_size, *) tensor of actions.
-        """
-        raise NotImplementedError("Abstract method")
-
-    def clear_action_queue(self):
-        """This should be called whenever the environment is reset."""
-        if self.n_action_steps is not None:
-            self._action_queue = deque([], maxlen=self.n_action_steps)
-
-    def forward(self, *args, **kwargs) -> Tensor:
-        """Inference step that makes multi-step policies compatible with their single-step environments.
-
-        WARNING: In general, this should not be overriden.
-
-        Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit
-        into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an
-        observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment
-        observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that
-        the subclass doesn't have to.
-
-        This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made:
-        1. The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is
-           the action trajectory horizon and * is the action dimensions.
-        2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined.
-        """
-        if self.n_action_steps is None:
-            return self.select_actions(*args, **kwargs)
-        if len(self._action_queue) == 0:
-            # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape
-            # (n_action_steps, batch_size, *), hence the transpose.
-            self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1))
-        return self._action_queue.popleft()
--- a/lerobot/common/policies/act/policy.py
+++ b/lerobot/common/policies/act/policy.py
@@ -2,12 +2,11 @@ import logging
 import time

 import torch
+import torch.nn as nn
 import torch.nn.functional as F  # noqa: N812
 import torchvision.transforms as transforms

-from lerobot.common.policies.abstract import AbstractPolicy
 from lerobot.common.policies.act.detr_vae import build
-from lerobot.common.utils import get_safe_torch_device


 def build_act_model_and_optimizer(cfg):
@@ -41,17 +40,16 @@ def kl_divergence(mu, logvar):
    return total_kld, dimension_wise_kld, mean_kld


-class ActionChunkingTransformerPolicy(AbstractPolicy):
-    name = "act"
-
+class ActionChunkingTransformerPolicy(nn.Module):
    def __init__(self, cfg, device, n_action_steps=1):
-        super().__init__(n_action_steps)
+        super().__init__()
        self.cfg = cfg
        self.n_action_steps = n_action_steps
-        self.device = get_safe_torch_device(device)
+        self.device = device
        self.model, self.optimizer = build_act_model_and_optimizer(cfg)
        self.kl_weight = self.cfg.kl_weight
        logging.info(f"KL Weight {self.kl_weight}")
+
        self.to(self.device)

    def update(self, replay_buffer, step):
@@ -136,8 +134,8 @@ class ActionChunkingTransformerPolicy(AbstractPolicy):
    def save(self, fp):
        torch.save(self.state_dict(), fp)

-    def load(self, fp, device=None):
-        d = torch.load(fp, map_location=device)
+    def load(self, fp):
+        d = torch.load(fp)
        self.load_state_dict(d)

    def compute_loss(self, batch):
@@ -150,21 +148,22 @@ class ActionChunkingTransformerPolicy(AbstractPolicy):
        return loss

    @torch.no_grad()
-    def select_actions(self, observation, step_count):
-        if observation["image"].shape[0] != 1:
-            raise NotImplementedError("Batch size > 1 not handled")
-
+    def forward(self, observation, step_count):
        # TODO(rcadene): remove unused step_count
        del step_count

        self.eval()

+        # TODO(rcadene): remove unsqueeze hack to add bsize=1
+        observation["image"] = observation["image"].unsqueeze(0)
+        observation["state"] = observation["state"].unsqueeze(0)
+
        # TODO(rcadene): remove hack
        # add 1 camera dimension
-        observation["image", "top"] = observation["image", "top"].unsqueeze(1)
+        observation["image"] = observation["image"].unsqueeze(1)

        obs_dict = {
-            "image": observation["image", "top"],
+            "image": observation["image"],
            "agent_pos": observation["state"],
        }
        action = self._forward(qpos=obs_dict["agent_pos"], image=obs_dict["image"])
@@ -182,8 +181,11 @@ class ActionChunkingTransformerPolicy(AbstractPolicy):
            # exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1)
            # raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True)

+        # remove bsize=1
+        action = action.squeeze(0)
+
        # take first predicted action or n first actions
-        action = action[: self.n_action_steps]
+        action = action[0] if self.n_action_steps == 1 else action[: self.n_action_steps]
        return action

    def _forward(self, qpos, image, actions=None, is_pad=None):
--- a/lerobot/common/policies/act/position_encoding.py
+++ b/lerobot/common/policies/act/position_encoding.py
@@ -1,7 +1,6 @@
 """
 Various positional encodings for the transformer.
 """
-
 import math

 import torch
--- a/lerobot/common/policies/act/transformer.py
+++ b/lerobot/common/policies/act/transformer.py
@@ -6,7 +6,6 @@ Copy-paste from torch.nn.Transformer with modifications:
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
 """
-
 import copy
 from typing import Optional

--- a/lerobot/common/policies/act/utils.py
+++ b/lerobot/common/policies/act/utils.py
@@ -3,7 +3,6 @@ Misc functions, including distributed helpers.

 Mostly copy-paste from torchvision references.
 """
-
 import datetime
 import os
 import pickle
--- a/lerobot/common/policies/diffusion/diffusion_unet_image_policy.py
+++ b/lerobot/common/policies/diffusion/diffusion_unet_image_policy.py
@@ -1,44 +1,3 @@
-"""Code from the original diffusion policy project.
-
-Notes on how to load a checkpoint from the original repository:
-
-In the original repository, run the eval and use a breakpoint to extract the policy weights.
-
-```
-torch.save(policy.state_dict(), "weights.pt")
-```
-
-In this repository, add a breakpoint somewhere after creating an equivalent policy and load in the weights:
-
-```
-loaded = torch.load("weights.pt")
-aligned = {}
-their_prefix = "obs_encoder.obs_nets.image.backbone"
-our_prefix = "obs_encoder.key_model_map.image.backbone"
-aligned.update({our_prefix + k.removeprefix(their_prefix): v for k, v in loaded.items() if k.startswith(their_prefix)})
-their_prefix = "obs_encoder.obs_nets.image.pool"
-our_prefix = "obs_encoder.key_model_map.image.pool"
-aligned.update({our_prefix + k.removeprefix(their_prefix): v for k, v in loaded.items() if k.startswith(their_prefix)})
-their_prefix = "obs_encoder.obs_nets.image.nets.3"
-our_prefix = "obs_encoder.key_model_map.image.out"
-aligned.update({our_prefix + k.removeprefix(their_prefix): v for k, v in loaded.items() if k.startswith(their_prefix)})
-aligned.update({k: v for k, v in loaded.items() if k.startswith('model.')})
-# Note: here you are loading into the ema model.
-missing_keys, unexpected_keys = policy.ema_diffusion.load_state_dict(aligned, strict=False)
-assert all('_dummy_variable' in k for k in missing_keys)
-assert len(unexpected_keys) == 0
-```
-
-Then in that same runtime you can also save the weights with the new aligned state_dict:
-
-```
-policy.save_pretrained("my-policy")
-```
-
-Now you can remove the breakpoint and extra code and load in the weights just like with any other lerobot checkpoint.
-
-"""
-
 from typing import Dict

 import torch
@@ -46,33 +5,11 @@ import torch.nn.functional as F  # noqa: N812
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
 from einops import reduce

-from lerobot.common.policies.diffusion.model.conditional_unet1d import ConditionalUnet1D
-from lerobot.common.policies.diffusion.model.mask_generator import LowdimMaskGenerator
-from lerobot.common.policies.diffusion.model.module_attr_mixin import ModuleAttrMixin
-from lerobot.common.policies.diffusion.model.multi_image_obs_encoder import MultiImageObsEncoder
-from lerobot.common.policies.diffusion.model.normalizer import LinearNormalizer
-from lerobot.common.policies.diffusion.pytorch_utils import dict_apply
-
-
-class BaseImagePolicy(ModuleAttrMixin):
-    # init accepts keyword argument shape_meta, see config/task/*_image.yaml
-
-    def predict_action(self, obs_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        """
-        obs_dict:
-            str: B,To,*
-        return: B,Ta,Da
-        """
-        raise NotImplementedError()
-
-    # reset state for stateful policies
-    def reset(self):
-        pass
-
-    # ========== training ===========
-    # no standard training interface except setting normalizer
-    def set_normalizer(self, normalizer: LinearNormalizer):
-        raise NotImplementedError()
+from diffusion_policy.common.pytorch_util import dict_apply
+from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D
+from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
+from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from diffusion_policy.policy.base_image_policy import BaseImagePolicy


 class DiffusionUnetImagePolicy(BaseImagePolicy):
@@ -231,10 +168,11 @@ class DiffusionUnetImagePolicy(BaseImagePolicy):

        # run sampling
        nsample = self.conditional_sample(
-            cond_data, cond_mask, local_cond=local_cond, global_cond=global_cond
+            cond_data, cond_mask, local_cond=local_cond, global_cond=global_cond, **self.kwargs
        )

        action_pred = nsample[..., :action_dim]
+
        # get action
        start = n_obs_steps - 1
        end = start + self.n_action_steps
--- a/lerobot/common/policies/diffusion/model/conditional_unet1d.py
+++ b/lerobot/common/policies/diffusion/model/conditional_unet1d.py
@@ -1,286 +0,0 @@
-import logging
-from typing import Union
-
-import einops
-import torch
-import torch.nn as nn
-from einops.layers.torch import Rearrange
-
-from lerobot.common.policies.diffusion.model.conv1d_components import Conv1dBlock, Downsample1d, Upsample1d
-from lerobot.common.policies.diffusion.model.positional_embedding import SinusoidalPosEmb
-
-logger = logging.getLogger(__name__)
-
-
-class ConditionalResidualBlock1D(nn.Module):
-    def __init__(
-        self, in_channels, out_channels, cond_dim, kernel_size=3, n_groups=8, cond_predict_scale=False
-    ):
-        super().__init__()
-
-        self.blocks = nn.ModuleList(
-            [
-                Conv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups),
-                Conv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups),
-            ]
-        )
-
-        # FiLM modulation https://arxiv.org/abs/1709.07871
-        # predicts per-channel scale and bias
-        cond_channels = out_channels
-        if cond_predict_scale:
-            cond_channels = out_channels * 2
-        self.cond_predict_scale = cond_predict_scale
-        self.out_channels = out_channels
-        self.cond_encoder = nn.Sequential(
-            nn.Mish(),
-            nn.Linear(cond_dim, cond_channels),
-            Rearrange("batch t -> batch t 1"),
-        )
-
-        # make sure dimensions compatible
-        self.residual_conv = (
-            nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()
-        )
-
-    def forward(self, x, cond):
-        """
-        x : [ batch_size x in_channels x horizon ]
-        cond : [ batch_size x cond_dim]
-
-        returns:
-        out : [ batch_size x out_channels x horizon ]
-        """
-        out = self.blocks[0](x)
-        embed = self.cond_encoder(cond)
-        if self.cond_predict_scale:
-            embed = embed.reshape(embed.shape[0], 2, self.out_channels, 1)
-            scale = embed[:, 0, ...]
-            bias = embed[:, 1, ...]
-            out = scale * out + bias
-        else:
-            out = out + embed
-        out = self.blocks[1](out)
-        out = out + self.residual_conv(x)
-        return out
-
-
-class ConditionalUnet1D(nn.Module):
-    def __init__(
-        self,
-        input_dim,
-        local_cond_dim=None,
-        global_cond_dim=None,
-        diffusion_step_embed_dim=256,
-        down_dims=None,
-        kernel_size=3,
-        n_groups=8,
-        cond_predict_scale=False,
-    ):
-        super().__init__()
-        if down_dims is None:
-            down_dims = [256, 512, 1024]
-
-        all_dims = [input_dim] + list(down_dims)
-        start_dim = down_dims[0]
-
-        dsed = diffusion_step_embed_dim
-        diffusion_step_encoder = nn.Sequential(
-            SinusoidalPosEmb(dsed),
-            nn.Linear(dsed, dsed * 4),
-            nn.Mish(),
-            nn.Linear(dsed * 4, dsed),
-        )
-        cond_dim = dsed
-        if global_cond_dim is not None:
-            cond_dim += global_cond_dim
-
-        in_out = list(zip(all_dims[:-1], all_dims[1:], strict=False))
-
-        local_cond_encoder = None
-        if local_cond_dim is not None:
-            _, dim_out = in_out[0]
-            dim_in = local_cond_dim
-            local_cond_encoder = nn.ModuleList(
-                [
-                    # down encoder
-                    ConditionalResidualBlock1D(
-                        dim_in,
-                        dim_out,
-                        cond_dim=cond_dim,
-                        kernel_size=kernel_size,
-                        n_groups=n_groups,
-                        cond_predict_scale=cond_predict_scale,
-                    ),
-                    # up encoder
-                    ConditionalResidualBlock1D(
-                        dim_in,
-                        dim_out,
-                        cond_dim=cond_dim,
-                        kernel_size=kernel_size,
-                        n_groups=n_groups,
-                        cond_predict_scale=cond_predict_scale,
-                    ),
-                ]
-            )
-
-        mid_dim = all_dims[-1]
-        self.mid_modules = nn.ModuleList(
-            [
-                ConditionalResidualBlock1D(
-                    mid_dim,
-                    mid_dim,
-                    cond_dim=cond_dim,
-                    kernel_size=kernel_size,
-                    n_groups=n_groups,
-                    cond_predict_scale=cond_predict_scale,
-                ),
-                ConditionalResidualBlock1D(
-                    mid_dim,
-                    mid_dim,
-                    cond_dim=cond_dim,
-                    kernel_size=kernel_size,
-                    n_groups=n_groups,
-                    cond_predict_scale=cond_predict_scale,
-                ),
-            ]
-        )
-
-        down_modules = nn.ModuleList([])
-        for ind, (dim_in, dim_out) in enumerate(in_out):
-            is_last = ind >= (len(in_out) - 1)
-            down_modules.append(
-                nn.ModuleList(
-                    [
-                        ConditionalResidualBlock1D(
-                            dim_in,
-                            dim_out,
-                            cond_dim=cond_dim,
-                            kernel_size=kernel_size,
-                            n_groups=n_groups,
-                            cond_predict_scale=cond_predict_scale,
-                        ),
-                        ConditionalResidualBlock1D(
-                            dim_out,
-                            dim_out,
-                            cond_dim=cond_dim,
-                            kernel_size=kernel_size,
-                            n_groups=n_groups,
-                            cond_predict_scale=cond_predict_scale,
-                        ),
-                        Downsample1d(dim_out) if not is_last else nn.Identity(),
-                    ]
-                )
-            )
-
-        up_modules = nn.ModuleList([])
-        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
-            is_last = ind >= (len(in_out) - 1)
-            up_modules.append(
-                nn.ModuleList(
-                    [
-                        ConditionalResidualBlock1D(
-                            dim_out * 2,
-                            dim_in,
-                            cond_dim=cond_dim,
-                            kernel_size=kernel_size,
-                            n_groups=n_groups,
-                            cond_predict_scale=cond_predict_scale,
-                        ),
-                        ConditionalResidualBlock1D(
-                            dim_in,
-                            dim_in,
-                            cond_dim=cond_dim,
-                            kernel_size=kernel_size,
-                            n_groups=n_groups,
-                            cond_predict_scale=cond_predict_scale,
-                        ),
-                        Upsample1d(dim_in) if not is_last else nn.Identity(),
-                    ]
-                )
-            )
-
-        final_conv = nn.Sequential(
-            Conv1dBlock(start_dim, start_dim, kernel_size=kernel_size),
-            nn.Conv1d(start_dim, input_dim, 1),
-        )
-
-        self.diffusion_step_encoder = diffusion_step_encoder
-        self.local_cond_encoder = local_cond_encoder
-        self.up_modules = up_modules
-        self.down_modules = down_modules
-        self.final_conv = final_conv
-
-        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
-
-    def forward(
-        self,
-        sample: torch.Tensor,
-        timestep: Union[torch.Tensor, float, int],
-        local_cond=None,
-        global_cond=None,
-        **kwargs,
-    ):
-        """
-        x: (B,T,input_dim)
-        timestep: (B,) or int, diffusion step
-        local_cond: (B,T,local_cond_dim)
-        global_cond: (B,global_cond_dim)
-        output: (B,T,input_dim)
-        """
-        sample = einops.rearrange(sample, "b h t -> b t h")
-
-        # 1. time
-        timesteps = timestep
-        if not torch.is_tensor(timesteps):
-            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
-            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
-        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
-            timesteps = timesteps[None].to(sample.device)
-        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timesteps.expand(sample.shape[0])
-
-        global_feature = self.diffusion_step_encoder(timesteps)
-
-        if global_cond is not None:
-            global_feature = torch.cat([global_feature, global_cond], axis=-1)
-
-        # encode local features
-        h_local = []
-        if local_cond is not None:
-            local_cond = einops.rearrange(local_cond, "b h t -> b t h")
-            resnet, resnet2 = self.local_cond_encoder
-            x = resnet(local_cond, global_feature)
-            h_local.append(x)
-            x = resnet2(local_cond, global_feature)
-            h_local.append(x)
-
-        x = sample
-        h = []
-        for idx, (resnet, resnet2, downsample) in enumerate(self.down_modules):
-            x = resnet(x, global_feature)
-            if idx == 0 and len(h_local) > 0:
-                x = x + h_local[0]
-            x = resnet2(x, global_feature)
-            h.append(x)
-            x = downsample(x)
-
-        for mid_module in self.mid_modules:
-            x = mid_module(x, global_feature)
-
-        for idx, (resnet, resnet2, upsample) in enumerate(self.up_modules):
-            x = torch.cat((x, h.pop()), dim=1)
-            x = resnet(x, global_feature)
-            # The correct condition should be:
-            # if idx == (len(self.up_modules)-1) and len(h_local) > 0:
-            # However this change will break compatibility with published checkpoints.
-            # Therefore it is left as a comment.
-            if idx == len(self.up_modules) and len(h_local) > 0:
-                x = x + h_local[1]
-            x = resnet2(x, global_feature)
-            x = upsample(x)
-
-        x = self.final_conv(x)
-
-        x = einops.rearrange(x, "b t h -> b h t")
-        return x
--- a/lerobot/common/policies/diffusion/model/conv1d_components.py
+++ b/lerobot/common/policies/diffusion/model/conv1d_components.py
@@ -1,47 +0,0 @@
-import torch.nn as nn
-
-# from einops.layers.torch import Rearrange
-
-
-class Downsample1d(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.conv = nn.Conv1d(dim, dim, 3, 2, 1)
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-class Upsample1d(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.conv = nn.ConvTranspose1d(dim, dim, 4, 2, 1)
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-class Conv1dBlock(nn.Module):
-    """
-    Conv1d --> GroupNorm --> Mish
-    """
-
-    def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
-        super().__init__()
-
-        self.block = nn.Sequential(
-            nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2),
-            # Rearrange('batch channels horizon -> batch channels 1 horizon'),
-            nn.GroupNorm(n_groups, out_channels),
-            # Rearrange('batch channels 1 horizon -> batch channels horizon'),
-            nn.Mish(),
-        )
-
-    def forward(self, x):
-        return self.block(x)
-
-
-# def test():
-#     cb = Conv1dBlock(256, 128, kernel_size=3)
-#     x = torch.zeros((1,256,16))
-#     o = cb(x)
--- a/lerobot/common/policies/diffusion/model/crop_randomizer.py
+++ b/lerobot/common/policies/diffusion/model/crop_randomizer.py
@@ -1,294 +0,0 @@
-import torch
-import torch.nn as nn
-import torchvision.transforms.functional as ttf
-
-import lerobot.common.policies.diffusion.model.tensor_utils as tu
-
-
-class CropRandomizer(nn.Module):
-    """
-    Randomly sample crops at input, and then average across crop features at output.
-    """
-
-    def __init__(
-        self,
-        input_shape,
-        crop_height,
-        crop_width,
-        num_crops=1,
-        pos_enc=False,
-    ):
-        """
-        Args:
-            input_shape (tuple, list): shape of input (not including batch dimension)
-            crop_height (int): crop height
-            crop_width (int): crop width
-            num_crops (int): number of random crops to take
-            pos_enc (bool): if True, add 2 channels to the output to encode the spatial
-                location of the cropped pixels in the source image
-        """
-        super().__init__()
-
-        assert len(input_shape) == 3  # (C, H, W)
-        assert crop_height < input_shape[1]
-        assert crop_width < input_shape[2]
-
-        self.input_shape = input_shape
-        self.crop_height = crop_height
-        self.crop_width = crop_width
-        self.num_crops = num_crops
-        self.pos_enc = pos_enc
-
-    def output_shape_in(self, input_shape=None):
-        """
-        Function to compute output shape from inputs to this module. Corresponds to
-        the @forward_in operation, where raw inputs (usually observation modalities)
-        are passed in.
-
-        Args:
-            input_shape (iterable of int): shape of input. Does not include batch dimension.
-                Some modules may not need this argument, if their output does not depend
-                on the size of the input, or if they assume fixed size input.
-
-        Returns:
-            out_shape ([int]): list of integers corresponding to output shape
-        """
-
-        # outputs are shape (C, CH, CW), or maybe C + 2 if using position encoding, because
-        # the number of crops are reshaped into the batch dimension, increasing the batch
-        # size from B to B * N
-        out_c = self.input_shape[0] + 2 if self.pos_enc else self.input_shape[0]
-        return [out_c, self.crop_height, self.crop_width]
-
-    def output_shape_out(self, input_shape=None):
-        """
-        Function to compute output shape from inputs to this module. Corresponds to
-        the @forward_out operation, where processed inputs (usually encoded observation
-        modalities) are passed in.
-
-        Args:
-            input_shape (iterable of int): shape of input. Does not include batch dimension.
-                Some modules may not need this argument, if their output does not depend
-                on the size of the input, or if they assume fixed size input.
-
-        Returns:
-            out_shape ([int]): list of integers corresponding to output shape
-        """
-
-        # since the forward_out operation splits [B * N, ...] -> [B, N, ...]
-        # and then pools to result in [B, ...], only the batch dimension changes,
-        # and so the other dimensions retain their shape.
-        return list(input_shape)
-
-    def forward_in(self, inputs):
-        """
-        Samples N random crops for each input in the batch, and then reshapes
-        inputs to [B * N, ...].
-        """
-        assert len(inputs.shape) >= 3  # must have at least (C, H, W) dimensions
-        if self.training:
-            # generate random crops
-            out, _ = sample_random_image_crops(
-                images=inputs,
-                crop_height=self.crop_height,
-                crop_width=self.crop_width,
-                num_crops=self.num_crops,
-                pos_enc=self.pos_enc,
-            )
-            # [B, N, ...] -> [B * N, ...]
-            return tu.join_dimensions(out, 0, 1)
-        else:
-            # take center crop during eval
-            out = ttf.center_crop(img=inputs, output_size=(self.crop_height, self.crop_width))
-            if self.num_crops > 1:
-                B, C, H, W = out.shape  # noqa: N806
-                out = out.unsqueeze(1).expand(B, self.num_crops, C, H, W).reshape(-1, C, H, W)
-                # [B * N, ...]
-            return out
-
-    def forward_out(self, inputs):
-        """
-        Splits the outputs from shape [B * N, ...] -> [B, N, ...] and then average across N
-        to result in shape [B, ...] to make sure the network output is consistent with
-        what would have happened if there were no randomization.
-        """
-        if self.num_crops <= 1:
-            return inputs
-        else:
-            batch_size = inputs.shape[0] // self.num_crops
-            out = tu.reshape_dimensions(
-                inputs, begin_axis=0, end_axis=0, target_dims=(batch_size, self.num_crops)
-            )
-            return out.mean(dim=1)
-
-    def forward(self, inputs):
-        return self.forward_in(inputs)
-
-    def __repr__(self):
-        """Pretty print network."""
-        header = "{}".format(str(self.__class__.__name__))
-        msg = header + "(input_shape={}, crop_size=[{}, {}], num_crops={})".format(
-            self.input_shape, self.crop_height, self.crop_width, self.num_crops
-        )
-        return msg
-
-
-def crop_image_from_indices(images, crop_indices, crop_height, crop_width):
-    """
-    Crops images at the locations specified by @crop_indices. Crops will be
-    taken across all channels.
-
-    Args:
-        images (torch.Tensor): batch of images of shape [..., C, H, W]
-
-        crop_indices (torch.Tensor): batch of indices of shape [..., N, 2] where
-            N is the number of crops to take per image and each entry corresponds
-            to the pixel height and width of where to take the crop. Note that
-            the indices can also be of shape [..., 2] if only 1 crop should
-            be taken per image. Leading dimensions must be consistent with
-            @images argument. Each index specifies the top left of the crop.
-            Values must be in range [0, H - CH - 1] x [0, W - CW - 1] where
-            H and W are the height and width of @images and CH and CW are
-            @crop_height and @crop_width.
-
-        crop_height (int): height of crop to take
-
-        crop_width (int): width of crop to take
-
-    Returns:
-        crops (torch.Tesnor): cropped images of shape [..., C, @crop_height, @crop_width]
-    """
-
-    # make sure length of input shapes is consistent
-    assert crop_indices.shape[-1] == 2
-    ndim_im_shape = len(images.shape)
-    ndim_indices_shape = len(crop_indices.shape)
-    assert (ndim_im_shape == ndim_indices_shape + 1) or (ndim_im_shape == ndim_indices_shape + 2)
-
-    # maybe pad so that @crop_indices is shape [..., N, 2]
-    is_padded = False
-    if ndim_im_shape == ndim_indices_shape + 2:
-        crop_indices = crop_indices.unsqueeze(-2)
-        is_padded = True
-
-    # make sure leading dimensions between images and indices are consistent
-    assert images.shape[:-3] == crop_indices.shape[:-2]
-
-    device = images.device
-    image_c, image_h, image_w = images.shape[-3:]
-    num_crops = crop_indices.shape[-2]
-
-    # make sure @crop_indices are in valid range
-    assert (crop_indices[..., 0] >= 0).all().item()
-    assert (crop_indices[..., 0] < (image_h - crop_height)).all().item()
-    assert (crop_indices[..., 1] >= 0).all().item()
-    assert (crop_indices[..., 1] < (image_w - crop_width)).all().item()
-
-    # convert each crop index (ch, cw) into a list of pixel indices that correspond to the entire window.
-
-    # 2D index array with columns [0, 1, ..., CH - 1] and shape [CH, CW]
-    crop_ind_grid_h = torch.arange(crop_height).to(device)
-    crop_ind_grid_h = tu.unsqueeze_expand_at(crop_ind_grid_h, size=crop_width, dim=-1)
-    # 2D index array with rows [0, 1, ..., CW - 1] and shape [CH, CW]
-    crop_ind_grid_w = torch.arange(crop_width).to(device)
-    crop_ind_grid_w = tu.unsqueeze_expand_at(crop_ind_grid_w, size=crop_height, dim=0)
-    # combine into shape [CH, CW, 2]
-    crop_in_grid = torch.cat((crop_ind_grid_h.unsqueeze(-1), crop_ind_grid_w.unsqueeze(-1)), dim=-1)
-
-    # Add above grid with the offset index of each sampled crop to get 2d indices for each crop.
-    # After broadcasting, this will be shape [..., N, CH, CW, 2] and each crop has a [CH, CW, 2]
-    # shape array that tells us which pixels from the corresponding source image to grab.
-    grid_reshape = [1] * len(crop_indices.shape[:-1]) + [crop_height, crop_width, 2]
-    all_crop_inds = crop_indices.unsqueeze(-2).unsqueeze(-2) + crop_in_grid.reshape(grid_reshape)
-
-    # For using @torch.gather, convert to flat indices from 2D indices, and also
-    # repeat across the channel dimension. To get flat index of each pixel to grab for
-    # each sampled crop, we just use the mapping: ind = h_ind * @image_w + w_ind
-    all_crop_inds = all_crop_inds[..., 0] * image_w + all_crop_inds[..., 1]  # shape [..., N, CH, CW]
-    all_crop_inds = tu.unsqueeze_expand_at(all_crop_inds, size=image_c, dim=-3)  # shape [..., N, C, CH, CW]
-    all_crop_inds = tu.flatten(all_crop_inds, begin_axis=-2)  # shape [..., N, C, CH * CW]
-
-    # Repeat and flatten the source images -> [..., N, C, H * W] and then use gather to index with crop pixel inds
-    images_to_crop = tu.unsqueeze_expand_at(images, size=num_crops, dim=-4)
-    images_to_crop = tu.flatten(images_to_crop, begin_axis=-2)
-    crops = torch.gather(images_to_crop, dim=-1, index=all_crop_inds)
-    # [..., N, C, CH * CW] -> [..., N, C, CH, CW]
-    reshape_axis = len(crops.shape) - 1
-    crops = tu.reshape_dimensions(
-        crops, begin_axis=reshape_axis, end_axis=reshape_axis, target_dims=(crop_height, crop_width)
-    )
-
-    if is_padded:
-        # undo padding -> [..., C, CH, CW]
-        crops = crops.squeeze(-4)
-    return crops
-
-
-def sample_random_image_crops(images, crop_height, crop_width, num_crops, pos_enc=False):
-    """
-    For each image, randomly sample @num_crops crops of size (@crop_height, @crop_width), from
-    @images.
-
-    Args:
-        images (torch.Tensor): batch of images of shape [..., C, H, W]
-
-        crop_height (int): height of crop to take
-
-        crop_width (int): width of crop to take
-
-        num_crops (n): number of crops to sample
-
-        pos_enc (bool): if True, also add 2 channels to the outputs that gives a spatial
-            encoding of the original source pixel locations. This means that the
-            output crops will contain information about where in the source image
-            it was sampled from.
-
-    Returns:
-        crops (torch.Tensor): crops of shape (..., @num_crops, C, @crop_height, @crop_width)
-            if @pos_enc is False, otherwise (..., @num_crops, C + 2, @crop_height, @crop_width)
-
-        crop_inds (torch.Tensor): sampled crop indices of shape (..., N, 2)
-    """
-    device = images.device
-
-    # maybe add 2 channels of spatial encoding to the source image
-    source_im = images
-    if pos_enc:
-        # spatial encoding [y, x] in [0, 1]
-        h, w = source_im.shape[-2:]
-        pos_y, pos_x = torch.meshgrid(torch.arange(h), torch.arange(w))
-        pos_y = pos_y.float().to(device) / float(h)
-        pos_x = pos_x.float().to(device) / float(w)
-        position_enc = torch.stack((pos_y, pos_x))  # shape [C, H, W]
-
-        # unsqueeze and expand to match leading dimensions -> shape [..., C, H, W]
-        leading_shape = source_im.shape[:-3]
-        position_enc = position_enc[(None,) * len(leading_shape)]
-        position_enc = position_enc.expand(*leading_shape, -1, -1, -1)
-
-        # concat across channel dimension with input
-        source_im = torch.cat((source_im, position_enc), dim=-3)
-
-    # make sure sample boundaries ensure crops are fully within the images
-    image_c, image_h, image_w = source_im.shape[-3:]
-    max_sample_h = image_h - crop_height
-    max_sample_w = image_w - crop_width
-
-    # Sample crop locations for all tensor dimensions up to the last 3, which are [C, H, W].
-    # Each gets @num_crops samples - typically this will just be the batch dimension (B), so
-    # we will sample [B, N] indices, but this supports having more than one leading dimension,
-    # or possibly no leading dimension.
-    #
-    # Trick: sample in [0, 1) with rand, then re-scale to [0, M) and convert to long to get sampled ints
-    crop_inds_h = (max_sample_h * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
-    crop_inds_w = (max_sample_w * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
-    crop_inds = torch.cat((crop_inds_h.unsqueeze(-1), crop_inds_w.unsqueeze(-1)), dim=-1)  # shape [..., N, 2]
-
-    crops = crop_image_from_indices(
-        images=source_im,
-        crop_indices=crop_inds,
-        crop_height=crop_height,
-        crop_width=crop_width,
-    )
-
-    return crops, crop_inds
--- a/lerobot/common/policies/diffusion/model/dict_of_tensor_mixin.py
+++ b/lerobot/common/policies/diffusion/model/dict_of_tensor_mixin.py
@@ -1,41 +0,0 @@
-import torch
-import torch.nn as nn
-
-
-class DictOfTensorMixin(nn.Module):
-    def __init__(self, params_dict=None):
-        super().__init__()
-        if params_dict is None:
-            params_dict = nn.ParameterDict()
-        self.params_dict = params_dict
-
-    @property
-    def device(self):
-        return next(iter(self.parameters())).device
-
-    def _load_from_state_dict(
-        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-    ):
-        def dfs_add(dest, keys, value: torch.Tensor):
-            if len(keys) == 1:
-                dest[keys[0]] = value
-                return
-
-            if keys[0] not in dest:
-                dest[keys[0]] = nn.ParameterDict()
-            dfs_add(dest[keys[0]], keys[1:], value)
-
-        def load_dict(state_dict, prefix):
-            out_dict = nn.ParameterDict()
-            for key, value in state_dict.items():
-                value: torch.Tensor
-                if key.startswith(prefix):
-                    param_keys = key[len(prefix) :].split(".")[1:]
-                    # if len(param_keys) == 0:
-                    #     import pdb; pdb.set_trace()
-                    dfs_add(out_dict, param_keys, value.clone())
-            return out_dict
-
-        self.params_dict = load_dict(state_dict, prefix + "params_dict")
-        self.params_dict.requires_grad_(False)
-        return
--- a/lerobot/common/policies/diffusion/model/ema_model.py
+++ b/lerobot/common/policies/diffusion/model/ema_model.py
@@ -1,84 +0,0 @@
-import torch
-from torch.nn.modules.batchnorm import _BatchNorm
-
-
-class EMAModel:
-    """
-    Exponential Moving Average of models weights
-    """
-
-    def __init__(
-        self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999
-    ):
-        """
-        @crowsonkb's notes on EMA Warmup:
-            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
-            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
-            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
-            at 215.4k steps).
-        Args:
-            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
-            power (float): Exponential factor of EMA warmup. Default: 2/3.
-            min_value (float): The minimum EMA decay rate. Default: 0.
-        """
-
-        self.averaged_model = model
-        self.averaged_model.eval()
-        self.averaged_model.requires_grad_(False)
-
-        self.update_after_step = update_after_step
-        self.inv_gamma = inv_gamma
-        self.power = power
-        self.min_value = min_value
-        self.max_value = max_value
-
-        self.decay = 0.0
-        self.optimization_step = 0
-
-    def get_decay(self, optimization_step):
-        """
-        Compute the decay factor for the exponential moving average.
-        """
-        step = max(0, optimization_step - self.update_after_step - 1)
-        value = 1 - (1 + step / self.inv_gamma) ** -self.power
-
-        if step <= 0:
-            return 0.0
-
-        return max(self.min_value, min(value, self.max_value))
-
-    @torch.no_grad()
-    def step(self, new_model):
-        self.decay = self.get_decay(self.optimization_step)
-
-        # old_all_dataptrs = set()
-        # for param in new_model.parameters():
-        #     data_ptr = param.data_ptr()
-        #     if data_ptr != 0:
-        #         old_all_dataptrs.add(data_ptr)
-
-        # all_dataptrs = set()
-        for module, ema_module in zip(new_model.modules(), self.averaged_model.modules(), strict=False):
-            for param, ema_param in zip(
-                module.parameters(recurse=False), ema_module.parameters(recurse=False), strict=False
-            ):
-                # iterative over immediate parameters only.
-                if isinstance(param, dict):
-                    raise RuntimeError("Dict parameter not supported")
-
-                # data_ptr = param.data_ptr()
-                # if data_ptr != 0:
-                #     all_dataptrs.add(data_ptr)
-
-                if isinstance(module, _BatchNorm):
-                    # skip batchnorms
-                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
-                elif not param.requires_grad:
-                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
-                else:
-                    ema_param.mul_(self.decay)
-                    ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
-
-        # verify that iterating over module and then parameters is identical to parameters recursively.
-        # assert old_all_dataptrs == all_dataptrs
-        self.optimization_step += 1
--- a/lerobot/common/policies/diffusion/model/lr_scheduler.py
+++ b/lerobot/common/policies/diffusion/model/lr_scheduler.py
@@ -1,46 +0,0 @@
-from diffusers.optimization import TYPE_TO_SCHEDULER_FUNCTION, Optimizer, Optional, SchedulerType, Union
-
-
-def get_scheduler(
-    name: Union[str, SchedulerType],
-    optimizer: Optimizer,
-    num_warmup_steps: Optional[int] = None,
-    num_training_steps: Optional[int] = None,
-    **kwargs,
-):
-    """
-    Added kwargs vs diffuser's original implementation
-
-    Unified API to get any scheduler from its name.
-
-    Args:
-        name (`str` or `SchedulerType`):
-            The name of the scheduler to use.
-        optimizer (`torch.optim.Optimizer`):
-            The optimizer that will be used during training.
-        num_warmup_steps (`int`, *optional*):
-            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
-            optional), the function will raise an error if it's unset and the scheduler type requires it.
-        num_training_steps (`int``, *optional*):
-            The number of training steps to do. This is not required by all schedulers (hence the argument being
-            optional), the function will raise an error if it's unset and the scheduler type requires it.
-    """
-    name = SchedulerType(name)
-    schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
-    if name == SchedulerType.CONSTANT:
-        return schedule_func(optimizer, **kwargs)
-
-    # All other schedulers require `num_warmup_steps`
-    if num_warmup_steps is None:
-        raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
-
-    if name == SchedulerType.CONSTANT_WITH_WARMUP:
-        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **kwargs)
-
-    # All other schedulers require `num_training_steps`
-    if num_training_steps is None:
-        raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
-
-    return schedule_func(
-        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, **kwargs
-    )
--- a/lerobot/common/policies/diffusion/model/mask_generator.py
+++ b/lerobot/common/policies/diffusion/model/mask_generator.py
@@ -1,65 +0,0 @@
-import torch
-
-from lerobot.common.policies.diffusion.model.module_attr_mixin import ModuleAttrMixin
-
-
-class LowdimMaskGenerator(ModuleAttrMixin):
-    def __init__(
-        self,
-        action_dim,
-        obs_dim,
-        # obs mask setup
-        max_n_obs_steps=2,
-        fix_obs_steps=True,
-        # action mask
-        action_visible=False,
-    ):
-        super().__init__()
-        self.action_dim = action_dim
-        self.obs_dim = obs_dim
-        self.max_n_obs_steps = max_n_obs_steps
-        self.fix_obs_steps = fix_obs_steps
-        self.action_visible = action_visible
-
-    @torch.no_grad()
-    def forward(self, shape, seed=None):
-        device = self.device
-        B, T, D = shape  # noqa: N806
-        assert (self.action_dim + self.obs_dim) == D
-
-        # create all tensors on this device
-        rng = torch.Generator(device=device)
-        if seed is not None:
-            rng = rng.manual_seed(seed)
-
-        # generate dim mask
-        dim_mask = torch.zeros(size=shape, dtype=torch.bool, device=device)
-        is_action_dim = dim_mask.clone()
-        is_action_dim[..., : self.action_dim] = True
-        is_obs_dim = ~is_action_dim
-
-        # generate obs mask
-        if self.fix_obs_steps:
-            obs_steps = torch.full((B,), fill_value=self.max_n_obs_steps, device=device)
-        else:
-            obs_steps = torch.randint(
-                low=1, high=self.max_n_obs_steps + 1, size=(B,), generator=rng, device=device
-            )
-
-        steps = torch.arange(0, T, device=device).reshape(1, T).expand(B, T)
-        obs_mask = (obs_steps > steps.T).T.reshape(B, T, 1).expand(B, T, D)
-        obs_mask = obs_mask & is_obs_dim
-
-        # generate action mask
-        if self.action_visible:
-            action_steps = torch.maximum(
-                obs_steps - 1, torch.tensor(0, dtype=obs_steps.dtype, device=obs_steps.device)
-            )
-            action_mask = (action_steps > steps.T).T.reshape(B, T, 1).expand(B, T, D)
-            action_mask = action_mask & is_action_dim
-
-        mask = obs_mask
-        if self.action_visible:
-            mask = mask | action_mask
-
-        return mask
--- a/lerobot/common/policies/diffusion/model/module_attr_mixin.py
+++ b/lerobot/common/policies/diffusion/model/module_attr_mixin.py
@@ -1,15 +0,0 @@
-import torch.nn as nn
-
-
-class ModuleAttrMixin(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self._dummy_variable = nn.Parameter()
-
-    @property
-    def device(self):
-        return next(iter(self.parameters())).device
-
-    @property
-    def dtype(self):
-        return next(iter(self.parameters())).dtype
--- a/lerobot/common/policies/diffusion/model/normalizer.py
+++ b/lerobot/common/policies/diffusion/model/normalizer.py
@@ -1,358 +0,0 @@
-from typing import Dict, Union
-
-import numpy as np
-import torch
-import torch.nn as nn
-import zarr
-
-from lerobot.common.policies.diffusion.model.dict_of_tensor_mixin import DictOfTensorMixin
-from lerobot.common.policies.diffusion.pytorch_utils import dict_apply
-
-
-class LinearNormalizer(DictOfTensorMixin):
-    avaliable_modes = ["limits", "gaussian"]
-
-    @torch.no_grad()
-    def fit(
-        self,
-        data: Union[Dict, torch.Tensor, np.ndarray, zarr.Array],
-        last_n_dims=1,
-        dtype=torch.float32,
-        mode="limits",
-        output_max=1.0,
-        output_min=-1.0,
-        range_eps=1e-4,
-        fit_offset=True,
-    ):
-        if isinstance(data, dict):
-            for key, value in data.items():
-                self.params_dict[key] = _fit(
-                    value,
-                    last_n_dims=last_n_dims,
-                    dtype=dtype,
-                    mode=mode,
-                    output_max=output_max,
-                    output_min=output_min,
-                    range_eps=range_eps,
-                    fit_offset=fit_offset,
-                )
-        else:
-            self.params_dict["_default"] = _fit(
-                data,
-                last_n_dims=last_n_dims,
-                dtype=dtype,
-                mode=mode,
-                output_max=output_max,
-                output_min=output_min,
-                range_eps=range_eps,
-                fit_offset=fit_offset,
-            )
-
-    def __call__(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self.normalize(x)
-
-    def __getitem__(self, key: str):
-        return SingleFieldLinearNormalizer(self.params_dict[key])
-
-    def __setitem__(self, key: str, value: "SingleFieldLinearNormalizer"):
-        self.params_dict[key] = value.params_dict
-
-    def _normalize_impl(self, x, forward=True):
-        if isinstance(x, dict):
-            result = {}
-            for key, value in x.items():
-                params = self.params_dict[key]
-                result[key] = _normalize(value, params, forward=forward)
-            return result
-        else:
-            if "_default" not in self.params_dict:
-                raise RuntimeError("Not initialized")
-            params = self.params_dict["_default"]
-            return _normalize(x, params, forward=forward)
-
-    def normalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self._normalize_impl(x, forward=True)
-
-    def unnormalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self._normalize_impl(x, forward=False)
-
-    def get_input_stats(self) -> Dict:
-        if len(self.params_dict) == 0:
-            raise RuntimeError("Not initialized")
-        if len(self.params_dict) == 1 and "_default" in self.params_dict:
-            return self.params_dict["_default"]["input_stats"]
-
-        result = {}
-        for key, value in self.params_dict.items():
-            if key != "_default":
-                result[key] = value["input_stats"]
-        return result
-
-    def get_output_stats(self, key="_default"):
-        input_stats = self.get_input_stats()
-        if "min" in input_stats:
-            # no dict
-            return dict_apply(input_stats, self.normalize)
-
-        result = {}
-        for key, group in input_stats.items():
-            this_dict = {}
-            for name, value in group.items():
-                this_dict[name] = self.normalize({key: value})[key]
-            result[key] = this_dict
-        return result
-
-
-class SingleFieldLinearNormalizer(DictOfTensorMixin):
-    avaliable_modes = ["limits", "gaussian"]
-
-    @torch.no_grad()
-    def fit(
-        self,
-        data: Union[torch.Tensor, np.ndarray, zarr.Array],
-        last_n_dims=1,
-        dtype=torch.float32,
-        mode="limits",
-        output_max=1.0,
-        output_min=-1.0,
-        range_eps=1e-4,
-        fit_offset=True,
-    ):
-        self.params_dict = _fit(
-            data,
-            last_n_dims=last_n_dims,
-            dtype=dtype,
-            mode=mode,
-            output_max=output_max,
-            output_min=output_min,
-            range_eps=range_eps,
-            fit_offset=fit_offset,
-        )
-
-    @classmethod
-    def create_fit(cls, data: Union[torch.Tensor, np.ndarray, zarr.Array], **kwargs):
-        obj = cls()
-        obj.fit(data, **kwargs)
-        return obj
-
-    @classmethod
-    def create_manual(
-        cls,
-        scale: Union[torch.Tensor, np.ndarray],
-        offset: Union[torch.Tensor, np.ndarray],
-        input_stats_dict: Dict[str, Union[torch.Tensor, np.ndarray]],
-    ):
-        def to_tensor(x):
-            if not isinstance(x, torch.Tensor):
-                x = torch.from_numpy(x)
-            x = x.flatten()
-            return x
-
-        # check
-        for x in [offset] + list(input_stats_dict.values()):
-            assert x.shape == scale.shape
-            assert x.dtype == scale.dtype
-
-        params_dict = nn.ParameterDict(
-            {
-                "scale": to_tensor(scale),
-                "offset": to_tensor(offset),
-                "input_stats": nn.ParameterDict(dict_apply(input_stats_dict, to_tensor)),
-            }
-        )
-        return cls(params_dict)
-
-    @classmethod
-    def create_identity(cls, dtype=torch.float32):
-        scale = torch.tensor([1], dtype=dtype)
-        offset = torch.tensor([0], dtype=dtype)
-        input_stats_dict = {
-            "min": torch.tensor([-1], dtype=dtype),
-            "max": torch.tensor([1], dtype=dtype),
-            "mean": torch.tensor([0], dtype=dtype),
-            "std": torch.tensor([1], dtype=dtype),
-        }
-        return cls.create_manual(scale, offset, input_stats_dict)
-
-    def normalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return _normalize(x, self.params_dict, forward=True)
-
-    def unnormalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return _normalize(x, self.params_dict, forward=False)
-
-    def get_input_stats(self):
-        return self.params_dict["input_stats"]
-
-    def get_output_stats(self):
-        return dict_apply(self.params_dict["input_stats"], self.normalize)
-
-    def __call__(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self.normalize(x)
-
-
-def _fit(
-    data: Union[torch.Tensor, np.ndarray, zarr.Array],
-    last_n_dims=1,
-    dtype=torch.float32,
-    mode="limits",
-    output_max=1.0,
-    output_min=-1.0,
-    range_eps=1e-4,
-    fit_offset=True,
-):
-    assert mode in ["limits", "gaussian"]
-    assert last_n_dims >= 0
-    assert output_max > output_min
-
-    # convert data to torch and type
-    if isinstance(data, zarr.Array):
-        data = data[:]
-    if isinstance(data, np.ndarray):
-        data = torch.from_numpy(data)
-    if dtype is not None:
-        data = data.type(dtype)
-
-    # convert shape
-    dim = 1
-    if last_n_dims > 0:
-        dim = np.prod(data.shape[-last_n_dims:])
-    data = data.reshape(-1, dim)
-
-    # compute input stats min max mean std
-    input_min, _ = data.min(axis=0)
-    input_max, _ = data.max(axis=0)
-    input_mean = data.mean(axis=0)
-    input_std = data.std(axis=0)
-
-    # compute scale and offset
-    if mode == "limits":
-        if fit_offset:
-            # unit scale
-            input_range = input_max - input_min
-            ignore_dim = input_range < range_eps
-            input_range[ignore_dim] = output_max - output_min
-            scale = (output_max - output_min) / input_range
-            offset = output_min - scale * input_min
-            offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim]
-            # ignore dims scaled to mean of output max and min
-        else:
-            # use this when data is pre-zero-centered.
-            assert output_max > 0
-            assert output_min < 0
-            # unit abs
-            output_abs = min(abs(output_min), abs(output_max))
-            input_abs = torch.maximum(torch.abs(input_min), torch.abs(input_max))
-            ignore_dim = input_abs < range_eps
-            input_abs[ignore_dim] = output_abs
-            # don't scale constant channels
-            scale = output_abs / input_abs
-            offset = torch.zeros_like(input_mean)
-    elif mode == "gaussian":
-        ignore_dim = input_std < range_eps
-        scale = input_std.clone()
-        scale[ignore_dim] = 1
-        scale = 1 / scale
-
-        offset = -input_mean * scale if fit_offset else torch.zeros_like(input_mean)
-
-    # save
-    this_params = nn.ParameterDict(
-        {
-            "scale": scale,
-            "offset": offset,
-            "input_stats": nn.ParameterDict(
-                {"min": input_min, "max": input_max, "mean": input_mean, "std": input_std}
-            ),
-        }
-    )
-    for p in this_params.parameters():
-        p.requires_grad_(False)
-    return this_params
-
-
-def _normalize(x, params, forward=True):
-    assert "scale" in params
-    if isinstance(x, np.ndarray):
-        x = torch.from_numpy(x)
-    scale = params["scale"]
-    offset = params["offset"]
-    x = x.to(device=scale.device, dtype=scale.dtype)
-    src_shape = x.shape
-    x = x.reshape(-1, scale.shape[0])
-    x = x * scale + offset if forward else (x - offset) / scale
-    x = x.reshape(src_shape)
-    return x
-
-
-def test():
-    data = torch.zeros((100, 10, 9, 2)).uniform_()
-    data[..., 0, 0] = 0
-
-    normalizer = SingleFieldLinearNormalizer()
-    normalizer.fit(data, mode="limits", last_n_dims=2)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.max(), 1.0)
-    assert np.allclose(datan.min(), -1.0)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    _ = normalizer.get_input_stats()
-    _ = normalizer.get_output_stats()
-
-    normalizer = SingleFieldLinearNormalizer()
-    normalizer.fit(data, mode="limits", last_n_dims=1, fit_offset=False)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.max(), 1.0, atol=1e-3)
-    assert np.allclose(datan.min(), 0.0, atol=1e-3)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    data = torch.zeros((100, 10, 9, 2)).uniform_()
-    normalizer = SingleFieldLinearNormalizer()
-    normalizer.fit(data, mode="gaussian", last_n_dims=0)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.mean(), 0.0, atol=1e-3)
-    assert np.allclose(datan.std(), 1.0, atol=1e-3)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    # dict
-    data = torch.zeros((100, 10, 9, 2)).uniform_()
-    data[..., 0, 0] = 0
-
-    normalizer = LinearNormalizer()
-    normalizer.fit(data, mode="limits", last_n_dims=2)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.max(), 1.0)
-    assert np.allclose(datan.min(), -1.0)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    _ = normalizer.get_input_stats()
-    _ = normalizer.get_output_stats()
-
-    data = {
-        "obs": torch.zeros((1000, 128, 9, 2)).uniform_() * 512,
-        "action": torch.zeros((1000, 128, 2)).uniform_() * 512,
-    }
-    normalizer = LinearNormalizer()
-    normalizer.fit(data)
-    datan = normalizer.normalize(data)
-    dataun = normalizer.unnormalize(datan)
-    for key in data:
-        assert torch.allclose(data[key], dataun[key], atol=1e-4)
-
-    _ = normalizer.get_input_stats()
-    _ = normalizer.get_output_stats()
-
-    state_dict = normalizer.state_dict()
-    n = LinearNormalizer()
-    n.load_state_dict(state_dict)
-    datan = n.normalize(data)
-    dataun = n.unnormalize(datan)
-    for key in data:
-        assert torch.allclose(data[key], dataun[key], atol=1e-4)
--- a/lerobot/common/policies/diffusion/model/positional_embedding.py
+++ b/lerobot/common/policies/diffusion/model/positional_embedding.py
@@ -1,19 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-
-
-class SinusoidalPosEmb(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-
-    def forward(self, x):
-        device = x.device
-        half_dim = self.dim // 2
-        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
-        emb = x[:, None] * emb[None, :]
-        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
-        return emb
--- a/lerobot/common/policies/diffusion/model/tensor_utils.py
+++ b/lerobot/common/policies/diffusion/model/tensor_utils.py
@@ -1,972 +0,0 @@
-"""
-A collection of utilities for working with nested tensor structures consisting
-of numpy arrays and torch tensors.
-"""
-
-import collections
-
-import numpy as np
-import torch
-
-
-def recursive_dict_list_tuple_apply(x, type_func_dict):
-    """
-    Recursively apply functions to a nested dictionary or list or tuple, given a dictionary of
-    {data_type: function_to_apply}.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        type_func_dict (dict): a mapping from data types to the functions to be
-            applied for each data type.
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    assert list not in type_func_dict
-    assert tuple not in type_func_dict
-    assert dict not in type_func_dict
-
-    if isinstance(x, (dict, collections.OrderedDict)):
-        new_x = collections.OrderedDict() if isinstance(x, collections.OrderedDict) else {}
-        for k, v in x.items():
-            new_x[k] = recursive_dict_list_tuple_apply(v, type_func_dict)
-        return new_x
-    elif isinstance(x, (list, tuple)):
-        ret = [recursive_dict_list_tuple_apply(v, type_func_dict) for v in x]
-        if isinstance(x, tuple):
-            ret = tuple(ret)
-        return ret
-    else:
-        for t, f in type_func_dict.items():
-            if isinstance(x, t):
-                return f(x)
-        else:
-            raise NotImplementedError("Cannot handle data type %s" % str(type(x)))
-
-
-def map_tensor(x, func):
-    """
-    Apply function @func to torch.Tensor objects in a nested dictionary or
-    list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        func (function): function to apply to each tensor
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: func,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def map_ndarray(x, func):
-    """
-    Apply function @func to np.ndarray objects in a nested dictionary or
-    list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        func (function): function to apply to each array
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            np.ndarray: func,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def map_tensor_ndarray(x, tensor_func, ndarray_func):
-    """
-    Apply function @tensor_func to torch.Tensor objects and @ndarray_func to
-    np.ndarray objects in a nested dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        tensor_func (function): function to apply to each tensor
-        ndarray_Func (function): function to apply to each array
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: tensor_func,
-            np.ndarray: ndarray_func,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def clone(x):
-    """
-    Clones all torch tensors and numpy arrays in nested dictionary or list
-    or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.clone(),
-            np.ndarray: lambda x: x.copy(),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def detach(x):
-    """
-    Detaches all torch tensors in nested dictionary or list
-    or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.detach(),
-        },
-    )
-
-
-def to_batch(x):
-    """
-    Introduces a leading batch dimension of 1 for all torch tensors and numpy
-    arrays in nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x[None, ...],
-            np.ndarray: lambda x: x[None, ...],
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_sequence(x):
-    """
-    Introduces a time dimension of 1 at dimension 1 for all torch tensors and numpy
-    arrays in nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x[:, None, ...],
-            np.ndarray: lambda x: x[:, None, ...],
-            type(None): lambda x: x,
-        },
-    )
-
-
-def index_at_time(x, ind):
-    """
-    Indexes all torch tensors and numpy arrays in dimension 1 with index @ind in
-    nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        ind (int): index
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x[:, ind, ...],
-            np.ndarray: lambda x: x[:, ind, ...],
-            type(None): lambda x: x,
-        },
-    )
-
-
-def unsqueeze(x, dim):
-    """
-    Adds dimension of size 1 at dimension @dim in all torch tensors and numpy arrays
-    in nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        dim (int): dimension
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.unsqueeze(dim=dim),
-            np.ndarray: lambda x: np.expand_dims(x, axis=dim),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def contiguous(x):
-    """
-    Makes all torch tensors and numpy arrays contiguous in nested dictionary or
-    list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.contiguous(),
-            np.ndarray: lambda x: np.ascontiguousarray(x),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_device(x, device):
-    """
-    Sends all torch tensors in nested dictionary or list or tuple to device
-    @device, and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        device (torch.Device): device to send tensors to
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, d=device: x.to(d),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_tensor(x):
-    """
-    Converts all numpy arrays in nested dictionary or list or tuple to
-    torch tensors (and leaves existing torch Tensors as-is), and returns
-    a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x,
-            np.ndarray: lambda x: torch.from_numpy(x),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_numpy(x):
-    """
-    Converts all torch tensors in nested dictionary or list or tuple to
-    numpy (and leaves existing numpy arrays as-is), and returns
-    a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-
-    def f(tensor):
-        if tensor.is_cuda:
-            return tensor.detach().cpu().numpy()
-        else:
-            return tensor.detach().numpy()
-
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: f,
-            np.ndarray: lambda x: x,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_list(x):
-    """
-    Converts all torch tensors and numpy arrays in nested dictionary or list
-    or tuple to a list, and returns a new nested structure. Useful for
-    json encoding.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-
-    def f(tensor):
-        if tensor.is_cuda:
-            return tensor.detach().cpu().numpy().tolist()
-        else:
-            return tensor.detach().numpy().tolist()
-
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: f,
-            np.ndarray: lambda x: x.tolist(),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_float(x):
-    """
-    Converts all torch tensors and numpy arrays in nested dictionary or list
-    or tuple to float type entries, and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.float(),
-            np.ndarray: lambda x: x.astype(np.float32),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_uint8(x):
-    """
-    Converts all torch tensors and numpy arrays in nested dictionary or list
-    or tuple to uint8 type entries, and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.byte(),
-            np.ndarray: lambda x: x.astype(np.uint8),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_torch(x, device):
-    """
-    Converts all numpy arrays and torch tensors in nested dictionary or list or tuple to
-    torch tensors on device @device and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        device (torch.Device): device to send tensors to
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return to_device(to_float(to_tensor(x)), device)
-
-
-def to_one_hot_single(tensor, num_class):
-    """
-    Convert tensor to one-hot representation, assuming a certain number of total class labels.
-
-    Args:
-        tensor (torch.Tensor): tensor containing integer labels
-        num_class (int): number of classes
-
-    Returns:
-        x (torch.Tensor): tensor containing one-hot representation of labels
-    """
-    x = torch.zeros(tensor.size() + (num_class,)).to(tensor.device)
-    x.scatter_(-1, tensor.unsqueeze(-1), 1)
-    return x
-
-
-def to_one_hot(tensor, num_class):
-    """
-    Convert all tensors in nested dictionary or list or tuple to one-hot representation,
-    assuming a certain number of total class labels.
-
-    Args:
-        tensor (dict or list or tuple): a possibly nested dictionary or list or tuple
-        num_class (int): number of classes
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(tensor, func=lambda x, nc=num_class: to_one_hot_single(x, nc))
-
-
-def flatten_single(x, begin_axis=1):
-    """
-    Flatten a tensor in all dimensions from @begin_axis onwards.
-
-    Args:
-        x (torch.Tensor): tensor to flatten
-        begin_axis (int): which axis to flatten from
-
-    Returns:
-        y (torch.Tensor): flattened tensor
-    """
-    fixed_size = x.size()[:begin_axis]
-    _s = list(fixed_size) + [-1]
-    return x.reshape(*_s)
-
-
-def flatten(x, begin_axis=1):
-    """
-    Flatten all tensors in nested dictionary or list or tuple, from @begin_axis onwards.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        begin_axis (int): which axis to flatten from
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, b=begin_axis: flatten_single(x, begin_axis=b),
-        },
-    )
-
-
-def reshape_dimensions_single(x, begin_axis, end_axis, target_dims):
-    """
-    Reshape selected dimensions in a tensor to a target dimension.
-
-    Args:
-        x (torch.Tensor): tensor to reshape
-        begin_axis (int): begin dimension
-        end_axis (int): end dimension
-        target_dims (tuple or list): target shape for the range of dimensions
-            (@begin_axis, @end_axis)
-
-    Returns:
-        y (torch.Tensor): reshaped tensor
-    """
-    assert begin_axis <= end_axis
-    assert begin_axis >= 0
-    assert end_axis < len(x.shape)
-    assert isinstance(target_dims, (tuple, list))
-    s = x.shape
-    final_s = []
-    for i in range(len(s)):
-        if i == begin_axis:
-            final_s.extend(target_dims)
-        elif i < begin_axis or i > end_axis:
-            final_s.append(s[i])
-    return x.reshape(*final_s)
-
-
-def reshape_dimensions(x, begin_axis, end_axis, target_dims):
-    """
-    Reshape selected dimensions for all tensors in nested dictionary or list or tuple
-    to a target dimension.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        begin_axis (int): begin dimension
-        end_axis (int): end dimension
-        target_dims (tuple or list): target shape for the range of dimensions
-            (@begin_axis, @end_axis)
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=t
-            ),
-            np.ndarray: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=t
-            ),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def join_dimensions(x, begin_axis, end_axis):
-    """
-    Joins all dimensions between dimensions (@begin_axis, @end_axis) into a flat dimension, for
-    all tensors in nested dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        begin_axis (int): begin dimension
-        end_axis (int): end dimension
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=[-1]
-            ),
-            np.ndarray: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=[-1]
-            ),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def expand_at_single(x, size, dim):
-    """
-    Expand a tensor at a single dimension @dim by @size
-
-    Args:
-        x (torch.Tensor): input tensor
-        size (int): size to expand
-        dim (int): dimension to expand
-
-    Returns:
-        y (torch.Tensor): expanded tensor
-    """
-    assert dim < x.ndimension()
-    assert x.shape[dim] == 1
-    expand_dims = [-1] * x.ndimension()
-    expand_dims[dim] = size
-    return x.expand(*expand_dims)
-
-
-def expand_at(x, size, dim):
-    """
-    Expand all tensors in nested dictionary or list or tuple at a single
-    dimension @dim by @size.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        size (int): size to expand
-        dim (int): dimension to expand
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(x, lambda t, s=size, d=dim: expand_at_single(t, s, d))
-
-
-def unsqueeze_expand_at(x, size, dim):
-    """
-    Unsqueeze and expand a tensor at a dimension @dim by @size.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        size (int): size to expand
-        dim (int): dimension to unsqueeze and expand
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    x = unsqueeze(x, dim)
-    return expand_at(x, size, dim)
-
-
-def repeat_by_expand_at(x, repeats, dim):
-    """
-    Repeat a dimension by combining expand and reshape operations.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        repeats (int): number of times to repeat the target dimension
-        dim (int): dimension to repeat on
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    x = unsqueeze_expand_at(x, repeats, dim + 1)
-    return join_dimensions(x, dim, dim + 1)
-
-
-def named_reduce_single(x, reduction, dim):
-    """
-    Reduce tensor at a dimension by named reduction functions.
-
-    Args:
-        x (torch.Tensor): tensor to be reduced
-        reduction (str): one of ["sum", "max", "mean", "flatten"]
-        dim (int): dimension to be reduced (or begin axis for flatten)
-
-    Returns:
-        y (torch.Tensor): reduced tensor
-    """
-    assert x.ndimension() > dim
-    assert reduction in ["sum", "max", "mean", "flatten"]
-    if reduction == "flatten":
-        x = flatten(x, begin_axis=dim)
-    elif reduction == "max":
-        x = torch.max(x, dim=dim)[0]  # [B, D]
-    elif reduction == "sum":
-        x = torch.sum(x, dim=dim)
-    else:
-        x = torch.mean(x, dim=dim)
-    return x
-
-
-def named_reduce(x, reduction, dim):
-    """
-    Reduces all tensors in nested dictionary or list or tuple at a dimension
-    using a named reduction function.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        reduction (str): one of ["sum", "max", "mean", "flatten"]
-        dim (int): dimension to be reduced (or begin axis for flatten)
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(x, func=lambda t, r=reduction, d=dim: named_reduce_single(t, r, d))
-
-
-def gather_along_dim_with_dim_single(x, target_dim, source_dim, indices):
-    """
-    This function indexes out a target dimension of a tensor in a structured way,
-    by allowing a different value to be selected for each member of a flat index
-    tensor (@indices) corresponding to a source dimension. This can be interpreted
-    as moving along the source dimension, using the corresponding index value
-    in @indices to select values for all other dimensions outside of the
-    source and target dimensions. A common use case is to gather values
-    in target dimension 1 for each batch member (target dimension 0).
-
-    Args:
-        x (torch.Tensor): tensor to gather values for
-        target_dim (int): dimension to gather values along
-        source_dim (int): dimension to hold constant and use for gathering values
-            from the other dimensions
-        indices (torch.Tensor): flat index tensor with same shape as tensor @x along
-            @source_dim
-
-    Returns:
-        y (torch.Tensor): gathered tensor, with dimension @target_dim indexed out
-    """
-    assert len(indices.shape) == 1
-    assert x.shape[source_dim] == indices.shape[0]
-
-    # unsqueeze in all dimensions except the source dimension
-    new_shape = [1] * x.ndimension()
-    new_shape[source_dim] = -1
-    indices = indices.reshape(*new_shape)
-
-    # repeat in all dimensions - but preserve shape of source dimension,
-    # and make sure target_dimension has singleton dimension
-    expand_shape = list(x.shape)
-    expand_shape[source_dim] = -1
-    expand_shape[target_dim] = 1
-    indices = indices.expand(*expand_shape)
-
-    out = x.gather(dim=target_dim, index=indices)
-    return out.squeeze(target_dim)
-
-
-def gather_along_dim_with_dim(x, target_dim, source_dim, indices):
-    """
-    Apply @gather_along_dim_with_dim_single to all tensors in a nested
-    dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        target_dim (int): dimension to gather values along
-        source_dim (int): dimension to hold constant and use for gathering values
-            from the other dimensions
-        indices (torch.Tensor): flat index tensor with same shape as tensor @x along
-            @source_dim
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(
-        x, lambda y, t=target_dim, s=source_dim, i=indices: gather_along_dim_with_dim_single(y, t, s, i)
-    )
-
-
-def gather_sequence_single(seq, indices):
-    """
-    Given a tensor with leading dimensions [B, T, ...], gather an element from each sequence in
-    the batch given an index for each sequence.
-
-    Args:
-        seq (torch.Tensor): tensor with leading dimensions [B, T, ...]
-        indices (torch.Tensor): tensor indices of shape [B]
-
-    Return:
-        y (torch.Tensor): indexed tensor of shape [B, ....]
-    """
-    return gather_along_dim_with_dim_single(seq, target_dim=1, source_dim=0, indices=indices)
-
-
-def gather_sequence(seq, indices):
-    """
-    Given a nested dictionary or list or tuple, gathers an element from each sequence of the batch
-    for tensors with leading dimensions [B, T, ...].
-
-    Args:
-        seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
-            of leading dimensions [B, T, ...]
-        indices (torch.Tensor): tensor indices of shape [B]
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple with tensors of shape [B, ...]
-    """
-    return gather_along_dim_with_dim(seq, target_dim=1, source_dim=0, indices=indices)
-
-
-def pad_sequence_single(seq, padding, batched=False, pad_same=True, pad_values=None):
-    """
-    Pad input tensor or array @seq in the time dimension (dimension 1).
-
-    Args:
-        seq (np.ndarray or torch.Tensor): sequence to be padded
-        padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
-        batched (bool): if sequence has the batch dimension
-        pad_same (bool): if pad by duplicating
-        pad_values (scalar or (ndarray, Tensor)): values to be padded if not pad_same
-
-    Returns:
-        padded sequence (np.ndarray or torch.Tensor)
-    """
-    assert isinstance(seq, (np.ndarray, torch.Tensor))
-    assert pad_same or pad_values is not None
-    if pad_values is not None:
-        assert isinstance(pad_values, float)
-    repeat_func = np.repeat if isinstance(seq, np.ndarray) else torch.repeat_interleave
-    concat_func = np.concatenate if isinstance(seq, np.ndarray) else torch.cat
-    ones_like_func = np.ones_like if isinstance(seq, np.ndarray) else torch.ones_like
-    seq_dim = 1 if batched else 0
-
-    begin_pad = []
-    end_pad = []
-
-    if padding[0] > 0:
-        pad = seq[[0]] if pad_same else ones_like_func(seq[[0]]) * pad_values
-        begin_pad.append(repeat_func(pad, padding[0], seq_dim))
-    if padding[1] > 0:
-        pad = seq[[-1]] if pad_same else ones_like_func(seq[[-1]]) * pad_values
-        end_pad.append(repeat_func(pad, padding[1], seq_dim))
-
-    return concat_func(begin_pad + [seq] + end_pad, seq_dim)
-
-
-def pad_sequence(seq, padding, batched=False, pad_same=True, pad_values=None):
-    """
-    Pad a nested dictionary or list or tuple of sequence tensors in the time dimension (dimension 1).
-
-    Args:
-        seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
-            of leading dimensions [B, T, ...]
-        padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
-        batched (bool): if sequence has the batch dimension
-        pad_same (bool): if pad by duplicating
-        pad_values (scalar or (ndarray, Tensor)): values to be padded if not pad_same
-
-    Returns:
-        padded sequence (dict or list or tuple)
-    """
-    return recursive_dict_list_tuple_apply(
-        seq,
-        {
-            torch.Tensor: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(
-                x, p, b, ps, pv
-            ),
-            np.ndarray: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(
-                x, p, b, ps, pv
-            ),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def assert_size_at_dim_single(x, size, dim, msg):
-    """
-    Ensure that array or tensor @x has size @size in dim @dim.
-
-    Args:
-        x (np.ndarray or torch.Tensor): input array or tensor
-        size (int): size that tensors should have at @dim
-        dim (int): dimension to check
-        msg (str): text to display if assertion fails
-    """
-    assert x.shape[dim] == size, msg
-
-
-def assert_size_at_dim(x, size, dim, msg):
-    """
-    Ensure that arrays and tensors in nested dictionary or list or tuple have
-    size @size in dim @dim.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        size (int): size that tensors should have at @dim
-        dim (int): dimension to check
-    """
-    map_tensor(x, lambda t, s=size, d=dim, m=msg: assert_size_at_dim_single(t, s, d, m))
-
-
-def get_shape(x):
-    """
-    Get all shapes of arrays and tensors in nested dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple that contains each array or
-            tensor's shape
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.shape,
-            np.ndarray: lambda x: x.shape,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def list_of_flat_dict_to_dict_of_list(list_of_dict):
-    """
-    Helper function to go from a list of flat dictionaries to a dictionary of lists.
-    By "flat" we mean that none of the values are dictionaries, but are numpy arrays,
-    floats, etc.
-
-    Args:
-        list_of_dict (list): list of flat dictionaries
-
-    Returns:
-        dict_of_list (dict): dictionary of lists
-    """
-    assert isinstance(list_of_dict, list)
-    dic = collections.OrderedDict()
-    for i in range(len(list_of_dict)):
-        for k in list_of_dict[i]:
-            if k not in dic:
-                dic[k] = []
-            dic[k].append(list_of_dict[i][k])
-    return dic
-
-
-def flatten_nested_dict_list(d, parent_key="", sep="_", item_key=""):
-    """
-    Flatten a nested dict or list to a list.
-
-    For example, given a dict
-    {
-        a: 1
-        b: {
-            c: 2
-        }
-        c: 3
-    }
-
-    the function would return [(a, 1), (b_c, 2), (c, 3)]
-
-    Args:
-        d (dict, list): a nested dict or list to be flattened
-        parent_key (str): recursion helper
-        sep (str): separator for nesting keys
-        item_key (str): recursion helper
-    Returns:
-        list: a list of (key, value) tuples
-    """
-    items = []
-    if isinstance(d, (tuple, list)):
-        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
-        for i, v in enumerate(d):
-            items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=str(i)))
-        return items
-    elif isinstance(d, dict):
-        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
-        for k, v in d.items():
-            assert isinstance(k, str)
-            items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=k))
-        return items
-    else:
-        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
-        return [(new_key, d)]
-
-
-def time_distributed(inputs, op, activation=None, inputs_as_kwargs=False, inputs_as_args=False, **kwargs):
-    """
-    Apply function @op to all tensors in nested dictionary or list or tuple @inputs in both the
-    batch (B) and time (T) dimension, where the tensors are expected to have shape [B, T, ...].
-    Will do this by reshaping tensors to [B * T, ...], passing through the op, and then reshaping
-    outputs to [B, T, ...].
-
-    Args:
-        inputs (list or tuple or dict): a possibly nested dictionary or list or tuple with tensors
-            of leading dimensions [B, T, ...]
-        op: a layer op that accepts inputs
-        activation: activation to apply at the output
-        inputs_as_kwargs (bool): whether to feed input as a kwargs dict to the op
-        inputs_as_args (bool) whether to feed input as a args list to the op
-        kwargs (dict): other kwargs to supply to the op
-
-    Returns:
-        outputs (dict or list or tuple): new nested dict-list-tuple with tensors of leading dimension [B, T].
-    """
-    batch_size, seq_len = flatten_nested_dict_list(inputs)[0][1].shape[:2]
-    inputs = join_dimensions(inputs, 0, 1)
-    if inputs_as_kwargs:
-        outputs = op(**inputs, **kwargs)
-    elif inputs_as_args:
-        outputs = op(*inputs, **kwargs)
-    else:
-        outputs = op(inputs, **kwargs)
-
-    if activation is not None:
-        outputs = map_tensor(outputs, activation)
-    outputs = reshape_dimensions(outputs, begin_axis=0, end_axis=0, target_dims=(batch_size, seq_len))
-    return outputs
--- a/lerobot/common/policies/diffusion/model/multi_image_obs_encoder.py
+++ b/lerobot/common/policies/diffusion/model/multi_image_obs_encoder.py
@@ -1,38 +1,13 @@
 import copy
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Tuple, Union

 import torch
 import torch.nn as nn
 import torchvision
-from robomimic.models.base_nets import ResNet18Conv, SpatialSoftmax

-from lerobot.common.policies.diffusion.model.crop_randomizer import CropRandomizer
-from lerobot.common.policies.diffusion.model.module_attr_mixin import ModuleAttrMixin
-from lerobot.common.policies.diffusion.pytorch_utils import replace_submodules
-
-
-class RgbEncoder(nn.Module):
-    """Following `VisualCore` from Robomimic 0.2.0."""
-
-    def __init__(self, input_shape, relu=True, pretrained=False, num_keypoints=32):
-        """
-        input_shape: channel-first input shape (C, H, W)
-        resnet_name: a timm model name.
-        pretrained: whether to use timm pretrained weights.
-        relu: whether to use relu as a final step.
-        num_keypoints: Number of keypoints for SpatialSoftmax (default value of 32 matches PushT Image).
-        """
-        super().__init__()
-        self.backbone = ResNet18Conv(input_channel=input_shape[0], pretrained=pretrained)
-        # Figure out the feature map shape.
-        with torch.inference_mode():
-            feat_map_shape = tuple(self.backbone(torch.zeros(size=(1, *input_shape))).shape[1:])
-        self.pool = SpatialSoftmax(feat_map_shape, num_kp=num_keypoints)
-        self.out = nn.Linear(num_keypoints * 2, num_keypoints * 2)
-        self.relu = nn.ReLU() if relu else nn.Identity()
-
-    def forward(self, x):
-        return self.relu(self.out(torch.flatten(self.pool(self.backbone(x)), start_dim=1)))
+from diffusion_policy.common.pytorch_util import replace_submodules
+from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
+from diffusion_policy.model.vision.crop_randomizer import CropRandomizer


 class MultiImageObsEncoder(ModuleAttrMixin):
@@ -49,7 +24,7 @@ class MultiImageObsEncoder(ModuleAttrMixin):
        share_rgb_model: bool = False,
        # renormalize rgb input with imagenet normalization
        # assuming input in [0,1]
-        norm_mean_std: Optional[tuple[float, float]] = None,
+        imagenet_norm: bool = False,
    ):
        """
        Assumes rgb input: B,C,H,W
@@ -123,9 +98,10 @@ class MultiImageObsEncoder(ModuleAttrMixin):
                        this_normalizer = torchvision.transforms.CenterCrop(size=(h, w))
                # configure normalizer
                this_normalizer = nn.Identity()
-                if norm_mean_std is not None:
+                if imagenet_norm:
+                    # TODO(rcadene): move normalizer to dataset and env
                    this_normalizer = torchvision.transforms.Normalize(
-                        mean=norm_mean_std[0], std=norm_mean_std[1]
+                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    )

                this_transform = nn.Sequential(this_resizer, this_randomizer, this_normalizer)
@@ -148,17 +124,6 @@ class MultiImageObsEncoder(ModuleAttrMixin):
    def forward(self, obs_dict):
        batch_size = None
        features = []
-
-        # process lowdim input
-        for key in self.low_dim_keys:
-            data = obs_dict[key]
-            if batch_size is None:
-                batch_size = data.shape[0]
-            else:
-                assert batch_size == data.shape[0]
-            assert data.shape[1:] == self.key_shape_map[key]
-            features.append(data)
-
        # process rgb input
        if self.share_rgb_model:
            # pass all rgb obs to rgb model
@@ -196,6 +161,16 @@ class MultiImageObsEncoder(ModuleAttrMixin):
                feature = self.key_model_map[key](img)
                features.append(feature)

+        # process lowdim input
+        for key in self.low_dim_keys:
+            data = obs_dict[key]
+            if batch_size is None:
+                batch_size = data.shape[0]
+            else:
+                assert batch_size == data.shape[0]
+            assert data.shape[1:] == self.key_shape_map[key]
+            features.append(data)
+
        # concatenate all features
        result = torch.cat(features, dim=-1)
        return result
--- a/lerobot/common/policies/diffusion/policy.py
+++ b/lerobot/common/policies/diffusion/policy.py
@@ -1,20 +1,16 @@
 import copy
-import logging
 import time

 import hydra
 import torch
+import torch.nn as nn
+from diffusion_policy.model.common.lr_scheduler import get_scheduler

-from lerobot.common.policies.abstract import AbstractPolicy
-from lerobot.common.policies.diffusion.diffusion_unet_image_policy import DiffusionUnetImagePolicy
-from lerobot.common.policies.diffusion.model.lr_scheduler import get_scheduler
-from lerobot.common.policies.diffusion.model.multi_image_obs_encoder import MultiImageObsEncoder, RgbEncoder
-from lerobot.common.utils import get_safe_torch_device
+from .diffusion_unet_image_policy import DiffusionUnetImagePolicy
+from .multi_image_obs_encoder import MultiImageObsEncoder


-class DiffusionPolicy(AbstractPolicy):
-    name = "diffusion"
-
+class DiffusionPolicy(nn.Module):
    def __init__(
        self,
        cfg,
@@ -38,14 +34,11 @@ class DiffusionPolicy(AbstractPolicy):
        # parameters passed to step
        **kwargs,
    ):
-        super().__init__(n_action_steps)
+        super().__init__()
        self.cfg = cfg

        noise_scheduler = hydra.utils.instantiate(cfg_noise_scheduler)
-        rgb_model_input_shape = copy.deepcopy(shape_meta.obs.image.shape)
-        if cfg_obs_encoder.crop_shape is not None:
-            rgb_model_input_shape[1:] = cfg_obs_encoder.crop_shape
-        rgb_model = RgbEncoder(input_shape=rgb_model_input_shape, **cfg_rgb_model)
+        rgb_model = hydra.utils.instantiate(cfg_rgb_model)
        obs_encoder = MultiImageObsEncoder(
            rgb_model=rgb_model,
            **cfg_obs_encoder,
@@ -69,16 +62,15 @@ class DiffusionPolicy(AbstractPolicy):
            **kwargs,
        )

-        self.device = get_safe_torch_device(cfg_device)
-        self.diffusion.to(self.device)
+        self.device = torch.device(cfg_device)
+        if torch.cuda.is_available() and cfg_device == "cuda":
+            self.diffusion.cuda()

-        self.ema_diffusion = None
        self.ema = None
        if self.cfg.use_ema:
-            self.ema_diffusion = copy.deepcopy(self.diffusion)
            self.ema = hydra.utils.instantiate(
                cfg_ema,
-                model=self.ema_diffusion,
+                model=copy.deepcopy(self.diffusion),
            )

        self.optimizer = hydra.utils.instantiate(
@@ -101,22 +93,21 @@ class DiffusionPolicy(AbstractPolicy):
        )

    @torch.no_grad()
-    def select_actions(self, observation, step_count):
-        """
-        Note: this uses the ema model weights if self.training == False, otherwise the non-ema model weights.
-        """
+    def forward(self, observation, step_count):
        # TODO(rcadene): remove unused step_count
        del step_count

+        # TODO(rcadene): remove unsqueeze hack to add bsize=1
+        observation["image"] = observation["image"].unsqueeze(0)
+        observation["state"] = observation["state"].unsqueeze(0)
+
        obs_dict = {
            "image": observation["image"],
            "agent_pos": observation["state"],
        }
-        if self.training:
-            out = self.diffusion.predict_action(obs_dict)
-        else:
-            out = self.ema_diffusion.predict_action(obs_dict)
-        action = out["action"]
+        out = self.diffusion.predict_action(obs_dict)
+
+        action = out["action"].squeeze(0)
        return action

    def update(self, replay_buffer, step):
@@ -203,12 +194,6 @@ class DiffusionPolicy(AbstractPolicy):
    def save(self, fp):
        torch.save(self.state_dict(), fp)

-    def load(self, fp, device=None):
-        d = torch.load(fp, map_location=device)
-        missing_keys, unexpected_keys = self.load_state_dict(d, strict=False)
-        if len(missing_keys) > 0:
-            assert all(k.startswith("ema_diffusion.") for k in missing_keys)
-            logging.warning(
-                "DiffusionPolicy.load expected ema parameters in loaded state dict but none were found."
-            )
-        assert len(unexpected_keys) == 0
+    def load(self, fp):
+        d = torch.load(fp)
+        self.load_state_dict(d)
--- a/lerobot/common/policies/diffusion/pytorch_utils.py
+++ b/lerobot/common/policies/diffusion/pytorch_utils.py
@@ -1,76 +0,0 @@
-from typing import Callable, Dict
-
-import torch
-import torch.nn as nn
-import torchvision
-
-
-def get_resnet(name, weights=None, **kwargs):
-    """
-    name: resnet18, resnet34, resnet50
-    weights: "IMAGENET1K_V1", "r3m"
-    """
-    # load r3m weights
-    if (weights == "r3m") or (weights == "R3M"):
-        return get_r3m(name=name, **kwargs)
-
-    func = getattr(torchvision.models, name)
-    resnet = func(weights=weights, **kwargs)
-    resnet.fc = torch.nn.Identity()
-    return resnet
-
-
-def get_r3m(name, **kwargs):
-    """
-    name: resnet18, resnet34, resnet50
-    """
-    import r3m
-
-    r3m.device = "cpu"
-    model = r3m.load_r3m(name)
-    r3m_model = model.module
-    resnet_model = r3m_model.convnet
-    resnet_model = resnet_model.to("cpu")
-    return resnet_model
-
-
-def dict_apply(
-    x: Dict[str, torch.Tensor], func: Callable[[torch.Tensor], torch.Tensor]
-) -> Dict[str, torch.Tensor]:
-    result = {}
-    for key, value in x.items():
-        if isinstance(value, dict):
-            result[key] = dict_apply(value, func)
-        else:
-            result[key] = func(value)
-    return result
-
-
-def replace_submodules(
-    root_module: nn.Module, predicate: Callable[[nn.Module], bool], func: Callable[[nn.Module], nn.Module]
-) -> nn.Module:
-    """
-    predicate: Return true if the module is to be replaced.
-    func: Return new module to use.
-    """
-    if predicate(root_module):
-        return func(root_module)
-
-    bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
-    for *parent, k in bn_list:
-        parent_module = root_module
-        if len(parent) > 0:
-            parent_module = root_module.get_submodule(".".join(parent))
-        if isinstance(parent_module, nn.Sequential):
-            src_module = parent_module[int(k)]
-        else:
-            src_module = getattr(parent_module, k)
-        tgt_module = func(src_module)
-        if isinstance(parent_module, nn.Sequential):
-            parent_module[int(k)] = tgt_module
-        else:
-            setattr(parent_module, k, tgt_module)
-    # verify that all BN are replaced
-    bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
-    assert len(bn_list) == 0
-    return root_module
--- a/lerobot/common/policies/diffusion/replay_buffer.py
+++ b/lerobot/common/policies/diffusion/replay_buffer.py
@@ -1,614 +0,0 @@
-from __future__ import annotations
-
-import math
-import numbers
-import os
-from functools import cached_property
-
-import numcodecs
-import numpy as np
-import zarr
-
-
-def check_chunks_compatible(chunks: tuple, shape: tuple):
-    assert len(shape) == len(chunks)
-    for c in chunks:
-        assert isinstance(c, numbers.Integral)
-        assert c > 0
-
-
-def rechunk_recompress_array(group, name, chunks=None, chunk_length=None, compressor=None, tmp_key="_temp"):
-    old_arr = group[name]
-    if chunks is None:
-        chunks = (chunk_length,) + old_arr.chunks[1:] if chunk_length is not None else old_arr.chunks
-    check_chunks_compatible(chunks, old_arr.shape)
-
-    if compressor is None:
-        compressor = old_arr.compressor
-
-    if (chunks == old_arr.chunks) and (compressor == old_arr.compressor):
-        # no change
-        return old_arr
-
-    # rechunk recompress
-    group.move(name, tmp_key)
-    old_arr = group[tmp_key]
-    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-        source=old_arr,
-        dest=group,
-        name=name,
-        chunks=chunks,
-        compressor=compressor,
-    )
-    del group[tmp_key]
-    arr = group[name]
-    return arr
-
-
-def get_optimal_chunks(shape, dtype, target_chunk_bytes=2e6, max_chunk_length=None):
-    """
-    Common shapes
-    T,D
-    T,N,D
-    T,H,W,C
-    T,N,H,W,C
-    """
-    itemsize = np.dtype(dtype).itemsize
-    # reversed
-    rshape = list(shape[::-1])
-    if max_chunk_length is not None:
-        rshape[-1] = int(max_chunk_length)
-    split_idx = len(shape) - 1
-    for i in range(len(shape) - 1):
-        this_chunk_bytes = itemsize * np.prod(rshape[:i])
-        next_chunk_bytes = itemsize * np.prod(rshape[: i + 1])
-        if this_chunk_bytes <= target_chunk_bytes and next_chunk_bytes > target_chunk_bytes:
-            split_idx = i
-
-    rchunks = rshape[:split_idx]
-    item_chunk_bytes = itemsize * np.prod(rshape[:split_idx])
-    this_max_chunk_length = rshape[split_idx]
-    next_chunk_length = min(this_max_chunk_length, math.ceil(target_chunk_bytes / item_chunk_bytes))
-    rchunks.append(next_chunk_length)
-    len_diff = len(shape) - len(rchunks)
-    rchunks.extend([1] * len_diff)
-    chunks = tuple(rchunks[::-1])
-    # print(np.prod(chunks) * itemsize / target_chunk_bytes)
-    return chunks
-
-
-class ReplayBuffer:
-    """
-    Zarr-based temporal datastructure.
-    Assumes first dimension to be time. Only chunk in time dimension.
-    """
-
-    def __init__(self, root: zarr.Group | dict[str, dict]):
-        """
-        Dummy constructor. Use copy_from* and create_from* class methods instead.
-        """
-        assert "data" in root
-        assert "meta" in root
-        assert "episode_ends" in root["meta"]
-        for value in root["data"].values():
-            assert value.shape[0] == root["meta"]["episode_ends"][-1]
-        self.root = root
-
-    # ============= create constructors ===============
-    @classmethod
-    def create_empty_zarr(cls, storage=None, root=None):
-        if root is None:
-            if storage is None:
-                storage = zarr.MemoryStore()
-            root = zarr.group(store=storage)
-        root.require_group("data", overwrite=False)
-        meta = root.require_group("meta", overwrite=False)
-        if "episode_ends" not in meta:
-            meta.zeros("episode_ends", shape=(0,), dtype=np.int64, compressor=None, overwrite=False)
-        return cls(root=root)
-
-    @classmethod
-    def create_empty_numpy(cls):
-        root = {"data": {}, "meta": {"episode_ends": np.zeros((0,), dtype=np.int64)}}
-        return cls(root=root)
-
-    @classmethod
-    def create_from_group(cls, group, **kwargs):
-        if "data" not in group:
-            # create from stratch
-            buffer = cls.create_empty_zarr(root=group, **kwargs)
-        else:
-            # already exist
-            buffer = cls(root=group, **kwargs)
-        return buffer
-
-    @classmethod
-    def create_from_path(cls, zarr_path, mode="r", **kwargs):
-        """
-        Open a on-disk zarr directly (for dataset larger than memory).
-        Slower.
-        """
-        group = zarr.open(os.path.expanduser(zarr_path), mode)
-        return cls.create_from_group(group, **kwargs)
-
-    # ============= copy constructors ===============
-    @classmethod
-    def copy_from_store(
-        cls,
-        src_store,
-        store=None,
-        keys=None,
-        chunks: dict[str, tuple] | None = None,
-        compressors: dict | str | numcodecs.abc.Codec | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        """
-        Load to memory.
-        """
-        src_root = zarr.group(src_store)
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        root = None
-        if store is None:
-            # numpy backend
-            meta = {}
-            for key, value in src_root["meta"].items():
-                if len(value.shape) == 0:
-                    meta[key] = np.array(value)
-                else:
-                    meta[key] = value[:]
-
-            if keys is None:
-                keys = src_root["data"].keys()
-            data = {}
-            for key in keys:
-                arr = src_root["data"][key]
-                data[key] = arr[:]
-
-            root = {"meta": meta, "data": data}
-        else:
-            root = zarr.group(store=store)
-            # copy without recompression
-            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                source=src_store, dest=store, source_path="/meta", dest_path="/meta", if_exists=if_exists
-            )
-            data_group = root.create_group("data", overwrite=True)
-            if keys is None:
-                keys = src_root["data"].keys()
-            for key in keys:
-                value = src_root["data"][key]
-                cks = cls._resolve_array_chunks(chunks=chunks, key=key, array=value)
-                cpr = cls._resolve_array_compressor(compressors=compressors, key=key, array=value)
-                if cks == value.chunks and cpr == value.compressor:
-                    # copy without recompression
-                    this_path = "/data/" + key
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                        source=src_store,
-                        dest=store,
-                        source_path=this_path,
-                        dest_path=this_path,
-                        if_exists=if_exists,
-                    )
-                else:
-                    # copy with recompression
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-                        source=value,
-                        dest=data_group,
-                        name=key,
-                        chunks=cks,
-                        compressor=cpr,
-                        if_exists=if_exists,
-                    )
-        buffer = cls(root=root)
-        return buffer
-
-    @classmethod
-    def copy_from_path(
-        cls,
-        zarr_path,
-        backend=None,
-        store=None,
-        keys=None,
-        chunks: dict[str, tuple] | None = None,
-        compressors: dict | str | numcodecs.abc.Codec | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        """
-        Copy a on-disk zarr to in-memory compressed.
-        Recommended
-        """
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        if backend == "numpy":
-            print("backend argument is deprecated!")
-            store = None
-        group = zarr.open(os.path.expanduser(zarr_path), "r")
-        return cls.copy_from_store(
-            src_store=group.store,
-            store=store,
-            keys=keys,
-            chunks=chunks,
-            compressors=compressors,
-            if_exists=if_exists,
-            **kwargs,
-        )
-
-    # ============= save methods ===============
-    def save_to_store(
-        self,
-        store,
-        chunks: dict[str, tuple] | None = None,
-        compressors: str | numcodecs.abc.Codec | dict | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        root = zarr.group(store)
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        if self.backend == "zarr":
-            # recompression free copy
-            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                source=self.root.store,
-                dest=store,
-                source_path="/meta",
-                dest_path="/meta",
-                if_exists=if_exists,
-            )
-        else:
-            meta_group = root.create_group("meta", overwrite=True)
-            # save meta, no chunking
-            for key, value in self.root["meta"].items():
-                _ = meta_group.array(name=key, data=value, shape=value.shape, chunks=value.shape)
-
-        # save data, chunk
-        data_group = root.create_group("data", overwrite=True)
-        for key, value in self.root["data"].items():
-            cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
-            cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
-            if isinstance(value, zarr.Array):
-                if cks == value.chunks and cpr == value.compressor:
-                    # copy without recompression
-                    this_path = "/data/" + key
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                        source=self.root.store,
-                        dest=store,
-                        source_path=this_path,
-                        dest_path=this_path,
-                        if_exists=if_exists,
-                    )
-                else:
-                    # copy with recompression
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-                        source=value,
-                        dest=data_group,
-                        name=key,
-                        chunks=cks,
-                        compressor=cpr,
-                        if_exists=if_exists,
-                    )
-            else:
-                # numpy
-                _ = data_group.array(name=key, data=value, chunks=cks, compressor=cpr)
-        return store
-
-    def save_to_path(
-        self,
-        zarr_path,
-        chunks: dict[str, tuple] | None = None,
-        compressors: str | numcodecs.abc.Codec | dict | None = None,
-        if_exists="replace",
-        **kwargs,
-    ):
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        store = zarr.DirectoryStore(os.path.expanduser(zarr_path))
-        return self.save_to_store(
-            store, chunks=chunks, compressors=compressors, if_exists=if_exists, **kwargs
-        )
-
-    @staticmethod
-    def resolve_compressor(compressor="default"):
-        if compressor == "default":
-            compressor = numcodecs.Blosc(cname="lz4", clevel=5, shuffle=numcodecs.Blosc.NOSHUFFLE)
-        elif compressor == "disk":
-            compressor = numcodecs.Blosc("zstd", clevel=5, shuffle=numcodecs.Blosc.BITSHUFFLE)
-        return compressor
-
-    @classmethod
-    def _resolve_array_compressor(cls, compressors: dict | str | numcodecs.abc.Codec, key, array):
-        # allows compressor to be explicitly set to None
-        cpr = "nil"
-        if isinstance(compressors, dict):
-            if key in compressors:
-                cpr = cls.resolve_compressor(compressors[key])
-            elif isinstance(array, zarr.Array):
-                cpr = array.compressor
-        else:
-            cpr = cls.resolve_compressor(compressors)
-        # backup default
-        if cpr == "nil":
-            cpr = cls.resolve_compressor("default")
-        return cpr
-
-    @classmethod
-    def _resolve_array_chunks(cls, chunks: dict | tuple, key, array):
-        cks = None
-        if isinstance(chunks, dict):
-            if key in chunks:
-                cks = chunks[key]
-            elif isinstance(array, zarr.Array):
-                cks = array.chunks
-        elif isinstance(chunks, tuple):
-            cks = chunks
-        else:
-            raise TypeError(f"Unsupported chunks type {type(chunks)}")
-        # backup default
-        if cks is None:
-            cks = get_optimal_chunks(shape=array.shape, dtype=array.dtype)
-        # check
-        check_chunks_compatible(chunks=cks, shape=array.shape)
-        return cks
-
-    # ============= properties =================
-    @cached_property
-    def data(self):
-        return self.root["data"]
-
-    @cached_property
-    def meta(self):
-        return self.root["meta"]
-
-    def update_meta(self, data):
-        # sanitize data
-        np_data = {}
-        for key, value in data.items():
-            if isinstance(value, np.ndarray):
-                np_data[key] = value
-            else:
-                arr = np.array(value)
-                if arr.dtype == object:
-                    raise TypeError(f"Invalid value type {type(value)}")
-                np_data[key] = arr
-
-        meta_group = self.meta
-        if self.backend == "zarr":
-            for key, value in np_data.items():
-                _ = meta_group.array(
-                    name=key, data=value, shape=value.shape, chunks=value.shape, overwrite=True
-                )
-        else:
-            meta_group.update(np_data)
-
-        return meta_group
-
-    @property
-    def episode_ends(self):
-        return self.meta["episode_ends"]
-
-    def get_episode_idxs(self):
-        import numba
-
-        numba.jit(nopython=True)
-
-        def _get_episode_idxs(episode_ends):
-            result = np.zeros((episode_ends[-1],), dtype=np.int64)
-            for i in range(len(episode_ends)):
-                start = 0
-                if i > 0:
-                    start = episode_ends[i - 1]
-                end = episode_ends[i]
-                for idx in range(start, end):
-                    result[idx] = i
-            return result
-
-        return _get_episode_idxs(self.episode_ends)
-
-    @property
-    def backend(self):
-        backend = "numpy"
-        if isinstance(self.root, zarr.Group):
-            backend = "zarr"
-        return backend
-
-    # =========== dict-like API ==============
-    def __repr__(self) -> str:
-        if self.backend == "zarr":
-            return str(self.root.tree())
-        else:
-            return super().__repr__()
-
-    def keys(self):
-        return self.data.keys()
-
-    def values(self):
-        return self.data.values()
-
-    def items(self):
-        return self.data.items()
-
-    def __getitem__(self, key):
-        return self.data[key]
-
-    def __contains__(self, key):
-        return key in self.data
-
-    # =========== our API ==============
-    @property
-    def n_steps(self):
-        if len(self.episode_ends) == 0:
-            return 0
-        return self.episode_ends[-1]
-
-    @property
-    def n_episodes(self):
-        return len(self.episode_ends)
-
-    @property
-    def chunk_size(self):
-        if self.backend == "zarr":
-            return next(iter(self.data.arrays()))[-1].chunks[0]
-        return None
-
-    @property
-    def episode_lengths(self):
-        ends = self.episode_ends[:]
-        ends = np.insert(ends, 0, 0)
-        lengths = np.diff(ends)
-        return lengths
-
-    def add_episode(
-        self,
-        data: dict[str, np.ndarray],
-        chunks: dict[str, tuple] | None = None,
-        compressors: str | numcodecs.abc.Codec | dict | None = None,
-    ):
-        if chunks is None:
-            chunks = {}
-        if compressors is None:
-            compressors = {}
-        assert len(data) > 0
-        is_zarr = self.backend == "zarr"
-
-        curr_len = self.n_steps
-        episode_length = None
-        for value in data.values():
-            assert len(value.shape) >= 1
-            if episode_length is None:
-                episode_length = len(value)
-            else:
-                assert episode_length == len(value)
-        new_len = curr_len + episode_length
-
-        for key, value in data.items():
-            new_shape = (new_len,) + value.shape[1:]
-            # create array
-            if key not in self.data:
-                if is_zarr:
-                    cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
-                    cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
-                    arr = self.data.zeros(
-                        name=key, shape=new_shape, chunks=cks, dtype=value.dtype, compressor=cpr
-                    )
-                else:
-                    # copy data to prevent modify
-                    arr = np.zeros(shape=new_shape, dtype=value.dtype)
-                    self.data[key] = arr
-            else:
-                arr = self.data[key]
-                assert value.shape[1:] == arr.shape[1:]
-                # same method for both zarr and numpy
-                if is_zarr:
-                    arr.resize(new_shape)
-                else:
-                    arr.resize(new_shape, refcheck=False)
-            # copy data
-            arr[-value.shape[0] :] = value
-
-        # append to episode ends
-        episode_ends = self.episode_ends
-        if is_zarr:
-            episode_ends.resize(episode_ends.shape[0] + 1)
-        else:
-            episode_ends.resize(episode_ends.shape[0] + 1, refcheck=False)
-        episode_ends[-1] = new_len
-
-        # rechunk
-        if is_zarr and episode_ends.chunks[0] < episode_ends.shape[0]:
-            rechunk_recompress_array(self.meta, "episode_ends", chunk_length=int(episode_ends.shape[0] * 1.5))
-
-    def drop_episode(self):
-        is_zarr = self.backend == "zarr"
-        episode_ends = self.episode_ends[:].copy()
-        assert len(episode_ends) > 0
-        start_idx = 0
-        if len(episode_ends) > 1:
-            start_idx = episode_ends[-2]
-        for value in self.data.values():
-            new_shape = (start_idx,) + value.shape[1:]
-            if is_zarr:
-                value.resize(new_shape)
-            else:
-                value.resize(new_shape, refcheck=False)
-        if is_zarr:
-            self.episode_ends.resize(len(episode_ends) - 1)
-        else:
-            self.episode_ends.resize(len(episode_ends) - 1, refcheck=False)
-
-    def pop_episode(self):
-        assert self.n_episodes > 0
-        episode = self.get_episode(self.n_episodes - 1, copy=True)
-        self.drop_episode()
-        return episode
-
-    def extend(self, data):
-        self.add_episode(data)
-
-    def get_episode(self, idx, copy=False):
-        idx = list(range(len(self.episode_ends)))[idx]
-        start_idx = 0
-        if idx > 0:
-            start_idx = self.episode_ends[idx - 1]
-        end_idx = self.episode_ends[idx]
-        result = self.get_steps_slice(start_idx, end_idx, copy=copy)
-        return result
-
-    def get_episode_slice(self, idx):
-        start_idx = 0
-        if idx > 0:
-            start_idx = self.episode_ends[idx - 1]
-        end_idx = self.episode_ends[idx]
-        return slice(start_idx, end_idx)
-
-    def get_steps_slice(self, start, stop, step=None, copy=False):
-        _slice = slice(start, stop, step)
-
-        result = {}
-        for key, value in self.data.items():
-            x = value[_slice]
-            if copy and isinstance(value, np.ndarray):
-                x = x.copy()
-            result[key] = x
-        return result
-
-    # =========== chunking =============
-    def get_chunks(self) -> dict:
-        assert self.backend == "zarr"
-        chunks = {}
-        for key, value in self.data.items():
-            chunks[key] = value.chunks
-        return chunks
-
-    def set_chunks(self, chunks: dict):
-        assert self.backend == "zarr"
-        for key, value in chunks.items():
-            if key in self.data:
-                arr = self.data[key]
-                if value != arr.chunks:
-                    check_chunks_compatible(chunks=value, shape=arr.shape)
-                    rechunk_recompress_array(self.data, key, chunks=value)
-
-    def get_compressors(self) -> dict:
-        assert self.backend == "zarr"
-        compressors = {}
-        for key, value in self.data.items():
-            compressors[key] = value.compressor
-        return compressors
-
-    def set_compressors(self, compressors: dict):
-        assert self.backend == "zarr"
-        for key, value in compressors.items():
-            if key in self.data:
-                arr = self.data[key]
-                compressor = self.resolve_compressor(value)
-                if compressor != arr.compressor:
-                    rechunk_recompress_array(self.data, key, compressor=compressor)
--- a/lerobot/common/policies/factory.py
+++ b/lerobot/common/policies/factory.py
@@ -1,53 +1,32 @@
-""" Factory for policies
-"""
-
-from lerobot.common.policies.abstract import AbstractPolicy
-
-
-def make_policy(cfg: dict) -> AbstractPolicy:
-    """ Instantiate a policy from the configuration.
-        Currently supports TD-MPC, Diffusion, and ACT: select the policy with cfg.policy.name: tdmpc, diffusion, act.
-    
-    Args:
-        cfg: The configuration (DictConfig)
-
-    """
-    policy_kwargs = {}
-    if cfg.policy.name != "diffusion" and cfg.rollout_batch_size > 1:
-        raise NotImplementedError("Only diffusion policy supports rollout_batch_size > 1 for the time being.")
-
+def make_policy(cfg):
    if cfg.policy.name == "tdmpc":
-        from lerobot.common.policies.tdmpc.policy import TDMPCPolicy
+        from lerobot.common.policies.tdmpc import TDMPC

-        policy_cls = TDMPCPolicy
-        policy_kwargs = {"cfg": cfg.policy, "device": cfg.device}
+        policy = TDMPC(cfg.policy, cfg.device)
    elif cfg.policy.name == "diffusion":
        from lerobot.common.policies.diffusion.policy import DiffusionPolicy

-        policy_cls = DiffusionPolicy
-        policy_kwargs = {
-            "cfg": cfg.policy,
-            "cfg_device": cfg.device,
-            "cfg_noise_scheduler": cfg.noise_scheduler,
-            "cfg_rgb_model": cfg.rgb_model,
-            "cfg_obs_encoder": cfg.obs_encoder,
-            "cfg_optimizer": cfg.optimizer,
-            "cfg_ema": cfg.ema,
-            "n_action_steps": cfg.n_action_steps + cfg.n_latency_steps,
+        policy = DiffusionPolicy(
+            cfg=cfg.policy,
+            cfg_device=cfg.device,
+            cfg_noise_scheduler=cfg.noise_scheduler,
+            cfg_rgb_model=cfg.rgb_model,
+            cfg_obs_encoder=cfg.obs_encoder,
+            cfg_optimizer=cfg.optimizer,
+            cfg_ema=cfg.ema,
+            n_action_steps=cfg.n_action_steps + cfg.n_latency_steps,
            **cfg.policy,
-        }
+        )
    elif cfg.policy.name == "act":
        from lerobot.common.policies.act.policy import ActionChunkingTransformerPolicy

-        policy_cls = ActionChunkingTransformerPolicy
-        policy_kwargs = {"cfg": cfg.policy, "device": cfg.device, "n_action_steps": cfg.n_action_steps + cfg.n_latency_steps}
+        policy = ActionChunkingTransformerPolicy(
+            cfg.policy, cfg.device, n_action_steps=cfg.n_action_steps + cfg.n_latency_steps
+        )
    else:
        raise ValueError(cfg.policy.name)

    if cfg.policy.pretrained_model_path:
-        # policy.load(cfg.policy.pretrained_model_path, device=cfg.device)
-        policy = policy_cls.from_pretrained(cfg.policy.pretrained_model_path, map_location=cfg.device, **policy_kwargs)
-    
        # TODO(rcadene): hack for old pretrained models from fowm
        if cfg.policy.name == "tdmpc" and "fowm" in cfg.policy.pretrained_model_path:
            if "offline" in cfg.pretrained_model_path:
@@ -56,5 +35,6 @@ def make_policy(cfg: dict) -> AbstractPolicy:
                policy.step[0] = 100000
            else:
                raise NotImplementedError()
+        policy.load(cfg.policy.pretrained_model_path)

    return policy
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Cadene	7bf36cd413	Add AbstractEnv, Refactor AlohaEnv, Add rendering_hook in env, Minor modifications, (TODO: Refactor Pusht and Simxarm)	2024-03-10 22:00:48 +00:00
Cadene	b49f7b70e2	Add tasks without end_effector that are compatible with dataset, Eval can run (TODO: training and pretrained model)	2024-03-10 10:52:12 +00:00
Cadene	f1230cdac0	Training can runs (TODO: eval)	2024-03-09 16:52:08 +00:00
Cadene	5395829596	Add act yaml (TODO: try train.py)	2024-03-08 18:08:28 +00:00
Cadene	a45802c281	Remove download.py add a WIP for Simxarm	2024-03-08 18:07:49 +00:00
Cadene	167a51cb69	Remove download.py add a WIP for Simxarm	2024-03-08 18:07:33 +00:00
Cadene	fbc66a082b	Copy past from act repo	2024-03-08 16:54:43 +00:00
Cadene	603455e313	Update README	2024-03-08 16:15:56 +00:00
Cadene	6500945be5	Rendering works (fps look fast tho? TODO action bounding is too wide [-1,1])	2024-03-08 15:33:35 +00:00
Cadene	ebbcad8c05	WIP Aloha env tests pass	2024-03-08 14:37:23 +00:00
Remi Cadene	d98b435b4c	WIP	2024-03-08 12:08:16 +00:00