Compare commits

..

7 Commits

Author SHA1 Message Date
Simon Alibert
c6a61e3ba2 WIP 2024-05-21 16:31:48 +02:00
Simon Alibert
62d3546f08 Move dependencies to extra 2024-05-21 16:29:44 +02:00
Simon Alibert
956f035d16 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_05_14_compare_policies 2024-05-21 10:14:10 +02:00
Simon Alibert
eb530fa595 Add '--independent' flag 2024-05-16 19:31:57 +02:00
Simon Alibert
fe31b7f4b7 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_05_14_compare_policies 2024-05-16 17:04:33 +02:00
Simon Alibert
8f5cfcd73d Add argparse, refactor & cleanup 2024-05-16 16:55:40 +02:00
Simon Alibert
10036c1219 WIP add score tests 2024-05-15 17:50:12 +02:00
19 changed files with 1126 additions and 842 deletions

View File

@@ -10,6 +10,7 @@ on:
env:
PYTHON_VERSION: "3.10"
# CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
jobs:
latest-cpu:
@@ -50,6 +51,30 @@ jobs:
tags: huggingface/lerobot-cpu
build-args: PYTHON_VERSION=${{ env.PYTHON_VERSION }}
# - name: Post to a Slack channel
# id: slack
# #uses: slackapi/slack-github-action@v1.25.0
# uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
# with:
# # Slack channel id, channel name, or user id to post message.
# # See also: https://api.slack.com/methods/chat.postMessage#channels
# channel-id: ${{ env.CI_SLACK_CHANNEL }}
# # For posting a rich message using Block Kit
# payload: |
# {
# "text": "lerobot-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
# "blocks": [
# {
# "type": "section",
# "text": {
# "type": "mrkdwn",
# "text": "lerobot-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
# }
# }
# ]
# }
# env:
# SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-cuda:
name: GPU
@@ -88,40 +113,27 @@ jobs:
tags: huggingface/lerobot-gpu
build-args: PYTHON_VERSION=${{ env.PYTHON_VERSION }}
latest-cuda-dev:
name: GPU Dev
runs-on: ubuntu-latest
steps:
- name: Cleanup disk
run: |
sudo df -h
# sudo ls -l /usr/local/lib/
# sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Check out code
uses: actions/checkout@v4
- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push GPU dev
uses: docker/build-push-action@v5
with:
context: .
file: ./docker/lerobot-gpu-dev/Dockerfile
push: true
tags: huggingface/lerobot-gpu:dev
build-args: PYTHON_VERSION=${{ env.PYTHON_VERSION }}
# - name: Post to a Slack channel
# id: slack
# #uses: slackapi/slack-github-action@v1.25.0
# uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
# with:
# # Slack channel id, channel name, or user id to post message.
# # See also: https://api.slack.com/methods/chat.postMessage#channels
# channel-id: ${{ env.CI_SLACK_CHANNEL }}
# # For posting a rich message using Block Kit
# payload: |
# {
# "text": "lerobot-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
# "blocks": [
# {
# "type": "section",
# "text": {
# "type": "mrkdwn",
# "text": "lerobot-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
# }
# }
# ]
# }
# env:
# SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

32
.gitignore vendored
View File

@@ -2,16 +2,11 @@
logs
tmp
wandb
# Data
data
outputs
# Apple
.DS_Store
# VS Code
.vscode
rl
.DS_Store
# HPC
nautilus/*.yaml
@@ -95,7 +90,6 @@ instance/
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
@@ -108,6 +102,13 @@ ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
@@ -118,15 +119,6 @@ celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
@@ -144,9 +136,3 @@ dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/

View File

@@ -27,7 +27,6 @@ test-end-to-end:
${MAKE} test-tdmpc-ete-train
${MAKE} test-tdmpc-ete-eval
${MAKE} test-default-ete-eval
${MAKE} test-act-pusht-tutorial
test-act-ete-train:
python lerobot/scripts/train.py \
@@ -143,21 +142,3 @@ test-default-ete-eval:
eval.batch_size=1 \
env.episode_length=8 \
device=cpu \
test-act-pusht-tutorial:
cp examples/advanced/1_train_act_pusht/act_pusht.yaml lerobot/configs/policy/created_by_Makefile.yaml
python lerobot/scripts/train.py \
policy=created_by_Makefile.yaml \
env=pusht \
wandb.enable=False \
training.offline_steps=2 \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=2 \
device=cpu \
training.save_model=true \
training.save_freq=2 \
training.batch_size=2 \
hydra.run.dir=tests/outputs/act_pusht/
rm lerobot/configs/policy/created_by_Makefile.yaml

View File

@@ -77,10 +77,6 @@ Install 🤗 LeRobot:
pip install .
```
> **NOTE:** Depending on your platform, if you encounter any build errors during this step
you may need to install `cmake` and `build-essential` for building some of our dependencies.
On linux: `sudo apt-get install cmake build-essential`
For simulations, 🤗 LeRobot comes with gymnasium environments that can be installed as extras:
- [aloha](https://github.com/huggingface/gym-aloha)
- [xarm](https://github.com/huggingface/gym-xarm)
@@ -103,7 +99,6 @@ wandb login
```
.
├── examples # contains demonstration examples, start here to learn about LeRobot
| └── advanced # contains even more examples for those who have mastered the basics
├── lerobot
| ├── configs # contains hydra yaml files with all options that you can override in the command line
| | ├── default.yaml # selected by default, it loads pusht environment and diffusion policy
@@ -163,10 +158,9 @@ See `python lerobot/scripts/eval.py --help` for more instructions.
### Train your own policy
Check out [example 3](./examples/3_train_policy.py) that illustrates how to train a model using our core library in python, and [example 4](./examples/4_train_policy_with_script.md) that shows how to use our training script from command line.
Check out [example 3](./examples/3_train_policy.py) that illustrates how to start training a model.
In general, you can use our training script to easily train any policy. Here is an example of training the ACT policy on trajectories collected by humans on the Aloha simulation environment for the insertion task:
```bash
python lerobot/scripts/train.py \
policy=act \
@@ -190,19 +184,7 @@ A link to the wandb logs for the run will also show up in yellow in your termina
![](media/wandb.png)
Note: For efficiency, during training every checkpoint is evaluated on a low number of episodes. You may use `eval.n_episodes=500` to evaluate on more episodes than the default. Or, after training, you may want to re-evaluate your best checkpoints on more episodes or change the evaluation settings. See `python lerobot/scripts/eval.py --help` for more instructions.
#### Reproduce state-of-the-art (SOTA)
We have organized our configuration files (found under [`lerobot/configs`](./lerobot/configs)) such that they reproduce SOTA results from a given model variant in their respective original works. Simply running:
```bash
python lerobot/scripts/train.py policy=diffusion env=pusht
```
reproduces SOTA results for Diffusion Policy on the PushT task.
Pretrained policies, along with reproduction details, can be found under the "Models" section of https://huggingface.co/lerobot.
Note: For efficiency, during training every checkpoint is evaluated on a low number of episodes. After training, you may want to re-evaluate your best checkpoints on more episodes or change the evaluation settings. See `python lerobot/scripts/eval.py --help` for more instructions.
## Contribute

View File

@@ -1,40 +0,0 @@
FROM nvidia/cuda:12.4.1-base-ubuntu22.04
# Configure image
ARG PYTHON_VERSION=3.10
ARG DEBIAN_FRONTEND=noninteractive
# Install apt dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential cmake \
git git-lfs openssh-client \
nano vim less util-linux \
htop atop nvtop \
sed gawk grep curl wget \
tcpdump sysstat screen tmux \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Install gh cli tool
RUN (type -p wget >/dev/null || (apt update && apt-get install wget -y)) \
&& mkdir -p -m 755 /etc/apt/keyrings \
&& wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \
&& chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
&& apt update \
&& apt install gh -y \
&& apt clean && rm -rf /var/lib/apt/lists/*
# Setup `python`
RUN ln -s /usr/bin/python3 /usr/bin/python
# Install poetry
RUN curl -sSL https://install.python-poetry.org | python -
ENV PATH="/root/.local/bin:$PATH"
RUN echo 'if [ "$HOME" != "/root" ]; then ln -sf /root/.local/bin/poetry $HOME/.local/bin/poetry; fi' >> /root/.bashrc
RUN poetry config virtualenvs.create false
RUN poetry config virtualenvs.in-project true
# Set EGL as the rendering backend for MuJoCo
ENV MUJOCO_GL="egl"

View File

@@ -4,15 +4,18 @@ FROM nvidia/cuda:12.4.1-base-ubuntu22.04
ARG PYTHON_VERSION=3.10
ARG DEBIAN_FRONTEND=noninteractive
# Install apt dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential cmake \
git git-lfs openssh-client \
nano vim ffmpeg \
htop atop nvtop \
sed gawk grep curl wget \
tcpdump sysstat screen \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN python -m venv /opt/venv
@@ -20,7 +23,8 @@ ENV PATH="/opt/venv/bin:$PATH"
RUN echo "source /opt/venv/bin/activate" >> /root/.bashrc
# Install LeRobot
COPY . /lerobot
RUN git lfs install
RUN git clone https://github.com/huggingface/lerobot.git
WORKDIR /lerobot
RUN pip install --upgrade --no-cache-dir pip
RUN pip install --no-cache-dir ".[test, aloha, xarm, pusht]"

View File

@@ -1,183 +0,0 @@
This tutorial will explain the training script, how to use it, and particularly the use of Hydra to configure everything needed for the training run.
## The training script
LeRobot offers a training script at [`lerobot/scripts/train.py`](../../lerobot/scripts/train.py). At a high level it does the following:
- Loads a Hydra configuration file for the following steps (more on Hydra in a moment).
- Makes a simulation environment.
- Makes a dataset corresponding to that simulation environment.
- Makes a policy.
- Runs a standard training loop with forward pass, backward pass, optimization step, and occasional logging, evaluation (of the policy on the environment), and checkpointing.
## Basics of how we use Hydra
Explaining the ins and outs of [Hydra](https://hydra.cc/docs/intro/) is beyond the scope of this document, but here we'll share the main points you need to know.
First, `lerobot/configs` has a directory structure like this:
```
.
├── default.yaml
├── env
│ ├── aloha.yaml
│ ├── pusht.yaml
│ └── xarm.yaml
└── policy
├── act.yaml
├── diffusion.yaml
└── tdmpc.yaml
```
**_For brevity, in the rest of this document we'll drop the leading `lerobot/configs` path. So `default.yaml` really refers to `lerobot/configs/default.yaml`._**
When you run the training script with
```python
python lerobot/scripts/train.py
```
Hydra is set up to read `default.yaml` (via the `@hydra.main` decorator). If you take a look at the `@hydra.main`'s arguments you will see `config_path="../configs", config_name="default"`. At the top of `default.yaml`, is a `defaults` section which looks likes this:
```yaml
defaults:
- _self_
- env: pusht
- policy: diffusion
```
This logic tells Hydra to incorporate configuration parameters from `env/pusht.yaml` and `policy/diffusion.yaml`. _Note: Be aware of the order as any configuration parameters with the same name will be overridden. Thus, `default.yaml` is overridden by `env/pusht.yaml` which is overridden by `policy/diffusion.yaml`_.
Then, `default.yaml` also contains common configuration parameters such as `device: cuda` or `use_amp: false` (for enabling fp16 training). Some other parameters are set to `???` which indicates that they are expected to be set in additional yaml files. For instance, `training.offline_steps: ???` in `default.yaml` is set to `200000` in `diffusion.yaml`.
Thanks to this `defaults` section in `default.yaml`, if you want to train Diffusion Policy with PushT, you really only need to run:
```bash
python lerobot/scripts/train.py
```
However, you can be more explicit and launch the exact same Diffusion Policy training on PushT with:
```bash
python lerobot/scripts/train.py policy=diffusion env=pusht
```
This way of overriding defaults via the CLI is especially useful when you want to change the policy and/or environment. For instance, you can train ACT on the default Aloha environment with:
```bash
python lerobot/scripts/train.py policy=act env=aloha
```
There are two things to note here:
- Config overrides are passed as `param_name=param_value`.
- Here we have overridden the defaults section. `policy=act` tells Hydra to use `policy/act.yaml`, and `env=aloha` tells Hydra to use `env/aloha.yaml`.
_As an aside: we've set up all of our configurations so that they reproduce state-of-the-art results from papers in the literature._
## Overriding configuration parameters in the CLI
Now let's say that we want to train on a different task in the Aloha environment. If you look in `env/aloha.yaml` you will see something like:
```yaml
# lerobot/configs/env/aloha.yaml
env:
task: AlohaInsertion-v0
```
And if you look in `policy/act.yaml` you will see something like:
```yaml
# lerobot/configs/policy/act.yaml
dataset_repo_id: lerobot/aloha_sim_insertion_human
```
But our Aloha environment actually supports a cube transfer task as well. To train for this task, you could manually modify the two yaml configuration files respectively.
First, we'd need to switch to using the cube transfer task for the ALOHA environment.
```diff
# lerobot/configs/env/aloha.yaml
env:
- task: AlohaInsertion-v0
+ task: AlohaTransferCube-v0
```
Then, we'd also need to switch to using the cube transfer dataset.
```diff
# lerobot/configs/policy/act.yaml
-dataset_repo_id: lerobot/aloha_sim_insertion_human
+dataset_repo_id: lerobot/aloha_sim_transfer_cube_human
```
Then, you'd be able to run:
```bash
python lerobot/scripts/train.py policy=act env=aloha
```
and you'd be training and evaluating on the cube transfer task.
An alternative approach to editing the yaml configuration files, would be to override the defaults via the command line:
```bash
python lerobot/scripts/train.py \
policy=act \
dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
env=aloha \
env.task=AlohaTransferCube-v0
```
There's something new here. Notice the `.` delimiter used to traverse the configuration hierarchy. _But be aware that the `defaults` section is an exception. As you saw above, we didn't need to write `defaults.policy=act` in the CLI. `policy=act` was enough._
Putting all that knowledge together, here's the command that was used to train https://huggingface.co/lerobot/act_aloha_sim_transfer_cube_human.
```bash
python lerobot/scripts/train.py \
hydra.run.dir=outputs/train/act_aloha_sim_transfer_cube_human \
device=cuda \
env=aloha \
env.task=AlohaTransferCube-v0 \
dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
policy=act \
training.eval_freq=10000 \
training.log_freq=250 \
training.offline_steps=100000 \
training.save_model=true \
training.save_freq=25000 \
eval.n_episodes=50 \
eval.batch_size=50 \
wandb.enable=false \
```
There's one new thing here: `hydra.run.dir=outputs/train/act_aloha_sim_transfer_cube_human`, which specifies where to save the training output.
## Using a configuration file not in `lerobot/configs`
Above we discussed that our training script is set up such that Hydra looks for `default.yaml` in `lerobot/configs`. But, if you have a configuration file elsewhere in your filesystem you may use:
```bash
python lerobot/scripts/train.py --config-dir PARENT/PATH --config-name FILE_NAME_WITHOUT_EXTENSION
```
Note: here we use regular syntax for providing CLI arguments to a Python script, not Hydra's `param_name=param_value` syntax.
As a concrete example, this becomes particularly handy when you have a folder with training outputs, and would like to re-run the training. For example, say you previously ran the training script with one of the earlier commands and have `outputs/train/my_experiment/checkpoints/pretrained_model/config.yaml`. This `config.yaml` file will have the full set of configuration parameters within it. To run the training with the same configuration again, do:
```bash
python lerobot/scripts/train.py --config-dir outputs/train/my_experiment/checkpoints/pretrained_model --config-name config
```
Note that you may still use the regular syntax for config parameter overrides (eg: by adding `training.offline_steps=200000`).
---
So far we've seen how to train Diffusion Policy for PushT and ACT for ALOHA. Now, what if we want to train ACT for PushT? Well, there are aspects of the ACT configuration that are specific to the ALOHA environments, and these happen to be incompatible with PushT. Therefore, trying to run the following will almost certainly raise an exception of sorts (eg: feature dimension mismatch):
```bash
python lerobot/scripts/train.py policy=act env=pusht dataset_repo_id=lerobot/pusht
```
Please, head on over to our [advanced tutorial on adapting policy configuration to various environments](./advanced/train_act_pusht/train_act_pusht.md) to learn more.
Or in the meantime, happy coding! 🤗

View File

@@ -1,87 +0,0 @@
# @package _global_
# Change the seed to match what PushT eval uses
# (to avoid evaluating on seeds used for generating the training data).
seed: 100000
# Change the dataset repository to the PushT one.
dataset_repo_id: lerobot/pusht
override_dataset_stats:
observation.image:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
training:
offline_steps: 80000
online_steps: 0
eval_freq: 10000
save_freq: 100000
log_freq: 250
save_model: true
batch_size: 8
lr: 1e-5
lr_backbone: 1e-5
weight_decay: 1e-4
grad_clip_norm: 10
online_steps_between_rollouts: 1
delta_timestamps:
action: "[i / ${fps} for i in range(${policy.chunk_size})]"
eval:
n_episodes: 50
batch_size: 50
# See `configuration_act.py` for more details.
policy:
name: act
# Input / output structure.
n_obs_steps: 1
chunk_size: 100 # chunk_size
n_action_steps: 100
input_shapes:
observation.image: [3, 96, 96]
observation.state: ["${env.state_dim}"]
output_shapes:
action: ["${env.action_dim}"]
# Normalization / Unnormalization
input_normalization_modes:
observation.image: mean_std
# Use min_max normalization just because it's more standard.
observation.state: min_max
output_normalization_modes:
# Use min_max normalization just because it's more standard.
action: min_max
# Architecture.
# Vision backbone.
vision_backbone: resnet18
pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
replace_final_stride_with_dilation: false
# Transformer layers.
pre_norm: false
dim_model: 512
n_heads: 8
dim_feedforward: 3200
feedforward_activation: relu
n_encoder_layers: 4
# Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
# that means only the first layer is used. Here we match the original implementation by setting this to 1.
# See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
n_decoder_layers: 1
# VAE.
use_vae: true
latent_dim: 32
n_vae_encoder_layers: 4
# Inference.
temporal_ensemble_momentum: null
# Training and loss computation.
dropout: 0.1
kl_weight: 10.0

View File

@@ -1,70 +0,0 @@
In this tutorial we will learn how to adapt a policy configuration to be compatible with a new environment and dataset. As a concrete example, we will adapt the default configuration for ACT to be compatible with the PushT environment and dataset.
If you haven't already read our tutorial on the [training script and configuration tooling](../4_train_policy_with_script.md) please do so prior to tackling this tutorial.
Let's get started!
Suppose we want to train ACT for PushT. Well, there are aspects of the ACT configuration that are specific to the ALOHA environments, and these happen to be incompatible with PushT. Therefore, trying to run the following will almost certainly raise an exception of sorts (eg: feature dimension mismatch):
```bash
python lerobot/scripts/train.py policy=act env=pusht dataset_repo_id=lerobot/pusht
```
We need to adapt the parameters of the ACT policy configuration to the PushT environment. The most important ones are the image keys.
ALOHA's datasets and environments typically use a variable number of cameras. In `lerobot/configs/policy/act.yaml` you may notice two relevant sections. Here we show you the minimal diff needed to adjust to PushT:
```diff
override_dataset_stats:
- observation.images.top:
+ observation.image:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
policy:
input_shapes:
- observation.images.top: [3, 480, 640]
+ observation.image: [3, 96, 96]
observation.state: ["${env.state_dim}"]
output_shapes:
action: ["${env.action_dim}"]
input_normalization_modes:
- observation.images.top: mean_std
+ observation.image: mean_std
observation.state: min_max
output_normalization_modes:
action: min_max
```
Here we've accounted for the following:
- PushT uses "observation.image" for its image key.
- PushT provides smaller images.
_Side note: technically we could override these via the CLI, but with many changes it gets a bit messy, and we also have a bit of a challenge in that we're using `.` in our observation keys which is treated by Hydra as a hierarchical separator_.
For your convenience, we provide [`act_pusht.yaml`](./act_pusht.yaml) in this directory. It contains the diff above, plus some other (optional) ones that are explained within. Please copy it into `lerobot/configs/policy` with:
```bash
cp examples/advanced/1_train_act_pusht/act_pusht.yaml lerobot/configs/policy/act_pusht.yaml
```
(remember from a [previous tutorial](../4_train_policy_with_script.md) that Hydra will look in the `lerobot/configs` directory). Now try running the following.
<!-- Note to contributor: are you changing this command? Note that it's tested in `Makefile`, so change it there too! -->
```bash
python lerobot/scripts/train.py policy=act_pusht env=pusht
```
Notice that this is much the same as the command that failed at the start of the tutorial, only:
- Now we are using `policy=act_pusht` to point to our new configuration file.
- We can drop `dataset_repo_id=lerobot/pusht` as the change is incorporated in our new configuration file.
Hurrah! You're now training ACT for the PushT environment.
---
The bottom line of this tutorial is that when training policies for different environments and datasets you will need to understand what parts of the policy configuration are specific to those and make changes accordingly.
Happy coding! 🤗

View File

@@ -16,12 +16,15 @@
import logging
import torch
from omegaconf import DictConfig, OmegaConf
from omegaconf import OmegaConf
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
def make_dataset(cfg: DictConfig, split="train") -> LeRobotDataset:
def make_dataset(
cfg,
split="train",
):
if cfg.env.name not in cfg.dataset_repo_id:
logging.warning(
f"There might be a mismatch between your training dataset ({cfg.dataset_repo_id=}) and your "
@@ -40,7 +43,6 @@ def make_dataset(cfg: DictConfig, split="train") -> LeRobotDataset:
cfg.dataset_repo_id,
split=split,
delta_timestamps=delta_timestamps,
n_end_keyframes_dropped=eval(cfg.training.get("n_end_keyframes_dropped", "0")),
)
if cfg.get("override_dataset_stats"):

View File

@@ -44,26 +44,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
split: str = "train",
transform: callable = None,
delta_timestamps: dict[list[float]] | None = None,
n_end_keyframes_dropped: int = 0,
):
"""
Args:
delta_timestamps: A dictionary mapping lists of relative times (Δt) to data keys. When a frame is
sampled from the underlying dataset, we treat it as a "keyframe" and load multiple frames
according to the list of Δt's. For example {"action": [-0.05, 0, 0.05]} indicates
that we want to load the current keyframe's action, as well as one from 50 ms ago, and one
50 ms into the future. The action key then contains a (3, action_dim) tensor (whereas without
`delta_timestamps` there would just be a (action_dim,) tensor. When the Δt's demand that
frames outside of an episode boundary are retrieved, a copy padding strategy is used. See
`load_previous_and_future_frames` for more details.
n_end_keyframes_dropped: Don't sample the last n items in each episode. This option is handy when
used in combination with `delta_timestamps` when, for example, the Δt's demand multiple future
frames, but we want to avoid introducing too much copy padding into the data distribution.
For example if `delta_timestamps = {"action": [0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]}`
and we sample the last frame in the episode, we would end up padding with 6 frames worth of
copies. Instead, we might want no padding (in which case we need n=6), or we might be okay
with up to 2 frames of padding (in which case we need n=4).
"""
super().__init__()
self.repo_id = repo_id
self.version = version
@@ -84,12 +65,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.info = load_info(repo_id, version, root)
if self.video:
self.videos_dir = load_videos(repo_id, version, root)
# If `n_end_keyframes_dropped == 0`, `self.index` contains exactly the indices of the hf_dataset. If
# `n_end_keyframes_dropped > 0`, `self.index` contains a subset of the indices of the hf_dataset where
# we drop those indices pertaining to the last n frames of each episode.
self.index = []
for from_ix, to_ix in zip(*self.episode_data_index.values(), strict=True):
self.index.extend(list(range(from_ix, to_ix - n_end_keyframes_dropped)))
@property
def fps(self) -> int:
@@ -132,11 +107,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
@property
def num_samples(self) -> int:
"""Number of possible samples in the dataset.
This is equivalent to the number of frames in the dataset minus n_end_keyframes_dropped.
"""
return len(self.index)
"""Number of samples/frames."""
return len(self.hf_dataset)
@property
def num_episodes(self) -> int:
@@ -156,7 +128,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
return self.num_samples
def __getitem__(self, idx):
item = self.hf_dataset[self.index[idx]]
item = self.hf_dataset[idx]
if self.delta_timestamps is not None:
item = load_previous_and_future_frames(

View File

@@ -304,11 +304,7 @@ class DiffusionModel(nn.Module):
loss = F.mse_loss(pred, target, reduction="none")
# Mask loss wherever the action is padded with copies (edges of the dataset trajectory).
if self.config.do_mask_loss_for_padding:
if "action_is_pad" not in batch:
raise ValueError(
f"You need to provide 'action_is_pad' in the batch when {self.config.do_mask_loss_for_padding=}."
)
if self.config.do_mask_loss_for_padding and "action_is_pad" in batch:
in_episode_bound = ~batch["action_is_pad"]
loss = loss * in_episode_bound.unsqueeze(-1)

View File

@@ -39,21 +39,11 @@ training:
adam_weight_decay: 1.0e-6
online_steps_between_rollouts: 1
# For each training batch we want (consider n_obs_steps=2, horizon=16):
# t | -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
# action | a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a
# observation | o, o, , , , , , , , , , , , , ,
# Note that at rollout we only use some of the actions (consider n_action_steps=8):
# action used | , a, a, a, a, a, a, a, a, , , , , , ,
delta_timestamps:
observation.image: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
observation.state: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
action: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1 - ${policy.n_obs_steps} + ${policy.horizon})]"
# The original implementation doesn't sample keyframes for the last 7 steps. This is because, as described
# above, the last 7 actions from the diffusion model are not used.
n_end_keyframes_dropped: ${policy.horizon} - ${policy.n_action_steps} - ${policy.n_obs_steps} + 1
eval:
n_episodes: 50
batch_size: 50

View File

@@ -0,0 +1,340 @@
"""Compare two policies based on metrics computed from an eval.
Usage example:
You just made changes to a policy and you want to assess its new performance against
the reference policy (i.e. before your changes).
```
python lerobot/scripts/compare_policies.py \
output/eval/ref_policy/eval_info.json \
output/eval/new_policy/eval_info.json
```
This script can accept `eval_info.json` dicts with identical seeds between each eval episode of ref_policy and
new_policy (paired-samples) or from evals performed with different seeds (independent samples).
The script will first perform normality tests to determine if parametric tests can be used or not, then
evaluate if policies metrics are significantly different using the appropriate tests.
CAVEATS: by default, this script will compare seed numbers to determine if samples can be considered paired.
If changes have been made to this environment in-between the ref_policy eval and the new_policy eval, you
should use the `--independent` flag to override this and not pair the samples even if they have identical
seeds.
"""
import argparse
import json
import logging
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from scipy.stats import anderson, kstest, mannwhitneyu, normaltest, shapiro, ttest_ind, ttest_rel, wilcoxon
from statsmodels.stats.contingency_tables import mcnemar
from termcolor import colored
from terminaltables import AsciiTable
def init_logging() -> None:
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
handlers=[logging.StreamHandler()],
)
logging.getLogger("matplotlib.font_manager").disabled = True
def log_section(title: str) -> None:
    """Log *title* centered between two 21-dash horizontal rules."""
    rule = "-" * 21
    logging.info(f"\n{rule}\n {title.center(19)} \n{rule}")
def log_test(msg: str, p_value: float):
    """Log a statistical-test result line, color-coded by significance.

    p < 0.01 -> red "H_0 Rejected"; 0.01 <= p < 0.05 -> yellow "Inconclusive";
    p >= 0.05 -> green "H_0 Not Rejected".
    """
    if p_value < 0.01:
        color, interpretation = "red", "H_0 Rejected"
    elif p_value < 0.05:
        color, interpretation = "yellow", "Inconclusive"
    else:
        color, interpretation = "green", "H_0 Not Rejected"
    p_colored = colored(f"{p_value:.3f}", color)
    verdict_colored = colored(f"{interpretation}", color, attrs=["bold"])
    logging.info(f"{msg}, p-value = {p_colored} -> {verdict_colored}")
def get_eval_info_episodes(eval_info_path: Path) -> dict:
    """Load an `eval_info.json` file and extract per-episode metrics.

    Returns a dict with numpy arrays for rewards and successes, the list of
    per-episode seeds, and the total episode count.
    """
    with open(eval_info_path) as f:
        eval_info = json.load(f)
    episodes = eval_info["per_episode"]
    return {
        "sum_rewards": np.array([ep["sum_reward"] for ep in episodes]),
        "max_rewards": np.array([ep["max_reward"] for ep in episodes]),
        "successes": np.array([ep["success"] for ep in episodes]),
        "seeds": [ep["seed"] for ep in episodes],
        "num_episodes": len(episodes),
    }
def append_table_metric(table: list, metric: str, ref_sample: dict, new_sample: dict, mean_std: bool = False):
    """Append one row comparing `metric` across both samples and return `table`.

    With `mean_std`, the row reports "mean (std)" of the metric arrays;
    otherwise the raw values are inserted as-is.
    """
    if mean_std:
        row = [
            f"{metric} - mean (std)",
            f"{np.mean(ref_sample[metric]):.3f} ({np.std(ref_sample[metric]):.3f})",
            f"{np.mean(new_sample[metric]):.3f} ({np.std(new_sample[metric]):.3f})",
        ]
    else:
        row = [metric, ref_sample[metric], new_sample[metric]]
    table.append(row)
    return table
def cohens_d(x, y):
    """Return Cohen's d effect size using the average-variance pooled std."""
    var_x = np.var(x, ddof=1)
    var_y = np.var(y, ddof=1)
    pooled_std = np.sqrt((var_x + var_y) / 2)
    return (np.mean(x) - np.mean(y)) / pooled_std
def normality_tests(array: np.ndarray, name: str):
    """Log several normality tests on `array`.

    Returns True when both Shapiro-Wilk and Kolmogorov-Smirnov fail to reject
    normality at the 5% level (D'Agostino-Pearson and Anderson-Darling are
    reported for information only).
    """
    ap_stat, ap_p = normaltest(array)
    sw_stat, sw_p = shapiro(array)
    ks_stat, ks_p = kstest(array, "norm", args=(np.mean(array), np.std(array)))
    log_test(f"{name} - D'Agostino and Pearson test: statistic = {ap_stat:.3f}", ap_p)
    log_test(f"{name} - Shapiro-Wilk test: statistic = {sw_stat:.3f}", sw_p)
    log_test(f"{name} - Kolmogorov-Smirnov test: statistic = {ks_stat:.3f}", ks_p)
    # Anderson-Darling yields no single p-value; list statistic vs critical values.
    ad_result = anderson(array)
    logging.info(f"{name} - Anderson-Darling test: statistic = {ad_result.statistic:.3f}")
    for cv, sl in zip(ad_result.critical_values, ad_result.significance_level):
        logging.info(f"  Critical value at {sl}%: {cv:.3f}")
    return sw_p > 0.05 and ks_p > 0.05
def perform_tests(ref_sample: dict, new_sample: dict, output_dir: Path, independent: bool = False):
    """Compare two eval samples end to end.

    Logs a summary table and Cohen's d effect sizes, runs paired or
    independent significance tests (samples are paired when their episode
    seeds match, unless `independent` forces unpaired treatment), then saves
    diagnostic plots into `output_dir`.

    Args:
        ref_sample: reference sample dict from `get_eval_info_episodes`.
        new_sample: new sample dict from `get_eval_info_episodes`.
        output_dir: directory where plots are written (created if missing).
        independent: when True, never pair the samples even with equal seeds.
    """
    paired = (ref_sample["seeds"] == new_sample["seeds"]) and not independent
    if paired:
        logging.info("\nSamples are paired (identical seeds).")
    else:
        logging.info("\nSamples are considered independent (seeds are different).")

    _log_comparison_table(ref_sample, new_sample)

    log_section("Effect Size")
    d_max_reward = cohens_d(ref_sample["max_rewards"], new_sample["max_rewards"])
    d_sum_reward = cohens_d(ref_sample["sum_rewards"], new_sample["sum_rewards"])
    logging.info(f"Cohen's d for Max Reward: {d_max_reward:.3f}")
    logging.info(f"Cohen's d for Sum Reward: {d_sum_reward:.3f}")

    if paired:
        paired_sample_tests(ref_sample, new_sample)
    else:
        independent_sample_tests(ref_sample, new_sample)

    _save_plots(ref_sample, new_sample, output_dir)


def _log_comparison_table(ref_sample: dict, new_sample: dict) -> None:
    """Print an ascii table of headline metrics for both samples."""
    table_data = [["Metric", "Ref.", "New"]]
    table_data = append_table_metric(table_data, "num_episodes", ref_sample, new_sample)
    for metric in ("successes", "max_rewards", "sum_rewards"):
        table_data = append_table_metric(table_data, metric, ref_sample, new_sample, mean_std=True)
    print(AsciiTable(table_data).table)


def _save_plots(ref_sample: dict, new_sample: dict, output_dir: Path) -> None:
    """Write boxplots, histograms and Q-Q plots for both reward metrics."""
    output_dir.mkdir(exist_ok=True, parents=True)
    for metric, pretty in (("max_rewards", "Max Reward"), ("sum_rewards", "Sum Reward")):
        # Filenames use the singular metric name, e.g. "max_reward".
        short = metric[:-1]
        labels = [f"Ref Sample {pretty}", f"New Sample {pretty}"]
        plot_boxplot(
            ref_sample[metric],
            new_sample[metric],
            labels,
            f"Boxplot of {pretty}s",
            f"{output_dir}/boxplot_{short}.png",
        )
        plot_histogram(
            ref_sample[metric],
            new_sample[metric],
            labels,
            f"Histogram of {pretty}s",
            f"{output_dir}/histogram_{short}.png",
        )
        plot_qqplot(
            ref_sample[metric],
            f"Q-Q Plot of Ref Sample {pretty}s",
            f"{output_dir}/qqplot_sample_a_{short}.png",
        )
        plot_qqplot(
            new_sample[metric],
            f"Q-Q Plot of New Sample {pretty}s",
            f"{output_dir}/qqplot_sample_b_{short}.png",
        )
def paired_sample_tests(ref_sample: dict, new_sample: dict):
    """Run paired significance tests on both samples.

    Rewards use a paired t-test when the per-episode differences look normal,
    otherwise the Wilcoxon signed-rank test; success rates use McNemar's test.
    """
    log_section("Normality tests")
    metrics = (("max_rewards", "Max Reward"), ("sum_rewards", "Sum Reward"))
    diff_is_normal = {}
    for metric, pretty in metrics:
        diff = ref_sample[metric] - new_sample[metric]
        diff_is_normal[metric] = normality_tests(diff, f"{pretty} Difference")

    log_section("Paired-sample tests")
    for metric, pretty in metrics:
        if diff_is_normal[metric]:
            stat, p_val = ttest_rel(ref_sample[metric], new_sample[metric])
            log_test(f"Paired t-test for {pretty}: t-statistic = {stat:.3f}", p_val)
        else:
            stat, p_val = wilcoxon(ref_sample[metric], new_sample[metric])
            log_test(f"Wilcoxon test for {pretty}: statistic = {stat:.3f}", p_val)

    # 2x2 contingency table counting success/failure agreement across samples.
    ref_ok = ref_sample["successes"] == 1
    new_ok = new_sample["successes"] == 1
    contingency = np.array(
        [
            [np.sum(ref_ok & new_ok), np.sum(ref_ok & ~new_ok)],
            [np.sum(~ref_ok & new_ok), np.sum(~ref_ok & ~new_ok)],
        ]
    )
    mcnemar_result = mcnemar(contingency, exact=True)
    log_test(f"McNemar's test for Success: statistic = {mcnemar_result.statistic:.3f}", mcnemar_result.pvalue)
def independent_sample_tests(ref_sample: dict, new_sample: dict):
    """Run independent-sample significance tests on both reward metrics.

    For each metric, uses Welch's two-sample t-test when both samples look
    normal, and the Mann-Whitney U test otherwise. Every result is logged and
    summarized in one ascii table.

    Fixes a WIP inconsistency: the t-test was previously computed twice, and
    table rows were only appended in the max_rewards branch while applying the
    max_rewards normality verdict to both metrics.
    """
    log_section("Normality tests")
    normal = {}
    for metric, pretty in (("max_rewards", "Max Rewards"), ("sum_rewards", "Sum Rewards")):
        normal_ref = normality_tests(ref_sample[metric], f"{pretty} Ref Sample")
        normal_new = normality_tests(new_sample[metric], f"{pretty} New Sample")
        normal[metric] = normal_ref and normal_new

    log_section("Independent samples tests")
    alpha = 0.05
    table_data = [["Metric", "Test", "p-value"]]
    for metric, pretty in (("max_rewards", "Max Reward"), ("sum_rewards", "Sum Reward")):
        if normal[metric]:
            # Welch's t-test (equal_var=False) avoids the equal-variance assumption.
            t_stat, p_val = ttest_ind(ref_sample[metric], new_sample[metric], equal_var=False)
            log_test(f"Two-Sample t-test for {pretty}: t-statistic = {t_stat:.3f}", p_val)
            test_name = "Two-Sample t-test"
        else:
            u_stat, p_val = mannwhitneyu(ref_sample[metric], new_sample[metric])
            log_test(f"Mann-Whitney U test for {pretty}: U-statistic = {u_stat:.3f}", p_val)
            test_name = "Mann-Whitney U"
        status = "✓" if p_val >= alpha else "✗"
        table_data.append([metric, test_name, f"{status} {p_val:.3f}"])
    print(AsciiTable(table_data).table)
def append_independent_test(
table: list,
ref_sample: dict,
new_sample: dict,
test: callable,
test_name: str,
kwargs: dict | None = None,
) -> list:
kwargs = {} if kwargs is None else kwargs
row = [f"{test_name}: p-value ≥ alpha"]
for metric in table[0][1:]:
_, p_val = test(ref_sample[metric], new_sample[metric], **kwargs)
alpha = 0.05
status = "" if p_val >= alpha else ""
row.append(f"{status} {p_val:.3f}{alpha}")
table.append(row)
return table
def plot_boxplot(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
    """Save a side-by-side boxplot of the two samples to `filename`."""
    ax = plt.gca()
    ax.boxplot([data_a, data_b], labels=labels)
    ax.set_title(title)
    plt.savefig(filename)
    plt.close()
def plot_histogram(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
    """Save overlaid 30-bin histograms of the two samples to `filename`."""
    ax = plt.gca()
    for data, label in zip((data_a, data_b), labels):
        ax.hist(data, bins=30, alpha=0.7, label=label)
    ax.set_title(title)
    ax.legend()
    plt.savefig(filename)
    plt.close()
def plot_qqplot(data: np.ndarray, title: str, filename: str):
    """Save a normal Q-Q plot of `data` to `filename`."""
    ax = plt.gca()
    stats.probplot(data, dist="norm", plot=ax)
    ax.set_title(title)
    plt.savefig(filename)
    plt.close()
if __name__ == "__main__":
    # Reuse the module docstring as the CLI help text, preserving its layout.
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("ref_sample_path", type=Path, help="Path to the reference sample JSON file.")
    parser.add_argument("new_sample_path", type=Path, help="Path to the new sample JSON file.")
    parser.add_argument(
        "--independent",
        action="store_true",
        help="Ignore seeds and consider samples to be independent (unpaired).",
    )
    parser.add_argument(
        "--output_dir",
        type=Path,
        default=Path("outputs/compare/"),
        help="Directory to save the output results. Defaults to outputs/compare/",
    )
    args = parser.parse_args()

    init_logging()
    perform_tests(
        get_eval_info_episodes(args.ref_sample_path),
        get_eval_info_episodes(args.new_sample_path),
        args.output_dir,
        args.independent,
    )

1006
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -41,7 +41,7 @@ numba = ">=0.59.0"
torch = "^2.2.1"
opencv-python = ">=4.9.0"
diffusers = "^0.27.2"
torchvision = ">=0.17.1"
torchvision = ">=0.18.0"
h5py = ">=3.10.0"
huggingface-hub = {extras = ["hf-transfer"], version = "^0.23.0"}
gymnasium = ">=0.29.1"
@@ -58,15 +58,19 @@ imagecodecs = { version = ">=2024.1.1", optional = true }
pyav = ">=12.0.5"
moviepy = ">=1.0.3"
rerun-sdk = ">=0.15.1"
statsmodels = {version = ">=0.14.2", optional = true}
matplotlib = {version = ">=3.8.4", optional = true}
terminaltables = {version = ">=3.1.10", optional = true}
[tool.poetry.extras]
pusht = ["gym-pusht"]
xarm = ["gym-xarm"]
aloha = ["gym-aloha"]
umi = ["imagecodecs"]
compare = ["statsmodels", "matplotlib", "terminaltables"]
dev = ["pre-commit", "debugpy"]
test = ["pytest", "pytest-cov"]
umi = ["imagecodecs"]
[tool.ruff]
line-length = 110

View File

@@ -115,7 +115,6 @@ def test_compute_stats_on_xarm():
# reduce size of dataset sample on which stats compute is tested to 10 frames
dataset.hf_dataset = dataset.hf_dataset.select(range(10))
dataset.index = [i for i in dataset.index if i < 10]
# Note: we set the batch size to be smaller than the whole dataset to make sure we are testing batched
# computation of the statistics. While doing this, we also make sure it works when we don't divide the

View File

@@ -45,11 +45,11 @@ def test_example_1():
@require_package("gym_pusht")
def test_examples_basic2_basic3_advanced1():
def test_examples_2_through_4():
"""
Train a model with example 3, check the outputs.
Evaluate the trained model with example 2, check the outputs.
Calculate the validation loss with advanced example 1, check the outputs.
Calculate the validation loss with example 4, check the outputs.
"""
### Test example 3
@@ -97,7 +97,7 @@ def test_examples_basic2_basic3_advanced1():
assert Path("outputs/eval/example_pusht_diffusion/rollout.mp4").exists()
## Test example 4
file_contents = _read_file("examples/advanced/2_calculate_validation_loss.py")
file_contents = _read_file("examples/4_calculate_validation_loss.py")
# Run on a single example from the last episode, use CPU, and use the local model.
file_contents = _find_and_replace(