pretrained config for act

Refactor datasets into LeRobotDataset (#91 )
Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
2024-04-25 16:06:57 +02:00 · 2024-04-25 12:23:12 +02:00 · 2024-04-25 11:47:38 +02:00
14 changed files with 189 additions and 180 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@ data
 outputs
 .vscode
 rl
-.DS_Store

 # HPC
 nautilus/*.yaml
--- a/README.md
+++ b/README.md
@@ -153,7 +153,7 @@ See `python lerobot/scripts/eval.py --help` for more instructions.

 ### Train your own policy

-Checkout [examples](./examples) to see how tou can start training a model on a dataset, which will be automatically downloaded if needed.
+Check out [examples](./examples) to see how you can start training a model on a dataset, which will be automatically downloaded if needed.

 In general, you can use our training script to easily train any policy on any environment:
 ```bash
@@ -165,7 +165,7 @@ policy=act \
 hydra.run.dir=outputs/train/aloha_act
 ```

-After training, you may want to revisit model evaluation to change the evaluation settings. In fact, during training every checkpoints are already evaluated but on a low number of episodes for efficiency. Checkout [example](./examples) to evaluate any model checkpoint on more episodes to increase statistical significance.
+After training, you may want to revisit model evaluation to change the evaluation settings. In fact, during training every checkpoint is already evaluated but on a low number of episodes for efficiency. Check out [example](./examples) to evaluate any model checkpoint on more episodes to increase statistical significance.

 ## Contribute

--- a/lerobot/common/policies/act/configuration_act.py
+++ b/lerobot/common/policies/act/configuration_act.py
@@ -1,21 +1,17 @@
-from dataclasses import dataclass, field
+from transformers.configuration_utils import PretrainedConfig


-@dataclass
-class ActionChunkingTransformerConfig:
+class ActionChunkingTransformerConfig(PretrainedConfig):
    """Configuration class for the Action Chunking Transformers policy.

    Defaults are configured for training on bimanual Aloha tasks like "insertion" or "transfer".

    The parameters you will most likely need to change are the ones which depend on the environment / sensors.
-    Those are: `state_dim`, `action_dim` and `camera_names`.
+    Those are: `input_shapes` and `output_shapes`.

    Args:
-        state_dim: Dimensionality of the observation state space (excluding images).
-        action_dim: Dimensionality of the action space.
        n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the
            current step and additional steps going back).
-        camera_names: The (unique) set of names for the cameras.
        chunk_size: The size of the action prediction "chunks" in units of environment steps.
        n_action_steps: The number of action steps to run in the environment for one invocation of the policy.
            This should be no greater than the chunk size. For example, if the chunk size is 100, you may
@@ -58,43 +54,41 @@ class ActionChunkingTransformerConfig:
        dropout: Dropout to use in the transformer layers (see code for details).
        kl_weight: The weight to use for the KL-divergence component of the loss if the variational objective
            is enabled. Loss is then calculated as: `reconstruction_loss + kl_weight * kld_loss`.
-    """

-    # Environment.
-    # TODO(rcadene, alexander-soare): remove these as they are defined in input_shapes, output_shapes
-    state_dim: int = 14
-    action_dim: int = 14
+        Example:

-    # Inputs / output structure.
+        ```python
+        >>> from lerobot import ActionChunkingTransformerConfig
+
+        >>> # Initializing an ACT style configuration
+        >>> configuration = ActionChunkingTransformerConfig()
+
+        >>> # Initializing a model (with random weights) from the ACT style configuration
+        >>> model = ActionChunkingTransformerPolicy(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+        ```"""
+
+    # Input / output structure.
    n_obs_steps: int = 1
-    camera_names: tuple[str] = ("top",)
    chunk_size: int = 100
    n_action_steps: int = 100

-    input_shapes: dict[str, list[str]] = field(
-        default_factory=lambda: {
-            "observation.images.top": [3, 480, 640],
-            "observation.state": [14],
-        }
-    )
-    output_shapes: dict[str, list[str]] = field(
-        default_factory=lambda: {
-            "action": [14],
-        }
-    )
+    input_shapes: dict[str, list[str]] = {
+        "observation.images.top": [3, 480, 640],
+        "observation.state": [14],
+    }
+
+    output_shapes: dict[str, list[str]] = {"action": [14]}

    # Normalization / Unnormalization
-    normalize_input_modes: dict[str, str] = field(
-        default_factory=lambda: {
-            "observation.image": "mean_std",
-            "observation.state": "mean_std",
-        }
-    )
-    unnormalize_output_modes: dict[str, str] = field(
-        default_factory=lambda: {
-            "action": "mean_std",
-        }
-    )
+    normalize_input_modes: dict[str, str] = {
+        "observation.image": "mean_std",
+        "observation.state": "mean_std",
+    }
+
+    unnormalize_output_modes: dict[str, str] = {"action": "mean_std"}

    # Architecture.
    # Vision backbone.
@@ -147,7 +141,10 @@ class ActionChunkingTransformerConfig:
            raise ValueError(
                f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`"
            )
-        if self.camera_names != ["top"]:
-            raise ValueError(f"For now, `camera_names` can only be ['top']. Got {self.camera_names}.")
-        if len(set(self.camera_names)) != len(self.camera_names):
-            raise ValueError(f"`camera_names` should not have any repeated entries. Got {self.camera_names}.")
+        # Check that there is only one image.
+        # TODO(alexander-soare): generalize this to multiple images.
+        if (
+            sum(k.startswith("observation.images.") for k in self.input_shapes) != 1
+            or "observation.images.top" not in self.input_shapes
+        ):
+            raise ValueError('For now, only "observation.images.top" is accepted for an image input.')
--- a/lerobot/common/policies/act/modeling_act.py
+++ b/lerobot/common/policies/act/modeling_act.py
@@ -5,6 +5,7 @@ The majority of changes here involve removing unused code, unifying naming, and
 """

 import math
+import time
 from collections import deque
 from itertools import chain
 from typing import Callable
@@ -80,9 +81,13 @@ class ActionChunkingTransformerPolicy(nn.Module):
            self.vae_encoder = _TransformerEncoder(cfg)
            self.vae_encoder_cls_embed = nn.Embedding(1, cfg.d_model)
            # Projection layer for joint-space configuration to hidden dimension.
-            self.vae_encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, cfg.d_model)
+            self.vae_encoder_robot_state_input_proj = nn.Linear(
+                cfg.input_shapes["observation.state"][0], cfg.d_model
+            )
            # Projection layer for action (joint-space target) to hidden dimension.
-            self.vae_encoder_action_input_proj = nn.Linear(cfg.state_dim, cfg.d_model)
+            self.vae_encoder_action_input_proj = nn.Linear(
+                cfg.input_shapes["observation.state"][0], cfg.d_model
+            )
            self.latent_dim = cfg.latent_dim
            # Projection layer from the VAE encoder's output to the latent distribution's parameter space.
            self.vae_encoder_latent_output_proj = nn.Linear(cfg.d_model, self.latent_dim * 2)
@@ -110,7 +115,7 @@ class ActionChunkingTransformerPolicy(nn.Module):

        # Transformer encoder input projections. The tokens will be structured like
        # [latent, robot_state, image_feature_map_pixels].
-        self.encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, cfg.d_model)
+        self.encoder_robot_state_input_proj = nn.Linear(cfg.input_shapes["observation.state"][0], cfg.d_model)
        self.encoder_latent_input_proj = nn.Linear(self.latent_dim, cfg.d_model)
        self.encoder_img_feat_input_proj = nn.Conv2d(
            backbone_model.fc.in_features, cfg.d_model, kernel_size=1
@@ -124,9 +129,28 @@ class ActionChunkingTransformerPolicy(nn.Module):
        self.decoder_pos_embed = nn.Embedding(cfg.chunk_size, cfg.d_model)

        # Final action regression head on the output of the transformer's decoder.
-        self.action_head = nn.Linear(cfg.d_model, cfg.action_dim)
+        self.action_head = nn.Linear(cfg.d_model, cfg.output_shapes["action"][0])

        self._reset_parameters()
+        self._create_optimizer()
+
+    def _create_optimizer(self):
+        optimizer_params_dicts = [
+            {
+                "params": [
+                    p for n, p in self.named_parameters() if not n.startswith("backbone") and p.requires_grad
+                ]
+            },
+            {
+                "params": [
+                    p for n, p in self.named_parameters() if n.startswith("backbone") and p.requires_grad
+                ],
+                "lr": self.cfg.lr_backbone,
+            },
+        ]
+        self.optimizer = torch.optim.AdamW(
+            optimizer_params_dicts, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay
+        )

    def _reset_parameters(self):
        """Xavier-uniform initialization of the transformer parameters as in the original code."""
@@ -186,6 +210,33 @@ class ActionChunkingTransformerPolicy(nn.Module):

        return loss_dict

+    def update(self, batch, **_) -> dict:
+        """Run the model in train mode, compute the loss, and do an optimization step."""
+        start_time = time.time()
+        self.train()
+
+        batch = self.normalize_inputs(batch)
+
+        loss_dict = self.forward(batch)
+        # TODO(rcadene): self.unnormalize_outputs(out_dict)
+        loss = loss_dict["loss"]
+        loss.backward()
+
+        grad_norm = torch.nn.utils.clip_grad_norm_(
+            self.parameters(), self.cfg.grad_clip_norm, error_if_nonfinite=False
+        )
+
+        self.optimizer.step()
+        self.optimizer.zero_grad()
+
+        info = {
+            "loss": loss.item(),
+            "grad_norm": float(grad_norm),
+            "lr": self.cfg.lr,
+            "update_s": time.time() - start_time,
+        }
+
+        return info

    def _stack_images(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
        """Stacks all the images in a batch and puts them in a new key: "observation.images".
@@ -196,17 +247,9 @@ class ActionChunkingTransformerPolicy(nn.Module):
            "observation.images.{name}": (B, C, H, W) tensor of images.
        }
        """
-        # Check that there is only one image.
-        # TODO(alexander-soare): generalize this to multiple images.
-        provided_cameras = {k.rsplit(".", 1)[-1] for k in batch if k.startswith("observation.images.")}
-        if len(missing := set(self.cfg.camera_names).difference(provided_cameras)) > 0:
-            raise ValueError(
-                f"The following camera images are missing from the provided batch: {missing}. Check the "
-                "configuration parameter: `camera_names`."
-            )
-        # Stack images in the order dictated by the camera names.
+        # Stack images in the order dictated by input_shapes.
        batch["observation.images"] = torch.stack(
-            [batch[f"observation.images.{name}"] for name in self.cfg.camera_names],
+            [batch[k] for k in self.cfg.input_shapes if k.startswith("observation.images.")],
            dim=-4,
        )

@@ -274,7 +317,7 @@ class ActionChunkingTransformerPolicy(nn.Module):
        all_cam_features = []
        all_cam_pos_embeds = []
        images = batch["observation.images"]
-        for cam_index in range(len(self.cfg.camera_names)):
+        for cam_index in range(images.shape[-4]):
            cam_features = self.backbone(images[:, cam_index])["feature_map"]
            cam_pos_embed = self.encoder_cam_feat_pos_embed(cam_features).to(dtype=cam_features.dtype)
            cam_features = self.encoder_img_feat_input_proj(cam_features)  # (B, C, h, w)
--- a/lerobot/common/policies/diffusion/configuration_diffusion.py
+++ b/lerobot/common/policies/diffusion/configuration_diffusion.py
@@ -8,12 +8,9 @@ class DiffusionConfig:
    Defaults are configured for training with PushT providing proprioceptive and single camera observations.

    The parameters you will most likely need to change are the ones which depend on the environment / sensors.
-    Those are: `state_dim`, `action_dim` and `image_size`.
+    Those are: `input_shapes` and `output_shapes`.

    Args:
-        state_dim: Dimensionality of the observation state space (excluding images).
-        action_dim: Dimensionality of the action space.
-        image_size: (H, W) size of the input images.
        n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the
            current step and additional steps going back).
        horizon: Diffusion model action prediction size as detailed in `DiffusionPolicy.select_action`.
@@ -68,13 +65,6 @@ class DiffusionConfig:
            spaced). If not provided, this defaults to be the same as `num_train_timesteps`.
    """

-    # Environment.
-    # Inherit these from the environment config.
-    # TODO(rcadene, alexander-soare): remove these as they are defined in input_shapes, output_shapes
-    state_dim: int = 2
-    action_dim: int = 2
-    image_size: tuple[int, int] = (96, 96)
-
    # Inputs / output structure.
    n_obs_steps: int = 2
    horizon: int = 16
@@ -155,10 +145,14 @@ class DiffusionConfig:
            raise ValueError(
                f"`vision_backbone` must be one of the ResNet variants. Got {self.vision_backbone}."
            )
-        if self.crop_shape[0] > self.image_size[0] or self.crop_shape[1] > self.image_size[1]:
+        if (
+            self.crop_shape[0] > self.input_shapes["observation.image"][1]
+            or self.crop_shape[1] > self.input_shapes["observation.image"][2]
+        ):
            raise ValueError(
-                f"`crop_shape` should fit within `image_size`. Got {self.crop_shape} for `crop_shape` and "
-                f"{self.image_size} for `image_size`."
+                f'`crop_shape` should fit within `input_shapes["observation.image"]`. Got {self.crop_shape} '
+                f'for `crop_shape` and {self.input_shapes["observation.image"]} for '
+                '`input_shapes["observation.image"]`.'
            )
        supported_prediction_types = ["epsilon", "sample"]
        if self.prediction_type not in supported_prediction_types:
--- a/lerobot/common/policies/diffusion/modeling_diffusion.py
+++ b/lerobot/common/policies/diffusion/modeling_diffusion.py
@@ -11,6 +11,7 @@ TODO(alexander-soare):
 import copy
 import logging
 import math
+import time
 from collections import deque
 from typing import Callable

@@ -18,6 +19,7 @@ import einops
 import torch
 import torch.nn.functional as F  # noqa: N812
 import torchvision
+from diffusers.optimization import get_scheduler
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
 from robomimic.models.base_nets import SpatialSoftmax
 from torch import Tensor, nn
@@ -69,7 +71,25 @@ class DiffusionPolicy(nn.Module):
            self.ema_diffusion = copy.deepcopy(self.diffusion)
            self.ema = _EMA(cfg, model=self.ema_diffusion)

+        # TODO(alexander-soare): Move optimizer out of policy.
+        self.optimizer = torch.optim.Adam(
+            self.diffusion.parameters(), cfg.lr, cfg.adam_betas, cfg.adam_eps, cfg.adam_weight_decay
+        )

+        # TODO(alexander-soare): Move LR scheduler out of policy.
+        # TODO(rcadene): modify lr scheduler so that it doesn't depend on epochs but steps
+        self.global_step = 0
+
+        # configure lr scheduler
+        self.lr_scheduler = get_scheduler(
+            cfg.lr_scheduler,
+            optimizer=self.optimizer,
+            num_warmup_steps=cfg.lr_warmup_steps,
+            num_training_steps=lr_scheduler_num_training_steps,
+            # pytorch assumes stepping LRScheduler every epoch
+            # however huggingface diffusers steps it every batch
+            last_epoch=self.global_step - 1,
+        )

    def reset(self):
        """
@@ -135,6 +155,41 @@ class DiffusionPolicy(nn.Module):
        loss = self.diffusion.compute_loss(batch)
        return {"loss": loss}

+    def update(self, batch: dict[str, Tensor], **_) -> dict:
+        """Run the model in train mode, compute the loss, and do an optimization step."""
+        start_time = time.time()
+
+        self.diffusion.train()
+
+        batch = self.normalize_inputs(batch)
+
+        loss = self.forward(batch)["loss"]
+        loss.backward()
+
+        # TODO(rcadene): self.unnormalize_outputs(out_dict)
+
+        grad_norm = torch.nn.utils.clip_grad_norm_(
+            self.diffusion.parameters(),
+            self.cfg.grad_clip_norm,
+            error_if_nonfinite=False,
+        )
+
+        self.optimizer.step()
+        self.optimizer.zero_grad()
+        self.lr_scheduler.step()
+
+        if self.ema is not None:
+            self.ema.step(self.diffusion)
+
+        info = {
+            "loss": loss.item(),
+            "grad_norm": float(grad_norm),
+            "lr": self.lr_scheduler.get_last_lr()[0],
+            "update_s": time.time() - start_time,
+        }
+
+        return info
+
    def save(self, fp):
        torch.save(self.state_dict(), fp)

@@ -156,7 +211,8 @@ class _DiffusionUnetImagePolicy(nn.Module):

        self.rgb_encoder = _RgbEncoder(cfg)
        self.unet = _ConditionalUnet1D(
-            cfg, global_cond_dim=(cfg.action_dim + self.rgb_encoder.feature_dim) * cfg.n_obs_steps
+            cfg,
+            global_cond_dim=(cfg.output_shapes["action"][0] + self.rgb_encoder.feature_dim) * cfg.n_obs_steps,
        )

        self.noise_scheduler = DDPMScheduler(
@@ -184,7 +240,7 @@ class _DiffusionUnetImagePolicy(nn.Module):

        # Sample prior.
        sample = torch.randn(
-            size=(batch_size, self.cfg.horizon, self.cfg.action_dim),
+            size=(batch_size, self.cfg.horizon, self.cfg.output_shapes["action"][0]),
            dtype=dtype,
            device=device,
            generator=generator,
@@ -227,7 +283,7 @@ class _DiffusionUnetImagePolicy(nn.Module):
        sample = self.conditional_sample(batch_size, global_cond=global_cond)

        # `horizon` steps worth of actions (from the first observation).
-        actions = sample[..., : self.cfg.action_dim]
+        actions = sample[..., : self.cfg.output_shapes["action"][0]]
        # Extract `n_action_steps` steps worth of actions (from the current observation).
        start = n_obs_steps - 1
        end = start + self.cfg.n_action_steps
@@ -337,7 +393,9 @@ class _RgbEncoder(nn.Module):
        # Set up pooling and final layers.
        # Use a dry run to get the feature map shape.
        with torch.inference_mode():
-            feat_map_shape = tuple(self.backbone(torch.zeros(size=(1, 3, *cfg.image_size))).shape[1:])
+            feat_map_shape = tuple(
+                self.backbone(torch.zeros(size=(1, *cfg.input_shapes["observation.image"]))).shape[1:]
+            )
        self.pool = SpatialSoftmax(feat_map_shape, num_kp=cfg.spatial_softmax_num_keypoints)
        self.feature_dim = cfg.spatial_softmax_num_keypoints * 2
        self.out = nn.Linear(cfg.spatial_softmax_num_keypoints * 2, self.feature_dim)
@@ -454,7 +512,7 @@ class _ConditionalUnet1D(nn.Module):

        # In channels / out channels for each downsampling block in the Unet's encoder. For the decoder, we
        # just reverse these.
-        in_out = [(cfg.action_dim, cfg.down_dims[0])] + list(
+        in_out = [(cfg.output_shapes["action"][0], cfg.down_dims[0])] + list(
            zip(cfg.down_dims[:-1], cfg.down_dims[1:], strict=True)
        )

@@ -505,7 +563,7 @@ class _ConditionalUnet1D(nn.Module):

        self.final_conv = nn.Sequential(
            _Conv1dBlock(cfg.down_dims[0], cfg.down_dims[0], kernel_size=cfg.kernel_size),
-            nn.Conv1d(cfg.down_dims[0], cfg.action_dim, 1),
+            nn.Conv1d(cfg.down_dims[0], cfg.output_shapes["action"][0], 1),
        )

    def forward(self, x: Tensor, timestep: Tensor | int, global_cond=None) -> Tensor:
--- a/lerobot/configs/env/aloha.yaml
+++ b/lerobot/configs/env/aloha.yaml
@@ -21,7 +21,5 @@ env:
  image_size: [3, 480, 640]
  episode_length: 400
  fps: ${fps}
-
-policy:
  state_dim: 14
  action_dim: 14
--- a/lerobot/configs/env/pusht.yaml
+++ b/lerobot/configs/env/pusht.yaml
@@ -21,7 +21,5 @@ env:
  image_size: 96
  episode_length: 300
  fps: ${fps}
-
-policy:
  state_dim: 2
  action_dim: 2
--- a/lerobot/configs/env/xarm.yaml
+++ b/lerobot/configs/env/xarm.yaml
@@ -20,7 +20,5 @@ env:
  image_size: 84
  episode_length: 25
  fps: ${fps}
-
-policy:
  state_dim: 4
  action_dim: 4
--- a/lerobot/configs/policy/act.yaml
+++ b/lerobot/configs/policy/act.yaml
@@ -23,23 +23,17 @@ policy:

  pretrained_model_path:

-  # Environment.
-  # Inherit these from the environment config.
-  state_dim: ???
-  action_dim: ???
-
-  # Inputs / output structure.
+  # Input / output structure.
  n_obs_steps: ${n_obs_steps}
-  camera_names: [top]  # [top, front_close, left_pillar, right_pillar]
  chunk_size: 100 # chunk_size
  n_action_steps: 100

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.images.top: [3, 480, 640]
-    observation.state: ["${policy.state_dim}"]
+    observation.state: ["${env.state_dim}"]
  output_shapes:
-    action: ["${policy.action_dim}"]
+    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  normalize_input_modes:
--- a/lerobot/configs/policy/diffusion.yaml
+++ b/lerobot/configs/policy/diffusion.yaml
@@ -37,15 +37,7 @@ policy:

  pretrained_model_path:

-  # Environment.
-  # Inherit these from the environment config.
-  state_dim: ???
-  action_dim: ???
-  image_size:
-    - ${env.image_size}  # height
-    - ${env.image_size}  # width
-
-  # Inputs / output structure.
+  # Input / output structure.
  n_obs_steps: ${n_obs_steps}
  horizon: ${horizon}
  n_action_steps: ${n_action_steps}
@@ -53,9 +45,9 @@ policy:
  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.image: [3, 96, 96]
-    observation.state: ["${policy.state_dim}"]
+    observation.state: ["${env.state_dim}"]
  output_shapes:
-    action: ["${policy.action_dim}"]
+    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  normalize_input_modes:
--- a/lerobot/configs/policy/tdmpc.yaml
+++ b/lerobot/configs/policy/tdmpc.yaml
@@ -16,8 +16,8 @@ policy:
  frame_stack: 1
  num_channels: 32
  img_size: ${env.image_size}
-  state_dim: ???
-  action_dim: ???
+  state_dim: ${env.action_dim}
+  action_dim: ${env.action_dim}

  # planning
  mpc: true
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -1,5 +1,4 @@
 import logging
-import time
 from copy import deepcopy
 from pathlib import Path

@@ -8,7 +7,6 @@ import hydra
 import torch
 from datasets import concatenate_datasets
 from datasets.utils import disable_progress_bars, enable_progress_bars
-from diffusers.optimization import get_scheduler

 from lerobot.common.datasets.factory import make_dataset
 from lerobot.common.datasets.utils import cycle
@@ -24,45 +22,6 @@ from lerobot.common.utils.utils import (
 from lerobot.scripts.eval import eval_policy


-def update_policy(policy, batch, optimizer, grad_clip_norm, lr_scheduler=None):
-    start_time = time.time()
-
-    model = policy.diffusion if hasattr(policy, "diffusion") else policy  # TODO: hacky, remove this line
-    model.train()
-
-    batch = policy.normalize_inputs(batch)
-
-    output_dict = policy.forward(batch)
-    # TODO(rcadene): policy.unnormalize_outputs(out_dict)
-    loss = output_dict["loss"]
-    loss.backward()
-
-    # Diffusion
-    model = policy.diffusion if hasattr(policy, "diffusion") else policy  # TODO: hacky, remove this line
-    grad_norm = torch.nn.utils.clip_grad_norm_(
-        model.parameters(),
-        grad_clip_norm,
-        error_if_nonfinite=False,
-    )
-
-    optimizer.step()
-    optimizer.zero_grad()
-    if lr_scheduler is not None:
-        lr_scheduler.step()
-
-    if hasattr(policy, "ema") and policy.ema is not None:
-        policy.ema.step(model)
-
-    info = {
-        "loss": loss.item(),
-        "grad_norm": float(grad_norm),
-        "lr": optimizer.param_groups[0]['lr'],
-        "update_s": time.time() - start_time,
-    }
-
-    return info
-
-
@hydra.main(version_base=None, config_name="default", config_path="../configs")
 def train_cli(cfg: dict):
    train(
@@ -275,35 +234,6 @@ def train(cfg: dict, out_dir=None, job_name=None):
    logging.info("make_policy")
    policy = make_policy(cfg, dataset_stats=offline_dataset.stats)

-    # Create optimizer and scheduler
-    # Temporary hack to move optimizer out of policy
-    if cfg.policy.name == "act":
-        optimizer_params_dicts = [
-            {"params": [p for n, p in policy.named_parameters() if not n.startswith("backbone") and p.requires_grad]},
-            {
-                "params": [p for n, p in policy.named_parameters() if n.startswith("backbone") and p.requires_grad],
-                "lr": cfg.lr_backbone,
-            },
-        ]
-        optimizer = torch.optim.AdamW(optimizer_params_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay)
-        lr_scheduler = None
-    elif cfg.policy.name == "diffusion":
-        optimizer = torch.optim.Adam(
-            policy.diffusion.parameters(), cfg.lr, cfg.adam_betas, cfg.adam_eps, cfg.adam_weight_decay
-        )
-        # TODO(rcadene): modify lr scheduler so that it doesn't depend on epochs but steps
-        global_step = 0
-        # configure lr scheduler
-        lr_scheduler = get_scheduler(
-            cfg.lr_scheduler,
-            optimizer=optimizer,
-            num_warmup_steps=cfg.lr_warmup_steps,
-            num_training_steps=cfg.offline_steps,
-            # pytorch assumes stepping LRScheduler every epoch
-            # however huggingface diffusers steps it every batch
-            last_epoch=global_step - 1,
-        )
-
    num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)
    num_total_params = sum(p.numel() for p in policy.parameters())

@@ -363,7 +293,7 @@ def train(cfg: dict, out_dir=None, job_name=None):
        for key in batch:
            batch[key] = batch[key].to(cfg.device, non_blocking=True)

-        train_info = update_policy(policy, batch, optimizer, cfg.grad_clip_norm, lr_scheduler)
+        train_info = policy.update(batch, step=step)

        # TODO(rcadene): is it ok if step_t=0 = 0 and not 1 as previously done?
        if step % cfg.log_freq == 0:
@@ -386,7 +316,9 @@ def train(cfg: dict, out_dir=None, job_name=None):
    # create dataloader for online training
    concat_dataset = torch.utils.data.ConcatDataset([offline_dataset, online_dataset])
    weights = [1.0] * len(concat_dataset)
-    sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples=len(concat_dataset), replacement=True)
+    sampler = torch.utils.data.WeightedRandomSampler(
+        weights, num_samples=len(concat_dataset), replacement=True
+    )
    dataloader = torch.utils.data.DataLoader(
        concat_dataset,
        num_workers=4,
--- a/tests/test_policies.py
+++ b/tests/test_policies.py
@@ -121,7 +121,13 @@ def test_policy(env_name, policy_name, extra_overrides):
    ],
 )
 def test_normalize(insert_temporal_dim):
-    # TODO(rcadene, alexander-soare): test with real data and assert results of normalization/unnormalization
+    """
+    Test that normalize/unnormalize can run without exceptions when properly set up, and that they raise
+    an exception when the forward pass is called without the stats having been provided.
+
+    TODO(rcadene, alexander-soare): This should also test that the normalization / unnormalization works as
+    expected.
+    """

    input_shapes = {
        "observation.image": [3, 96, 96],
@@ -193,7 +199,7 @@ def test_normalize(insert_temporal_dim):
    new_normalize.load_state_dict(normalize.state_dict())
    new_normalize(input_batch)

-    # test wihtout stats
+    # test without stats
    unnormalize = Unnormalize(output_shapes, unnormalize_output_modes, stats=None)
    with pytest.raises(AssertionError):
        unnormalize(output_batch)
Author	SHA1	Message	Date
Quentin Gallouédec	783a40c9d4	pretrained config for act	2024-04-25 16:06:57 +02:00
Remi	659c69a1c0	Refactor datasets into LeRobotDataset (#91 ) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>	2024-04-25 12:23:12 +02:00
Remi	e760e4cd63	Move normalization to policy for act and diffusion (#90 ) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>	2024-04-25 11:47:38 +02:00