From c594b5e79f0104d099c37159fa2b7656b50ed889 Mon Sep 17 00:00:00 2001
From: Remi Cadene
Date: Tue, 16 Jul 2024 17:06:48 +0000
Subject: [PATCH] Add diffusion, vqbet, act bigg

---
 lerobot/configs/policy/act_koch_real.yaml     |   2 +-
 .../configs/policy/act_koch_real_bigger.yaml  | 102 ++++++++++++++++++
 .../configs/policy/diffusion_koch_real.yaml   | 100 ++++++++++++++++++
 lerobot/configs/policy/vqbet_koch_real.yaml   |  16 +--
 4 files changed, 211 insertions(+), 9 deletions(-)
 create mode 100644 lerobot/configs/policy/act_koch_real_bigger.yaml
 create mode 100644 lerobot/configs/policy/diffusion_koch_real.yaml

diff --git a/lerobot/configs/policy/act_koch_real.yaml b/lerobot/configs/policy/act_koch_real.yaml
index e72a6a3f..fd1cdd52 100644
--- a/lerobot/configs/policy/act_koch_real.yaml
+++ b/lerobot/configs/policy/act_koch_real.yaml
@@ -42,7 +42,7 @@ training:
   online_steps_between_rollouts: 1
 
   delta_timestamps:
-    action: "[i / ${fps} for i in range(1, ${policy.chunk_size}+1)]"
+    action: "[i / ${fps} + 1 / ${fps} for i in range(${policy.chunk_size})]"
 
 eval:
   n_episodes: 50
diff --git a/lerobot/configs/policy/act_koch_real_bigger.yaml b/lerobot/configs/policy/act_koch_real_bigger.yaml
new file mode 100644
index 00000000..16b07d76
--- /dev/null
+++ b/lerobot/configs/policy/act_koch_real_bigger.yaml
@@ -0,0 +1,102 @@
+# @package _global_
+
+# Use `act_koch_real_bigger.yaml` to train a bigger ACT variant on real-world datasets collected on Alexander Koch's robots.
+# Compared to `act.yaml`, it contains 2 cameras (i.e. laptop, phone) instead of 1 camera (i.e. top).
+# Also, `training.eval_freq` is set to -1. This parameter controls how frequently checkpoints are evaluated during training;
+# setting it to -1 disables evaluation, since real-world evaluation is done through our `control_robot.py` script instead.
+# See the documentation in the header of `control_robot.py` for more information on how to collect data, train, and evaluate a policy.
+#
+# Example of usage for training:
+# ```bash
+# python lerobot/scripts/train.py \
+#   policy=act_koch_real_bigger \
+#   env=koch_real
+# ```
+
+seed: 1000
+dataset_repo_id: lerobot/koch_pick_place_lego
+
+override_dataset_stats:
+  observation.images.laptop:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+  observation.images.phone:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+
+training:
+  offline_steps: 40000
+  online_steps: 0
+  eval_freq: -1
+  save_freq: 5000
+  log_freq: 10
+  save_checkpoint: true
+
+  batch_size: 64
+  lr: 1e-4
+  lr_backbone: 1e-4
+  weight_decay: 1e-4
+  grad_clip_norm: 10
+  online_steps_between_rollouts: 1
+
+  delta_timestamps:
+    action: "[i / ${fps} + 1 / ${fps} for i in range(${policy.chunk_size})]"
+
+eval:
+  n_episodes: 50
+  batch_size: 50
+
+# See `configuration_act.py` for more details.
+policy:
+  name: act
+
+  # Input / output structure.
+  n_obs_steps: 1
+  chunk_size: 100
+  n_action_steps: 100
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.images.laptop: [3, 480, 640]
+    observation.images.phone: [3, 480, 640]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.images.laptop: mean_std
+    observation.images.phone: mean_std
+    observation.state: mean_std
+  output_normalization_modes:
+    action: mean_std
+
+  # Architecture.
+  # Vision backbone.
+  vision_backbone: resnet18
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  replace_final_stride_with_dilation: false
+  # Transformer layers.
+  pre_norm: false
+  dim_model: 512
+  n_heads: 8
+  dim_feedforward: 3200
+  feedforward_activation: relu
+  n_encoder_layers: 4
+  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, a bug in that code means only
+  #       the first layer is used, which is why `act.yaml` sets this to 1. This bigger variant uses 8 decoder layers.
+  #       See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
+  n_decoder_layers: 8
+  # VAE.
+  use_vae: true
+  latent_dim: 32
+  n_vae_encoder_layers: 4
+
+  # Inference.
+  temporal_ensemble_momentum: null
+
+  # Training and loss computation.
+  dropout: 0.1
+  kl_weight: 10.0
diff --git a/lerobot/configs/policy/diffusion_koch_real.yaml b/lerobot/configs/policy/diffusion_koch_real.yaml
new file mode 100644
index 00000000..e0943f7c
--- /dev/null
+++ b/lerobot/configs/policy/diffusion_koch_real.yaml
@@ -0,0 +1,100 @@
+# @package _global_
+
+# Defaults for training Diffusion Policy on real-world datasets collected on Alexander Koch's robots, adapted from https://github.com/real-stanford/diffusion_policy.
+# Note: We do not track EMA model weights as we discovered it does not improve the results. See
+# https://github.com/huggingface/lerobot/pull/134 for more details.
+
+seed: 100000
+dataset_repo_id: lerobot/koch_pick_place_lego
+
+override_dataset_stats:
+  observation.images.laptop:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+  observation.images.phone:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+
+training:
+  offline_steps: 40000
+  online_steps: 0
+  eval_freq: -1
+  save_freq: 5000
+  log_freq: 100
+  save_checkpoint: true
+
+  batch_size: 64
+  grad_clip_norm: 10
+  lr: 1.0e-4
+  lr_scheduler: cosine
+  lr_warmup_steps: 500
+  adam_betas: [0.95, 0.999]
+  adam_eps: 1.0e-8
+  adam_weight_decay: 1.0e-6
+  online_steps_between_rollouts: 1
+
+  delta_timestamps:
+    observation.images.laptop: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
+    observation.images.phone: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
+    observation.state: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
+    action: "[i / ${fps} + 1 / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1 - ${policy.n_obs_steps} + ${policy.horizon})]"
+
+eval:
+  n_episodes: 50
+  batch_size: 50
+
+policy:
+  name: diffusion
+
+  # Input / output structure.
+  n_obs_steps: 1
+  horizon: 100
+  n_action_steps: 100
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.images.laptop: [3, 480, 640]
+    observation.images.phone: [3, 480, 640]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.images.laptop: mean_std
+    observation.images.phone: mean_std
+    observation.state: mean_std
+  output_normalization_modes:
+    action: mean_std
+
+  # Architecture / modeling.
+  # Vision backbone.
+  vision_backbone: resnet18
+  crop_shape: null
+  crop_is_random: False
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  use_group_norm: False
+  spatial_softmax_num_keypoints: 512
+  # Unet.
+  down_dims: [512, 1024, 2048]
+  kernel_size: 5
+  n_groups: 8
+  diffusion_step_embed_dim: 128
+  use_film_scale_modulation: True
+  # Noise scheduler.
+  noise_scheduler_type: DDPM
+  num_train_timesteps: 100
+  beta_schedule: squaredcos_cap_v2
+  beta_start: 0.0001
+  beta_end: 0.02
+  prediction_type: epsilon # epsilon / sample
+  clip_sample: True
+  clip_sample_range: 1.0
+
+  # Inference
+  num_inference_steps: 100
+
+  # Loss computation
+  do_mask_loss_for_padding: false
diff --git a/lerobot/configs/policy/vqbet_koch_real.yaml b/lerobot/configs/policy/vqbet_koch_real.yaml
index 42a9956a..739019f0 100644
--- a/lerobot/configs/policy/vqbet_koch_real.yaml
+++ b/lerobot/configs/policy/vqbet_koch_real.yaml
@@ -16,17 +16,17 @@ override_dataset_stats:
     std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
 
 training:
-  offline_steps: 80000
+  offline_steps: 40000
   online_steps: 0
   eval_freq: -1
-  save_freq: 10000
+  save_freq: 5000
   save_checkpoint: true
 
-  batch_size: 8
+  batch_size: 64
   grad_clip_norm: 10
   lr: 1.0e-4
   lr_scheduler: cosine
-  lr_warmup_steps: 2000
+  lr_warmup_steps: 500
   adam_betas: [0.95, 0.999]
   adam_eps: 1.0e-8
   adam_weight_decay: 1.0e-6
@@ -34,7 +34,7 @@ training:
 
   # VQ-BeT specific
   vqvae_lr: 1.0e-3
-  n_vqvae_training_steps: 20000
+  n_vqvae_training_steps: 500
   bet_weight_decay: 2e-4
   bet_learning_rate: 5.5e-5
   bet_betas: [0.9, 0.999]
@@ -43,7 +43,7 @@ training:
     observation.images.laptop: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
     observation.images.phone: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
     observation.state: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, 1)]"
-    action: "[i / ${fps} for i in range(1 - ${policy.n_obs_steps}, ${policy.n_action_pred_token} + ${policy.action_chunk_size} - 1)]"
+    action: "[i / ${fps} + 1 / ${fps} for i in range(1 - ${policy.n_obs_steps}, ${policy.n_action_pred_token} + ${policy.action_chunk_size} - 1)]"
 
 eval:
   n_episodes: 50
@@ -53,9 +53,9 @@ policy:
   name: vqbet
 
   # Input / output structure.
-  n_obs_steps: 5
+  n_obs_steps: 1
   n_action_pred_token: 7
-  action_chunk_size: 5
+  action_chunk_size: 100
 
   input_shapes:
     # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
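As a quick illustration of the `delta_timestamps` expressions touched across the three configs, the sketch below (standalone Python, not lerobot code; `fps = 30` is assumed from the `koch_real` env config, and the chunk/horizon values are the ones set in this patch) shows what the strings evaluate to once the Hydra interpolations are resolved: the ACT hunk rewrites the same offsets in the shared `+ 1 / ${fps}` form, while for diffusion and VQ-BeT the added term shifts every action timestamp one frame later.

```python
import math

fps = 30          # assumed: koch_real env fps
chunk_size = 100  # ${policy.chunk_size} in act_koch_real(_bigger).yaml
n_obs_steps = 1   # ${policy.n_obs_steps} in diffusion_koch_real.yaml
horizon = 100     # ${policy.horizon} in diffusion_koch_real.yaml

# ACT: old and new expressions both yield [1/fps, 2/fps, ..., chunk_size/fps].
act_old = [i / fps for i in range(1, chunk_size + 1)]
act_new = [i / fps + 1 / fps for i in range(chunk_size)]
print(all(math.isclose(a, b) for a, b in zip(act_old, act_new)))  # True

# Diffusion: the "+ 1 / ${fps}" term moves the first supervised action from t to t + 1/fps.
diffusion_old = [i / fps for i in range(1 - n_obs_steps, 1 - n_obs_steps + horizon)]
diffusion_new = [i / fps + 1 / fps for i in range(1 - n_obs_steps, 1 - n_obs_steps + horizon)]
print(diffusion_old[:3])  # roughly [0.0, 0.033, 0.067]
print(diffusion_new[:3])  # roughly [0.033, 0.067, 0.1]
```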