Add online training with TD-MPC as proof of concept (#338)

2024-07-25 11:16:38 +01:00
parent abbb1d2367
commit f8a6574698
25 changed files with 1291 additions and 233 deletions
--- a/lerobot/configs/default.yaml
+++ b/lerobot/configs/default.yaml
@@ -32,19 +32,54 @@ video_backend: pyav

 training:
  offline_steps: ???
-  # NOTE: `online_steps` is not implemented yet. It's here as a placeholder.
-  online_steps: ???
-  online_steps_between_rollouts: ???
-  online_sampling_ratio: 0.5
-  # `online_env_seed` is used for environments for online training data rollouts.
-  online_env_seed: ???
+
+  # Number of workers for the offline training dataloader.
+  num_workers: 4
+
+  batch_size: ???
+
  eval_freq: ???
  log_freq: 200
  save_checkpoint: true
  # Checkpoint is saved every `save_freq` training iterations and after the last training step.
  save_freq: ???
-  num_workers: 4
-  batch_size: ???
+
+  # Online training. Note that the online training loop adopts most of the options above, apart from the
+  # dataloader options, unless otherwise specified.
+  # The online training loop looks something like:
+  #
+  # for i in range(online_steps):
+  #     do_online_rollout_and_update_online_buffer()
+  #     for j in range(online_steps_between_rollouts):
+  #         batch = next(dataloader_with_offline_and_online_data)
+  #         loss = policy(batch)
+  #         loss.backward()
+  #         optimizer.step()
+  #
+  online_steps: ???
+  # How many episodes to collect at once when we reach the online rollout part of the training loop.
+  online_rollout_n_episodes: 1
+  # The number of environments to use in the gym.vector.VectorEnv. This ends up also being the batch size for
+  # the policy. Ideally you should set this to be an even divisor of online_rollout_n_episodes.
+  online_rollout_batch_size: 1
+  # How many optimization steps (forward, backward, optimizer step) to do between running rollouts.
+  online_steps_between_rollouts: null
+  # The proportion of online samples (vs offline samples) to include in the online training batches.
+  online_sampling_ratio: 0.5
+  # First seed to use for the online rollout environment. Seeds for subsequent rollouts are incremented by 1.
+  online_env_seed: null
+  # Sets the maximum number of frames that are stored in the online buffer for online training. The buffer is
+  # FIFO.
+  online_buffer_capacity: null
+  # The minimum number of frames to have in the online buffer before commencing online training.
+  # If one rollout of online_rollout_n_episodes episodes does not yield at least online_buffer_seed_size
+  # frames, the rollout will be run multiple times until the seed size condition is satisfied.
+  online_buffer_seed_size: 0
+  # Whether to run the online rollouts asynchronously. This means we can run the online training steps in
+  # parallel with the rollouts. This might be advised if your GPU has the bandwidth to handle training
+  # + eval + environment rendering simultaneously.
+  do_online_rollout_async: false
+
  image_transforms:
  # These transforms are all using standard torchvision.transforms.v2
  # You can find out how these transformations affect images here:
--- a/lerobot/configs/env/xarm.yaml
+++ b/lerobot/configs/env/xarm.yaml
@@ -9,7 +9,7 @@ env:
  state_dim: 4
  action_dim: 4
  fps: ${fps}
-  episode_length: 25
+  episode_length: 200
  gym:
    obs_type: pixels_agent_pos
    render_mode: rgb_array
--- a/lerobot/configs/policy/tdmpc.yaml
+++ b/lerobot/configs/policy/tdmpc.yaml
@@ -4,19 +4,30 @@ seed: 1
 dataset_repo_id: lerobot/xarm_lift_medium

 training:
-  offline_steps: 25000
-  # TODO(alexander-soare): uncomment when online training gets reinstated
-  online_steps: 0  # 25000 not implemented yet
-  eval_freq: 5000
-  online_steps_between_rollouts: 1
-  online_sampling_ratio: 0.5
-  online_env_seed: 10000
-  log_freq: 100
+  offline_steps: 50000
+
+  num_workers: 4

  batch_size: 256
  grad_clip_norm: 10.0
  lr: 3e-4

+  eval_freq: 5000
+  log_freq: 100
+
+  online_steps: 50000
+  online_rollout_n_episodes: 1
+  online_rollout_batch_size: 1
+  # Note: in FOWM `online_steps_between_rollouts` is actually dynamically set to match exactly the length of
+  # the last sampled episode.
+  online_steps_between_rollouts: 50
+  online_sampling_ratio: 0.5
+  online_env_seed: 10000
+  # FOWM Push uses 10000 for `online_buffer_capacity`. Given that their maximum episode length for this task
+  # is 25, 10000 is approximately 400 of their episodes' worth. Since our episodes are about 8 times longer,
+  # we'll use 80000.
+  online_buffer_capacity: 80000
+
  delta_timestamps:
    observation.image: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
    observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
@@ -31,6 +42,7 @@ policy:
  # Input / output structure.
  n_action_repeats: 2
  horizon: 5
+  n_action_steps: 1

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
--- a/lerobot/configs/policy/tdmpc_pusht_keypoints.yaml
+++ b/lerobot/configs/policy/tdmpc_pusht_keypoints.yaml
@@ -0,0 +1,105 @@
+# @package _global_
+
+# Train with:
+#
+# python lerobot/scripts/train.py \
+#   env=pusht \
+#   env.gym.obs_type=environment_state_agent_pos \
+#   policy=tdmpc_pusht_keypoints \
+#   eval.batch_size=50 \
+#   eval.n_episodes=50 \
+#   eval.use_async_envs=true \
+#   device=cuda \
+#   use_amp=true
+
+seed: 1
+dataset_repo_id: lerobot/pusht_keypoints
+
+training:
+  offline_steps: 0
+
+  # Offline training dataloader
+  num_workers: 4
+
+  batch_size: 256
+  grad_clip_norm: 10.0
+  lr: 3e-4
+
+  eval_freq: 10000
+  log_freq: 500
+  save_freq: 50000
+
+  online_steps: 1000000
+  online_rollout_n_episodes: 10
+  online_rollout_batch_size: 10
+  online_steps_between_rollouts: 1000
+  online_sampling_ratio: 1.0
+  online_env_seed: 10000
+  online_buffer_capacity: 40000
+  online_buffer_seed_size: 0
+  do_online_rollout_async: false
+
+  delta_timestamps:
+    observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
+    observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
+    action: "[i / ${fps} for i in range(${policy.horizon})]"
+    next.reward: "[i / ${fps} for i in range(${policy.horizon})]"
+
+policy:
+  name: tdmpc
+
+  pretrained_model_path:
+
+  # Input / output structure.
+  n_action_repeats: 1
+  horizon: 5
+  n_action_steps: 5
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.environment_state: [16]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.environment_state: min_max
+    observation.state: min_max
+  output_normalization_modes:
+    action: min_max
+
+  # Architecture / modeling.
+  # Neural networks.
+  image_encoder_hidden_dim: 32
+  state_encoder_hidden_dim: 256
+  latent_dim: 50
+  q_ensemble_size: 5
+  mlp_dim: 512
+  # Reinforcement learning.
+  discount: 0.98
+
+  # Inference.
+  use_mpc: true
+  cem_iterations: 6
+  max_std: 2.0
+  min_std: 0.05
+  n_gaussian_samples: 512
+  n_pi_samples: 51
+  uncertainty_regularizer_coeff: 1.0
+  n_elites: 50
+  elite_weighting_temperature: 0.5
+  gaussian_mean_momentum: 0.1
+
+  # Training and loss computation.
+  max_random_shift_ratio: 0.0476
+  # Loss coefficients.
+  reward_coeff: 0.5
+  expectile_weight: 0.9
+  value_coeff: 0.1
+  consistency_coeff: 20.0
+  advantage_scaling: 3.0
+  pi_coeff: 0.5
+  temporal_decay_coeff: 0.5
+  # Target model.
+  target_model_momentum: 0.995