Add online training with TD-MPC as proof of concept (#338)

This commit is contained in:
Alexander Soare
2024-07-25 11:16:38 +01:00
committed by GitHub
parent abbb1d2367
commit f8a6574698
25 changed files with 1291 additions and 233 deletions

View File

@@ -32,19 +32,54 @@ video_backend: pyav
training:
offline_steps: ???
# NOTE: `online_steps` is not implemented yet. It's here as a placeholder.
online_steps: ???
online_steps_between_rollouts: ???
online_sampling_ratio: 0.5
# `online_env_seed` is used for environments for online training data rollouts.
online_env_seed: ???
# Number of workers for the offline training dataloader.
num_workers: 4
batch_size: ???
eval_freq: ???
log_freq: 200
save_checkpoint: true
# Checkpoint is saved every `save_freq` training iterations and after the last training step.
save_freq: ???
num_workers: 4
batch_size: ???
# Online training. Note that the online training loop adopts most of the options above apart from the
# dataloader options. Unless otherwise specified.
# The online training look looks something like:
#
# for i in range(online_steps):
# do_online_rollout_and_update_online_buffer()
# for j in range(online_steps_between_rollouts):
# batch = next(dataloader_with_offline_and_online_data)
# loss = policy(batch)
# loss.backward()
# optimizer.step()
#
online_steps: ???
# How many episodes to collect at once when we reach the online rollout part of the training loop.
online_rollout_n_episodes: 1
# The number of environments to use in the gym.vector.VectorEnv. This ends up also being the batch size for
# the policy. Ideally you should set this to by an even divisor or online_rollout_n_episodes.
online_rollout_batch_size: 1
# How many optimization steps (forward, backward, optimizer step) to do between running rollouts.
online_steps_between_rollouts: null
# The proportion of online samples (vs offline samples) to include in the online training batches.
online_sampling_ratio: 0.5
# First seed to use for the online rollout environment. Seeds for subsequent rollouts are incremented by 1.
online_env_seed: null
# Sets the maximum number of frames that are stored in the online buffer for online training. The buffer is
# FIFO.
online_buffer_capacity: null
# The minimum number of frames to have in the online buffer before commencing online training.
# If online_buffer_seed_size > online_rollout_n_episodes, the rollout will be run multiple times until the
# seed size condition is satisfied.
online_buffer_seed_size: 0
# Whether to run the online rollouts asynchronously. This means we can run the online training steps in
# parallel with the rollouts. This might be advised if your GPU has the bandwidth to handle training
# + eval + environment rendering simultaneously.
do_online_rollout_async: false
image_transforms:
# These transforms are all using standard torchvision.transforms.v2
# You can find out how these transformations affect images here:

View File

@@ -9,7 +9,7 @@ env:
state_dim: 4
action_dim: 4
fps: ${fps}
episode_length: 25
episode_length: 200
gym:
obs_type: pixels_agent_pos
render_mode: rgb_array

View File

@@ -4,19 +4,30 @@ seed: 1
dataset_repo_id: lerobot/xarm_lift_medium
training:
offline_steps: 25000
# TODO(alexander-soare): uncomment when online training gets reinstated
online_steps: 0 # 25000 not implemented yet
eval_freq: 5000
online_steps_between_rollouts: 1
online_sampling_ratio: 0.5
online_env_seed: 10000
log_freq: 100
offline_steps: 50000
num_workers: 4
batch_size: 256
grad_clip_norm: 10.0
lr: 3e-4
eval_freq: 5000
log_freq: 100
online_steps: 50000
online_rollout_n_episodes: 1
online_rollout_batch_size: 1
# Note: in FOWM `online_steps_between_rollouts` is actually dynamically set to match exactly the length of
# the last sampled episode.
online_steps_between_rollouts: 50
online_sampling_ratio: 0.5
online_env_seed: 10000
# FOWM Push uses 10000 for `online_buffer_capacity`. Given that their maximum episode length for this task
# is 25, 10000 is approx 400 of their episodes worth. Since our episodes are about 8 times longer, we'll use
# 80000.
online_buffer_capacity: 80000
delta_timestamps:
observation.image: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
@@ -31,6 +42,7 @@ policy:
# Input / output structure.
n_action_repeats: 2
horizon: 5
n_action_steps: 1
input_shapes:
# TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?

View File

@@ -0,0 +1,105 @@
# @package _global_
# Train with:
#
# python lerobot/scripts/train.py \
# env=pusht \
# env.gym.obs_type=environment_state_agent_pos \
# policy=tdmpc_pusht_keypoints \
# eval.batch_size=50 \
# eval.n_episodes=50 \
# eval.use_async_envs=true \
# device=cuda \
# use_amp=true
seed: 1
dataset_repo_id: lerobot/pusht_keypoints
training:
offline_steps: 0
# Offline training dataloader
num_workers: 4
batch_size: 256
grad_clip_norm: 10.0
lr: 3e-4
eval_freq: 10000
log_freq: 500
save_freq: 50000
online_steps: 1000000
online_rollout_n_episodes: 10
online_rollout_batch_size: 10
online_steps_between_rollouts: 1000
online_sampling_ratio: 1.0
online_env_seed: 10000
online_buffer_capacity: 40000
online_buffer_seed_size: 0
do_online_rollout_async: false
delta_timestamps:
observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
action: "[i / ${fps} for i in range(${policy.horizon})]"
next.reward: "[i / ${fps} for i in range(${policy.horizon})]"
policy:
name: tdmpc
pretrained_model_path:
# Input / output structure.
n_action_repeats: 1
horizon: 5
n_action_steps: 5
input_shapes:
# TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
observation.environment_state: [16]
observation.state: ["${env.state_dim}"]
output_shapes:
action: ["${env.action_dim}"]
# Normalization / Unnormalization
input_normalization_modes:
observation.environment_state: min_max
observation.state: min_max
output_normalization_modes:
action: min_max
# Architecture / modeling.
# Neural networks.
image_encoder_hidden_dim: 32
state_encoder_hidden_dim: 256
latent_dim: 50
q_ensemble_size: 5
mlp_dim: 512
# Reinforcement learning.
discount: 0.98
# Inference.
use_mpc: true
cem_iterations: 6
max_std: 2.0
min_std: 0.05
n_gaussian_samples: 512
n_pi_samples: 51
uncertainty_regularizer_coeff: 1.0
n_elites: 50
elite_weighting_temperature: 0.5
gaussian_mean_momentum: 0.1
# Training and loss computation.
max_random_shift_ratio: 0.0476
# Loss coefficients.
reward_coeff: 0.5
expectile_weight: 0.9
value_coeff: 0.1
consistency_coeff: 20.0
advantage_scaling: 3.0
pi_coeff: 0.5
temporal_decay_coeff: 0.5
# Target model.
target_model_momentum: 0.995