# @package _global_

# Train with:
#
# python lerobot/scripts/train.py \
#     env=pusht \
#     env.gym.obs_type=environment_state_agent_pos \
#     policy=tdmpc_pusht_keypoints \
#     eval.batch_size=50 \
#     eval.n_episodes=50 \
#     eval.use_async_envs=true \
#     device=cuda \
#     use_amp=true

seed: 1
dataset_repo_id: lerobot/pusht_keypoints

training:
  offline_steps: 0

  # Offline training dataloader
  num_workers: 4

  batch_size: 256
  grad_clip_norm: 10.0
  lr: 3e-4

  eval_freq: 10000
  log_freq: 500
  save_freq: 50000

  online_steps: 1000000
  online_rollout_n_episodes: 10
  online_rollout_batch_size: 10
  online_steps_between_rollouts: 1000
  online_sampling_ratio: 1.0
  online_env_seed: 10000
  online_buffer_capacity: 40000
  online_buffer_seed_size: 0
  do_online_rollout_async: false

  delta_timestamps:
    observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
    observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
    action: "[i / ${fps} for i in range(${policy.horizon})]"
    next.reward: "[i / ${fps} for i in range(${policy.horizon})]"

policy:
  name: tdmpc

  pretrained_model_path:

  # Input / output structure.
  n_action_repeats: 1
  horizon: 5
  n_action_steps: 5

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.environment_state: [16]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  input_normalization_modes:
    observation.environment_state: min_max
    observation.state: min_max
  output_normalization_modes:
    action: min_max

  # Architecture / modeling.
  # Neural networks.
  image_encoder_hidden_dim: 32
  state_encoder_hidden_dim: 256
  latent_dim: 50
  q_ensemble_size: 5
  mlp_dim: 512
  # Reinforcement learning.
  discount: 0.98

  # Inference.
  use_mpc: true
  cem_iterations: 6
  max_std: 2.0
  min_std: 0.05
  n_gaussian_samples: 512
  n_pi_samples: 51
  uncertainty_regularizer_coeff: 1.0
  n_elites: 50
  elite_weighting_temperature: 0.5
  gaussian_mean_momentum: 0.1

  # Training and loss computation.
  max_random_shift_ratio: 0.0476
  # Loss coefficients.
  reward_coeff: 0.5
  expectile_weight: 0.9
  value_coeff: 0.1
  consistency_coeff: 20.0
  advantage_scaling: 3.0
  pi_coeff: 0.5
  temporal_decay_coeff: 0.5
  # Target model.
  target_model_momentum: 0.995
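
# A worked example of how `delta_timestamps` in the `training` section above
# expands once Hydra resolves the interpolations (a sketch, assuming the
# pusht env's `fps: 10` together with the `policy.horizon: 5` set in this
# file):
#
#   observation.environment_state: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]  # horizon + 1 = 6 frames
#   observation.state:             [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
#   action:                        [0.0, 0.1, 0.2, 0.3, 0.4]       # horizon = 5 steps
#   next.reward:                   [0.0, 0.1, 0.2, 0.3, 0.4]
#
# i.e. each training sample bundles the current observation plus the next
# `horizon` observations, along with the `horizon` actions and rewards in
# between them.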