# @package _global_ # Use `act_real_no_state.yaml` to train on real-world Aloha/Aloha2 datasets when cameras are moving (e.g. wrist cameras) # Compared to `act_real.yaml`, it is camera only and does not use the state as input which is vector of robot joint positions. # We validated experimentaly that not using state reaches better success rate. Our hypothesis is that `act_real.yaml` might # overfits to the state, because the images are more complex to learn from since they are moving. # # Example of usage for training: # ```bash # python lerobot/scripts/train.py \ # policy=act_real_no_state \ # env=dora_aloha_real # ``` seed: 1000 dataset_repo_id: lerobot/aloha_static_vinh_cup override_dataset_stats: observation.images.cam_right_wrist: # stats from imagenet, since we use a pretrained vision model mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1) std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1) observation.images.cam_left_wrist: # stats from imagenet, since we use a pretrained vision model mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1) std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1) observation.images.cam_high: # stats from imagenet, since we use a pretrained vision model mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1) std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1) observation.images.cam_low: # stats from imagenet, since we use a pretrained vision model mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1) std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1) training: offline_steps: 100000 online_steps: 0 eval_freq: -1 save_freq: 20000 save_checkpoint: true batch_size: 8 lr: 1e-5 lr_backbone: 1e-5 weight_decay: 1e-4 grad_clip_norm: 10 online_steps_between_rollouts: 1 delta_timestamps: action: "[i / ${fps} for i in range(${policy.chunk_size})]" eval: n_episodes: 50 batch_size: 50 # See `configuration_act.py` for more details. policy: name: act # Input / output structure. n_obs_steps: 1 chunk_size: 100 # chunk_size n_action_steps: 100 input_shapes: # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env? observation.images.cam_right_wrist: [3, 480, 640] observation.images.cam_left_wrist: [3, 480, 640] observation.images.cam_high: [3, 480, 640] observation.images.cam_low: [3, 480, 640] output_shapes: action: ["${env.action_dim}"] # Normalization / Unnormalization input_normalization_modes: observation.images.cam_right_wrist: mean_std observation.images.cam_left_wrist: mean_std observation.images.cam_high: mean_std observation.images.cam_low: mean_std output_normalization_modes: action: mean_std # Architecture. # Vision backbone. vision_backbone: resnet18 pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1 replace_final_stride_with_dilation: false # Transformer layers. pre_norm: false dim_model: 512 n_heads: 8 dim_feedforward: 3200 feedforward_activation: relu n_encoder_layers: 4 # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code # that means only the first layer is used. Here we match the original implementation by setting this to 1. # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521. n_decoder_layers: 1 # VAE. use_vae: true latent_dim: 32 n_vae_encoder_layers: 4 # Inference. temporal_ensemble_coeff: null # Training and loss computation. dropout: 0.1 kl_weight: 10.0