From 5495d55cc7df7aeb6606ff4720cffd9f105fb761 Mon Sep 17 00:00:00 2001
From: Remi Cadene
Date: Thu, 30 May 2024 12:06:57 +0000
Subject: [PATCH] Add aloha2_real, Add act_real, Fix vae=false, Add support for no state

---
 .../common/policies/act/configuration_act.py  |  7 +++---
 lerobot/common/policies/act/modeling_act.py   | 25 +++++++++----------
 .../diffusion/configuration_diffusion.py      |  5 +++-
 lerobot/configs/env/aloha2_real.yaml          | 13 ++++++++++
 lerobot/configs/policy/act_real.yaml          | 18 ++-----------
 5 files changed, 35 insertions(+), 33 deletions(-)
 create mode 100644 lerobot/configs/env/aloha2_real.yaml

diff --git a/lerobot/common/policies/act/configuration_act.py b/lerobot/common/policies/act/configuration_act.py
index a4b0b7d21..82bc6d8e6 100644
--- a/lerobot/common/policies/act/configuration_act.py
+++ b/lerobot/common/policies/act/configuration_act.py
@@ -26,10 +26,11 @@ class ACTConfig:
     Those are: `input_shapes` and 'output_shapes`.
 
     Notes on the inputs and outputs:
+        - "observation.state" is required as an input key.
         - At least one key starting with "observation.image is required as an input.
-        - If there are multiple keys beginning with "observation.images." they are treated as multiple camera
-          views. Right now we only support all images having the same shape.
-        - May optionally work without an "observation.state" key for the proprioceptive robot state.
+        - If there are multiple keys beginning with "observation.image" they are treated as multiple camera
+          views.
+          Right now we only support all images having the same shape.
         - "action" is required as an output key.
 
     Args:
diff --git a/lerobot/common/policies/act/modeling_act.py b/lerobot/common/policies/act/modeling_act.py
index bef59becb..81e1c4d3a 100644
--- a/lerobot/common/policies/act/modeling_act.py
+++ b/lerobot/common/policies/act/modeling_act.py
@@ -200,12 +200,13 @@ class ACT(nn.Module):
         self.config = config
         # BERT style VAE encoder with input tokens [cls, robot_state, *action_sequence].
         # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]).
-        self.use_input_state = "observation.state" in config.input_shapes
+        self.has_state = "observation.state" in config.input_shapes
+        self.latent_dim = config.latent_dim
         if self.config.use_vae:
             self.vae_encoder = ACTEncoder(config)
             self.vae_encoder_cls_embed = nn.Embedding(1, config.dim_model)
             # Projection layer for joint-space configuration to hidden dimension.
-            if self.use_input_state:
+            if self.has_state:
                 self.vae_encoder_robot_state_input_proj = nn.Linear(
                     config.input_shapes["observation.state"][0], config.dim_model
                 )
@@ -217,9 +218,7 @@ class ACT(nn.Module):
             self.vae_encoder_latent_output_proj = nn.Linear(config.dim_model, config.latent_dim * 2)
             # Fixed sinusoidal positional embedding for the input to the VAE encoder. Unsqueeze for batch
             # dimension.
-            num_input_token_encoder = 1 + config.chunk_size
-            if self.use_input_state:
-                num_input_token_encoder += 1
+            num_input_token_encoder = 1 + 1 + config.chunk_size if self.has_state else 1 + config.chunk_size
             self.register_buffer(
                 "vae_encoder_pos_enc",
                 create_sinusoidal_pos_embedding(num_input_token_encoder, config.dim_model).unsqueeze(0),
@@ -242,16 +241,16 @@ class ACT(nn.Module):
 
         # Transformer encoder input projections. The tokens will be structured like
        # [latent, robot_state, image_feature_map_pixels].
-        if self.use_input_state:
+        if self.has_state:
             self.encoder_robot_state_input_proj = nn.Linear(
                 config.input_shapes["observation.state"][0], config.dim_model
             )
-        self.encoder_latent_input_proj = nn.Linear(config.latent_dim, config.dim_model)
+        self.encoder_latent_input_proj = nn.Linear(self.latent_dim, config.dim_model)
         self.encoder_img_feat_input_proj = nn.Conv2d(
             backbone_model.fc.in_features, config.dim_model, kernel_size=1
         )
         # Transformer encoder positional embeddings.
-        num_input_token_decoder = 2 if self.use_input_state else 1
+        num_input_token_decoder = 2 if self.has_state else 1
         self.encoder_robot_and_latent_pos_embed = nn.Embedding(num_input_token_decoder, config.dim_model)
         self.encoder_cam_feat_pos_embed = ACTSinusoidalPositionEmbedding2d(config.dim_model // 2)
 
@@ -299,12 +298,12 @@ class ACT(nn.Module):
             cls_embed = einops.repeat(
                 self.vae_encoder_cls_embed.weight, "1 d -> b 1 d", b=batch_size
             )  # (B, 1, D)
-            if self.use_input_state:
+            if self.has_state:
                 robot_state_embed = self.vae_encoder_robot_state_input_proj(batch["observation.state"])
                 robot_state_embed = robot_state_embed.unsqueeze(1)  # (B, 1, D)
             action_embed = self.vae_encoder_action_input_proj(batch["action"])  # (B, S, D)
 
-            if self.use_input_state:
+            if self.has_state:
                 vae_encoder_input = [cls_embed, robot_state_embed, action_embed]  # (B, S+2, D)
             else:
                 vae_encoder_input = [cls_embed, action_embed]
@@ -329,7 +328,7 @@ class ACT(nn.Module):
             # When not using the VAE encoder, we set the latent to be all zeros.
             mu = log_sigma_x2 = None
             # TODO(rcadene, alexander-soare): remove call to `.to` to speedup forward ; precompute and use buffer
-            latent_sample = torch.zeros([batch_size, self.config.latent_dim], dtype=torch.float32).to(
+            latent_sample = torch.zeros([batch_size, self.latent_dim], dtype=torch.float32).to(
                 batch["observation.state"].device
             )
 
@@ -351,12 +350,12 @@ class ACT(nn.Module):
         cam_pos_embed = torch.cat(all_cam_pos_embeds, axis=-1)
 
         # Get positional embeddings for robot state and latent.
-        if self.use_input_state:
+        if self.has_state:
             robot_state_embed = self.encoder_robot_state_input_proj(batch["observation.state"])  # (B, C)
         latent_embed = self.encoder_latent_input_proj(latent_sample)  # (B, C)
 
         # Stack encoder input and positional embeddings moving to (S, B, C).
-        encoder_in_feats = [latent_embed, robot_state_embed] if self.use_input_state else [latent_embed]
+        encoder_in_feats = [latent_embed, robot_state_embed] if self.has_state else [latent_embed]
         encoder_in = torch.cat(
             [
                 torch.stack(encoder_in_feats, axis=0),
diff --git a/lerobot/common/policies/diffusion/configuration_diffusion.py b/lerobot/common/policies/diffusion/configuration_diffusion.py
index 59ed16567..48783d897 100644
--- a/lerobot/common/policies/diffusion/configuration_diffusion.py
+++ b/lerobot/common/policies/diffusion/configuration_diffusion.py
@@ -28,7 +28,10 @@ class DiffusionConfig:
 
     Notes on the inputs and outputs:
         - "observation.state" is required as an input key.
-        - A key starting with "observation.image is required as an input.
+        - At least one key starting with "observation.image is required as an input.
+        - If there are multiple keys beginning with "observation.image" they are treated as multiple camera
+          views.
+          Right now we only support all images having the same shape.
         - "action" is required as an output key.
 
     Args:
diff --git a/lerobot/configs/env/aloha2_real.yaml b/lerobot/configs/env/aloha2_real.yaml
new file mode 100644
index 000000000..3053fc01b
--- /dev/null
+++ b/lerobot/configs/env/aloha2_real.yaml
@@ -0,0 +1,13 @@
+# @package _global_
+
+fps: 30
+
+env:
+  name: dora
+  task: DoraAloha2-v0
+  state_dim: 14
+  action_dim: 14
+  fps: ${fps}
+  episode_length: 400
+  gym:
+    fps: ${fps}
diff --git a/lerobot/configs/policy/act_real.yaml b/lerobot/configs/policy/act_real.yaml
index b49426152..b786160ea 100644
--- a/lerobot/configs/policy/act_real.yaml
+++ b/lerobot/configs/policy/act_real.yaml
@@ -1,21 +1,7 @@
 # @package _global_
 
-# Use `act_real.yaml` to train on real-world Aloha/Aloha2 datasets.
-# Compared to `act.yaml`, it contains 4 cameras (i.e. cam_right_wrist, cam_left_wrist, images,
-# cam_low) instead of 1 camera (i.e. top). Also, `training.eval_freq` is set to -1. This config is used
-# to evaluate checkpoints at a certain frequency of training steps. When it is set to -1, it deactivates evaluation.
-# This is because real-world evaluation is done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot).
-# Look at its README for more information on how to evaluate a checkpoint in the real-world.
-#
-# Example of usage for training:
-# ```bash
-# python lerobot/scripts/train.py \
-#   policy=act_real \
-#   env=dora_aloha_real
-# ```
-
 seed: 1000
-dataset_repo_id: lerobot/aloha_static_vinh_cup
+dataset_repo_id: cadene/aloha_v2_static_dora_test
 
 override_dataset_stats:
   observation.images.cam_right_wrist:
@@ -41,7 +27,7 @@ training:
   eval_freq: -1
   save_freq: 10000
   log_freq: 100
-  save_checkpoint: true
+  save_model: true
   batch_size: 8
   lr: 1e-5
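
A minimal, self-contained sketch of the token bookkeeping the `has_state` flag controls in modeling_act.py above: how many tokens the VAE encoder and the transformer encoder receive with and without "observation.state" in `input_shapes`. `ToyConfig`, `token_counts`, and the printed values are illustrative assumptions, not code from this patch.

```python
# Illustrative sketch only: mirrors the `has_state` logic from modeling_act.py above.
from dataclasses import dataclass, field


@dataclass
class ToyConfig:
    # Hypothetical stand-in for ACTConfig; only the fields used below.
    chunk_size: int = 100
    latent_dim: int = 32
    input_shapes: dict = field(default_factory=lambda: {"observation.images.top": [3, 480, 640]})


def token_counts(config: ToyConfig) -> tuple[int, int]:
    has_state = "observation.state" in config.input_shapes
    # VAE encoder input tokens: [cls, (robot_state), *action_sequence]
    num_input_token_encoder = 1 + 1 + config.chunk_size if has_state else 1 + config.chunk_size
    # Non-image transformer encoder tokens: [latent, (robot_state)]
    num_input_token_decoder = 2 if has_state else 1
    return num_input_token_encoder, num_input_token_decoder


print(token_counts(ToyConfig()))  # (101, 1): no proprioceptive state key
print(token_counts(ToyConfig(input_shapes={
    "observation.state": [14],
    "observation.images.top": [3, 480, 640],
})))  # (102, 2): with a 14-dim state, matching state_dim: 14 in aloha2_real.yaml
```

When `use_vae: false`, the latent fed to the encoder is simply zeros of size `latent_dim`, so dropping the state key only changes these token counts, not the latent handling.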