forked from tangger/lerobot
Add reachy2 dataset, policy, env
lerobot/common/datasets/push_dataset_to_hub/reachy2_hdf5_format.py (new file, 189 lines)
@@ -0,0 +1,189 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Contains utilities to process the raw HDF5 data format of Reachy2 episodes, similar to https://github.com/tonyzhaozh/act.
"""

import gc
import re
import shutil
from pathlib import Path

import h5py
import torch
import tqdm
from datasets import Dataset, Features, Image, Sequence, Value

from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
from lerobot.common.datasets.utils import (
    hf_transform_to_torch,
)
from lerobot.common.datasets.video_utils import VideoFrame

def get_cameras(hdf5_data):
    # ignore depth channel, not currently handled
    # TODO(rcadene): add depth
    rgb_cameras = [key for key in hdf5_data["/observations/images_ids"].keys() if "depth" not in key]  # noqa: SIM118
    return rgb_cameras

def check_format(raw_dir) -> bool:
    hdf5_paths = list(raw_dir.glob("episode_*.hdf5"))
    assert len(hdf5_paths) != 0
    for hdf5_path in hdf5_paths:
        with h5py.File(hdf5_path, "r") as data:
            assert "/action" in data
            assert "/observations/qpos" in data

            assert data["/action"].ndim == 2
            assert data["/observations/qpos"].ndim == 2

            num_frames = data["/action"].shape[0]
            assert num_frames == data["/observations/qpos"].shape[0]

            for camera in get_cameras(data):
                assert num_frames == data[f"/observations/images_ids/{camera}"].shape[0]
                assert (raw_dir / hdf5_path.name.replace(".hdf5", f"_{camera}.mp4")).exists()

                # Frames are stored as side-car mp4 files rather than in the HDF5 file,
                # so the in-file image shape checks do not apply here:
                # assert data[f"/observations/images_ids/{camera}"].ndim == 4
                # b, h, w, c = data[f"/observations/images_ids/{camera}"].shape
                # assert c < h and c < w, f"Expect (h,w,c) image format but ({h=},{w=},{c=}) provided."

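For reference, a minimal sketch of the raw episode layout these checks expect. The dataset keys and the per-camera mp4 side file come from the assertions above; the frame count, dimensions, and camera name are illustrative assumptions only:

```python
import h5py
import numpy as np

# Hypothetical episode with 50 frames and a 16-dim state/action space.
with h5py.File("raw/episode_0.hdf5", "w") as f:
    f.create_dataset("/action", data=np.zeros((50, 16), dtype=np.float32))
    f.create_dataset("/observations/qpos", data=np.zeros((50, 16), dtype=np.float32))
    # One frame id per video frame; the pixels live in raw/episode_0_cam_trunk.mp4.
    f.create_dataset("/observations/images_ids/cam_trunk", data=np.arange(50))
```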
def load_from_raw(raw_dir, out_dir, fps, video, debug):
    hdf5_files = list(raw_dir.glob("*.hdf5"))
    ep_dicts = []
    episode_data_index = {"from": [], "to": []}

    id_from = 0
    for ep_idx, ep_path in tqdm.tqdm(enumerate(hdf5_files), total=len(hdf5_files)):
        match = re.search(r"_(\d+).hdf5", ep_path.name)
        if not match:
            raise ValueError(ep_path.name)
        raw_ep_idx = int(match.group(1))

        with h5py.File(ep_path, "r") as ep:
            num_frames = ep["/action"].shape[0]

            # last step of demonstration is considered done
            done = torch.zeros(num_frames, dtype=torch.bool)
            done[-1] = True

            state = torch.from_numpy(ep["/observations/qpos"][:])
            action = torch.from_numpy(ep["/action"][:])
            if "/observations/qvel" in ep:
                velocity = torch.from_numpy(ep["/observations/qvel"][:])
            if "/observations/effort" in ep:
                effort = torch.from_numpy(ep["/observations/effort"][:])

            ep_dict = {}

            videos_dir = out_dir / "videos"
            videos_dir.mkdir(parents=True, exist_ok=True)

            for camera in get_cameras(ep):
                img_key = f"observation.images.{camera}"

                raw_fname = f"episode_{raw_ep_idx}_{camera}.mp4"
                new_fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
                shutil.copy(str(raw_dir / raw_fname), str(videos_dir / new_fname))

                # store the reference to the video frame
                ep_dict[img_key] = [
                    {"path": f"videos/{new_fname}", "timestamp": i / fps} for i in range(num_frames)
                ]

            ep_dict["observation.state"] = state
            # the key checked here must match the `/observations/qvel` key read above
            if "/observations/qvel" in ep:
                ep_dict["observation.velocity"] = velocity
            if "/observations/effort" in ep:
                ep_dict["observation.effort"] = effort
            ep_dict["action"] = action
            ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
            ep_dict["next.done"] = done
            # TODO(rcadene): add reward and success by computing them in sim

            assert isinstance(ep_idx, int)
            ep_dicts.append(ep_dict)

            episode_data_index["from"].append(id_from)
            episode_data_index["to"].append(id_from + num_frames)

            id_from += num_frames

        gc.collect()

        # process first episode only
        if debug:
            break

    data_dict = concatenate_episodes(ep_dicts)
    return data_dict, episode_data_index

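`episode_data_index` records, per episode, the half-open `[from, to)` range of frames in the concatenated data. A small sketch (the helper name is ours, not part of the commit) of slicing one episode back out:

```python
def get_episode_frames(data_dict, episode_data_index, ep_idx):
    # Frames of episode `ep_idx` occupy the half-open range [from, to) in the
    # concatenated tensors/lists returned by load_from_raw.
    start = episode_data_index["from"][ep_idx]
    end = episode_data_index["to"][ep_idx]
    return {key: value[start:end] for key, value in data_dict.items()}
```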
def to_hf_dataset(data_dict, video) -> Dataset:
    features = {}

    keys = [key for key in data_dict if "observation.images." in key]
    for key in keys:
        if video:
            features[key] = VideoFrame()
        else:
            features[key] = Image()

    features["observation.state"] = Sequence(
        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
    )
    if "observation.velocity" in data_dict:
        features["observation.velocity"] = Sequence(
            length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None)
        )
    if "observation.effort" in data_dict:
        features["observation.effort"] = Sequence(
            length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None)
        )
    features["action"] = Sequence(
        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
    )
    features["episode_index"] = Value(dtype="int64", id=None)
    features["frame_index"] = Value(dtype="int64", id=None)
    features["timestamp"] = Value(dtype="float32", id=None)
    features["next.done"] = Value(dtype="bool", id=None)
    features["index"] = Value(dtype="int64", id=None)

    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
    hf_dataset.set_transform(hf_transform_to_torch)
    return hf_dataset

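Because `hf_transform_to_torch` is set as the transform, indexing the resulting dataset yields torch tensors. A hedged sketch of inspecting the first frame, assuming `hf_dataset` was built by `to_hf_dataset` above:

```python
# Each item is a dict keyed by the feature names defined above.
frame = hf_dataset[0]
print(frame["observation.state"].shape)  # torch.Size([state_dim])
print(frame["timestamp"], frame["next.done"])
```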
def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
    # sanity check
    check_format(raw_dir)

    if fps is None:
        fps = 30

    data_dict, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
    hf_dataset = to_hf_dataset(data_dict, video)

    info = {
        "fps": fps,
        "video": video,
    }
    return hf_dataset, episode_data_index, info
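A minimal usage sketch of the conversion entry point; the directory paths are placeholders:

```python
from pathlib import Path

from lerobot.common.datasets.push_dataset_to_hub.reachy2_hdf5_format import from_raw_to_lerobot_format

hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(
    raw_dir=Path("data/reachy2_raw"),      # directory with episode_*.hdf5 and *_<camera>.mp4 files
    out_dir=Path("data/reachy2_lerobot"),  # videos are copied into out_dir / "videos"
)
```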
lerobot/configs/env/dora_reachy2_real.yaml (vendored, new file, 13 lines)
@@ -0,0 +1,13 @@
# @package _global_

fps: 30

env:
  name: dora
  task: DoraReachy2-v0
  state_dim: 16
  action_dim: 16
  fps: ${fps}
  episode_length: 400
  gym:
    fps: ${fps}
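`${fps}` is an OmegaConf interpolation resolved against the top-level `fps` key, so both `env.fps` and `env.gym.fps` evaluate to 30. A quick check, assuming OmegaConf (which backs lerobot's Hydra configs) is installed:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("lerobot/configs/env/dora_reachy2_real.yaml")
OmegaConf.resolve(cfg)  # replaces ${fps} interpolations with their values
assert cfg.env.fps == 30 and cfg.env.gym.fps == 30
```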
lerobot/configs/policy/act_reachy2_real.yaml (new file, 97 lines)
@@ -0,0 +1,97 @@
# @package _global_

# Use `act_reachy2_real.yaml` to train ACT on a real-world Reachy2 dataset.
# Compared to `act.yaml`, it uses a single trunk camera (cam_trunk) as image input.
# `training.eval_freq` controls how often (in training steps) checkpoints are evaluated;
# it is set to -1 here, which deactivates evaluation, because real-world evaluation is
# done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot). Look at its README
# for more information on how to evaluate a checkpoint in the real world.
#
# Example of usage for training:
# ```bash
# python lerobot/scripts/train.py \
#   policy=act_reachy2_real \
#   env=dora_reachy2_real
# ```

seed: 1000
dataset_repo_id: cadene/reachy2_teleop_remi

override_dataset_stats:
  observation.images.cam_trunk:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
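These are the standard ImageNet channel statistics; the `(c,1,1)` shape lets them broadcast over `(c,h,w)` images. A sketch of the normalization they imply:

```python
import torch

imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

def normalize(image: torch.Tensor) -> torch.Tensor:
    # image: float tensor in [0, 1] with shape (3, h, w), e.g. (3, 800, 1280)
    return (image - imagenet_mean) / imagenet_std
```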
training:
  offline_steps: 80000
  online_steps: 0
  eval_freq: -1
  save_freq: 10000
  log_freq: 100
  save_checkpoint: true

  batch_size: 8
  lr: 1e-5
  lr_backbone: 1e-5
  weight_decay: 1e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  delta_timestamps:
    action: "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]"
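The `delta_timestamps` string is evaluated after Hydra interpolation; with `fps: 30` and `chunk_size: 100` it expands to the 100 future action timestamps sampled for each observation:

```python
fps = 30
chunk_size = 100

# Matches "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]" after interpolation.
action_delta_timestamps = [i / fps for i in range(1, chunk_size + 1)]
assert action_delta_timestamps[0] == 1 / 30     # one frame ahead
assert action_delta_timestamps[-1] == 100 / 30  # ~3.3 s ahead
```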
eval:
  n_episodes: 50
  batch_size: 50

# See `configuration_act.py` for more details.
policy:
  name: act

  # Input / output structure.
  n_obs_steps: 1
  chunk_size: 100
  n_action_steps: 100

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.images.cam_trunk: [3, 800, 1280]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  input_normalization_modes:
    observation.images.cam_trunk: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # Architecture.
  # Vision backbone.
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  # Transformer layers.
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
  n_decoder_layers: 1
  # VAE.
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4

  # Inference.
  temporal_ensemble_momentum: null

  # Training and loss computation.
  dropout: 0.1
  kl_weight: 10.0
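With `n_action_steps` equal to `chunk_size` and `temporal_ensemble_momentum: null`, the policy runs open-loop over each 100-step action chunk. Some back-of-the-envelope numbers for this config:

```python
# With chunk_size == n_action_steps == 100 at fps 30, the policy is queried
# once per executed chunk, i.e. roughly every 3.3 seconds of control.
fps = 30
n_action_steps = 100
episode_length = 400  # from dora_reachy2_real.yaml

queries_per_episode = -(-episode_length // n_action_steps)  # ceil division -> 4
seconds_between_queries = n_action_steps / fps              # ~3.33 s
print(queries_per_episode, seconds_between_queries)
```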
@@ -86,6 +86,8 @@ def get_from_raw_to_lerobot_format_fn(raw_format):
         from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import from_raw_to_lerobot_format
     elif raw_format == "aloha_dora":
         from lerobot.common.datasets.push_dataset_to_hub.aloha_dora_format import from_raw_to_lerobot_format
+    elif raw_format == "reachy2_hdf5":
+        from lerobot.common.datasets.push_dataset_to_hub.reachy2_hdf5_format import from_raw_to_lerobot_format
     elif raw_format == "xarm_pkl":
         from lerobot.common.datasets.push_dataset_to_hub.xarm_pkl_format import from_raw_to_lerobot_format
     else:
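Assuming `get_from_raw_to_lerobot_format_fn` returns the imported converter (as the surrounding branches suggest), the new format is selected like so:

```python
# "reachy2_hdf5" now dispatches to the converter added in this commit.
from_raw_to_lerobot_format = get_from_raw_to_lerobot_format_fn("reachy2_hdf5")
```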