diff --git a/lerobot/common/datasets/push_dataset_to_hub/reachy2_hdf5_format.py b/lerobot/common/datasets/push_dataset_to_hub/reachy2_hdf5_format.py
new file mode 100644
index 000000000..14c834034
--- /dev/null
+++ b/lerobot/common/datasets/push_dataset_to_hub/reachy2_hdf5_format.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python

+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contains utilities to process the raw Reachy2 HDF5 format: ACT-style HDF5 files (https://github.com/tonyzhaozh/act) where camera frames are stored in per-episode MP4 files referenced through "/observations/images_ids".
+"""

+import gc
+import re
+import shutil
+from pathlib import Path

+import h5py
+import torch
+import tqdm
+from datasets import Dataset, Features, Image, Sequence, Value

+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+from lerobot.common.datasets.video_utils import VideoFrame


+def get_cameras(hdf5_data):
+    # ignore depth channel, not currently handled
+    # TODO(rcadene): add depth
+    rgb_cameras = [key for key in hdf5_data["/observations/images_ids"].keys() if "depth" not in key]  # noqa: SIM118
+    return rgb_cameras


+def check_format(raw_dir) -> bool:
+    hdf5_paths = list(raw_dir.glob("episode_*.hdf5"))
+    assert len(hdf5_paths) != 0
+    for hdf5_path in hdf5_paths:
+        with h5py.File(hdf5_path, "r") as data:
+            assert "/action" in data
+            assert "/observations/qpos" in data

+            assert data["/action"].ndim == 2
+            assert data["/observations/qpos"].ndim == 2

+            num_frames = data["/action"].shape[0]
+            assert num_frames == data["/observations/qpos"].shape[0]

+            for camera in get_cameras(data):
+                assert num_frames == data[f"/observations/images_ids/{camera}"].shape[0]
+                assert (raw_dir / hdf5_path.name.replace(".hdf5", f"_{camera}.mp4")).exists()

+                # assert data[f"/observations/images_ids/{camera}"].ndim == 4
+                # b, h, w, c = data[f"/observations/images_ids/{camera}"].shape
+                # assert c < h and c < w, f"Expect (h,w,c) image format but ({h=},{w=},{c=}) provided."
+

+def load_from_raw(raw_dir, out_dir, fps, video, debug):
+    hdf5_files = list(raw_dir.glob("*.hdf5"))
+    ep_dicts = []
+    episode_data_index = {"from": [], "to": []}

+    id_from = 0
+    for ep_idx, ep_path in tqdm.tqdm(enumerate(hdf5_files), total=len(hdf5_files)):
+        match = re.search(r"_(\d+).hdf5", ep_path.name)
+        if not match:
+            raise ValueError(ep_path.name)
+        raw_ep_idx = int(match.group(1))

+        with h5py.File(ep_path, "r") as ep:
+            num_frames = ep["/action"].shape[0]

+            # last step of demonstration is considered done
+            done = torch.zeros(num_frames, dtype=torch.bool)
+            done[-1] = True

+            state = torch.from_numpy(ep["/observations/qpos"][:])
+            action = torch.from_numpy(ep["/action"][:])
+            if "/observations/qvel" in ep:
+                velocity = torch.from_numpy(ep["/observations/qvel"][:])
+            if "/observations/effort" in ep:
+                effort = torch.from_numpy(ep["/observations/effort"][:])

+            ep_dict = {}

+            videos_dir = out_dir / "videos"
+            videos_dir.mkdir(parents=True, exist_ok=True)

+            for camera in get_cameras(ep):
+                img_key = f"observation.images.{camera}"

+                raw_fname = f"episode_{raw_ep_idx}_{camera}.mp4"
+                new_fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
+                shutil.copy(str(raw_dir / raw_fname), str(videos_dir / new_fname))

+                # store the reference to the video frame
+                ep_dict[img_key] = [
+                    {"path": f"videos/{new_fname}", "timestamp": i / fps} for i in range(num_frames)
+                ]

+            ep_dict["observation.state"] = state
+            if "/observations/qvel" in ep:
+                ep_dict["observation.velocity"] = velocity
+            if "/observations/effort" in ep:
+                ep_dict["observation.effort"] = effort
+            ep_dict["action"] = action
+            ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
+            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+            ep_dict["next.done"] = done
+            # TODO(rcadene): add reward and success by computing them in sim

+            assert isinstance(ep_idx, int)
+            ep_dicts.append(ep_dict)

+            episode_data_index["from"].append(id_from)
+            episode_data_index["to"].append(id_from + num_frames)

+            id_from += num_frames

+        gc.collect()

+        # process first episode only
+        if debug:
+            break

+    data_dict = concatenate_episodes(ep_dicts)
+    return data_dict, episode_data_index


+def to_hf_dataset(data_dict, video) -> Dataset:
+    features = {}

+    keys = [key for key in data_dict if "observation.images." in key]
+    for key in keys:
+        if video:
+            features[key] = VideoFrame()
+        else:
+            features[key] = Image()

+    features["observation.state"] = Sequence(
+        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    if "observation.velocity" in data_dict:
+        features["observation.velocity"] = Sequence(
+            length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None)
+        )
+    if "observation.effort" in data_dict:
+        features["observation.effort"] = Sequence(
+            length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None)
+        )
+    features["action"] = Sequence(
+        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["episode_index"] = Value(dtype="int64", id=None)
+    features["frame_index"] = Value(dtype="int64", id=None)
+    features["timestamp"] = Value(dtype="float32", id=None)
+    features["next.done"] = Value(dtype="bool", id=None)
+    features["index"] = Value(dtype="int64", id=None)

+    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
+    hf_dataset.set_transform(hf_transform_to_torch)
+    return hf_dataset


+def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
+    # sanity check
+    check_format(raw_dir)

+    if fps is None:
+        fps = 30

+    data_dict, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
+    hf_dataset = to_hf_dataset(data_dict, video)

+    info = {
+        "fps": fps,
+        "video": video,
+    }
+    return hf_dataset, episode_data_index, info
diff --git a/lerobot/configs/env/dora_reachy2_real.yaml b/lerobot/configs/env/dora_reachy2_real.yaml
new file mode 100644
index 000000000..d1a53a7fa
--- /dev/null
+++ b/lerobot/configs/env/dora_reachy2_real.yaml
@@ -0,0 +1,13 @@
+# @package _global_

+fps: 30

+env:
+  name: dora
+  task: DoraReachy2-v0
+  state_dim: 16
+  action_dim: 16
+  fps: ${fps}
+  episode_length: 400
+  gym:
+    fps: ${fps}
diff --git a/lerobot/configs/policy/act_reachy2_real.yaml b/lerobot/configs/policy/act_reachy2_real.yaml
new file mode 100644
index 000000000..5e63287d0
--- /dev/null
+++ b/lerobot/configs/policy/act_reachy2_real.yaml
@@ -0,0 +1,97 @@
+# @package _global_

+# Use `act_reachy2_real.yaml` to train on real-world Reachy2 datasets (e.g. `cadene/reachy2_teleop_remi`).
+# Compared to `act.yaml`, it uses the single real `cam_trunk` camera instead of the simulated `top` camera
+# and the 16-dimensional Reachy2 state/action space. Also, `training.eval_freq` is set to -1, which
+# deactivates evaluation during training (a positive value evaluates checkpoints every that many steps).
+# This is because real-world evaluation is done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot).
+# Look at its README for more information on how to evaluate a checkpoint in the real-world.
+
+#
+# Example of usage for training:
+# ```bash
+# python lerobot/scripts/train.py \
+#   policy=act_reachy2_real \
+#   env=dora_reachy2_real
+# ```

+seed: 1000
+dataset_repo_id: cadene/reachy2_teleop_remi

+override_dataset_stats:
+  observation.images.cam_trunk:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)

+training:
+  offline_steps: 80000
+  online_steps: 0
+  eval_freq: -1
+  save_freq: 10000
+  log_freq: 100
+  save_checkpoint: true

+  batch_size: 8
+  lr: 1e-5
+  lr_backbone: 1e-5
+  weight_decay: 1e-4
+  grad_clip_norm: 10
+  online_steps_between_rollouts: 1

+  delta_timestamps:
+    action: "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]"

+eval:
+  n_episodes: 50
+  batch_size: 50

+# See `configuration_act.py` for more details.
+policy:
+  name: act

+  # Input / output structure.
+  n_obs_steps: 1
+  chunk_size: 100
+  n_action_steps: 100

+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.images.cam_trunk: [3, 800, 1280]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]

+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.images.cam_trunk: mean_std
+    observation.state: mean_std
+  output_normalization_modes:
+    action: mean_std

+  # Architecture.
+  # Vision backbone.
+  vision_backbone: resnet18
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  replace_final_stride_with_dilation: false
+  # Transformer layers.
+  pre_norm: false
+  dim_model: 512
+  n_heads: 8
+  dim_feedforward: 3200
+  feedforward_activation: relu
+  n_encoder_layers: 4
+  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
+  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
+  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
+  n_decoder_layers: 1
+  # VAE.
+  use_vae: true
+  latent_dim: 32
+  n_vae_encoder_layers: 4

+  # Inference.
+  temporal_ensemble_momentum: null

+  # Training and loss computation.
+  dropout: 0.1
+  kl_weight: 10.0
diff --git a/lerobot/scripts/push_dataset_to_hub.py b/lerobot/scripts/push_dataset_to_hub.py
index 52252b571..264256f3e 100644
--- a/lerobot/scripts/push_dataset_to_hub.py
+++ b/lerobot/scripts/push_dataset_to_hub.py
@@ -86,6 +86,8 @@ def get_from_raw_to_lerobot_format_fn(raw_format):
         from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import from_raw_to_lerobot_format
     elif raw_format == "aloha_dora":
         from lerobot.common.datasets.push_dataset_to_hub.aloha_dora_format import from_raw_to_lerobot_format
+    elif raw_format == "reachy2_hdf5":
+        from lerobot.common.datasets.push_dataset_to_hub.reachy2_hdf5_format import from_raw_to_lerobot_format
     elif raw_format == "xarm_pkl":
         from lerobot.common.datasets.push_dataset_to_hub.xarm_pkl_format import from_raw_to_lerobot_format
     else:
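For reference, a minimal sketch of converting a raw Reachy2 recording without going through `push_dataset_to_hub.py`, by calling `from_raw_to_lerobot_format` directly. The paths below are placeholders; the raw directory is expected to contain `episode_*.hdf5` files plus one `episode_{idx}_{camera}.mp4` per camera, as verified by `check_format`.

```python
from pathlib import Path

from lerobot.common.datasets.push_dataset_to_hub.reachy2_hdf5_format import (
    from_raw_to_lerobot_format,
)

# Placeholder paths: point raw_dir at a Reachy2 recording and out_dir at the conversion target.
raw_dir = Path("data/reachy2_teleop_raw")
out_dir = Path("data/reachy2_teleop_lerobot")

# Copies the per-episode MP4 files into out_dir / "videos" and returns the HuggingFace dataset,
# the per-episode frame ranges, and an info dict such as {"fps": 30, "video": True}.
hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(
    raw_dir, out_dir, fps=30, video=True, debug=False
)
print(info, len(hf_dataset))
```

This mirrors what `push_dataset_to_hub.py` does once the new `reachy2_hdf5` raw format is selected.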