forked from tangger/lerobot
Add reachy2 dataset, policy, env
lerobot/common/datasets/push_dataset_to_hub/reachy2_hdf5_format.py (new file, 189 lines)
@@ -0,0 +1,189 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Contains utilities to process the raw HDF5 data format of Reachy2 episodes, similar to https://github.com/tonyzhaozh/act.
"""

import gc
import re
import shutil
from pathlib import Path

import h5py
import torch
import tqdm
from datasets import Dataset, Features, Image, Sequence, Value

from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
from lerobot.common.datasets.utils import (
    hf_transform_to_torch,
)
from lerobot.common.datasets.video_utils import VideoFrame

def get_cameras(hdf5_data):
    # ignore depth channel, not currently handled
    # TODO(rcadene): add depth
    rgb_cameras = [key for key in hdf5_data["/observations/images_ids"].keys() if "depth" not in key]  # noqa: SIM118
    return rgb_cameras

def check_format(raw_dir) -> bool:
    hdf5_paths = list(raw_dir.glob("episode_*.hdf5"))
    assert len(hdf5_paths) != 0
    for hdf5_path in hdf5_paths:
        with h5py.File(hdf5_path, "r") as data:
            assert "/action" in data
            assert "/observations/qpos" in data

            assert data["/action"].ndim == 2
            assert data["/observations/qpos"].ndim == 2

            num_frames = data["/action"].shape[0]
            assert num_frames == data["/observations/qpos"].shape[0]

            for camera in get_cameras(data):
                assert num_frames == data[f"/observations/images_ids/{camera}"].shape[0]
                assert (raw_dir / hdf5_path.name.replace(".hdf5", f"_{camera}.mp4")).exists()

                # Frames are stored as side-car mp4 files rather than in the HDF5 file,
                # so the in-file image shape checks do not apply here:
                # assert data[f"/observations/images_ids/{camera}"].ndim == 4
                # b, h, w, c = data[f"/observations/images_ids/{camera}"].shape
                # assert c < h and c < w, f"Expect (h,w,c) image format but ({h=},{w=},{c=}) provided."

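For reference, a minimal sketch of the raw episode layout these checks expect. The dataset keys and the per-camera mp4 side file come from the assertions above; the frame count, dimensions, and camera name are illustrative assumptions only:

```python
import h5py
import numpy as np

# Hypothetical episode with 50 frames and a 16-dim state/action space.
with h5py.File("raw/episode_0.hdf5", "w") as f:
    f.create_dataset("/action", data=np.zeros((50, 16), dtype=np.float32))
    f.create_dataset("/observations/qpos", data=np.zeros((50, 16), dtype=np.float32))
    # One frame id per video frame; the pixels live in raw/episode_0_cam_trunk.mp4.
    f.create_dataset("/observations/images_ids/cam_trunk", data=np.arange(50))
```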
def load_from_raw(raw_dir, out_dir, fps, video, debug):
    hdf5_files = list(raw_dir.glob("*.hdf5"))
    ep_dicts = []
    episode_data_index = {"from": [], "to": []}

    id_from = 0
    for ep_idx, ep_path in tqdm.tqdm(enumerate(hdf5_files), total=len(hdf5_files)):
        match = re.search(r"_(\d+).hdf5", ep_path.name)
        if not match:
            raise ValueError(ep_path.name)
        raw_ep_idx = int(match.group(1))

        with h5py.File(ep_path, "r") as ep:
            num_frames = ep["/action"].shape[0]

            # last step of demonstration is considered done
            done = torch.zeros(num_frames, dtype=torch.bool)
            done[-1] = True

            state = torch.from_numpy(ep["/observations/qpos"][:])
            action = torch.from_numpy(ep["/action"][:])
            if "/observations/qvel" in ep:
                velocity = torch.from_numpy(ep["/observations/qvel"][:])
            if "/observations/effort" in ep:
                effort = torch.from_numpy(ep["/observations/effort"][:])

            ep_dict = {}

            videos_dir = out_dir / "videos"
            videos_dir.mkdir(parents=True, exist_ok=True)

            for camera in get_cameras(ep):
                img_key = f"observation.images.{camera}"

                raw_fname = f"episode_{raw_ep_idx}_{camera}.mp4"
                new_fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
                shutil.copy(str(raw_dir / raw_fname), str(videos_dir / new_fname))

                # store the reference to the video frame
                ep_dict[img_key] = [
                    {"path": f"videos/{new_fname}", "timestamp": i / fps} for i in range(num_frames)
                ]

            ep_dict["observation.state"] = state
            # the key checked here must match the `/observations/qvel` key read above
            if "/observations/qvel" in ep:
                ep_dict["observation.velocity"] = velocity
            if "/observations/effort" in ep:
                ep_dict["observation.effort"] = effort
            ep_dict["action"] = action
            ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
            ep_dict["next.done"] = done
            # TODO(rcadene): add reward and success by computing them in sim

            assert isinstance(ep_idx, int)
            ep_dicts.append(ep_dict)

            episode_data_index["from"].append(id_from)
            episode_data_index["to"].append(id_from + num_frames)

            id_from += num_frames

        gc.collect()

        # process first episode only
        if debug:
            break

    data_dict = concatenate_episodes(ep_dicts)
    return data_dict, episode_data_index

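`episode_data_index` records, per episode, the half-open `[from, to)` range of frames in the concatenated data. A small sketch (the helper name is ours, not part of the commit) of slicing one episode back out:

```python
def get_episode_frames(data_dict, episode_data_index, ep_idx):
    # Frames of episode `ep_idx` occupy the half-open range [from, to) in the
    # concatenated tensors/lists returned by load_from_raw.
    start = episode_data_index["from"][ep_idx]
    end = episode_data_index["to"][ep_idx]
    return {key: value[start:end] for key, value in data_dict.items()}
```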
def to_hf_dataset(data_dict, video) -> Dataset:
    features = {}

    keys = [key for key in data_dict if "observation.images." in key]
    for key in keys:
        if video:
            features[key] = VideoFrame()
        else:
            features[key] = Image()

    features["observation.state"] = Sequence(
        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
    )
    if "observation.velocity" in data_dict:
        features["observation.velocity"] = Sequence(
            length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None)
        )
    if "observation.effort" in data_dict:
        features["observation.effort"] = Sequence(
            length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None)
        )
    features["action"] = Sequence(
        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
    )
    features["episode_index"] = Value(dtype="int64", id=None)
    features["frame_index"] = Value(dtype="int64", id=None)
    features["timestamp"] = Value(dtype="float32", id=None)
    features["next.done"] = Value(dtype="bool", id=None)
    features["index"] = Value(dtype="int64", id=None)

    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
    hf_dataset.set_transform(hf_transform_to_torch)
    return hf_dataset

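Because `hf_transform_to_torch` is set as the transform, indexing the resulting dataset yields torch tensors. A hedged sketch of inspecting the first frame, assuming `hf_dataset` was built by `to_hf_dataset` above:

```python
# Each item is a dict keyed by the feature names defined above.
frame = hf_dataset[0]
print(frame["observation.state"].shape)  # torch.Size([state_dim])
print(frame["timestamp"], frame["next.done"])
```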
def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
    # sanity check
    check_format(raw_dir)

    if fps is None:
        fps = 30

    data_dict, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
    hf_dataset = to_hf_dataset(data_dict, video)

    info = {
        "fps": fps,
        "video": video,
    }
    return hf_dataset, episode_data_index, info
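A minimal usage sketch of the conversion entry point; the directory paths are placeholders:

```python
from pathlib import Path

from lerobot.common.datasets.push_dataset_to_hub.reachy2_hdf5_format import from_raw_to_lerobot_format

hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(
    raw_dir=Path("data/reachy2_raw"),      # directory with episode_*.hdf5 and *_<camera>.mp4 files
    out_dir=Path("data/reachy2_lerobot"),  # videos are copied into out_dir / "videos"
)
```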
lerobot/configs/env/dora_reachy2_real.yaml (vendored, new file, 13 lines)
@@ -0,0 +1,13 @@
# @package _global_

fps: 30

env:
  name: dora
  task: DoraReachy2-v0
  state_dim: 16
  action_dim: 16
  fps: ${fps}
  episode_length: 400
  gym:
    fps: ${fps}
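`${fps}` is an OmegaConf interpolation resolved against the top-level `fps` key, so both `env.fps` and `env.gym.fps` evaluate to 30. A quick check, assuming OmegaConf (which backs lerobot's Hydra configs) is installed:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("lerobot/configs/env/dora_reachy2_real.yaml")
OmegaConf.resolve(cfg)  # replaces ${fps} interpolations with their values
assert cfg.env.fps == 30 and cfg.env.gym.fps == 30
```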
lerobot/configs/policy/act_reachy2_real.yaml (new file, 97 lines)
@@ -0,0 +1,97 @@
# @package _global_

# Use `act_reachy2_real.yaml` to train ACT on a real-world Reachy2 dataset.
# Compared to `act.yaml`, it uses a single trunk camera (cam_trunk) as image input.
# `training.eval_freq` controls how often (in training steps) checkpoints are evaluated;
# it is set to -1 here, which deactivates evaluation, because real-world evaluation is
# done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot). Look at its README
# for more information on how to evaluate a checkpoint in the real world.
#
# Example of usage for training:
# ```bash
# python lerobot/scripts/train.py \
#   policy=act_reachy2_real \
#   env=dora_reachy2_real
# ```

seed: 1000
dataset_repo_id: cadene/reachy2_teleop_remi

override_dataset_stats:
  observation.images.cam_trunk:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
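These are the standard ImageNet channel statistics; the `(c,1,1)` shape lets them broadcast over `(c,h,w)` images. A sketch of the normalization they imply:

```python
import torch

imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

def normalize(image: torch.Tensor) -> torch.Tensor:
    # image: float tensor in [0, 1] with shape (3, h, w), e.g. (3, 800, 1280)
    return (image - imagenet_mean) / imagenet_std
```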
training:
  offline_steps: 80000
  online_steps: 0
  eval_freq: -1
  save_freq: 10000
  log_freq: 100
  save_checkpoint: true

  batch_size: 8
  lr: 1e-5
  lr_backbone: 1e-5
  weight_decay: 1e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  delta_timestamps:
    action: "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]"
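The `delta_timestamps` string is evaluated after Hydra interpolation; with `fps: 30` and `chunk_size: 100` it expands to the 100 future action timestamps sampled for each observation:

```python
fps = 30
chunk_size = 100

# Matches "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]" after interpolation.
action_delta_timestamps = [i / fps for i in range(1, chunk_size + 1)]
assert action_delta_timestamps[0] == 1 / 30     # one frame ahead
assert action_delta_timestamps[-1] == 100 / 30  # ~3.3 s ahead
```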
eval:
  n_episodes: 50
  batch_size: 50

# See `configuration_act.py` for more details.
policy:
  name: act

  # Input / output structure.
  n_obs_steps: 1
  chunk_size: 100
  n_action_steps: 100

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.images.cam_trunk: [3, 800, 1280]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  input_normalization_modes:
    observation.images.cam_trunk: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # Architecture.
  # Vision backbone.
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  # Transformer layers.
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
  n_decoder_layers: 1
  # VAE.
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4

  # Inference.
  temporal_ensemble_momentum: null

  # Training and loss computation.
  dropout: 0.1
  kl_weight: 10.0
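With `n_action_steps` equal to `chunk_size` and `temporal_ensemble_momentum: null`, the policy runs open-loop over each 100-step action chunk. Some back-of-the-envelope numbers for this config:

```python
# With chunk_size == n_action_steps == 100 at fps 30, the policy is queried
# once per executed chunk, i.e. roughly every 3.3 seconds of control.
fps = 30
n_action_steps = 100
episode_length = 400  # from dora_reachy2_real.yaml

queries_per_episode = -(-episode_length // n_action_steps)  # ceil division -> 4
seconds_between_queries = n_action_steps / fps              # ~3.33 s
print(queries_per_episode, seconds_between_queries)
```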
@@ -86,6 +86,8 @@ def get_from_raw_to_lerobot_format_fn(raw_format):
         from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import from_raw_to_lerobot_format
     elif raw_format == "aloha_dora":
         from lerobot.common.datasets.push_dataset_to_hub.aloha_dora_format import from_raw_to_lerobot_format
+    elif raw_format == "reachy2_hdf5":
+        from lerobot.common.datasets.push_dataset_to_hub.reachy2_hdf5_format import from_raw_to_lerobot_format
     elif raw_format == "xarm_pkl":
         from lerobot.common.datasets.push_dataset_to_hub.xarm_pkl_format import from_raw_to_lerobot_format
     else:
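Assuming `get_from_raw_to_lerobot_format_fn` returns the imported converter (as the surrounding branches suggest), the new format is selected like so:

```python
# "reachy2_hdf5" now dispatches to the converter added in this commit.
from_raw_to_lerobot_format = get_from_raw_to_lerobot_format_fn("reachy2_hdf5")
```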