updating with adding masking in ACT - start adding some tests
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
# Using `lerobot` on a real world arm
|
||||
|
||||
|
||||
In this example, we'll be using `lerobot` on a real world arm to:
|
||||
- record a dataset in the `lerobot` format
|
||||
- (soon) train a policy on it
|
||||
@@ -25,7 +26,9 @@ Follow these steps:
|
||||
- install `lerobot`
|
||||
- install the Dynamixel SDK: `pip install dynamixel-sdk`
|
||||
|
||||
## 0 - record examples
|
||||
## Usage
|
||||
|
||||
### 0 - record examples
|
||||
|
||||
Run the `record_training_data.py` example, selecting the duration and number of episodes you want to record, e.g.
|
||||
```
|
||||
@@ -40,7 +43,7 @@ TODO:
|
||||
- being able to drop episodes
|
||||
- checking uploading to the hub
|
||||
|
||||
## 1 - visualize the dataset
|
||||
### 1 - visualize the dataset
|
||||
|
||||
Use the standard dataset visualization script pointing it to the right folder:
|
||||
```
|
||||
@@ -49,7 +52,7 @@ DATA_DIR='./data' python ../../lerobot/scripts/visualize_dataset.py \
|
||||
--episode-index 0
|
||||
```
|
||||
|
||||
## 2 - Train a policy
|
||||
### 2 - Train a policy
|
||||
|
||||
From the example directory, let's run this command to train a model using ACT:
|
||||
|
||||
@@ -64,7 +67,7 @@ DATA_DIR='./data' python ../../lerobot/scripts/train.py \
|
||||
wandb.enable=false
|
||||
```
|
||||
|
||||
## 3 - Evaluate the policy in the real world
|
||||
### 3 - Evaluate the policy in the real world
|
||||
|
||||
From the example directory, let's run this command to evaluate our policy.
|
||||
The configuration for running the policy is in the checkpoint of the model.
|
||||
@@ -75,3 +78,12 @@ python run_policy.py \
|
||||
-p ./outputs/train/blue_red_sort/checkpoints/last/pretrained_model/
|
||||
env.episode_length=1000
|
||||
```
|
||||
|
||||
|
||||
## Convert an HDF5 dataset recorded with the original ACT repo
|
||||
|
||||
You can convert a dataset stored as raw HDF5 files, in the format used by https://github.com/tonyzhaozh/act, with the following command:
|
||||
|
||||
```
|
||||
python ./lerobot/scripts/push_dataset_to_hub.py
|
||||
```
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import time
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import cv2
|
||||
import gymnasium as gym
|
||||
@@ -23,6 +24,14 @@ CAMERAS_PORTS = {
|
||||
LEADER_PORT = "/dev/ttyACM1"
|
||||
FOLLOWER_PORT = "/dev/ttyACM0"
|
||||
|
||||
MockRobot = MagicMock()
|
||||
MockRobot.read_position = MagicMock()
|
||||
MockRobot.read_position.return_value = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
|
||||
|
||||
MockCamera = MagicMock()
|
||||
MockCamera.isOpened = MagicMock(return_value=True)
|
||||
MockCamera.read = MagicMock(return_value=(True, np.zeros((480, 640, 3), dtype=np.uint8)))
|
||||
|
||||
|
||||
def capture_image(cam, cam_width, cam_height):
|
||||
# Capture a single frame
|
||||
@@ -54,6 +63,7 @@ class RealEnv(gym.Env):
|
||||
trigger_torque=70,
|
||||
fps: int = FPS,
|
||||
fps_tolerance: float = 0.1,
|
||||
mock: bool = False,
|
||||
):
|
||||
self.num_joints = num_joints
|
||||
self.cameras_shapes = cameras_shapes
|
||||
@@ -68,15 +78,15 @@ class RealEnv(gym.Env):
|
||||
self.fps_tolerance = fps_tolerance
|
||||
|
||||
# Initialize the robot
|
||||
self.follower = Robot(device_name=self.follower_port)
|
||||
self.follower = Robot(device_name=self.follower_port) if not mock else MockRobot
|
||||
if self.record:
|
||||
self.leader = Robot(device_name=self.leader_port)
|
||||
self.leader = Robot(device_name=self.leader_port) if not mock else MockRobot
|
||||
self.leader.set_trigger_torque(trigger_torque)
|
||||
|
||||
# Initialize the cameras - sorted by camera names
|
||||
self.cameras = {}
|
||||
for cn, p in sorted(self.cameras_ports.items()):
|
||||
self.cameras[cn] = cv2.VideoCapture(p)
|
||||
self.cameras[cn] = cv2.VideoCapture(p) if not mock else MockCamera
|
||||
if not self.cameras[cn].isOpened():
|
||||
raise OSError(
|
||||
f"Cannot open camera port {p} for {cn}."
|
||||
@@ -118,7 +128,6 @@ class RealEnv(gym.Env):
|
||||
|
||||
self._observation = {}
|
||||
self._terminated = False
|
||||
self.starting_time = time.time()
|
||||
self.timestamps = []
|
||||
|
||||
def _get_obs(self):
|
||||
@@ -146,13 +155,8 @@ class RealEnv(gym.Env):
|
||||
if self.timestamps:
|
||||
# wait the right amount of time to stay at the desired fps
|
||||
time.sleep(max(0, 1 / self.fps - (time.time() - self.timestamps[-1])))
|
||||
recording_time = time.time() - self.starting_time
|
||||
else:
|
||||
# it's the first step so we start the timer
|
||||
self.starting_time = time.time()
|
||||
recording_time = 0
|
||||
|
||||
self.timestamps.append(recording_time)
|
||||
self.timestamps.append(time.time())
|
||||
|
||||
# Get the observation
|
||||
self._get_obs()
|
||||
@@ -165,13 +169,15 @@ class RealEnv(gym.Env):
|
||||
|
||||
reward = 0
|
||||
terminated = truncated = self._terminated
|
||||
info = {"timestamp": recording_time, "fps_error": False}
|
||||
info = {"timestamp": self.timestamps[-1] - self.timestamps[0], "fps_error": False}
|
||||
|
||||
# Check if we are able to keep up with the desired fps
|
||||
if recording_time - self.timestamps[-1] > 1 / (self.fps - self.fps_tolerance):
|
||||
if len(self.timestamps) > 1 and (self.timestamps[-1] - self.timestamps[-2]) > 1 / (
|
||||
self.fps - self.fps_tolerance
|
||||
):
|
||||
print(
|
||||
f"Error: recording time interval {recording_time - self.timestamps[-1]:.2f} is greater"
|
||||
f"than expected {1 / (self.fps - self.fps_tolerance):.2f}"
|
||||
f"Error: recording fps {1 / (self.timestamps[-1] - self.timestamps[-2]):.5f} is lower"
|
||||
f" than min admited fps {(self.fps - self.fps_tolerance):.5f}"
|
||||
f" at frame {len(self.timestamps)}"
|
||||
)
|
||||
info["fps_error"] = True
|
||||
|
||||
@@ -6,12 +6,14 @@ using a very simple gym environment (see in examples/real_robot_example/gym_real
|
||||
import argparse
|
||||
import copy
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import gym_real_world # noqa: F401
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
import torch
|
||||
from datasets import Dataset, Features, Sequence, Value
|
||||
from omegaconf import OmegaConf
|
||||
from tqdm import tqdm
|
||||
|
||||
from lerobot.common.datasets.compute_stats import compute_stats
|
||||
@@ -30,17 +32,20 @@ parser.add_argument("--num-episodes", type=int, default=2)
|
||||
parser.add_argument("--num-frames", type=int, default=400)
|
||||
parser.add_argument("--num-workers", type=int, default=16)
|
||||
parser.add_argument("--keep-last", action="store_true")
|
||||
parser.add_argument("--data_dir", type=str, default=None)
|
||||
parser.add_argument("--push-to-hub", action="store_true")
|
||||
parser.add_argument("--fps", type=int, default=30, help="Frames per second of the recording.")
|
||||
parser.add_argument(
|
||||
"--fps_tolerance",
|
||||
type=float,
|
||||
default=0.1,
|
||||
default=0.5,
|
||||
help="Tolerance in fps for the recording before dropping episodes.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--revision", type=str, default=CODEBASE_VERSION, help="Codebase version used to generate the dataset."
|
||||
)
|
||||
parser.add_argument("--gym-config", type=str, default=None, help="Path to the gym config file.")
|
||||
parser.add_argument("--mock_robot", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
repo_id = args.repo_id
|
||||
@@ -50,7 +55,7 @@ revision = args.revision
|
||||
fps = args.fps
|
||||
fps_tolerance = args.fps_tolerance
|
||||
|
||||
out_data = DATA_DIR / repo_id
|
||||
out_data = DATA_DIR / repo_id if args.data_dir is None else Path(args.data_dir)
|
||||
|
||||
# During data collection, frames are stored as png images in `images_dir`
|
||||
images_dir = out_data / "images"
|
||||
@@ -58,6 +63,9 @@ images_dir = out_data / "images"
|
||||
videos_dir = out_data / "videos"
|
||||
meta_data_dir = out_data / "meta_data"
|
||||
|
||||
gym_config = None
|
||||
if args.config is not None:
|
||||
gym_config = OmegaConf.load(args.config)
|
||||
|
||||
# Create image and video directories
|
||||
if not os.path.exists(images_dir):
|
||||
@@ -68,7 +76,12 @@ if not os.path.exists(videos_dir):
|
||||
if __name__ == "__main__":
|
||||
# Create the gym environment - check the kwargs in gym_real_world/gym_environment.py
|
||||
gym_handle = "gym_real_world/RealEnv-v0"
|
||||
env = gym.make(gym_handle, disable_env_checker=True, record=True, fps=fps, fps_tolerance=fps_tolerance)
|
||||
gym_kwargs = {}
|
||||
if gym_config is not None:
|
||||
gym_kwargs = OmegaConf.to_container(gym_config.gym_kwargs)
|
||||
env = gym.make(
|
||||
gym_handle, disable_env_checker=True, record=True, fps=fps, fps_tolerance=fps_tolerance, mock=True
|
||||
)
|
||||
|
||||
ep_dicts = []
|
||||
episode_data_index = {"from": [], "to": []}
|
||||
|
||||
@@ -10,3 +10,10 @@ env:
|
||||
fps: ${fps}
|
||||
episode_length: 200
|
||||
real_world: true
|
||||
gym:
|
||||
cameras_shapes:
|
||||
images.high: [480, 640, 3]
|
||||
images.low: [480, 640, 3]
|
||||
cameras_ports:
|
||||
images.high: /dev/video6
|
||||
images.low: /dev/video0
|
||||
|
||||
19
examples/real_robot_example/train_config/env/gym_real_world_debug.yaml
vendored
Normal file
19
examples/real_robot_example/train_config/env/gym_real_world_debug.yaml
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# @package _global_
|
||||
|
||||
fps: 30
|
||||
|
||||
env:
|
||||
name: real_world
|
||||
task: RealEnv-v0
|
||||
state_dim: 6
|
||||
action_dim: 6
|
||||
fps: ${fps}
|
||||
episode_length: 200
|
||||
real_world: true
|
||||
gym:
|
||||
cameras_shapes:
|
||||
images.top: [480, 640, 3]
|
||||
images.front: [480, 640, 3]
|
||||
cameras_ports:
|
||||
images.top: /dev/video6
|
||||
images.front: /dev/video0
|
||||
@@ -0,0 +1,103 @@
|
||||
# @package _global_
|
||||
|
||||
# Use `act_real.yaml` to train on real-world Aloha/Aloha2 datasets.
|
||||
# Compared to `act.yaml`, it contains 2 cameras (i.e. top and front) instead of 1 camera
|
||||
# (i.e. top). Also, `training.eval_freq` is set to -1. This config is used
|
||||
# to evaluate checkpoints at a certain frequency of training steps. When it is set to -1, it deactivates evaluation.
|
||||
# This is because real-world evaluation is done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot).
|
||||
# Look at its README for more information on how to evaluate a checkpoint in the real-world.
|
||||
#
|
||||
# Example of usage for training:
|
||||
# ```bash
|
||||
# python lerobot/scripts/train.py \
|
||||
# policy=act_real \
|
||||
# env=aloha_real
|
||||
# ```
|
||||
|
||||
seed: 1000
|
||||
dataset_repo_id: ???
|
||||
|
||||
override_dataset_stats:
|
||||
observation.images.top:
|
||||
# stats from imagenet, since we use a pretrained vision model
|
||||
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
|
||||
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
|
||||
observation.images.front:
|
||||
# stats from imagenet, since we use a pretrained vision model
|
||||
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
|
||||
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
|
||||
|
||||
training:
|
||||
offline_steps: 1000
|
||||
online_steps: 0
|
||||
eval_freq: -1
|
||||
save_freq: 1000
|
||||
log_freq: 100
|
||||
save_checkpoint: true
|
||||
|
||||
batch_size: 8
|
||||
lr: 1e-5
|
||||
lr_backbone: 1e-5
|
||||
weight_decay: 1e-4
|
||||
grad_clip_norm: 10
|
||||
online_steps_between_rollouts: 1
|
||||
|
||||
delta_timestamps:
|
||||
action: "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]"
|
||||
|
||||
eval:
|
||||
n_episodes: 1
|
||||
batch_size: 1
|
||||
|
||||
# See `configuration_act.py` for more details.
|
||||
policy:
|
||||
name: act
|
||||
|
||||
# Input / output structure.
|
||||
n_obs_steps: 1
|
||||
chunk_size: 100 # chunk_size
|
||||
n_action_steps: 100
|
||||
|
||||
input_shapes:
|
||||
# TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
|
||||
observation.images.top: [3, 480, 640]
|
||||
observation.images.front: [3, 480, 640]
|
||||
observation.state: ["${env.state_dim}"]
|
||||
output_shapes:
|
||||
action: ["${env.action_dim}"]
|
||||
|
||||
# Normalization / Unnormalization
|
||||
input_normalization_modes:
|
||||
observation.images.top: mean_std
|
||||
observation.images.front: mean_std
|
||||
observation.state: mean_std
|
||||
output_normalization_modes:
|
||||
action: mean_std
|
||||
|
||||
# Architecture.
|
||||
# Vision backbone.
|
||||
vision_backbone: resnet18
|
||||
pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
|
||||
replace_final_stride_with_dilation: false
|
||||
# Transformer layers.
|
||||
pre_norm: false
|
||||
dim_model: 512
|
||||
n_heads: 8
|
||||
dim_feedforward: 3200
|
||||
feedforward_activation: relu
|
||||
n_encoder_layers: 4
|
||||
# Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
|
||||
# that means only the first layer is used. Here we match the original implementation by setting this to 1.
|
||||
# See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
|
||||
n_decoder_layers: 1
|
||||
# VAE.
|
||||
use_vae: true
|
||||
latent_dim: 32
|
||||
n_vae_encoder_layers: 4
|
||||
|
||||
# Inference.
|
||||
temporal_ensemble_momentum: null
|
||||
|
||||
# Training and loss computation.
|
||||
dropout: 0.1
|
||||
kl_weight: 10.0
|
||||
Reference in New Issue
Block a user