forked from tangger/lerobot
Add online training with TD-MPC as proof of concept (#338)
This commit is contained in:
384
lerobot/common/datasets/online_buffer.py
Normal file
384
lerobot/common/datasets/online_buffer.py
Normal file
@@ -0,0 +1,384 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""An online buffer for the online training loop in train.py
|
||||
|
||||
Note to maintainers: This duplicates some logic from LeRobotDataset and EpisodeAwareSampler. We should
|
||||
consider converging to one approach. Here we have opted to use numpy.memmap to back the data buffer. It's much
|
||||
faster than using HuggingFace Datasets as there's no conversion to an intermediate non-python object. Also it
|
||||
supports in-place slicing and mutation which is very handy for a dynamic buffer.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
|
||||
def _make_memmap_safe(**kwargs) -> np.memmap:
|
||||
"""Make a numpy memmap with checks on available disk space first.
|
||||
|
||||
Expected kwargs are: "filename", "dtype" (must by np.dtype), "mode" and "shape"
|
||||
|
||||
For information on dtypes:
|
||||
https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes-constructing
|
||||
"""
|
||||
if kwargs["mode"].startswith("w"):
|
||||
required_space = kwargs["dtype"].itemsize * np.prod(kwargs["shape"]) # bytes
|
||||
stats = os.statvfs(Path(kwargs["filename"]).parent)
|
||||
available_space = stats.f_bavail * stats.f_frsize # bytes
|
||||
if required_space >= available_space * 0.8:
|
||||
raise RuntimeError(
|
||||
f"You're about to take up {required_space} of {available_space} bytes available."
|
||||
)
|
||||
return np.memmap(**kwargs)
|
||||
|
||||
|
||||
class OnlineBuffer(torch.utils.data.Dataset):
|
||||
"""FIFO data buffer for the online training loop in train.py.
|
||||
|
||||
Follows the protocol of LeRobotDataset as much as is required to have it be used by the online training
|
||||
loop in the same way that a LeRobotDataset would be used.
|
||||
|
||||
The underlying data structure will have data inserted in a circular fashion. Always insert after the
|
||||
last index, and when you reach the end, wrap around to the start.
|
||||
|
||||
The data is stored in a numpy memmap.
|
||||
"""
|
||||
|
||||
NEXT_INDEX_KEY = "_next_index"
|
||||
OCCUPANCY_MASK_KEY = "_occupancy_mask"
|
||||
INDEX_KEY = "index"
|
||||
FRAME_INDEX_KEY = "frame_index"
|
||||
EPISODE_INDEX_KEY = "episode_index"
|
||||
TIMESTAMP_KEY = "timestamp"
|
||||
IS_PAD_POSTFIX = "_is_pad"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
write_dir: str | Path,
|
||||
data_spec: dict[str, Any] | None,
|
||||
buffer_capacity: int | None,
|
||||
fps: float | None = None,
|
||||
delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None,
|
||||
):
|
||||
"""
|
||||
The online buffer can be provided from scratch or you can load an existing online buffer by passing
|
||||
a `write_dir` associated with an existing buffer.
|
||||
|
||||
Args:
|
||||
write_dir: Where to keep the numpy memmap files. One memmap file will be stored for each data key.
|
||||
Note that if the files already exist, they are opened in read-write mode (used for training
|
||||
resumption.)
|
||||
data_spec: A mapping from data key to data specification, like {data_key: {"shape": tuple[int],
|
||||
"dtype": np.dtype}}. This should include all the data that you wish to record into the buffer,
|
||||
but note that "index", "frame_index" and "episode_index" are already accounted for by this
|
||||
class, so you don't need to include them.
|
||||
buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your
|
||||
system's available disk space when choosing this.
|
||||
fps: Same as the fps concept in LeRobot dataset. Here it needs to be provided for the
|
||||
delta_timestamps logic. You can pass None if you are not using delta_timestamps.
|
||||
delta_timestamps: Same as the delta_timestamps concept in LeRobotDataset. This is internally
|
||||
converted to dict[str, np.ndarray] for optimization purposes.
|
||||
|
||||
"""
|
||||
self.set_delta_timestamps(delta_timestamps)
|
||||
self._fps = fps
|
||||
# Tolerance in seconds used to discard loaded frames when their timestamps are not close enough from
|
||||
# the requested frames. It is only used when `delta_timestamps` is provided.
|
||||
# minus 1e-4 to account for possible numerical error
|
||||
self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None
|
||||
self._buffer_capacity = buffer_capacity
|
||||
data_spec = self._make_data_spec(data_spec, buffer_capacity)
|
||||
Path(write_dir).mkdir(parents=True, exist_ok=True)
|
||||
self._data = {}
|
||||
for k, v in data_spec.items():
|
||||
self._data[k] = _make_memmap_safe(
|
||||
filename=Path(write_dir) / k,
|
||||
dtype=v["dtype"] if v is not None else None,
|
||||
mode="r+" if (Path(write_dir) / k).exists() else "w+",
|
||||
shape=tuple(v["shape"]) if v is not None else None,
|
||||
)
|
||||
|
||||
@property
|
||||
def delta_timestamps(self) -> dict[str, np.ndarray] | None:
|
||||
return self._delta_timestamps
|
||||
|
||||
def set_delta_timestamps(self, value: dict[str, list[float]] | None):
|
||||
"""Set delta_timestamps converting the values to numpy arrays.
|
||||
|
||||
The conversion is for an optimization in the __getitem__. The loop is much slower if the arrays
|
||||
need to be converted into numpy arrays.
|
||||
"""
|
||||
if value is not None:
|
||||
self._delta_timestamps = {k: np.array(v) for k, v in value.items()}
|
||||
else:
|
||||
self._delta_timestamps = None
|
||||
|
||||
def _make_data_spec(self, data_spec: dict[str, Any], buffer_capacity: int) -> dict[str, dict[str, Any]]:
|
||||
"""Makes the data spec for np.memmap."""
|
||||
if any(k.startswith("_") for k in data_spec):
|
||||
raise ValueError(
|
||||
"data_spec keys should not start with '_'. This prefix is reserved for internal logic."
|
||||
)
|
||||
preset_keys = {
|
||||
OnlineBuffer.INDEX_KEY,
|
||||
OnlineBuffer.FRAME_INDEX_KEY,
|
||||
OnlineBuffer.EPISODE_INDEX_KEY,
|
||||
OnlineBuffer.TIMESTAMP_KEY,
|
||||
}
|
||||
if len(intersection := set(data_spec).intersection(preset_keys)) > 0:
|
||||
raise ValueError(
|
||||
f"data_spec should not contain any of {preset_keys} as these are handled internally. "
|
||||
f"The provided data_spec has {intersection}."
|
||||
)
|
||||
complete_data_spec = {
|
||||
# _next_index will be a pointer to the next index that we should start filling from when we add
|
||||
# more data.
|
||||
OnlineBuffer.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": ()},
|
||||
# Since the memmap is initialized with all-zeros, this keeps track of which indices are occupied
|
||||
# with real data rather than the dummy initialization.
|
||||
OnlineBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (buffer_capacity,)},
|
||||
OnlineBuffer.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
|
||||
OnlineBuffer.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
|
||||
OnlineBuffer.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
|
||||
OnlineBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (buffer_capacity,)},
|
||||
}
|
||||
for k, v in data_spec.items():
|
||||
complete_data_spec[k] = {"dtype": v["dtype"], "shape": (buffer_capacity, *v["shape"])}
|
||||
return complete_data_spec
|
||||
|
||||
def add_data(self, data: dict[str, np.ndarray]):
|
||||
"""Add new data to the buffer, which could potentially mean shifting old data out.
|
||||
|
||||
The new data should contain all the frames (in order) of any number of episodes. The indices should
|
||||
start from 0 (note to the developer: this can easily be generalized). See the `rollout` and
|
||||
`eval_policy` functions in `eval.py` for more information on how the data is constructed.
|
||||
|
||||
Shift the incoming data index and episode_index to continue on from the last frame. Note that this
|
||||
will be done in place!
|
||||
"""
|
||||
if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0:
|
||||
raise ValueError(f"Missing data keys: {missing_keys}")
|
||||
new_data_length = len(data[self.data_keys[0]])
|
||||
if not all(len(data[k]) == new_data_length for k in self.data_keys):
|
||||
raise ValueError("All data items should have the same length")
|
||||
|
||||
next_index = self._data[OnlineBuffer.NEXT_INDEX_KEY]
|
||||
|
||||
# Sanity check to make sure that the new data indices start from 0.
|
||||
assert data[OnlineBuffer.EPISODE_INDEX_KEY][0].item() == 0
|
||||
assert data[OnlineBuffer.INDEX_KEY][0].item() == 0
|
||||
|
||||
# Shift the incoming indices if necessary.
|
||||
if self.num_samples > 0:
|
||||
last_episode_index = self._data[OnlineBuffer.EPISODE_INDEX_KEY][next_index - 1]
|
||||
last_data_index = self._data[OnlineBuffer.INDEX_KEY][next_index - 1]
|
||||
data[OnlineBuffer.EPISODE_INDEX_KEY] += last_episode_index + 1
|
||||
data[OnlineBuffer.INDEX_KEY] += last_data_index + 1
|
||||
|
||||
# Insert the new data starting from next_index. It may be necessary to wrap around to the start.
|
||||
n_surplus = max(0, new_data_length - (self._buffer_capacity - next_index))
|
||||
for k in self.data_keys:
|
||||
if n_surplus == 0:
|
||||
slc = slice(next_index, next_index + new_data_length)
|
||||
self._data[k][slc] = data[k]
|
||||
self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][slc] = True
|
||||
else:
|
||||
self._data[k][next_index:] = data[k][:-n_surplus]
|
||||
self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][next_index:] = True
|
||||
self._data[k][:n_surplus] = data[k][-n_surplus:]
|
||||
if n_surplus == 0:
|
||||
self._data[OnlineBuffer.NEXT_INDEX_KEY] = next_index + new_data_length
|
||||
else:
|
||||
self._data[OnlineBuffer.NEXT_INDEX_KEY] = n_surplus
|
||||
|
||||
@property
|
||||
def data_keys(self) -> list[str]:
|
||||
keys = set(self._data)
|
||||
keys.remove(OnlineBuffer.OCCUPANCY_MASK_KEY)
|
||||
keys.remove(OnlineBuffer.NEXT_INDEX_KEY)
|
||||
return sorted(keys)
|
||||
|
||||
@property
|
||||
def fps(self) -> float | None:
|
||||
return self._fps
|
||||
|
||||
@property
|
||||
def num_episodes(self) -> int:
|
||||
return len(
|
||||
np.unique(self._data[OnlineBuffer.EPISODE_INDEX_KEY][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
|
||||
)
|
||||
|
||||
@property
|
||||
def num_samples(self) -> int:
|
||||
return np.count_nonzero(self._data[OnlineBuffer.OCCUPANCY_MASK_KEY])
|
||||
|
||||
def __len__(self):
|
||||
return self.num_samples
|
||||
|
||||
def _item_to_tensors(self, item: dict) -> dict:
|
||||
item_ = {}
|
||||
for k, v in item.items():
|
||||
if isinstance(v, torch.Tensor):
|
||||
item_[k] = v
|
||||
elif isinstance(v, np.ndarray):
|
||||
item_[k] = torch.from_numpy(v)
|
||||
else:
|
||||
item_[k] = torch.tensor(v)
|
||||
return item_
|
||||
|
||||
def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
|
||||
if idx >= len(self) or idx < -len(self):
|
||||
raise IndexError
|
||||
|
||||
item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")}
|
||||
|
||||
if self.delta_timestamps is None:
|
||||
return self._item_to_tensors(item)
|
||||
|
||||
episode_index = item[OnlineBuffer.EPISODE_INDEX_KEY]
|
||||
current_ts = item[OnlineBuffer.TIMESTAMP_KEY]
|
||||
episode_data_indices = np.where(
|
||||
np.bitwise_and(
|
||||
self._data[OnlineBuffer.EPISODE_INDEX_KEY] == episode_index,
|
||||
self._data[OnlineBuffer.OCCUPANCY_MASK_KEY],
|
||||
)
|
||||
)[0]
|
||||
episode_timestamps = self._data[OnlineBuffer.TIMESTAMP_KEY][episode_data_indices]
|
||||
|
||||
for data_key in self.delta_timestamps:
|
||||
# Note: The logic in this loop is copied from `load_previous_and_future_frames`.
|
||||
# Get timestamps used as query to retrieve data of previous/future frames.
|
||||
query_ts = current_ts + self.delta_timestamps[data_key]
|
||||
|
||||
# Compute distances between each query timestamp and all timestamps of all the frames belonging to
|
||||
# the episode.
|
||||
dist = np.abs(query_ts[:, None] - episode_timestamps[None, :])
|
||||
argmin_ = np.argmin(dist, axis=1)
|
||||
min_ = dist[np.arange(dist.shape[0]), argmin_]
|
||||
|
||||
is_pad = min_ > self.tolerance_s
|
||||
|
||||
# Check violated query timestamps are all outside the episode range.
|
||||
assert (
|
||||
(query_ts[is_pad] < episode_timestamps[0]) | (episode_timestamps[-1] < query_ts[is_pad])
|
||||
).all(), (
|
||||
f"One or several timestamps unexpectedly violate the tolerance ({min_} > {self.tolerance_s=}"
|
||||
") inside the episode range."
|
||||
)
|
||||
|
||||
# Load frames for this data key.
|
||||
item[data_key] = self._data[data_key][episode_data_indices[argmin_]]
|
||||
|
||||
item[f"{data_key}{OnlineBuffer.IS_PAD_POSTFIX}"] = is_pad
|
||||
|
||||
return self._item_to_tensors(item)
|
||||
|
||||
def get_data_by_key(self, key: str) -> torch.Tensor:
|
||||
"""Returns all data for a given data key as a Tensor."""
|
||||
return torch.from_numpy(self._data[key][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
|
||||
|
||||
|
||||
def compute_sampler_weights(
|
||||
offline_dataset: LeRobotDataset,
|
||||
offline_drop_n_last_frames: int = 0,
|
||||
online_dataset: OnlineBuffer | None = None,
|
||||
online_sampling_ratio: float | None = None,
|
||||
online_drop_n_last_frames: int = 0,
|
||||
) -> torch.Tensor:
|
||||
"""Compute the sampling weights for the online training dataloader in train.py.
|
||||
|
||||
Args:
|
||||
offline_dataset: The LeRobotDataset used for offline pre-training.
|
||||
online_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode.
|
||||
online_dataset: The OnlineBuffer used in online training.
|
||||
online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an
|
||||
online dataset is provided, this value must also be provided.
|
||||
online_drop_n_first_frames: See `offline_drop_n_last_frames`. This is the same, but for the online
|
||||
dataset.
|
||||
Returns:
|
||||
Tensor of weights for [offline_dataset; online_dataset], normalized to 1.
|
||||
|
||||
Notes to maintainers:
|
||||
- This duplicates some logic from EpisodeAwareSampler. We should consider converging to one approach.
|
||||
- When used with `torch.utils.data.WeightedRandomSampler`, it could completely replace
|
||||
`EpisodeAwareSampler` as the online dataset related arguments are optional. The only missing feature
|
||||
is the ability to turn shuffling off.
|
||||
- Options `drop_first_n_frames` and `episode_indices_to_use` can be added easily. They were not
|
||||
included here to avoid adding complexity.
|
||||
"""
|
||||
if len(offline_dataset) == 0 and (online_dataset is None or len(online_dataset) == 0):
|
||||
raise ValueError("At least one of `offline_dataset` or `online_dataset` should be contain data.")
|
||||
if (online_dataset is None) ^ (online_sampling_ratio is None):
|
||||
raise ValueError(
|
||||
"`online_dataset` and `online_sampling_ratio` must be provided together or not at all."
|
||||
)
|
||||
offline_sampling_ratio = 0 if online_sampling_ratio is None else 1 - online_sampling_ratio
|
||||
|
||||
weights = []
|
||||
|
||||
if len(offline_dataset) > 0:
|
||||
offline_data_mask_indices = []
|
||||
for start_index, end_index in zip(
|
||||
offline_dataset.episode_data_index["from"],
|
||||
offline_dataset.episode_data_index["to"],
|
||||
strict=True,
|
||||
):
|
||||
offline_data_mask_indices.extend(
|
||||
range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
|
||||
)
|
||||
offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
|
||||
offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
|
||||
weights.append(
|
||||
torch.full(
|
||||
size=(len(offline_dataset),),
|
||||
fill_value=offline_sampling_ratio / offline_data_mask.sum(),
|
||||
)
|
||||
* offline_data_mask
|
||||
)
|
||||
|
||||
if online_dataset is not None and len(online_dataset) > 0:
|
||||
online_data_mask_indices = []
|
||||
episode_indices = online_dataset.get_data_by_key("episode_index")
|
||||
for episode_idx in torch.unique(episode_indices):
|
||||
where_episode = torch.where(episode_indices == episode_idx)
|
||||
start_index = where_episode[0][0]
|
||||
end_index = where_episode[0][-1] + 1
|
||||
online_data_mask_indices.extend(
|
||||
range(start_index.item(), end_index.item() - online_drop_n_last_frames)
|
||||
)
|
||||
online_data_mask = torch.zeros(len(online_dataset), dtype=torch.bool)
|
||||
online_data_mask[torch.tensor(online_data_mask_indices)] = True
|
||||
weights.append(
|
||||
torch.full(
|
||||
size=(len(online_dataset),),
|
||||
fill_value=online_sampling_ratio / online_data_mask.sum(),
|
||||
)
|
||||
* online_data_mask
|
||||
)
|
||||
|
||||
weights = torch.cat(weights)
|
||||
|
||||
if weights.sum() == 0:
|
||||
weights += 1 / len(weights)
|
||||
else:
|
||||
weights /= weights.sum()
|
||||
|
||||
return weights
|
||||
@@ -25,12 +25,16 @@ class TDMPCConfig:
|
||||
camera observations.
|
||||
|
||||
The parameters you will most likely need to change are the ones which depend on the environment / sensors.
|
||||
Those are: `input_shapes`, `output_shapes`, and perhaps `max_random_shift`.
|
||||
Those are: `input_shapes`, `output_shapes`, and perhaps `max_random_shift_ratio`.
|
||||
|
||||
Args:
|
||||
n_action_repeats: The number of times to repeat the action returned by the planning. (hint: Google
|
||||
action repeats in Q-learning or ask your favorite chatbot)
|
||||
horizon: Horizon for model predictive control.
|
||||
n_action_steps: Number of action steps to take from the plan given by model predictive control. This
|
||||
is an alternative to using action repeats. If this is set to more than 1, then we require
|
||||
`n_action_repeats == 1`, `use_mpc == True` and `n_action_steps <= horizon`. Note that this
|
||||
approach of using multiple steps from the plan is not in the original implementation.
|
||||
input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents
|
||||
the input data name, and the value is a list indicating the dimensions of the corresponding data.
|
||||
For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96],
|
||||
@@ -100,6 +104,7 @@ class TDMPCConfig:
|
||||
# Input / output structure.
|
||||
n_action_repeats: int = 2
|
||||
horizon: int = 5
|
||||
n_action_steps: int = 1
|
||||
|
||||
input_shapes: dict[str, list[int]] = field(
|
||||
default_factory=lambda: {
|
||||
@@ -158,17 +163,18 @@ class TDMPCConfig:
|
||||
"""Input validation (not exhaustive)."""
|
||||
# There should only be one image key.
|
||||
image_keys = {k for k in self.input_shapes if k.startswith("observation.image")}
|
||||
if len(image_keys) != 1:
|
||||
if len(image_keys) > 1:
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} only handles one image for now. Got image keys {image_keys}."
|
||||
)
|
||||
image_key = next(iter(image_keys))
|
||||
if self.input_shapes[image_key][-2] != self.input_shapes[image_key][-1]:
|
||||
# TODO(alexander-soare): This limitation is solely because of code in the random shift
|
||||
# augmentation. It should be able to be removed.
|
||||
raise ValueError(
|
||||
f"Only square images are handled now. Got image shape {self.input_shapes[image_key]}."
|
||||
f"{self.__class__.__name__} handles at most one image for now. Got image keys {image_keys}."
|
||||
)
|
||||
if len(image_keys) > 0:
|
||||
image_key = next(iter(image_keys))
|
||||
if self.input_shapes[image_key][-2] != self.input_shapes[image_key][-1]:
|
||||
# TODO(alexander-soare): This limitation is solely because of code in the random shift
|
||||
# augmentation. It should be able to be removed.
|
||||
raise ValueError(
|
||||
f"Only square images are handled now. Got image shape {self.input_shapes[image_key]}."
|
||||
)
|
||||
if self.n_gaussian_samples <= 0:
|
||||
raise ValueError(
|
||||
f"The number of guassian samples for CEM should be non-zero. Got `{self.n_gaussian_samples=}`"
|
||||
@@ -179,3 +185,12 @@ class TDMPCConfig:
|
||||
f"advised that you stick with the default. See {self.__class__.__name__} docstring for more "
|
||||
"information."
|
||||
)
|
||||
if self.n_action_steps > 1:
|
||||
if self.n_action_repeats != 1:
|
||||
raise ValueError(
|
||||
"If `n_action_steps > 1`, `n_action_repeats` must be left to its default value of 1."
|
||||
)
|
||||
if not self.use_mpc:
|
||||
raise ValueError("If `n_action_steps > 1`, `use_mpc` must be set to `True`.")
|
||||
if self.n_action_steps > self.horizon:
|
||||
raise ValueError("`n_action_steps` must be less than or equal to `horizon`.")
|
||||
|
||||
@@ -19,14 +19,10 @@
|
||||
The comments in this code may sometimes refer to these references:
|
||||
TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://arxiv.org/abs/2203.04955)
|
||||
FOWM paper: Finetuning Offline World Models in the Real World (https://arxiv.org/abs/2310.16029)
|
||||
|
||||
TODO(alexander-soare): Make rollout work for batch sizes larger than 1.
|
||||
TODO(alexander-soare): Use batch-first throughout.
|
||||
"""
|
||||
|
||||
# ruff: noqa: N806
|
||||
|
||||
import logging
|
||||
from collections import deque
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
@@ -56,9 +52,11 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
process communication to use the xarm environment from FOWM. This is because our xarm
|
||||
environment uses newer dependencies and does not match the environment in FOWM. See
|
||||
https://github.com/huggingface/lerobot/pull/103 for implementation details.
|
||||
- We have NOT checked that training on LeRobot reproduces SOTA results. This is a TODO.
|
||||
- We have NOT checked that training on LeRobot reproduces the results from FOWM.
|
||||
- Nevertheless, we have verified that we can train TD-MPC for PushT. See
|
||||
`lerobot/configs/policy/tdmpc_pusht_keypoints.yaml`.
|
||||
- Our current xarm datasets were generated using the environment from FOWM. Therefore they do not
|
||||
match our xarm environment.
|
||||
match our xarm environment.
|
||||
"""
|
||||
|
||||
name = "tdmpc"
|
||||
@@ -74,22 +72,6 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
that they will be passed with a call to `load_state_dict` before the policy is used.
|
||||
"""
|
||||
super().__init__()
|
||||
logging.warning(
|
||||
"""
|
||||
Please note several warnings for this policy.
|
||||
|
||||
- Evaluation of pretrained weights created with the original FOWM code
|
||||
(https://github.com/fyhMer/fowm) works as expected. To be precise: we trained and evaluated a
|
||||
model with the FOWM code for the xarm_lift_medium_replay dataset. We ported the weights across
|
||||
to LeRobot, and were able to evaluate with the same success metric. BUT, we had to use inter-
|
||||
process communication to use the xarm environment from FOWM. This is because our xarm
|
||||
environment uses newer dependencies and does not match the environment in FOWM. See
|
||||
https://github.com/huggingface/lerobot/pull/103 for implementation details.
|
||||
- We have NOT checked that training on LeRobot reproduces SOTA results. This is a TODO.
|
||||
- Our current xarm datasets were generated using the environment from FOWM. Therefore they do not
|
||||
match our xarm environment.
|
||||
"""
|
||||
)
|
||||
|
||||
if config is None:
|
||||
config = TDMPCConfig()
|
||||
@@ -114,8 +96,14 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
|
||||
image_keys = [k for k in config.input_shapes if k.startswith("observation.image")]
|
||||
# Note: This check is covered in the post-init of the config but have a sanity check just in case.
|
||||
assert len(image_keys) == 1
|
||||
self.input_image_key = image_keys[0]
|
||||
self._use_image = False
|
||||
self._use_env_state = False
|
||||
if len(image_keys) > 0:
|
||||
assert len(image_keys) == 1
|
||||
self._use_image = True
|
||||
self.input_image_key = image_keys[0]
|
||||
if "observation.environment_state" in config.input_shapes:
|
||||
self._use_env_state = True
|
||||
|
||||
self.reset()
|
||||
|
||||
@@ -125,10 +113,13 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
called on `env.reset()`
|
||||
"""
|
||||
self._queues = {
|
||||
"observation.image": deque(maxlen=1),
|
||||
"observation.state": deque(maxlen=1),
|
||||
"action": deque(maxlen=self.config.n_action_repeats),
|
||||
"action": deque(maxlen=max(self.config.n_action_steps, self.config.n_action_repeats)),
|
||||
}
|
||||
if self._use_image:
|
||||
self._queues["observation.image"] = deque(maxlen=1)
|
||||
if self._use_env_state:
|
||||
self._queues["observation.environment_state"] = deque(maxlen=1)
|
||||
# Previous mean obtained from the cross-entropy method (CEM) used during MPC. It is used to warm start
|
||||
# CEM for the next step.
|
||||
self._prev_mean: torch.Tensor | None = None
|
||||
@@ -137,8 +128,9 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
def select_action(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Select a single action given environment observations."""
|
||||
batch = self.normalize_inputs(batch)
|
||||
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
|
||||
batch["observation.image"] = batch[self.input_image_key]
|
||||
if self._use_image:
|
||||
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
|
||||
batch["observation.image"] = batch[self.input_image_key]
|
||||
|
||||
self._queues = populate_queues(self._queues, batch)
|
||||
|
||||
@@ -152,49 +144,57 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
batch[key] = batch[key][:, 0]
|
||||
|
||||
# NOTE: Order of observations matters here.
|
||||
z = self.model.encode({k: batch[k] for k in ["observation.image", "observation.state"]})
|
||||
if self.config.use_mpc:
|
||||
batch_size = batch["observation.image"].shape[0]
|
||||
# Batch processing is not handled in MPC mode, so process the batch in a loop.
|
||||
action = [] # will be a batch of actions for one step
|
||||
for i in range(batch_size):
|
||||
# Note: self.plan does not handle batches, hence the squeeze.
|
||||
action.append(self.plan(z[i]))
|
||||
action = torch.stack(action)
|
||||
encode_keys = []
|
||||
if self._use_image:
|
||||
encode_keys.append("observation.image")
|
||||
if self._use_env_state:
|
||||
encode_keys.append("observation.environment_state")
|
||||
encode_keys.append("observation.state")
|
||||
z = self.model.encode({k: batch[k] for k in encode_keys})
|
||||
if self.config.use_mpc: # noqa: SIM108
|
||||
actions = self.plan(z) # (horizon, batch, action_dim)
|
||||
else:
|
||||
# Plan with the policy (π) alone.
|
||||
action = self.model.pi(z)
|
||||
# Plan with the policy (π) alone. This always returns one action so unsqueeze to get a
|
||||
# sequence dimension like in the MPC branch.
|
||||
actions = self.model.pi(z).unsqueeze(0)
|
||||
|
||||
self.unnormalize_outputs({"action": action})["action"]
|
||||
actions = torch.clamp(actions, -1, +1)
|
||||
|
||||
for _ in range(self.config.n_action_repeats):
|
||||
self._queues["action"].append(action)
|
||||
actions = self.unnormalize_outputs({"action": actions})["action"]
|
||||
|
||||
if self.config.n_action_repeats > 1:
|
||||
for _ in range(self.config.n_action_repeats):
|
||||
self._queues["action"].append(actions[0])
|
||||
else:
|
||||
# Action queue is (n_action_steps, batch_size, action_dim), so we transpose the action.
|
||||
self._queues["action"].extend(actions[: self.config.n_action_steps])
|
||||
|
||||
action = self._queues["action"].popleft()
|
||||
return torch.clamp(action, -1, 1)
|
||||
return action
|
||||
|
||||
@torch.no_grad()
|
||||
def plan(self, z: Tensor) -> Tensor:
|
||||
"""Plan next action using TD-MPC inference.
|
||||
"""Plan sequence of actions using TD-MPC inference.
|
||||
|
||||
Args:
|
||||
z: (latent_dim,) tensor for the initial state.
|
||||
z: (batch, latent_dim,) tensor for the initial state.
|
||||
Returns:
|
||||
(action_dim,) tensor for the next action.
|
||||
|
||||
TODO(alexander-soare) Extend this to be able to work with batches.
|
||||
(horizon, batch, action_dim,) tensor for the planned trajectory of actions.
|
||||
"""
|
||||
device = get_device_from_parameters(self)
|
||||
|
||||
batch_size = z.shape[0]
|
||||
|
||||
# Sample Nπ trajectories from the policy.
|
||||
pi_actions = torch.empty(
|
||||
self.config.horizon,
|
||||
self.config.n_pi_samples,
|
||||
batch_size,
|
||||
self.config.output_shapes["action"][0],
|
||||
device=device,
|
||||
)
|
||||
if self.config.n_pi_samples > 0:
|
||||
_z = einops.repeat(z, "d -> n d", n=self.config.n_pi_samples)
|
||||
_z = einops.repeat(z, "b d -> n b d", n=self.config.n_pi_samples)
|
||||
for t in range(self.config.horizon):
|
||||
# Note: Adding a small amount of noise here doesn't hurt during inference and may even be
|
||||
# helpful for CEM.
|
||||
@@ -203,12 +203,14 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
|
||||
# In the CEM loop we will need this for a call to estimate_value with the gaussian sampled
|
||||
# trajectories.
|
||||
z = einops.repeat(z, "d -> n d", n=self.config.n_gaussian_samples + self.config.n_pi_samples)
|
||||
z = einops.repeat(z, "b d -> n b d", n=self.config.n_gaussian_samples + self.config.n_pi_samples)
|
||||
|
||||
# Model Predictive Path Integral (MPPI) with the cross-entropy method (CEM) as the optimization
|
||||
# algorithm.
|
||||
# The initial mean and standard deviation for the cross-entropy method (CEM).
|
||||
mean = torch.zeros(self.config.horizon, self.config.output_shapes["action"][0], device=device)
|
||||
mean = torch.zeros(
|
||||
self.config.horizon, batch_size, self.config.output_shapes["action"][0], device=device
|
||||
)
|
||||
# Maybe warm start CEM with the mean from the previous step.
|
||||
if self._prev_mean is not None:
|
||||
mean[:-1] = self._prev_mean[1:]
|
||||
@@ -219,6 +221,7 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
std_normal_noise = torch.randn(
|
||||
self.config.horizon,
|
||||
self.config.n_gaussian_samples,
|
||||
batch_size,
|
||||
self.config.output_shapes["action"][0],
|
||||
device=std.device,
|
||||
)
|
||||
@@ -227,21 +230,24 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
# Compute elite actions.
|
||||
actions = torch.cat([gaussian_actions, pi_actions], dim=1)
|
||||
value = self.estimate_value(z, actions).nan_to_num_(0)
|
||||
elite_idxs = torch.topk(value, self.config.n_elites, dim=0).indices
|
||||
elite_value, elite_actions = value[elite_idxs], actions[:, elite_idxs]
|
||||
elite_idxs = torch.topk(value, self.config.n_elites, dim=0).indices # (n_elites, batch)
|
||||
elite_value = value.take_along_dim(elite_idxs, dim=0) # (n_elites, batch)
|
||||
# (horizon, n_elites, batch, action_dim)
|
||||
elite_actions = actions.take_along_dim(einops.rearrange(elite_idxs, "n b -> 1 n b 1"), dim=1)
|
||||
|
||||
# Update guassian PDF parameters to be the (weighted) mean and standard deviation of the elites.
|
||||
max_value = elite_value.max(0)[0]
|
||||
# Update gaussian PDF parameters to be the (weighted) mean and standard deviation of the elites.
|
||||
max_value = elite_value.max(0, keepdim=True)[0] # (1, batch)
|
||||
# The weighting is a softmax over trajectory values. Note that this is not the same as the usage
|
||||
# of Ω in eqn 4 of the TD-MPC paper. Instead it is the normalized version of it: s = Ω/ΣΩ. This
|
||||
# makes the equations: μ = Σ(s⋅Γ), σ = Σ(s⋅(Γ-μ)²).
|
||||
score = torch.exp(self.config.elite_weighting_temperature * (elite_value - max_value))
|
||||
score /= score.sum()
|
||||
_mean = torch.sum(einops.rearrange(score, "n -> n 1") * elite_actions, dim=1)
|
||||
score /= score.sum(axis=0, keepdim=True)
|
||||
# (horizon, batch, action_dim)
|
||||
_mean = torch.sum(einops.rearrange(score, "n b -> n b 1") * elite_actions, dim=1)
|
||||
_std = torch.sqrt(
|
||||
torch.sum(
|
||||
einops.rearrange(score, "n -> n 1")
|
||||
* (elite_actions - einops.rearrange(_mean, "h d -> h 1 d")) ** 2,
|
||||
einops.rearrange(score, "n b -> n b 1")
|
||||
* (elite_actions - einops.rearrange(_mean, "h b d -> h 1 b d")) ** 2,
|
||||
dim=1,
|
||||
)
|
||||
)
|
||||
@@ -256,11 +262,9 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
|
||||
# Randomly select one of the elite actions from the last iteration of MPPI/CEM using the softmax
|
||||
# scores from the last iteration.
|
||||
actions = elite_actions[:, torch.multinomial(score, 1).item()]
|
||||
actions = elite_actions[:, torch.multinomial(score.T, 1).squeeze(), torch.arange(batch_size)]
|
||||
|
||||
# Select only the first action
|
||||
action = actions[0]
|
||||
return action
|
||||
return actions
|
||||
|
||||
@torch.no_grad()
|
||||
def estimate_value(self, z: Tensor, actions: Tensor):
|
||||
@@ -312,13 +316,17 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
G -= running_discount * self.config.uncertainty_regularizer_coeff * terminal_values.std(0)
|
||||
return G
|
||||
|
||||
def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
|
||||
"""Run the batch through the model and compute the loss."""
|
||||
def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]:
|
||||
"""Run the batch through the model and compute the loss.
|
||||
|
||||
Returns a dictionary with loss as a tensor, and other information as native floats.
|
||||
"""
|
||||
device = get_device_from_parameters(self)
|
||||
|
||||
batch = self.normalize_inputs(batch)
|
||||
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
|
||||
batch["observation.image"] = batch[self.input_image_key]
|
||||
if self._use_image:
|
||||
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
|
||||
batch["observation.image"] = batch[self.input_image_key]
|
||||
batch = self.normalize_targets(batch)
|
||||
|
||||
info = {}
|
||||
@@ -328,12 +336,12 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
if batch[key].ndim > 1:
|
||||
batch[key] = batch[key].transpose(1, 0)
|
||||
|
||||
action = batch["action"] # (t, b)
|
||||
reward = batch["next.reward"] # (t,)
|
||||
action = batch["action"] # (t, b, action_dim)
|
||||
reward = batch["next.reward"] # (t, b)
|
||||
observations = {k: v for k, v in batch.items() if k.startswith("observation.")}
|
||||
|
||||
# Apply random image augmentations.
|
||||
if self.config.max_random_shift_ratio > 0:
|
||||
if self._use_image and self.config.max_random_shift_ratio > 0:
|
||||
observations["observation.image"] = flatten_forward_unflatten(
|
||||
partial(random_shifts_aug, max_random_shift_ratio=self.config.max_random_shift_ratio),
|
||||
observations["observation.image"],
|
||||
@@ -345,7 +353,9 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
for k in observations:
|
||||
current_observation[k] = observations[k][0]
|
||||
next_observations[k] = observations[k][1:]
|
||||
horizon = next_observations["observation.image"].shape[0]
|
||||
horizon, batch_size = next_observations[
|
||||
"observation.image" if self._use_image else "observation.environment_state"
|
||||
].shape[:2]
|
||||
|
||||
# Run latent rollout using the latent dynamics model and policy model.
|
||||
# Note this has shape `horizon+1` because there are `horizon` actions and a current `z`. Each action
|
||||
@@ -415,7 +425,8 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
# Compute state-action value loss (TD loss) for all of the Q functions in the ensemble.
|
||||
q_value_loss = (
|
||||
(
|
||||
F.mse_loss(
|
||||
temporal_loss_coeffs
|
||||
* F.mse_loss(
|
||||
q_preds_ensemble,
|
||||
einops.repeat(q_targets, "t b -> e t b", e=q_preds_ensemble.shape[0]),
|
||||
reduction="none",
|
||||
@@ -464,10 +475,11 @@ class TDMPCPolicy(nn.Module, PyTorchModelHubMixin):
|
||||
action_preds = self.model.pi(z_preds[:-1]) # (t, b, a)
|
||||
# Calculate the MSE between the actions and the action predictions.
|
||||
# Note: FOWM's original code calculates the log probability (wrt to a unit standard deviation
|
||||
# gaussian) and sums over the action dimension. Computing the log probability amounts to multiplying
|
||||
# the MSE by 0.5 and adding a constant offset (the log(2*pi) term) . Here we drop the constant offset
|
||||
# as it doesn't change the optimization step, and we drop the 0.5 as we instead make a configuration
|
||||
# parameter for it (see below where we compute the total loss).
|
||||
# gaussian) and sums over the action dimension. Computing the (negative) log probability amounts to
|
||||
# multiplying the MSE by 0.5 and adding a constant offset (the log(2*pi)/2 term, times the action
|
||||
# dimension). Here we drop the constant offset as it doesn't change the optimization step, and we drop
|
||||
# the 0.5 as we instead make a configuration parameter for it (see below where we compute the total
|
||||
# loss).
|
||||
mse = F.mse_loss(action_preds, action, reduction="none").sum(-1) # (t, b)
|
||||
# NOTE: The original implementation does not take the sum over the temporal dimension like with the
|
||||
# other losses.
|
||||
@@ -728,6 +740,16 @@ class TDMPCObservationEncoder(nn.Module):
|
||||
nn.LayerNorm(config.latent_dim),
|
||||
nn.Sigmoid(),
|
||||
)
|
||||
if "observation.environment_state" in config.input_shapes:
|
||||
self.env_state_enc_layers = nn.Sequential(
|
||||
nn.Linear(
|
||||
config.input_shapes["observation.environment_state"][0], config.state_encoder_hidden_dim
|
||||
),
|
||||
nn.ELU(),
|
||||
nn.Linear(config.state_encoder_hidden_dim, config.latent_dim),
|
||||
nn.LayerNorm(config.latent_dim),
|
||||
nn.Sigmoid(),
|
||||
)
|
||||
|
||||
def forward(self, obs_dict: dict[str, Tensor]) -> Tensor:
|
||||
"""Encode the image and/or state vector.
|
||||
@@ -736,8 +758,11 @@ class TDMPCObservationEncoder(nn.Module):
|
||||
over all features.
|
||||
"""
|
||||
feat = []
|
||||
# NOTE: Order of observations matters here.
|
||||
if "observation.image" in self.config.input_shapes:
|
||||
feat.append(flatten_forward_unflatten(self.image_enc_layers, obs_dict["observation.image"]))
|
||||
if "observation.environment_state" in self.config.input_shapes:
|
||||
feat.append(self.env_state_enc_layers(obs_dict["observation.environment_state"]))
|
||||
if "observation.state" in self.config.input_shapes:
|
||||
feat.append(self.state_enc_layers(obs_dict["observation.state"]))
|
||||
return torch.stack(feat, dim=0).mean(0)
|
||||
|
||||
Reference in New Issue
Block a user