forked from tangger/lerobot
Remove offline training, refactor train.py and logging/checkpointing (#670)
Co-authored-by: Remi <remi.cadene@huggingface.co>
This commit is contained in:
@@ -13,10 +13,16 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import TypeVar
|
||||
|
||||
import imageio
|
||||
|
||||
JsonLike = str | int | float | bool | None | list["JsonLike"] | dict[str, "JsonLike"] | tuple["JsonLike", ...]
|
||||
T = TypeVar("T", bound=JsonLike)
|
||||
|
||||
|
||||
def write_video(video_path, stacked_frames, fps):
|
||||
# Filter out DeprecationWarnings raised from pkg_resources
|
||||
@@ -25,3 +31,81 @@ def write_video(video_path, stacked_frames, fps):
|
||||
"ignore", "pkg_resources is deprecated as an API", category=DeprecationWarning
|
||||
)
|
||||
imageio.mimsave(video_path, stacked_frames, fps=fps)
|
||||
|
||||
|
||||
def deserialize_json_into_object(fpath: Path, obj: T) -> T:
|
||||
"""
|
||||
Loads the JSON data from `fpath` and recursively fills `obj` with the
|
||||
corresponding values (strictly matching structure and types).
|
||||
Tuples in `obj` are expected to be lists in the JSON data, which will be
|
||||
converted back into tuples.
|
||||
"""
|
||||
with open(fpath, encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
|
||||
def _deserialize(target, source):
|
||||
"""
|
||||
Recursively overwrite the structure in `target` with data from `source`,
|
||||
performing strict checks on structure and type.
|
||||
Returns the updated version of `target` (especially important for tuples).
|
||||
"""
|
||||
|
||||
# If the target is a dictionary, source must be a dictionary as well.
|
||||
if isinstance(target, dict):
|
||||
if not isinstance(source, dict):
|
||||
raise TypeError(f"Type mismatch: expected dict, got {type(source)}")
|
||||
|
||||
# Check that they have exactly the same set of keys.
|
||||
if target.keys() != source.keys():
|
||||
raise ValueError(
|
||||
f"Dictionary keys do not match.\n" f"Expected: {target.keys()}, got: {source.keys()}"
|
||||
)
|
||||
|
||||
# Recursively update each key.
|
||||
for k in target:
|
||||
target[k] = _deserialize(target[k], source[k])
|
||||
|
||||
return target
|
||||
|
||||
# If the target is a list, source must be a list as well.
|
||||
elif isinstance(target, list):
|
||||
if not isinstance(source, list):
|
||||
raise TypeError(f"Type mismatch: expected list, got {type(source)}")
|
||||
|
||||
# Check length
|
||||
if len(target) != len(source):
|
||||
raise ValueError(f"List length mismatch: expected {len(target)}, got {len(source)}")
|
||||
|
||||
# Recursively update each element.
|
||||
for i in range(len(target)):
|
||||
target[i] = _deserialize(target[i], source[i])
|
||||
|
||||
return target
|
||||
|
||||
# If the target is a tuple, the source must be a list in JSON,
|
||||
# which we'll convert back to a tuple.
|
||||
elif isinstance(target, tuple):
|
||||
if not isinstance(source, list):
|
||||
raise TypeError(f"Type mismatch: expected list (for tuple), got {type(source)}")
|
||||
|
||||
if len(target) != len(source):
|
||||
raise ValueError(f"Tuple length mismatch: expected {len(target)}, got {len(source)}")
|
||||
|
||||
# Convert each element, forming a new tuple.
|
||||
converted_items = []
|
||||
for t_item, s_item in zip(target, source, strict=False):
|
||||
converted_items.append(_deserialize(t_item, s_item))
|
||||
|
||||
# Return a brand new tuple (tuples are immutable in Python).
|
||||
return tuple(converted_items)
|
||||
|
||||
# Otherwise, we're dealing with a "primitive" (int, float, str, bool, None).
|
||||
else:
|
||||
# Check the exact type. If these must match 1:1, do:
|
||||
if type(target) is not type(source):
|
||||
raise TypeError(f"Type mismatch: expected {type(target)}, got {type(source)}")
|
||||
return source
|
||||
|
||||
# Perform the in-place/recursive deserialization
|
||||
updated_obj = _deserialize(obj, data)
|
||||
return updated_obj
|
||||
|
||||
163
lerobot/common/utils/logging_utils.py
Normal file
163
lerobot/common/utils/logging_utils.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Any
|
||||
|
||||
from lerobot.common.utils.utils import format_big_number
|
||||
|
||||
|
||||
class AverageMeter:
|
||||
"""
|
||||
Computes and stores the average and current value
|
||||
Adapted from https://github.com/pytorch/examples/blob/main/imagenet/main.py
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, fmt: str = ":f"):
|
||||
self.name = name
|
||||
self.fmt = fmt
|
||||
self.reset()
|
||||
|
||||
def reset(self) -> None:
|
||||
self.val = 0.0
|
||||
self.avg = 0.0
|
||||
self.sum = 0.0
|
||||
self.count = 0.0
|
||||
|
||||
def update(self, val: float, n: int = 1) -> None:
|
||||
self.val = val
|
||||
self.sum += val * n
|
||||
self.count += n
|
||||
self.avg = self.sum / self.count
|
||||
|
||||
def __str__(self):
|
||||
fmtstr = "{name}:{avg" + self.fmt + "}"
|
||||
return fmtstr.format(**self.__dict__)
|
||||
|
||||
|
||||
class MetricsTracker:
|
||||
"""
|
||||
A helper class to track and log metrics over time.
|
||||
|
||||
Usage pattern:
|
||||
|
||||
```python
|
||||
# initialize, potentially with non-zero initial step (e.g. if resuming run)
|
||||
metrics = {"loss": AverageMeter("loss", ":.3f")}
|
||||
train_metrics = MetricsTracker(cfg, dataset, metrics, initial_step=step)
|
||||
|
||||
# update metrics derived from step (samples, episodes, epochs) at each training step
|
||||
train_metrics.step()
|
||||
|
||||
# update various metrics
|
||||
loss = policy.forward(batch)
|
||||
train_metrics.loss = loss
|
||||
|
||||
# display current metrics
|
||||
logging.info(train_metrics)
|
||||
|
||||
# export for wandb
|
||||
wandb.log(train_metrics.to_dict())
|
||||
|
||||
# reset averages after logging
|
||||
train_metrics.reset_averages()
|
||||
```
|
||||
"""
|
||||
|
||||
__keys__ = [
|
||||
"_batch_size",
|
||||
"_num_frames",
|
||||
"_avg_samples_per_ep",
|
||||
"metrics",
|
||||
"steps",
|
||||
"samples",
|
||||
"episodes",
|
||||
"epochs",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
batch_size: int,
|
||||
num_frames: int,
|
||||
num_episodes: int,
|
||||
metrics: dict[str, AverageMeter],
|
||||
initial_step: int = 0,
|
||||
):
|
||||
self.__dict__.update({k: None for k in self.__keys__})
|
||||
self._batch_size = batch_size
|
||||
self._num_frames = num_frames
|
||||
self._avg_samples_per_ep = num_frames / num_episodes
|
||||
self.metrics = metrics
|
||||
|
||||
self.steps = initial_step
|
||||
# A sample is an (observation,action) pair, where observation and action
|
||||
# can be on multiple timestamps. In a batch, we have `batch_size` number of samples.
|
||||
self.samples = self.steps * self._batch_size
|
||||
self.episodes = self.samples / self._avg_samples_per_ep
|
||||
self.epochs = self.samples / self._num_frames
|
||||
|
||||
def __getattr__(self, name: str) -> int | dict[str, AverageMeter] | AverageMeter | Any:
|
||||
if name in self.__dict__:
|
||||
return self.__dict__[name]
|
||||
elif name in self.metrics:
|
||||
return self.metrics[name]
|
||||
else:
|
||||
raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
|
||||
|
||||
def __setattr__(self, name: str, value: Any) -> None:
|
||||
if name in self.__dict__:
|
||||
super().__setattr__(name, value)
|
||||
elif name in self.metrics:
|
||||
self.metrics[name].update(value)
|
||||
else:
|
||||
raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
|
||||
|
||||
def step(self) -> None:
|
||||
"""
|
||||
Updates metrics that depend on 'step' for one step.
|
||||
"""
|
||||
self.steps += 1
|
||||
self.samples += self._batch_size
|
||||
self.episodes = self.samples / self._avg_samples_per_ep
|
||||
self.epochs = self.samples / self._num_frames
|
||||
|
||||
def __str__(self) -> str:
|
||||
display_list = [
|
||||
f"step:{format_big_number(self.steps)}",
|
||||
# number of samples seen during training
|
||||
f"smpl:{format_big_number(self.samples)}",
|
||||
# number of episodes seen during training
|
||||
f"ep:{format_big_number(self.episodes)}",
|
||||
# number of time all unique samples are seen
|
||||
f"epch:{self.epochs:.2f}",
|
||||
*[str(m) for m in self.metrics.values()],
|
||||
]
|
||||
return " ".join(display_list)
|
||||
|
||||
def to_dict(self, use_avg: bool = True) -> dict[str, int | float]:
|
||||
"""
|
||||
Returns the current metric values (or averages if `use_avg=True`) as a dict.
|
||||
"""
|
||||
return {
|
||||
"steps": self.steps,
|
||||
"samples": self.samples,
|
||||
"episodes": self.episodes,
|
||||
"epochs": self.epochs,
|
||||
**{k: m.avg if use_avg else m.val for k, m in self.metrics.items()},
|
||||
}
|
||||
|
||||
def reset_averages(self) -> None:
|
||||
"""Resets average meters."""
|
||||
for m in self.metrics.values():
|
||||
m.reset()
|
||||
191
lerobot/common/utils/random_utils.py
Normal file
191
lerobot/common/utils/random_utils.py
Normal file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import random
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any, Generator
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from safetensors.torch import load_file, save_file
|
||||
|
||||
from lerobot.common.constants import RNG_STATE
|
||||
from lerobot.common.datasets.utils import flatten_dict, unflatten_dict
|
||||
|
||||
|
||||
def serialize_python_rng_state() -> dict[str, torch.Tensor]:
|
||||
"""
|
||||
Returns the rng state for `random` in the form of a flat dict[str, torch.Tensor] to be saved using
|
||||
`safetensors.save_file()` or `torch.save()`.
|
||||
"""
|
||||
py_state = random.getstate()
|
||||
return {
|
||||
"py_rng_version": torch.tensor([py_state[0]], dtype=torch.int64),
|
||||
"py_rng_state": torch.tensor(py_state[1], dtype=torch.int64),
|
||||
}
|
||||
|
||||
|
||||
def deserialize_python_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
|
||||
"""
|
||||
Restores the rng state for `random` from a dictionary produced by `serialize_python_rng_state()`.
|
||||
"""
|
||||
py_state = (rng_state_dict["py_rng_version"].item(), tuple(rng_state_dict["py_rng_state"].tolist()), None)
|
||||
random.setstate(py_state)
|
||||
|
||||
|
||||
def serialize_numpy_rng_state() -> dict[str, torch.Tensor]:
|
||||
"""
|
||||
Returns the rng state for `numpy` in the form of a flat dict[str, torch.Tensor] to be saved using
|
||||
`safetensors.save_file()` or `torch.save()`.
|
||||
"""
|
||||
np_state = np.random.get_state()
|
||||
# Ensure no breaking changes from numpy
|
||||
assert np_state[0] == "MT19937"
|
||||
return {
|
||||
"np_rng_state_values": torch.tensor(np_state[1], dtype=torch.int64),
|
||||
"np_rng_state_index": torch.tensor([np_state[2]], dtype=torch.int64),
|
||||
"np_rng_has_gauss": torch.tensor([np_state[3]], dtype=torch.int64),
|
||||
"np_rng_cached_gaussian": torch.tensor([np_state[4]], dtype=torch.float32),
|
||||
}
|
||||
|
||||
|
||||
def deserialize_numpy_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
|
||||
"""
|
||||
Restores the rng state for `numpy` from a dictionary produced by `serialize_numpy_rng_state()`.
|
||||
"""
|
||||
np_state = (
|
||||
"MT19937",
|
||||
rng_state_dict["np_rng_state_values"].numpy(),
|
||||
rng_state_dict["np_rng_state_index"].item(),
|
||||
rng_state_dict["np_rng_has_gauss"].item(),
|
||||
rng_state_dict["np_rng_cached_gaussian"].item(),
|
||||
)
|
||||
np.random.set_state(np_state)
|
||||
|
||||
|
||||
def serialize_torch_rng_state() -> dict[str, torch.Tensor]:
|
||||
"""
|
||||
Returns the rng state for `torch` in the form of a flat dict[str, torch.Tensor] to be saved using
|
||||
`safetensors.save_file()` or `torch.save()`.
|
||||
"""
|
||||
torch_rng_state_dict = {"torch_rng_state": torch.get_rng_state()}
|
||||
if torch.cuda.is_available():
|
||||
torch_rng_state_dict["torch_cuda_rng_state"] = torch.cuda.get_rng_state()
|
||||
return torch_rng_state_dict
|
||||
|
||||
|
||||
def deserialize_torch_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
|
||||
"""
|
||||
Restores the rng state for `torch` from a dictionary produced by `serialize_torch_rng_state()`.
|
||||
"""
|
||||
torch.set_rng_state(rng_state_dict["torch_rng_state"])
|
||||
if torch.cuda.is_available() and "torch_cuda_rng_state" in rng_state_dict:
|
||||
torch.cuda.set_rng_state(rng_state_dict["torch_cuda_rng_state"])
|
||||
|
||||
|
||||
def serialize_rng_state() -> dict[str, torch.Tensor]:
|
||||
"""
|
||||
Returns the rng state for `random`, `numpy`, and `torch`, in the form of a flat
|
||||
dict[str, torch.Tensor] to be saved using `safetensors.save_file()` `torch.save()`.
|
||||
"""
|
||||
py_rng_state_dict = serialize_python_rng_state()
|
||||
np_rng_state_dict = serialize_numpy_rng_state()
|
||||
torch_rng_state_dict = serialize_torch_rng_state()
|
||||
|
||||
return {
|
||||
**py_rng_state_dict,
|
||||
**np_rng_state_dict,
|
||||
**torch_rng_state_dict,
|
||||
}
|
||||
|
||||
|
||||
def deserialize_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
|
||||
"""
|
||||
Restores the rng state for `random`, `numpy`, and `torch` from a dictionary produced by
|
||||
`serialize_rng_state()`.
|
||||
"""
|
||||
py_rng_state_dict = {k: v for k, v in rng_state_dict.items() if k.startswith("py")}
|
||||
np_rng_state_dict = {k: v for k, v in rng_state_dict.items() if k.startswith("np")}
|
||||
torch_rng_state_dict = {k: v for k, v in rng_state_dict.items() if k.startswith("torch")}
|
||||
|
||||
deserialize_python_rng_state(py_rng_state_dict)
|
||||
deserialize_numpy_rng_state(np_rng_state_dict)
|
||||
deserialize_torch_rng_state(torch_rng_state_dict)
|
||||
|
||||
|
||||
def save_rng_state(save_dir: Path) -> None:
|
||||
rng_state_dict = serialize_rng_state()
|
||||
flat_rng_state_dict = flatten_dict(rng_state_dict)
|
||||
save_file(flat_rng_state_dict, save_dir / RNG_STATE)
|
||||
|
||||
|
||||
def load_rng_state(save_dir: Path) -> None:
|
||||
flat_rng_state_dict = load_file(save_dir / RNG_STATE)
|
||||
rng_state_dict = unflatten_dict(flat_rng_state_dict)
|
||||
deserialize_rng_state(rng_state_dict)
|
||||
|
||||
|
||||
def get_rng_state() -> dict[str, Any]:
|
||||
"""Get the random state for `random`, `numpy`, and `torch`."""
|
||||
random_state_dict = {
|
||||
"random_state": random.getstate(),
|
||||
"numpy_random_state": np.random.get_state(),
|
||||
"torch_random_state": torch.random.get_rng_state(),
|
||||
}
|
||||
if torch.cuda.is_available():
|
||||
random_state_dict["torch_cuda_random_state"] = torch.cuda.random.get_rng_state()
|
||||
return random_state_dict
|
||||
|
||||
|
||||
def set_rng_state(random_state_dict: dict[str, Any]):
|
||||
"""Set the random state for `random`, `numpy`, and `torch`.
|
||||
|
||||
Args:
|
||||
random_state_dict: A dictionary of the form returned by `get_rng_state`.
|
||||
"""
|
||||
random.setstate(random_state_dict["random_state"])
|
||||
np.random.set_state(random_state_dict["numpy_random_state"])
|
||||
torch.random.set_rng_state(random_state_dict["torch_random_state"])
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.random.set_rng_state(random_state_dict["torch_cuda_random_state"])
|
||||
|
||||
|
||||
def set_seed(seed) -> None:
|
||||
"""Set seed for reproducibility."""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def seeded_context(seed: int) -> Generator[None, None, None]:
|
||||
"""Set the seed when entering a context, and restore the prior random state at exit.
|
||||
|
||||
Example usage:
|
||||
|
||||
```
|
||||
a = random.random() # produces some random number
|
||||
with seeded_context(1337):
|
||||
b = random.random() # produces some other random number
|
||||
c = random.random() # produces yet another random number, but the same it would have if we never made `b`
|
||||
```
|
||||
"""
|
||||
random_state_dict = get_rng_state()
|
||||
set_seed(seed)
|
||||
yield None
|
||||
set_rng_state(random_state_dict)
|
||||
161
lerobot/common/utils/train_utils.py
Normal file
161
lerobot/common/utils/train_utils.py
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from termcolor import colored
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import LRScheduler
|
||||
|
||||
from lerobot.common.constants import (
|
||||
CHECKPOINTS_DIR,
|
||||
LAST_CHECKPOINT_LINK,
|
||||
PRETRAINED_MODEL_DIR,
|
||||
TRAINING_STATE_DIR,
|
||||
TRAINING_STEP,
|
||||
)
|
||||
from lerobot.common.datasets.utils import load_json, write_json
|
||||
from lerobot.common.optim.optimizers import load_optimizer_state, save_optimizer_state
|
||||
from lerobot.common.optim.schedulers import load_scheduler_state, save_scheduler_state
|
||||
from lerobot.common.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.common.utils.random_utils import load_rng_state, save_rng_state
|
||||
from lerobot.configs.train import TrainPipelineConfig
|
||||
|
||||
|
||||
def log_output_dir(out_dir):
|
||||
logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {out_dir}")
|
||||
|
||||
|
||||
def get_step_identifier(step: int, total_steps: int) -> str:
|
||||
num_digits = max(6, len(str(total_steps)))
|
||||
return f"{step:0{num_digits}d}"
|
||||
|
||||
|
||||
def get_step_checkpoint_dir(output_dir: Path, total_steps: int, step: int) -> Path:
|
||||
"""Returns the checkpoint sub-directory corresponding to the step number."""
|
||||
step_identifier = get_step_identifier(step, total_steps)
|
||||
return output_dir / CHECKPOINTS_DIR / step_identifier
|
||||
|
||||
|
||||
def save_training_step(step: int, save_dir: Path) -> None:
|
||||
write_json({"step": step}, save_dir / TRAINING_STEP)
|
||||
|
||||
|
||||
def load_training_step(save_dir: Path) -> int:
|
||||
training_step = load_json(save_dir / TRAINING_STEP)
|
||||
return training_step["step"]
|
||||
|
||||
|
||||
def update_last_checkpoint(checkpoint_dir: Path) -> Path:
|
||||
last_checkpoint_dir = checkpoint_dir.parent / LAST_CHECKPOINT_LINK
|
||||
if last_checkpoint_dir.is_symlink():
|
||||
last_checkpoint_dir.unlink()
|
||||
relative_target = checkpoint_dir.relative_to(checkpoint_dir.parent)
|
||||
last_checkpoint_dir.symlink_to(relative_target)
|
||||
|
||||
|
||||
def save_checkpoint(
|
||||
checkpoint_dir: Path,
|
||||
step: int,
|
||||
cfg: TrainPipelineConfig,
|
||||
policy: PreTrainedPolicy,
|
||||
optimizer: Optimizer,
|
||||
scheduler: LRScheduler | None = None,
|
||||
) -> None:
|
||||
"""This function creates the following directory structure:
|
||||
|
||||
005000/ # training step at checkpoint
|
||||
├── pretrained_model/
|
||||
│ ├── config.json # policy config
|
||||
│ ├── model.safetensors # policy weights
|
||||
│ └── train_config.json # train config
|
||||
└── training_state/
|
||||
├── optimizer_param_groups.json # optimizer param groups
|
||||
├── optimizer_state.safetensors # optimizer state
|
||||
├── rng_state.safetensors # rng states
|
||||
├── scheduler_state.json # scheduler state
|
||||
└── training_step.json # training step
|
||||
|
||||
Args:
|
||||
cfg (TrainPipelineConfig): The training config used for this run.
|
||||
step (int): The training step at that checkpoint.
|
||||
policy (PreTrainedPolicy): The policy to save.
|
||||
optimizer (Optimizer | None, optional): The optimizer to save the state from. Defaults to None.
|
||||
scheduler (LRScheduler | None, optional): The scheduler to save the state from. Defaults to None.
|
||||
"""
|
||||
pretrained_dir = checkpoint_dir / PRETRAINED_MODEL_DIR
|
||||
policy.save_pretrained(pretrained_dir)
|
||||
cfg.save_pretrained(pretrained_dir)
|
||||
save_training_state(checkpoint_dir, step, optimizer, scheduler)
|
||||
|
||||
|
||||
def save_training_state(
|
||||
checkpoint_dir: Path,
|
||||
train_step: int,
|
||||
optimizer: Optimizer | None = None,
|
||||
scheduler: LRScheduler | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Saves the training step, optimizer state, scheduler state, and rng state.
|
||||
|
||||
Args:
|
||||
save_dir (Path): The directory to save artifacts to.
|
||||
train_step (int): Current training step.
|
||||
optimizer (Optimizer | None, optional): The optimizer from which to save the state_dict.
|
||||
Defaults to None.
|
||||
scheduler (LRScheduler | None, optional): The scheduler from which to save the state_dict.
|
||||
Defaults to None.
|
||||
"""
|
||||
save_dir = checkpoint_dir / TRAINING_STATE_DIR
|
||||
save_dir.mkdir(parents=True, exist_ok=True)
|
||||
save_training_step(train_step, save_dir)
|
||||
save_rng_state(save_dir)
|
||||
if optimizer is not None:
|
||||
save_optimizer_state(optimizer, save_dir)
|
||||
if scheduler is not None:
|
||||
save_scheduler_state(scheduler, save_dir)
|
||||
|
||||
|
||||
def load_training_state(
|
||||
checkpoint_dir: Path, optimizer: Optimizer, scheduler: LRScheduler | None
|
||||
) -> tuple[int, Optimizer, LRScheduler | None]:
|
||||
"""
|
||||
Loads the training step, optimizer state, scheduler state, and rng state.
|
||||
This is used to resume a training run.
|
||||
|
||||
Args:
|
||||
checkpoint_dir (Path): The checkpoint directory. Should contain a 'training_state' dir.
|
||||
optimizer (Optimizer): The optimizer to load the state_dict to.
|
||||
scheduler (LRScheduler | None): The scheduler to load the state_dict to (can be None).
|
||||
|
||||
Raises:
|
||||
NotADirectoryError: If 'checkpoint_dir' doesn't contain a 'training_state' dir
|
||||
|
||||
Returns:
|
||||
tuple[int, Optimizer, LRScheduler | None]: training step, optimizer and scheduler with their
|
||||
state_dict loaded.
|
||||
"""
|
||||
training_state_dir = checkpoint_dir / TRAINING_STATE_DIR
|
||||
if not training_state_dir.is_dir():
|
||||
raise NotADirectoryError(training_state_dir)
|
||||
|
||||
load_rng_state(training_state_dir)
|
||||
step = load_training_step(training_state_dir)
|
||||
optimizer = load_optimizer_state(optimizer, training_state_dir)
|
||||
if scheduler is not None:
|
||||
scheduler = load_scheduler_state(scheduler, training_state_dir)
|
||||
|
||||
return step, optimizer, scheduler
|
||||
@@ -17,14 +17,10 @@ import logging
|
||||
import os
|
||||
import os.path as osp
|
||||
import platform
|
||||
import random
|
||||
from contextlib import contextmanager
|
||||
from copy import copy
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Generator
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
@@ -106,59 +102,6 @@ def is_amp_available(device: str):
|
||||
raise ValueError(f"Unknown device '{device}.")
|
||||
|
||||
|
||||
def get_global_random_state() -> dict[str, Any]:
|
||||
"""Get the random state for `random`, `numpy`, and `torch`."""
|
||||
random_state_dict = {
|
||||
"random_state": random.getstate(),
|
||||
"numpy_random_state": np.random.get_state(),
|
||||
"torch_random_state": torch.random.get_rng_state(),
|
||||
}
|
||||
if torch.cuda.is_available():
|
||||
random_state_dict["torch_cuda_random_state"] = torch.cuda.random.get_rng_state()
|
||||
return random_state_dict
|
||||
|
||||
|
||||
def set_global_random_state(random_state_dict: dict[str, Any]):
|
||||
"""Set the random state for `random`, `numpy`, and `torch`.
|
||||
|
||||
Args:
|
||||
random_state_dict: A dictionary of the form returned by `get_global_random_state`.
|
||||
"""
|
||||
random.setstate(random_state_dict["random_state"])
|
||||
np.random.set_state(random_state_dict["numpy_random_state"])
|
||||
torch.random.set_rng_state(random_state_dict["torch_random_state"])
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.random.set_rng_state(random_state_dict["torch_cuda_random_state"])
|
||||
|
||||
|
||||
def set_global_seed(seed):
|
||||
"""Set seed for reproducibility."""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def seeded_context(seed: int) -> Generator[None, None, None]:
|
||||
"""Set the seed when entering a context, and restore the prior random state at exit.
|
||||
|
||||
Example usage:
|
||||
|
||||
```
|
||||
a = random.random() # produces some random number
|
||||
with seeded_context(1337):
|
||||
b = random.random() # produces some other random number
|
||||
c = random.random() # produces yet another random number, but the same it would have if we never made `b`
|
||||
```
|
||||
"""
|
||||
random_state_dict = get_global_random_state()
|
||||
set_global_seed(seed)
|
||||
yield None
|
||||
set_global_random_state(random_state_dict)
|
||||
|
||||
|
||||
def init_logging():
|
||||
def custom_format(record):
|
||||
dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
121
lerobot/common/utils/wandb_utils.py
Normal file
121
lerobot/common/utils/wandb_utils.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
|
||||
from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE
|
||||
from termcolor import colored
|
||||
|
||||
from lerobot.common.constants import PRETRAINED_MODEL_DIR
|
||||
from lerobot.configs.train import TrainPipelineConfig
|
||||
|
||||
|
||||
def cfg_to_group(cfg: TrainPipelineConfig, return_list: bool = False) -> list[str] | str:
|
||||
"""Return a group name for logging. Optionally returns group name as list."""
|
||||
lst = [
|
||||
f"policy:{cfg.policy.type}",
|
||||
f"dataset:{cfg.dataset.repo_id}",
|
||||
f"seed:{cfg.seed}",
|
||||
]
|
||||
if cfg.env is not None:
|
||||
lst.append(f"env:{cfg.env.type}")
|
||||
return lst if return_list else "-".join(lst)
|
||||
|
||||
|
||||
def get_wandb_run_id_from_filesystem(log_dir: Path) -> str:
|
||||
# Get the WandB run ID.
|
||||
paths = glob(str(log_dir / "wandb/latest-run/run-*"))
|
||||
if len(paths) != 1:
|
||||
raise RuntimeError("Couldn't get the previous WandB run ID for run resumption.")
|
||||
match = re.search(r"run-([^\.]+).wandb", paths[0].split("/")[-1])
|
||||
if match is None:
|
||||
raise RuntimeError("Couldn't get the previous WandB run ID for run resumption.")
|
||||
wandb_run_id = match.groups(0)[0]
|
||||
return wandb_run_id
|
||||
|
||||
|
||||
def get_safe_wandb_artifact_name(name: str):
|
||||
"""WandB artifacts don't accept ":" or "/" in their name."""
|
||||
return name.replace(":", "_").replace("/", "_")
|
||||
|
||||
|
||||
class WandBLogger:
|
||||
"""A helper class to log object using wandb."""
|
||||
|
||||
def __init__(self, cfg: TrainPipelineConfig):
|
||||
self.cfg = cfg.wandb
|
||||
self.log_dir = cfg.output_dir
|
||||
self.job_name = cfg.job_name
|
||||
self.env_fps = cfg.env.fps if cfg.env else None
|
||||
self._group = cfg_to_group(cfg)
|
||||
|
||||
# Set up WandB.
|
||||
os.environ["WANDB_SILENT"] = "True"
|
||||
import wandb
|
||||
|
||||
wandb_run_id = get_wandb_run_id_from_filesystem(self.log_dir) if cfg.resume else None
|
||||
wandb.init(
|
||||
id=wandb_run_id,
|
||||
project=self.cfg.project,
|
||||
entity=self.cfg.entity,
|
||||
name=self.job_name,
|
||||
notes=self.cfg.notes,
|
||||
tags=cfg_to_group(cfg, return_list=True),
|
||||
dir=self.log_dir,
|
||||
config=cfg.to_dict(),
|
||||
# TODO(rcadene): try set to True
|
||||
save_code=False,
|
||||
# TODO(rcadene): split train and eval, and run async eval with job_type="eval"
|
||||
job_type="train_eval",
|
||||
resume="must" if cfg.resume else None,
|
||||
)
|
||||
print(colored("Logs will be synced with wandb.", "blue", attrs=["bold"]))
|
||||
logging.info(f"Track this run --> {colored(wandb.run.get_url(), 'yellow', attrs=['bold'])}")
|
||||
self._wandb = wandb
|
||||
|
||||
def log_policy(self, checkpoint_dir: Path):
|
||||
"""Checkpoints the policy to wandb."""
|
||||
if self.cfg.disable_artifact:
|
||||
return
|
||||
|
||||
step_id = checkpoint_dir.name
|
||||
artifact_name = f"{self._group}-{step_id}"
|
||||
artifact_name = get_safe_wandb_artifact_name(artifact_name)
|
||||
artifact = self._wandb.Artifact(artifact_name, type="model")
|
||||
artifact.add_file(checkpoint_dir / PRETRAINED_MODEL_DIR / SAFETENSORS_SINGLE_FILE)
|
||||
self._wandb.log_artifact(artifact)
|
||||
|
||||
def log_dict(self, d: dict, step: int, mode: str = "train"):
|
||||
if mode in {"train", "eval"}:
|
||||
raise ValueError(mode)
|
||||
|
||||
for k, v in d.items():
|
||||
if not isinstance(v, (int, float, str)):
|
||||
logging.warning(
|
||||
f'WandB logging of key "{k}" was ignored as its type is not handled by this wrapper.'
|
||||
)
|
||||
continue
|
||||
self._wandb.log({f"{mode}/{k}": v}, step=step)
|
||||
|
||||
def log_video(self, video_path: str, step: int, mode: str = "train"):
|
||||
if mode in {"train", "eval"}:
|
||||
raise ValueError(mode)
|
||||
|
||||
wandb_video = self._wandb.Video(video_path, fps=self.env_fps, format="mp4")
|
||||
self._wandb.log({f"{mode}/video": wandb_video}, step=step)
|
||||
Reference in New Issue
Block a user