Remove offline training, refactor train.py and logging/checkpointing (#670)

Co-authored-by: Remi <remi.cadene@huggingface.co>
2025-02-11 10:36:06 +01:00
parent 334deb985d
commit 90e099b39f
40 changed files with 1515 additions and 935 deletions
--- a/lerobot/common/utils/io_utils.py
+++ b/lerobot/common/utils/io_utils.py
@@ -13,10 +13,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import warnings
+from pathlib import Path
+from typing import TypeVar

 import imageio

+# Recursive alias for values that can be stored in JSON. Tuples are included because
+# `deserialize_json_into_object` rebuilds them from JSON lists.
+JsonLike = str | int | float | bool | None | list["JsonLike"] | dict[str, "JsonLike"] | tuple["JsonLike", ...]
+# Type variable bound to `JsonLike`, used to annotate functions that return the type they receive.
+T = TypeVar("T", bound=JsonLike)
+

 def write_video(video_path, stacked_frames, fps):
    # Filter out DeprecationWarnings raised from pkg_resources
@@ -25,3 +31,81 @@ def write_video(video_path, stacked_frames, fps):
            "ignore", "pkg_resources is deprecated as an API", category=DeprecationWarning
        )
        imageio.mimsave(video_path, stacked_frames, fps=fps)
+
+
+def deserialize_json_into_object(fpath: Path, obj: T) -> T:
+    """
+    Read the JSON file at `fpath` and copy its values into `obj`, using `obj` as a strict template.
+
+    The JSON data must mirror `obj` exactly: same dict keys, same sequence lengths and the
+    same leaf types. Dicts and lists inside `obj` are updated in place, whereas tuples
+    (stored as lists in JSON) are rebuilt as new tuples.
+
+    Raises:
+        TypeError: If a value in the JSON data does not have the type found in `obj`.
+        ValueError: If dict keys or list/tuple lengths differ.
+    """
+    with open(fpath, encoding="utf-8") as f:
+        data = json.load(f)
+
+    def _deserialize(target, source):
+        """Return `target` refilled with the content of `source` after strict structural checks."""
+        if isinstance(target, dict):
+            if not isinstance(source, dict):
+                raise TypeError(f"Type mismatch: expected dict, got {type(source)}")
+            if target.keys() != source.keys():
+                raise ValueError(
+                    f"Dictionary keys do not match.\nExpected: {target.keys()}, got: {source.keys()}"
+                )
+            # Key sets are identical, so every entry of the template gets overwritten.
+            for key in target:
+                target[key] = _deserialize(target[key], source[key])
+            return target
+
+        if isinstance(target, list):
+            if not isinstance(source, list):
+                raise TypeError(f"Type mismatch: expected list, got {type(source)}")
+            if len(target) != len(source):
+                raise ValueError(f"List length mismatch: expected {len(target)}, got {len(source)}")
+            for idx, template_item in enumerate(target):
+                target[idx] = _deserialize(template_item, source[idx])
+            return target
+
+        if isinstance(target, tuple):
+            # JSON has no tuple type: the data holds a list that is turned back into a tuple.
+            if not isinstance(source, list):
+                raise TypeError(f"Type mismatch: expected list (for tuple), got {type(source)}")
+            if len(target) != len(source):
+                raise ValueError(f"Tuple length mismatch: expected {len(target)}, got {len(source)}")
+            # Tuples are immutable, so a new one is built instead of updating in place.
+            return tuple(
+                _deserialize(t_item, s_item) for t_item, s_item in zip(target, source, strict=False)
+            )
+
+        # Anything else is treated as a leaf value whose exact type must match.
+        if type(target) is not type(source):
+            raise TypeError(f"Type mismatch: expected {type(target)}, got {type(source)}")
+        return source
+
+    return _deserialize(obj, data)
--- a/lerobot/common/utils/logging_utils.py
+++ b/lerobot/common/utils/logging_utils.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+
+from lerobot.common.utils.utils import format_big_number
+
+
+class AverageMeter:
+    """
+    Tracks the latest value of a metric together with its running weighted average.
+    Adapted from https://github.com/pytorch/examples/blob/main/imagenet/main.py
+    """
+
+    def __init__(self, name: str, fmt: str = ":f"):
+        self.name, self.fmt = name, fmt
+        self.reset()
+
+    def reset(self) -> None:
+        """Set the last value, the average, the running sum and the count back to zero."""
+        self.val = self.avg = self.sum = self.count = 0.0
+
+    def update(self, val: float, n: int = 1) -> None:
+        """Record `val` as the latest value, counted `n` times, and refresh the average."""
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        # `fmt` is spliced into the replacement field, e.g. ":.3f" yields "{name}:{avg:.3f}".
+        template = "".join(("{name}:{avg", self.fmt, "}"))
+        return template.format(**vars(self))
+
+
+class MetricsTracker:
+    """
+    A helper class to track and log metrics over time.
+
+    Step-derived counters (`steps`, `samples`, `episodes`, `epochs`) are stored as regular
+    attributes. Entries of `metrics` are exposed as pseudo-attributes: reading
+    `tracker.<name>` returns the corresponding meter, while assigning
+    `tracker.<name> = value` calls `update(value)` on it.
+
+    Usage pattern:
+
+    ```python
+    # initialize, potentially with non-zero initial step (e.g. if resuming run)
+    metrics = {"loss": AverageMeter("loss", ":.3f")}
+    train_metrics = MetricsTracker(batch_size, num_frames, num_episodes, metrics, initial_step=step)
+
+    # update metrics derived from step (samples, episodes, epochs) at each training step
+    train_metrics.step()
+
+    # update various metrics
+    loss = policy.forward(batch)
+    train_metrics.loss = loss
+
+    # display current metrics
+    logging.info(train_metrics)
+
+    # export for wandb
+    wandb.log(train_metrics.to_dict())
+
+    # reset averages after logging
+    train_metrics.reset_averages()
+    ```
+    """
+
+    # Names of the real instance attributes; any other name is resolved through `metrics`.
+    __keys__ = [
+        "_batch_size",
+        "_num_frames",
+        "_avg_samples_per_ep",
+        "metrics",
+        "steps",
+        "samples",
+        "episodes",
+        "epochs",
+    ]
+
+    def __init__(
+        self,
+        batch_size: int,
+        num_frames: int,
+        num_episodes: int,
+        metrics: dict[str, AverageMeter],
+        initial_step: int = 0,
+    ):
+        # Register the attribute names directly in the instance dict: the custom `__setattr__`
+        # below only accepts names that already exist there (or that are metric names).
+        self.__dict__.update({k: None for k in self.__keys__})
+        self._batch_size = batch_size
+        self._num_frames = num_frames
+        self._avg_samples_per_ep = num_frames / num_episodes
+        self.metrics = metrics
+
+        self.steps = initial_step
+        # A sample is an (observation,action) pair, where observation and action
+        # can be on multiple timestamps. In a batch, we have `batch_size` number of samples.
+        self.samples = self.steps * self._batch_size
+        self.episodes = self.samples / self._avg_samples_per_ep
+        self.epochs = self.samples / self._num_frames
+
+    def __getattr__(self, name: str) -> int | dict[str, AverageMeter] | AverageMeter | Any:
+        """Resolve `name` as a metric when regular attribute lookup has failed."""
+        if name in self.__dict__:
+            return self.__dict__[name]
+        # Read `metrics` from the instance dict rather than via `self.metrics`: on an instance
+        # whose `__init__` has not run (e.g. one being rebuilt by `copy` or `pickle`) the
+        # attribute access would re-enter `__getattr__` and recurse without bound.
+        metrics = self.__dict__.get("metrics")
+        if metrics is not None and name in metrics:
+            return metrics[name]
+        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        """Assign a known attribute, or feed `value` to the metric called `name`."""
+        if name in self.__dict__:
+            super().__setattr__(name, value)
+            return
+        # Same precaution as in `__getattr__`: never trigger attribute lookup for `metrics`.
+        metrics = self.__dict__.get("metrics")
+        if metrics is not None and name in metrics:
+            metrics[name].update(value)
+            return
+        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+    def step(self) -> None:
+        """
+        Updates metrics that depend on 'step' for one step.
+        """
+        self.steps += 1
+        self.samples += self._batch_size
+        self.episodes = self.samples / self._avg_samples_per_ep
+        self.epochs = self.samples / self._num_frames
+
+    def __str__(self) -> str:
+        display_list = [
+            f"step:{format_big_number(self.steps)}",
+            # number of samples seen during training
+            f"smpl:{format_big_number(self.samples)}",
+            # number of episodes seen during training
+            f"ep:{format_big_number(self.episodes)}",
+            # number of time all unique samples are seen
+            f"epch:{self.epochs:.2f}",
+            *[str(m) for m in self.metrics.values()],
+        ]
+        return " ".join(display_list)
+
+    def to_dict(self, use_avg: bool = True) -> dict[str, int | float]:
+        """
+        Returns the step counters and the metric averages (or the latest metric values if
+        `use_avg=False`) as a dict.
+        """
+        return {
+            "steps": self.steps,
+            "samples": self.samples,
+            "episodes": self.episodes,
+            "epochs": self.epochs,
+            **{k: m.avg if use_avg else m.val for k, m in self.metrics.items()},
+        }
+
+    def reset_averages(self) -> None:
+        """Resets average meters."""
+        for m in self.metrics.values():
+            m.reset()
--- a/lerobot/common/utils/random_utils.py
+++ b/lerobot/common/utils/random_utils.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any, Generator
+
+import numpy as np
+import torch
+from safetensors.torch import load_file, save_file
+
+from lerobot.common.constants import RNG_STATE
+from lerobot.common.datasets.utils import flatten_dict, unflatten_dict
+
+
+def serialize_python_rng_state() -> dict[str, torch.Tensor]:
+    """
+    Captures the rng state of `random` as a flat dict[str, torch.Tensor] that can be saved with
+    `safetensors.save_file()` or `torch.save()`.
+
+    Only the first two elements of the tuple returned by `random.getstate()` are stored.
+    """
+    version, internal_state = random.getstate()[:2]
+    serialized = {}
+    serialized["py_rng_version"] = torch.tensor([version], dtype=torch.int64)
+    serialized["py_rng_state"] = torch.tensor(internal_state, dtype=torch.int64)
+    return serialized
+
+
+def deserialize_python_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
+    """
+    Puts `random` back into the state described by a dictionary produced by
+    `serialize_python_rng_state()`.
+    """
+    version = rng_state_dict["py_rng_version"].item()
+    internal_state = tuple(rng_state_dict["py_rng_state"].tolist())
+    # The third element of the state tuple is not part of the dictionary and is restored as None.
+    random.setstate((version, internal_state, None))
+
+
+def serialize_numpy_rng_state() -> dict[str, torch.Tensor]:
+    """
+    Returns the rng state for `numpy` in the form of a flat dict[str, torch.Tensor] to be saved using
+    `safetensors.save_file()` or `torch.save()`.
+
+    The dictionary holds the four state elements that follow the generator name in the tuple
+    returned by `np.random.get_state()`.
+    """
+    np_state = np.random.get_state()
+    # Ensure no breaking changes from numpy
+    assert np_state[0] == "MT19937"
+    return {
+        "np_rng_state_values": torch.tensor(np_state[1], dtype=torch.int64),
+        "np_rng_state_index": torch.tensor([np_state[2]], dtype=torch.int64),
+        "np_rng_has_gauss": torch.tensor([np_state[3]], dtype=torch.int64),
+        # Stored in double precision: a float32 tensor would round the cached value, so the
+        # restored generator could return a slightly different number than the original one.
+        "np_rng_cached_gaussian": torch.tensor([np_state[4]], dtype=torch.float64),
+    }
+
+
+def deserialize_numpy_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
+    """
+    Puts `numpy` back into the rng state described by a dictionary produced by
+    `serialize_numpy_rng_state()`.
+    """
+    state_values = rng_state_dict["np_rng_state_values"].numpy()
+    state_index = rng_state_dict["np_rng_state_index"].item()
+    has_gauss = rng_state_dict["np_rng_has_gauss"].item()
+    cached_gaussian = rng_state_dict["np_rng_cached_gaussian"].item()
+    # The generator name is not stored in the dictionary, it is always "MT19937".
+    np.random.set_state(("MT19937", state_values, state_index, has_gauss, cached_gaussian))
+
+
+def serialize_torch_rng_state() -> dict[str, torch.Tensor]:
+    """
+    Captures the rng state of `torch` as a flat dict[str, torch.Tensor] that can be saved with
+    `safetensors.save_file()` or `torch.save()`.
+
+    The CUDA rng state is added only when `torch.cuda.is_available()` is true.
+    """
+    serialized = {"torch_rng_state": torch.get_rng_state()}
+    if not torch.cuda.is_available():
+        return serialized
+    serialized["torch_cuda_rng_state"] = torch.cuda.get_rng_state()
+    return serialized
+
+
+def deserialize_torch_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
+    """
+    Puts `torch` back into the rng state described by a dictionary produced by
+    `serialize_torch_rng_state()`.
+
+    The CUDA state is applied only if it is present in the dictionary and CUDA is available.
+    """
+    torch.set_rng_state(rng_state_dict["torch_rng_state"])
+    has_cuda_state = "torch_cuda_rng_state" in rng_state_dict
+    if has_cuda_state and torch.cuda.is_available():
+        torch.cuda.set_rng_state(rng_state_dict["torch_cuda_rng_state"])
+
+
+def serialize_rng_state() -> dict[str, torch.Tensor]:
+    """
+    Captures the rng states of `random`, `numpy`, and `torch` in a single flat
+    dict[str, torch.Tensor] that can be saved with `safetensors.save_file()` or `torch.save()`.
+    """
+    combined: dict[str, torch.Tensor] = {}
+    # Merge the three per-library dictionaries in the order python, numpy, torch.
+    for serialize in (serialize_python_rng_state, serialize_numpy_rng_state, serialize_torch_rng_state):
+        combined.update(serialize())
+    return combined
+
+
+def deserialize_rng_state(rng_state_dict: dict[str, torch.Tensor]) -> None:
+    """
+    Puts `random`, `numpy`, and `torch` back into the rng states described by a dictionary
+    produced by `serialize_rng_state()`.
+    """
+
+    def _entries_with_prefix(prefix: str) -> dict[str, torch.Tensor]:
+        # Each library's entries are recognised by the prefix of their key.
+        return {key: value for key, value in rng_state_dict.items() if key.startswith(prefix)}
+
+    deserialize_python_rng_state(_entries_with_prefix("py"))
+    deserialize_numpy_rng_state(_entries_with_prefix("np"))
+    deserialize_torch_rng_state(_entries_with_prefix("torch"))
+
+
+def save_rng_state(save_dir: Path) -> None:
+    """Serializes the current rng states and writes them to `save_dir / RNG_STATE` with safetensors."""
+    save_file(flatten_dict(serialize_rng_state()), save_dir / RNG_STATE)
+
+
+def load_rng_state(save_dir: Path) -> None:
+    """Reads `save_dir / RNG_STATE` with safetensors and restores the rng states it contains."""
+    deserialize_rng_state(unflatten_dict(load_file(save_dir / RNG_STATE)))
+
+
+def get_rng_state() -> dict[str, Any]:
+    """
+    Collect the current random state of `random`, `numpy`, and `torch` in a dictionary.
+
+    The torch CUDA state is included only when CUDA is available.
+    """
+    state: dict[str, Any] = {}
+    state["random_state"] = random.getstate()
+    state["numpy_random_state"] = np.random.get_state()
+    state["torch_random_state"] = torch.random.get_rng_state()
+    if not torch.cuda.is_available():
+        return state
+    state["torch_cuda_random_state"] = torch.cuda.random.get_rng_state()
+    return state
+
+
+def set_rng_state(random_state_dict: dict[str, Any]):
+    """Set the random state for `random`, `numpy`, and `torch`.
+
+    Args:
+        random_state_dict: A dictionary of the form returned by `get_rng_state`.
+    """
+    random.setstate(random_state_dict["random_state"])
+    np.random.set_state(random_state_dict["numpy_random_state"])
+    torch.random.set_rng_state(random_state_dict["torch_random_state"])
+    # `get_rng_state` only adds the CUDA entry when CUDA is available, so a dictionary captured
+    # without CUDA has no such key: skip it instead of failing with a KeyError.
+    if torch.cuda.is_available() and "torch_cuda_random_state" in random_state_dict:
+        torch.cuda.random.set_rng_state(random_state_dict["torch_cuda_random_state"])
+
+
+def set_seed(seed) -> None:
+    """Seed `random`, `numpy` and `torch` (plus every CUDA device when CUDA is available)."""
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(seed)
+    if not torch.cuda.is_available():
+        return
+    torch.cuda.manual_seed_all(seed)
+
+
+@contextmanager
+def seeded_context(seed: int) -> Generator[None, None, None]:
+    """Set the seed when entering a context, and restore the prior random state at exit.
+
+    The prior state is restored even if the body of the `with` block raises.
+
+    Example usage:
+
+    ```
+    a = random.random()  # produces some random number
+    with seeded_context(1337):
+        b = random.random()  # produces some other random number
+    c = random.random()  # produces yet another random number, but the same it would have if we never made `b`
+    ```
+    """
+    random_state_dict = get_rng_state()
+    set_seed(seed)
+    try:
+        yield None
+    finally:
+        # Without `finally`, an exception raised inside the `with` block would skip the
+        # restoration and leave the global generators seeded with `seed`.
+        set_rng_state(random_state_dict)
--- a/lerobot/common/utils/train_utils.py
+++ b/lerobot/common/utils/train_utils.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from pathlib import Path
+
+from termcolor import colored
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LRScheduler
+
+from lerobot.common.constants import (
+    CHECKPOINTS_DIR,
+    LAST_CHECKPOINT_LINK,
+    PRETRAINED_MODEL_DIR,
+    TRAINING_STATE_DIR,
+    TRAINING_STEP,
+)
+from lerobot.common.datasets.utils import load_json, write_json
+from lerobot.common.optim.optimizers import load_optimizer_state, save_optimizer_state
+from lerobot.common.optim.schedulers import load_scheduler_state, save_scheduler_state
+from lerobot.common.policies.pretrained import PreTrainedPolicy
+from lerobot.common.utils.random_utils import load_rng_state, save_rng_state
+from lerobot.configs.train import TrainPipelineConfig
+
+
+def log_output_dir(out_dir):
+    """Log `out_dir` at INFO level, preceded by a bold yellow "Output dir:" label."""
+    label = colored("Output dir:", "yellow", attrs=["bold"])
+    logging.info(f"{label} {out_dir}")
+
+
+def get_step_identifier(step: int, total_steps: int) -> str:
+    """
+    Return `step` zero-padded to at least 6 digits, or to the number of digits of `total_steps`
+    if that is larger.
+    """
+    width = max(6, len(str(total_steps)))
+    return format(step, f"0{width}d")
+
+
+def get_step_checkpoint_dir(output_dir: Path, total_steps: int, step: int) -> Path:
+    """Build the path of the checkpoint sub-directory of `output_dir` for training step `step`."""
+    return output_dir / CHECKPOINTS_DIR / get_step_identifier(step, total_steps)
+
+
+def save_training_step(step: int, save_dir: Path) -> None:
+    """Write the training step to `save_dir / TRAINING_STEP` as JSON, under the "step" key."""
+    payload = {"step": step}
+    write_json(payload, save_dir / TRAINING_STEP)
+
+
+def load_training_step(save_dir: Path) -> int:
+    """Read `save_dir / TRAINING_STEP` and return the value stored under its "step" key."""
+    return load_json(save_dir / TRAINING_STEP)["step"]
+
+
+def update_last_checkpoint(checkpoint_dir: Path) -> Path:
+    """
+    Points the `LAST_CHECKPOINT_LINK` symlink, located next to `checkpoint_dir`, at `checkpoint_dir`.
+
+    An existing symlink is replaced. The link target is relative (the name of `checkpoint_dir`
+    only).
+
+    Returns:
+        Path: The path of the symlink.
+    """
+    last_checkpoint_dir = checkpoint_dir.parent / LAST_CHECKPOINT_LINK
+    if last_checkpoint_dir.is_symlink():
+        last_checkpoint_dir.unlink()
+    relative_target = checkpoint_dir.relative_to(checkpoint_dir.parent)
+    last_checkpoint_dir.symlink_to(relative_target)
+    # The signature promises a Path; previously the function fell through and returned None.
+    return last_checkpoint_dir
+
+
+def save_checkpoint(
+    checkpoint_dir: Path,
+    step: int,
+    cfg: TrainPipelineConfig,
+    policy: PreTrainedPolicy,
+    optimizer: Optimizer,
+    scheduler: LRScheduler | None = None,
+) -> None:
+    """Saves the policy, the train config and the training state under `checkpoint_dir`.
+
+    The resulting directory structure is:
+
+    005000/  #  training step at checkpoint
+    ├── pretrained_model/
+    │   ├── config.json  # policy config
+    │   ├── model.safetensors  # policy weights
+    │   └── train_config.json  # train config
+    └── training_state/
+        ├── optimizer_param_groups.json  #  optimizer param groups
+        ├── optimizer_state.safetensors  # optimizer state
+        ├── rng_state.safetensors  # rng states
+        ├── scheduler_state.json  # scheduler state
+        └── training_step.json  # training step
+
+    Args:
+        checkpoint_dir (Path): The directory in which the checkpoint is written.
+        step (int): The training step at that checkpoint.
+        cfg (TrainPipelineConfig): The training config used for this run.
+        policy (PreTrainedPolicy): The policy to save.
+        optimizer (Optimizer): The optimizer to save the state from.
+        scheduler (LRScheduler | None, optional): The scheduler to save the state from. Defaults to None.
+    """
+    model_dir = checkpoint_dir / PRETRAINED_MODEL_DIR
+    # The policy is saved first, then the train config, both in the same directory.
+    for saveable in (policy, cfg):
+        saveable.save_pretrained(model_dir)
+    save_training_state(checkpoint_dir, step, optimizer, scheduler)
+
+
+def save_training_state(
+    checkpoint_dir: Path,
+    train_step: int,
+    optimizer: Optimizer | None = None,
+    scheduler: LRScheduler | None = None,
+) -> None:
+    """
+    Writes the training step, the rng state and, when provided, the optimizer and scheduler
+    states to the `TRAINING_STATE_DIR` sub-directory of `checkpoint_dir` (created if missing).
+
+    Args:
+        checkpoint_dir (Path): The checkpoint directory that will contain the training state.
+        train_step (int): Current training step.
+        optimizer (Optimizer | None, optional): The optimizer from which to save the state_dict.
+            Defaults to None.
+        scheduler (LRScheduler | None, optional): The scheduler from which to save the state_dict.
+            Defaults to None.
+    """
+    state_dir = checkpoint_dir / TRAINING_STATE_DIR
+    state_dir.mkdir(parents=True, exist_ok=True)
+    save_training_step(train_step, state_dir)
+    save_rng_state(state_dir)
+    # Optimizer and scheduler are optional: only the ones that were passed get saved.
+    for component, save_fn in ((optimizer, save_optimizer_state), (scheduler, save_scheduler_state)):
+        if component is not None:
+            save_fn(component, state_dir)
+
+
+def load_training_state(
+    checkpoint_dir: Path, optimizer: Optimizer, scheduler: LRScheduler | None
+) -> tuple[int, Optimizer, LRScheduler | None]:
+    """
+    Restores the rng state and loads the training step, the optimizer state and the scheduler
+    state from a checkpoint. This is used to resume a training run.
+
+    Args:
+        checkpoint_dir (Path): The checkpoint directory. Should contain a 'training_state' dir.
+        optimizer (Optimizer): The optimizer to load the state_dict to.
+        scheduler (LRScheduler | None): The scheduler to load the state_dict to (can be None).
+
+    Raises:
+        NotADirectoryError: If 'checkpoint_dir' doesn't contain a 'training_state' dir
+
+    Returns:
+        tuple[int, Optimizer, LRScheduler | None]: training step, optimizer and scheduler with their
+            state_dict loaded.
+    """
+    state_dir = checkpoint_dir / TRAINING_STATE_DIR
+    if not state_dir.is_dir():
+        raise NotADirectoryError(state_dir)
+
+    load_rng_state(state_dir)
+    step = load_training_step(state_dir)
+    restored_optimizer = load_optimizer_state(optimizer, state_dir)
+    # A missing scheduler stays None; nothing is read for it.
+    restored_scheduler = None if scheduler is None else load_scheduler_state(scheduler, state_dir)
+    return step, restored_optimizer, restored_scheduler
--- a/lerobot/common/utils/utils.py
+++ b/lerobot/common/utils/utils.py
@@ -17,14 +17,10 @@ import logging
 import os
 import os.path as osp
 import platform
-import random
-from contextlib import contextmanager
 from copy import copy
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Any, Generator

-import numpy as np
 import torch


@@ -106,59 +102,6 @@ def is_amp_available(device: str):
        raise ValueError(f"Unknown device '{device}.")


-def get_global_random_state() -> dict[str, Any]:
-    """Get the random state for `random`, `numpy`, and `torch`."""
-    random_state_dict = {
-        "random_state": random.getstate(),
-        "numpy_random_state": np.random.get_state(),
-        "torch_random_state": torch.random.get_rng_state(),
-    }
-    if torch.cuda.is_available():
-        random_state_dict["torch_cuda_random_state"] = torch.cuda.random.get_rng_state()
-    return random_state_dict
-
-
-def set_global_random_state(random_state_dict: dict[str, Any]):
-    """Set the random state for `random`, `numpy`, and `torch`.
-
-    Args:
-        random_state_dict: A dictionary of the form returned by `get_global_random_state`.
-    """
-    random.setstate(random_state_dict["random_state"])
-    np.random.set_state(random_state_dict["numpy_random_state"])
-    torch.random.set_rng_state(random_state_dict["torch_random_state"])
-    if torch.cuda.is_available():
-        torch.cuda.random.set_rng_state(random_state_dict["torch_cuda_random_state"])
-
-
-def set_global_seed(seed):
-    """Set seed for reproducibility."""
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-
-
-@contextmanager
-def seeded_context(seed: int) -> Generator[None, None, None]:
-    """Set the seed when entering a context, and restore the prior random state at exit.
-
-    Example usage:
-
-    ```
-    a = random.random()  # produces some random number
-    with seeded_context(1337):
-        b = random.random()  # produces some other random number
-    c = random.random()  # produces yet another random number, but the same it would have if we never made `b`
-    ```
-    """
-    random_state_dict = get_global_random_state()
-    set_global_seed(seed)
-    yield None
-    set_global_random_state(random_state_dict)
-
-
 def init_logging():
    def custom_format(record):
        dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
--- a/lerobot/common/utils/wandb_utils.py
+++ b/lerobot/common/utils/wandb_utils.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+import re
+from glob import glob
+from pathlib import Path
+
+from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE
+from termcolor import colored
+
+from lerobot.common.constants import PRETRAINED_MODEL_DIR
+from lerobot.configs.train import TrainPipelineConfig
+
+
+def cfg_to_group(cfg: TrainPipelineConfig, return_list: bool = False) -> list[str] | str:
+    """
+    Build a group name for logging from the policy type, dataset repo id, seed and, when an env
+    is configured, the env type. Returns the parts as a list if `return_list` is true,
+    otherwise joined with "-".
+    """
+    parts = [f"policy:{cfg.policy.type}", f"dataset:{cfg.dataset.repo_id}", f"seed:{cfg.seed}"]
+    if cfg.env is not None:
+        parts += [f"env:{cfg.env.type}"]
+    if return_list:
+        return parts
+    return "-".join(parts)
+
+
+def get_wandb_run_id_from_filesystem(log_dir: Path) -> str:
+    """
+    Returns the WandB run ID found in `log_dir / "wandb/latest-run"`.
+
+    The ID is read from the name of the single `run-<id>.wandb` file expected there.
+
+    Raises:
+        RuntimeError: If there is not exactly one `run-*` entry, or if its name does not have
+            the `run-<id>.wandb` form.
+    """
+    paths = glob(str(log_dir / "wandb/latest-run/run-*"))
+    if len(paths) != 1:
+        raise RuntimeError("Couldn't get the previous WandB run ID for run resumption.")
+    # `Path(...).name` isolates the file name whatever the path separator is, and the dot in
+    # front of "wandb" is escaped so that only a real ".wandb" extension is accepted.
+    match = re.search(r"run-([^\.]+)\.wandb", Path(paths[0]).name)
+    if match is None:
+        raise RuntimeError("Couldn't get the previous WandB run ID for run resumption.")
+    return match.group(1)
+
+
+def get_safe_wandb_artifact_name(name: str):
+    """Replace every ":" and "/" of `name` with "_", since WandB artifact names reject them."""
+    return name.translate(str.maketrans(":/", "__"))
+
+
+class WandBLogger:
+    """A helper class to log objects (metrics, videos, policy checkpoints) using wandb."""
+
+    def __init__(self, cfg: TrainPipelineConfig):
+        """Starts (or, if `cfg.resume` is set, resumes) a wandb run configured from `cfg`."""
+        self.cfg = cfg.wandb
+        self.log_dir = cfg.output_dir
+        self.job_name = cfg.job_name
+        self.env_fps = cfg.env.fps if cfg.env else None
+        self._group = cfg_to_group(cfg)
+
+        # Set up WandB.
+        os.environ["WANDB_SILENT"] = "True"
+        import wandb
+
+        # When resuming, the ID of the previous run is looked up in the output directory.
+        wandb_run_id = get_wandb_run_id_from_filesystem(self.log_dir) if cfg.resume else None
+        wandb.init(
+            id=wandb_run_id,
+            project=self.cfg.project,
+            entity=self.cfg.entity,
+            name=self.job_name,
+            notes=self.cfg.notes,
+            tags=cfg_to_group(cfg, return_list=True),
+            dir=self.log_dir,
+            config=cfg.to_dict(),
+            # TODO(rcadene): try set to True
+            save_code=False,
+            # TODO(rcadene): split train and eval, and run async eval with job_type="eval"
+            job_type="train_eval",
+            resume="must" if cfg.resume else None,
+        )
+        print(colored("Logs will be synced with wandb.", "blue", attrs=["bold"]))
+        logging.info(f"Track this run --> {colored(wandb.run.get_url(), 'yellow', attrs=['bold'])}")
+        self._wandb = wandb
+
+    def log_policy(self, checkpoint_dir: Path):
+        """Checkpoints the policy to wandb, unless artifacts are disabled in the wandb config."""
+        if self.cfg.disable_artifact:
+            return
+
+        # The artifact is named after the run group and the checkpoint directory name.
+        step_id = checkpoint_dir.name
+        artifact_name = f"{self._group}-{step_id}"
+        artifact_name = get_safe_wandb_artifact_name(artifact_name)
+        artifact = self._wandb.Artifact(artifact_name, type="model")
+        artifact.add_file(checkpoint_dir / PRETRAINED_MODEL_DIR / SAFETENSORS_SINGLE_FILE)
+        self._wandb.log_artifact(artifact)
+
+    def log_dict(self, d: dict, step: int, mode: str = "train"):
+        """
+        Logs every int, float or str value of `d` to wandb under the key "<mode>/<key>".
+        Values of any other type are skipped with a warning.
+
+        Raises:
+            ValueError: If `mode` is neither "train" nor "eval".
+        """
+        # The check used to be inverted and rejected the two valid modes, default included.
+        if mode not in {"train", "eval"}:
+            raise ValueError(mode)
+
+        for k, v in d.items():
+            if not isinstance(v, (int, float, str)):
+                logging.warning(
+                    f'WandB logging of key "{k}" was ignored as its type is not handled by this wrapper.'
+                )
+                continue
+            self._wandb.log({f"{mode}/{k}": v}, step=step)
+
+    def log_video(self, video_path: str, step: int, mode: str = "train"):
+        """
+        Logs the mp4 video at `video_path` to wandb under the key "<mode>/video".
+
+        Raises:
+            ValueError: If `mode` is neither "train" nor "eval".
+        """
+        # The check used to be inverted and rejected the two valid modes, default included.
+        if mode not in {"train", "eval"}:
+            raise ValueError(mode)
+
+        wandb_video = self._wandb.Video(video_path, fps=self.env_fps, format="mp4")
+        self._wandb.log({f"{mode}/video": wandb_video}, step=step)