Per-episode stats (#521)

Co-authored-by: Remi Cadene <re.cadene@gmail.com> Co-authored-by: Remi <remi.cadene@huggingface.co>
2025-02-15 15:47:16 +01:00
parent 7c2bbee613
commit 8426c64f42
19 changed files with 906 additions and 798 deletions
--- a/lerobot/common/datasets/compute_stats.py
+++ b/lerobot/common/datasets/compute_stats.py
@@ -13,202 +13,148 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from copy import deepcopy
-from math import ceil
+import numpy as np

-import einops
-import torch
-import tqdm
+from lerobot.common.datasets.utils import load_image_as_numpy


-def get_stats_einops_patterns(dataset, num_workers=0):
-    """These einops patterns will be used to aggregate batches and compute statistics.
+def estimate_num_samples(
+    dataset_len: int, min_num_samples: int = 100, max_num_samples: int = 10_000, power: float = 0.75
+) -> int:
+    """Heuristic to estimate the number of samples based on dataset size.
+    The power controls the sample growth relative to dataset size.
+    Lower the power to get fewer samples.

-    Note: We assume the images are in channel first format
+    For default arguments, we have:
+    - below 100, num_samples=dataset_len (the minimum is lowered to the dataset size)
+    - from 100 to ~470, num_samples=100
+    - at 1000, num_samples=177
+    - at 2000, num_samples=299
+    - at 5000, num_samples=594
+    - at 10000, num_samples=1000
+    - at 20000, num_samples=1681
    """
+    # A dataset smaller than the requested minimum cannot provide that many samples.
+    if dataset_len < min_num_samples:
+        min_num_samples = dataset_len
+    return max(min_num_samples, min(int(dataset_len**power), max_num_samples))

-    dataloader = torch.utils.data.DataLoader(
-        dataset,
-        num_workers=num_workers,
-        batch_size=2,
-        shuffle=False,
-    )
-    batch = next(iter(dataloader))

-    stats_patterns = {}
+def sample_indices(data_len: int) -> list[int]:
+    """Return evenly spaced integer indices in `[0, data_len - 1]`.
+
+    The number of indices is chosen by `estimate_num_samples(data_len)`.
+    """
+    count = estimate_num_samples(data_len)
+    positions = np.linspace(0, data_len - 1, count)
+    return np.round(positions).astype(int).tolist()

-    for key in dataset.features:
-        # sanity check that tensors are not float64
-        assert batch[key].dtype != torch.float64

-        # if isinstance(feats_type, (VideoFrame, Image)):
-        if key in dataset.meta.camera_keys:
-            # sanity check that images are channel first
-            _, c, h, w = batch[key].shape
-            assert c < h and c < w, f"expect channel first images, but instead {batch[key].shape}"
+def sample_images(image_paths: list[str]) -> np.ndarray:
+    """Load a subset of the images at `image_paths` and stack them into a single array.
+
+    The subset is selected with `sample_indices`. Each image is loaded as uint8 in
+    channel-first layout, and `np.stack` adds a new leading axis over the sampled images.
+    """
+    sampled_indices = sample_indices(len(image_paths))
+    images = []
+    for idx in sampled_indices:
+        path = image_paths[idx]
+        # we load as uint8 to reduce memory usage
+        img = load_image_as_numpy(path, dtype=np.uint8, channel_first=True)
+        images.append(img)

-            # sanity check that images are float32 in range [0,1]
-            assert batch[key].dtype == torch.float32, f"expect torch.float32, but instead {batch[key].dtype=}"
-            assert batch[key].max() <= 1, f"expect pixels lower than 1, but instead {batch[key].max()=}"
-            assert batch[key].min() >= 0, f"expect pixels greater than 1, but instead {batch[key].min()=}"
+    images = np.stack(images)
+    return images

-            stats_patterns[key] = "b c h w -> c 1 1"
-        elif batch[key].ndim == 2:
-            stats_patterns[key] = "b c -> c "
-        elif batch[key].ndim == 1:
-            stats_patterns[key] = "b -> 1"
+
+def get_feature_stats(array: np.ndarray, axis: tuple, keepdims: bool) -> dict[str, np.ndarray]:
+    """Reduce `array` over `axis` into "min", "max", "mean" and "std", plus a "count" entry.
+
+    "count" is the length of the first dimension of `array`, wrapped in a 1-element array.
+    """
+    reducers = {"min": np.min, "max": np.max, "mean": np.mean, "std": np.std}
+    feature_stats = {
+        name: reduce_fn(array, axis=axis, keepdims=keepdims) for name, reduce_fn in reducers.items()
+    }
+    feature_stats["count"] = np.array([len(array)])
+    return feature_stats
+
+
+def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], features: dict) -> dict:
+    """Compute min/max/mean/std/count for every non-string feature of one episode.
+
+    Image and video features are given as lists of image paths: a subset of the images is
+    loaded as uint8, reduced over every axis except the channel axis, and the resulting
+    stats (except "count") are divided by 255. Other features are arrays reduced over
+    their first axis. Features whose dtype is "string" are skipped.
+    """
+    ep_stats = {}
+    for key, data in episode_data.items():
+        if features[key]["dtype"] == "string":
+            continue  # HACK: we should receive np.arrays of strings
+        elif features[key]["dtype"] in ["image", "video"]:
+            ep_ft_array = sample_images(data)  # data is a list of image paths
+            axes_to_reduce = (0, 2, 3)  # keep channel dim
+            keepdims = True
        else:
-            raise ValueError(f"{key}, {batch[key].shape}")
+            ep_ft_array = data  # data is already a np.ndarray
+            axes_to_reduce = 0  # compute stats over the first axis
+            # reducing 1-D data without keepdims would yield 0-d values; keep shape (1,) instead
+            keepdims = data.ndim == 1  # keep as np.array

-    return stats_patterns
+        ep_stats[key] = get_feature_stats(ep_ft_array, axis=axes_to_reduce, keepdims=keepdims)
+
+        # finally, we normalize and remove batch dim for images
+        if features[key]["dtype"] in ["image", "video"]:
+            ep_stats[key] = {
+                k: v if k == "count" else np.squeeze(v / 255.0, axis=0) for k, v in ep_stats[key].items()
+            }
+
+    return ep_stats


-def compute_stats(dataset, batch_size=8, num_workers=8, max_num_samples=None):
-    """Compute mean/std and min/max statistics of all data keys in a LeRobotDataset."""
-    if max_num_samples is None:
-        max_num_samples = len(dataset)
-
-    # for more info on why we need to set the same number of workers, see `load_from_videos`
-    stats_patterns = get_stats_einops_patterns(dataset, num_workers)
-
-    # mean and std will be computed incrementally while max and min will track the running value.
-    mean, std, max, min = {}, {}, {}, {}
-    for key in stats_patterns:
-        mean[key] = torch.tensor(0.0).float()
-        std[key] = torch.tensor(0.0).float()
-        max[key] = torch.tensor(-float("inf")).float()
-        min[key] = torch.tensor(float("inf")).float()
-
-    def create_seeded_dataloader(dataset, batch_size, seed):
-        generator = torch.Generator()
-        generator.manual_seed(seed)
-        dataloader = torch.utils.data.DataLoader(
-            dataset,
-            num_workers=num_workers,
-            batch_size=batch_size,
-            shuffle=True,
-            drop_last=False,
-            generator=generator,
-        )
-        return dataloader
-
-    # Note: Due to be refactored soon. The point of storing `first_batch` is to make sure we don't get
-    # surprises when rerunning the sampler.
-    first_batch = None
-    running_item_count = 0  # for online mean computation
-    dataloader = create_seeded_dataloader(dataset, batch_size, seed=1337)
-    for i, batch in enumerate(
-        tqdm.tqdm(dataloader, total=ceil(max_num_samples / batch_size), desc="Compute mean, min, max")
-    ):
-        this_batch_size = len(batch["index"])
-        running_item_count += this_batch_size
-        if first_batch is None:
-            first_batch = deepcopy(batch)
-        for key, pattern in stats_patterns.items():
-            batch[key] = batch[key].float()
-            # Numerically stable update step for mean computation.
-            batch_mean = einops.reduce(batch[key], pattern, "mean")
-            # Hint: to update the mean we need x̄ₙ = (Nₙ₋₁x̄ₙ₋₁ + Bₙxₙ) / Nₙ, where the subscript represents
-            # the update step, N is the running item count, B is this batch size, x̄ is the running mean,
-            # and x is the current batch mean. Some rearrangement is then required to avoid risking
-            # numerical overflow. Another hint: Nₙ₋₁ = Nₙ - Bₙ. Rearrangement yields
-            # x̄ₙ = x̄ₙ₋₁ + Bₙ * (xₙ - x̄ₙ₋₁) / Nₙ
-            mean[key] = mean[key] + this_batch_size * (batch_mean - mean[key]) / running_item_count
-            max[key] = torch.maximum(max[key], einops.reduce(batch[key], pattern, "max"))
-            min[key] = torch.minimum(min[key], einops.reduce(batch[key], pattern, "min"))
-
-        if i == ceil(max_num_samples / batch_size) - 1:
-            break
-
-    first_batch_ = None
-    running_item_count = 0  # for online std computation
-    dataloader = create_seeded_dataloader(dataset, batch_size, seed=1337)
-    for i, batch in enumerate(
-        tqdm.tqdm(dataloader, total=ceil(max_num_samples / batch_size), desc="Compute std")
-    ):
-        this_batch_size = len(batch["index"])
-        running_item_count += this_batch_size
-        # Sanity check to make sure the batches are still in the same order as before.
-        if first_batch_ is None:
-            first_batch_ = deepcopy(batch)
-            for key in stats_patterns:
-                assert torch.equal(first_batch_[key], first_batch[key])
-        for key, pattern in stats_patterns.items():
-            batch[key] = batch[key].float()
-            # Numerically stable update step for mean computation (where the mean is over squared
-            # residuals).See notes in the mean computation loop above.
-            batch_std = einops.reduce((batch[key] - mean[key]) ** 2, pattern, "mean")
-            std[key] = std[key] + this_batch_size * (batch_std - std[key]) / running_item_count
-
-        if i == ceil(max_num_samples / batch_size) - 1:
-            break
-
-    for key in stats_patterns:
-        std[key] = torch.sqrt(std[key])
-
-    stats = {}
-    for key in stats_patterns:
-        stats[key] = {
-            "mean": mean[key],
-            "std": std[key],
-            "max": max[key],
-            "min": min[key],
-        }
-    return stats
+def _assert_type_and_shape(stats_list: list[dict[str, dict]]):
+    """Validate that every stat is a numpy array with an acceptable shape.
+
+    Raises:
+        ValueError: If a stat is not a numpy array, is 0-dimensional, if a "count" stat is not of
+            shape (1,), or if a non-"count" stat of a feature whose name contains "image" is not
+            of shape (3, 1, 1).
+    """
+    for episode_stats in stats_list:
+        for fkey in episode_stats:
+            feature_stats = episode_stats[fkey]
+            for k, v in feature_stats.items():
+                if not isinstance(v, np.ndarray):
+                    raise ValueError(
+                        f"Stats must be composed of numpy array, but key '{k}' of feature '{fkey}' is of type '{type(v)}' instead."
+                    )
+                if v.ndim == 0:
+                    raise ValueError("Number of dimensions must be at least 1, and is 0 instead.")
+                is_count = k == "count"
+                if is_count and v.shape != (1,):
+                    raise ValueError(f"Shape of 'count' must be (1), but is {v.shape} instead.")
+                if "image" in fkey and not is_count and v.shape != (3, 1, 1):
+                    raise ValueError(f"Shape of '{k}' must be (3,1,1), but is {v.shape} instead.")


-def aggregate_stats(ls_datasets) -> dict[str, torch.Tensor]:
-    """Aggregate stats of multiple LeRobot datasets into one set of stats without recomputing from scratch.
+def aggregate_feature_stats(stats_ft_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
+    """Aggregate the stats of a single feature across several stats dicts.
+
+    Each entry of `stats_ft_list` provides "min", "max", "mean", "std" and "count" arrays.
+    Min and max are reduced element-wise; mean and std are combined using "count" as weight.
+    """
+    means = np.stack([s["mean"] for s in stats_ft_list])
+    variances = np.stack([s["std"] ** 2 for s in stats_ft_list])
+    counts = np.stack([s["count"] for s in stats_ft_list])
+    # Summed before `counts` is expanded below, so it keeps the shape of one "count" entry
+    total_count = counts.sum(axis=0)

-    The final stats will have the union of all data keys from each of the datasets.
+    # Prepare weighted mean by matching number of dimensions
+    while counts.ndim < means.ndim:
+        counts = np.expand_dims(counts, axis=-1)

-    The final stats will have the union of all data keys from each of the datasets. For instance:
-    - new_max = max(max_dataset_0, max_dataset_1, ...)
+    # Compute the weighted mean
+    weighted_means = means * counts
+    total_mean = weighted_means.sum(axis=0) / total_count
+
+    # Compute the variance using the parallel algorithm:
+    # total_var = sum_i(count_i * (var_i + (mean_i - total_mean)^2)) / total_count
+    delta_means = means - total_mean
+    weighted_variances = (variances + delta_means**2) * counts
+    total_variance = weighted_variances.sum(axis=0) / total_count
+
+    return {
+        "min": np.min(np.stack([s["min"] for s in stats_ft_list]), axis=0),
+        "max": np.max(np.stack([s["max"] for s in stats_ft_list]), axis=0),
+        "mean": total_mean,
+        "std": np.sqrt(total_variance),
+        "count": total_count,
+    }
+
+
+def aggregate_stats(stats_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
+    """Aggregate several stats dicts (e.g. one per episode or per dataset) into a single set of stats.
+
+    The final stats will have the union of all data keys from each of the stats dicts.
+
+    For instance:
    - new_min = min(min_dataset_0, min_dataset_1, ...)
-    - new_mean = (mean of all data)
+    - new_max = max(max_dataset_0, max_dataset_1, ...)
+    - new_mean = (mean of all data, weighted by counts)
    - new_std = (std of all data)
    """
-    data_keys = set()
-    for dataset in ls_datasets:
-        data_keys.update(dataset.meta.stats.keys())
-    stats = {k: {} for k in data_keys}
-    for data_key in data_keys:
-        for stat_key in ["min", "max"]:
-            # compute `max(dataset_0["max"], dataset_1["max"], ...)`
-            stats[data_key][stat_key] = einops.reduce(
-                torch.stack(
-                    [ds.meta.stats[data_key][stat_key] for ds in ls_datasets if data_key in ds.meta.stats],
-                    dim=0,
-                ),
-                "n ... -> ...",
-                stat_key,
-            )
-        total_samples = sum(d.num_frames for d in ls_datasets if data_key in d.meta.stats)
-        # Compute the "sum" statistic by multiplying each mean by the number of samples in the respective
-        # dataset, then divide by total_samples to get the overall "mean".
-        # NOTE: the brackets around (d.num_frames / total_samples) are needed tor minimize the risk of
-        # numerical overflow!
-        stats[data_key]["mean"] = sum(
-            d.meta.stats[data_key]["mean"] * (d.num_frames / total_samples)
-            for d in ls_datasets
-            if data_key in d.meta.stats
-        )
-        # The derivation for standard deviation is a little more involved but is much in the same spirit as
-        # the computation of the mean.
-        # Given two sets of data where the statistics are known:
-        # σ_combined = sqrt[ (n1 * (σ1^2 + d1^2) + n2 * (σ2^2 + d2^2)) / (n1 + n2) ]
-        # where d1 = μ1 - μ_combined, d2 = μ2 - μ_combined
-        # NOTE: the brackets around (d.num_frames / total_samples) are needed tor minimize the risk of
-        # numerical overflow!
-        stats[data_key]["std"] = torch.sqrt(
-            sum(
-                (
-                    d.meta.stats[data_key]["std"] ** 2
-                    + (d.meta.stats[data_key]["mean"] - stats[data_key]["mean"]) ** 2
-                )
-                * (d.num_frames / total_samples)
-                for d in ls_datasets
-                if data_key in d.meta.stats
-            )
-        )
-    return stats
+
+    _assert_type_and_shape(stats_list)
+
+    data_keys = {key for stats in stats_list for key in stats}
+    aggregated_stats = {key: {} for key in data_keys}
+
+    for key in data_keys:
+        # a feature may be absent from some stats dicts; aggregate only over those that have it
+        stats_with_key = [stats[key] for stats in stats_list if key in stats]
+        aggregated_stats[key] = aggregate_feature_stats(stats_with_key)
+
+    return aggregated_stats
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -26,18 +26,17 @@ import PIL.Image
 import torch
 import torch.utils
 from datasets import load_dataset
-from huggingface_hub import create_repo, snapshot_download, upload_folder
+from huggingface_hub import HfApi, snapshot_download

-from lerobot.common.datasets.compute_stats import aggregate_stats, compute_stats
+from lerobot.common.datasets.compute_stats import aggregate_stats, compute_episode_stats
 from lerobot.common.datasets.image_writer import AsyncImageWriter, write_image
 from lerobot.common.datasets.utils import (
    DEFAULT_FEATURES,
    DEFAULT_IMAGE_PATH,
-    EPISODES_PATH,
    INFO_PATH,
-    STATS_PATH,
    TASKS_PATH,
    append_jsonlines,
+    backward_compatible_episodes_stats,
    check_delta_timestamps,
    check_frame_features,
    check_timestamps_sync,
@@ -52,10 +51,13 @@ from lerobot.common.datasets.utils import (
    get_hub_safe_version,
    hf_transform_to_torch,
    load_episodes,
+    load_episodes_stats,
    load_info,
    load_stats,
    load_tasks,
-    serialize_dict,
+    write_episode,
+    write_episode_stats,
+    write_info,
    write_json,
    write_parquet,
 )
@@ -90,6 +92,17 @@ class LeRobotDatasetMetadata:
        self.stats = load_stats(self.root)
        self.tasks, self.task_to_task_index = load_tasks(self.root)
        self.episodes = load_episodes(self.root)
+        try:
+            self.episodes_stats = load_episodes_stats(self.root)
+            self.stats = aggregate_stats(list(self.episodes_stats.values()))
+        except FileNotFoundError:
+            logging.warning(
+                f"""'episodes_stats.jsonl' not found. Using global dataset stats for each episode instead.
+                Convert your dataset stats to the new format using this command:
+                python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id={self.repo_id} """
+            )
+            self.stats = load_stats(self.root)
+            self.episodes_stats = backward_compatible_episodes_stats(self.stats, self.episodes)

    def pull_from_repo(
        self,
@@ -228,7 +241,13 @@ class LeRobotDatasetMetadata:
        }
        append_jsonlines(task_dict, self.root / TASKS_PATH)

-    def save_episode(self, episode_index: int, episode_length: int, episode_tasks: list[str]) -> None:
+    def save_episode(
+        self,
+        episode_index: int,
+        episode_length: int,
+        episode_tasks: list[str],
+        episode_stats: dict[str, dict],
+    ) -> None:
        self.info["total_episodes"] += 1
        self.info["total_frames"] += episode_length

@@ -238,21 +257,19 @@ class LeRobotDatasetMetadata:

        self.info["splits"] = {"train": f"0:{self.info['total_episodes']}"}
        self.info["total_videos"] += len(self.video_keys)
-        write_json(self.info, self.root / INFO_PATH)
+        write_info(self.info, self.root)

        episode_dict = {
            "episode_index": episode_index,
            "tasks": episode_tasks,
            "length": episode_length,
        }
-        self.episodes.append(episode_dict)
-        append_jsonlines(episode_dict, self.root / EPISODES_PATH)
+        self.episodes[episode_index] = episode_dict
+        write_episode(episode_dict, self.root)

-        # TODO(aliberts): refactor stats in save_episodes
-        # image_sampling = int(self.fps / 2)  # sample 2 img/s for the stats
-        # ep_stats = compute_episode_stats(episode_buffer, self.features, episode_length, image_sampling=image_sampling)
-        # ep_stats = serialize_dict(ep_stats)
-        # append_jsonlines(ep_stats, self.root / STATS_PATH)
+        self.episodes_stats[episode_index] = episode_stats
+        self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats else episode_stats
+        write_episode_stats(episode_index, episode_stats, self.root)

    def write_video_info(self) -> None:
        """
@@ -309,6 +326,7 @@ class LeRobotDatasetMetadata:
            )
        else:
            # TODO(aliberts, rcadene): implement sanity check for features
+            features = {**features, **DEFAULT_FEATURES}

            # check if none of the features contains a "/" in their names,
            # as this would break the dict flattening in the stats computation, which uses '/' as separator
@@ -319,7 +337,7 @@ class LeRobotDatasetMetadata:
            features = {**features, **DEFAULT_FEATURES}

        obj.tasks, obj.task_to_task_index = {}, {}
-        obj.stats, obj.episodes = {}, []
+        obj.episodes_stats, obj.stats, obj.episodes = {}, {}, {}
        obj.info = create_empty_dataset_info(CODEBASE_VERSION, fps, robot_type, features, use_videos)
        if len(obj.video_keys) > 0 and not use_videos:
            raise ValueError()
@@ -457,6 +475,9 @@ class LeRobotDataset(torch.utils.data.Dataset):

        # Load metadata
        self.meta = LeRobotDatasetMetadata(self.repo_id, self.root, self.local_files_only)
+        if self.episodes is not None and self.meta._version == CODEBASE_VERSION:
+            episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes]
+            self.stats = aggregate_stats(episodes_stats)

        # Check version
        check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION)
@@ -479,10 +500,13 @@ class LeRobotDataset(torch.utils.data.Dataset):

    def push_to_hub(
        self,
+        branch: str | None = None,
+        create_card: bool = True,
        tags: list | None = None,
        license: str | None = "apache-2.0",
        push_videos: bool = True,
        private: bool = False,
+        allow_patterns: list[str] | str | None = None,
        **card_kwargs,
    ) -> None:
        if not self.consolidated:
@@ -496,24 +520,32 @@ class LeRobotDataset(torch.utils.data.Dataset):
        if not push_videos:
            ignore_patterns.append("videos/")

-        create_repo(
+        hub_api = HfApi()
+        hub_api.create_repo(
            repo_id=self.repo_id,
            private=private,
            repo_type="dataset",
            exist_ok=True,
        )
+        if branch:
+            create_branch(repo_id=self.repo_id, branch=branch, repo_type="dataset")

-        upload_folder(
+        hub_api.upload_folder(
            repo_id=self.repo_id,
            folder_path=self.root,
            repo_type="dataset",
+            revision=branch,
+            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
        )
-        card = create_lerobot_dataset_card(
-            tags=tags, dataset_info=self.meta.info, license=license, **card_kwargs
-        )
-        card.push_to_hub(repo_id=self.repo_id, repo_type="dataset")
-        create_branch(repo_id=self.repo_id, branch=CODEBASE_VERSION, repo_type="dataset")
+        if create_card:
+            card = create_lerobot_dataset_card(
+                tags=tags, dataset_info=self.meta.info, license=license, **card_kwargs
+            )
+            card.push_to_hub(repo_id=self.repo_id, repo_type="dataset", revision=branch)
+
+        if not branch:
+            create_branch(repo_id=self.repo_id, branch=CODEBASE_VERSION, repo_type="dataset")

    def pull_from_repo(
        self,
@@ -630,7 +662,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
            if key not in self.meta.video_keys
        }

-    def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict:
+    def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict[str, torch.Tensor]:
        """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function
        in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a
        Segmentation Fault. This probably happens because a memory reference to the video loader is created in
@@ -660,8 +692,7 @@ class LeRobotDataset(torch.utils.data.Dataset):

        query_indices = None
        if self.delta_indices is not None:
-            current_ep_idx = self.episodes.index(ep_idx) if self.episodes is not None else ep_idx
-            query_indices, padding = self._get_query_indices(idx, current_ep_idx)
+            query_indices, padding = self._get_query_indices(idx, ep_idx)
            query_result = self._query_hf_dataset(query_indices)
            item = {**item, **padding}
            for key, val in query_result.items():
@@ -735,11 +766,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
        if self.episode_buffer is None:
            self.episode_buffer = self.create_episode_buffer()

+        # Automatically add frame_index and timestamp to episode buffer
        frame_index = self.episode_buffer["size"]
        timestamp = frame.pop("timestamp") if "timestamp" in frame else frame_index / self.fps
        self.episode_buffer["frame_index"].append(frame_index)
        self.episode_buffer["timestamp"].append(timestamp)

+        # Add frame features to episode_buffer
        for key in frame:
            if key == "task":
                # Note: we associate the task in natural language to its task index during `save_episode`
@@ -787,7 +820,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
            # TODO(aliberts): Add option to use existing episode_index
            raise NotImplementedError(
                "You might have manually provided the episode_buffer with an episode_index that doesn't "
-                "match the total number of episodes in the dataset. This is not supported for now."
+                "match the total number of episodes already in the dataset. This is not supported for now."
            )

        if episode_length == 0:
@@ -821,8 +854,8 @@ class LeRobotDataset(torch.utils.data.Dataset):

        self._wait_image_writer()
        self._save_episode_table(episode_buffer, episode_index)
-
-        self.meta.save_episode(episode_index, episode_length, episode_tasks)
+        ep_stats = compute_episode_stats(episode_buffer, self.features)
+        self.meta.save_episode(episode_index, episode_length, episode_tasks, ep_stats)

        if encode_videos and len(self.meta.video_keys) > 0:
            video_paths = self.encode_episode_videos(episode_index)
@@ -908,7 +941,7 @@ class LeRobotDataset(torch.utils.data.Dataset):

        return video_paths

-    def consolidate(self, run_compute_stats: bool = True, keep_image_files: bool = False) -> None:
+    def consolidate(self, keep_image_files: bool = False) -> None:
        self.hf_dataset = self.load_hf_dataset()
        self.episode_data_index = get_episode_data_index(self.meta.episodes, self.episodes)
        check_timestamps_sync(self.hf_dataset, self.episode_data_index, self.fps, self.tolerance_s)
@@ -928,17 +961,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        parquet_files = list(self.root.rglob("*.parquet"))
        assert len(parquet_files) == self.num_episodes

-        if run_compute_stats:
-            self.stop_image_writer()
-            # TODO(aliberts): refactor stats in save_episodes
-            self.meta.stats = compute_stats(self)
-            serialized_stats = serialize_dict(self.meta.stats)
-            write_json(serialized_stats, self.root / STATS_PATH)
-            self.consolidated = True
-        else:
-            logging.warning(
-                "Skipping computation of the dataset statistics, dataset is not fully consolidated."
-            )
+        self.consolidated = True

    @classmethod
    def create(
@@ -1056,7 +1079,10 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):

        self.image_transforms = image_transforms
        self.delta_timestamps = delta_timestamps
-        self.stats = aggregate_stats(self._datasets)
+        # TODO(rcadene, aliberts): We should not perform this aggregation for datasets
+        # with multiple robots of different ranges. Instead we should have one normalization
+        # per robot.
+        self.stats = aggregate_stats([dataset.meta.stats for dataset in self._datasets])

    @property
    def repo_id_to_index(self):
--- a/lerobot/common/datasets/utils.py
+++ b/lerobot/common/datasets/utils.py
@@ -43,6 +43,7 @@ DEFAULT_CHUNK_SIZE = 1000  # Max number of episodes per chunk
 INFO_PATH = "meta/info.json"
 EPISODES_PATH = "meta/episodes.jsonl"
 STATS_PATH = "meta/stats.json"
+EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
 TASKS_PATH = "meta/tasks.jsonl"

 DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
@@ -113,7 +114,16 @@ def get_nested_item(obj: DictLike, flattened_key: str, sep: str = "/") -> Any:


 def serialize_dict(stats: dict[str, torch.Tensor | np.ndarray | dict]) -> dict:
-    serialized_dict = {key: value.tolist() for key, value in flatten_dict(stats).items()}
+    """Convert every leaf of a (possibly nested) dict into plain Python values.
+
+    Tensors and arrays become lists, numpy scalars become Python scalars, and ints/floats
+    are kept as is. The dict is flattened for the conversion and unflattened afterwards.
+
+    Raises:
+        NotImplementedError: If a leaf value has any other type.
+    """
+    serialized_dict = {}
+    for key, value in flatten_dict(stats).items():
+        if isinstance(value, (torch.Tensor, np.ndarray)):
+            serialized_dict[key] = value.tolist()
+        elif isinstance(value, np.generic):
+            serialized_dict[key] = value.item()
+        elif isinstance(value, (int, float)):
+            serialized_dict[key] = value
+        else:
+            raise NotImplementedError(f"The value '{value}' of type '{type(value)}' is not supported.")
    return unflatten_dict(serialized_dict)


@@ -154,6 +164,10 @@ def append_jsonlines(data: dict, fpath: Path) -> None:
        writer.write(data)


+def write_info(info: dict, local_dir: Path):
+    """Write `info` as JSON to `INFO_PATH` under `local_dir`."""
+    info_path = local_dir / INFO_PATH
+    write_json(info, info_path)
+
+
 def load_info(local_dir: Path) -> dict:
    info = load_json(local_dir / INFO_PATH)
    for ft in info["features"].values():
@@ -161,12 +175,29 @@ def load_info(local_dir: Path) -> dict:
    return info


-def load_stats(local_dir: Path) -> dict:
+def write_stats(stats: dict, local_dir: Path):
+    """Serialize `stats` with `serialize_dict` and write it as JSON to `STATS_PATH` under `local_dir`."""
+    write_json(serialize_dict(stats), local_dir / STATS_PATH)
+
+
+def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:
+    """Return a copy of the nested `stats` dict with every leaf converted via `np.array`."""
+    flat_arrays = {}
+    for flat_key, value in flatten_dict(stats).items():
+        flat_arrays[flat_key] = np.array(value)
+    return unflatten_dict(flat_arrays)
+
+
+def load_stats(local_dir: Path) -> dict[str, dict[str, np.ndarray]]:
+    """Load the JSON file at `STATS_PATH` under `local_dir` and cast its leaves to numpy arrays.
+
+    NOTE: despite the annotated return type, this returns None when the file does not exist.
+    """
    if not (local_dir / STATS_PATH).exists():
        return None
    stats = load_json(local_dir / STATS_PATH)
-    stats = {key: torch.tensor(value) for key, value in flatten_dict(stats).items()}
-    return unflatten_dict(stats)
+    return cast_stats_to_numpy(stats)
+
+
+def write_task(task_index: int, task: dict, local_dir: Path):
+    """Append one record holding `task_index` and `task` to `TASKS_PATH` under `local_dir`."""
+    record = {"task_index": task_index, "task": task}
+    append_jsonlines(record, local_dir / TASKS_PATH)


 def load_tasks(local_dir: Path) -> dict:
@@ -176,16 +207,42 @@ def load_tasks(local_dir: Path) -> dict:
    return tasks, task_to_task_index


+def write_episode(episode: dict, local_dir: Path):
+    """Append the `episode` record to `EPISODES_PATH` under `local_dir`."""
+    episodes_path = local_dir / EPISODES_PATH
+    append_jsonlines(episode, episodes_path)
+
+
 def load_episodes(local_dir: Path) -> dict:
-    return load_jsonlines(local_dir / EPISODES_PATH)
+    """Load the episode records at `EPISODES_PATH` into a dict keyed by "episode_index".
+
+    Records are inserted in increasing "episode_index" order.
+    """
+    episodes = load_jsonlines(local_dir / EPISODES_PATH)
+    return {item["episode_index"]: item for item in sorted(episodes, key=lambda x: x["episode_index"])}


-def load_image_as_numpy(fpath: str | Path, dtype="float32", channel_first: bool = True) -> np.ndarray:
+def write_episode_stats(episode_index: int, episode_stats: dict, local_dir: Path):
+    """Append the serialized stats of one episode to `EPISODES_STATS_PATH` under `local_dir`."""
+    # The stats are nested under a "stats" key so that the record's top-level
+    # "episode_index" entry stays a plain integer rather than a dictionary of stats.
+    record = {"episode_index": episode_index, "stats": serialize_dict(episode_stats)}
+    append_jsonlines(record, local_dir / EPISODES_STATS_PATH)
+
+
+def load_episodes_stats(local_dir: Path) -> dict:
+    """Load per-episode stats from `EPISODES_STATS_PATH` into a dict keyed by "episode_index".
+
+    Records are inserted in increasing "episode_index" order and each record's "stats"
+    entry is converted with `cast_stats_to_numpy`.
+    """
+    records = sorted(
+        load_jsonlines(local_dir / EPISODES_STATS_PATH), key=lambda record: record["episode_index"]
+    )
+    stats_by_episode = {}
+    for record in records:
+        stats_by_episode[record["episode_index"]] = cast_stats_to_numpy(record["stats"])
+    return stats_by_episode
+
+
+def backward_compatible_episodes_stats(stats, episodes: list[int]) -> dict[str, dict[str, np.ndarray]]:
+    """Map every episode index in `episodes` to the same `stats` object.
+
+    The values are not copied: all keys share one `stats` instance.
+    """
+    return dict.fromkeys(episodes, stats)
+
+
+def load_image_as_numpy(
+    fpath: str | Path, dtype: np.dtype = np.float32, channel_first: bool = True
+) -> np.ndarray:
+    """Open the image at `fpath`, convert it to RGB and return it as an array of `dtype`.
+
+    With `channel_first=True` the array is transposed from (H, W, C) to (C, H, W).
+    For floating `dtype`s the values are divided by 255; other dtypes are left unscaled.
+    """
    img = PILImage.open(fpath).convert("RGB")
    img_array = np.array(img, dtype=dtype)
    if channel_first:  # (H, W, C) -> (C, H, W)
        img_array = np.transpose(img_array, (2, 0, 1))
-    if "float" in dtype:
+    if np.issubdtype(dtype, np.floating):
        img_array /= 255.0
    return img_array

@@ -370,9 +427,9 @@ def create_empty_dataset_info(


 def get_episode_data_index(
-    episode_dicts: list[dict], episodes: list[int] | None = None
+    episode_dicts: dict[dict], episodes: list[int] | None = None
 ) -> dict[str, torch.Tensor]:
-    episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in enumerate(episode_dicts)}
+    episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in episode_dicts.items()}
    if episodes is not None:
        episode_lengths = {ep_idx: episode_lengths[ep_idx] for ep_idx in episodes}

--- a/lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py
+++ b/lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py
@@ -0,0 +1,87 @@
+"""
+This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.0 to
+2.1. It performs the following:
+
+- Generates per-episodes stats and writes them in `episodes_stats.jsonl`
+- Removes the deprecated `stats.json` (only if `--delete-old-stats` is set)
+- Updates codebase_version in `info.json`
+
+Usage:
+
+```bash
+python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py \
+    --repo-id=aliberts/koch_tutorial
+```
+
+"""
+# TODO(rcadene, aliberts): ensure this script works for any other changes for the final v2.1
+
+import argparse
+
+from huggingface_hub import HfApi
+
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
+from lerobot.common.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info
+from lerobot.common.datasets.v21.convert_stats import check_aggregate_stats, convert_stats
+
+
+def main(
+    repo_id: str,
+    test_branch: str | None = None,
+    delete_old_stats: bool = False,
+    num_workers: int = 4,
+):
+    dataset = LeRobotDataset(repo_id)
+    if (dataset.root / EPISODES_STATS_PATH).is_file():  # refuse to run on an already-converted dataset
+        raise FileExistsError("episodes_stats.jsonl already exists.")
+
+    convert_stats(dataset, num_workers=num_workers)
+    ref_stats = load_stats(dataset.root)
+    check_aggregate_stats(dataset, ref_stats)  # aggregated per-episode stats must match the old stats
+
+    dataset.meta.info["codebase_version"] = CODEBASE_VERSION
+    write_info(dataset.meta.info, dataset.root)
+
+    dataset.push_to_hub(branch=test_branch, create_card=False, allow_patterns="meta/")
+
+    if delete_old_stats:
+        if (dataset.root / STATS_PATH).is_file():  # must be called: the bare bound method is always truthy
+            (dataset.root / STATS_PATH).unlink()
+        hub_api = HfApi()
+        if hub_api.file_exists(
+            STATS_PATH, repo_id=dataset.repo_id, revision=test_branch, repo_type="dataset"
+        ):
+            hub_api.delete_file(
+                STATS_PATH, repo_id=dataset.repo_id, revision=test_branch, repo_type="dataset"
+            )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--repo-id",
+        type=str,
+        required=True,
+        help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset (e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
+    )
+    parser.add_argument(
+        "--test-branch",
+        type=str,
+        default=None,
+        help="Repo branch to test your conversion first (e.g. 'v2.0.test')",
+    )
+    parser.add_argument(
+        "--delete-old-stats",
+        action="store_true",  # a real flag: `type=bool` would turn any non-empty string (even "False") into True
+        default=False,
+        help="Delete the deprecated `stats.json`",
+    )
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=4,
+        help="Number of workers for parallelizing compute",
+    )
+
+    args = parser.parse_args()
+    main(**vars(args))  # argparse dest names (repo_id, test_branch, ...) match main()'s keyword parameters
--- a/lerobot/common/datasets/v21/convert_stats.py
+++ b/lerobot/common/datasets/v21/convert_stats.py
@@ -0,0 +1,85 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import numpy as np
+from tqdm import tqdm
+
+from lerobot.common.datasets.compute_stats import aggregate_stats, get_feature_stats, sample_indices
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.utils import write_episode_stats
+
+
+def sample_episode_video_frames(dataset: LeRobotDataset, episode_index: int, ft_key: str) -> np.ndarray:
+    ep_len = dataset.meta.episodes[episode_index]["length"]
+    sampled_indices = sample_indices(ep_len)
+    query_timestamps = dataset._get_query_timestamps(0.0, {ft_key: sampled_indices})
+    video_frames = dataset._query_videos(query_timestamps, episode_index)
+    return video_frames[ft_key].numpy()
+
+
+def convert_episode_stats(dataset: LeRobotDataset, ep_idx: int):
+    ep_start_idx = dataset.episode_data_index["from"][ep_idx]
+    ep_end_idx = dataset.episode_data_index["to"][ep_idx]
+    ep_data = dataset.hf_dataset.select(range(ep_start_idx, ep_end_idx))
+
+    ep_stats = {}
+    for key, ft in dataset.features.items():
+        if ft["dtype"] == "video":
+            # We sample only for videos
+            ep_ft_data = sample_episode_video_frames(dataset, ep_idx, key)
+        else:
+            ep_ft_data = np.array(ep_data[key])
+
+        axes_to_reduce = (0, 2, 3) if ft["dtype"] in ["image", "video"] else 0
+        keepdims = True if ft["dtype"] in ["image", "video"] else ep_ft_data.ndim == 1
+        ep_stats[key] = get_feature_stats(ep_ft_data, axis=axes_to_reduce, keepdims=keepdims)
+
+        if ft["dtype"] in ["image", "video"]:  # remove batch dim
+            ep_stats[key] = {
+                k: v if k == "count" else np.squeeze(v, axis=0) for k, v in ep_stats[key].items()
+            }
+
+    dataset.meta.episodes_stats[ep_idx] = ep_stats
+
+
+def convert_stats(dataset: LeRobotDataset, num_workers: int = 0):
+    assert dataset.episodes is None
+    print("Computing episodes stats")
+    total_episodes = dataset.meta.total_episodes
+    if num_workers > 0:
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = {
+                executor.submit(convert_episode_stats, dataset, ep_idx): ep_idx
+                for ep_idx in range(total_episodes)
+            }
+            for future in tqdm(as_completed(futures), total=total_episodes):
+                future.result()
+    else:
+        for ep_idx in tqdm(range(total_episodes)):
+            convert_episode_stats(dataset, ep_idx)
+
+    for ep_idx in tqdm(range(total_episodes)):
+        write_episode_stats(ep_idx, dataset.meta.episodes_stats[ep_idx], dataset.root)
+
+
+def check_aggregate_stats(
+    dataset: LeRobotDataset,
+    reference_stats: dict[str, dict[str, np.ndarray]],
+    video_rtol_atol: tuple[float, float] = (1e-2, 1e-2),
+    default_rtol_atol: tuple[float, float] = (5e-6, 0.0),
+) -> None:
+    """Verifies that the aggregated stats from episodes_stats are close to reference stats."""
+    agg_stats = aggregate_stats(list(dataset.meta.episodes_stats.values()))
+    for key, ft in dataset.features.items():
+        # These values might need some fine-tuning
+        if ft["dtype"] == "video":
+            # looser tolerances to account for image sub-sampling
+            rtol, atol = video_rtol_atol
+        else:
+            rtol, atol = default_rtol_atol
+
+        for stat, val in agg_stats[key].items():
+            if key in reference_stats and stat in reference_stats[key]:  # only compare stats present in both
+                err_msg = f"feature='{key}' stats='{stat}'"
+                np.testing.assert_allclose(
+                    val, reference_stats[key][stat], rtol=rtol, atol=atol, err_msg=err_msg
+                )
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -69,8 +69,8 @@ def decode_video_frames_torchvision(

    # set the first and last requested timestamps
    # Note: previous timestamps are usually loaded, since we need to access the previous key frame
-    first_ts = timestamps[0]
-    last_ts = timestamps[-1]
+    first_ts = min(timestamps)
+    last_ts = max(timestamps)

    # access closest key frame of the first requested frame
    # Note: closest key frame timestamp is usally smaller than `first_ts` (e.g. key frame can be the first frame of the video)