Per-episode stats (#521)

Co-authored-by: Remi Cadene <re.cadene@gmail.com> Co-authored-by: Remi <remi.cadene@huggingface.co>
2025-02-15 15:47:16 +01:00
parent 7c2bbee613
commit 8426c64f42
19 changed files with 906 additions and 798 deletions
--- a/tests/fixtures/dataset_factories.py
+++ b/tests/fixtures/dataset_factories.py
@@ -29,7 +29,7 @@ from tests.fixtures.constants import (


 def get_task_index(task_dicts: dict, task: str) -> int:
-    tasks = {d["task_index"]: d["task"] for d in task_dicts}
+    tasks = {d["task_index"]: d["task"] for d in task_dicts.values()}
    task_to_task_index = {task: task_idx for task_idx, task in tasks.items()}
    return task_to_task_index[task]

@@ -142,6 +142,7 @@ def stats_factory():
                    "mean": np.full((3, 1, 1), 0.5, dtype=np.float32).tolist(),
                    "min": np.full((3, 1, 1), 0, dtype=np.float32).tolist(),
                    "std": np.full((3, 1, 1), 0.25, dtype=np.float32).tolist(),
+                    "count": [10],
                }
            else:
                stats[key] = {
@@ -149,20 +150,38 @@ def stats_factory():
                    "mean": np.full(shape, 0.5, dtype=dtype).tolist(),
                    "min": np.full(shape, 0, dtype=dtype).tolist(),
                    "std": np.full(shape, 0.25, dtype=dtype).tolist(),
+                    "count": [10],
                }
        return stats

    return _create_stats


+@pytest.fixture(scope="session")
+def episodes_stats_factory(stats_factory):
+    """Session-scoped factory fixture that builds per-episode stats keyed by episode index."""
+
+    def _create_episodes_stats(
+        # NOTE(review): `dict[str]` carries a single type argument; `dict` takes a key and a value type.
+        features: dict[str],
+        total_episodes: int = 3,
+    ) -> dict:
+        """Return `{episode_index: {"episode_index": ..., "stats": ...}}` for `total_episodes` episodes.
+
+        `features` is forwarded unchanged to `stats_factory` for every episode.
+        """
+        episodes_stats = {}
+        for episode_index in range(total_episodes):
+            # The index is stored both as the dict key and inside the entry itself.
+            episodes_stats[episode_index] = {
+                "episode_index": episode_index,
+                "stats": stats_factory(features),
+            }
+        return episodes_stats
+
+    return _create_episodes_stats
+
+
@pytest.fixture(scope="session")
 def tasks_factory():
    def _create_tasks(total_tasks: int = 3) -> int:
-        tasks_list = []
-        for i in range(total_tasks):
-            task_dict = {"task_index": i, "task": f"Perform action {i}."}
-            tasks_list.append(task_dict)
-        return tasks_list
+        tasks = {}
+        for task_index in range(total_tasks):
+            task_dict = {"task_index": task_index, "task": f"Perform action {task_index}."}
+            tasks[task_index] = task_dict
+        return tasks

    return _create_tasks

@@ -191,10 +210,10 @@ def episodes_factory(tasks_factory):
        # Generate random lengths that sum up to total_length
        lengths = np.random.multinomial(total_frames, [1 / total_episodes] * total_episodes).tolist()

-        tasks_list = [task_dict["task"] for task_dict in tasks]
+        tasks_list = [task_dict["task"] for task_dict in tasks.values()]
        num_tasks_available = len(tasks_list)

-        episodes_list = []
+        episodes = {}
        remaining_tasks = tasks_list.copy()
        for ep_idx in range(total_episodes):
            num_tasks_in_episode = random.randint(1, min(3, num_tasks_available)) if multi_task else 1
@@ -204,15 +223,13 @@ def episodes_factory(tasks_factory):
                for task in episode_tasks:
                    remaining_tasks.remove(task)

-            episodes_list.append(
-                {
-                    "episode_index": ep_idx,
-                    "tasks": episode_tasks,
-                    "length": lengths[ep_idx],
-                }
-            )
+            episodes[ep_idx] = {
+                "episode_index": ep_idx,
+                "tasks": episode_tasks,
+                "length": lengths[ep_idx],
+            }

-        return episodes_list
+        return episodes

    return _create_episodes

@@ -236,7 +253,7 @@ def hf_dataset_factory(features_factory, tasks_factory, episodes_factory, img_ar
        frame_index_col = np.array([], dtype=np.int64)
        episode_index_col = np.array([], dtype=np.int64)
        task_index = np.array([], dtype=np.int64)
-        for ep_dict in episodes:
+        for ep_dict in episodes.values():
            timestamp_col = np.concatenate((timestamp_col, np.arange(ep_dict["length"]) / fps))
            frame_index_col = np.concatenate((frame_index_col, np.arange(ep_dict["length"], dtype=int)))
            episode_index_col = np.concatenate(
@@ -279,6 +296,7 @@ def hf_dataset_factory(features_factory, tasks_factory, episodes_factory, img_ar
 def lerobot_dataset_metadata_factory(
    info_factory,
    stats_factory,
+    episodes_stats_factory,
    tasks_factory,
    episodes_factory,
    mock_snapshot_download_factory,
@@ -288,6 +306,7 @@ def lerobot_dataset_metadata_factory(
        repo_id: str = DUMMY_REPO_ID,
        info: dict | None = None,
        stats: dict | None = None,
+        episodes_stats: list[dict] | None = None,
        tasks: list[dict] | None = None,
        episodes: list[dict] | None = None,
        local_files_only: bool = False,
@@ -296,6 +315,10 @@ def lerobot_dataset_metadata_factory(
            info = info_factory()
        if not stats:
            stats = stats_factory(features=info["features"])
+        if not episodes_stats:
+            episodes_stats = episodes_stats_factory(
+                features=info["features"], total_episodes=info["total_episodes"]
+            )
        if not tasks:
            tasks = tasks_factory(total_tasks=info["total_tasks"])
        if not episodes:
@@ -306,6 +329,7 @@ def lerobot_dataset_metadata_factory(
        mock_snapshot_download = mock_snapshot_download_factory(
            info=info,
            stats=stats,
+            episodes_stats=episodes_stats,
            tasks=tasks,
            episodes=episodes,
        )
@@ -329,6 +353,7 @@ def lerobot_dataset_metadata_factory(
 def lerobot_dataset_factory(
    info_factory,
    stats_factory,
+    episodes_stats_factory,
    tasks_factory,
    episodes_factory,
    hf_dataset_factory,
@@ -344,6 +369,7 @@ def lerobot_dataset_factory(
        multi_task: bool = False,
        info: dict | None = None,
        stats: dict | None = None,
+        episodes_stats: list[dict] | None = None,
        tasks: list[dict] | None = None,
        episode_dicts: list[dict] | None = None,
        hf_dataset: datasets.Dataset | None = None,
@@ -355,6 +381,8 @@ def lerobot_dataset_factory(
            )
        if not stats:
            stats = stats_factory(features=info["features"])
+        if not episodes_stats:
+            episodes_stats = episodes_stats_factory(features=info["features"], total_episodes=total_episodes)
        if not tasks:
            tasks = tasks_factory(total_tasks=info["total_tasks"])
        if not episode_dicts:
@@ -370,6 +398,7 @@ def lerobot_dataset_factory(
        mock_snapshot_download = mock_snapshot_download_factory(
            info=info,
            stats=stats,
+            episodes_stats=episodes_stats,
            tasks=tasks,
            episodes=episode_dicts,
            hf_dataset=hf_dataset,
@@ -379,6 +408,7 @@ def lerobot_dataset_factory(
            repo_id=repo_id,
            info=info,
            stats=stats,
+            episodes_stats=episodes_stats,
            tasks=tasks,
            episodes=episode_dicts,
            local_files_only=kwargs.get("local_files_only", False),
@@ -406,7 +436,7 @@ def empty_lerobot_dataset_factory():
        robot: Robot | None = None,
        robot_type: str | None = None,
        features: dict | None = None,
-    ):
+    ) -> LeRobotDataset:
        return LeRobotDataset.create(
            repo_id=repo_id, fps=fps, root=root, robot=robot, robot_type=robot_type, features=features
        )
--- a/tests/fixtures/files.py
+++ b/tests/fixtures/files.py
@@ -7,7 +7,13 @@ import pyarrow.compute as pc
 import pyarrow.parquet as pq
 import pytest

-from lerobot.common.datasets.utils import EPISODES_PATH, INFO_PATH, STATS_PATH, TASKS_PATH
+from lerobot.common.datasets.utils import (
+    EPISODES_PATH,
+    EPISODES_STATS_PATH,
+    INFO_PATH,
+    STATS_PATH,
+    TASKS_PATH,
+)


@pytest.fixture(scope="session")
@@ -38,6 +44,20 @@ def stats_path(stats_factory):
    return _create_stats_json_file


+@pytest.fixture(scope="session")
+def episodes_stats_path(episodes_stats_factory):
+    """Session-scoped factory fixture that writes per-episode stats to a JSON Lines file."""
+
+    def _create_episodes_stats_jsonl_file(dir: Path, episodes_stats: list[dict] | None = None) -> Path:
+        """Write one JSON line per value of `episodes_stats` to `dir / EPISODES_STATS_PATH`; return that path.
+
+        NOTE(review): despite the `list[dict]` annotation, `.values()` is called on `episodes_stats` below,
+        so a mapping is what actually works here.
+        """
+        if not episodes_stats:
+            # NOTE(review): `episodes_stats_factory` (tests/fixtures/dataset_factories.py in this commit)
+            # declares `features` without a default, so this argument-less fallback looks like it would
+            # raise TypeError -- confirm, and pass features if the fallback is meant to be usable.
+            episodes_stats = episodes_stats_factory()
+        fpath = dir / EPISODES_STATS_PATH
+        # Create the parent directories on demand before opening the file for writing.
+        fpath.parent.mkdir(parents=True, exist_ok=True)
+        with jsonlines.open(fpath, "w") as writer:
+            writer.write_all(episodes_stats.values())
+        return fpath
+
+    return _create_episodes_stats_jsonl_file
+
+
@pytest.fixture(scope="session")
 def tasks_path(tasks_factory):
    def _create_tasks_jsonl_file(dir: Path, tasks: list | None = None) -> Path:
@@ -46,7 +66,7 @@ def tasks_path(tasks_factory):
        fpath = dir / TASKS_PATH
        fpath.parent.mkdir(parents=True, exist_ok=True)
        with jsonlines.open(fpath, "w") as writer:
-            writer.write_all(tasks)
+            writer.write_all(tasks.values())
        return fpath

    return _create_tasks_jsonl_file
@@ -60,7 +80,7 @@ def episode_path(episodes_factory):
        fpath = dir / EPISODES_PATH
        fpath.parent.mkdir(parents=True, exist_ok=True)
        with jsonlines.open(fpath, "w") as writer:
-            writer.write_all(episodes)
+            writer.write_all(episodes.values())
        return fpath

    return _create_episodes_jsonl_file
--- a/tests/fixtures/hub.py
+++ b/tests/fixtures/hub.py
@@ -4,7 +4,13 @@ import datasets
 import pytest
 from huggingface_hub.utils import filter_repo_objects

-from lerobot.common.datasets.utils import EPISODES_PATH, INFO_PATH, STATS_PATH, TASKS_PATH
+from lerobot.common.datasets.utils import (
+    EPISODES_PATH,
+    EPISODES_STATS_PATH,
+    INFO_PATH,
+    STATS_PATH,
+    TASKS_PATH,
+)
 from tests.fixtures.constants import LEROBOT_TEST_DIR


@@ -14,6 +20,8 @@ def mock_snapshot_download_factory(
    info_path,
    stats_factory,
    stats_path,
+    episodes_stats_factory,
+    episodes_stats_path,
    tasks_factory,
    tasks_path,
    episodes_factory,
@@ -29,6 +37,7 @@ def mock_snapshot_download_factory(
    def _mock_snapshot_download_func(
        info: dict | None = None,
        stats: dict | None = None,
+        episodes_stats: list[dict] | None = None,
        tasks: list[dict] | None = None,
        episodes: list[dict] | None = None,
        hf_dataset: datasets.Dataset | None = None,
@@ -37,6 +46,10 @@ def mock_snapshot_download_factory(
            info = info_factory()
        if not stats:
            stats = stats_factory(features=info["features"])
+        if not episodes_stats:
+            episodes_stats = episodes_stats_factory(
+                features=info["features"], total_episodes=info["total_episodes"]
+            )
        if not tasks:
            tasks = tasks_factory(total_tasks=info["total_tasks"])
        if not episodes:
@@ -67,11 +80,11 @@ def mock_snapshot_download_factory(

            # List all possible files
            all_files = []
-            meta_files = [INFO_PATH, STATS_PATH, TASKS_PATH, EPISODES_PATH]
+            meta_files = [INFO_PATH, STATS_PATH, EPISODES_STATS_PATH, TASKS_PATH, EPISODES_PATH]
            all_files.extend(meta_files)

            data_files = []
-            for episode_dict in episodes:
+            for episode_dict in episodes.values():
                ep_idx = episode_dict["episode_index"]
                ep_chunk = ep_idx // info["chunks_size"]
                data_path = info["data_path"].format(episode_chunk=ep_chunk, episode_index=ep_idx)
@@ -92,6 +105,8 @@ def mock_snapshot_download_factory(
                    _ = info_path(local_dir, info)
                elif rel_path == STATS_PATH:
                    _ = stats_path(local_dir, stats)
+                elif rel_path == EPISODES_STATS_PATH:
+                    _ = episodes_stats_path(local_dir, episodes_stats)
                elif rel_path == TASKS_PATH:
                    _ = tasks_path(local_dir, tasks)
                elif rel_path == EPISODES_PATH:
--- a/tests/test_cameras.py
+++ b/tests/test_cameras.py
@@ -182,7 +182,7 @@ def test_camera(request, camera_type, mock):

@pytest.mark.parametrize("camera_type, mock", TEST_CAMERA_TYPES)
@require_camera
-def test_save_images_from_cameras(tmpdir, request, camera_type, mock):
+def test_save_images_from_cameras(tmp_path, request, camera_type, mock):
    # TODO(rcadene): refactor
    if camera_type == "opencv":
        from lerobot.common.robot_devices.cameras.opencv import save_images_from_cameras
@@ -190,4 +190,4 @@ def test_save_images_from_cameras(tmpdir, request, camera_type, mock):
        from lerobot.common.robot_devices.cameras.intelrealsense import save_images_from_cameras

    # Small `record_time_s` to speedup unit tests
-    save_images_from_cameras(tmpdir, record_time_s=0.02, mock=mock)
+    save_images_from_cameras(tmp_path, record_time_s=0.02, mock=mock)
--- a/tests/test_compute_stats.py
+++ b/tests/test_compute_stats.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+
+from lerobot.common.datasets.compute_stats import (
+    _assert_type_and_shape,
+    aggregate_feature_stats,
+    aggregate_stats,
+    compute_episode_stats,
+    estimate_num_samples,
+    get_feature_stats,
+    sample_images,
+    sample_indices,
+)
+
+
+def mock_load_image_as_numpy(path, dtype, channel_first):
+    """Test double for `load_image_as_numpy`: ignores `path` and returns an all-ones array of `dtype`.
+
+    The shape is (3, 32, 32) when `channel_first` is truthy, otherwise (32, 32, 3).
+    """
+    return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype)
+
+
+@pytest.fixture
+def sample_array():
+    """3x3 integer array holding 1..9 in row-major order, used by the `get_feature_stats` tests."""
+    return np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+
+
+def test_estimate_num_samples():
+    """Pin the output of `estimate_num_samples` for a spread of dataset lengths."""
+    # Lengths up to 100 are expected back unchanged.
+    assert estimate_num_samples(1) == 1
+    assert estimate_num_samples(10) == 10
+    assert estimate_num_samples(100) == 100
+    # From here on the expected count grows more slowly than the length (200 still maps to 100).
+    assert estimate_num_samples(200) == 100
+    assert estimate_num_samples(1000) == 177
+    assert estimate_num_samples(2000) == 299
+    assert estimate_num_samples(5000) == 594
+    assert estimate_num_samples(10_000) == 1000
+    assert estimate_num_samples(20_000) == 1681
+    assert estimate_num_samples(50_000) == 3343
+    # Largest case checked; presumably 10_000 is an upper cap -- confirm in compute_stats.
+    assert estimate_num_samples(500_000) == 10_000
+
+
+def test_sample_indices():
+    """`sample_indices(10)` must start at 0, end at 9 and hold `estimate_num_samples(10)` entries."""
+    indices = sample_indices(10)
+    assert len(indices) > 0
+    # First and last index of the length-10 range must both be part of the sample.
+    assert indices[0] == 0
+    assert indices[-1] == 9
+    assert len(indices) == estimate_num_samples(10)
+
+
+@patch("lerobot.common.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy)
+def test_sample_images(mock_load):
+    """`sample_images` over 100 fake paths must return a uint8 array of (3, 32, 32) images.
+
+    `load_image_as_numpy` in `compute_stats` is patched with `mock_load_image_as_numpy`; `mock_load` is the
+    mock object injected by `@patch` and is not inspected here.
+    """
+    image_paths = [f"image_{i}.jpg" for i in range(100)]
+    images = sample_images(image_paths)
+    assert isinstance(images, np.ndarray)
+    # Axis 0 indexes the sampled images; the remaining axes are the mock's channel-first image shape.
+    assert images.shape[1:] == (3, 32, 32)
+    assert images.dtype == np.uint8
+    assert len(images) == estimate_num_samples(100)
+
+
+def test_get_feature_stats_images():
+    """Reducing a random (100, 3, 32, 32) array over axes (0, 2, 3) must yield all five stat keys."""
+    data = np.random.rand(100, 3, 32, 32)
+    stats = get_feature_stats(data, axis=(0, 2, 3), keepdims=True)
+    assert "min" in stats and "max" in stats and "mean" in stats and "std" in stats and "count" in stats
+    # count is expected to equal the length of axis 0.
+    np.testing.assert_equal(stats["count"], np.array([100]))
+    # Only the shapes of min/max/mean/std are compared with each other; their values are not checked.
+    assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
+
+
+def test_get_feature_stats_axis_0_keepdims(sample_array):
+    """Stats of the 3x3 sample reduced over axis 0 with `keepdims=True` (expected arrays are 1x3)."""
+    expected = {
+        "min": np.array([[1, 2, 3]]),
+        "max": np.array([[7, 8, 9]]),
+        "mean": np.array([[4.0, 5.0, 6.0]]),
+        # Population std (ddof=0): each column is {m-3, m, m+3}, so std = sqrt(6).
+        "std": np.array([[2.44948974, 2.44948974, 2.44948974]]),
+        "count": np.array([3]),
+    }
+    result = get_feature_stats(sample_array, axis=(0,), keepdims=True)
+    for key in expected:
+        np.testing.assert_allclose(result[key], expected[key])
+
+
+def test_get_feature_stats_axis_1(sample_array):
+    """Stats of the 3x3 sample reduced over axis 1 without `keepdims` (expected arrays are 1-D)."""
+    expected = {
+        "min": np.array([1, 4, 7]),
+        "max": np.array([3, 6, 9]),
+        "mean": np.array([2.0, 5.0, 8.0]),
+        # Population std (ddof=0): each row is {m-1, m, m+1}, so std = sqrt(2/3).
+        "std": np.array([0.81649658, 0.81649658, 0.81649658]),
+        # The sample is square, so a count of 3 cannot tell axis 0 and axis 1 apart in this test.
+        "count": np.array([3]),
+    }
+    result = get_feature_stats(sample_array, axis=(1,), keepdims=False)
+    for key in expected:
+        np.testing.assert_allclose(result[key], expected[key])
+
+
+def test_get_feature_stats_no_axis(sample_array):
+    """Stats of the 3x3 sample reduced over every element (`axis=None`) give scalar results."""
+    expected = {
+        "min": np.array(1),
+        "max": np.array(9),
+        "mean": np.array(5.0),
+        # Population std (ddof=0) of 1..9: sqrt(60 / 9).
+        "std": np.array(2.5819889),
+        # count is expected to stay 3 (not 9) even though all nine values are reduced.
+        "count": np.array([3]),
+    }
+    result = get_feature_stats(sample_array, axis=None, keepdims=False)
+    for key in expected:
+        np.testing.assert_allclose(result[key], expected[key])
+
+
+def test_get_feature_stats_empty_array():
+    """`get_feature_stats` on an empty array is expected to raise `ValueError`."""
+    array = np.array([])
+    with pytest.raises(ValueError):
+        get_feature_stats(array, axis=(0,), keepdims=True)
+
+
+def test_get_feature_stats_single_value():
+    """A 1x1 array must give min = max = mean = 1337, std = 0 and count = [1]."""
+    array = np.array([[1337]])
+    result = get_feature_stats(array, axis=None, keepdims=True)
+    np.testing.assert_equal(result["min"], np.array(1337))
+    np.testing.assert_equal(result["max"], np.array(1337))
+    np.testing.assert_equal(result["mean"], np.array(1337.0))
+    # A single value has no spread.
+    np.testing.assert_equal(result["std"], np.array(0.0))
+    np.testing.assert_equal(result["count"], np.array([1]))
+
+
+def test_compute_episode_stats():
+    """`compute_episode_stats` on one image feature and one numeric feature of 100 frames each.
+
+    The image feature is given as a list of file-name strings and `load_image_as_numpy` is patched with
+    `mock_load_image_as_numpy`; the numeric feature is a random (100, 10) array.
+    """
+    episode_data = {
+        "observation.image": [f"image_{i}.jpg" for i in range(100)],
+        "observation.state": np.random.rand(100, 10),
+    }
+    features = {
+        "observation.image": {"dtype": "image"},
+        "observation.state": {"dtype": "numeric"},
+    }
+
+    with patch(
+        "lerobot.common.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy
+    ):
+        stats = compute_episode_stats(episode_data, features)
+
+    assert "observation.image" in stats and "observation.state" in stats
+    # Both features are expected to report a count of 100.
+    assert stats["observation.image"]["count"].item() == 100
+    assert stats["observation.state"]["count"].item() == 100
+    # Image mean is expected as one value per channel, shaped (3, 1, 1).
+    assert stats["observation.image"]["mean"].shape == (3, 1, 1)
+
+
+def test_assert_type_and_shape_valid():
+    """`_assert_type_and_shape` must accept stats made of numpy arrays with a one-element `count`.
+
+    The test passes as long as the call does not raise.
+    """
+    valid_stats = [
+        {
+            "feature1": {
+                "min": np.array([1.0]),
+                "max": np.array([10.0]),
+                "mean": np.array([5.0]),
+                "std": np.array([2.0]),
+                "count": np.array([1]),
+            }
+        }
+    ]
+    _assert_type_and_shape(valid_stats)
+
+
+def test_assert_type_and_shape_invalid_type():
+    """A stat given as a plain list instead of a numpy array must raise `ValueError`."""
+    invalid_stats = [
+        {
+            "feature1": {
+                "min": [1.0],  # Not a numpy array
+                "max": np.array([10.0]),
+                "mean": np.array([5.0]),
+                "std": np.array([2.0]),
+                "count": np.array([1]),
+            }
+        }
+    ]
+    # `match` is a regex searched for in the error message.
+    with pytest.raises(ValueError, match="Stats must be composed of numpy array"):
+        _assert_type_and_shape(invalid_stats)
+
+
+def test_assert_type_and_shape_invalid_shape():
+    """A `count` array holding two elements must raise `ValueError`."""
+    invalid_stats = [
+        {
+            "feature1": {
+                "count": np.array([1, 2]),  # Wrong shape
+            }
+        }
+    ]
+    # Parentheses are escaped because `match` is a regex searched for in the error message.
+    with pytest.raises(ValueError, match=r"Shape of 'count' must be \(1\)"):
+        _assert_type_and_shape(invalid_stats)
+
+
+def test_aggregate_feature_stats():
+    """Aggregate two single-feature stats dicts, each with a count of 1, and check the combined result."""
+    stats_ft_list = [
+        {
+            "min": np.array([1.0]),
+            "max": np.array([10.0]),
+            "mean": np.array([5.0]),
+            "std": np.array([2.0]),
+            "count": np.array([1]),
+        },
+        {
+            "min": np.array([2.0]),
+            "max": np.array([12.0]),
+            "mean": np.array([6.0]),
+            "std": np.array([2.5]),
+            "count": np.array([1]),
+        },
+    ]
+    result = aggregate_feature_stats(stats_ft_list)
+    # Expected min / max are the extremes across both inputs.
+    np.testing.assert_allclose(result["min"], np.array([1.0]))
+    np.testing.assert_allclose(result["max"], np.array([12.0]))
+    # Equal counts, so the expected mean is the plain average of 5.0 and 6.0.
+    np.testing.assert_allclose(result["mean"], np.array([5.5]))
+    # Expected std = sqrt(average of std_i**2 + (mean_i - 5.5)**2) = sqrt((4.25 + 6.5) / 2) = sqrt(5.375).
+    np.testing.assert_allclose(result["std"], np.array([2.318405]), atol=1e-6)
+    # Expected count is the sum of the input counts.
+    np.testing.assert_allclose(result["count"], np.array([2]))
+
+
+def test_aggregate_stats():
+    """Aggregate the stats of two episodes (counts 10 and 15) across several features.
+
+    `observation.image` and `observation.state` appear in both episodes; `extra_key_0` and `extra_key_1`
+    each appear in only one episode and are expected back with their values unchanged.
+    """
+    all_stats = [
+        {
+            "observation.image": {
+                "min": [1, 2, 3],
+                "max": [10, 20, 30],
+                "mean": [5.5, 10.5, 15.5],
+                "std": [2.87, 5.87, 8.87],
+                "count": 10,
+            },
+            "observation.state": {"min": 1, "max": 10, "mean": 5.5, "std": 2.87, "count": 10},
+            "extra_key_0": {"min": 5, "max": 25, "mean": 15, "std": 6, "count": 6},
+        },
+        {
+            "observation.image": {
+                "min": [2, 1, 0],
+                "max": [15, 10, 5],
+                "mean": [8.5, 5.5, 2.5],
+                "std": [3.42, 2.42, 1.42],
+                "count": 15,
+            },
+            "observation.state": {"min": 2, "max": 15, "mean": 8.5, "std": 3.42, "count": 15},
+            "extra_key_1": {"min": 0, "max": 20, "mean": 10, "std": 5, "count": 5},
+        },
+    ]
+
+    # Expected means are count-weighted, e.g. (10 * 5.5 + 15 * 8.5) / 25 = 7.3.
+    # Expected stds pool variance and mean offset, e.g.
+    # sqrt((10 * (2.87**2 + 1.8**2) + 15 * (3.42**2 + 1.2**2)) / 25) ~= 3.5317.
+    expected_agg_stats = {
+        "observation.image": {
+            "min": [1, 1, 0],
+            "max": [15, 20, 30],
+            "mean": [7.3, 7.5, 7.7],
+            "std": [3.5317, 4.8267, 8.5581],
+            "count": 25,
+        },
+        "observation.state": {
+            "min": 1,
+            "max": 15,
+            "mean": 7.3,
+            "std": 3.5317,
+            "count": 25,
+        },
+        "extra_key_0": {
+            "min": 5,
+            "max": 25,
+            "mean": 15.0,
+            "std": 6.0,
+            "count": 6,
+        },
+        "extra_key_1": {
+            "min": 0,
+            "max": 20,
+            "mean": 10.0,
+            "std": 5.0,
+            "count": 5,
+        },
+    }
+
+    # cast the per-episode inputs to numpy: int64 for "count", float32 for everything else
+    for ep_stats in all_stats:
+        for fkey, stats in ep_stats.items():
+            for k in stats:
+                stats[k] = np.array(stats[k], dtype=np.int64 if k == "count" else np.float32)
+                if fkey == "observation.image" and k != "count":
+                    stats[k] = stats[k].reshape(3, 1, 1)  # for normalization on image channels
+                else:
+                    # every other entry, including the image "count", becomes a one-element vector
+                    stats[k] = stats[k].reshape(1)
+
+    # cast the expected values to numpy with the same dtypes and shapes as the inputs
+    for fkey, stats in expected_agg_stats.items():
+        for k in stats:
+            stats[k] = np.array(stats[k], dtype=np.int64 if k == "count" else np.float32)
+            if fkey == "observation.image" and k != "count":
+                stats[k] = stats[k].reshape(3, 1, 1)  # for normalization on image channels
+            else:
+                stats[k] = stats[k].reshape(1)
+
+    results = aggregate_stats(all_stats)
+
+    for fkey in expected_agg_stats:
+        np.testing.assert_allclose(results[fkey]["min"], expected_agg_stats[fkey]["min"])
+        np.testing.assert_allclose(results[fkey]["max"], expected_agg_stats[fkey]["max"])
+        np.testing.assert_allclose(results[fkey]["mean"], expected_agg_stats[fkey]["mean"])
+        # The expected stds are written with four decimals, hence the looser tolerance.
+        np.testing.assert_allclose(
+            results[fkey]["std"], expected_agg_stats[fkey]["std"], atol=1e-04, rtol=1e-04
+        )
+        np.testing.assert_allclose(results[fkey]["count"], expected_agg_stats[fkey]["count"])
--- a/tests/test_control_robot.py
+++ b/tests/test_control_robot.py
@@ -24,7 +24,6 @@ pytest -sx 'tests/test_control_robot.py::test_teleoperate[aloha-True]'
 """

 import multiprocessing
-from pathlib import Path
 from unittest.mock import patch

 import pytest
@@ -45,7 +44,7 @@ from tests.utils import DEVICE, TEST_ROBOT_TYPES, mock_calibration_dir, require_

@pytest.mark.parametrize("robot_type, mock", TEST_ROBOT_TYPES)
@require_robot
-def test_teleoperate(tmpdir, request, robot_type, mock):
+def test_teleoperate(tmp_path, request, robot_type, mock):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    if mock and robot_type != "aloha":
@@ -53,8 +52,7 @@ def test_teleoperate(tmpdir, request, robot_type, mock):

        # Create an empty calibration directory to trigger manual calibration
        # and avoid writing calibration files in user .cache/calibration folder
-        tmpdir = Path(tmpdir)
-        calibration_dir = tmpdir / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir
    else:
@@ -70,15 +68,14 @@ def test_teleoperate(tmpdir, request, robot_type, mock):

@pytest.mark.parametrize("robot_type, mock", TEST_ROBOT_TYPES)
@require_robot
-def test_calibrate(tmpdir, request, robot_type, mock):
+def test_calibrate(tmp_path, request, robot_type, mock):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    if mock:
        request.getfixturevalue("patch_builtins_input")

    # Create an empty calibration directory to trigger manual calibration
-    tmpdir = Path(tmpdir)
-    calibration_dir = tmpdir / robot_type
+    calibration_dir = tmp_path / robot_type
    robot_kwargs["calibration_dir"] = calibration_dir

    robot = make_robot(**robot_kwargs)
@@ -89,7 +86,7 @@ def test_calibrate(tmpdir, request, robot_type, mock):

@pytest.mark.parametrize("robot_type, mock", TEST_ROBOT_TYPES)
@require_robot
-def test_record_without_cameras(tmpdir, request, robot_type, mock):
+def test_record_without_cameras(tmp_path, request, robot_type, mock):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    # Avoid using cameras
@@ -100,7 +97,7 @@ def test_record_without_cameras(tmpdir, request, robot_type, mock):

        # Create an empty calibration directory to trigger manual calibration
        # and avoid writing calibration files in user .cache/calibration folder
-        calibration_dir = Path(tmpdir) / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir
    else:
@@ -108,7 +105,7 @@ def test_record_without_cameras(tmpdir, request, robot_type, mock):
        pass

    repo_id = "lerobot/debug"
-    root = Path(tmpdir) / "data" / repo_id
+    root = tmp_path / "data" / repo_id
    single_task = "Do something."

    robot = make_robot(**robot_kwargs)
@@ -121,7 +118,6 @@ def test_record_without_cameras(tmpdir, request, robot_type, mock):
        episode_time_s=1,
        reset_time_s=0.1,
        num_episodes=2,
-        run_compute_stats=False,
        push_to_hub=False,
        video=False,
        play_sounds=False,
@@ -131,8 +127,7 @@ def test_record_without_cameras(tmpdir, request, robot_type, mock):

@pytest.mark.parametrize("robot_type, mock", TEST_ROBOT_TYPES)
@require_robot
-def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):
-    tmpdir = Path(tmpdir)
+def test_record_and_replay_and_policy(tmp_path, request, robot_type, mock):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    if mock and robot_type != "aloha":
@@ -140,7 +135,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):

        # Create an empty calibration directory to trigger manual calibration
        # and avoid writing calibration files in user .cache/calibration folder
-        calibration_dir = tmpdir / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir
    else:
@@ -148,7 +143,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):
        pass

    repo_id = "lerobot_test/debug"
-    root = tmpdir / "data" / repo_id
+    root = tmp_path / "data" / repo_id
    single_task = "Do something."

    robot = make_robot(**robot_kwargs)
@@ -180,7 +175,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):
    policy_cfg = ACTConfig()
    policy = make_policy(policy_cfg, ds_meta=dataset.meta, device=DEVICE)

-    out_dir = tmpdir / "logger"
+    out_dir = tmp_path / "logger"

    pretrained_policy_path = out_dir / "checkpoints/last/pretrained_model"
    policy.save_pretrained(pretrained_policy_path)
@@ -207,7 +202,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):
        num_image_writer_processes = 0

    eval_repo_id = "lerobot/eval_debug"
-    eval_root = tmpdir / "data" / eval_repo_id
+    eval_root = tmp_path / "data" / eval_repo_id

    rec_eval_cfg = RecordControlConfig(
        repo_id=eval_repo_id,
@@ -218,7 +213,6 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):
        episode_time_s=1,
        reset_time_s=0.1,
        num_episodes=2,
-        run_compute_stats=False,
        push_to_hub=False,
        video=False,
        display_cameras=False,
@@ -240,7 +234,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):

@pytest.mark.parametrize("robot_type, mock", [("koch", True)])
@require_robot
-def test_resume_record(tmpdir, request, robot_type, mock):
+def test_resume_record(tmp_path, request, robot_type, mock):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    if mock and robot_type != "aloha":
@@ -248,7 +242,7 @@ def test_resume_record(tmpdir, request, robot_type, mock):

        # Create an empty calibration directory to trigger manual calibration
        # and avoid writing calibration files in user .cache/calibration folder
-        calibration_dir = tmpdir / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir
    else:
@@ -258,7 +252,7 @@ def test_resume_record(tmpdir, request, robot_type, mock):
    robot = make_robot(**robot_kwargs)

    repo_id = "lerobot/debug"
-    root = Path(tmpdir) / "data" / repo_id
+    root = tmp_path / "data" / repo_id
    single_task = "Do something."

    rec_cfg = RecordControlConfig(
@@ -272,7 +266,6 @@ def test_resume_record(tmpdir, request, robot_type, mock):
        video=False,
        display_cameras=False,
        play_sounds=False,
-        run_compute_stats=False,
        local_files_only=True,
        num_episodes=1,
    )
@@ -291,7 +284,7 @@ def test_resume_record(tmpdir, request, robot_type, mock):

@pytest.mark.parametrize("robot_type, mock", [("koch", True)])
@require_robot
-def test_record_with_event_rerecord_episode(tmpdir, request, robot_type, mock):
+def test_record_with_event_rerecord_episode(tmp_path, request, robot_type, mock):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    if mock and robot_type != "aloha":
@@ -299,7 +292,7 @@ def test_record_with_event_rerecord_episode(tmpdir, request, robot_type, mock):

        # Create an empty calibration directory to trigger manual calibration
        # and avoid writing calibration files in user .cache/calibration folder
-        calibration_dir = tmpdir / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir
    else:
@@ -316,7 +309,7 @@ def test_record_with_event_rerecord_episode(tmpdir, request, robot_type, mock):
        mock_listener.return_value = (None, mock_events)

        repo_id = "lerobot/debug"
-        root = Path(tmpdir) / "data" / repo_id
+        root = tmp_path / "data" / repo_id
        single_task = "Do something."

        rec_cfg = RecordControlConfig(
@@ -331,7 +324,6 @@ def test_record_with_event_rerecord_episode(tmpdir, request, robot_type, mock):
            video=False,
            display_cameras=False,
            play_sounds=False,
-            run_compute_stats=False,
        )
        dataset = record(robot, rec_cfg)

@@ -342,7 +334,7 @@ def test_record_with_event_rerecord_episode(tmpdir, request, robot_type, mock):

@pytest.mark.parametrize("robot_type, mock", [("koch", True)])
@require_robot
-def test_record_with_event_exit_early(tmpdir, request, robot_type, mock):
+def test_record_with_event_exit_early(tmp_path, request, robot_type, mock):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    if mock:
@@ -350,7 +342,7 @@ def test_record_with_event_exit_early(tmpdir, request, robot_type, mock):

        # Create an empty calibration directory to trigger manual calibration
        # and avoid writing calibration files in user .cache/calibration folder
-        calibration_dir = tmpdir / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir
    else:
@@ -367,7 +359,7 @@ def test_record_with_event_exit_early(tmpdir, request, robot_type, mock):
        mock_listener.return_value = (None, mock_events)

        repo_id = "lerobot/debug"
-        root = Path(tmpdir) / "data" / repo_id
+        root = tmp_path / "data" / repo_id
        single_task = "Do something."

        rec_cfg = RecordControlConfig(
@@ -382,7 +374,6 @@ def test_record_with_event_exit_early(tmpdir, request, robot_type, mock):
            video=False,
            display_cameras=False,
            play_sounds=False,
-            run_compute_stats=False,
        )

        dataset = record(robot, rec_cfg)
@@ -395,7 +386,7 @@ def test_record_with_event_exit_early(tmpdir, request, robot_type, mock):
    "robot_type, mock, num_image_writer_processes", [("koch", True, 0), ("koch", True, 1)]
 )
@require_robot
-def test_record_with_event_stop_recording(tmpdir, request, robot_type, mock, num_image_writer_processes):
+def test_record_with_event_stop_recording(tmp_path, request, robot_type, mock, num_image_writer_processes):
    robot_kwargs = {"robot_type": robot_type, "mock": mock}

    if mock:
@@ -403,7 +394,7 @@ def test_record_with_event_stop_recording(tmpdir, request, robot_type, mock, num

        # Create an empty calibration directory to trigger manual calibration
        # and avoid writing calibration files in user .cache/calibration folder
-        calibration_dir = tmpdir / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir
    else:
@@ -420,7 +411,7 @@ def test_record_with_event_stop_recording(tmpdir, request, robot_type, mock, num
        mock_listener.return_value = (None, mock_events)

        repo_id = "lerobot/debug"
-        root = Path(tmpdir) / "data" / repo_id
+        root = tmp_path / "data" / repo_id
        single_task = "Do something."

        rec_cfg = RecordControlConfig(
@@ -436,7 +427,6 @@ def test_record_with_event_stop_recording(tmpdir, request, robot_type, mock, num
            video=False,
            display_cameras=False,
            play_sounds=False,
-            run_compute_stats=False,
            num_image_writer_processes=num_image_writer_processes,
        )

--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -20,21 +20,14 @@ from copy import deepcopy
 from itertools import chain
 from pathlib import Path

-import einops
 import numpy as np
 import pytest
 import torch
-from datasets import Dataset
 from huggingface_hub import HfApi
 from PIL import Image
 from safetensors.torch import load_file

 import lerobot
-from lerobot.common.datasets.compute_stats import (
-    aggregate_stats,
-    compute_stats,
-    get_stats_einops_patterns,
-)
 from lerobot.common.datasets.factory import make_dataset
 from lerobot.common.datasets.image_writer import image_array_to_pil_image
 from lerobot.common.datasets.lerobot_dataset import (
@@ -44,13 +37,11 @@ from lerobot.common.datasets.lerobot_dataset import (
 from lerobot.common.datasets.utils import (
    create_branch,
    flatten_dict,
-    hf_transform_to_torch,
    unflatten_dict,
 )
 from lerobot.common.envs.factory import make_env_config
 from lerobot.common.policies.factory import make_policy_config
 from lerobot.common.robot_devices.robots.utils import make_robot
-from lerobot.common.utils.random_utils import seeded_context
 from lerobot.configs.default import DatasetConfig
 from lerobot.configs.train import TrainPipelineConfig
 from tests.fixtures.constants import DUMMY_CHW, DUMMY_HWC, DUMMY_REPO_ID
@@ -196,12 +187,12 @@ def test_add_frame_wrong_shape_numpy_ndim_0(tmp_path, empty_lerobot_dataset_fact
 def test_add_frame(tmp_path, empty_lerobot_dataset_factory):
    features = {"state": {"dtype": "float32", "shape": (1,), "names": None}}
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
-    dataset.add_frame({"state": torch.randn(1), "task": "dummy"})
+    dataset.add_frame({"state": torch.randn(1), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert len(dataset) == 1
-    assert dataset[0]["task"] == "dummy"
+    assert dataset[0]["task"] == "Dummy task"
    assert dataset[0]["task_index"] == 0
    assert dataset[0]["state"].ndim == 0

@@ -209,9 +200,9 @@ def test_add_frame(tmp_path, empty_lerobot_dataset_factory):
 def test_add_frame_state_1d(tmp_path, empty_lerobot_dataset_factory):
    features = {"state": {"dtype": "float32", "shape": (2,), "names": None}}
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
-    dataset.add_frame({"state": torch.randn(2), "task": "dummy"})
+    dataset.add_frame({"state": torch.randn(2), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["state"].shape == torch.Size([2])

@@ -219,9 +210,9 @@ def test_add_frame_state_1d(tmp_path, empty_lerobot_dataset_factory):
 def test_add_frame_state_2d(tmp_path, empty_lerobot_dataset_factory):
    features = {"state": {"dtype": "float32", "shape": (2, 4), "names": None}}
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
-    dataset.add_frame({"state": torch.randn(2, 4), "task": "dummy"})
+    dataset.add_frame({"state": torch.randn(2, 4), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["state"].shape == torch.Size([2, 4])

@@ -229,9 +220,9 @@ def test_add_frame_state_2d(tmp_path, empty_lerobot_dataset_factory):
 def test_add_frame_state_3d(tmp_path, empty_lerobot_dataset_factory):
    features = {"state": {"dtype": "float32", "shape": (2, 4, 3), "names": None}}
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
-    dataset.add_frame({"state": torch.randn(2, 4, 3), "task": "dummy"})
+    dataset.add_frame({"state": torch.randn(2, 4, 3), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["state"].shape == torch.Size([2, 4, 3])

@@ -239,9 +230,9 @@ def test_add_frame_state_3d(tmp_path, empty_lerobot_dataset_factory):
 def test_add_frame_state_4d(tmp_path, empty_lerobot_dataset_factory):
    features = {"state": {"dtype": "float32", "shape": (2, 4, 3, 5), "names": None}}
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
-    dataset.add_frame({"state": torch.randn(2, 4, 3, 5), "task": "dummy"})
+    dataset.add_frame({"state": torch.randn(2, 4, 3, 5), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["state"].shape == torch.Size([2, 4, 3, 5])

@@ -249,9 +240,9 @@ def test_add_frame_state_4d(tmp_path, empty_lerobot_dataset_factory):
 def test_add_frame_state_5d(tmp_path, empty_lerobot_dataset_factory):
    features = {"state": {"dtype": "float32", "shape": (2, 4, 3, 5, 1), "names": None}}
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
-    dataset.add_frame({"state": torch.randn(2, 4, 3, 5, 1), "task": "dummy"})
+    dataset.add_frame({"state": torch.randn(2, 4, 3, 5, 1), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["state"].shape == torch.Size([2, 4, 3, 5, 1])

@@ -261,7 +252,7 @@ def test_add_frame_state_numpy(tmp_path, empty_lerobot_dataset_factory):
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
    dataset.add_frame({"state": np.array([1], dtype=np.float32), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["state"].ndim == 0

@@ -271,7 +262,7 @@ def test_add_frame_string(tmp_path, empty_lerobot_dataset_factory):
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
    dataset.add_frame({"caption": "Dummy caption", "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["caption"] == "Dummy caption"

@@ -307,7 +298,7 @@ def test_add_frame_image(image_dataset):
    dataset = image_dataset
    dataset.add_frame({"image": np.random.rand(*DUMMY_CHW), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["image"].shape == torch.Size(DUMMY_CHW)

@@ -316,7 +307,7 @@ def test_add_frame_image_h_w_c(image_dataset):
    dataset = image_dataset
    dataset.add_frame({"image": np.random.rand(*DUMMY_HWC), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["image"].shape == torch.Size(DUMMY_CHW)

@@ -326,7 +317,7 @@ def test_add_frame_image_uint8(image_dataset):
    image = np.random.randint(0, 256, DUMMY_HWC, dtype=np.uint8)
    dataset.add_frame({"image": image, "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["image"].shape == torch.Size(DUMMY_CHW)

@@ -336,7 +327,7 @@ def test_add_frame_image_pil(image_dataset):
    image = np.random.randint(0, 256, DUMMY_HWC, dtype=np.uint8)
    dataset.add_frame({"image": Image.fromarray(image), "task": "Dummy task"})
    dataset.save_episode(encode_videos=False)
-    dataset.consolidate(run_compute_stats=False)
+    dataset.consolidate()

    assert dataset[0]["image"].shape == torch.Size(DUMMY_CHW)

@@ -463,67 +454,6 @@ def test_multidataset_frames():
            assert torch.equal(sub_dataset_item[k], dataset_item[k])


-# TODO(aliberts, rcadene): Refactor and move this to a tests/test_compute_stats.py
-def test_compute_stats_on_xarm():
-    """Check that the statistics are computed correctly according to the stats_patterns property.
-
-    We compare with taking a straight min, mean, max, std of all the data in one pass (which we can do
-    because we are working with a small dataset).
-    """
-    # TODO(rcadene, aliberts): remove dataset download
-    dataset = LeRobotDataset("lerobot/xarm_lift_medium", episodes=[0])
-
-    # reduce size of dataset sample on which stats compute is tested to 10 frames
-    dataset.hf_dataset = dataset.hf_dataset.select(range(10))
-
-    # Note: we set the batch size to be smaller than the whole dataset to make sure we are testing batched
-    # computation of the statistics. While doing this, we also make sure it works when we don't divide the
-    # dataset into even batches.
-    computed_stats = compute_stats(dataset, batch_size=int(len(dataset) * 0.25), num_workers=0)
-
-    # get einops patterns to aggregate batches and compute statistics
-    stats_patterns = get_stats_einops_patterns(dataset)
-
-    # get all frames from the dataset in the same dtype and range as during compute_stats
-    dataloader = torch.utils.data.DataLoader(
-        dataset,
-        num_workers=0,
-        batch_size=len(dataset),
-        shuffle=False,
-    )
-    full_batch = next(iter(dataloader))
-
-    # compute stats based on all frames from the dataset without any batching
-    expected_stats = {}
-    for k, pattern in stats_patterns.items():
-        full_batch[k] = full_batch[k].float()
-        expected_stats[k] = {}
-        expected_stats[k]["mean"] = einops.reduce(full_batch[k], pattern, "mean")
-        expected_stats[k]["std"] = torch.sqrt(
-            einops.reduce((full_batch[k] - expected_stats[k]["mean"]) ** 2, pattern, "mean")
-        )
-        expected_stats[k]["min"] = einops.reduce(full_batch[k], pattern, "min")
-        expected_stats[k]["max"] = einops.reduce(full_batch[k], pattern, "max")
-
-    # test computed stats match expected stats
-    for k in stats_patterns:
-        assert torch.allclose(computed_stats[k]["mean"], expected_stats[k]["mean"])
-        assert torch.allclose(computed_stats[k]["std"], expected_stats[k]["std"])
-        assert torch.allclose(computed_stats[k]["min"], expected_stats[k]["min"])
-        assert torch.allclose(computed_stats[k]["max"], expected_stats[k]["max"])
-
-    # load stats used during training which are expected to match the ones returned by computed_stats
-    loaded_stats = dataset.meta.stats  # noqa: F841
-
-    # TODO(rcadene): we can't test this because expected_stats is computed on a subset
-    # # test loaded stats match expected stats
-    # for k in stats_patterns:
-    #     assert torch.allclose(loaded_stats[k]["mean"], expected_stats[k]["mean"])
-    #     assert torch.allclose(loaded_stats[k]["std"], expected_stats[k]["std"])
-    #     assert torch.allclose(loaded_stats[k]["min"], expected_stats[k]["min"])
-    #     assert torch.allclose(loaded_stats[k]["max"], expected_stats[k]["max"])
-
-
 # TODO(aliberts): Move to more appropriate location
 def test_flatten_unflatten_dict():
    d = {
@@ -627,35 +557,6 @@ def test_backward_compatibility(repo_id):
    # load_and_compare(i - 1)


-@pytest.mark.skip("TODO after fix multidataset")
-def test_multidataset_aggregate_stats():
-    """Makes 3 basic datasets and checks that aggregate stats are computed correctly."""
-    with seeded_context(0):
-        data_a = torch.rand(30, dtype=torch.float32)
-        data_b = torch.rand(20, dtype=torch.float32)
-        data_c = torch.rand(20, dtype=torch.float32)
-
-    hf_dataset_1 = Dataset.from_dict(
-        {"a": data_a[:10], "b": data_b[:10], "c": data_c[:10], "index": torch.arange(10)}
-    )
-    hf_dataset_1.set_transform(hf_transform_to_torch)
-    hf_dataset_2 = Dataset.from_dict({"a": data_a[10:20], "b": data_b[10:], "index": torch.arange(10)})
-    hf_dataset_2.set_transform(hf_transform_to_torch)
-    hf_dataset_3 = Dataset.from_dict({"a": data_a[20:], "c": data_c[10:], "index": torch.arange(10)})
-    hf_dataset_3.set_transform(hf_transform_to_torch)
-    dataset_1 = LeRobotDataset.from_preloaded("d1", hf_dataset=hf_dataset_1)
-    dataset_1.stats = compute_stats(dataset_1, batch_size=len(hf_dataset_1), num_workers=0)
-    dataset_2 = LeRobotDataset.from_preloaded("d2", hf_dataset=hf_dataset_2)
-    dataset_2.stats = compute_stats(dataset_2, batch_size=len(hf_dataset_2), num_workers=0)
-    dataset_3 = LeRobotDataset.from_preloaded("d3", hf_dataset=hf_dataset_3)
-    dataset_3.stats = compute_stats(dataset_3, batch_size=len(hf_dataset_3), num_workers=0)
-    stats = aggregate_stats([dataset_1, dataset_2, dataset_3])
-    for data_key, data in zip(["a", "b", "c"], [data_a, data_b, data_c], strict=True):
-        for agg_fn in ["mean", "min", "max"]:
-            assert torch.allclose(stats[data_key][agg_fn], einops.reduce(data, "n -> 1", agg_fn))
-        assert torch.allclose(stats[data_key]["std"], torch.std(data, correction=0))
-
-
@pytest.mark.skip("Requires internet access")
 def test_create_branch():
    api = HfApi()
--- a/tests/test_push_dataset_to_hub.py
+++ b/tests/test_push_dataset_to_hub.py
@@ -1,370 +0,0 @@
-"""
-This file contains generic tests to ensure that nothing breaks if we modify the push_dataset_to_hub API.
-Also, this file contains backward compatibility tests. Because they are slow and require to download the raw datasets,
-we skip them for now in our CI.
-
-Example to run backward compatiblity tests locally:
-```
-python -m pytest --run-skipped tests/test_push_dataset_to_hub.py::test_push_dataset_to_hub_pusht_backward_compatibility
-```
-"""
-
-from pathlib import Path
-
-import numpy as np
-import pytest
-import torch
-
-from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
-from lerobot.common.datasets.push_dataset_to_hub.utils import save_images_concurrently
-from lerobot.common.datasets.video_utils import encode_video_frames
-from lerobot.scripts.push_dataset_to_hub import push_dataset_to_hub
-from tests.utils import require_package_arg
-
-
-def _mock_download_raw_pusht(raw_dir, num_frames=4, num_episodes=3):
-    import zarr
-
-    raw_dir.mkdir(parents=True, exist_ok=True)
-    zarr_path = raw_dir / "pusht_cchi_v7_replay.zarr"
-    store = zarr.DirectoryStore(zarr_path)
-    zarr_data = zarr.group(store=store)
-
-    zarr_data.create_dataset(
-        "data/action", shape=(num_frames, 1), chunks=(num_frames, 1), dtype=np.float32, overwrite=True
-    )
-    zarr_data.create_dataset(
-        "data/img",
-        shape=(num_frames, 96, 96, 3),
-        chunks=(num_frames, 96, 96, 3),
-        dtype=np.uint8,
-        overwrite=True,
-    )
-    zarr_data.create_dataset(
-        "data/n_contacts", shape=(num_frames, 2), chunks=(num_frames, 2), dtype=np.float32, overwrite=True
-    )
-    zarr_data.create_dataset(
-        "data/state", shape=(num_frames, 5), chunks=(num_frames, 5), dtype=np.float32, overwrite=True
-    )
-    zarr_data.create_dataset(
-        "data/keypoint", shape=(num_frames, 9, 2), chunks=(num_frames, 9, 2), dtype=np.float32, overwrite=True
-    )
-    zarr_data.create_dataset(
-        "meta/episode_ends", shape=(num_episodes,), chunks=(num_episodes,), dtype=np.int32, overwrite=True
-    )
-
-    zarr_data["data/action"][:] = np.random.randn(num_frames, 1)
-    zarr_data["data/img"][:] = np.random.randint(0, 255, size=(num_frames, 96, 96, 3), dtype=np.uint8)
-    zarr_data["data/n_contacts"][:] = np.random.randn(num_frames, 2)
-    zarr_data["data/state"][:] = np.random.randn(num_frames, 5)
-    zarr_data["data/keypoint"][:] = np.random.randn(num_frames, 9, 2)
-    zarr_data["meta/episode_ends"][:] = np.array([1, 3, 4])
-
-    store.close()
-
-
-def _mock_download_raw_umi(raw_dir, num_frames=4, num_episodes=3):
-    import zarr
-
-    raw_dir.mkdir(parents=True, exist_ok=True)
-    zarr_path = raw_dir / "cup_in_the_wild.zarr"
-    store = zarr.DirectoryStore(zarr_path)
-    zarr_data = zarr.group(store=store)
-
-    zarr_data.create_dataset(
-        "data/camera0_rgb",
-        shape=(num_frames, 96, 96, 3),
-        chunks=(num_frames, 96, 96, 3),
-        dtype=np.uint8,
-        overwrite=True,
-    )
-    zarr_data.create_dataset(
-        "data/robot0_demo_end_pose",
-        shape=(num_frames, 5),
-        chunks=(num_frames, 5),
-        dtype=np.float32,
-        overwrite=True,
-    )
-    zarr_data.create_dataset(
-        "data/robot0_demo_start_pose",
-        shape=(num_frames, 5),
-        chunks=(num_frames, 5),
-        dtype=np.float32,
-        overwrite=True,
-    )
-    zarr_data.create_dataset(
-        "data/robot0_eef_pos", shape=(num_frames, 5), chunks=(num_frames, 5), dtype=np.float32, overwrite=True
-    )
-    zarr_data.create_dataset(
-        "data/robot0_eef_rot_axis_angle",
-        shape=(num_frames, 5),
-        chunks=(num_frames, 5),
-        dtype=np.float32,
-        overwrite=True,
-    )
-    zarr_data.create_dataset(
-        "data/robot0_gripper_width",
-        shape=(num_frames, 5),
-        chunks=(num_frames, 5),
-        dtype=np.float32,
-        overwrite=True,
-    )
-    zarr_data.create_dataset(
-        "meta/episode_ends", shape=(num_episodes,), chunks=(num_episodes,), dtype=np.int32, overwrite=True
-    )
-
-    zarr_data["data/camera0_rgb"][:] = np.random.randint(0, 255, size=(num_frames, 96, 96, 3), dtype=np.uint8)
-    zarr_data["data/robot0_demo_end_pose"][:] = np.random.randn(num_frames, 5)
-    zarr_data["data/robot0_demo_start_pose"][:] = np.random.randn(num_frames, 5)
-    zarr_data["data/robot0_eef_pos"][:] = np.random.randn(num_frames, 5)
-    zarr_data["data/robot0_eef_rot_axis_angle"][:] = np.random.randn(num_frames, 5)
-    zarr_data["data/robot0_gripper_width"][:] = np.random.randn(num_frames, 5)
-    zarr_data["meta/episode_ends"][:] = np.array([1, 3, 4])
-
-    store.close()
-
-
-def _mock_download_raw_xarm(raw_dir, num_frames=4):
-    import pickle
-
-    dataset_dict = {
-        "observations": {
-            "rgb": np.random.randint(0, 255, size=(num_frames, 3, 84, 84), dtype=np.uint8),
-            "state": np.random.randn(num_frames, 4),
-        },
-        "actions": np.random.randn(num_frames, 3),
-        "rewards": np.random.randn(num_frames),
-        "masks": np.random.randn(num_frames),
-        "dones": np.array([False, True, True, True]),
-    }
-
-    raw_dir.mkdir(parents=True, exist_ok=True)
-    pkl_path = raw_dir / "buffer.pkl"
-    with open(pkl_path, "wb") as f:
-        pickle.dump(dataset_dict, f)
-
-
-def _mock_download_raw_aloha(raw_dir, num_frames=6, num_episodes=3):
-    import h5py
-
-    for ep_idx in range(num_episodes):
-        raw_dir.mkdir(parents=True, exist_ok=True)
-        path_h5 = raw_dir / f"episode_{ep_idx}.hdf5"
-        with h5py.File(str(path_h5), "w") as f:
-            f.create_dataset("action", data=np.random.randn(num_frames // num_episodes, 14))
-            f.create_dataset("observations/qpos", data=np.random.randn(num_frames // num_episodes, 14))
-            f.create_dataset("observations/qvel", data=np.random.randn(num_frames // num_episodes, 14))
-            f.create_dataset(
-                "observations/images/top",
-                data=np.random.randint(
-                    0, 255, size=(num_frames // num_episodes, 480, 640, 3), dtype=np.uint8
-                ),
-            )
-
-
-def _mock_download_raw_dora(raw_dir, num_frames=6, num_episodes=3, fps=30):
-    from datetime import datetime, timedelta, timezone
-
-    import pandas
-
-    def write_parquet(key, timestamps, values):
-        data = {
-            "timestamp_utc": timestamps,
-            key: values,
-        }
-        df = pandas.DataFrame(data)
-        raw_dir.mkdir(parents=True, exist_ok=True)
-        df.to_parquet(raw_dir / f"{key}.parquet", engine="pyarrow")
-
-    episode_indices = [None, None, -1, None, None, -1, None, None, -1]
-    episode_indices_mapping = [0, 0, 0, 1, 1, 1, 2, 2, 2]
-    frame_indices = [0, 1, -1, 0, 1, -1, 0, 1, -1]
-
-    cam_key = "observation.images.cam_high"
-    timestamps = []
-    actions = []
-    states = []
-    frames = []
-    # `+ num_episodes`` for buffer frames associated to episode_index=-1
-    for i, frame_idx in enumerate(frame_indices):
-        t_utc = datetime.now(timezone.utc) + timedelta(seconds=i / fps)
-        action = np.random.randn(21).tolist()
-        state = np.random.randn(21).tolist()
-        ep_idx = episode_indices_mapping[i]
-        frame = [{"path": f"videos/{cam_key}_episode_{ep_idx:06d}.mp4", "timestamp": frame_idx / fps}]
-        timestamps.append(t_utc)
-        actions.append(action)
-        states.append(state)
-        frames.append(frame)
-
-    write_parquet(cam_key, timestamps, frames)
-    write_parquet("observation.state", timestamps, states)
-    write_parquet("action", timestamps, actions)
-    write_parquet("episode_index", timestamps, episode_indices)
-
-    # write fake mp4 file for each episode
-    for ep_idx in range(num_episodes):
-        imgs_array = np.random.randint(0, 255, size=(num_frames // num_episodes, 480, 640, 3), dtype=np.uint8)
-
-        tmp_imgs_dir = raw_dir / "tmp_images"
-        save_images_concurrently(imgs_array, tmp_imgs_dir)
-
-        fname = f"{cam_key}_episode_{ep_idx:06d}.mp4"
-        video_path = raw_dir / "videos" / fname
-        encode_video_frames(tmp_imgs_dir, video_path, fps, vcodec="libx264")
-
-
-def _mock_download_raw(raw_dir, repo_id):
-    if "wrist_gripper" in repo_id:
-        _mock_download_raw_dora(raw_dir)
-    elif "aloha" in repo_id:
-        _mock_download_raw_aloha(raw_dir)
-    elif "pusht" in repo_id:
-        _mock_download_raw_pusht(raw_dir)
-    elif "xarm" in repo_id:
-        _mock_download_raw_xarm(raw_dir)
-    elif "umi" in repo_id:
-        _mock_download_raw_umi(raw_dir)
-    else:
-        raise ValueError(repo_id)
-
-
-@pytest.mark.skip("push_dataset_to_hub is deprecated")
-def test_push_dataset_to_hub_invalid_repo_id(tmpdir):
-    with pytest.raises(ValueError):
-        push_dataset_to_hub(Path(tmpdir), "raw_format", "invalid_repo_id")
-
-
-@pytest.mark.skip("push_dataset_to_hub is deprecated")
-def test_push_dataset_to_hub_out_dir_force_override_false(tmpdir):
-    tmpdir = Path(tmpdir)
-    out_dir = tmpdir / "out"
-    raw_dir = tmpdir / "raw"
-    # mkdir to skip download
-    raw_dir.mkdir(parents=True, exist_ok=True)
-    with pytest.raises(ValueError):
-        push_dataset_to_hub(
-            raw_dir=raw_dir,
-            raw_format="some_format",
-            repo_id="user/dataset",
-            local_dir=out_dir,
-            force_override=False,
-        )
-
-
-@pytest.mark.skip("push_dataset_to_hub is deprecated")
-@pytest.mark.parametrize(
-    "required_packages, raw_format, repo_id, make_test_data",
-    [
-        (["gym_pusht"], "pusht_zarr", "lerobot/pusht", False),
-        (["gym_pusht"], "pusht_zarr", "lerobot/pusht", True),
-        (None, "xarm_pkl", "lerobot/xarm_lift_medium", False),
-        (None, "aloha_hdf5", "lerobot/aloha_sim_insertion_scripted", False),
-        (["imagecodecs"], "umi_zarr", "lerobot/umi_cup_in_the_wild", False),
-        (None, "dora_parquet", "cadene/wrist_gripper", False),
-    ],
-)
-@require_package_arg
-def test_push_dataset_to_hub_format(required_packages, tmpdir, raw_format, repo_id, make_test_data):
-    num_episodes = 3
-    tmpdir = Path(tmpdir)
-
-    raw_dir = tmpdir / f"{repo_id}_raw"
-    _mock_download_raw(raw_dir, repo_id)
-
-    local_dir = tmpdir / repo_id
-
-    lerobot_dataset = push_dataset_to_hub(
-        raw_dir=raw_dir,
-        raw_format=raw_format,
-        repo_id=repo_id,
-        push_to_hub=False,
-        local_dir=local_dir,
-        force_override=False,
-        cache_dir=tmpdir / "cache",
-        tests_data_dir=tmpdir / "tests/data" if make_test_data else None,
-        encoding={"vcodec": "libx264"},
-    )
-
-    # minimal generic tests on the local directory containing LeRobotDataset
-    assert (local_dir / "meta_data" / "info.json").exists()
-    assert (local_dir / "meta_data" / "stats.safetensors").exists()
-    assert (local_dir / "meta_data" / "episode_data_index.safetensors").exists()
-    for i in range(num_episodes):
-        for cam_key in lerobot_dataset.camera_keys:
-            assert (local_dir / "videos" / f"{cam_key}_episode_{i:06d}.mp4").exists()
-    assert (local_dir / "train" / "dataset_info.json").exists()
-    assert (local_dir / "train" / "state.json").exists()
-    assert len(list((local_dir / "train").glob("*.arrow"))) > 0
-
-    # minimal generic tests on the item
-    item = lerobot_dataset[0]
-    assert "index" in item
-    assert "episode_index" in item
-    assert "timestamp" in item
-    for cam_key in lerobot_dataset.camera_keys:
-        assert cam_key in item
-
-    if make_test_data:
-        # Check that only the first episode is selected.
-        test_dataset = LeRobotDataset(repo_id=repo_id, root=tmpdir / "tests/data")
-        num_frames = sum(
-            i == lerobot_dataset.hf_dataset["episode_index"][0]
-            for i in lerobot_dataset.hf_dataset["episode_index"]
-        ).item()
-        assert (
-            test_dataset.hf_dataset["episode_index"]
-            == lerobot_dataset.hf_dataset["episode_index"][:num_frames]
-        )
-        for k in ["from", "to"]:
-            assert torch.equal(test_dataset.episode_data_index[k], lerobot_dataset.episode_data_index[k][:1])
-
-
-@pytest.mark.skip("push_dataset_to_hub is deprecated")
-@pytest.mark.parametrize(
-    "raw_format, repo_id",
-    [
-        # TODO(rcadene): add raw dataset test artifacts
-        ("pusht_zarr", "lerobot/pusht"),
-        ("xarm_pkl", "lerobot/xarm_lift_medium"),
-        ("aloha_hdf5", "lerobot/aloha_sim_insertion_scripted"),
-        ("umi_zarr", "lerobot/umi_cup_in_the_wild"),
-        ("dora_parquet", "cadene/wrist_gripper"),
-    ],
-)
-def test_push_dataset_to_hub_pusht_backward_compatibility(tmpdir, raw_format, repo_id):
-    _, dataset_id = repo_id.split("/")
-
-    tmpdir = Path(tmpdir)
-    raw_dir = tmpdir / f"{dataset_id}_raw"
-    local_dir = tmpdir / repo_id
-
-    push_dataset_to_hub(
-        raw_dir=raw_dir,
-        raw_format=raw_format,
-        repo_id=repo_id,
-        push_to_hub=False,
-        local_dir=local_dir,
-        force_override=False,
-        cache_dir=tmpdir / "cache",
-        episodes=[0],
-    )
-
-    ds_actual = LeRobotDataset(repo_id, root=tmpdir)
-    ds_reference = LeRobotDataset(repo_id)
-
-    assert len(ds_reference.hf_dataset) == len(ds_actual.hf_dataset)
-
-    def check_same_items(item1, item2):
-        assert item1.keys() == item2.keys(), "Keys mismatch"
-
-        for key in item1:
-            if isinstance(item1[key], torch.Tensor) and isinstance(item2[key], torch.Tensor):
-                assert torch.equal(item1[key], item2[key]), f"Mismatch found in key: {key}"
-            else:
-                assert item1[key] == item2[key], f"Mismatch found in key: {key}"
-
-    for i in range(len(ds_reference.hf_dataset)):
-        item_reference = ds_reference.hf_dataset[i]
-        item_actual = ds_actual.hf_dataset[i]
-        check_same_items(item_reference, item_actual)
--- a/tests/test_robots.py
+++ b/tests/test_robots.py
@@ -23,8 +23,6 @@ pytest -sx 'tests/test_robots.py::test_robot[aloha-True]'
 ```
 """

-from pathlib import Path
-
 import pytest
 import torch

@@ -35,7 +33,7 @@ from tests.utils import TEST_ROBOT_TYPES, mock_calibration_dir, require_robot

@pytest.mark.parametrize("robot_type, mock", TEST_ROBOT_TYPES)
@require_robot
-def test_robot(tmpdir, request, robot_type, mock):
+def test_robot(tmp_path, request, robot_type, mock):
    # TODO(rcadene): measure fps in nightly?
    # TODO(rcadene): test logs
    # TODO(rcadene): add compatibility with other robots
@@ -50,8 +48,7 @@ def test_robot(tmpdir, request, robot_type, mock):
            request.getfixturevalue("patch_builtins_input")

        # Create an empty calibration directory to trigger manual calibration
-        tmpdir = Path(tmpdir)
-        calibration_dir = tmpdir / robot_type
+        calibration_dir = tmp_path / robot_type
        mock_calibration_dir(calibration_dir)
        robot_kwargs["calibration_dir"] = calibration_dir