Dataset v3 (#1412)

Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com> Co-authored-by: Remi Cadene <re.cadene@gmail.com> Co-authored-by: Tavish <tavish9.chen@gmail.com> Co-authored-by: fracapuano <francesco.capuano@huggingface.co> Co-authored-by: CarolinePascal <caroline8.pascal@gmail.com>
2025-09-15 09:53:30 +02:00
parent d602e8169c
commit f55c6e89f0
50 changed files with 4642 additions and 4092 deletions
--- a/tests/fixtures/constants.py
+++ b/tests/fixtures/constants.py
@@ -29,8 +29,8 @@ DUMMY_MOTOR_FEATURES = {
    },
 }
 DUMMY_CAMERA_FEATURES = {
-    "laptop": {"shape": (480, 640, 3), "names": ["height", "width", "channels"], "info": None},
-    "phone": {"shape": (480, 640, 3), "names": ["height", "width", "channels"], "info": None},
+    "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None},
+    "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None},
 }
 DEFAULT_FPS = 30
 DUMMY_VIDEO_INFO = {
--- a/tests/fixtures/dataset_factories.py
+++ b/tests/fixtures/dataset_factories.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import random
+import shutil
 from functools import partial
 from pathlib import Path
 from typing import Protocol
@@ -19,19 +20,25 @@ from unittest.mock import patch

 import datasets
 import numpy as np
+import pandas as pd
 import PIL.Image
 import pytest
 import torch
+from datasets import Dataset

 from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset, LeRobotDatasetMetadata
 from lerobot.datasets.utils import (
    DEFAULT_CHUNK_SIZE,
+    DEFAULT_DATA_FILE_SIZE_IN_MB,
+    DEFAULT_DATA_PATH,
    DEFAULT_FEATURES,
-    DEFAULT_PARQUET_PATH,
+    DEFAULT_VIDEO_FILE_SIZE_IN_MB,
    DEFAULT_VIDEO_PATH,
+    flatten_dict,
    get_hf_features_from_features,
    hf_transform_to_torch,
 )
+from lerobot.datasets.video_utils import encode_video_frames
 from tests.fixtures.constants import (
    DEFAULT_FPS,
    DUMMY_CAMERA_FEATURES,
@@ -46,10 +53,9 @@ class LeRobotDatasetFactory(Protocol):
    def __call__(self, *args, **kwargs) -> LeRobotDataset: ...


-def get_task_index(task_dicts: dict, task: str) -> int:
-    tasks = {d["task_index"]: d["task"] for d in task_dicts.values()}
-    task_to_task_index = {task: task_idx for task_idx, task in tasks.items()}
-    return task_to_task_index[task]
+def get_task_index(tasks: datasets.Dataset, task: str) -> int:
+    task_idx = tasks.loc[task].task_index.item()
+    return task_idx


@pytest.fixture(scope="session")
@@ -62,15 +68,49 @@ def img_tensor_factory():

@pytest.fixture(scope="session")
 def img_array_factory():
-    def _create_img_array(height=100, width=100, channels=3, dtype=np.uint8) -> np.ndarray:
-        if np.issubdtype(dtype, np.unsignedinteger):
-            # Int array in [0, 255] range
-            img_array = np.random.randint(0, 256, size=(height, width, channels), dtype=dtype)
-        elif np.issubdtype(dtype, np.floating):
-            # Float array in [0, 1] range
-            img_array = np.random.rand(height, width, channels).astype(dtype)
+    def _create_img_array(height=100, width=100, channels=3, dtype=np.uint8, content=None) -> np.ndarray:
+        if content is None:
+            # Original random noise behavior
+            if np.issubdtype(dtype, np.unsignedinteger):
+                # Int array in [0, 255] range
+                img_array = np.random.randint(0, 256, size=(height, width, channels), dtype=dtype)
+            elif np.issubdtype(dtype, np.floating):
+                # Float array in [0, 1] range
+                img_array = np.random.rand(height, width, channels).astype(dtype)
+            else:
+                raise ValueError(dtype)
        else:
-            raise ValueError(dtype)
+            # Create image with text content using OpenCV
+            import cv2
+
+            # Create white background
+            img_array = np.ones((height, width, channels), dtype=np.uint8) * 255
+
+            # Font settings
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            font_scale = max(0.5, height / 200)  # Scale font with image size
+            font_color = (0, 0, 0)  # Black text
+            thickness = max(1, int(height / 100))
+
+            # Get text size to center it
+            text_size = cv2.getTextSize(content, font, font_scale, thickness)[0]
+            text_x = (width - text_size[0]) // 2
+            text_y = (height + text_size[1]) // 2
+
+            # Put text on image
+            cv2.putText(img_array, content, (text_x, text_y), font, font_scale, font_color, thickness)
+
+            # Handle single channel case
+            if channels == 1:
+                img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
+                img_array = img_array[:, :, np.newaxis]
+
+            # Convert to target dtype
+            if np.issubdtype(dtype, np.floating):
+                img_array = img_array.astype(dtype) / 255.0
+            else:
+                img_array = img_array.astype(dtype)
+
        return img_array

    return _create_img_array
@@ -117,9 +157,10 @@ def info_factory(features_factory):
        total_frames: int = 0,
        total_tasks: int = 0,
        total_videos: int = 0,
-        total_chunks: int = 0,
        chunks_size: int = DEFAULT_CHUNK_SIZE,
-        data_path: str = DEFAULT_PARQUET_PATH,
+        data_files_size_in_mb: float = DEFAULT_DATA_FILE_SIZE_IN_MB,
+        video_files_size_in_mb: float = DEFAULT_VIDEO_FILE_SIZE_IN_MB,
+        data_path: str = DEFAULT_DATA_PATH,
        video_path: str = DEFAULT_VIDEO_PATH,
        motor_features: dict = DUMMY_MOTOR_FEATURES,
        camera_features: dict = DUMMY_CAMERA_FEATURES,
@@ -133,8 +174,9 @@ def info_factory(features_factory):
            "total_frames": total_frames,
            "total_tasks": total_tasks,
            "total_videos": total_videos,
-            "total_chunks": total_chunks,
            "chunks_size": chunks_size,
+            "data_files_size_in_mb": data_files_size_in_mb,
+            "video_files_size_in_mb": video_files_size_in_mb,
            "fps": fps,
            "splits": {},
            "data_path": data_path,
@@ -175,41 +217,26 @@ def stats_factory():
    return _create_stats


-@pytest.fixture(scope="session")
-def episodes_stats_factory(stats_factory):
-    def _create_episodes_stats(
-        features: dict[str],
-        total_episodes: int = 3,
-    ) -> dict:
-        episodes_stats = {}
-        for episode_index in range(total_episodes):
-            episodes_stats[episode_index] = {
-                "episode_index": episode_index,
-                "stats": stats_factory(features),
-            }
-        return episodes_stats
-
-    return _create_episodes_stats
-
-
@pytest.fixture(scope="session")
 def tasks_factory():
-    def _create_tasks(total_tasks: int = 3) -> int:
-        tasks = {}
-        for task_index in range(total_tasks):
-            task_dict = {"task_index": task_index, "task": f"Perform action {task_index}."}
-            tasks[task_index] = task_dict
-        return tasks
+    def _create_tasks(total_tasks: int = 3) -> pd.DataFrame:
+        ids = list(range(total_tasks))
+        tasks = [f"Perform action {i}." for i in ids]
+        df = pd.DataFrame({"task_index": ids}, index=tasks)
+        return df

    return _create_tasks


@pytest.fixture(scope="session")
-def episodes_factory(tasks_factory):
+def episodes_factory(tasks_factory, stats_factory):
    def _create_episodes(
+        features: dict[str],
+        fps: int = DEFAULT_FPS,
        total_episodes: int = 3,
        total_frames: int = 400,
-        tasks: dict | None = None,
+        video_keys: list[str] | None = None,
+        tasks: pd.DataFrame | None = None,
        multi_task: bool = False,
    ):
        if total_episodes <= 0 or total_frames <= 0:
@@ -217,66 +244,142 @@ def episodes_factory(tasks_factory):
        if total_frames < total_episodes:
            raise ValueError("total_length must be greater than or equal to num_episodes.")

-        if not tasks:
+        if tasks is None:
            min_tasks = 2 if multi_task else 1
            total_tasks = random.randint(min_tasks, total_episodes)
            tasks = tasks_factory(total_tasks)

-        if total_episodes < len(tasks) and not multi_task:
+        num_tasks_available = len(tasks)
+
+        if total_episodes < num_tasks_available and not multi_task:
            raise ValueError("The number of tasks should be less than the number of episodes.")

        # Generate random lengths that sum up to total_length
        lengths = np.random.multinomial(total_frames, [1 / total_episodes] * total_episodes).tolist()

-        tasks_list = [task_dict["task"] for task_dict in tasks.values()]
-        num_tasks_available = len(tasks_list)
+        # Create a dictionary mapping every column name to an empty list
+        d = {
+            "episode_index": [],
+            "meta/episodes/chunk_index": [],
+            "meta/episodes/file_index": [],
+            "data/chunk_index": [],
+            "data/file_index": [],
+            "dataset_from_index": [],
+            "dataset_to_index": [],
+            "tasks": [],
+            "length": [],
+        }
+        if video_keys is not None:
+            for video_key in video_keys:
+                d[f"videos/{video_key}/chunk_index"] = []
+                d[f"videos/{video_key}/file_index"] = []
+                d[f"videos/{video_key}/from_timestamp"] = []
+                d[f"videos/{video_key}/to_timestamp"] = []

-        episodes = {}
-        remaining_tasks = tasks_list.copy()
+        for stats_key in flatten_dict({"stats": stats_factory(features)}):
+            d[stats_key] = []
+
+        num_frames = 0
+        remaining_tasks = list(tasks.index)
        for ep_idx in range(total_episodes):
            num_tasks_in_episode = random.randint(1, min(3, num_tasks_available)) if multi_task else 1
-            tasks_to_sample = remaining_tasks if remaining_tasks else tasks_list
+            tasks_to_sample = remaining_tasks if len(remaining_tasks) > 0 else list(tasks.index)
            episode_tasks = random.sample(tasks_to_sample, min(num_tasks_in_episode, len(tasks_to_sample)))
            if remaining_tasks:
                for task in episode_tasks:
                    remaining_tasks.remove(task)

-            episodes[ep_idx] = {
-                "episode_index": ep_idx,
-                "tasks": episode_tasks,
-                "length": lengths[ep_idx],
-            }
+            d["episode_index"].append(ep_idx)
+            # TODO(rcadene): remove heuristic of only one file
+            d["meta/episodes/chunk_index"].append(0)
+            d["meta/episodes/file_index"].append(0)
+            d["data/chunk_index"].append(0)
+            d["data/file_index"].append(0)
+            d["dataset_from_index"].append(num_frames)
+            d["dataset_to_index"].append(num_frames + lengths[ep_idx])
+            d["tasks"].append(episode_tasks)
+            d["length"].append(lengths[ep_idx])

-        return episodes
+            if video_keys is not None:
+                for video_key in video_keys:
+                    d[f"videos/{video_key}/chunk_index"].append(0)
+                    d[f"videos/{video_key}/file_index"].append(0)
+                    d[f"videos/{video_key}/from_timestamp"].append(num_frames / fps)
+                    d[f"videos/{video_key}/to_timestamp"].append((num_frames + lengths[ep_idx]) / fps)
+
+            # Add stats columns like "stats/action/max"
+            for stats_key, stats in flatten_dict({"stats": stats_factory(features)}).items():
+                d[stats_key].append(stats)
+
+            num_frames += lengths[ep_idx]
+
+        return Dataset.from_dict(d)

    return _create_episodes


+@pytest.fixture(scope="session")
+def create_videos(info_factory, img_array_factory):
+    def _create_video_directory(
+        root: Path,
+        info: dict | None = None,
+        total_episodes: int = 3,
+        total_frames: int = 150,
+        total_tasks: int = 1,
+    ):
+        if info is None:
+            info = info_factory(
+                total_episodes=total_episodes, total_frames=total_frames, total_tasks=total_tasks
+            )
+
+        video_feats = {key: feats for key, feats in info["features"].items() if feats["dtype"] == "video"}
+        for key, ft in video_feats.items():
+            # create and save images with identifiable content
+            tmp_dir = root / "tmp_images"
+            tmp_dir.mkdir(parents=True, exist_ok=True)
+            for frame_index in range(info["total_frames"]):
+                content = f"{key}-{frame_index}"
+                img = img_array_factory(height=ft["shape"][0], width=ft["shape"][1], content=content)
+                pil_img = PIL.Image.fromarray(img)
+                path = tmp_dir / f"frame-{frame_index:06d}.png"
+                pil_img.save(path)
+
+            video_path = root / DEFAULT_VIDEO_PATH.format(video_key=key, chunk_index=0, file_index=0)
+            video_path.parent.mkdir(parents=True, exist_ok=True)
+            # Use the global fps from info, not video-specific fps which might not exist
+            encode_video_frames(tmp_dir, video_path, fps=info["fps"])
+            shutil.rmtree(tmp_dir)
+
+    return _create_video_directory
+
+
@pytest.fixture(scope="session")
 def hf_dataset_factory(features_factory, tasks_factory, episodes_factory, img_array_factory):
    def _create_hf_dataset(
        features: dict | None = None,
-        tasks: list[dict] | None = None,
-        episodes: list[dict] | None = None,
+        tasks: pd.DataFrame | None = None,
+        episodes: datasets.Dataset | None = None,
        fps: int = DEFAULT_FPS,
    ) -> datasets.Dataset:
-        if not tasks:
+        if tasks is None:
            tasks = tasks_factory()
-        if not episodes:
-            episodes = episodes_factory()
-        if not features:
+        if features is None:
            features = features_factory()
+        if episodes is None:
+            episodes = episodes_factory(features, fps)

        timestamp_col = np.array([], dtype=np.float32)
        frame_index_col = np.array([], dtype=np.int64)
        episode_index_col = np.array([], dtype=np.int64)
        task_index = np.array([], dtype=np.int64)
-        for ep_dict in episodes.values():
+        for ep_dict in episodes:
            timestamp_col = np.concatenate((timestamp_col, np.arange(ep_dict["length"]) / fps))
            frame_index_col = np.concatenate((frame_index_col, np.arange(ep_dict["length"], dtype=int)))
            episode_index_col = np.concatenate(
                (episode_index_col, np.full(ep_dict["length"], ep_dict["episode_index"], dtype=int))
            )
+            # Slightly incorrect, but for simplicity, we assign to all frames the first task defined in the episode metadata.
+            # TODO(rcadene): assign the tasks of the episode per chunks of frames
            ep_task_index = get_task_index(tasks, ep_dict["tasks"][0])
            task_index = np.concatenate((task_index, np.full(ep_dict["length"], ep_task_index, dtype=int)))

@@ -286,8 +389,8 @@ def hf_dataset_factory(features_factory, tasks_factory, episodes_factory, img_ar
        for key, ft in features.items():
            if ft["dtype"] == "image":
                robot_cols[key] = [
-                    img_array_factory(height=ft["shapes"][1], width=ft["shapes"][0])
-                    for _ in range(len(index_col))
+                    img_array_factory(height=ft["shape"][1], width=ft["shape"][0], content=f"{key}-{i}")
+                    for i in range(len(index_col))
                ]
            elif ft["shape"][0] > 1 and ft["dtype"] != "video":
                robot_cols[key] = np.random.random((len(index_col), ft["shape"][0])).astype(ft["dtype"])
@@ -314,7 +417,6 @@ def hf_dataset_factory(features_factory, tasks_factory, episodes_factory, img_ar
 def lerobot_dataset_metadata_factory(
    info_factory,
    stats_factory,
-    episodes_stats_factory,
    tasks_factory,
    episodes_factory,
    mock_snapshot_download_factory,
@@ -324,29 +426,29 @@ def lerobot_dataset_metadata_factory(
        repo_id: str = DUMMY_REPO_ID,
        info: dict | None = None,
        stats: dict | None = None,
-        episodes_stats: list[dict] | None = None,
-        tasks: list[dict] | None = None,
-        episodes: list[dict] | None = None,
+        tasks: pd.DataFrame | None = None,
+        episodes: datasets.Dataset | None = None,
    ) -> LeRobotDatasetMetadata:
-        if not info:
+        if info is None:
            info = info_factory()
-        if not stats:
+        if stats is None:
            stats = stats_factory(features=info["features"])
-        if not episodes_stats:
-            episodes_stats = episodes_stats_factory(
-                features=info["features"], total_episodes=info["total_episodes"]
-            )
-        if not tasks:
+        if tasks is None:
            tasks = tasks_factory(total_tasks=info["total_tasks"])
-        if not episodes:
+        if episodes is None:
+            video_keys = [key for key, ft in info["features"].items() if ft["dtype"] == "video"]
            episodes = episodes_factory(
-                total_episodes=info["total_episodes"], total_frames=info["total_frames"], tasks=tasks
+                features=info["features"],
+                fps=info["fps"],
+                total_episodes=info["total_episodes"],
+                total_frames=info["total_frames"],
+                video_keys=video_keys,
+                tasks=tasks,
            )

        mock_snapshot_download = mock_snapshot_download_factory(
            info=info,
            stats=stats,
-            episodes_stats=episodes_stats,
            tasks=tasks,
            episodes=episodes,
        )
@@ -366,7 +468,6 @@ def lerobot_dataset_metadata_factory(
 def lerobot_dataset_factory(
    info_factory,
    stats_factory,
-    episodes_stats_factory,
    tasks_factory,
    episodes_factory,
    hf_dataset_factory,
@@ -380,50 +481,63 @@ def lerobot_dataset_factory(
        total_frames: int = 150,
        total_tasks: int = 1,
        multi_task: bool = False,
+        use_videos: bool = True,
        info: dict | None = None,
        stats: dict | None = None,
-        episodes_stats: list[dict] | None = None,
-        tasks: list[dict] | None = None,
-        episode_dicts: list[dict] | None = None,
+        tasks: pd.DataFrame | None = None,
+        episodes_metadata: datasets.Dataset | None = None,
        hf_dataset: datasets.Dataset | None = None,
+        data_files_size_in_mb: float = DEFAULT_DATA_FILE_SIZE_IN_MB,
+        chunks_size: int = DEFAULT_CHUNK_SIZE,
        **kwargs,
    ) -> LeRobotDataset:
-        if not info:
+        # Instantiate objects
+        if info is None:
            info = info_factory(
-                total_episodes=total_episodes, total_frames=total_frames, total_tasks=total_tasks
+                total_episodes=total_episodes,
+                total_frames=total_frames,
+                total_tasks=total_tasks,
+                use_videos=use_videos,
+                data_files_size_in_mb=data_files_size_in_mb,
+                chunks_size=chunks_size,
            )
-        if not stats:
+        if stats is None:
            stats = stats_factory(features=info["features"])
-        if not episodes_stats:
-            episodes_stats = episodes_stats_factory(features=info["features"], total_episodes=total_episodes)
-        if not tasks:
+        if tasks is None:
            tasks = tasks_factory(total_tasks=info["total_tasks"])
-        if not episode_dicts:
-            episode_dicts = episodes_factory(
+        if episodes_metadata is None:
+            video_keys = [key for key, ft in info["features"].items() if ft["dtype"] == "video"]
+            episodes_metadata = episodes_factory(
+                features=info["features"],
+                fps=info["fps"],
                total_episodes=info["total_episodes"],
                total_frames=info["total_frames"],
+                video_keys=video_keys,
                tasks=tasks,
                multi_task=multi_task,
            )
-        if not hf_dataset:
-            hf_dataset = hf_dataset_factory(tasks=tasks, episodes=episode_dicts, fps=info["fps"])
+        if hf_dataset is None:
+            hf_dataset = hf_dataset_factory(
+                features=info["features"], tasks=tasks, episodes=episodes_metadata, fps=info["fps"]
+            )

+        # Write data on disk
        mock_snapshot_download = mock_snapshot_download_factory(
            info=info,
            stats=stats,
-            episodes_stats=episodes_stats,
            tasks=tasks,
-            episodes=episode_dicts,
+            episodes=episodes_metadata,
            hf_dataset=hf_dataset,
+            data_files_size_in_mb=data_files_size_in_mb,
+            chunks_size=chunks_size,
        )
        mock_metadata = lerobot_dataset_metadata_factory(
            root=root,
            repo_id=repo_id,
            info=info,
            stats=stats,
-            episodes_stats=episodes_stats,
            tasks=tasks,
-            episodes=episode_dicts,
+            episodes=episodes_metadata,
        )
        with (
            patch("lerobot.datasets.lerobot_dataset.LeRobotDatasetMetadata") as mock_metadata_patch,
--- a/tests/fixtures/files.py
+++ b/tests/fixtures/files.py
@@ -11,137 +11,166 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import json
+import logging
 from pathlib import Path

 import datasets
-import jsonlines
-import pyarrow.compute as pc
-import pyarrow.parquet as pq
+import numpy as np
+import pandas as pd
 import pytest
+from datasets import Dataset

 from lerobot.datasets.utils import (
-    EPISODES_PATH,
-    EPISODES_STATS_PATH,
-    INFO_PATH,
-    STATS_PATH,
-    TASKS_PATH,
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_DATA_FILE_SIZE_IN_MB,
+    DEFAULT_DATA_PATH,
+    get_hf_dataset_size_in_mb,
+    update_chunk_file_indices,
+    write_episodes,
+    write_info,
+    write_stats,
+    write_tasks,
 )


+def write_hf_dataset(
+    hf_dataset: Dataset,
+    local_dir: Path,
+    data_file_size_mb: float | None = None,
+    chunk_size: int | None = None,
+):
+    """
+    Writes a Hugging Face Dataset to one or more Parquet files in a structured directory format.
+
+    If the dataset size is within `data_file_size_mb`, it's saved as a single file.
+    Otherwise, the dataset is split into multiple smaller Parquet files, each within the size limit unless a single episode alone exceeds it.
+    The file and chunk indices are managed to organize the output files in a hierarchical structure,
+    e.g., `data/chunk-000/file-000.parquet`, `data/chunk-000/file-001.parquet`, etc.
+    This function ensures that episodes are not split across multiple files.
+
+    Args:
+        hf_dataset (Dataset): The Hugging Face Dataset to be written to disk.
+        local_dir (Path): The root directory where the dataset files will be stored.
+        data_file_size_mb (float, optional): Maximal size for the parquet data file, in MB. Defaults to DEFAULT_DATA_FILE_SIZE_IN_MB.
+        chunk_size (int, optional): Maximal number of files within a chunk folder before creating another one. Defaults to DEFAULT_CHUNK_SIZE.
+    """
+    if data_file_size_mb is None:
+        data_file_size_mb = DEFAULT_DATA_FILE_SIZE_IN_MB
+    if chunk_size is None:
+        chunk_size = DEFAULT_CHUNK_SIZE
+
+    dataset_size_in_mb = get_hf_dataset_size_in_mb(hf_dataset)
+
+    if dataset_size_in_mb <= data_file_size_mb:
+        # If the dataset is small enough, write it to a single file.
+        path = local_dir / DEFAULT_DATA_PATH.format(chunk_index=0, file_index=0)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        hf_dataset.to_parquet(path)
+        return
+
+    # If the dataset is too large, split it into smaller chunks, keeping episodes whole.
+    episode_indices = np.array(hf_dataset["episode_index"])
+    episode_boundaries = np.where(np.diff(episode_indices) != 0)[0] + 1
+    episode_starts = np.concatenate(([0], episode_boundaries))
+    episode_ends = np.concatenate((episode_boundaries, [len(hf_dataset)]))
+
+    num_episodes = len(episode_starts)
+    current_episode_idx = 0
+    chunk_idx, file_idx = 0, 0
+
+    while current_episode_idx < num_episodes:
+        shard_start_row = episode_starts[current_episode_idx]
+        shard_end_row = episode_ends[current_episode_idx]
+        next_episode_to_try_idx = current_episode_idx + 1
+
+        while next_episode_to_try_idx < num_episodes:
+            potential_shard_end_row = episode_ends[next_episode_to_try_idx]
+            dataset_shard_candidate = hf_dataset.select(range(shard_start_row, potential_shard_end_row))
+            shard_size_mb = get_hf_dataset_size_in_mb(dataset_shard_candidate)
+
+            if shard_size_mb > data_file_size_mb:
+                break
+            else:
+                shard_end_row = potential_shard_end_row
+                next_episode_to_try_idx += 1
+
+        dataset_shard = hf_dataset.select(range(shard_start_row, shard_end_row))
+
+        if (
+            shard_start_row == episode_starts[current_episode_idx]
+            and shard_end_row == episode_ends[current_episode_idx]
+        ):
+            shard_size_mb = get_hf_dataset_size_in_mb(dataset_shard)
+            if shard_size_mb > data_file_size_mb:
+                logging.warning(
+                    f"Episode with index {hf_dataset[shard_start_row.item()]['episode_index']} has size {shard_size_mb:.2f}MB, "
+                    f"which is larger than data_file_size_mb ({data_file_size_mb}MB). "
+                    "Writing it to a separate shard anyway to preserve episode integrity."
+                )
+
+        # Define the path for the current shard and ensure the directory exists.
+        path = local_dir / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write the shard to a Parquet file.
+        dataset_shard.to_parquet(path)
+
+        # Update chunk and file indices for the next iteration.
+        chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, chunk_size)
+        current_episode_idx = next_episode_to_try_idx
+
+
@pytest.fixture(scope="session")
-def info_path(info_factory):
-    def _create_info_json_file(dir: Path, info: dict | None = None) -> Path:
-        if not info:
+def create_info(info_factory):
+    def _create_info(dir: Path, info: dict | None = None):
+        if info is None:
            info = info_factory()
-        fpath = dir / INFO_PATH
-        fpath.parent.mkdir(parents=True, exist_ok=True)
-        with open(fpath, "w") as f:
-            json.dump(info, f, indent=4, ensure_ascii=False)
-        return fpath
+        write_info(info, dir)

-    return _create_info_json_file
+    return _create_info


@pytest.fixture(scope="session")
-def stats_path(stats_factory):
-    def _create_stats_json_file(dir: Path, stats: dict | None = None) -> Path:
-        if not stats:
+def create_stats(stats_factory):
+    def _create_stats(dir: Path, stats: dict | None = None):
+        if stats is None:
            stats = stats_factory()
-        fpath = dir / STATS_PATH
-        fpath.parent.mkdir(parents=True, exist_ok=True)
-        with open(fpath, "w") as f:
-            json.dump(stats, f, indent=4, ensure_ascii=False)
-        return fpath
+        write_stats(stats, dir)

-    return _create_stats_json_file
+    return _create_stats


@pytest.fixture(scope="session")
-def episodes_stats_path(episodes_stats_factory):
-    def _create_episodes_stats_jsonl_file(dir: Path, episodes_stats: list[dict] | None = None) -> Path:
-        if not episodes_stats:
-            episodes_stats = episodes_stats_factory()
-        fpath = dir / EPISODES_STATS_PATH
-        fpath.parent.mkdir(parents=True, exist_ok=True)
-        with jsonlines.open(fpath, "w") as writer:
-            writer.write_all(episodes_stats.values())
-        return fpath
-
-    return _create_episodes_stats_jsonl_file
-
-
-@pytest.fixture(scope="session")
-def tasks_path(tasks_factory):
-    def _create_tasks_jsonl_file(dir: Path, tasks: list | None = None) -> Path:
-        if not tasks:
+def create_tasks(tasks_factory):
+    def _create_tasks(dir: Path, tasks: pd.DataFrame | None = None):
+        if tasks is None:
            tasks = tasks_factory()
-        fpath = dir / TASKS_PATH
-        fpath.parent.mkdir(parents=True, exist_ok=True)
-        with jsonlines.open(fpath, "w") as writer:
-            writer.write_all(tasks.values())
-        return fpath
+        write_tasks(tasks, dir)

-    return _create_tasks_jsonl_file
+    return _create_tasks


@pytest.fixture(scope="session")
-def episode_path(episodes_factory):
-    def _create_episodes_jsonl_file(dir: Path, episodes: list | None = None) -> Path:
-        if not episodes:
+def create_episodes(episodes_factory):
+    def _create_episodes(dir: Path, episodes: datasets.Dataset | None = None):
+        if episodes is None:
+            # TODO(rcadene): add features, fps as arguments (episodes_factory requires `features`, so this default path currently fails)
            episodes = episodes_factory()
-        fpath = dir / EPISODES_PATH
-        fpath.parent.mkdir(parents=True, exist_ok=True)
-        with jsonlines.open(fpath, "w") as writer:
-            writer.write_all(episodes.values())
-        return fpath
+        write_episodes(episodes, dir)

-    return _create_episodes_jsonl_file
+    return _create_episodes


@pytest.fixture(scope="session")
-def single_episode_parquet_path(hf_dataset_factory, info_factory):
-    def _create_single_episode_parquet(
-        dir: Path, ep_idx: int = 0, hf_dataset: datasets.Dataset | None = None, info: dict | None = None
-    ) -> Path:
-        if not info:
-            info = info_factory()
+def create_hf_dataset(hf_dataset_factory):
+    def _create_hf_dataset(
+        dir: Path,
+        hf_dataset: datasets.Dataset | None = None,
+        data_file_size_in_mb: float | None = None,
+        chunk_size: int | None = None,
+    ):
        if hf_dataset is None:
            hf_dataset = hf_dataset_factory()
+        write_hf_dataset(hf_dataset, dir, data_file_size_in_mb, chunk_size)

-        data_path = info["data_path"]
-        chunks_size = info["chunks_size"]
-        ep_chunk = ep_idx // chunks_size
-        fpath = dir / data_path.format(episode_chunk=ep_chunk, episode_index=ep_idx)
-        fpath.parent.mkdir(parents=True, exist_ok=True)
-        table = hf_dataset.data.table
-        ep_table = table.filter(pc.equal(table["episode_index"], ep_idx))
-        pq.write_table(ep_table, fpath)
-        return fpath
-
-    return _create_single_episode_parquet
-
-
-@pytest.fixture(scope="session")
-def multi_episode_parquet_path(hf_dataset_factory, info_factory):
-    def _create_multi_episode_parquet(
-        dir: Path, hf_dataset: datasets.Dataset | None = None, info: dict | None = None
-    ) -> Path:
-        if not info:
-            info = info_factory()
-        if hf_dataset is None:
-            hf_dataset = hf_dataset_factory()
-
-        data_path = info["data_path"]
-        chunks_size = info["chunks_size"]
-        total_episodes = info["total_episodes"]
-        for ep_idx in range(total_episodes):
-            ep_chunk = ep_idx // chunks_size
-            fpath = dir / data_path.format(episode_chunk=ep_chunk, episode_index=ep_idx)
-            fpath.parent.mkdir(parents=True, exist_ok=True)
-            table = hf_dataset.data.table
-            ep_table = table.filter(pc.equal(table["episode_index"], ep_idx))
-            pq.write_table(ep_table, fpath)
-        return dir / "data"
-
-    return _create_multi_episode_parquet
+    return _create_hf_dataset
--- a/tests/fixtures/hub.py
+++ b/tests/fixtures/hub.py
@@ -14,15 +14,19 @@
 from pathlib import Path

 import datasets
+import pandas as pd
 import pytest
 from huggingface_hub.utils import filter_repo_objects

 from lerobot.datasets.utils import (
-    EPISODES_PATH,
-    EPISODES_STATS_PATH,
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_DATA_FILE_SIZE_IN_MB,
+    DEFAULT_DATA_PATH,
+    DEFAULT_EPISODES_PATH,
+    DEFAULT_TASKS_PATH,
+    DEFAULT_VIDEO_PATH,
    INFO_PATH,
    STATS_PATH,
-    TASKS_PATH,
 )
 from tests.fixtures.constants import LEROBOT_TEST_DIR

@@ -30,17 +34,16 @@ from tests.fixtures.constants import LEROBOT_TEST_DIR
@pytest.fixture(scope="session")
 def mock_snapshot_download_factory(
    info_factory,
-    info_path,
+    create_info,
    stats_factory,
-    stats_path,
-    episodes_stats_factory,
-    episodes_stats_path,
+    create_stats,
    tasks_factory,
-    tasks_path,
+    create_tasks,
    episodes_factory,
-    episode_path,
-    single_episode_parquet_path,
+    create_episodes,
    hf_dataset_factory,
+    create_hf_dataset,
+    create_videos,
 ):
    """
    This factory allows to patch snapshot_download such that when called, it will create expected files rather
@@ -50,82 +53,93 @@ def mock_snapshot_download_factory(
    def _mock_snapshot_download_func(
        info: dict | None = None,
        stats: dict | None = None,
-        episodes_stats: list[dict] | None = None,
-        tasks: list[dict] | None = None,
-        episodes: list[dict] | None = None,
+        tasks: pd.DataFrame | None = None,
+        episodes: datasets.Dataset | None = None,
        hf_dataset: datasets.Dataset | None = None,
+        data_files_size_in_mb: float = DEFAULT_DATA_FILE_SIZE_IN_MB,
+        chunks_size: int = DEFAULT_CHUNK_SIZE,
    ):
-        if not info:
-            info = info_factory()
-        if not stats:
+        if info is None:
+            info = info_factory(data_files_size_in_mb=data_files_size_in_mb, chunks_size=chunks_size)
+        if stats is None:
            stats = stats_factory(features=info["features"])
-        if not episodes_stats:
-            episodes_stats = episodes_stats_factory(
-                features=info["features"], total_episodes=info["total_episodes"]
-            )
-        if not tasks:
+        if tasks is None:
            tasks = tasks_factory(total_tasks=info["total_tasks"])
-        if not episodes:
+        if episodes is None:
            episodes = episodes_factory(
-                total_episodes=info["total_episodes"], total_frames=info["total_frames"], tasks=tasks
+                features=info["features"],
+                fps=info["fps"],
+                total_episodes=info["total_episodes"],
+                total_frames=info["total_frames"],
+                tasks=tasks,
            )
-        if not hf_dataset:
+        if hf_dataset is None:
            hf_dataset = hf_dataset_factory(tasks=tasks, episodes=episodes, fps=info["fps"])

-        def _extract_episode_index_from_path(fpath: str) -> int:
-            path = Path(fpath)
-            if path.suffix == ".parquet" and path.stem.startswith("episode_"):
-                episode_index = int(path.stem[len("episode_") :])  # 'episode_000000' -> 0
-                return episode_index
-            else:
-                return None
-
        def _mock_snapshot_download(
-            repo_id: str,
+            repo_id: str,  # TODO(rcadene): repo_id is currently ignored; should the mock use it?
            local_dir: str | Path | None = None,
            allow_patterns: str | list[str] | None = None,
            ignore_patterns: str | list[str] | None = None,
            *args,
            **kwargs,
        ) -> str:
-            if not local_dir:
+            if local_dir is None:
                local_dir = LEROBOT_TEST_DIR

            # List all possible files
-            all_files = []
-            meta_files = [INFO_PATH, STATS_PATH, EPISODES_STATS_PATH, TASKS_PATH, EPISODES_PATH]
-            all_files.extend(meta_files)
+            all_files = [
+                INFO_PATH,
+                STATS_PATH,
+                # TODO(rcadene): remove the naive assumption of a single chunk 0 / file 0?
+                DEFAULT_TASKS_PATH.format(chunk_index=0, file_index=0),
+                DEFAULT_EPISODES_PATH.format(chunk_index=0, file_index=0),
+                DEFAULT_DATA_PATH.format(chunk_index=0, file_index=0),
+            ]

-            data_files = []
-            for episode_dict in episodes.values():
-                ep_idx = episode_dict["episode_index"]
-                ep_chunk = ep_idx // info["chunks_size"]
-                data_path = info["data_path"].format(episode_chunk=ep_chunk, episode_index=ep_idx)
-                data_files.append(data_path)
-            all_files.extend(data_files)
+            video_keys = [key for key, feats in info["features"].items() if feats["dtype"] == "video"]
+            for key in video_keys:
+                all_files.append(DEFAULT_VIDEO_PATH.format(video_key=key, chunk_index=0, file_index=0))

            allowed_files = filter_repo_objects(
                all_files, allow_patterns=allow_patterns, ignore_patterns=ignore_patterns
            )

-            # Create allowed files
+            request_info = False
+            request_tasks = False
+            request_episodes = False
+            request_stats = False
+            request_data = False
+            request_videos = False
            for rel_path in allowed_files:
-                if rel_path.startswith("data/"):
-                    episode_index = _extract_episode_index_from_path(rel_path)
-                    if episode_index is not None:
-                        _ = single_episode_parquet_path(local_dir, episode_index, hf_dataset, info)
-                if rel_path == INFO_PATH:
-                    _ = info_path(local_dir, info)
-                elif rel_path == STATS_PATH:
-                    _ = stats_path(local_dir, stats)
-                elif rel_path == EPISODES_STATS_PATH:
-                    _ = episodes_stats_path(local_dir, episodes_stats)
-                elif rel_path == TASKS_PATH:
-                    _ = tasks_path(local_dir, tasks)
-                elif rel_path == EPISODES_PATH:
-                    _ = episode_path(local_dir, episodes)
+                if rel_path.startswith("meta/info.json"):
+                    request_info = True
+                elif rel_path.startswith("meta/stats"):
+                    request_stats = True
+                elif rel_path.startswith("meta/tasks"):
+                    request_tasks = True
+                elif rel_path.startswith("meta/episodes"):
+                    request_episodes = True
+                elif rel_path.startswith("data/"):
+                    request_data = True
+                elif rel_path.startswith("videos/"):
+                    request_videos = True
                else:
-                    pass
+                    raise ValueError(f"{rel_path} not supported.")
+
+            if request_info:
+                create_info(local_dir, info)
+            if request_stats:
+                create_stats(local_dir, stats)
+            if request_tasks:
+                create_tasks(local_dir, tasks)
+            if request_episodes:
+                create_episodes(local_dir, episodes)
+            if request_data:
+                create_hf_dataset(local_dir, hf_dataset, data_files_size_in_mb, chunks_size)
+            if request_videos:
+                create_videos(root=local_dir, info=info)
+
            return str(local_dir)

        return _mock_snapshot_download