Package folder structure (#1417)

* Move files * Replace imports & paths * Update relative paths * Update doc symlinks * Update instructions paths * Fix imports * Update grpc files * Update more instructions * Downgrade grpc-tools * Update manifest * Update more paths * Update config paths * Update CI paths * Update bandit exclusions * Remove walkthrough section
2025-07-01 16:34:46 +02:00
parent 483be9aac2
commit d4ee470b00
268 changed files with 862 additions and 890 deletions
--- a/src/lerobot/datasets/backward_compatibility.py
+++ b/src/lerobot/datasets/backward_compatibility.py
@@ -0,0 +1,68 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import packaging.version
+
+# Template formatted by `BackwardCompatibilityError` with `repo_id` and `version`.
+# The doubled backslashes render as single `\` characters in the final message.
+V2_MESSAGE = """
+The dataset you requested ({repo_id}) is in {version} format.
+
+We introduced a new format since v2.0 which is not backward compatible with v1.x.
+Please, use our conversion script. Modify the following command with your own task description:
+```
+python -m lerobot.datasets.v2.convert_dataset_v1_to_v2 \\
+    --repo-id {repo_id} \\
+    --single-task "TASK DESCRIPTION."  # <---- /!\\ Replace TASK DESCRIPTION /!\\
+```
+
+A few examples to replace TASK DESCRIPTION: "Pick up the blue cube and place it into the bin.", "Insert the
+peg into the socket.", "Slide open the ziploc bag.", "Take the elevator to the 1st floor.", "Open the top
+cabinet, store the pot inside it then close the cabinet.", "Push the T-shaped block onto the T-shaped
+target.", "Grab the spray paint on the shelf and place it in the bin on top of the robot dog.", "Fold the
+sweatshirt.", ...
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+"""
+
+# Template with `repo_id` and `version` placeholders for datasets that still use global stats.
+# NOTE: no exception class in this module formats it; presumably it is consumed elsewhere
+# (e.g. logged as a warning) - verify against callers.
+V21_MESSAGE = """
+The dataset you requested ({repo_id}) is in {version} format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id={repo_id}
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+"""
+
+# Template formatted by `ForwardCompatibilityError` with `repo_id` and `version`.
+FUTURE_MESSAGE = """
+The dataset you requested ({repo_id}) is only available in {version} format.
+As we cannot ensure forward compatibility with it, please update your current version of lerobot.
+"""
+
+
+class CompatibilityError(Exception): ...
+
+
+class BackwardCompatibilityError(CompatibilityError):
+    def __init__(self, repo_id: str, version: packaging.version.Version):
+        message = V2_MESSAGE.format(repo_id=repo_id, version=version)
+        super().__init__(message)
+
+
+class ForwardCompatibilityError(CompatibilityError):
+    def __init__(self, repo_id: str, version: packaging.version.Version):
+        message = FUTURE_MESSAGE.format(repo_id=repo_id, version=version)
+        super().__init__(message)
--- a/src/lerobot/datasets/card_template.md
+++ b/src/lerobot/datasets/card_template.md
@@ -0,0 +1,27 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+{{ card_data }}
+---
+
+This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).
+
+## Dataset Description
+
+{{ dataset_description | default("", true) }}
+
+- **Homepage:** {{ url | default("[More Information Needed]", true)}}
+- **Paper:** {{ paper | default("[More Information Needed]", true)}}
+- **License:** {{ license | default("[More Information Needed]", true)}}
+
+## Dataset Structure
+
+{{ dataset_structure | default("[More Information Needed]", true)}}
+
+## Citation
+
+**BibTeX:**
+
+```bibtex
+{{ citation_bibtex | default("[More Information Needed]", true)}}
+```
--- a/src/lerobot/datasets/compute_stats.py
+++ b/src/lerobot/datasets/compute_stats.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+from lerobot.datasets.utils import load_image_as_numpy
+
+
+def estimate_num_samples(
+    dataset_len: int, min_num_samples: int = 100, max_num_samples: int = 10_000, power: float = 0.75
+) -> int:
+    """Heuristic to estimate the number of samples based on dataset size.
+    The power controls the sample growth relative to dataset size.
+    Lower the power for less number of samples.
+
+    For default arguments, we have:
+    - from 1 to ~500, num_samples=100
+    - at 1000, num_samples=177
+    - at 2000, num_samples=299
+    - at 5000, num_samples=594
+    - at 10000, num_samples=1000
+    - at 20000, num_samples=1681
+    """
+    if dataset_len < min_num_samples:
+        min_num_samples = dataset_len
+    return max(min_num_samples, min(int(dataset_len**power), max_num_samples))
+
+
+def sample_indices(data_len: int) -> list[int]:
+    num_samples = estimate_num_samples(data_len)
+    return np.round(np.linspace(0, data_len - 1, num_samples)).astype(int).tolist()
+
+
+def auto_downsample_height_width(img: np.ndarray, target_size: int = 150, max_size_threshold: int = 300):
+    _, height, width = img.shape
+
+    if max(width, height) < max_size_threshold:
+        # no downsampling needed
+        return img
+
+    downsample_factor = int(width / target_size) if width > height else int(height / target_size)
+    return img[:, ::downsample_factor, ::downsample_factor]
+
+
+def sample_images(image_paths: list[str]) -> np.ndarray:
+    sampled_indices = sample_indices(len(image_paths))
+
+    images = None
+    for i, idx in enumerate(sampled_indices):
+        path = image_paths[idx]
+        # we load as uint8 to reduce memory usage
+        img = load_image_as_numpy(path, dtype=np.uint8, channel_first=True)
+        img = auto_downsample_height_width(img)
+
+        if images is None:
+            images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8)
+
+        images[i] = img
+
+    return images
+
+
+def get_feature_stats(array: np.ndarray, axis: tuple, keepdims: bool) -> dict[str, np.ndarray]:
+    return {
+        "min": np.min(array, axis=axis, keepdims=keepdims),
+        "max": np.max(array, axis=axis, keepdims=keepdims),
+        "mean": np.mean(array, axis=axis, keepdims=keepdims),
+        "std": np.std(array, axis=axis, keepdims=keepdims),
+        "count": np.array([len(array)]),
+    }
+
+
+def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], features: dict) -> dict:
+    ep_stats = {}
+    for key, data in episode_data.items():
+        if features[key]["dtype"] == "string":
+            continue  # HACK: we should receive np.arrays of strings
+        elif features[key]["dtype"] in ["image", "video"]:
+            ep_ft_array = sample_images(data)  # data is a list of image paths
+            axes_to_reduce = (0, 2, 3)  # keep channel dim
+            keepdims = True
+        else:
+            ep_ft_array = data  # data is already a np.ndarray
+            axes_to_reduce = 0  # compute stats over the first axis
+            keepdims = data.ndim == 1  # keep as np.array
+
+        ep_stats[key] = get_feature_stats(ep_ft_array, axis=axes_to_reduce, keepdims=keepdims)
+
+        # finally, we normalize and remove batch dim for images
+        if features[key]["dtype"] in ["image", "video"]:
+            ep_stats[key] = {
+                k: v if k == "count" else np.squeeze(v / 255.0, axis=0) for k, v in ep_stats[key].items()
+            }
+
+    return ep_stats
+
+
+def _assert_type_and_shape(stats_list: list[dict[str, dict]]):
+    for i in range(len(stats_list)):
+        for fkey in stats_list[i]:
+            for k, v in stats_list[i][fkey].items():
+                if not isinstance(v, np.ndarray):
+                    raise ValueError(
+                        f"Stats must be composed of numpy array, but key '{k}' of feature '{fkey}' is of type '{type(v)}' instead."
+                    )
+                if v.ndim == 0:
+                    raise ValueError("Number of dimensions must be at least 1, and is 0 instead.")
+                if k == "count" and v.shape != (1,):
+                    raise ValueError(f"Shape of 'count' must be (1), but is {v.shape} instead.")
+                if "image" in fkey and k != "count" and v.shape != (3, 1, 1):
+                    raise ValueError(f"Shape of '{k}' must be (3,1,1), but is {v.shape} instead.")
+
+
+def aggregate_feature_stats(stats_ft_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
+    """Aggregates stats for a single feature."""
+    means = np.stack([s["mean"] for s in stats_ft_list])
+    variances = np.stack([s["std"] ** 2 for s in stats_ft_list])
+    counts = np.stack([s["count"] for s in stats_ft_list])
+    total_count = counts.sum(axis=0)
+
+    # Prepare weighted mean by matching number of dimensions
+    while counts.ndim < means.ndim:
+        counts = np.expand_dims(counts, axis=-1)
+
+    # Compute the weighted mean
+    weighted_means = means * counts
+    total_mean = weighted_means.sum(axis=0) / total_count
+
+    # Compute the variance using the parallel algorithm
+    delta_means = means - total_mean
+    weighted_variances = (variances + delta_means**2) * counts
+    total_variance = weighted_variances.sum(axis=0) / total_count
+
+    return {
+        "min": np.min(np.stack([s["min"] for s in stats_ft_list]), axis=0),
+        "max": np.max(np.stack([s["max"] for s in stats_ft_list]), axis=0),
+        "mean": total_mean,
+        "std": np.sqrt(total_variance),
+        "count": total_count,
+    }
+
+
+def aggregate_stats(stats_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
+    """Aggregate stats from multiple compute_stats outputs into a single set of stats.
+
+    The final stats will have the union of all data keys from each of the stats dicts.
+
+    For instance:
+    - new_min = min(min_dataset_0, min_dataset_1, ...)
+    - new_max = max(max_dataset_0, max_dataset_1, ...)
+    - new_mean = (mean of all data, weighted by counts)
+    - new_std = (std of all data)
+    """
+
+    _assert_type_and_shape(stats_list)
+
+    data_keys = {key for stats in stats_list for key in stats}
+    aggregated_stats = {key: {} for key in data_keys}
+
+    for key in data_keys:
+        stats_with_key = [stats[key] for stats in stats_list if key in stats]
+        aggregated_stats[key] = aggregate_feature_stats(stats_with_key)
+
+    return aggregated_stats
--- a/src/lerobot/datasets/factory.py
+++ b/src/lerobot/datasets/factory.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from pprint import pformat
+
+import torch
+
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.train import TrainPipelineConfig
+from lerobot.datasets.lerobot_dataset import (
+    LeRobotDataset,
+    LeRobotDatasetMetadata,
+    MultiLeRobotDataset,
+)
+from lerobot.datasets.transforms import ImageTransforms
+
+# Per-channel mean/std that `make_dataset` writes over the stats of every camera key when
+# `cfg.dataset.use_imagenet_stats` is set. Nested lists give each entry a (c,1,1) shape.
+IMAGENET_STATS = {
+    "mean": [[[0.485]], [[0.456]], [[0.406]]],  # (c,1,1)
+    "std": [[[0.229]], [[0.224]], [[0.225]]],  # (c,1,1)
+}
+
+
+def resolve_delta_timestamps(
+    cfg: PreTrainedConfig, ds_meta: LeRobotDatasetMetadata
+) -> dict[str, list] | None:
+    """Resolves delta_timestamps by reading from the 'delta_indices' properties of the PreTrainedConfig.
+
+    Args:
+        cfg (PreTrainedConfig): The PreTrainedConfig to read delta_indices from.
+        ds_meta (LeRobotDatasetMetadata): The dataset from which features and fps are used to build
+            delta_timestamps against.
+
+    Returns:
+        dict[str, list] | None: A dictionary of delta_timestamps, e.g.:
+            {
+                "observation.state": [-0.04, -0.02, 0]
+                "observation.action": [-0.02, 0, 0.02]
+            }
+            returns `None` if the resulting dict is empty.
+    """
+    delta_timestamps = {}
+    for key in ds_meta.features:
+        if key == "next.reward" and cfg.reward_delta_indices is not None:
+            delta_timestamps[key] = [i / ds_meta.fps for i in cfg.reward_delta_indices]
+        if key == "action" and cfg.action_delta_indices is not None:
+            delta_timestamps[key] = [i / ds_meta.fps for i in cfg.action_delta_indices]
+        if key.startswith("observation.") and cfg.observation_delta_indices is not None:
+            delta_timestamps[key] = [i / ds_meta.fps for i in cfg.observation_delta_indices]
+
+    if len(delta_timestamps) == 0:
+        delta_timestamps = None
+
+    return delta_timestamps
+
+
+def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDataset:
+    """Handles the logic of setting up delta timestamps and image transforms before creating a dataset.
+
+    Args:
+        cfg (TrainPipelineConfig): A TrainPipelineConfig config which contains a DatasetConfig and a PreTrainedConfig.
+
+    Raises:
+        NotImplementedError: The MultiLeRobotDataset is currently deactivated, so this is raised
+            whenever `cfg.dataset.repo_id` is not a single string.
+
+    Returns:
+        LeRobotDataset | MultiLeRobotDataset
+    """
+    # Image transforms are only instantiated when enabled in the dataset config.
+    image_transforms = (
+        ImageTransforms(cfg.dataset.image_transforms) if cfg.dataset.image_transforms.enable else None
+    )
+
+    if isinstance(cfg.dataset.repo_id, str):
+        # The metadata is loaded first because delta timestamps depend on its features and fps.
+        ds_meta = LeRobotDatasetMetadata(
+            cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
+        )
+        delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
+        dataset = LeRobotDataset(
+            cfg.dataset.repo_id,
+            root=cfg.dataset.root,
+            episodes=cfg.dataset.episodes,
+            delta_timestamps=delta_timestamps,
+            image_transforms=image_transforms,
+            revision=cfg.dataset.revision,
+            video_backend=cfg.dataset.video_backend,
+        )
+    else:
+        raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
+        # NOTE: the rest of this branch is unreachable because of the raise above; it is
+        # presumably kept for when multi-dataset support is re-enabled (see the TODO below).
+        dataset = MultiLeRobotDataset(
+            cfg.dataset.repo_id,
+            # TODO(aliberts): add proper support for multi dataset
+            # delta_timestamps=delta_timestamps,
+            image_transforms=image_transforms,
+            video_backend=cfg.dataset.video_backend,
+        )
+        logging.info(
+            "Multiple datasets were provided. Applied the following index mapping to the provided datasets: "
+            f"{pformat(dataset.repo_id_to_index, indent=2)}"
+        )
+
+    if cfg.dataset.use_imagenet_stats:
+        # Overwrite the "mean"/"std" stats of every camera key with the IMAGENET_STATS values.
+        for key in dataset.meta.camera_keys:
+            for stats_type, stats in IMAGENET_STATS.items():
+                dataset.meta.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32)
+
+    return dataset
--- a/src/lerobot/datasets/image_writer.py
+++ b/src/lerobot/datasets/image_writer.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import multiprocessing
+import queue
+import threading
+from pathlib import Path
+
+import numpy as np
+import PIL.Image
+import torch
+
+
+def safe_stop_image_writer(func):
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            dataset = kwargs.get("dataset")
+            image_writer = getattr(dataset, "image_writer", None) if dataset else None
+            if image_writer is not None:
+                print("Waiting for image writer to terminate...")
+                image_writer.stop()
+            raise e
+
+    return wrapper
+
+
+def image_array_to_pil_image(image_array: np.ndarray, range_check: bool = True) -> PIL.Image.Image:
+    # TODO(aliberts): handle 1 channel and 4 for depth images
+    if image_array.ndim != 3:
+        raise ValueError(f"The array has {image_array.ndim} dimensions, but 3 is expected for an image.")
+
+    if image_array.shape[0] == 3:
+        # Transpose from pytorch convention (C, H, W) to (H, W, C)
+        image_array = image_array.transpose(1, 2, 0)
+
+    elif image_array.shape[-1] != 3:
+        raise NotImplementedError(
+            f"The image has {image_array.shape[-1]} channels, but 3 is required for now."
+        )
+
+    if image_array.dtype != np.uint8:
+        if range_check:
+            max_ = image_array.max().item()
+            min_ = image_array.min().item()
+            if max_ > 1.0 or min_ < 0.0:
+                raise ValueError(
+                    "The image data type is float, which requires values in the range [0.0, 1.0]. "
+                    f"However, the provided range is [{min_}, {max_}]. Please adjust the range or "
+                    "provide a uint8 image with values in the range [0, 255]."
+                )
+
+        image_array = (image_array * 255).astype(np.uint8)
+
+    return PIL.Image.fromarray(image_array)
+
+
+def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path):
+    try:
+        if isinstance(image, np.ndarray):
+            img = image_array_to_pil_image(image)
+        elif isinstance(image, PIL.Image.Image):
+            img = image
+        else:
+            raise TypeError(f"Unsupported image type: {type(image)}")
+        img.save(fpath)
+    except Exception as e:
+        print(f"Error writing image {fpath}: {e}")
+
+
+def worker_thread_loop(queue: queue.Queue):
+    while True:
+        item = queue.get()
+        if item is None:
+            queue.task_done()
+            break
+        image_array, fpath = item
+        write_image(image_array, fpath)
+        queue.task_done()
+
+
+def worker_process(queue: queue.Queue, num_threads: int):
+    threads = []
+    for _ in range(num_threads):
+        t = threading.Thread(target=worker_thread_loop, args=(queue,))
+        t.daemon = True
+        t.start()
+        threads.append(t)
+    for t in threads:
+        t.join()
+
+
+class AsyncImageWriter:
+    """
+    This class abstracts away the initialisation of processes and/or threads to
+    save images on disk asynchronously, which is critical to control a robot and record data
+    at a high frame rate.
+
+    When `num_processes=0`, it creates a threads pool of size `num_threads`.
+    When `num_processes>0`, it creates processes pool of size `num_processes`, where each subprocess starts
+    their own threads pool of size `num_threads`.
+
+    The optimal number of processes and threads depends on your computer capabilities.
+    We advise to use 4 threads per camera with 0 processes. If the fps is not stable, try to increase or lower
+    the number of threads. If it is still not stable, try to use 1 subprocess, or more.
+    """
+
+    def __init__(self, num_processes: int = 0, num_threads: int = 1):
+        """Start the worker threads (or worker processes) right away.
+
+        Raises:
+            ValueError: if both `num_threads` and `num_processes` are zero or negative.
+        """
+        self.num_processes = num_processes
+        self.num_threads = num_threads
+        self.queue = None
+        self.threads = []
+        self.processes = []
+        # Set once `stop()` has completed, making later `stop()` calls no-ops.
+        self._stopped = False
+
+        # Only the case where BOTH counts are non-positive is rejected here.
+        if num_threads <= 0 and num_processes <= 0:
+            raise ValueError("Number of threads and processes must be greater than zero.")
+
+        if self.num_processes == 0:
+            # Use threading
+            self.queue = queue.Queue()
+            for _ in range(self.num_threads):
+                t = threading.Thread(target=worker_thread_loop, args=(self.queue,))
+                t.daemon = True
+                t.start()
+                self.threads.append(t)
+        else:
+            # Use multiprocessing
+            # A JoinableQueue provides the `join()` used by `wait_until_done`.
+            self.queue = multiprocessing.JoinableQueue()
+            for _ in range(self.num_processes):
+                p = multiprocessing.Process(target=worker_process, args=(self.queue, self.num_threads))
+                p.daemon = True
+                p.start()
+                self.processes.append(p)
+
+    def save_image(self, image: torch.Tensor | np.ndarray | PIL.Image.Image, fpath: Path):
+        """Queue `image` to be written to `fpath` by a worker."""
+        if isinstance(image, torch.Tensor):
+            # Convert tensor to numpy array to minimize main process time
+            image = image.cpu().numpy()
+        self.queue.put((image, fpath))
+
+    def wait_until_done(self):
+        """Block until every queued item has been marked done by the workers."""
+        self.queue.join()
+
+    def stop(self):
+        """Send stop sentinels to the workers and wait for them to finish.
+
+        Calling it again after a completed stop does nothing.
+        """
+        if self._stopped:
+            return
+
+        if self.num_processes == 0:
+            # One `None` sentinel per thread: each worker loop exits after receiving one.
+            for _ in self.threads:
+                self.queue.put(None)
+            for t in self.threads:
+                t.join()
+        else:
+            # One sentinel per worker thread across all processes.
+            num_nones = self.num_processes * self.num_threads
+            for _ in range(num_nones):
+                self.queue.put(None)
+            for p in self.processes:
+                p.join()
+                # NOTE(review): `join()` has no timeout, so the process should already have
+                # exited here and this branch looks unreachable - confirm.
+                if p.is_alive():
+                    p.terminate()
+            self.queue.close()
+            self.queue.join_thread()
+
+        self._stopped = True
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
--- a/src/lerobot/datasets/online_buffer.py
+++ b/src/lerobot/datasets/online_buffer.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""An online buffer for the online training loop in train.py
+
+Note to maintainers: This duplicates some logic from LeRobotDataset and EpisodeAwareSampler. We should
+consider converging to one approach. Here we have opted to use numpy.memmap to back the data buffer. It's much
+faster than using HuggingFace Datasets as there's no conversion to an intermediate non-python object. Also it
+supports in-place slicing and mutation which is very handy for a dynamic buffer.
+"""
+
+import os
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+
+def _make_memmap_safe(**kwargs) -> np.memmap:
+    """Make a numpy memmap with checks on available disk space first.
+
+    Expected kwargs are: "filename", "dtype" (must by np.dtype), "mode" and "shape"
+
+    For information on dtypes:
+    https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes-constructing
+    """
+    if kwargs["mode"].startswith("w"):
+        required_space = kwargs["dtype"].itemsize * np.prod(kwargs["shape"])  # bytes
+        stats = os.statvfs(Path(kwargs["filename"]).parent)
+        available_space = stats.f_bavail * stats.f_frsize  # bytes
+        if required_space >= available_space * 0.8:
+            raise RuntimeError(
+                f"You're about to take up {required_space} of {available_space} bytes available."
+            )
+    return np.memmap(**kwargs)
+
+
+class OnlineBuffer(torch.utils.data.Dataset):
+    """FIFO data buffer for the online training loop in train.py.
+
+    Follows the protocol of LeRobotDataset as much as is required to have it be used by the online training
+    loop in the same way that a LeRobotDataset would be used.
+
+    The underlying data structure will have data inserted in a circular fashion. Always insert after the
+    last index, and when you reach the end, wrap around to the start.
+
+    The data is stored in a numpy memmap.
+    """
+
+    NEXT_INDEX_KEY = "_next_index"
+    OCCUPANCY_MASK_KEY = "_occupancy_mask"
+    INDEX_KEY = "index"
+    FRAME_INDEX_KEY = "frame_index"
+    EPISODE_INDEX_KEY = "episode_index"
+    TIMESTAMP_KEY = "timestamp"
+    IS_PAD_POSTFIX = "_is_pad"
+
+    def __init__(
+        self,
+        write_dir: str | Path,
+        data_spec: dict[str, Any] | None,
+        buffer_capacity: int | None,
+        fps: float | None = None,
+        delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None,
+    ):
+        """
+        The online buffer can be provided from scratch or you can load an existing online buffer by passing
+        a `write_dir` associated with an existing buffer.
+
+        Args:
+            write_dir: Where to keep the numpy memmap files. One memmap file will be stored for each data key.
+                Note that if the files already exist, they are opened in read-write mode (used for training
+                resumption.)
+            data_spec: A mapping from data key to data specification, like {data_key: {"shape": tuple[int],
+                "dtype": np.dtype}}. This should include all the data that you wish to record into the buffer,
+                but note that "index", "frame_index" and "episode_index" are already accounted for by this
+                class, so you don't need to include them.
+            buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your
+                system's available disk space when choosing this.
+            fps: Same as the fps concept in LeRobot dataset. Here it needs to be provided for the
+                 delta_timestamps logic. You can pass None if you are not using delta_timestamps.
+            delta_timestamps: Same as the delta_timestamps concept in LeRobotDataset. This is internally
+                converted to dict[str, np.ndarray] for optimization purposes.
+
+        """
+        self.set_delta_timestamps(delta_timestamps)
+        self._fps = fps
+        # Tolerance in seconds used to discard loaded frames when their timestamps are not close enough from
+        # the requested frames. It is only used when `delta_timestamps` is provided.
+        # minus 1e-4 to account for possible numerical error
+        # NOTE(review): this reads `self.fps`, which is defined outside the visible part of the
+        # class - presumably a property returning `self._fps`; confirm.
+        self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None
+        self._buffer_capacity = buffer_capacity
+        # NOTE(review): `_make_data_spec` iterates over `data_spec`, so although the annotation
+        # allows None, passing None looks unsupported here - confirm.
+        data_spec = self._make_data_spec(data_spec, buffer_capacity)
+        Path(write_dir).mkdir(parents=True, exist_ok=True)
+        # One memmap per key of the completed data spec, stored as `write_dir/<key>`.
+        self._data = {}
+        for k, v in data_spec.items():
+            self._data[k] = _make_memmap_safe(
+                filename=Path(write_dir) / k,
+                dtype=v["dtype"] if v is not None else None,
+                # Reopen an existing file in read-write mode, otherwise create it.
+                mode="r+" if (Path(write_dir) / k).exists() else "w+",
+                shape=tuple(v["shape"]) if v is not None else None,
+            )
+
+    @property
+    def delta_timestamps(self) -> dict[str, np.ndarray] | None:
+        """Return the delta timestamps as stored by `set_delta_timestamps` (numpy arrays, or None)."""
+        return self._delta_timestamps
+
+    def set_delta_timestamps(self, value: dict[str, list[float]] | None):
+        """Set delta_timestamps converting the values to numpy arrays.
+
+        The conversion is for an optimization in the __getitem__. The loop is much slower if the arrays
+        need to be converted into numpy arrays.
+        """
+        if value is not None:
+            self._delta_timestamps = {k: np.array(v) for k, v in value.items()}
+        else:
+            self._delta_timestamps = None
+
+    def _make_data_spec(self, data_spec: dict[str, Any], buffer_capacity: int) -> dict[str, dict[str, Any]]:
+        """Makes the data spec for np.memmap."""
+        if any(k.startswith("_") for k in data_spec):
+            raise ValueError(
+                "data_spec keys should not start with '_'. This prefix is reserved for internal logic."
+            )
+        preset_keys = {
+            OnlineBuffer.INDEX_KEY,
+            OnlineBuffer.FRAME_INDEX_KEY,
+            OnlineBuffer.EPISODE_INDEX_KEY,
+            OnlineBuffer.TIMESTAMP_KEY,
+        }
+        if len(intersection := set(data_spec).intersection(preset_keys)) > 0:
+            raise ValueError(
+                f"data_spec should not contain any of {preset_keys} as these are handled internally. "
+                f"The provided data_spec has {intersection}."
+            )
+        complete_data_spec = {
+            # _next_index will be a pointer to the next index that we should start filling from when we add
+            # more data.
+            OnlineBuffer.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": ()},
+            # Since the memmap is initialized with all-zeros, this keeps track of which indices are occupied
+            # with real data rather than the dummy initialization.
+            OnlineBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (buffer_capacity,)},
+            OnlineBuffer.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
+            OnlineBuffer.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
+            OnlineBuffer.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
+            OnlineBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (buffer_capacity,)},
+        }
+        for k, v in data_spec.items():
+            complete_data_spec[k] = {"dtype": v["dtype"], "shape": (buffer_capacity, *v["shape"])}
+        return complete_data_spec
+
+    def add_data(self, data: dict[str, np.ndarray]):
+        """Add new data to the buffer, which could potentially mean shifting old data out.
+
+        The new data should contain all the frames (in order) of any number of episodes. The indices should
+        start from 0 (note to the developer: this can easily be generalized). See the `rollout` and
+        `eval_policy` functions in `eval.py` for more information on how the data is constructed.
+
+        Shift the incoming data index and episode_index to continue on from the last frame. Note that this
+        will be done in place!
+
+        Raises:
+            ValueError: if `data` lacks one of `self.data_keys`, or if its items differ in length.
+        """
+        if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0:
+            raise ValueError(f"Missing data keys: {missing_keys}")
+        new_data_length = len(data[self.data_keys[0]])
+        if not all(len(data[k]) == new_data_length for k in self.data_keys):
+            raise ValueError("All data items should have the same length")
+
+        next_index = self._data[OnlineBuffer.NEXT_INDEX_KEY]
+
+        # Sanity check to make sure that the new data indices start from 0.
+        # NOTE: these are plain asserts, so they are skipped when Python runs with -O.
+        assert data[OnlineBuffer.EPISODE_INDEX_KEY][0].item() == 0
+        assert data[OnlineBuffer.INDEX_KEY][0].item() == 0
+
+        # Shift the incoming indices if necessary.
+        if self.num_frames > 0:
+            last_episode_index = self._data[OnlineBuffer.EPISODE_INDEX_KEY][next_index - 1]
+            last_data_index = self._data[OnlineBuffer.INDEX_KEY][next_index - 1]
+            # In-place additions: the caller's arrays are modified.
+            data[OnlineBuffer.EPISODE_INDEX_KEY] += last_episode_index + 1
+            data[OnlineBuffer.INDEX_KEY] += last_data_index + 1
+
+        # Insert the new data starting from next_index. It may be necessary to wrap around to the start.
+        # n_surplus is the number of new frames that do not fit before the end of the buffer.
+        n_surplus = max(0, new_data_length - (self._buffer_capacity - next_index))
+        for k in self.data_keys:
+            if n_surplus == 0:
+                slc = slice(next_index, next_index + new_data_length)
+                self._data[k][slc] = data[k]
+                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][slc] = True
+            else:
+                # Fill up to the end of the buffer, then write the surplus frames at the start.
+                self._data[k][next_index:] = data[k][:-n_surplus]
+                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][next_index:] = True
+                # NOTE(review): the occupancy mask is not set for `[:n_surplus]`; presumably
+                # those slots are already occupied whenever a wrap-around happens - confirm.
+                self._data[k][:n_surplus] = data[k][-n_surplus:]
+        # NOTE(review): both assignments below rebind the dict entry to a new value instead of
+        # writing into the `_next_index` memmap in place, so the on-disk pointer may never be
+        # updated - confirm whether resuming from `write_dir` relies on it.
+        if n_surplus == 0:
+            self._data[OnlineBuffer.NEXT_INDEX_KEY] = next_index + new_data_length
+        else:
+            self._data[OnlineBuffer.NEXT_INDEX_KEY] = n_surplus
+
+    @property
+    def data_keys(self) -> list[str]:
+        """Sorted keys of `self._data`, excluding the occupancy-mask and next-index bookkeeping entries."""
+        remaining = set(self._data)
+        for bookkeeping_key in (OnlineBuffer.OCCUPANCY_MASK_KEY, OnlineBuffer.NEXT_INDEX_KEY):
+            # `remove` rather than `discard`: a missing bookkeeping entry must still raise KeyError.
+            remaining.remove(bookkeeping_key)
+        return sorted(remaining)
+
+    @property
+    def fps(self) -> float | None:
+        """Read-only access to the private `_fps` attribute, which may be None."""
+        return self._fps
+
+    @property
+    def num_episodes(self) -> int:
+        """Number of distinct episode indices stored in the occupied slots of the buffer."""
+        occupied = self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]
+        stored_episode_indices = self._data[OnlineBuffer.EPISODE_INDEX_KEY][occupied]
+        return len(np.unique(stored_episode_indices))
+
+    @property
+    def num_frames(self) -> int:
+        """Number of slots flagged as occupied in the occupancy mask."""
+        occupancy_mask = self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]
+        return np.count_nonzero(occupancy_mask)
+
+    def __len__(self):
+        """Length of the buffer, defined as `self.num_frames`."""
+        return self.num_frames
+
+    def _item_to_tensors(self, item: dict) -> dict:
+        """Return a new dict with the same keys as `item` and every value converted to a `torch.Tensor`.
+
+        Tensors are passed through untouched, numpy arrays go through `torch.from_numpy`, and anything
+        else is handed to `torch.tensor`.
+        """
+
+        def _as_tensor(value):
+            if isinstance(value, torch.Tensor):
+                return value
+            if isinstance(value, np.ndarray):
+                return torch.from_numpy(value)
+            return torch.tensor(value)
+
+        return {key: _as_tensor(value) for key, value in item.items()}
+
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        """Return the frame stored at buffer position `idx` as a dict of tensors.
+
+        When `self.delta_timestamps` is None, the item holds one entry per key of `self._data` that does not
+        start with an underscore. Otherwise, each key listed in `self.delta_timestamps` is replaced by the
+        frames of the same episode whose timestamps are closest to `current timestamp + delta`, and an
+        additional `<key><IS_PAD_POSTFIX>` entry flags the queries whose closest frame is further away than
+        `self.tolerance_s`.
+
+        Raises:
+            IndexError: If `idx` is not in `[-len(self), len(self))`.
+            AssertionError: If a query violates the tolerance while lying inside the episode's time range.
+        """
+        if idx >= len(self) or idx < -len(self):
+            raise IndexError
+
+        # Keys starting with "_" are skipped.
+        # NOTE(review): `idx` indexes the underlying arrays directly, so this presumably relies on the
+        # occupied slots being the first `len(self)` ones - confirm.
+        item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")}
+
+        if self.delta_timestamps is None:
+            return self._item_to_tensors(item)
+
+        episode_index = item[OnlineBuffer.EPISODE_INDEX_KEY]
+        current_ts = item[OnlineBuffer.TIMESTAMP_KEY]
+        # Buffer positions of all occupied frames that belong to the same episode as the requested frame.
+        episode_data_indices = np.where(
+            np.bitwise_and(
+                self._data[OnlineBuffer.EPISODE_INDEX_KEY] == episode_index,
+                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY],
+            )
+        )[0]
+        episode_timestamps = self._data[OnlineBuffer.TIMESTAMP_KEY][episode_data_indices]
+
+        for data_key in self.delta_timestamps:
+            # Note: The logic in this loop is copied from `load_previous_and_future_frames`.
+            # Get timestamps used as query to retrieve data of previous/future frames.
+            query_ts = current_ts + self.delta_timestamps[data_key]
+
+            # Compute distances between each query timestamp and all timestamps of all the frames belonging to
+            # the episode.
+            dist = np.abs(query_ts[:, None] - episode_timestamps[None, :])
+            # For every query, the position (within the episode) of the closest frame and its distance.
+            argmin_ = np.argmin(dist, axis=1)
+            min_ = dist[np.arange(dist.shape[0]), argmin_]
+
+            is_pad = min_ > self.tolerance_s
+
+            # Check violated query timestamps are all outside the episode range.
+            # NOTE(review): this compares against the first and last entries of `episode_timestamps`, so it
+            # presumably assumes the episode frames are stored in chronological order - confirm.
+            assert (
+                (query_ts[is_pad] < episode_timestamps[0]) | (episode_timestamps[-1] < query_ts[is_pad])
+            ).all(), (
+                f"One or several timestamps unexpectedly violate the tolerance ({min_} > {self.tolerance_s=}"
+                ") inside the episode range."
+            )
+
+            # Load frames for this data key.
+            # Padded queries still receive their closest frame; consumers tell them apart through `is_pad`.
+            item[data_key] = self._data[data_key][episode_data_indices[argmin_]]
+
+            item[f"{data_key}{OnlineBuffer.IS_PAD_POSTFIX}"] = is_pad
+
+        return self._item_to_tensors(item)
+
+    def get_data_by_key(self, key: str) -> torch.Tensor:
+        """Return, as a Tensor, the entries stored under `key` in the occupied slots of the buffer."""
+        occupied = self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]
+        occupied_values = self._data[key][occupied]
+        return torch.from_numpy(occupied_values)
+
+
+def _episode_masked_weights(
+    num_frames: int,
+    episode_bounds: list,
+    drop_n_last_frames: int,
+    sampling_ratio: float,
+) -> torch.Tensor:
+    """Spread `sampling_ratio` uniformly over the frames of one dataset that are not dropped.
+
+    Args:
+        num_frames: Total number of frames in the dataset (length of the returned tensor).
+        episode_bounds: `(start, end)` pairs of integer frame indices, `end` being exclusive.
+        drop_n_last_frames: Number of frames at the end of each episode that get a weight of 0.
+        sampling_ratio: Total weight to share between the frames that are kept.
+
+    Returns:
+        Tensor of `num_frames` weights. It is all zeros when no frame is kept.
+    """
+    keep = torch.zeros(num_frames, dtype=torch.bool)
+    for start, end in episode_bounds:
+        # Clamp the stop so that an episode shorter than `drop_n_last_frames` contributes nothing; a
+        # negative stop would otherwise be interpreted as counting from the end of the dataset.
+        keep[start : max(start, end - drop_n_last_frames)] = True
+    num_kept = int(keep.sum())
+    if num_kept == 0:
+        # Avoid a division by zero; the caller decides what to do when every weight is zero.
+        return torch.zeros(num_frames)
+    return keep.to(torch.get_default_dtype()) * (sampling_ratio / num_kept)
+
+
+def compute_sampler_weights(
+    offline_dataset: LeRobotDataset,
+    offline_drop_n_last_frames: int = 0,
+    online_dataset: OnlineBuffer | None = None,
+    online_sampling_ratio: float | None = None,
+    online_drop_n_last_frames: int = 0,
+) -> torch.Tensor:
+    """Compute the sampling weights for the online training dataloader in train.py.
+
+    Args:
+        offline_dataset: The LeRobotDataset used for offline pre-training.
+        offline_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode.
+        online_dataset: The OnlineBuffer used in online training.
+        online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an
+            online dataset is provided, this value must also be provided.
+        online_drop_n_last_frames: See `offline_drop_n_last_frames`. This is the same, but for the online
+            dataset.
+    Returns:
+        Tensor of weights for [offline_dataset; online_dataset], normalized to 1. Dropped frames have a
+        weight of 0. If every computed weight is 0 (for instance because the only dataset with a non-zero
+        sampling ratio is empty), all the frames get the same weight instead.
+
+    Raises:
+        ValueError: If neither dataset contains data, or if only one of `online_dataset` and
+            `online_sampling_ratio` is provided.
+
+    Notes to maintainers:
+        - This duplicates some logic from EpisodeAwareSampler. We should consider converging to one approach.
+        - When used with `torch.utils.data.WeightedRandomSampler`, it could completely replace
+          `EpisodeAwareSampler` as the online dataset related arguments are optional. The only missing feature
+          is the ability to turn shuffling off.
+        - Options `drop_first_n_frames` and `episode_indices_to_use` can be added easily. They were not
+          included here to avoid adding complexity.
+    """
+    if len(offline_dataset) == 0 and (online_dataset is None or len(online_dataset) == 0):
+        raise ValueError("At least one of `offline_dataset` or `online_dataset` should contain data.")
+    if (online_dataset is None) ^ (online_sampling_ratio is None):
+        raise ValueError(
+            "`online_dataset` and `online_sampling_ratio` must be provided together or not at all."
+        )
+    # Without an online dataset the offline dataset receives all of the probability mass. Giving it a
+    # ratio of 0 would zero every weight and make the uniform fallback below ignore
+    # `offline_drop_n_last_frames`.
+    offline_sampling_ratio = 1.0 if online_sampling_ratio is None else 1 - online_sampling_ratio
+
+    weights = []
+
+    if len(offline_dataset) > 0:
+        offline_bounds = [
+            (start_index.item(), end_index.item())
+            for start_index, end_index in zip(
+                offline_dataset.episode_data_index["from"],
+                offline_dataset.episode_data_index["to"],
+                strict=True,
+            )
+        ]
+        weights.append(
+            _episode_masked_weights(
+                len(offline_dataset), offline_bounds, offline_drop_n_last_frames, offline_sampling_ratio
+            )
+        )
+
+    if online_dataset is not None and len(online_dataset) > 0:
+        episode_indices = online_dataset.get_data_by_key("episode_index")
+        online_bounds = []
+        for episode_idx in torch.unique(episode_indices):
+            # An episode spans from the first to the last position at which its index appears.
+            positions = torch.where(episode_indices == episode_idx)[0]
+            online_bounds.append((positions[0].item(), positions[-1].item() + 1))
+        weights.append(
+            _episode_masked_weights(
+                len(online_dataset), online_bounds, online_drop_n_last_frames, online_sampling_ratio
+            )
+        )
+
+    weights = torch.cat(weights)
+
+    if weights.sum() == 0:
+        # Nothing can be sampled with these weights: fall back to a uniform distribution.
+        weights += 1 / len(weights)
+    else:
+        weights /= weights.sum()
+
+    return weights
--- a/src/lerobot/datasets/push_dataset_to_hub/utils.py
+++ b/src/lerobot/datasets/push_dataset_to_hub/utils.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Dict
+
+import datasets
+import numpy
+import PIL
+import torch
+
+from lerobot.datasets.video_utils import encode_video_frames
+
+
+def concatenate_episodes(ep_dicts):
+    """Merge a list of per-episode dicts into a single dict covering all the episodes.
+
+    The keys are taken from the first episode. Values whose first element (in the first episode) is a
+    tensor are concatenated with `torch.cat`; every other value is flattened into one Python list. An
+    "index" entry counting the frames from 0 is added, based on the length of the concatenated
+    "frame_index" entry.
+    """
+    first_episode = ep_dicts[0]
+    merged = {}
+    for key in first_episode.keys():
+        if torch.is_tensor(first_episode[key][0]):
+            merged[key] = torch.cat([episode[key] for episode in ep_dicts])
+        else:
+            merged[key] = [element for episode in ep_dicts for element in episode[key]]
+
+    merged["index"] = torch.arange(0, merged["frame_index"].shape[0], 1)
+    return merged
+
+
+def save_images_concurrently(imgs_array: numpy.ndarray, out_dir: Path, max_workers: int = 4):
+    """Save every image of `imgs_array` as `frame_{i:06d}.png` in `out_dir`, using a thread pool.
+
+    Args:
+        imgs_array: Sequence of image arrays; each element is passed to `PIL.Image.fromarray`.
+        out_dir: Destination directory. It is created (with its parents) if it does not exist.
+        max_workers: Maximum number of threads used to write the files.
+
+    Raises:
+        Exception: Whatever exception was raised while saving an image. It is re-raised once all the
+            submitted saves have finished, instead of being silently discarded with the future.
+    """
+    # `import PIL` alone does not guarantee that the `PIL.Image` submodule is loaded.
+    import PIL.Image
+
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    def save_image(img_array, i):
+        img = PIL.Image.fromarray(img_array)
+        img.save(str(out_dir / f"frame_{i:06d}.png"), quality=100)
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(save_image, img_array, i) for i, img_array in enumerate(imgs_array)]
+
+    # Leaving the `with` block waits for all the tasks; `result()` then surfaces the first failure, if any.
+    for future in futures:
+        future.result()
+
+
+def get_default_encoding() -> dict:
+    """Returns the default ffmpeg encoding parameters used by `encode_video_frames`.
+
+    Only the "vcodec", "pix_fmt", "g" and "crf" parameters are considered, and only those that declare a
+    default value in the signature of `encode_video_frames`.
+    """
+    encoding_params = ["vcodec", "pix_fmt", "g", "crf"]
+    defaults = {}
+    for name, param in inspect.signature(encode_video_frames).parameters.items():
+        if param.default is inspect.Parameter.empty:
+            continue
+        if name in encoding_params:
+            defaults[name] = param.default
+    return defaults
+
+
+def check_repo_id(repo_id: str) -> None:
+    """Check that `repo_id` is made of exactly two `/`-separated parts.
+
+    Raises:
+        ValueError: If `repo_id` does not contain exactly one `/`.
+    """
+    # Exactly one separator is the same thing as splitting on "/" into exactly two parts.
+    if repo_id.count("/") == 1:
+        return
+    raise ValueError(
+        f"""`repo_id` is expected to contain a community or user id `/` the name of the dataset
+            (e.g. 'lerobot/pusht'), but contains '{repo_id}'."""
+    )
+
+
+# TODO(aliberts): remove
+def calculate_episode_data_index(hf_dataset: datasets.Dataset) -> Dict[str, torch.Tensor]:
+    """
+    Calculate episode data index for the provided HuggingFace Dataset. Relies on episode_index column of hf_dataset.
+
+    Parameters:
+    - hf_dataset (datasets.Dataset): A HuggingFace dataset containing the episode index.
+
+    Returns:
+    - episode_data_index: A dictionary containing the data index for each episode. The dictionary has two keys:
+        - "from": A tensor containing the starting index of each episode.
+        - "to": A tensor containing the ending index (exclusive) of each episode.
+    """
+    # The episode_index column is a list of integers, each one being the episode index of the
+    # corresponding example, e.g.:
+    #   [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+    # Walking through it, a boundary is recorded every time the value changes. For the column above the
+    # result is:
+    #   {"from": [0, 3, 7], "to": [3, 7, 12]}
+    if len(hf_dataset) == 0:
+        return {"from": torch.tensor([]), "to": torch.tensor([])}
+
+    starts = []
+    ends = []
+    previous_episode = None
+    for position, episode_idx in enumerate(hf_dataset["episode_index"]):
+        if episode_idx != previous_episode:
+            # A new episode starts here...
+            starts.append(position)
+            # ...which also closes the previous episode, unless this is the very first one.
+            if previous_episode is not None:
+                ends.append(position)
+            previous_episode = episode_idx
+    # The last episode runs until the end of the dataset.
+    ends.append(position + 1)
+
+    return {"from": torch.tensor(starts), "to": torch.tensor(ends)}
--- a/src/lerobot/datasets/sampler.py
+++ b/src/lerobot/datasets/sampler.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Iterator, Union
+
+import torch
+
+
+class EpisodeAwareSampler:
+    def __init__(
+        self,
+        episode_data_index: dict,
+        episode_indices_to_use: Union[list, None] = None,
+        drop_n_first_frames: int = 0,
+        drop_n_last_frames: int = 0,
+        shuffle: bool = False,
+    ):
+        """Sampler that optionally incorporates episode boundary information.
+
+        Args:
+            episode_data_index: Dictionary with keys 'from' and 'to' containing the start and end indices of each episode.
+            episode_indices_to_use: List of episode indices to use. If None, all episodes are used.
+                                    Assumes that episodes are indexed from 0 to N-1.
+            drop_n_first_frames: Number of frames to drop from the start of each episode.
+            drop_n_last_frames: Number of frames to drop from the end of each episode.
+            shuffle: Whether to shuffle the indices.
+        """
+        frame_indices = []
+        episode_bounds = zip(episode_data_index["from"], episode_data_index["to"], strict=True)
+        for episode_idx, (first, past_last) in enumerate(episode_bounds):
+            if episode_indices_to_use is not None and episode_idx not in episode_indices_to_use:
+                continue
+            # Keep the frames of the episode minus the requested margins at both ends.
+            frame_indices += range(first.item() + drop_n_first_frames, past_last.item() - drop_n_last_frames)
+
+        self.indices = frame_indices
+        self.shuffle = shuffle
+
+    def __iter__(self) -> Iterator[int]:
+        """Yield the selected frame indices, in a fresh random order when `shuffle` is set."""
+        if not self.shuffle:
+            yield from self.indices
+            return
+        for position in torch.randperm(len(self.indices)):
+            yield self.indices[position]
+
+    def __len__(self) -> int:
+        """Number of frame indices the sampler iterates over."""
+        return len(self.indices)
--- a/src/lerobot/datasets/transforms.py
+++ b/src/lerobot/datasets/transforms.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+from dataclasses import dataclass, field
+from typing import Any, Callable, Sequence
+
+import torch
+from torchvision.transforms import v2
+from torchvision.transforms.v2 import Transform
+from torchvision.transforms.v2 import functional as F  # noqa: N812
+
+
+class RandomSubsetApply(Transform):
+    """Apply a random subset of N transformations from a list of transformations.
+
+    Args:
+        transforms: list of transformations.
+        p: represents the multinomial probabilities (with no replacement) used for sampling the transform.
+            If the sum of the weights is not 1, they will be normalized. If ``None`` (default), all transforms
+            have the same probability.
+        n_subset: number of transformations to apply. If ``None``, all transforms are applied.
+            Must be in [1, len(transforms)].
+        random_order: apply transformations in a random order.
+    """
+
+    def __init__(
+        self,
+        transforms: Sequence[Callable],
+        p: list[float] | None = None,
+        n_subset: int | None = None,
+        random_order: bool = False,
+    ) -> None:
+        super().__init__()
+        if not isinstance(transforms, Sequence):
+            raise TypeError("Argument transforms should be a sequence of callables")
+        num_transforms = len(transforms)
+
+        if p is not None and len(p) != num_transforms:
+            raise ValueError(
+                f"Length of p doesn't match the number of transforms: {len(p)} != {len(transforms)}"
+            )
+        if p is None:
+            # Same weight for every transform.
+            p = [1] * num_transforms
+
+        if n_subset is None:
+            n_subset = num_transforms
+        else:
+            if not isinstance(n_subset, int):
+                raise TypeError("n_subset should be an int or None")
+            if n_subset < 1 or n_subset > num_transforms:
+                raise ValueError(f"n_subset should be in the interval [1, {len(transforms)}]")
+
+        self.transforms = transforms
+        # Normalize the weights so that they sum to 1.
+        weight_sum = sum(p)
+        self.p = [weight / weight_sum for weight in p]
+        self.n_subset = n_subset
+        self.random_order = random_order
+
+        # Transforms picked by the most recent call to `forward`.
+        self.selected_transforms = None
+
+    def forward(self, *inputs: Any) -> Any:
+        """Sample `n_subset` transforms without replacement and apply them one after the other."""
+        multiple_inputs = len(inputs) > 1
+
+        picked = torch.multinomial(torch.tensor(self.p), self.n_subset)
+        if not self.random_order:
+            # Restore the order in which the transforms were given.
+            picked = picked.sort().values
+
+        self.selected_transforms = [self.transforms[i] for i in picked]
+
+        for transform in self.selected_transforms:
+            outputs = transform(*inputs)
+            # Re-pack a single output so that it can be unpacked again for the next transform.
+            inputs = outputs if multiple_inputs else (outputs,)
+
+        return outputs
+
+    def extra_repr(self) -> str:
+        fields = (
+            f"transforms={self.transforms}",
+            f"p={self.p}",
+            f"n_subset={self.n_subset}",
+            f"random_order={self.random_order}",
+        )
+        return ", ".join(fields)
+
+
+class SharpnessJitter(Transform):
+    """Randomly change the sharpness of an image or video.
+
+    Similar to a v2.RandomAdjustSharpness with p=1 and a sharpness_factor sampled randomly.
+    While v2.RandomAdjustSharpness applies — with a given probability — a fixed sharpness_factor to an image,
+    SharpnessJitter applies a random sharpness_factor each time. This is to have a more diverse set of
+    augmentations as a result.
+
+    A sharpness_factor of 0 gives a blurred image, 1 gives the original image while 2 increases the sharpness
+    by a factor of 2.
+
+    If the input is a :class:`torch.Tensor`,
+    it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        sharpness: How much to jitter sharpness. sharpness_factor is chosen uniformly from
+            [max(0, 1 - sharpness), 1 + sharpness] or the given
+            [min, max]. Should be non negative numbers.
+    """
+
+    def __init__(self, sharpness: float | Sequence[float]) -> None:
+        super().__init__()
+        self.sharpness = self._check_input(sharpness)
+
+    def _check_input(self, sharpness):
+        """Validate `sharpness` and return it as a `(min, max)` tuple of floats."""
+        if isinstance(sharpness, (int, float)):
+            if sharpness < 0:
+                raise ValueError("If sharpness is a single number, it must be non negative.")
+            # Symmetric range around 1, with the lower bound clipped at 0.
+            sharpness = [max(1.0 - sharpness, 0.0), 1.0 + sharpness]
+        elif isinstance(sharpness, collections.abc.Sequence) and len(sharpness) == 2:
+            sharpness = list(map(float, sharpness))
+        else:
+            raise TypeError(f"{sharpness=} should be a single number or a sequence with length 2.")
+
+        lower, upper = sharpness
+        if not 0.0 <= lower <= upper:
+            raise ValueError(f"sharpness values should be between (0., inf), but got {sharpness}.")
+
+        return float(lower), float(upper)
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        """Draw the sharpness factor uniformly from the configured range."""
+        lower, upper = self.sharpness
+        drawn = torch.empty(1).uniform_(lower, upper)
+        return {"sharpness_factor": drawn.item()}
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        """Apply `F.adjust_sharpness` to `inpt` with the factor found in `params`."""
+        return self._call_kernel(F.adjust_sharpness, inpt, sharpness_factor=params["sharpness_factor"])
+
+
+@dataclass
+class ImageTransformConfig:
+    """
+    Configuration of a single image transform.
+
+    For each transform, the following parameters are available:
+      weight: This represents the multinomial probability (with no replacement)
+            used for sampling the transform. If the sum of the weights is not 1,
+            they will be normalized. `ImageTransforms` skips the transforms whose weight
+            is not positive.
+      type: The name of the class used. `make_transform_from_config` accepts "Identity" and
+            "ColorJitter" (built from torchvision.transforms.v2) and "SharpnessJitter" (the
+            custom transform defined here); any other value is rejected.
+      kwargs: Keyword arguments passed to the constructor of the transform. For the default
+            transforms of `ImageTransformsConfig`, this is the lower & upper bound respectively
+            used for sampling the transform's parameter (following uniform distribution) when
+            it's applied.
+    """
+
+    weight: float = 1.0
+    type: str = "Identity"
+    kwargs: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ImageTransformsConfig:
+    """
+    Configuration of the set of image transforms composed by `ImageTransforms`.
+
+    These transforms are all using standard torchvision.transforms.v2
+    You can find out how these transformations affect images here:
+    https://pytorch.org/vision/0.18/auto_examples/transforms/plot_transforms_illustrations.html
+    We use a custom RandomSubsetApply container to sample them.
+    """
+
+    # Set this flag to `true` to enable transforms during training
+    enable: bool = False
+    # This is the maximum number of transforms (sampled from these below) that will be applied to each frame.
+    # It's an integer in the interval [1, number_of_available_transforms].
+    max_num_transforms: int = 3
+    # By default, transforms are applied in Torchvision's suggested order (shown below).
+    # Set this to True to apply them in a random order.
+    random_order: bool = False
+    # Available transforms, keyed by name. The defaults all share the same weight and cover brightness,
+    # contrast, saturation, hue (through ColorJitter) and sharpness (through SharpnessJitter).
+    tfs: dict[str, ImageTransformConfig] = field(
+        default_factory=lambda: {
+            "brightness": ImageTransformConfig(
+                weight=1.0,
+                type="ColorJitter",
+                kwargs={"brightness": (0.8, 1.2)},
+            ),
+            "contrast": ImageTransformConfig(
+                weight=1.0,
+                type="ColorJitter",
+                kwargs={"contrast": (0.8, 1.2)},
+            ),
+            "saturation": ImageTransformConfig(
+                weight=1.0,
+                type="ColorJitter",
+                kwargs={"saturation": (0.5, 1.5)},
+            ),
+            "hue": ImageTransformConfig(
+                weight=1.0,
+                type="ColorJitter",
+                kwargs={"hue": (-0.05, 0.05)},
+            ),
+            "sharpness": ImageTransformConfig(
+                weight=1.0,
+                type="SharpnessJitter",
+                kwargs={"sharpness": (0.5, 1.5)},
+            ),
+        }
+    )
+
+
+def make_transform_from_config(cfg: ImageTransformConfig):
+    """Instantiate the transform named by `cfg.type`, passing `cfg.kwargs` to its constructor.
+
+    Raises:
+        ValueError: If `cfg.type` is not one of "Identity", "ColorJitter" or "SharpnessJitter".
+    """
+    transform_classes = {
+        "Identity": v2.Identity,
+        "ColorJitter": v2.ColorJitter,
+        "SharpnessJitter": SharpnessJitter,
+    }
+    if cfg.type not in transform_classes:
+        raise ValueError(f"Transform '{cfg.type}' is not valid.")
+    return transform_classes[cfg.type](**cfg.kwargs)
+
+
+class ImageTransforms(Transform):
+    """A class to compose image transforms based on configuration."""
+
+    def __init__(self, cfg: ImageTransformsConfig) -> None:
+        super().__init__()
+        self._cfg = cfg
+
+        # Transforms with a non-positive weight are left out altogether.
+        active_cfgs = {name: tf_cfg for name, tf_cfg in cfg.tfs.items() if not tf_cfg.weight <= 0.0}
+        self.weights = [tf_cfg.weight for tf_cfg in active_cfgs.values()]
+        self.transforms = {name: make_transform_from_config(tf_cfg) for name, tf_cfg in active_cfgs.items()}
+
+        # Never ask for more transforms than there are available.
+        n_subset = min(len(self.transforms), cfg.max_num_transforms)
+        if cfg.enable and n_subset != 0:
+            self.tf = RandomSubsetApply(
+                transforms=list(self.transforms.values()),
+                p=self.weights,
+                n_subset=n_subset,
+                random_order=cfg.random_order,
+            )
+        else:
+            # Disabled, or nothing to apply: pass the inputs through unchanged.
+            self.tf = v2.Identity()
+
+    def forward(self, *inputs: Any) -> Any:
+        """Run the composed transform on `inputs`."""
+        return self.tf(*inputs)
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -0,0 +1,860 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import importlib.resources
+import json
+import logging
+from collections.abc import Iterator
+from itertools import accumulate
+from pathlib import Path
+from pprint import pformat
+from types import SimpleNamespace
+from typing import Any
+
+import datasets
+import jsonlines
+import numpy as np
+import packaging.version
+import torch
+from datasets.table import embed_table_storage
+from huggingface_hub import DatasetCard, DatasetCardData, HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+from PIL import Image as PILImage
+from torchvision import transforms
+
+from lerobot.configs.types import DictLike, FeatureType, PolicyFeature
+from lerobot.datasets.backward_compatibility import (
+    V21_MESSAGE,
+    BackwardCompatibilityError,
+    ForwardCompatibilityError,
+)
+from lerobot.robots import Robot
+from lerobot.utils.utils import is_valid_numpy_dtype_string
+
+DEFAULT_CHUNK_SIZE = 1000  # Max number of episodes per chunk
+
+# Metadata file locations, relative to a dataset's local directory (the write_*/load_*
+# helpers in this module join them onto `local_dir`).
+INFO_PATH = "meta/info.json"
+EPISODES_PATH = "meta/episodes.jsonl"
+STATS_PATH = "meta/stats.json"
+EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
+TASKS_PATH = "meta/tasks.jsonl"
+
+# `str.format` templates for per-episode files. The named placeholders are filled in by
+# callers; the video and parquet templates are stored in `info.json` by
+# `create_empty_dataset_info`.
+DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
+DEFAULT_PARQUET_PATH = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
+DEFAULT_IMAGE_PATH = "images/{image_key}/episode_{episode_index:06d}/frame_{frame_index:06d}.png"
+
+# Minimal dataset card body with one positional `{}` placeholder.
+# NOTE(review): not referenced by the code visible in this module section
+# (`create_lerobot_dataset_card` reads `card_template.md` instead) — confirm it is still used.
+DATASET_CARD_TEMPLATE = """
+---
+# Metadata will go there
+---
+This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).
+
+## {}
+
+"""
+
+# Bookkeeping features. They are appended to robot features in `get_features_from_robot`,
+# skipped by `build_dataset_frame`, and not expected in user frames by `validate_frame`.
+DEFAULT_FEATURES = {
+    "timestamp": {"dtype": "float32", "shape": (1,), "names": None},
+    "frame_index": {"dtype": "int64", "shape": (1,), "names": None},
+    "episode_index": {"dtype": "int64", "shape": (1,), "names": None},
+    "index": {"dtype": "int64", "shape": (1,), "names": None},
+    "task_index": {"dtype": "int64", "shape": (1,), "names": None},
+}
+
+
+def flatten_dict(d: dict, parent_key: str = "", sep: str = "/") -> dict:
+    """Collapse a nested dictionary into a single level, joining keys with ``sep``.
+
+    Nested dictionaries are flattened recursively; any other value is kept as is.
+    ``parent_key`` is the prefix prepended (followed by ``sep``) to every key of ``d``.
+
+    Example:
+        >>> dct = {"a": {"b": 1, "c": {"d": 2}}, "e": 3}
+        >>> print(flatten_dict(dct))
+        {'a/b': 1, 'a/c/d': 2, 'e': 3}
+    """
+    flat = {}
+    for key, value in d.items():
+        full_key = f"{parent_key}{sep}{key}" if parent_key else key
+        if isinstance(value, dict):
+            flat.update(flatten_dict(value, full_key, sep=sep))
+        else:
+            flat[full_key] = value
+    return flat
+
+
+def unflatten_dict(d: dict, sep: str = "/") -> dict:
+    """Rebuild a nested dictionary from keys joined with ``sep`` (inverse of ``flatten_dict``).
+
+    Every key is split on ``sep``; all parts but the last name the nested
+    dictionaries to descend into (created on demand), the last part holds the value.
+    """
+    nested = {}
+    for flat_key, value in d.items():
+        *parents, leaf = flat_key.split(sep)
+        node = nested
+        for name in parents:
+            node = node.setdefault(name, {})
+        node[leaf] = value
+    return nested
+
+
+def get_nested_item(obj: DictLike, flattened_key: str, sep: str = "/") -> Any:
+    """Look up a value in a nested, subscriptable object using a ``sep``-joined key.
+
+    ``flattened_key`` is split on ``sep`` and each part is used, in order, to
+    index into the result of the previous lookup.
+    """
+    first, *remaining = flattened_key.split(sep)
+    item = obj[first]
+    for key in remaining:
+        item = item[key]
+    return item
+
+
+def serialize_dict(stats: dict[str, torch.Tensor | np.ndarray | dict]) -> dict:
+    """Convert a (possibly nested) dict of tensors/arrays/scalars into plain Python values.
+
+    Tensors and arrays become lists, numpy scalars become Python scalars, and
+    ``int``/``float`` values are kept unchanged. The nesting of ``stats`` is preserved.
+
+    Raises:
+        NotImplementedError: If a leaf value has any other type.
+    """
+    serialized = {}
+    for key, value in flatten_dict(stats).items():
+        if isinstance(value, (torch.Tensor, np.ndarray)):
+            serialized[key] = value.tolist()
+            continue
+        if isinstance(value, np.generic):
+            serialized[key] = value.item()
+            continue
+        if not isinstance(value, (int, float)):
+            raise NotImplementedError(f"The value '{value}' of type '{type(value)}' is not supported.")
+        serialized[key] = value
+    return unflatten_dict(serialized)
+
+
+def embed_images(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Return ``dataset`` with ``embed_table_storage`` applied, keeping its original format.
+
+    The dataset is temporarily switched to the "arrow" format for the mapping
+    step and then restored to the format it had on entry.
+    """
+    # Embed image bytes into the table before saving to parquet
+    original_format = dataset.format
+    arrow_dataset = dataset.with_format("arrow")
+    embedded = arrow_dataset.map(embed_table_storage, batched=False)
+    return embedded.with_format(**original_format)
+
+
+def load_json(fpath: Path) -> Any:
+    """Read the file at ``fpath`` and return its parsed JSON content."""
+    with open(fpath) as json_file:
+        content = json.load(json_file)
+    return content
+
+
+def write_json(data: dict, fpath: Path) -> None:
+    """Serialize ``data`` as JSON (4-space indent, non-ASCII kept) to ``fpath``.
+
+    Missing parent directories are created; an existing file is overwritten.
+    """
+    fpath.parent.mkdir(parents=True, exist_ok=True)
+    with open(fpath, "w") as json_file:
+        json.dump(data, json_file, ensure_ascii=False, indent=4)
+
+
+def load_jsonlines(fpath: Path) -> list[Any]:
+    """Read the JSON Lines file at ``fpath`` and return its records as a list."""
+    with jsonlines.open(fpath, "r") as reader:
+        records = list(reader)
+    return records
+
+
+def write_jsonlines(data: dict, fpath: Path) -> None:
+    """Write ``data`` to ``fpath`` as a JSON Lines file, replacing any existing file.
+
+    ``data`` is handed to the writer's ``write_all``, i.e. it is iterated and
+    every element is written as its own line. Missing parent directories are created.
+    """
+    fpath.parent.mkdir(parents=True, exist_ok=True)
+    with jsonlines.open(fpath, "w") as jsonl_writer:
+        jsonl_writer.write_all(data)
+
+
+def append_jsonlines(data: dict, fpath: Path) -> None:
+    """Append ``data`` as a single record to the JSON Lines file at ``fpath``.
+
+    Missing parent directories are created before the file is opened in append mode.
+    """
+    fpath.parent.mkdir(parents=True, exist_ok=True)
+    with jsonlines.open(fpath, "a") as jsonl_writer:
+        jsonl_writer.write(data)
+
+
+def write_info(info: dict, local_dir: Path):
+    """Write the dataset ``info`` dictionary to ``INFO_PATH`` under ``local_dir``."""
+    info_path = local_dir / INFO_PATH
+    write_json(info, info_path)
+
+
+def load_info(local_dir: Path) -> dict:
+    """Load the dataset info stored at ``INFO_PATH`` under ``local_dir``.
+
+    Each feature's "shape" is converted to a tuple (JSON only stores lists).
+    """
+    info = load_json(local_dir / INFO_PATH)
+    for feature in info["features"].values():
+        feature["shape"] = tuple(feature["shape"])
+    return info
+
+
+def write_stats(stats: dict, local_dir: Path):
+    """Serialize ``stats`` to plain Python values and write them to ``STATS_PATH`` under ``local_dir``."""
+    write_json(serialize_dict(stats), local_dir / STATS_PATH)
+
+
+def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:
+    """Return a copy of the nested ``stats`` dict with every leaf converted via ``np.array``."""
+    flat_stats = flatten_dict(stats)
+    return unflatten_dict({key: np.array(value) for key, value in flat_stats.items()})
+
+
+def load_stats(local_dir: Path) -> dict[str, dict[str, np.ndarray]] | None:
+    """Load the statistics stored at ``STATS_PATH`` under ``local_dir``.
+
+    Returns:
+        The stats with every leaf converted to a numpy array, or ``None`` when
+        the stats file does not exist.
+    """
+    stats_path = local_dir / STATS_PATH
+    if not stats_path.exists():
+        # A missing stats file is not an error; callers must handle `None`.
+        return None
+    return cast_stats_to_numpy(load_json(stats_path))
+
+
+def write_task(task_index: int, task: dict, local_dir: Path):
+    """Append one ``{"task_index": ..., "task": ...}`` record to ``TASKS_PATH`` under ``local_dir``.
+
+    NOTE(review): ``task`` is annotated as ``dict`` but ``load_tasks`` uses the stored
+    value as a dictionary key, so it is presumably a string — confirm against callers.
+    """
+    record = {"task_index": task_index, "task": task}
+    append_jsonlines(record, local_dir / TASKS_PATH)
+
+
+def load_tasks(local_dir: Path) -> tuple[dict, dict]:
+    """Load the task records stored at ``TASKS_PATH`` under ``local_dir``.
+
+    Returns:
+        A pair ``(tasks, task_to_task_index)``: the first maps task index to task
+        (ordered by ascending index), the second is the reverse mapping.
+    """
+    records = sorted(load_jsonlines(local_dir / TASKS_PATH), key=lambda record: record["task_index"])
+    tasks = {record["task_index"]: record["task"] for record in records}
+    task_to_task_index = {task: index for index, task in tasks.items()}
+    return tasks, task_to_task_index
+
+
+def write_episode(episode: dict, local_dir: Path):
+    """Append the ``episode`` record to ``EPISODES_PATH`` under ``local_dir``."""
+    episodes_path = local_dir / EPISODES_PATH
+    append_jsonlines(episode, episodes_path)
+
+
+def load_episodes(local_dir: Path) -> dict:
+    """Load the episode records at ``EPISODES_PATH`` under ``local_dir``, keyed by episode index.
+
+    The returned dictionary is ordered by ascending "episode_index".
+    """
+    records = load_jsonlines(local_dir / EPISODES_PATH)
+    records = sorted(records, key=lambda record: record["episode_index"])
+    return {record["episode_index"]: record for record in records}
+
+
+def write_episode_stats(episode_index: int, episode_stats: dict, local_dir: Path):
+    """Append the serialized stats of one episode to ``EPISODES_STATS_PATH`` under ``local_dir``."""
+    # The stats are nested under a "stats" key because `episode_stats["episode_index"]`
+    # is itself a dictionary of stats and not an integer.
+    record = {"episode_index": episode_index, "stats": serialize_dict(episode_stats)}
+    append_jsonlines(record, local_dir / EPISODES_STATS_PATH)
+
+
+def load_episodes_stats(local_dir: Path) -> dict:
+    """Load per-episode stats from ``EPISODES_STATS_PATH`` under ``local_dir``.
+
+    Returns:
+        A dictionary ordered by ascending episode index that maps each index to
+        its stats, with every leaf converted to a numpy array.
+    """
+    records = sorted(
+        load_jsonlines(local_dir / EPISODES_STATS_PATH),
+        key=lambda record: record["episode_index"],
+    )
+    return {record["episode_index"]: cast_stats_to_numpy(record["stats"]) for record in records}
+
+
+def backward_compatible_episodes_stats(
+    stats: dict[str, dict[str, np.ndarray]], episodes: list[int]
+) -> dict[str, dict[str, np.ndarray]]:
+    """Map every episode in ``episodes`` to the same ``stats`` object.
+
+    The stats are not copied: all keys of the result share one dictionary.
+    """
+    return {episode: stats for episode in episodes}
+
+
+def load_image_as_numpy(
+    fpath: str | Path, dtype: np.dtype = np.float32, channel_first: bool = True
+) -> np.ndarray:
+    """Open the image at ``fpath``, convert it to RGB and return it as a numpy array.
+
+    Args:
+        fpath: Path of the image file.
+        dtype: dtype of the returned array. Floating dtypes are divided by 255.
+        channel_first: If True, return (C, H, W) instead of (H, W, C).
+    """
+    rgb_image = PILImage.open(fpath).convert("RGB")
+    pixels = np.array(rgb_image, dtype=dtype)
+    if channel_first:  # (H, W, C) -> (C, H, W)
+        pixels = pixels.transpose(2, 0, 1)
+    if np.issubdtype(dtype, np.floating):
+        # In-place division keeps the requested floating dtype.
+        pixels /= 255.0
+    return pixels
+
+
+def hf_transform_to_torch(items_dict: dict[torch.Tensor | None]):
+    """Convert, in place, the columns of a Hugging Face batch to torch tensors.
+
+    The type of the first item of each column decides how the column is handled:
+    PIL images go through ``transforms.ToTensor()`` (channel-last uint8 to
+    channel-first float32 in [0, 1]), columns starting with ``None`` are left
+    untouched, and everything else is wrapped with ``torch.tensor`` except strings.
+
+    Returns:
+        The same ``items_dict`` object.
+    """
+    for key in items_dict:
+        column = items_dict[key]
+        first_item = column[0]
+        if first_item is None:
+            continue
+        if isinstance(first_item, PILImage.Image):
+            to_tensor = transforms.ToTensor()
+            items_dict[key] = [to_tensor(image) for image in column]
+        else:
+            items_dict[key] = [item if isinstance(item, str) else torch.tensor(item) for item in column]
+    return items_dict
+
+
+def is_valid_version(version: str) -> bool:
+    """Return True if ``version`` can be parsed by ``packaging.version.parse``."""
+    try:
+        packaging.version.parse(version)
+    except packaging.version.InvalidVersion:
+        return False
+    else:
+        return True
+
+
+def check_version_compatibility(
+    repo_id: str,
+    version_to_check: str | packaging.version.Version,
+    current_version: str | packaging.version.Version,
+    enforce_breaking_major: bool = True,
+) -> None:
+    """Compare a dataset version against the current codebase version.
+
+    Both versions may be given as strings or as ``packaging`` ``Version`` objects.
+    The minor-version comparison does not look at the major version.
+
+    Raises:
+        BackwardCompatibilityError: If the checked major version is lower than the
+            current one and ``enforce_breaking_major`` is True.
+    """
+    v_check = version_to_check
+    if not isinstance(v_check, packaging.version.Version):
+        v_check = packaging.version.parse(v_check)
+
+    v_current = current_version
+    if not isinstance(v_current, packaging.version.Version):
+        v_current = packaging.version.parse(v_current)
+
+    if enforce_breaking_major and v_check.major < v_current.major:
+        raise BackwardCompatibilityError(repo_id, v_check)
+
+    if v_check.minor < v_current.minor:
+        # Older minor version: still usable, only warn.
+        logging.warning(V21_MESSAGE.format(repo_id=repo_id, version=v_check))
+
+
+def get_repo_versions(repo_id: str) -> list[packaging.version.Version]:
+    """Returns available valid versions (branches and tags) on given repo.
+
+    Branch and tag names that are not parseable as versions are ignored.
+    """
+    refs = HfApi().list_repo_refs(repo_id, repo_type="dataset")
+    ref_names = [ref.name for ref in refs.branches + refs.tags]
+
+    versions = []
+    for name in ref_names:
+        try:
+            versions.append(packaging.version.parse(name))
+        except packaging.version.InvalidVersion:
+            continue
+    return versions
+
+
+def get_safe_version(repo_id: str, version: str | packaging.version.Version) -> str:
+    """Pick the revision of ``repo_id`` to use for the requested ``version``.
+
+    Returns:
+        ``"v<version>"`` for the requested version when it exists on the repo,
+        otherwise for the highest version with the same major and a minor that is
+        not greater than the requested one (a warning is logged if it is older).
+
+    Raises:
+        RevisionNotFoundError: If the repo has no branch or tag that parses as a version.
+        BackwardCompatibilityError: If only lower major versions can be offered.
+        ForwardCompatibilityError: If only newer versions exist on the repo.
+    """
+    if isinstance(version, packaging.version.Version):
+        target_version = version
+    else:
+        target_version = packaging.version.parse(version)
+
+    hub_versions = get_repo_versions(repo_id)
+    if not hub_versions:
+        raise RevisionNotFoundError(
+            f"""Your dataset must be tagged with a codebase version.
+            Assuming _version_ is the codebase_version value in the info.json, you can run this:
+            ```python
+            from huggingface_hub import HfApi
+
+            hub_api = HfApi()
+            hub_api.create_tag("{repo_id}", tag="_version_", repo_type="dataset")
+            ```
+            """
+        )
+
+    # Exact match: nothing to negotiate.
+    if target_version in hub_versions:
+        return f"v{target_version}"
+
+    # Same major, minor not newer than requested: take the most recent one.
+    compatibles = [
+        candidate
+        for candidate in hub_versions
+        if candidate.major == target_version.major and candidate.minor <= target_version.minor
+    ]
+    if compatibles:
+        return_version = max(compatibles)
+        if return_version < target_version:
+            logging.warning(f"Revision {version} for {repo_id} not found, using version v{return_version}")
+        return f"v{return_version}"
+
+    older_majors = [candidate for candidate in hub_versions if candidate.major < target_version.major]
+    if older_majors:
+        raise BackwardCompatibilityError(repo_id, max(older_majors))
+
+    # At this point every remaining hub version is newer than the requested one.
+    newer_versions = [candidate for candidate in hub_versions if candidate > target_version]
+    assert len(newer_versions) > 0
+    raise ForwardCompatibilityError(repo_id, min(newer_versions))
+
+
+def get_hf_features_from_features(features: dict) -> datasets.Features:
+    """Translate LeRobot feature descriptions into a Hugging Face ``datasets.Features``.
+
+    Video features are skipped, image features map to ``datasets.Image``, shape
+    ``(1,)`` maps to a scalar ``Value``, other 1-D shapes to a fixed-length
+    ``Sequence``, and 2-D to 5-D shapes to the matching ``ArrayND`` type.
+
+    Raises:
+        ValueError: If a feature's shape has a number of dimensions outside 1 to 5.
+    """
+    array_types = {
+        2: datasets.Array2D,
+        3: datasets.Array3D,
+        4: datasets.Array4D,
+        5: datasets.Array5D,
+    }
+
+    hf_features = {}
+    for key, ft in features.items():
+        dtype = ft["dtype"]
+        if dtype == "video":
+            continue
+        if dtype == "image":
+            hf_features[key] = datasets.Image()
+            continue
+
+        shape = ft["shape"]
+        if shape == (1,):
+            hf_features[key] = datasets.Value(dtype=dtype)
+            continue
+
+        ndim = len(shape)
+        if ndim == 1:
+            hf_features[key] = datasets.Sequence(length=shape[0], feature=datasets.Value(dtype=dtype))
+        elif ndim in array_types:
+            hf_features[key] = array_types[ndim](shape=shape, dtype=dtype)
+        else:
+            raise ValueError(f"Corresponding feature is not valid: {ft}")
+
+    return datasets.Features(hf_features)
+
+
+def _validate_feature_names(features: dict[str, dict]) -> None:
+    """Reject feature names containing '/'.
+
+    Raises:
+        ValueError: If at least one key of ``features`` contains '/'; the message
+            lists the offending entries.
+    """
+    invalid_features = {}
+    for name, ft in features.items():
+        if "/" in name:
+            invalid_features[name] = ft
+
+    if not invalid_features:
+        return
+    raise ValueError(f"Feature names should not contain '/'. Found '/' in '{invalid_features}'.")
+
+
+def hw_to_dataset_features(
+    hw_features: dict[str, type | tuple], prefix: str, use_video: bool = True
+) -> dict[str, dict]:
+    """Build dataset feature descriptions from hardware features.
+
+    Entries of ``hw_features`` whose value is ``float`` are grouped into one
+    float32 vector feature: named ``prefix`` when ``prefix == "action"`` and
+    ``f"{prefix}.state"`` when ``prefix == "observation"`` (no vector feature for
+    any other prefix). Entries whose value is a tuple are treated as camera
+    shapes and become ``f"{prefix}.images.<key>"`` video or image features.
+
+    Raises:
+        ValueError: If a resulting feature name contains '/'.
+    """
+    joint_names = [key for key, ftype in hw_features.items() if ftype is float]
+    camera_shapes = {key: shape for key, shape in hw_features.items() if isinstance(shape, tuple)}
+
+    vector_key = None
+    if prefix == "action":
+        vector_key = prefix
+    elif prefix == "observation":
+        vector_key = f"{prefix}.state"
+
+    features = {}
+    if joint_names and vector_key is not None:
+        features[vector_key] = {
+            "dtype": "float32",
+            "shape": (len(joint_names),),
+            "names": joint_names,
+        }
+
+    visual_dtype = "video" if use_video else "image"
+    for key, shape in camera_shapes.items():
+        features[f"{prefix}.images.{key}"] = {
+            "dtype": visual_dtype,
+            "shape": shape,
+            "names": ["height", "width", "channels"],
+        }
+
+    _validate_feature_names(features)
+    return features
+
+
+def build_dataset_frame(
+    ds_features: dict[str, dict], values: dict[str, Any], prefix: str
+) -> dict[str, np.ndarray]:
+    """Assemble the part of a dataset frame whose feature keys start with ``prefix``.
+
+    Default features and features with another prefix are skipped. 1-D float32
+    features are built by stacking ``values[name]`` for each of their names;
+    image/video features are read from ``values`` under the key with
+    ``f"{prefix}.images."`` removed. Other features are left out of the frame.
+    """
+    frame = {}
+    for key, ft in ds_features.items():
+        if key in DEFAULT_FEATURES or not key.startswith(prefix):
+            continue
+
+        dtype = ft["dtype"]
+        if dtype == "float32" and len(ft["shape"]) == 1:
+            frame[key] = np.array([values[name] for name in ft["names"]], dtype=np.float32)
+        elif dtype in ["image", "video"]:
+            camera_key = key.removeprefix(f"{prefix}.images.")
+            frame[key] = values[camera_key]
+
+    return frame
+
+
+def get_features_from_robot(robot: Robot, use_videos: bool = True) -> dict:
+    """Merge a robot's motor and camera features with the default dataset features.
+
+    Camera features are only included when ``robot.cameras`` is truthy; each gets
+    a "dtype" of "video" or "image" (depending on ``use_videos``) unless the
+    camera feature already defines one.
+    """
+    features = {**robot.motor_features}
+    if robot.cameras:
+        visual_dtype = "video" if use_videos else "image"
+        for key, ft in robot.camera_features.items():
+            features[key] = {"dtype": visual_dtype, **ft}
+    features.update(DEFAULT_FEATURES)
+    return features
+
+
+def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFeature]:
+    """Derive policy features (type and shape) from dataset feature descriptions.
+
+    Image/video features become VISUAL, "observation.environment_state" becomes
+    ENV, other keys starting with "observation" become STATE and keys starting
+    with "action" become ACTION. Any other feature is dropped.
+
+    Raises:
+        ValueError: If an image/video feature does not have exactly 3 dimensions.
+    """
+    # TODO(aliberts): Implement "type" in dataset features and simplify this
+    policy_features = {}
+    for key, ft in features.items():
+        shape = ft["shape"]
+        if ft["dtype"] in ["image", "video"]:
+            feature_type = FeatureType.VISUAL
+            if len(shape) != 3:
+                raise ValueError(f"Number of dimensions of {key} != 3 (shape={shape})")
+
+            # Backward compatibility for "channel" which is an error introduced in LeRobotDataset v2.0 for ported datasets.
+            last_axis_name = ft["names"][2]
+            if last_axis_name in ["channel", "channels"]:  # (h, w, c) -> (c, h, w)
+                height, width, channels = shape[0], shape[1], shape[2]
+                shape = (channels, height, width)
+        elif key == "observation.environment_state":
+            feature_type = FeatureType.ENV
+        elif key.startswith("observation"):
+            feature_type = FeatureType.STATE
+        elif key.startswith("action"):
+            feature_type = FeatureType.ACTION
+        else:
+            continue
+
+        policy_features[key] = PolicyFeature(type=feature_type, shape=shape)
+
+    return policy_features
+
+
+def create_empty_dataset_info(
+    codebase_version: str,
+    fps: int,
+    features: dict,
+    use_videos: bool,
+    robot_type: str | None = None,
+) -> dict:
+    """Return the info dictionary of a dataset that does not contain any episode yet.
+
+    All counters start at 0 and "splits" is empty. "video_path" is only set when
+    ``use_videos`` is True.
+    """
+    video_path = DEFAULT_VIDEO_PATH if use_videos else None
+    info = {
+        "codebase_version": codebase_version,
+        "robot_type": robot_type,
+        "total_episodes": 0,
+        "total_frames": 0,
+        "total_tasks": 0,
+        "total_videos": 0,
+        "total_chunks": 0,
+        "chunks_size": DEFAULT_CHUNK_SIZE,
+        "fps": fps,
+        "splits": {},
+        "data_path": DEFAULT_PARQUET_PATH,
+        "video_path": video_path,
+        "features": features,
+    }
+    return info
+
+
+def get_episode_data_index(
+    episode_dicts: dict[dict], episodes: list[int] | None = None
+) -> dict[str, torch.Tensor]:
+    """Compute the first and one-past-last frame index of each episode.
+
+    Args:
+        episode_dicts: Mapping from episode index to a record with a "length" entry.
+        episodes: Optional subset (and order) of episode indices to keep. Indices
+            are packed contiguously in that order, starting at 0.
+
+    Returns:
+        A dictionary with two ``LongTensor``s of identical length, "from"
+        (inclusive start) and "to" (exclusive end), one entry per episode. Both
+        are empty when there is no episode.
+    """
+    episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in episode_dicts.items()}
+    if episodes is not None:
+        episode_lengths = {ep_idx: episode_lengths[ep_idx] for ep_idx in episodes}
+
+    ends = list(accumulate(episode_lengths.values()))
+    # Each episode starts where the previous one ends. With no episode there is
+    # no start either: prepending 0 unconditionally would give "from" one more
+    # element than "to".
+    starts = [0, *ends[:-1]] if ends else []
+    return {
+        "from": torch.LongTensor(starts),
+        "to": torch.LongTensor(ends),
+    }
+
+
+def check_timestamps_sync(
+    timestamps: np.ndarray,
+    episode_indices: np.ndarray,
+    episode_data_index: dict[str, np.ndarray],
+    fps: int,
+    tolerance_s: float,
+    raise_value_error: bool = True,
+) -> bool:
+    """
+    Verify that consecutive timestamps within an episode are (1/fps) +/- tolerance apart.
+
+    Differences that straddle two episodes are excluded from the check.
+
+    Args:
+        timestamps (np.ndarray): Array of timestamps in seconds.
+        episode_indices (np.ndarray): Array indicating the episode index for each timestamp.
+        episode_data_index (dict[str, np.ndarray]): A dictionary that includes 'to',
+            which identifies indices for the end of each episode.
+        fps (int): Frames per second. Used to check the expected difference between consecutive timestamps.
+        tolerance_s (float): Allowed deviation from the expected (1/fps) difference.
+        raise_value_error (bool): Whether to raise a ValueError if the check fails.
+
+    Returns:
+        bool: True if all checked timestamp differences lie within tolerance, False otherwise.
+
+    Raises:
+        ValueError: If the two arrays differ in shape, or if the check fails and
+            `raise_value_error` is True.
+    """
+    if timestamps.shape != episode_indices.shape:
+        raise ValueError(
+            "timestamps and episode_indices should have the same shape. "
+            f"Found {timestamps.shape=} and {episode_indices.shape=}."
+        )
+
+    # Gap between each timestamp and the next one, compared to the nominal period.
+    diffs = np.diff(timestamps)
+    within_tolerance = np.abs(diffs - (1.0 / fps)) <= tolerance_s
+
+    # The gap starting at the last frame of an episode (all but the final one)
+    # crosses an episode boundary and must not be checked.
+    checked = np.ones(len(diffs), dtype=bool)
+    checked[episode_data_index["to"][:-1] - 1] = False
+    checked_within_tolerance = within_tolerance[checked]
+
+    if not np.all(checked_within_tolerance):
+        # Map positions in the filtered array back to positions in `diffs`.
+        checked_positions = np.arange(len(diffs))[checked]
+        offending_positions = checked_positions[np.nonzero(~checked_within_tolerance)[0]]
+
+        outside_tolerances = [
+            {
+                "timestamps": [timestamps[idx], timestamps[idx + 1]],
+                "diff": diffs[idx],
+                "episode_index": episode_indices[idx].item()
+                if hasattr(episode_indices[idx], "item")
+                else episode_indices[idx],
+            }
+            for idx in offending_positions
+        ]
+
+        if raise_value_error:
+            raise ValueError(
+                f"""One or several timestamps unexpectedly violate the tolerance inside episode range.
+                This might be due to synchronization issues during data collection.
+                \n{pformat(outside_tolerances)}"""
+            )
+        return False
+
+    return True
+
+
+def check_delta_timestamps(
+    delta_timestamps: dict[str, list[float]], fps: int, tolerance_s: float, raise_value_error: bool = True
+) -> bool:
+    """Check that every value in ``delta_timestamps`` is a multiple of 1/fps +/- tolerance.
+
+    This guarantees that adding a delta to a dataset timestamp lands on another
+    timestamp of the dataset.
+
+    Returns:
+        True if all values are within tolerance, False otherwise (only reachable
+        when ``raise_value_error`` is False).
+
+    Raises:
+        ValueError: If a value is outside tolerance and ``raise_value_error`` is True.
+    """
+    outside_tolerance = {}
+    for key, delta_ts in delta_timestamps.items():
+        # Distance (in seconds) between each delta and the nearest frame boundary.
+        rejected = [ts for ts in delta_ts if not abs(ts * fps - round(ts * fps)) / fps <= tolerance_s]
+        if rejected:
+            outside_tolerance[key] = rejected
+
+    if len(outside_tolerance) > 0:
+        if raise_value_error:
+            raise ValueError(
+                f"""
+                The following delta_timestamps are found outside of tolerance range.
+                Please make sure they are multiples of 1/{fps} +/- tolerance and adjust
+                their values accordingly.
+                \n{pformat(outside_tolerance)}
+                """
+            )
+        return False
+
+    return True
+
+
+def get_delta_indices(delta_timestamps: dict[str, list[float]], fps: int) -> dict[str, list[int]]:
+    """Convert time offsets in seconds into frame offsets by rounding ``delta * fps``."""
+    return {key: [round(delta * fps) for delta in delta_ts] for key, delta_ts in delta_timestamps.items()}
+
+
+def cycle(iterable):
+    """The equivalent of itertools.cycle, but safe for Pytorch dataloaders.
+
+    A fresh iterator is requested from ``iterable`` each time the previous one
+    is exhausted, so no item is cached between passes.
+
+    See https://github.com/pytorch/pytorch/issues/23900 for information on why itertools.cycle is not safe.
+    """
+    while True:
+        for item in iterable:
+            yield item
+
+
+def create_branch(repo_id, *, branch: str, repo_type: str | None = None) -> None:
+    """Create a branch on a existing Hugging Face repo. Delete the branch if it already
+    exists before creating it.
+    """
+    api = HfApi()
+
+    existing_refs = [existing.ref for existing in api.list_repo_refs(repo_id, repo_type=repo_type).branches]
+    branch_already_exists = f"refs/heads/{branch}" in existing_refs
+    if branch_already_exists:
+        api.delete_branch(repo_id, repo_type=repo_type, branch=branch)
+
+    api.create_branch(repo_id, repo_type=repo_type, branch=branch)
+
+
+def create_lerobot_dataset_card(
+    tags: list | None = None,
+    dataset_info: dict | None = None,
+    **kwargs,
+) -> DatasetCard:
+    """
+    Build the dataset card of a LeRobot dataset from the packaged ``card_template.md``.
+
+    Keyword arguments will be used to replace values in src/lerobot/datasets/card_template.md.
+    When ``dataset_info`` is given, it is rendered as JSON and passed to the
+    template as ``dataset_structure``. The "LeRobot" tag is always present.
+    Note: If specified, license must be one of https://huggingface.co/docs/hub/repositories-licenses.
+    """
+    card_tags = ["LeRobot", *tags] if tags else ["LeRobot"]
+
+    if dataset_info:
+        info_json = json.dumps(dataset_info, indent=4)
+        kwargs["dataset_structure"] = f"[meta/info.json](meta/info.json):\n```json\n{info_json}\n```\n"
+
+    default_config = {
+        "config_name": "default",
+        "data_files": "data/*/*.parquet",
+    }
+    card_data = DatasetCardData(
+        license=kwargs.get("license"),
+        tags=card_tags,
+        task_categories=["robotics"],
+        configs=[default_config],
+    )
+
+    template_file = importlib.resources.files("lerobot.datasets") / "card_template.md"
+    return DatasetCard.from_template(
+        card_data=card_data,
+        template_str=template_file.read_text(),
+        **kwargs,
+    )
+
+
+class IterableNamespace(SimpleNamespace):
+    """
+    A ``SimpleNamespace`` that can also be used like a read-only dictionary.
+
+    On top of attribute access (``obj.key``) it offers:
+    - iteration over its attribute names
+    - item access with brackets (``obj["key"]``)
+    - the dictionary-like methods ``items()``, ``keys()`` and ``values()``
+    - recursive conversion of nested dictionaries into ``IterableNamespace``
+
+    Args:
+        dictionary: Optional dictionary to initialize the namespace
+        **kwargs: Additional keyword arguments passed to SimpleNamespace
+
+    Examples:
+        >>> data = {"name": "Alice", "details": {"age": 25}}
+        >>> ns = IterableNamespace(data)
+        >>> ns.name
+        'Alice'
+        >>> ns.details.age
+        25
+        >>> list(ns.keys())
+        ['name', 'details']
+        >>> for key, value in ns.items():
+        ...     print(f"{key}: {value}")
+        name: Alice
+        details: IterableNamespace(age=25)
+    """
+
+    def __init__(self, dictionary: dict[str, Any] = None, **kwargs):
+        super().__init__(**kwargs)
+        if dictionary is None:
+            return
+        for key, value in dictionary.items():
+            # Nested dictionaries become namespaces too, so dot access works at any depth.
+            setattr(self, key, IterableNamespace(value) if isinstance(value, dict) else value)
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.__dict__)
+
+    def __getitem__(self, key: str) -> Any:
+        return self.__dict__[key]
+
+    def items(self):
+        return self.__dict__.items()
+
+    def values(self):
+        return self.__dict__.values()
+
+    def keys(self):
+        return self.__dict__.keys()
+
+
+def validate_frame(frame: dict, features: dict):
+    """Check that ``frame`` holds exactly the non-default features, with valid dtypes and shapes.
+
+    All problems found are collected into a single message. The "task" entry is
+    not checked for dtype and shape.
+
+    Raises:
+        ValueError: If any feature is missing, unexpected, or has a wrong dtype/shape.
+    """
+    expected_features = set(features) - set(DEFAULT_FEATURES)
+    actual_features = set(frame)
+
+    problems = [validate_features_presence(actual_features, expected_features)]
+
+    common_features = actual_features & expected_features
+    for name in common_features - {"task"}:
+        problems.append(validate_feature_dtype_and_shape(name, features[name], frame[name]))
+
+    error_message = "".join(problems)
+    if error_message:
+        raise ValueError(error_message)
+
+
+def validate_features_presence(actual_features: set[str], expected_features: set[str]):
+    """Describe the features that are missing from, or unexpected in, a frame.
+
+    Returns:
+        An empty string when both sets are equal, otherwise a message listing
+        the missing and/or extra features.
+    """
+    missing_features = expected_features - actual_features
+    extra_features = actual_features - expected_features
+    if not (missing_features or extra_features):
+        return ""
+
+    parts = ["Feature mismatch in `frame` dictionary:\n"]
+    if missing_features:
+        parts.append(f"Missing features: {missing_features}\n")
+    if extra_features:
+        parts.append(f"Extra features: {extra_features}\n")
+    return "".join(parts)
+
+
+def validate_feature_dtype_and_shape(name: str, feature: dict, value: np.ndarray | PILImage.Image | str):
+    """Dispatch the validation of ``value`` according to the dtype declared in ``feature``.
+
+    Returns:
+        The error message produced by the matching validator (empty when valid).
+
+    Raises:
+        NotImplementedError: If the declared dtype is neither a valid numpy dtype
+            string, "image", "video" nor "string".
+    """
+    expected_dtype = feature["dtype"]
+    expected_shape = feature["shape"]
+
+    if is_valid_numpy_dtype_string(expected_dtype):
+        return validate_feature_numpy_array(name, expected_dtype, expected_shape, value)
+    if expected_dtype in ["image", "video"]:
+        return validate_feature_image_or_video(name, expected_shape, value)
+    if expected_dtype == "string":
+        return validate_feature_string(name, value)
+
+    raise NotImplementedError(f"The feature dtype '{expected_dtype}' is not implemented yet.")
+
+
+def validate_feature_numpy_array(
+    name: str, expected_dtype: str, expected_shape: list[int] | tuple[int, ...], value: np.ndarray
+):
+    """Check that ``value`` is a numpy array with the expected dtype and shape.
+
+    Args:
+        name: Feature name, only used in the messages.
+        expected_dtype: Numpy dtype string the array must have.
+        expected_shape: Shape the array must have, as a list or a tuple.
+        value: Object to validate.
+
+    Returns:
+        An empty string when ``value`` is valid, otherwise one line per problem found.
+    """
+    if not isinstance(value, np.ndarray):
+        return f"The feature '{name}' is not a 'np.ndarray'. Expected type is '{expected_dtype}', but type '{type(value)}' provided instead.\n"
+
+    error_message = ""
+    actual_dtype = value.dtype
+    actual_shape = value.shape
+
+    if actual_dtype != np.dtype(expected_dtype):
+        error_message += f"The feature '{name}' of dtype '{actual_dtype}' is not of the expected dtype '{expected_dtype}'.\n"
+
+    # `ndarray.shape` is always a tuple and a tuple never equals a list, so a
+    # list-valued expected shape (e.g. straight from JSON) must be normalized first.
+    comparable_shape = tuple(expected_shape) if isinstance(expected_shape, (list, tuple)) else expected_shape
+    if actual_shape != comparable_shape:
+        error_message += f"The feature '{name}' of shape '{actual_shape}' does not have the expected shape '{expected_shape}'.\n"
+
+    return error_message
+
+
+def validate_feature_image_or_video(name: str, expected_shape: list[str], value: np.ndarray | PILImage.Image):
+    """Check that ``value`` is a PIL image, or an array whose shape matches ``expected_shape``.
+
+    For arrays, the three entries of ``expected_shape`` are accepted both in the
+    given order and rotated so that the first entry comes last. PIL images are
+    accepted without a shape check.
+
+    Returns:
+        An empty string when ``value`` is valid, otherwise an error message.
+    """
+    # Note: The check of pixels range ([0,1] for float and [0,255] for uint8) is done by the image writer threads.
+    if isinstance(value, PILImage.Image):
+        return ""
+
+    if not isinstance(value, np.ndarray):
+        return f"The feature '{name}' is expected to be of type 'PIL.Image' or 'np.ndarray' channel first or channel last, but type '{type(value)}' provided instead.\n"
+
+    actual_shape = value.shape
+    c, h, w = expected_shape
+    if actual_shape not in ((c, h, w), (h, w, c)):
+        return f"The feature '{name}' of shape '{actual_shape}' does not have the expected shape '{(c, h, w)}' or '{(h, w, c)}'.\n"
+    return ""
+
+
+def validate_feature_string(name: str, value: str):
+    """Return an error message if ``value`` is not a ``str``, an empty string otherwise."""
+    if isinstance(value, str):
+        return ""
+    return f"The feature '{name}' is expected to be of type 'str', but type '{type(value)}' provided instead.\n"
+
+
+def validate_episode_buffer(episode_buffer: dict, total_episodes: int, features: dict):
+    """Check that ``episode_buffer`` is ready to be saved as the next episode.
+
+    Args:
+        episode_buffer: Buffer holding the frames of the episode. Must contain
+            "size", "task" and "episode_index".
+        total_episodes: Number of episodes already in the dataset; the buffer's
+            "episode_index" must be equal to it.
+        features: Dataset features; apart from "task" and "size", the buffer keys
+            must be exactly the feature names.
+
+    Raises:
+        ValueError: If "size" or "task" is missing, if the buffer holds no frame,
+            or if its keys do not match ``features``.
+        NotImplementedError: If the buffer's episode index differs from ``total_episodes``.
+    """
+    if "size" not in episode_buffer:
+        raise ValueError("size key not found in episode_buffer")
+
+    if "task" not in episode_buffer:
+        raise ValueError("task key not found in episode_buffer")
+
+    if episode_buffer["episode_index"] != total_episodes:
+        # TODO(aliberts): Add option to use existing episode_index
+        raise NotImplementedError(
+            "You might have manually provided the episode_buffer with an episode_index that doesn't "
+            "match the total number of episodes already in the dataset. This is not supported for now."
+        )
+
+    if episode_buffer["size"] == 0:
+        raise ValueError("You must add one or several frames with `add_frame` before calling `add_episode`.")
+
+    buffer_keys = set(episode_buffer.keys()) - {"task", "size"}
+    feature_keys = set(features)
+    if buffer_keys != feature_keys:
+        # One statement per line: without separators the three parts ran together.
+        raise ValueError(
+            "Features from `episode_buffer` don't match the ones in `features`.\n"
+            f"In episode_buffer not in features: {buffer_keys - feature_keys}\n"
+            f"In features not in episode_buffer: {feature_keys - buffer_keys}"
+        )
--- a/src/lerobot/datasets/v2/batch_convert_dataset_v1_to_v2.py
+++ b/src/lerobot/datasets/v2/batch_convert_dataset_v1_to_v2.py
@@ -0,0 +1,884 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script is for internal use to convert all datasets under the 'lerobot' hub user account to v2.
+
+Note: Since the original Aloha datasets don't use shadow motors, you need to comment those out in
+lerobot/configs/robot/aloha.yaml before running this script.
+"""
+
+import traceback
+from pathlib import Path
+from textwrap import dedent
+
+from lerobot import available_datasets
+from lerobot.datasets.v2.convert_dataset_v1_to_v2 import convert_dataset
+from lerobot.robots.aloha.configuration_aloha import AlohaRobotConfig
+
+LOCAL_DIR = Path("data/")  # relative path, so it is resolved against the current working directory when used
+
+# spellchecker:off
+ALOHA_MOBILE_INFO = {  # metadata unpacked (**) into every "aloha_mobile_*" entry of DATASETS
+    "robot_config": AlohaRobotConfig(),  # built once at import; the same instance is shared by all those entries
+    "license": "mit",
+    "url": "https://mobile-aloha.github.io/",
+    "paper": "https://huggingface.co/papers/2401.02117",
+    "citation_bibtex": dedent(r"""
+        @inproceedings{fu2024mobile,
+            author    = {Fu, Zipeng and Zhao, Tony Z. and Finn, Chelsea},
+            title     = {Mobile ALOHA: Learning Bimanual Mobile Manipulation with Low-Cost Whole-Body Teleoperation},
+            booktitle = {arXiv},
+            year      = {2024},
+        }""").lstrip(),  # dedent() removes the common indent; lstrip() drops the newline after the opening quotes
+}
+ALOHA_STATIC_INFO = {  # metadata unpacked (**) into the "aloha_static_*" and "aloha_sim_*" entries of DATASETS
+    "robot_config": AlohaRobotConfig(),  # built once at import; the same instance is shared by all those entries
+    "license": "mit",
+    "url": "https://tonyzhaozh.github.io/aloha/",
+    "paper": "https://huggingface.co/papers/2304.13705",
+    "citation_bibtex": dedent(r"""
+        @article{Zhao2023LearningFB,
+            title={Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware},
+            author={Tony Zhao and Vikash Kumar and Sergey Levine and Chelsea Finn},
+            journal={RSS},
+            year={2023},
+            volume={abs/2304.13705},
+            url={https://huggingface.co/papers/2304.13705}
+        }""").lstrip(),  # dedent() removes the common indent; lstrip() drops the newline after the opening quotes
+}
+PUSHT_INFO = {  # metadata unpacked (**) into the "pusht" and "pusht_image" entries of DATASETS
+    "license": "mit",
+    "url": "https://diffusion-policy.cs.columbia.edu/",
+    "paper": "https://huggingface.co/papers/2303.04137",
+    "citation_bibtex": dedent(r"""
+        @article{chi2024diffusionpolicy,
+            author = {Cheng Chi and Zhenjia Xu and Siyuan Feng and Eric Cousineau and Yilun Du and Benjamin Burchfiel and Russ Tedrake and Shuran Song},
+            title ={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion},
+            journal = {The International Journal of Robotics Research},
+            year = {2024},
+        }""").lstrip(),  # dedent() removes the common indent; lstrip() drops the newline after the opening quotes
+}
+XARM_INFO = {  # metadata unpacked (**) into every "xarm_*" entry of DATASETS
+    "license": "mit",
+    "url": "https://www.nicklashansen.com/td-mpc/",
+    "paper": "https://huggingface.co/papers/2203.04955",
+    "citation_bibtex": dedent(r"""
+        @inproceedings{Hansen2022tdmpc,
+            title={Temporal Difference Learning for Model Predictive Control},
+            author={Nicklas Hansen and Xiaolong Wang and Hao Su},
+            booktitle={ICML},
+            year={2022}
+        }""").lstrip(),  # same closing form as the sibling *_INFO dicts: no stray leading/trailing newline
+}
+UNITREEH_INFO = {  # metadata unpacked (**) into the "unitreeh1_*" entries of DATASETS; only a license is recorded
+    "license": "apache-2.0",
+}
+
+DATASETS = {
+    "aloha_mobile_cabinet": {
+        "single_task": "Open the top cabinet, store the pot inside it then close the cabinet.",
+        **ALOHA_MOBILE_INFO,
+    },
+    "aloha_mobile_chair": {
+        "single_task": "Push the chairs in front of the desk to place them against it.",
+        **ALOHA_MOBILE_INFO,
+    },
+    "aloha_mobile_elevator": {
+        "single_task": "Take the elevator to the 1st floor.",
+        **ALOHA_MOBILE_INFO,
+    },
+    "aloha_mobile_shrimp": {
+        "single_task": "Sauté the raw shrimp on both sides, then serve it in the bowl.",
+        **ALOHA_MOBILE_INFO,
+    },
+    "aloha_mobile_wash_pan": {
+        "single_task": "Pick up the pan, rinse it in the sink and then place it in the drying rack.",
+        **ALOHA_MOBILE_INFO,
+    },
+    "aloha_mobile_wipe_wine": {
+        "single_task": "Pick up the wet cloth on the faucet and use it to clean the spilled wine on the table and underneath the glass.",
+        **ALOHA_MOBILE_INFO,
+    },
+    "aloha_static_battery": {
+        "single_task": "Place the battery into the slot of the remote controller.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_candy": {"single_task": "Pick up the candy and unwrap it.", **ALOHA_STATIC_INFO},
+    "aloha_static_coffee": {
+        "single_task": "Place the coffee capsule inside the capsule container, then place the cup onto the center of the cup tray, then push the 'Hot Water' and 'Travel Mug' buttons.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_coffee_new": {
+        "single_task": "Place the coffee capsule inside the capsule container, then place the cup onto the center of the cup tray.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_cups_open": {
+        "single_task": "Pick up the plastic cup and open its lid.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_fork_pick_up": {
+        "single_task": "Pick up the fork and place it on the plate.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_pingpong_test": {
+        "single_task": "Transfer one of the two balls in the right glass into the left glass, then transfer it back to the right glass.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_pro_pencil": {
+        "single_task": "Pick up the pencil with the right arm, hand it over to the left arm then place it back onto the table.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_screw_driver": {
+        "single_task": "Pick up the screwdriver with the right arm, hand it over to the left arm then place it into the cup.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_tape": {
+        "single_task": "Cut a small piece of tape from the tape dispenser then place it on the cardboard box's edge.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_thread_velcro": {
+        "single_task": "Pick up the velcro cable tie with the left arm, then insert the end of the velcro tie into the other end's loop with the right arm.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_towel": {
+        "single_task": "Pick up a piece of paper towel and place it on the spilled liquid.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_vinh_cup": {
+        "single_task": "Pick up the plastic cup with the right arm, then pop its lid open with the left arm.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_vinh_cup_left": {
+        "single_task": "Pick up the plastic cup with the left arm, then pop its lid open with the right arm.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_static_ziploc_slide": {"single_task": "Slide open the ziploc bag.", **ALOHA_STATIC_INFO},
+    "aloha_sim_insertion_scripted": {"single_task": "Insert the peg into the socket.", **ALOHA_STATIC_INFO},
+    "aloha_sim_insertion_scripted_image": {
+        "single_task": "Insert the peg into the socket.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_sim_insertion_human": {"single_task": "Insert the peg into the socket.", **ALOHA_STATIC_INFO},
+    "aloha_sim_insertion_human_image": {
+        "single_task": "Insert the peg into the socket.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_sim_transfer_cube_scripted": {
+        "single_task": "Pick up the cube with the right arm and transfer it to the left arm.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_sim_transfer_cube_scripted_image": {
+        "single_task": "Pick up the cube with the right arm and transfer it to the left arm.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_sim_transfer_cube_human": {
+        "single_task": "Pick up the cube with the right arm and transfer it to the left arm.",
+        **ALOHA_STATIC_INFO,
+    },
+    "aloha_sim_transfer_cube_human_image": {
+        "single_task": "Pick up the cube with the right arm and transfer it to the left arm.",
+        **ALOHA_STATIC_INFO,
+    },
+    "pusht": {"single_task": "Push the T-shaped block onto the T-shaped target.", **PUSHT_INFO},
+    "pusht_image": {"single_task": "Push the T-shaped block onto the T-shaped target.", **PUSHT_INFO},
+    "unitreeh1_fold_clothes": {"single_task": "Fold the sweatshirt.", **UNITREEH_INFO},
+    "unitreeh1_rearrange_objects": {"single_task": "Put the object into the bin.", **UNITREEH_INFO},
+    "unitreeh1_two_robot_greeting": {
+        "single_task": "Greet the other robot with a high five.",
+        **UNITREEH_INFO,
+    },
+    "unitreeh1_warehouse": {
+        "single_task": "Grab the spray paint on the shelf and place it in the bin on top of the robot dog.",
+        **UNITREEH_INFO,
+    },
+    "xarm_lift_medium": {"single_task": "Pick up the cube and lift it.", **XARM_INFO},
+    "xarm_lift_medium_image": {"single_task": "Pick up the cube and lift it.", **XARM_INFO},
+    "xarm_lift_medium_replay": {"single_task": "Pick up the cube and lift it.", **XARM_INFO},
+    "xarm_lift_medium_replay_image": {"single_task": "Pick up the cube and lift it.", **XARM_INFO},
+    "xarm_push_medium": {"single_task": "Push the cube onto the target.", **XARM_INFO},
+    "xarm_push_medium_image": {"single_task": "Push the cube onto the target.", **XARM_INFO},
+    "xarm_push_medium_replay": {"single_task": "Push the cube onto the target.", **XARM_INFO},
+    "xarm_push_medium_replay_image": {"single_task": "Push the cube onto the target.", **XARM_INFO},
+    "umi_cup_in_the_wild": {
+        "single_task": "Put the cup on the plate.",
+        "license": "apache-2.0",
+    },
+    "asu_table_top": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "paper": "https://link.springer.com/article/10.1007/s10514-023-10129-1",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{zhou2023modularity,
+                title={Modularity through Attention: Efficient Training and Transfer of Language-Conditioned Policies for Robot Manipulation},
+                author={Zhou, Yifan and Sonawani, Shubham and Phielipp, Mariano and Stepputtis, Simon and Amor, Heni},
+                booktitle={Conference on Robot Learning},
+                pages={1684--1695},
+                year={2023},
+                organization={PMLR}
+            }
+            @article{zhou2023learning,
+                title={Learning modular language-conditioned robot policies through attention},
+                author={Zhou, Yifan and Sonawani, Shubham and Phielipp, Mariano and Ben Amor, Heni and Stepputtis, Simon},
+                journal={Autonomous Robots},
+                pages={1--21},
+                year={2023},
+                publisher={Springer}
+            }""").lstrip(),
+    },
+    "austin_buds_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://ut-austin-rpl.github.io/BUDS-website/",
+        "paper": "https://huggingface.co/papers/2109.13841",
+        "citation_bibtex": dedent(r"""
+            @article{zhu2022bottom,
+                title={Bottom-Up Skill Discovery From Unsegmented Demonstrations for Long-Horizon Robot Manipulation},
+                author={Zhu, Yifeng and Stone, Peter and Zhu, Yuke},
+                journal={IEEE Robotics and Automation Letters},
+                volume={7},
+                number={2},
+                pages={4126--4133},
+                year={2022},
+                publisher={IEEE}
+            }""").lstrip(),
+    },
+    "austin_sailor_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://ut-austin-rpl.github.io/sailor/",
+        "paper": "https://huggingface.co/papers/2210.11435",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{nasiriany2022sailor,
+                title={Learning and Retrieval from Prior Data for Skill-based Imitation Learning},
+                author={Soroush Nasiriany and Tian Gao and Ajay Mandlekar and Yuke Zhu},
+                booktitle={Conference on Robot Learning (CoRL)},
+                year={2022}
+            }""").lstrip(),
+    },
+    "austin_sirius_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://ut-austin-rpl.github.io/sirius/",
+        "paper": "https://huggingface.co/papers/2211.08416",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{liu2022robot,
+                title = {Robot Learning on the Job: Human-in-the-Loop Autonomy and Learning During Deployment},
+                author = {Huihan Liu and Soroush Nasiriany and Lance Zhang and Zhiyao Bao and Yuke Zhu},
+                booktitle = {Robotics: Science and Systems (RSS)},
+                year = {2023}
+            }""").lstrip(),
+    },
+    "berkeley_autolab_ur5": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "url": "https://sites.google.com/view/berkeley-ur5/home",
+        "citation_bibtex": dedent(r"""
+            @misc{BerkeleyUR5Website,
+                title = {Berkeley {UR5} Demonstration Dataset},
+                author = {Lawrence Yunliang Chen and Simeon Adebola and Ken Goldberg},
+                howpublished = {https://sites.google.com/view/berkeley-ur5/home},
+            }""").lstrip(),
+    },
+    "berkeley_cable_routing": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "url": "https://sites.google.com/view/cablerouting/home",
+        "paper": "https://huggingface.co/papers/2307.08927",
+        "citation_bibtex": dedent(r"""
+            @article{luo2023multistage,
+                author    = {Jianlan Luo and Charles Xu and Xinyang Geng and Gilbert Feng and Kuan Fang and Liam Tan and Stefan Schaal and Sergey Levine},
+                title     = {Multi-Stage Cable Routing through Hierarchical Imitation Learning},
+                journal   = {arXiv pre-print},
+                year      = {2023},
+                url       = {https://huggingface.co/papers/2307.08927},
+            }""").lstrip(),
+    },
+    "berkeley_fanuc_manipulation": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://sites.google.com/berkeley.edu/fanuc-manipulation",
+        "citation_bibtex": dedent(r"""
+            @article{fanuc_manipulation2023,
+                title={Fanuc Manipulation: A Dataset for Learning-based Manipulation with FANUC Mate 200iD Robot},
+                author={Zhu, Xinghao and Tian, Ran and Xu, Chenfeng and Ding, Mingyu and Zhan, Wei and Tomizuka, Masayoshi},
+                year={2023},
+            }""").lstrip(),
+    },
+    "berkeley_gnm_cory_hall": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "paper": "https://huggingface.co/papers/1709.10489",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{kahn2018self,
+                title={Self-supervised deep reinforcement learning with generalized computation graphs for robot navigation},
+                author={Kahn, Gregory and Villaflor, Adam and Ding, Bosen and Abbeel, Pieter and Levine, Sergey},
+                booktitle={2018 IEEE international conference on robotics and automation (ICRA)},
+                pages={5129--5136},
+                year={2018},
+                organization={IEEE}
+            }""").lstrip(),
+    },
+    "berkeley_gnm_recon": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://sites.google.com/view/recon-robot",
+        "paper": "https://huggingface.co/papers/2104.05859",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{shah2021rapid,
+                title={Rapid Exploration for Open-World Navigation with Latent Goal Models},
+                author={Dhruv Shah and Benjamin Eysenbach and Nicholas Rhinehart and Sergey Levine},
+                booktitle={5th Annual Conference on Robot Learning },
+                year={2021},
+                url={https://openreview.net/forum?id=d_SWJhyKfVw}
+            }""").lstrip(),
+    },
+    "berkeley_gnm_sac_son": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://sites.google.com/view/SACSoN-review",
+        "paper": "https://huggingface.co/papers/2306.01874",
+        "citation_bibtex": dedent(r"""
+            @article{hirose2023sacson,
+                title={SACSoN: Scalable Autonomous Data Collection for Social Navigation},
+                author={Hirose, Noriaki and Shah, Dhruv and Sridhar, Ajay and Levine, Sergey},
+                journal={arXiv preprint arXiv:2306.01874},
+                year={2023}
+            }""").lstrip(),
+    },
+    "berkeley_mvp": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "paper": "https://huggingface.co/papers/2203.06173",
+        "citation_bibtex": dedent(r"""
+            @InProceedings{Radosavovic2022,
+                title = {Real-World Robot Learning with Masked Visual Pre-training},
+                author = {Ilija Radosavovic and Tete Xiao and Stephen James and Pieter Abbeel and Jitendra Malik and Trevor Darrell},
+                booktitle = {CoRL},
+                year = {2022}
+            }""").lstrip(),
+    },
+    "berkeley_rpt": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "paper": "https://huggingface.co/papers/2306.10007",
+        "citation_bibtex": dedent(r"""
+            @article{Radosavovic2023,
+                title={Robot Learning with Sensorimotor Pre-training},
+                author={Ilija Radosavovic and Baifeng Shi and Letian Fu and Ken Goldberg and Trevor Darrell and Jitendra Malik},
+                year={2023},
+                journal={arXiv:2306.10007}
+            }""").lstrip(),
+    },
+    "cmu_franka_exploration_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://human-world-model.github.io/",
+        "paper": "https://huggingface.co/papers/2308.10901",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{mendonca2023structured,
+                title={Structured World Models from Human Videos},
+                author={Mendonca, Russell  and Bahl, Shikhar and Pathak, Deepak},
+                journal={RSS},
+                year={2023}
+            }""").lstrip(),
+    },
+    "cmu_play_fusion": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://play-fusion.github.io/",
+        "paper": "https://huggingface.co/papers/2312.04549",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{chen2023playfusion,
+                title={PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play},
+                author={Chen, Lili and Bahl, Shikhar and Pathak, Deepak},
+                booktitle={CoRL},
+                year={2023}
+            }""").lstrip(),
+    },
+    "cmu_stretch": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://robo-affordances.github.io/",
+        "paper": "https://huggingface.co/papers/2304.08488",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{bahl2023affordances,
+                title={Affordances from Human Videos as a Versatile Representation for Robotics},
+                author={Bahl, Shikhar and Mendonca, Russell and Chen, Lili and Jain, Unnat and Pathak, Deepak},
+                booktitle={CVPR},
+                year={2023}
+            }
+                @article{mendonca2023structured,
+                title={Structured World Models from Human Videos},
+                author={Mendonca, Russell and Bahl, Shikhar and Pathak, Deepak},
+                journal={CoRL},
+                year={2023}
+            }""").lstrip(),
+    },
+    "columbia_cairlab_pusht_real": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://diffusion-policy.cs.columbia.edu/",
+        "paper": "https://huggingface.co/papers/2303.04137",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{chi2023diffusionpolicy,
+                title={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion},
+                author={Chi, Cheng and Feng, Siyuan and Du, Yilun and Xu, Zhenjia and Cousineau, Eric and Burchfiel, Benjamin and Song, Shuran},
+                booktitle={Proceedings of Robotics: Science and Systems (RSS)},
+                year={2023}
+            }""").lstrip(),
+    },
+    "conq_hose_manipulation": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://sites.google.com/view/conq-hose-manipulation-dataset/home",
+        "citation_bibtex": dedent(r"""
+            @misc{ConqHoseManipData,
+                author={Peter Mitrano and Dmitry Berenson},
+                title={Conq Hose Manipulation Dataset, v1.15.0},
+                year={2024},
+                howpublished={https://sites.google.com/view/conq-hose-manipulation-dataset}
+            }""").lstrip(),
+    },
+    "dlr_edan_shared_control": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "paper": "https://ieeexplore.ieee.org/document/9341156",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{vogel_edan_2020,
+                title = {EDAN - an EMG-Controlled Daily Assistant to Help People with Physical Disabilities},
+                language = {en},
+                booktitle = {2020 {IEEE}/{RSJ} {International} {Conference} on {Intelligent} {Robots} and {Systems} ({IROS})},
+                author = {Vogel, Jörn and Hagengruber, Annette and Iskandar, Maged and Quere, Gabriel and Leipscher, Ulrike and Bustamante, Samuel and Dietrich, Alexander and Hoeppner, Hannes and Leidner, Daniel and Albu-Schäffer, Alin},
+                year = {2020}
+            }
+            @inproceedings{quere_shared_2020,
+                address = {Paris, France},
+                title = {Shared {Control} {Templates} for {Assistive} {Robotics}},
+                language = {en},
+                booktitle = {2020 {IEEE} {International} {Conference} on {Robotics} and {Automation} ({ICRA})},
+                author = {Quere, Gabriel and Hagengruber, Annette and Iskandar, Maged and Bustamante, Samuel and Leidner, Daniel and Stulp, Freek and Vogel, Joern},
+                year = {2020},
+                pages = {7},
+            }""").lstrip(),
+    },
+    "dlr_sara_grid_clamp": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "paper": "https://www.researchsquare.com/article/rs-3289569/v1",
+        "citation_bibtex": dedent(r"""
+            @article{padalkar2023guided,
+                title={A guided reinforcement learning approach using shared control templates for learning manipulation skills in the real world},
+                author={Padalkar, Abhishek and Quere, Gabriel and Raffin, Antonin and Silv{\'e}rio, Jo{\~a}o and Stulp, Freek},
+                journal={Research square preprint rs-3289569/v1},
+                year={2023}
+            }""").lstrip(),
+    },
+    "dlr_sara_pour": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "paper": "https://elib.dlr.de/193739/1/padalkar2023rlsct.pdf",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{padalkar2023guiding,
+                title={Guiding Reinforcement Learning with Shared Control Templates},
+                author={Padalkar, Abhishek and Quere, Gabriel and Steinmetz, Franz and Raffin, Antonin and Nieuwenhuisen, Matthias and Silv{\'e}rio, Jo{\~a}o and Stulp, Freek},
+                booktitle={40th IEEE International Conference on Robotics and Automation, ICRA 2023},
+                year={2023},
+                organization={IEEE}
+            }""").lstrip(),
+    },
+    "droid_100": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://droid-dataset.github.io/",
+        "paper": "https://huggingface.co/papers/2403.12945",
+        "citation_bibtex": dedent(r"""
+            @article{khazatsky2024droid,
+                title   = {DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset},
+                author  = {Alexander Khazatsky and Karl Pertsch and Suraj Nair and Ashwin Balakrishna and Sudeep Dasari and Siddharth Karamcheti and Soroush Nasiriany and Mohan Kumar Srirama and Lawrence Yunliang Chen and Kirsty Ellis and Peter David Fagan and Joey Hejna and Masha Itkina and Marion Lepert and Yecheng Jason Ma and Patrick Tree Miller and Jimmy Wu and Suneel Belkhale and Shivin Dass and Huy Ha and Arhan Jain and Abraham Lee and Youngwoon Lee and Marius Memmel and Sungjae Park and Ilija Radosavovic and Kaiyuan Wang and Albert Zhan and Kevin Black and Cheng Chi and Kyle Beltran Hatch and Shan Lin and Jingpei Lu and Jean Mercat and Abdul Rehman and Pannag R Sanketi and Archit Sharma and Cody Simpson and Quan Vuong and Homer Rich Walke and Blake Wulfe and Ted Xiao and Jonathan Heewon Yang and Arefeh Yavary and Tony Z. Zhao and Christopher Agia and Rohan Baijal and Mateo Guaman Castro and Daphne Chen and Qiuyu Chen and Trinity Chung and Jaimyn Drake and Ethan Paul Foster and Jensen Gao and David Antonio Herrera and Minho Heo and Kyle Hsu and Jiaheng Hu and Donovon Jackson and Charlotte Le and Yunshuang Li and Kevin Lin and Roy Lin and Zehan Ma and Abhiram Maddukuri and Suvir Mirchandani and Daniel Morton and Tony Nguyen and Abigail O'Neill and Rosario Scalise and Derick Seale and Victor Son and Stephen Tian and Emi Tran and Andrew E. Wang and Yilin Wu and Annie Xie and Jingyun Yang and Patrick Yin and Yunchu Zhang and Osbert Bastani and Glen Berseth and Jeannette Bohg and Ken Goldberg and Abhinav Gupta and Abhishek Gupta and Dinesh Jayaraman and Joseph J Lim and Jitendra Malik and Roberto Martín-Martín and Subramanian Ramamoorthy and Dorsa Sadigh and Shuran Song and Jiajun Wu and Michael C. Yip and Yuke Zhu and Thomas Kollar and Sergey Levine and Chelsea Finn},
+                year    = {2024},
+            }""").lstrip(),
+    },
+    "fmb": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "url": "https://functional-manipulation-benchmark.github.io/",
+        "paper": "https://huggingface.co/papers/2401.08553",
+        "citation_bibtex": dedent(r"""
+            @article{luo2024fmb,
+                title={FMB: a Functional Manipulation Benchmark for Generalizable Robotic Learning},
+                author={Luo, Jianlan and Xu, Charles and Liu, Fangchen and Tan, Liam and Lin, Zipeng and Wu, Jeffrey and Abbeel, Pieter and Levine, Sergey},
+                journal={arXiv preprint arXiv:2401.08553},
+                year={2024}
+            }""").lstrip(),
+    },
+    "iamlab_cmu_pickup_insert": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://openreview.net/forum?id=WuBv9-IGDUA",
+        "paper": "https://huggingface.co/papers/2401.14502",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{saxena2023multiresolution,
+                title={Multi-Resolution Sensing for Real-Time Control with Vision-Language Models},
+                author={Saumya Saxena and Mohit Sharma and Oliver Kroemer},
+                booktitle={7th Annual Conference on Robot Learning},
+                year={2023},
+                url={https://openreview.net/forum?id=WuBv9-IGDUA}
+            }""").lstrip(),
+    },
+    "imperialcollege_sawyer_wrist_cam": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+    },
+    "jaco_play": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "url": "https://github.com/clvrai/clvr_jaco_play_dataset",
+        "citation_bibtex": dedent(r"""
+            @software{dass2023jacoplay,
+                author = {Dass, Shivin and Yapeter, Jullian and Zhang, Jesse and Zhang, Jiahui
+                            and Pertsch, Karl and Nikolaidis, Stefanos and Lim, Joseph J.},
+                title = {CLVR Jaco Play Dataset},
+                url = {https://github.com/clvrai/clvr_jaco_play_dataset},
+                version = {1.0.0},
+                year = {2023}
+            }""").lstrip(),
+    },
+    "kaist_nonprehensile": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "url": "https://github.com/JaeHyung-Kim/rlds_dataset_builder",
+        "citation_bibtex": dedent(r"""
+            @article{kimpre,
+                title={Pre-and post-contact policy decomposition for non-prehensile manipulation with zero-shot sim-to-real transfer},
+                author={Kim, Minchan and Han, Junhyek and Kim, Jaehyung and Kim, Beomjoon},
+                booktitle={2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
+                year={2023},
+                organization={IEEE}
+            }""").lstrip(),
+    },
+    "nyu_door_opening_surprising_effectiveness": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://jyopari.github.io/VINN/",
+        "paper": "https://huggingface.co/papers/2112.01511",
+        "citation_bibtex": dedent(r"""
+            @misc{pari2021surprising,
+                title={The Surprising Effectiveness of Representation Learning for Visual Imitation},
+                author={Jyothish Pari and Nur Muhammad Shafiullah and Sridhar Pandian Arunachalam and Lerrel Pinto},
+                year={2021},
+                eprint={2112.01511},
+                archivePrefix={arXiv},
+                primaryClass={cs.RO}
+            }""").lstrip(),
+    },
+    "nyu_franka_play_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://play-to-policy.github.io/",
+        "paper": "https://huggingface.co/papers/2210.10047",
+        "citation_bibtex": dedent(r"""
+            @article{cui2022play,
+                title   = {From Play to Policy: Conditional Behavior Generation from Uncurated Robot Data},
+                author  = {Cui, Zichen Jeff and Wang, Yibin and Shafiullah, Nur Muhammad Mahi and Pinto, Lerrel},
+                journal = {arXiv preprint arXiv:2210.10047},
+                year    = {2022}
+            }""").lstrip(),
+    },
+    "nyu_rot_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://rot-robot.github.io/",
+        "paper": "https://huggingface.co/papers/2206.15469",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{haldar2023watch,
+                title={Watch and match: Supercharging imitation with regularized optimal transport},
+                author={Haldar, Siddhant and Mathur, Vaibhav and Yarats, Denis and Pinto, Lerrel},
+                booktitle={Conference on Robot Learning},
+                pages={32--43},
+                year={2023},
+                organization={PMLR}
+            }""").lstrip(),
+    },
+    "roboturk": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://roboturk.stanford.edu/dataset_real.html",
+        "paper": "PAPER",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{mandlekar2019scaling,
+                title={Scaling robot supervision to hundreds of hours with roboturk: Robotic manipulation dataset through human reasoning and dexterity},
+                author={Mandlekar, Ajay and Booher, Jonathan and Spero, Max and Tung, Albert and Gupta, Anchit and Zhu, Yuke and Garg, Animesh and Savarese, Silvio and Fei-Fei, Li},
+                booktitle={2019 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
+                pages={1048--1055},
+                year={2019},
+                organization={IEEE}
+            }""").lstrip(),
+    },
+    "stanford_hydra_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://sites.google.com/view/hydra-il-2023",
+        "paper": "https://huggingface.co/papers/2306.17237",
+        "citation_bibtex": dedent(r"""
+            @article{belkhale2023hydra,
+                title={HYDRA: Hybrid Robot Actions for Imitation Learning},
+                author={Belkhale, Suneel and Cui, Yuchen and Sadigh, Dorsa},
+                journal={arxiv},
+                year={2023}
+            }""").lstrip(),
+    },
+    "stanford_kuka_multimodal_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://sites.google.com/view/visionandtouch",
+        "paper": "https://huggingface.co/papers/1810.10191",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{lee2019icra,
+                title={Making sense of vision and touch: Self-supervised learning of multimodal representations for contact-rich tasks},
+                author={Lee, Michelle A and Zhu, Yuke and Srinivasan, Krishnan and Shah, Parth and Savarese, Silvio and Fei-Fei, Li and  Garg, Animesh and Bohg, Jeannette},
+                booktitle={2019 IEEE International Conference on Robotics and Automation (ICRA)},
+                year={2019},
+                url={https://huggingface.co/papers/1810.10191}
+            }""").lstrip(),
+    },
+    "stanford_robocook": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://hshi74.github.io/robocook/",
+        "paper": "https://huggingface.co/papers/2306.14447",
+        "citation_bibtex": dedent(r"""
+            @article{shi2023robocook,
+                title={RoboCook: Long-Horizon Elasto-Plastic Object Manipulation with Diverse Tools},
+                author={Shi, Haochen and Xu, Huazhe and Clarke, Samuel and Li, Yunzhu and Wu, Jiajun},
+                journal={arXiv preprint arXiv:2306.14447},
+                year={2023}
+            }""").lstrip(),
+    },
+    "taco_play": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "url": "https://www.kaggle.com/datasets/oiermees/taco-robot",
+        "paper": "https://huggingface.co/papers/2209.08959, https://huggingface.co/papers/2210.01911",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{rosete2022tacorl,
+                author = {Erick Rosete-Beas and Oier Mees and Gabriel Kalweit and Joschka Boedecker and Wolfram Burgard},
+                title = {Latent Plans for Task Agnostic Offline Reinforcement Learning},
+                journal = {Proceedings of the 6th Conference on Robot Learning (CoRL)},
+                year = {2022}
+            }
+            @inproceedings{mees23hulc2,
+                title={Grounding  Language  with  Visual  Affordances  over  Unstructured  Data},
+                author={Oier Mees and Jessica Borja-Diaz and Wolfram Burgard},
+                booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation (ICRA)},
+                year={2023},
+                address = {London, UK}
+            }""").lstrip(),
+    },
+    "tokyo_u_lsmo": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "URL",
+        "paper": "https://huggingface.co/papers/2107.05842",
+        "citation_bibtex": dedent(r"""
+            @Article{Osa22,
+                author  = {Takayuki Osa},
+                journal = {The International Journal of Robotics Research},
+                title   = {Motion Planning by Learning the Solution Manifold in Trajectory Optimization},
+                year    = {2022},
+                number  = {3},
+                pages   = {291--311},
+                volume  = {41},
+            }""").lstrip(),
+    },
+    "toto": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://toto-benchmark.org/",
+        "paper": "https://huggingface.co/papers/2306.00942",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{zhou2023train,
+                author={Zhou, Gaoyue and Dean, Victoria and Srirama, Mohan Kumar and Rajeswaran, Aravind and Pari, Jyothish and Hatch, Kyle and Jain, Aryan and Yu, Tianhe and Abbeel, Pieter and Pinto, Lerrel and Finn, Chelsea and Gupta, Abhinav},
+                booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)},
+                title={Train Offline, Test Online: A Real Robot Learning Benchmark},
+                year={2023},
+            }""").lstrip(),
+    },
+    "ucsd_kitchen_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "citation_bibtex": dedent(r"""
+            @ARTICLE{ucsd_kitchens,
+                author = {Ge Yan, Kris Wu, and Xiaolong Wang},
+                title = {{ucsd kitchens Dataset}},
+                year = {2023},
+                month = {August}
+            }""").lstrip(),
+    },
+    "ucsd_pick_and_place_dataset": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://owmcorl.github.io/#",
+        "paper": "https://huggingface.co/papers/2310.16029",
+        "citation_bibtex": dedent(r"""
+            @preprint{Feng2023Finetuning,
+                title={Finetuning Offline World Models in the Real World},
+                author={Yunhai Feng, Nicklas Hansen, Ziyan Xiong, Chandramouli Rajagopalan, Xiaolong Wang},
+                year={2023}
+            }""").lstrip(),
+    },
+    "uiuc_d3field": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://robopil.github.io/d3fields/",
+        "paper": "https://huggingface.co/papers/2309.16118",
+        "citation_bibtex": dedent(r"""
+            @article{wang2023d3field,
+                title={D^3Field: Dynamic 3D Descriptor Fields for Generalizable Robotic Manipulation},
+                author={Wang, Yixuan and Li, Zhuoran and Zhang, Mingtong and Driggs-Campbell, Katherine and Wu, Jiajun and Fei-Fei, Li and Li, Yunzhu},
+                journal={arXiv preprint arXiv:},
+                year={2023},
+            }""").lstrip(),
+    },
+    "usc_cloth_sim": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://uscresl.github.io/dmfd/",
+        "paper": "https://huggingface.co/papers/2207.10148",
+        "citation_bibtex": dedent(r"""
+            @article{salhotra2022dmfd,
+                author={Salhotra, Gautam and Liu, I-Chun Arthur and Dominguez-Kuhne, Marcus and Sukhatme, Gaurav S.},
+                journal={IEEE Robotics and Automation Letters},
+                title={Learning Deformable Object Manipulation From Expert Demonstrations},
+                year={2022},
+                volume={7},
+                number={4},
+                pages={8775-8782},
+                doi={10.1109/LRA.2022.3187843}
+            }""").lstrip(),
+    },
+    "utaustin_mutex": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://ut-austin-rpl.github.io/MUTEX/",
+        "paper": "https://huggingface.co/papers/2309.14320",
+        "citation_bibtex": dedent(r"""
+            @inproceedings{shah2023mutex,
+                title={{MUTEX}: Learning Unified Policies from Multimodal Task Specifications},
+                author={Rutav Shah and Roberto Mart{\'\i}n-Mart{\'\i}n and Yuke Zhu},
+                booktitle={7th Annual Conference on Robot Learning},
+                year={2023},
+                url={https://openreview.net/forum?id=PwqiqaaEzJ}
+            }""").lstrip(),
+    },
+    "utokyo_pr2_opening_fridge": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "citation_bibtex": dedent(r"""
+            @misc{oh2023pr2utokyodatasets,
+                author={Jihoon Oh and Naoaki Kanazawa and Kento Kawaharazuka},
+                title={X-Embodiment U-Tokyo PR2 Datasets},
+                year={2023},
+                url={https://github.com/ojh6404/rlds_dataset_builder},
+            }""").lstrip(),
+    },
+    "utokyo_pr2_tabletop_manipulation": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "citation_bibtex": dedent(r"""
+            @misc{oh2023pr2utokyodatasets,
+                author={Jihoon Oh and Naoaki Kanazawa and Kento Kawaharazuka},
+                title={X-Embodiment U-Tokyo PR2 Datasets},
+                year={2023},
+                url={https://github.com/ojh6404/rlds_dataset_builder},
+            }""").lstrip(),
+    },
+    "utokyo_saytap": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://saytap.github.io/",
+        "paper": "https://huggingface.co/papers/2306.07580",
+        "citation_bibtex": dedent(r"""
+            @article{saytap2023,
+                author = {Yujin Tang and Wenhao Yu and Jie Tan and Heiga Zen and Aleksandra Faust and
+                Tatsuya Harada},
+                title  = {SayTap: Language to Quadrupedal Locomotion},
+                eprint = {arXiv:2306.07580},
+                url    = {https://saytap.github.io},
+                note   = {https://saytap.github.io},
+                year   = {2023}
+            }""").lstrip(),
+    },
+    "utokyo_xarm_bimanual": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "citation_bibtex": dedent(r"""
+            @misc{matsushima2023weblab,
+                title={Weblab xArm Dataset},
+                author={Tatsuya Matsushima and Hiroki Furuta and Yusuke Iwasawa and Yutaka Matsuo},
+                year={2023},
+            }""").lstrip(),
+    },
+    "utokyo_xarm_pick_and_place": {
+        "tasks_col": "language_instruction",
+        "license": "cc-by-4.0",
+        "citation_bibtex": dedent(r"""
+            @misc{matsushima2023weblab,
+                title={Weblab xArm Dataset},
+                author={Tatsuya Matsushima and Hiroki Furuta and Yusuke Iwasawa and Yutaka Matsuo},
+                year={2023},
+            }""").lstrip(),
+    },
+    "viola": {
+        "tasks_col": "language_instruction",
+        "license": "mit",
+        "url": "https://ut-austin-rpl.github.io/VIOLA/",
+        "paper": "https://huggingface.co/papers/2210.11339",
+        "citation_bibtex": dedent(r"""
+            @article{zhu2022viola,
+                title={VIOLA: Imitation Learning for Vision-Based Manipulation with Object Proposal Priors},
+                author={Zhu, Yifeng and Joshi, Abhishek and Stone, Peter and Zhu, Yuke},
+                journal={6th Annual Conference on Robot Learning (CoRL)},
+                year={2022}
+            }""").lstrip(),
+    },
+}
+# spellchecker:on
+
+
+def batch_convert():
+    """Convert every dataset listed in ``DATASETS`` and append each outcome to a log file.
+
+    Each entry of ``DATASETS`` is converted with ``convert_dataset`` under the ``lerobot/`` namespace,
+    using the entry's dict as keyword arguments. A failure is recorded together with its traceback in
+    ``LOCAL_DIR / "conversion_log.txt"`` and does not stop the remaining conversions.
+    """
+    logfile = LOCAL_DIR / "conversion_log.txt"
+    # Sanity check: DATASETS must describe exactly the datasets listed in `available_datasets`.
+    assert set(DATASETS) == {id_.split("/")[1] for id_ in available_datasets}
+    # Count from 1 so that the progress line ends at "(N/N)" instead of "(N-1/N)".
+    for num, (name, kwargs) in enumerate(DATASETS.items(), start=1):
+        repo_id = f"lerobot/{name}"
+        print(f"\nConverting {repo_id} ({num}/{len(DATASETS)})")
+        print("---------------------------------------------------------")
+        try:
+            convert_dataset(repo_id, LOCAL_DIR, **kwargs)
+            status = f"{repo_id}: success."
+        except Exception:
+            # Batch boundary: keep going with the next dataset, but keep the full traceback in the log.
+            status = f"{repo_id}: failed\n    {traceback.format_exc()}"
+
+        with open(logfile, "a") as file:
+            file.write(status + "\n")
+
+
+if __name__ == "__main__":
+    batch_convert()
--- a/src/lerobot/datasets/v2/convert_dataset_v1_to_v2.py
+++ b/src/lerobot/datasets/v2/convert_dataset_v1_to_v2.py
@@ -0,0 +1,687 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 1.6 to
+2.0. You will be required to provide the 'tasks', which is a short but accurate description in plain English
+for each of the tasks performed in the dataset. This makes it easy to train models with task-conditioning.
+
+We support 3 different scenarios for these tasks (see instructions below):
+    1. Single task dataset: all episodes of your dataset have the same single task.
+    2. Single task episodes: the episodes of your dataset each contain a single task but they can differ from
+      one episode to the next.
+    3. Multi task episodes: episodes of your dataset may each contain several different tasks.
+
+
+You can also provide a robot config .yaml file (not mandatory) to this script via the option
+'--robot-config' so that it writes information about the robot (robot type, motors names) this dataset was
+recorded with. For now, only Aloha/Koch type robots are supported with this option.
+
+
+# 1. Single task dataset
+If your dataset contains a single task, you can simply provide it directly via the CLI with the
+'--single-task' option.
+
+Examples:
+
+```bash
+python -m lerobot.datasets.v2.convert_dataset_v1_to_v2 \
+    --repo-id lerobot/aloha_sim_insertion_human_image \
+    --single-task "Insert the peg into the socket." \
+    --robot-config lerobot/configs/robot/aloha.yaml \
+    --local-dir data
+```
+
+```bash
+python -m lerobot.datasets.v2.convert_dataset_v1_to_v2 \
+    --repo-id aliberts/koch_tutorial \
+    --single-task "Pick the Lego block and drop it in the box on the right." \
+    --robot-config lerobot/configs/robot/koch.yaml \
+    --local-dir data
+```
+
+
+# 2. Single task episodes
+If your dataset is a multi-task dataset, you have two options to provide the tasks to this script:
+
+- If your dataset already contains a language instruction column in its parquet file, you can simply provide
+  this column's name with the '--tasks-col' arg.
+
+    Example:
+
+    ```bash
+    python -m lerobot.datasets.v2.convert_dataset_v1_to_v2 \
+        --repo-id lerobot/stanford_kuka_multimodal_dataset \
+        --tasks-col "language_instruction" \
+        --local-dir data
+    ```
+
+- If your dataset doesn't contain a language instruction, you should provide the path to a .json file with the
+  '--tasks-path' arg. This file should have the following structure where keys correspond to each
+  episode_index in the dataset, and values are the language instruction for that episode.
+
+    Example:
+
+    ```json
+    {
+        "0": "Do something",
+        "1": "Do something else",
+        "2": "Do something",
+        "3": "Go there",
+        ...
+    }
+    ```
+
+# 3. Multi task episodes
+If you have multiple tasks per episode, your dataset should contain a language instruction column in its
+parquet file, and you must provide this column's name with the '--tasks-col' arg.
+
+Example:
+
+```bash
+python -m lerobot.datasets.v2.convert_dataset_v1_to_v2 \
+    --repo-id lerobot/stanford_kuka_multimodal_dataset \
+    --tasks-col "language_instruction" \
+    --local-dir data
+```
+"""
+
+import argparse
+import contextlib
+import filecmp
+import json
+import logging
+import math
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+import datasets
+import pyarrow.compute as pc
+import pyarrow.parquet as pq
+import torch
+from datasets import Dataset
+from huggingface_hub import HfApi
+from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError
+from safetensors.torch import load_file
+
+from lerobot.datasets.utils import (
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_PARQUET_PATH,
+    DEFAULT_VIDEO_PATH,
+    EPISODES_PATH,
+    INFO_PATH,
+    STATS_PATH,
+    TASKS_PATH,
+    create_branch,
+    create_lerobot_dataset_card,
+    flatten_dict,
+    get_safe_version,
+    load_json,
+    unflatten_dict,
+    write_json,
+    write_jsonlines,
+)
+from lerobot.datasets.video_utils import (
+    VideoFrame,  # noqa: F401
+    get_image_pixel_channels,
+    get_video_info,
+)
+from lerobot.robots import RobotConfig
+
+# Codebase version tags of the source (v1.6) and target (v2.0) dataset formats.
+V16 = "v1.6"
+V20 = "v2.0"
+
+# Hub dataset repo from which the reference `.gitattributes` file is downloaded.
+GITATTRIBUTES_REF = "aliberts/gitattributes_reference"
+# File name template of a v1 video file, formatted with `video_key` and `episode_index`.
+V1_VIDEO_FILE = "{video_key}_episode_{episode_index:06d}.mp4"
+# Locations of the v1 metadata files, relative to the root of the v1 dataset directory.
+V1_INFO_PATH = "meta_data/info.json"
+V1_STATS_PATH = "meta_data/stats.safetensors"
+
+
+def parse_robot_config(robot_cfg: RobotConfig) -> dict:
+    """Extract the robot type and the motor names from a robot config.
+
+    Args:
+        robot_cfg: Robot configuration. Only the "aloha" and "koch" types are handled; their
+            `follower_arms` and `leader_arms` mappings are read to build the motor names.
+
+    Returns:
+        A dict with the keys "robot_type" and "names". "names" maps "observation.state" and
+        "observation.effort" to the follower motor names, and "action" to the leader motor names.
+
+    Raises:
+        NotImplementedError: If `robot_cfg.type` is neither "aloha" nor "koch".
+    """
+    # TODO: support other robot types (e.g. stretch3)
+    if robot_cfg.type not in ["aloha", "koch"]:
+        raise NotImplementedError(
+            "Please provide robot_config={'robot_type': ..., 'names': ...} directly to convert_dataset()."
+        )
+
+    def _motor_names(arms) -> list[str]:
+        # Motor names are prefixed with the arm name only when there are several arms.
+        prefix_with_arm = len(arms) > 1
+        return [
+            f"{arm}_{motor}" if prefix_with_arm else motor for arm in arms for motor in arms[arm].motors
+        ]
+
+    state_names = _motor_names(robot_cfg.follower_arms)
+    action_names = _motor_names(robot_cfg.leader_arms)
+
+    return {
+        "robot_type": robot_cfg.type,
+        "names": {
+            "observation.state": state_names,
+            "observation.effort": state_names,
+            "action": action_names,
+        },
+    }
+
+
+def convert_stats_to_json(v1_dir: Path, v2_dir: Path) -> None:
+    """Rewrite the v1 safetensors stats file as a JSON stats file in the v2 directory.
+
+    The tensors loaded from `v1_dir / V1_STATS_PATH` are turned into lists, passed through
+    `unflatten_dict` and dumped to `v2_dir / STATS_PATH`. The JSON file is then read back and compared
+    against the original tensors.
+
+    Raises:
+        AssertionError: If a value read back from the JSON file is not close to the original tensor.
+    """
+    stats = load_file(v1_dir / V1_STATS_PATH)
+    nested_stats = unflatten_dict({name: tensor.tolist() for name, tensor in stats.items()})
+
+    json_path = v2_dir / STATS_PATH
+    json_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(json_path, "w") as f:
+        json.dump(nested_stats, f, indent=4)
+
+    # Sanity check: what was written must round-trip back to the original tensors.
+    with open(json_path) as f:
+        reloaded = flatten_dict(json.load(f))
+
+    reloaded = {name: torch.tensor(values) for name, values in reloaded.items()}
+    for name, expected in stats.items():
+        torch.testing.assert_close(reloaded[name], expected)
+
+
+def get_features_from_hf_dataset(
+    dataset: Dataset, robot_config: RobotConfig | None = None
+) -> dict[str, dict]:
+    """Describe each feature of a Hugging Face dataset as a dict with "dtype", "shape" and "names".
+
+    Args:
+        dataset: Dataset whose `features` are inspected. Its first row is read to get the size of
+            image features.
+        robot_config: Optional robot config. When given, motor names of sequence features are taken
+            from it; otherwise generic "motor_{i}" names are generated.
+
+    Returns:
+        A dict mapping each feature key to {"dtype": ..., "shape": ..., "names": ...}. Video features
+        get `shape=None`, to be filled in later by the caller.
+    """
+    # `robot_config` is optional: only parse it when one was actually provided.
+    parsed_robot_config = parse_robot_config(robot_config) if robot_config is not None else None
+    features = {}
+    for key, ft in dataset.features.items():
+        if isinstance(ft, datasets.Value):
+            dtype = ft.dtype
+            shape = (1,)
+            names = None
+        elif isinstance(ft, datasets.Sequence):
+            assert isinstance(ft.feature, datasets.Value)
+            dtype = ft.feature.dtype
+            shape = (ft.length,)
+            motor_names = (
+                parsed_robot_config["names"][key]
+                if parsed_robot_config
+                else [f"motor_{i}" for i in range(ft.length)]
+            )
+            assert len(motor_names) == shape[0]
+            names = {"motors": motor_names}
+        elif isinstance(ft, datasets.Image):
+            dtype = "image"
+            image = dataset[0][key]  # Assuming first row
+            channels = get_image_pixel_channels(image)
+            shape = (image.height, image.width, channels)
+            names = ["height", "width", "channels"]
+        elif ft._type == "VideoFrame":
+            dtype = "video"
+            shape = None  # Add shape later
+            names = ["height", "width", "channels"]
+        # NOTE(review): a feature matching none of the branches above reuses the dtype/shape/names of
+        # the previous feature (or fails on unbound names if it comes first) - confirm whether such
+        # feature types can occur.
+
+        features[key] = {
+            "dtype": dtype,
+            "shape": shape,
+            "names": names,
+        }
+
+    return features
+
+
+def add_task_index_by_episodes(dataset: Dataset, tasks_by_episodes: dict) -> tuple[Dataset, list[str]]:
+    """Add a "task_index" column derived from a mapping of episode index to task.
+
+    Args:
+        dataset: Dataset with an "episode_index" column.
+        tasks_by_episodes: Mapping from episode index to the task of that episode.
+
+    Returns:
+        The dataset rebuilt with an int64 "task_index" column, and the list of unique tasks. The
+        position of a task in that list is its task index.
+    """
+    df = dataset.to_pandas()
+    # De-duplicate while keeping first-seen order, so that task indices are the same on every run
+    # (a set of strings has no stable iteration order between interpreter runs).
+    tasks = list(dict.fromkeys(tasks_by_episodes.values()))
+    tasks_to_task_index = {task: task_idx for task_idx, task in enumerate(tasks)}
+    episodes_to_task_index = {ep_idx: tasks_to_task_index[task] for ep_idx, task in tasks_by_episodes.items()}
+    df["task_index"] = df["episode_index"].map(episodes_to_task_index).astype(int)
+
+    features = dataset.features
+    features["task_index"] = datasets.Value(dtype="int64")
+    dataset = Dataset.from_pandas(df, features=features, split="train")
+    return dataset, tasks
+
+
+def add_task_index_from_tasks_col(
+    dataset: Dataset, tasks_col: str
+) -> tuple[Dataset, list[str], dict[int, list[str]]]:
+    """Add a "task_index" column derived from an existing task column, then drop that column.
+
+    Args:
+        dataset: Dataset with an "episode_index" column and a `tasks_col` column.
+        tasks_col: Name of the column holding the task (language instruction) of each row.
+
+    Returns:
+        A tuple `(dataset, tasks, tasks_by_episode)`: the dataset rebuilt with an int64 "task_index"
+        column and without `tasks_col`, the list of unique tasks (the position of a task is its task
+        index), and a mapping from episode index to the unique tasks found in that episode.
+    """
+    df = dataset.to_pandas()
+
+    # HACK: This is to clean some of the instructions in our version of Open X datasets
+    prefix_to_clean = "tf.Tensor(b'"
+    suffix_to_clean = "', shape=(), dtype=string)"
+    df[tasks_col] = df[tasks_col].str.removeprefix(prefix_to_clean).str.removesuffix(suffix_to_clean)
+
+    # Create task_index col
+    tasks_by_episode = df.groupby("episode_index")[tasks_col].unique().apply(lambda x: x.tolist()).to_dict()
+    tasks = df[tasks_col].unique().tolist()
+    tasks_to_task_index = {task: idx for idx, task in enumerate(tasks)}
+    df["task_index"] = df[tasks_col].map(tasks_to_task_index).astype(int)
+
+    # Build the dataset back from df
+    features = dataset.features
+    features["task_index"] = datasets.Value(dtype="int64")
+    dataset = Dataset.from_pandas(df, features=features, split="train")
+    dataset = dataset.remove_columns(tasks_col)
+
+    return dataset, tasks, tasks_by_episode
+
+
+def split_parquet_by_episodes(
+    dataset: Dataset,
+    total_episodes: int,
+    total_chunks: int,
+    output_dir: Path,
+) -> list:
+    """Write one parquet file per episode, grouped into chunk directories.
+
+    Args:
+        dataset: Dataset whose underlying table has an "episode_index" column.
+        total_episodes: Number of episodes to write; episode indices 0..total_episodes-1 are used.
+        total_chunks: Number of chunk directories to create.
+        output_dir: Root directory under which `DEFAULT_PARQUET_PATH` is resolved.
+
+    Returns:
+        The number of rows of each episode, in episode-index order.
+    """
+    table = dataset.data.table
+    # Directory part of the parquet path template (everything before the file name).
+    chunk_dir_template = "/".join(DEFAULT_PARQUET_PATH.split("/")[:-1])
+    episode_lengths = []
+    for chunk_idx in range(total_chunks):
+        first_ep = DEFAULT_CHUNK_SIZE * chunk_idx
+        end_ep = min(DEFAULT_CHUNK_SIZE * (chunk_idx + 1), total_episodes)
+        chunk_path = output_dir / chunk_dir_template.format(episode_chunk=chunk_idx)
+        chunk_path.mkdir(parents=True, exist_ok=True)
+        for ep_idx in range(first_ep, end_ep):
+            is_episode = pc.equal(table["episode_index"], ep_idx)
+            ep_table = table.filter(is_episode)
+            # Episodes are visited in increasing order starting at 0, so appending keeps
+            # `episode_lengths[ep_idx]` aligned with `ep_idx`.
+            episode_lengths.append(len(ep_table))
+            parquet_file = DEFAULT_PARQUET_PATH.format(episode_chunk=chunk_idx, episode_index=ep_idx)
+            pq.write_table(ep_table, output_dir / parquet_file)
+
+    return episode_lengths
+
+
+def move_videos(
+    repo_id: str,
+    video_keys: list[str],
+    total_episodes: int,
+    total_chunks: int,
+    work_dir: Path,
+    clean_gittatributes: Path,
+    branch: str = "main",
+) -> None:
+    """
+    HACK: Since HfApi() doesn't provide a way to move files directly in a repo, this function will run git
+    commands to fetch git lfs video files references to move them into subdirectories without having to
+    actually download them.
+
+    Args:
+        repo_id: Hub dataset repo to clone.
+        video_keys: Video feature keys; one video file per key and per episode is expected.
+        total_episodes: Number of episodes in the dataset.
+        total_chunks: Number of episode chunks.
+        work_dir: Existing directory into which the repo is cloned.
+        clean_gittatributes: Path to the reference `.gitattributes` file the repo should match.
+        branch: Branch to clone and push to.
+
+    Raises:
+        FileNotFoundError: If the video file of an episode cannot be found in any `videos*` directory.
+        subprocess.CalledProcessError: If one of the git commands fails.
+    """
+    _lfs_clone(repo_id, work_dir, branch)
+
+    videos_moved = False
+    video_files = [str(f.relative_to(work_dir)) for f in work_dir.glob("videos*/*.mp4")]
+    if len(video_files) == 0:
+        video_files = [str(f.relative_to(work_dir)) for f in work_dir.glob("videos*/*/*/*.mp4")]
+        videos_moved = True  # Videos have already been moved
+
+    assert len(video_files) == total_episodes * len(video_keys)
+
+    lfs_untracked_videos = _get_lfs_untracked_videos(work_dir, video_files)
+
+    current_gittatributes = work_dir / ".gitattributes"
+    if not filecmp.cmp(current_gittatributes, clean_gittatributes, shallow=False):
+        fix_gitattributes(work_dir, current_gittatributes, clean_gittatributes)
+
+    if lfs_untracked_videos:
+        # Only the files that git lfs does not track yet need to be re-staged.
+        fix_lfs_video_files_tracking(work_dir, lfs_untracked_videos)
+
+    if videos_moved:
+        return
+
+    video_dirs = sorted(work_dir.glob("videos*/"))
+    for ep_chunk in range(total_chunks):
+        ep_chunk_start = DEFAULT_CHUNK_SIZE * ep_chunk
+        ep_chunk_end = min(DEFAULT_CHUNK_SIZE * (ep_chunk + 1), total_episodes)
+        for vid_key in video_keys:
+            chunk_dir = "/".join(DEFAULT_VIDEO_PATH.split("/")[:-1]).format(
+                episode_chunk=ep_chunk, video_key=vid_key
+            )
+            (work_dir / chunk_dir).mkdir(parents=True, exist_ok=True)
+
+            for ep_idx in range(ep_chunk_start, ep_chunk_end):
+                target_path = DEFAULT_VIDEO_PATH.format(
+                    episode_chunk=ep_chunk, video_key=vid_key, episode_index=ep_idx
+                )
+                video_file = V1_VIDEO_FILE.format(video_key=vid_key, episode_index=ep_idx)
+                if len(video_dirs) == 1:
+                    video_path = video_dirs[0] / video_file
+                else:
+                    # Several `videos*` directories: use the first one that holds this file.
+                    video_path = next(
+                        (vid_dir / video_file for vid_dir in video_dirs if (vid_dir / video_file).is_file()),
+                        None,
+                    )
+                    if video_path is None:
+                        # Fail loudly rather than renaming the path left over from a previous episode.
+                        raise FileNotFoundError(
+                            f"Video file '{video_file}' not found in any 'videos*' directory of {work_dir}"
+                        )
+
+                video_path.rename(work_dir / target_path)
+
+    commit_message = "Move video files into chunk subdirectories"
+    subprocess.run(["git", "add", "."], cwd=work_dir, check=True)
+    subprocess.run(["git", "commit", "-m", commit_message], cwd=work_dir, check=True)
+    subprocess.run(["git", "push"], cwd=work_dir, check=True)
+
+
+def fix_lfs_video_files_tracking(work_dir: Path, lfs_untracked_videos: list[str]) -> None:
+    """
+    HACK: Repair git lfs tracking of video files on repos where it was not properly set. The only way
+    to do so is to download the actual files and reupload them with lfs tracking.
+
+    The files are removed from the index and added back in batches of 100, then a single commit is
+    created and pushed.
+    """
+    batch_size = 100
+    for start in range(0, len(lfs_untracked_videos), batch_size):
+        batch = lfs_untracked_videos[start : start + batch_size]
+        try:
+            subprocess.run(["git", "rm", "--cached", *batch], cwd=work_dir, capture_output=True, check=True)
+        except subprocess.CalledProcessError as err:
+            # Best effort: report the failure and still try to add the files back.
+            print("git rm --cached ERROR:")
+            print(err.stderr)
+        subprocess.run(["git", "add", *batch], cwd=work_dir, check=True)
+
+    for git_args in (["commit", "-m", "Track video files with git lfs"], ["push"]):
+        subprocess.run(["git", *git_args], cwd=work_dir, check=True)
+
+
+def fix_gitattributes(work_dir: Path, current_gittatributes: Path, clean_gittatributes: Path) -> None:
+    """Overwrite the repo's `.gitattributes` with the reference file, then commit and push it."""
+    shutil.copyfile(clean_gittatributes, current_gittatributes)
+    git_steps = (
+        ["add", ".gitattributes"],
+        ["commit", "-m", "Fix .gitattributes"],
+        ["push"],
+    )
+    # `check=True` stops the sequence at the first failing git command.
+    for git_args in git_steps:
+        subprocess.run(["git", *git_args], cwd=work_dir, check=True)
+
+
+def _lfs_clone(repo_id: str, work_dir: Path, branch: str) -> None:
+    """Shallow-clone one branch of a hub dataset repo into `work_dir` without downloading LFS files.
+
+    Args:
+        repo_id: Hub dataset repo to clone.
+        work_dir: Existing directory used as the clone destination.
+        branch: Branch to clone (single branch, depth 1).
+
+    Raises:
+        subprocess.CalledProcessError: If `git lfs install` or `git clone` fails.
+    """
+    import os  # Local import: `os` is not imported at module level in this file.
+
+    subprocess.run(["git", "lfs", "install"], cwd=work_dir, check=True)
+    repo_url = f"https://huggingface.co/datasets/{repo_id}"
+    # Extend the current environment instead of replacing it: passing only GIT_LFS_SKIP_SMUDGE would
+    # drop PATH, HOME and the other variables git relies on.
+    env = {**os.environ, "GIT_LFS_SKIP_SMUDGE": "1"}  # Prevent downloading LFS files
+    subprocess.run(
+        ["git", "clone", "--branch", branch, "--single-branch", "--depth", "1", repo_url, str(work_dir)],
+        check=True,
+        env=env,
+    )
+
+
+def _get_lfs_untracked_videos(work_dir: Path, video_files: list[str]) -> list[str]:
+    """Return the entries of `video_files` that `git lfs ls-files -n` does not list in `work_dir`."""
+    ls_files = subprocess.run(
+        ["git", "lfs", "ls-files", "-n"],
+        cwd=work_dir,
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    # One tracked file name per output line.
+    tracked = set(ls_files.stdout.splitlines())
+    return [video for video in video_files if video not in tracked]
+
+
+def get_videos_info(repo_id: str, local_dir: Path, video_keys: list[str], branch: str) -> dict:
+    """Download the first episode's video of each key and collect its video info.
+
+    Args:
+        repo_id: Hub dataset repo to download from.
+        local_dir: Directory into which the video files are downloaded.
+        video_keys: Video feature keys to inspect.
+        branch: Revision of the repo to download from.
+
+    Returns:
+        A dict mapping each video key to the result of `get_video_info` for its first episode.
+    """
+    # Only the video of episode 0 (chunk 0) is inspected for each key.
+    first_episode_videos = [
+        DEFAULT_VIDEO_PATH.format(episode_chunk=0, video_key=key, episode_index=0) for key in video_keys
+    ]
+    HfApi().snapshot_download(
+        repo_id=repo_id,
+        repo_type="dataset",
+        local_dir=local_dir,
+        revision=branch,
+        allow_patterns=first_episode_videos,
+    )
+    return {
+        key: get_video_info(local_dir / rel_path)
+        for key, rel_path in zip(video_keys, first_episode_videos, strict=True)
+    }
+
+
+def convert_dataset(
+    repo_id: str,
+    local_dir: Path,
+    single_task: str | None = None,
+    tasks_path: Path | None = None,
+    tasks_col: Path | None = None,
+    robot_config: RobotConfig | None = None,
+    test_branch: str | None = None,
+    **card_kwargs,
+):
+    v1 = get_safe_version(repo_id, V16)
+    v1x_dir = local_dir / V16 / repo_id
+    v20_dir = local_dir / V20 / repo_id
+    v1x_dir.mkdir(parents=True, exist_ok=True)
+    v20_dir.mkdir(parents=True, exist_ok=True)
+
+    hub_api = HfApi()
+    hub_api.snapshot_download(
+        repo_id=repo_id, repo_type="dataset", revision=v1, local_dir=v1x_dir, ignore_patterns="videos*/"
+    )
+    branch = "main"
+    if test_branch:
+        branch = test_branch
+        create_branch(repo_id=repo_id, branch=test_branch, repo_type="dataset")
+
+    metadata_v1 = load_json(v1x_dir / V1_INFO_PATH)
+    dataset = datasets.load_dataset("parquet", data_dir=v1x_dir / "data", split="train")
+    features = get_features_from_hf_dataset(dataset, robot_config)
+    video_keys = [key for key, ft in features.items() if ft["dtype"] == "video"]
+
+    if single_task and "language_instruction" in dataset.column_names:
+        logging.warning(
+            "'single_task' provided but 'language_instruction' tasks_col found. Using 'language_instruction'.",
+        )
+        single_task = None
+        tasks_col = "language_instruction"
+
+    # Episodes & chunks
+    episode_indices = sorted(dataset.unique("episode_index"))
+    total_episodes = len(episode_indices)
+    assert episode_indices == list(range(total_episodes))
+    total_videos = total_episodes * len(video_keys)
+    total_chunks = total_episodes // DEFAULT_CHUNK_SIZE
+    if total_episodes % DEFAULT_CHUNK_SIZE != 0:
+        total_chunks += 1
+
+    # Tasks
+    if single_task:
+        tasks_by_episodes = dict.fromkeys(episode_indices, single_task)
+        dataset, tasks = add_task_index_by_episodes(dataset, tasks_by_episodes)
+        tasks_by_episodes = {ep_idx: [task] for ep_idx, task in tasks_by_episodes.items()}
+    elif tasks_path:
+        tasks_by_episodes = load_json(tasks_path)
+        tasks_by_episodes = {int(ep_idx): task for ep_idx, task in tasks_by_episodes.items()}
+        dataset, tasks = add_task_index_by_episodes(dataset, tasks_by_episodes)
+        tasks_by_episodes = {ep_idx: [task] for ep_idx, task in tasks_by_episodes.items()}
+    elif tasks_col:
+        dataset, tasks, tasks_by_episodes = add_task_index_from_tasks_col(dataset, tasks_col)
+    else:
+        raise ValueError
+
+    assert set(tasks) == {task for ep_tasks in tasks_by_episodes.values() for task in ep_tasks}
+    tasks = [{"task_index": task_idx, "task": task} for task_idx, task in enumerate(tasks)]
+    write_jsonlines(tasks, v20_dir / TASKS_PATH)
+    features["task_index"] = {
+        "dtype": "int64",
+        "shape": (1,),
+        "names": None,
+    }
+
+    # Videos
+    if video_keys:
+        assert metadata_v1.get("video", False)
+        dataset = dataset.remove_columns(video_keys)
+        clean_gitattr = Path(
+            hub_api.hf_hub_download(
+                repo_id=GITATTRIBUTES_REF, repo_type="dataset", local_dir=local_dir, filename=".gitattributes"
+            )
+        ).absolute()
+        with tempfile.TemporaryDirectory() as tmp_video_dir:
+            move_videos(
+                repo_id, video_keys, total_episodes, total_chunks, Path(tmp_video_dir), clean_gitattr, branch
+            )
+        videos_info = get_videos_info(repo_id, v1x_dir, video_keys=video_keys, branch=branch)
+        for key in video_keys:
+            features[key]["shape"] = (
+                videos_info[key].pop("video.height"),
+                videos_info[key].pop("video.width"),
+                videos_info[key].pop("video.channels"),
+            )
+            features[key]["video_info"] = videos_info[key]
+            assert math.isclose(videos_info[key]["video.fps"], metadata_v1["fps"], rel_tol=1e-3)
+            if "encoding" in metadata_v1:
+                assert videos_info[key]["video.pix_fmt"] == metadata_v1["encoding"]["pix_fmt"]
+    else:
+        assert metadata_v1.get("video", 0) == 0
+        videos_info = None
+
+    # Split data into 1 parquet file by episode
+    episode_lengths = split_parquet_by_episodes(dataset, total_episodes, total_chunks, v20_dir)
+
+    if robot_config is not None:
+        robot_type = robot_config.type
+        repo_tags = [robot_type]
+    else:
+        robot_type = "unknown"
+        repo_tags = None
+
+    # Episodes
+    episodes = [
+        {"episode_index": ep_idx, "tasks": tasks_by_episodes[ep_idx], "length": episode_lengths[ep_idx]}
+        for ep_idx in episode_indices
+    ]
+    write_jsonlines(episodes, v20_dir / EPISODES_PATH)
+
+    # Assemble metadata v2.0
+    metadata_v2_0 = {
+        "codebase_version": V20,
+        "robot_type": robot_type,
+        "total_episodes": total_episodes,
+        "total_frames": len(dataset),
+        "total_tasks": len(tasks),
+        "total_videos": total_videos,
+        "total_chunks": total_chunks,
+        "chunks_size": DEFAULT_CHUNK_SIZE,
+        "fps": metadata_v1["fps"],
+        "splits": {"train": f"0:{total_episodes}"},
+        "data_path": DEFAULT_PARQUET_PATH,
+        "video_path": DEFAULT_VIDEO_PATH if video_keys else None,
+        "features": features,
+    }
+    write_json(metadata_v2_0, v20_dir / INFO_PATH)
+    convert_stats_to_json(v1x_dir, v20_dir)
+    card = create_lerobot_dataset_card(tags=repo_tags, dataset_info=metadata_v2_0, **card_kwargs)
+
+    with contextlib.suppress(EntryNotFoundError, HfHubHTTPError):
+        hub_api.delete_folder(repo_id=repo_id, path_in_repo="data", repo_type="dataset", revision=branch)
+
+    with contextlib.suppress(EntryNotFoundError, HfHubHTTPError):
+        hub_api.delete_folder(repo_id=repo_id, path_in_repo="meta_data", repo_type="dataset", revision=branch)
+
+    with contextlib.suppress(EntryNotFoundError, HfHubHTTPError):
+        hub_api.delete_folder(repo_id=repo_id, path_in_repo="meta", repo_type="dataset", revision=branch)
+
+    hub_api.upload_folder(
+        repo_id=repo_id,
+        path_in_repo="data",
+        folder_path=v20_dir / "data",
+        repo_type="dataset",
+        revision=branch,
+    )
+    hub_api.upload_folder(
+        repo_id=repo_id,
+        path_in_repo="meta",
+        folder_path=v20_dir / "meta",
+        repo_type="dataset",
+        revision=branch,
+    )
+
+    card.push_to_hub(repo_id=repo_id, repo_type="dataset", revision=branch)
+
+    if not test_branch:
+        create_branch(repo_id=repo_id, branch=V20, repo_type="dataset")
+
+
+def make_robot_config(robot_type: str, **kwargs) -> RobotConfig:
+    """Instantiate the robot configuration registered under `robot_type`.
+
+    The config class is imported inside the matching branch, so only the module of the selected robot
+    is imported.
+
+    Args:
+        robot_type: Robot name. Handled values are "koch_follower", "so100_follower", "stretch" and
+            "lekiwi"; "aloha" is recognized but not implemented yet.
+        **kwargs: Passed through unchanged to the selected config class.
+
+    Returns:
+        The config object built for the requested robot.
+
+    Raises:
+        NotImplementedError: If `robot_type` is "aloha".
+        ValueError: If `robot_type` is not one of the names above.
+    """
+    if robot_type == "aloha":
+        raise NotImplementedError  # TODO
+
+    # Pick the config class first, build it once at the end.
+    if robot_type == "koch_follower":
+        from lerobot.robots.koch_follower import KochFollowerConfig as config_cls
+    elif robot_type == "so100_follower":
+        from lerobot.robots.so100_follower import SO100FollowerConfig as config_cls
+    elif robot_type == "stretch":
+        from lerobot.robots.stretch3 import Stretch3RobotConfig as config_cls
+    elif robot_type == "lekiwi":
+        from lerobot.robots.lekiwi import LeKiwiConfig as config_cls
+    else:
+        raise ValueError(f"Robot type '{robot_type}' is not available.")
+
+    return config_cls(**kwargs)
+
+
+def main():
+    """Parse the command-line arguments and run the dataset conversion.
+
+    Exactly one of `--single-task`, `--tasks-col` or `--tasks-path` must be given (they form a required,
+    mutually exclusive group). `--robot` is optional: when it is omitted, `robot_config=None` is passed
+    to `convert_dataset`. All remaining parsed options are forwarded to `convert_dataset` as keyword
+    arguments.
+    """
+    parser = argparse.ArgumentParser()
+    task_args = parser.add_mutually_exclusive_group(required=True)
+
+    parser.add_argument(
+        "--repo-id",
+        type=str,
+        required=True,
+        help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset (e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
+    )
+    task_args.add_argument(
+        "--single-task",
+        type=str,
+        help="A short but accurate description of the single task performed in the dataset.",
+    )
+    task_args.add_argument(
+        "--tasks-col",
+        type=str,
+        help="The name of the column containing language instructions",
+    )
+    task_args.add_argument(
+        "--tasks-path",
+        type=Path,
+        help="The path to a .json file containing one language instruction for each episode_index",
+    )
+    parser.add_argument(
+        "--robot",
+        type=str,
+        default=None,
+        help="Robot config used for the dataset during conversion (e.g. 'koch_follower', 'so100_follower', 'stretch', 'lekiwi').",
+    )
+    parser.add_argument(
+        "--local-dir",
+        type=Path,
+        default=None,
+        help="Local directory to store the dataset during conversion. Defaults to /tmp/lerobot_dataset_v2",
+    )
+    parser.add_argument(
+        "--license",
+        type=str,
+        default="apache-2.0",
+        help="Repo license. Must be one of https://huggingface.co/docs/hub/repositories-licenses. Defaults to apache-2.0.",
+    )
+    parser.add_argument(
+        "--test-branch",
+        type=str,
+        default=None,
+        help="Repo branch to test your conversion first (e.g. 'v2.0.test')",
+    )
+
+    args = parser.parse_args()
+    if not args.local_dir:
+        args.local_dir = Path("/tmp/lerobot_dataset_v2")
+
+    # Bind `robot_config` in every case so the call below never hits an unbound name when `--robot`
+    # is omitted.
+    robot_config = make_robot_config(args.robot) if args.robot is not None else None
+
+    # `robot` itself is not forwarded: it has been turned into `robot_config` above.
+    del args.robot
+
+    convert_dataset(**vars(args), robot_config=robot_config)
+
+
+# Run the conversion command-line interface only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/src/lerobot/datasets/v21/_remove_language_instruction.py
+++ b/src/lerobot/datasets/v21/_remove_language_instruction.py
@@ -0,0 +1,87 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import traceback
+from pathlib import Path
+
+from datasets import get_dataset_config_info
+from huggingface_hub import HfApi
+
+from lerobot import available_datasets
+from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
+from lerobot.datasets.utils import INFO_PATH, write_info
+from lerobot.datasets.v21.convert_dataset_v20_to_v21 import V20, SuppressWarnings
+
+# Directory in which `batch_fix` creates its log file.
+LOCAL_DIR = Path("data/")
+
+# Module-level Hugging Face Hub client used by `fix_dataset`.
+hub_api = HfApi()
+
+
+def fix_dataset(repo_id: str) -> str:
+    """Remove a stale 'language_instruction' feature from the v2.0 `info.json` of `repo_id`.
+
+    The non-video features declared in the dataset metadata are compared with the features of the
+    parquet data. When the metadata declares exactly one extra feature, 'language_instruction', it is
+    removed from the metadata, `info.json` is rewritten locally and uploaded to the `V20` revision as a
+    pull request.
+
+    Args:
+        repo_id: Hub identifier of the dataset to check.
+
+    Returns:
+        A one-line status: skipped (no `V20` revision or no difference) or success with the PR url.
+
+    Raises:
+        ValueError: If the parquet data has features that the metadata does not declare.
+        AssertionError: If the metadata-only features are anything else than {'language_instruction'}.
+    """
+    if not hub_api.revision_exists(repo_id, V20, repo_type="dataset"):
+        return f"{repo_id}: skipped (not in {V20})."
+
+    parquet_info = get_dataset_config_info(repo_id, "default")
+    with SuppressWarnings():
+        meta = LeRobotDatasetMetadata(repo_id, revision=V20, force_cache_sync=True)
+
+    # Video features are excluded from the metadata side of the comparison.
+    meta_features = {name for name, spec in meta.features.items() if spec["dtype"] != "video"}
+    parquet_features = set(parquet_info.features)
+
+    only_in_parquet = parquet_features - meta_features
+    only_in_meta = meta_features - parquet_features
+
+    if only_in_parquet:
+        raise ValueError(f"In parquet not in info.json: {only_in_parquet}")
+
+    if not only_in_meta:
+        return f"{repo_id}: skipped (no diff)"
+
+    # From here on the metadata declares at least one feature missing from the parquet files.
+    logging.warning(f"In info.json not in parquet: {only_in_meta}")
+    assert only_in_meta == {"language_instruction"}
+    meta.features.pop("language_instruction")
+    write_info(meta.info, meta.root)
+    commit_info = hub_api.upload_file(
+        path_or_fileobj=meta.root / INFO_PATH,
+        path_in_repo=INFO_PATH,
+        repo_id=repo_id,
+        repo_type="dataset",
+        revision=V20,
+        commit_message="Remove 'language_instruction'",
+        create_pr=True,
+    )
+    return f"{repo_id}: success - PR: {commit_info.pr_url}"
+
+
+def batch_fix():
+    """Run `fix_dataset` on every repo of `available_datasets` and record one status per repo.
+
+    Each status (or the traceback of a failure) is logged with `logging.info` and appended to
+    `fix_features_v20.txt` inside `LOCAL_DIR`. A failure on one repo does not stop the batch.
+    """
+    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
+    logfile = LOCAL_DIR / "fix_features_v20.txt"
+    # Count from 1 so that the progress reads "1/N" ... "N/N".
+    for num, repo_id in enumerate(available_datasets, start=1):
+        print(f"\nConverting {repo_id} ({num}/{len(available_datasets)})")
+        print("---------------------------------------------------------")
+        try:
+            status = fix_dataset(repo_id)
+        except Exception:
+            # Deliberate catch-all: record the failure and move on to the next repo.
+            status = f"{repo_id}: failed\n    {traceback.format_exc()}"
+
+        logging.info(status)
+        with open(logfile, "a") as file:
+            file.write(status + "\n")
+
+
+if __name__ == "__main__":
+    batch_fix()
--- a/src/lerobot/datasets/v21/batch_convert_dataset_v20_to_v21.py
+++ b/src/lerobot/datasets/v21/batch_convert_dataset_v20_to_v21.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script is for internal use to convert all datasets under the 'lerobot' hub user account to v2.1.
+"""
+
+import traceback
+from pathlib import Path
+
+from huggingface_hub import HfApi
+
+from lerobot import available_datasets
+from lerobot.datasets.v21.convert_dataset_v20_to_v21 import V21, convert_dataset
+
+# Directory in which `batch_convert` creates its conversion log.
+LOCAL_DIR = Path("data/")
+
+
+def batch_convert():
+    """Convert every repo of `available_datasets` to v2.1 and record one status per repo.
+
+    Repos that already have a `V21` revision on the hub are reported as already converted and left
+    untouched. Each status (or the traceback of a failure) is appended to `conversion_log_v21.txt`
+    inside `LOCAL_DIR`. A failure on one repo does not stop the batch.
+    """
+    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
+    logfile = LOCAL_DIR / "conversion_log_v21.txt"
+    hub_api = HfApi()
+    # Count from 1 so that the progress reads "1/N" ... "N/N".
+    for num, repo_id in enumerate(available_datasets, start=1):
+        print(f"\nConverting {repo_id} ({num}/{len(available_datasets)})")
+        print("---------------------------------------------------------")
+        try:
+            if hub_api.revision_exists(repo_id, V21, repo_type="dataset"):
+                status = f"{repo_id}: success (already in {V21})."
+            else:
+                convert_dataset(repo_id)
+                status = f"{repo_id}: success."
+        except Exception:
+            # Deliberate catch-all: record the failure and move on to the next repo.
+            status = f"{repo_id}: failed\n    {traceback.format_exc()}"
+
+        with open(logfile, "a") as file:
+            file.write(status + "\n")
+
+
+if __name__ == "__main__":
+    batch_convert()
--- a/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py
+++ b/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py
@@ -0,0 +1,114 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.0 to
+2.1. It will:
+
+- Generate per-episodes stats and writes them in `episodes_stats.jsonl`
+- Check consistency between these new stats and the old ones.
+- Remove the deprecated `stats.json`.
+- Update codebase_version in `info.json`.
+- Push this new version to the hub on the 'main' branch and tags it with "v2.1".
+
+Usage:
+
+```bash
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 \
+    --repo-id=aliberts/koch_tutorial
+```
+
+"""
+
+import argparse
+import logging
+
+from huggingface_hub import HfApi
+
+from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
+from lerobot.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info
+from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats
+
+# Dataset codebase version strings; `V20` is the hub revision loaded by `convert_dataset` below.
+V20 = "v2.0"
+V21 = "v2.1"
+
+
+class SuppressWarnings:
+    """Context manager that sets the root logger level to ERROR while it is active.
+
+    The effective level found on entry is stored in `previous_level` and set again on exit.
+    `__enter__` returns nothing, so `with SuppressWarnings() as x` binds `x` to None.
+    """
+
+    def __enter__(self):
+        root_logger = logging.getLogger()
+        self.previous_level = root_logger.getEffectiveLevel()
+        root_logger.setLevel(logging.ERROR)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        root_logger = logging.getLogger()
+        root_logger.setLevel(self.previous_level)
+
+
+def convert_dataset(
+    repo_id: str,
+    branch: str | None = None,
+    num_workers: int = 4,
+):
+    """Convert a dataset of the hub from codebase version v2.0 to `CODEBASE_VERSION`.
+
+    The dataset is loaded at the `V20` revision, per-episode stats are computed and checked against
+    the stats loaded from the local dataset root, `codebase_version` is updated in the dataset info, and
+    the `meta/` files are pushed to the hub. The deprecated stats file is then removed locally and on
+    the hub, and the hub revision is tagged with `CODEBASE_VERSION`.
+
+    Args:
+        repo_id: Hub identifier of the dataset to convert.
+        branch: Hub revision to push to, clean up and tag; passed as-is to the hub calls.
+        num_workers: Number of workers forwarded to `convert_stats`.
+    """
+    with SuppressWarnings():
+        dataset = LeRobotDataset(repo_id, revision=V20, force_cache_sync=True)
+
+    # Remove any per-episode stats file already present locally before recomputing.
+    if (dataset.root / EPISODES_STATS_PATH).is_file():
+        (dataset.root / EPISODES_STATS_PATH).unlink()
+
+    convert_stats(dataset, num_workers=num_workers)
+    ref_stats = load_stats(dataset.root)
+    check_aggregate_stats(dataset, ref_stats)
+
+    dataset.meta.info["codebase_version"] = CODEBASE_VERSION
+    write_info(dataset.meta.info, dataset.root)
+
+    dataset.push_to_hub(branch=branch, tag_version=False, allow_patterns="meta/")
+
+    # delete old stats.json file
+    # `is_file` has to be called: the bare bound method is always truthy, which would make `unlink`
+    # raise FileNotFoundError whenever the file is absent.
+    if (dataset.root / STATS_PATH).is_file():
+        (dataset.root / STATS_PATH).unlink()
+
+    hub_api = HfApi()
+    if hub_api.file_exists(
+        repo_id=dataset.repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset"
+    ):
+        hub_api.delete_file(
+            path_in_repo=STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
+        )
+
+    hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: every option maps one-to-one onto a `convert_dataset` keyword argument.
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--repo-id",
+        required=True,
+        type=str,
+        help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset "
+        "(e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
+    )
+    cli.add_argument(
+        "--branch",
+        default=None,
+        type=str,
+        help="Repo branch to push your dataset. Defaults to the main branch.",
+    )
+    cli.add_argument(
+        "--num-workers",
+        default=4,
+        type=int,
+        help="Number of workers for parallelizing stats compute. Defaults to 4.",
+    )
+
+    cli_options = vars(cli.parse_args())
+    convert_dataset(**cli_options)
--- a/src/lerobot/datasets/v21/convert_stats.py
+++ b/src/lerobot/datasets/v21/convert_stats.py
@@ -0,0 +1,99 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import numpy as np
+from tqdm import tqdm
+
+from lerobot.datasets.compute_stats import aggregate_stats, get_feature_stats, sample_indices
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.datasets.utils import write_episode_stats
+
+
+def sample_episode_video_frames(dataset: LeRobotDataset, episode_index: int, ft_key: str) -> np.ndarray:
+    """Return a sample of the frames of video feature `ft_key` for one episode, as a numpy array.
+
+    The indices to load come from `sample_indices`, called with the episode length stored in the
+    dataset metadata; the frames are then fetched through the dataset's private video query helpers.
+    """
+    episode_length = dataset.meta.episodes[episode_index]["length"]
+    requested = {ft_key: sample_indices(episode_length)}
+    timestamps_by_key = dataset._get_query_timestamps(0.0, requested)
+    frames_by_key = dataset._query_videos(timestamps_by_key, episode_index)
+    frames = frames_by_key[ft_key]
+    return frames.numpy()
+
+
+def convert_episode_stats(dataset: LeRobotDataset, ep_idx: int):
+    """Compute the stats of every feature for episode `ep_idx` and store them on the dataset metadata.
+
+    Video features are computed on sampled frames only; every other feature uses all the rows of the
+    episode. Image and video features are reduced over axes (0, 2, 3) with kept dimensions, and the
+    leading axis of each resulting stat (except "count") is squeezed afterwards. Other features are
+    reduced over axis 0. The result is written to `dataset.meta.episodes_stats[ep_idx]`.
+    """
+    first_row = dataset.episode_data_index["from"][ep_idx]
+    last_row = dataset.episode_data_index["to"][ep_idx]
+    episode_rows = dataset.hf_dataset.select(range(first_row, last_row))
+
+    visual_dtypes = ["image", "video"]
+    stats_by_feature = {}
+    for name, spec in dataset.features.items():
+        is_visual = spec["dtype"] in visual_dtypes
+
+        # We sample only for videos
+        if spec["dtype"] == "video":
+            values = sample_episode_video_frames(dataset, ep_idx, name)
+        else:
+            values = np.array(episode_rows[name])
+
+        if is_visual:
+            reduce_axes, keep = (0, 2, 3), True
+        else:
+            reduce_axes, keep = 0, values.ndim == 1
+
+        feature_stats = get_feature_stats(values, axis=reduce_axes, keepdims=keep)
+        if is_visual:  # remove batch dim
+            feature_stats = {
+                stat: arr if stat == "count" else np.squeeze(arr, axis=0)
+                for stat, arr in feature_stats.items()
+            }
+        stats_by_feature[name] = feature_stats
+
+    dataset.meta.episodes_stats[ep_idx] = stats_by_feature
+
+
+def convert_stats(dataset: LeRobotDataset, num_workers: int = 0):
+    """Compute the per-episode stats of the whole dataset, then write them out episode by episode.
+
+    Args:
+        dataset: Dataset to process; `dataset.episodes` must be None (asserted).
+        num_workers: When positive, episodes are processed in a thread pool of that size; otherwise
+            they are processed one after the other in the calling thread.
+    """
+    assert dataset.episodes is None
+    print("Computing episodes stats")
+    n_episodes = dataset.meta.total_episodes
+    if num_workers > 0:
+        with ThreadPoolExecutor(max_workers=num_workers) as pool:
+            pending = [pool.submit(convert_episode_stats, dataset, ep_idx) for ep_idx in range(n_episodes)]
+            for finished in tqdm(as_completed(pending), total=n_episodes):
+                # `result()` re-raises any exception raised inside the worker.
+                finished.result()
+    else:
+        for ep_idx in tqdm(range(n_episodes)):
+            convert_episode_stats(dataset, ep_idx)
+
+    # Writing happens only once every episode has been computed.
+    for ep_idx in tqdm(range(n_episodes)):
+        write_episode_stats(ep_idx, dataset.meta.episodes_stats[ep_idx], dataset.root)
+
+
+def check_aggregate_stats(
+    dataset: LeRobotDataset,
+    reference_stats: dict[str, dict[str, np.ndarray]],
+    video_rtol_atol: tuple[float, float] = (1e-2, 1e-2),
+    default_rtol_atol: tuple[float, float] = (5e-6, 6e-5),
+):
+    """Verifies that the aggregated stats from episodes_stats are close to reference stats.
+
+    Args:
+        dataset: Dataset whose `meta.episodes_stats` are aggregated and checked.
+        reference_stats: Expected stats, indexed by feature name then by stat name.
+        video_rtol_atol: `(rtol, atol)` pair used for features whose dtype is "video".
+        default_rtol_atol: `(rtol, atol)` pair used for every other feature.
+
+    Raises:
+        AssertionError: Raised by `np.testing.assert_allclose` when an aggregated stat is not within
+            tolerance of its reference value.
+    """
+    agg_stats = aggregate_stats(list(dataset.meta.episodes_stats.values()))
+    for key, ft in dataset.features.items():
+        # These values might need some fine-tuning
+        if ft["dtype"] == "video":
+            # to account for image sub-sampling
+            rtol, atol = video_rtol_atol
+        else:
+            rtol, atol = default_rtol_atol
+
+        for stat, val in agg_stats[key].items():
+            # Only stats that also exist in the reference are compared; the others are skipped.
+            if key in reference_stats and stat in reference_stats[key]:
+                err_msg = f"feature='{key}' stats='{stat}'"
+                np.testing.assert_allclose(
+                    val, reference_stats[key][stat], rtol=rtol, atol=atol, err_msg=err_msg
+                )
--- a/src/lerobot/datasets/video_utils.py
+++ b/src/lerobot/datasets/video_utils.py
@@ -0,0 +1,453 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import importlib
+import logging
+import warnings
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, ClassVar
+
+import av
+import pyarrow as pa
+import torch
+import torchvision
+from datasets.features.features import register_feature
+from PIL import Image
+
+
+def get_safe_default_codec():
+    """Return the default video decoding backend name for this environment.
+
+    Returns:
+        "torchcodec" when a module spec for `torchcodec` can be found, otherwise "pyav" (after logging
+        a warning about the fallback).
+    """
+    # `import importlib` alone does not guarantee that the `importlib.util` submodule is loaded, so
+    # import it explicitly before using `find_spec`.
+    import importlib.util
+
+    if importlib.util.find_spec("torchcodec"):
+        return "torchcodec"
+    else:
+        logging.warning(
+            "'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder"
+        )
+        return "pyav"
+
+
+def decode_video_frames(
+    video_path: Path | str,
+    timestamps: list[float],
+    tolerance_s: float,
+    backend: str | None = None,
+) -> torch.Tensor:
+    """
+    Dispatches video frame decoding to the implementation matching `backend`.
+
+    Args:
+        video_path (Path): Path to the video file.
+        timestamps (list[float]): List of timestamps to extract frames.
+        tolerance_s (float): Allowed deviation in seconds for frame retrieval.
+        backend (str, optional): "torchcodec", "pyav" or "video_reader". When None, the value returned
+            by `get_safe_default_codec()` is used.
+
+    Returns:
+        torch.Tensor: Decoded frames.
+
+    Raises:
+        ValueError: If `backend` is not one of the supported names.
+    """
+    chosen = get_safe_default_codec() if backend is None else backend
+
+    if chosen == "torchcodec":
+        return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s)
+    if chosen in ["pyav", "video_reader"]:
+        return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, chosen)
+    raise ValueError(f"Unsupported video backend: {chosen}")
+
+
+def decode_video_frames_torchvision(
+    video_path: Path | str,
+    timestamps: list[float],
+    tolerance_s: float,
+    backend: str = "pyav",
+    log_loaded_timestamps: bool = False,
+) -> torch.Tensor:
+    """Loads frames associated to the requested timestamps of a video
+
+    The backend can be either "pyav" (default) or "video_reader".
+    "video_reader" requires installing torchvision from source, see:
+    https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
+    (note that you need to compile against ffmpeg<4.3)
+
+    While both use cpu, "video_reader" is supposedly faster than "pyav" but requires additional setup.
+    For more info on video decoding, see `benchmark/video/README.md`
+
+    See torchvision doc for more info on these two backends:
+    https://pytorch.org/vision/0.18/index.html?highlight=backend#torchvision.set_video_backend
+
+    Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
+    the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to
+    that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame,
+    and all subsequent frames until reaching the requested frame. The number of key frames in a video
+    can be adjusted during encoding to take into account decoding time and video size in bytes.
+
+    Args:
+        video_path: Path to the video file; converted to `str` before being given to the reader.
+        timestamps: Timestamps of the frames to retrieve.
+        tolerance_s: A query is accepted only if the closest loaded frame is strictly less than this
+            many seconds away from it.
+        backend: Name passed to `torchvision.set_video_backend`.
+        log_loaded_timestamps: When True, logs the timestamp of every loaded frame and of the frames
+            finally selected.
+
+    Returns:
+        One frame per entry of `timestamps`, in the same order, as float32 values divided by 255.
+
+    Raises:
+        AssertionError: If at least one query timestamp has no loaded frame within `tolerance_s`.
+    """
+    video_path = str(video_path)
+
+    # set backend
+    keyframes_only = False
+    torchvision.set_video_backend(backend)
+    if backend == "pyav":
+        keyframes_only = True  # pyav doesn't support accurate seek
+
+    # set a video stream reader
+    # TODO(rcadene): also load audio stream at the same time
+    reader = torchvision.io.VideoReader(video_path, "video")
+
+    # set the first and last requested timestamps
+    # Note: previous timestamps are usually loaded, since we need to access the previous key frame
+    first_ts = min(timestamps)
+    last_ts = max(timestamps)
+
+    # access closest key frame of the first requested frame
+    # Note: closest key frame timestamp is usually smaller than `first_ts` (e.g. key frame can be the first frame of the video)
+    # for details on what `seek` is doing see: https://pyav.basswood-io.com/docs/stable/api/container.html?highlight=inputcontainer#av.container.InputContainer.seek
+    reader.seek(first_ts, keyframes_only=keyframes_only)
+
+    # load all frames until last requested frame
+    loaded_frames = []
+    loaded_ts = []
+    for frame in reader:
+        current_ts = frame["pts"]
+        if log_loaded_timestamps:
+            logging.info(f"frame loaded at timestamp={current_ts:.4f}")
+        loaded_frames.append(frame["data"])
+        loaded_ts.append(current_ts)
+        # stop at the first frame at or after the last requested timestamp
+        if current_ts >= last_ts:
+            break
+
+    # the container is closed explicitly for the pyav backend only
+    if backend == "pyav":
+        reader.container.close()
+
+    # drop the reference to the reader once all the needed frames are loaded
+    reader = None
+
+    query_ts = torch.tensor(timestamps)
+    loaded_ts = torch.tensor(loaded_ts)
+
+    # compute distances between each query timestamp and timestamps of all loaded frames
+    dist = torch.cdist(query_ts[:, None], loaded_ts[:, None], p=1)
+    # for each query: distance to, and index of, the closest loaded frame
+    min_, argmin_ = dist.min(1)
+
+    is_within_tol = min_ < tolerance_s
+    assert is_within_tol.all(), (
+        f"One or several query timestamps unexpectedly violate the tolerance ({min_[~is_within_tol]} > {tolerance_s=})."
+        "It means that the closest frame that can be loaded from the video is too far away in time."
+        "This might be due to synchronization issues with timestamps during data collection."
+        "To be safe, we advise to ignore this item during training."
+        f"\nqueried timestamps: {query_ts}"
+        f"\nloaded timestamps: {loaded_ts}"
+        f"\nvideo: {video_path}"
+        f"\nbackend: {backend}"
+    )
+
+    # get closest frames to the query timestamps
+    closest_frames = torch.stack([loaded_frames[idx] for idx in argmin_])
+    closest_ts = loaded_ts[argmin_]
+
+    if log_loaded_timestamps:
+        logging.info(f"{closest_ts=}")
+
+    # convert to the pytorch format which is float32 in [0,1] range (and channel first)
+    closest_frames = closest_frames.type(torch.float32) / 255
+
+    assert len(timestamps) == len(closest_frames)
+    return closest_frames
+
+
+def decode_video_frames_torchcodec(
+    video_path: Path | str,
+    timestamps: list[float],
+    tolerance_s: float,
+    device: str = "cpu",
+    log_loaded_timestamps: bool = False,
+) -> torch.Tensor:
+    """Loads frames associated with the requested timestamps of a video using torchcodec.
+
+    Note: Setting device="cuda" outside the main process, e.g. in data loader workers, will lead to CUDA initialization errors.
+
+    Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
+    the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to
+    that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame,
+    and all subsequent frames until reaching the requested frame. The number of key frames in a video
+    can be adjusted during encoding to take into account decoding time and video size in bytes.
+
+    Args:
+        video_path: Path to the video file, handed to the torchcodec `VideoDecoder`.
+        timestamps: Timestamps of the frames to retrieve. Each one is converted to a frame index with
+            `round(ts * average_fps)`, using the average fps reported by the decoder metadata.
+        tolerance_s: A query is accepted only if the closest loaded frame is strictly less than this
+            many seconds away from it.
+        device: Device passed to the decoder.
+        log_loaded_timestamps: When True, logs the timestamp of every loaded frame and of the frames
+            finally selected.
+
+    Returns:
+        One frame per entry of `timestamps`, in the same order, as float32 values divided by 255.
+
+    Raises:
+        ImportError: If `torchcodec` cannot be imported.
+        AssertionError: If at least one query timestamp has no loaded frame within `tolerance_s`.
+    """
+
+    # Import directly instead of probing with `importlib.util.find_spec`: the file only does
+    # `import importlib`, which does not guarantee that the `importlib.util` submodule is loaded.
+    try:
+        from torchcodec.decoders import VideoDecoder
+    except ImportError as err:
+        raise ImportError("torchcodec is required but not available.") from err
+
+    # initialize video decoder
+    decoder = VideoDecoder(video_path, device=device, seek_mode="approximate")
+    loaded_frames = []
+    loaded_ts = []
+    # get metadata for frame information
+    metadata = decoder.metadata
+    average_fps = metadata.average_fps
+
+    # convert timestamps to frame indices
+    frame_indices = [round(ts * average_fps) for ts in timestamps]
+
+    # retrieve frames based on indices
+    frames_batch = decoder.get_frames_at(indices=frame_indices)
+
+    for frame, pts in zip(frames_batch.data, frames_batch.pts_seconds, strict=False):
+        loaded_frames.append(frame)
+        loaded_ts.append(pts.item())
+        if log_loaded_timestamps:
+            logging.info(f"Frame loaded at timestamp={pts:.4f}")
+
+    query_ts = torch.tensor(timestamps)
+    loaded_ts = torch.tensor(loaded_ts)
+
+    # compute distances between each query timestamp and loaded timestamps
+    dist = torch.cdist(query_ts[:, None], loaded_ts[:, None], p=1)
+    # for each query: distance to, and index of, the closest loaded frame
+    min_, argmin_ = dist.min(1)
+
+    is_within_tol = min_ < tolerance_s
+    assert is_within_tol.all(), (
+        f"One or several query timestamps unexpectedly violate the tolerance ({min_[~is_within_tol]} > {tolerance_s=})."
+        "It means that the closest frame that can be loaded from the video is too far away in time."
+        "This might be due to synchronization issues with timestamps during data collection."
+        "To be safe, we advise to ignore this item during training."
+        f"\nqueried timestamps: {query_ts}"
+        f"\nloaded timestamps: {loaded_ts}"
+        f"\nvideo: {video_path}"
+    )
+
+    # get closest frames to the query timestamps
+    closest_frames = torch.stack([loaded_frames[idx] for idx in argmin_])
+    closest_ts = loaded_ts[argmin_]
+
+    if log_loaded_timestamps:
+        logging.info(f"{closest_ts=}")
+
+    # convert to float32 in [0,1] range (channel first)
+    closest_frames = closest_frames.type(torch.float32) / 255
+
+    assert len(timestamps) == len(closest_frames)
+    return closest_frames
+
+
+def encode_video_frames(
+    imgs_dir: Path | str,
+    video_path: Path | str,
+    fps: int,
+    vcodec: str = "libsvtav1",
+    pix_fmt: str = "yuv420p",
+    g: int | None = 2,
+    crf: int | None = 30,
+    fast_decode: int = 0,
+    log_level: int | None = av.logging.ERROR,
+    overwrite: bool = False,
+) -> None:
+    """Encode the `frame_NNNNNN.png` images of a directory into a video file.
+
+    More info on ffmpeg arguments tuning on `benchmark/video/README.md`
+
+    Args:
+        imgs_dir: Directory holding the input frames, named `frame_` + 6 digits + `.png`. Frames are
+            encoded in increasing order of that number.
+        video_path: Output video file.
+        fps: Frame rate given to the output stream.
+        vcodec: Video codec; one of "h264", "hevc" or "libsvtav1".
+        pix_fmt: Pixel format of the output stream. "yuv444p" is replaced by "yuv420p" (with a
+            warning) for "libsvtav1" and "hevc".
+        g: Value of the "g" codec option; the option is omitted when None.
+        crf: Value of the "crf" codec option; the option is omitted when None.
+        fast_decode: When non-zero, adds "svtav1-params"="fast-decode=<value>" for "libsvtav1", or
+            "tune"="fastdecode" for the other codecs.
+        log_level: Level set on the "libav" Python logger; left untouched when None.
+        overwrite: Passed as `exist_ok` when creating the parent directory of `video_path`.
+
+    Raises:
+        ValueError: If `vcodec` is not supported.
+        FileExistsError: If the parent directory of `video_path` already exists and `overwrite` is False.
+        FileNotFoundError: If no input frame is found in `imgs_dir`.
+        OSError: If `video_path` does not exist once encoding is done.
+    """
+    # Check encoder availability
+    if vcodec not in ["h264", "hevc", "libsvtav1"]:
+        raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")
+
+    video_path = Path(video_path)
+    imgs_dir = Path(imgs_dir)
+
+    # NOTE(review): `overwrite` guards the parent *directory* here, not the video file itself - confirm
+    # that this is the intended meaning.
+    video_path.parent.mkdir(parents=True, exist_ok=overwrite)
+
+    # Encoders/pixel formats incompatibility check
+    if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
+        logging.warning(
+            f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
+        )
+        pix_fmt = "yuv420p"
+
+    # Get input frames
+    template = "frame_" + ("[0-9]" * 6) + ".png"
+    input_list = sorted(
+        glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0])
+    )
+
+    # Define video output frame size (assuming all input frames are the same size)
+    if len(input_list) == 0:
+        raise FileNotFoundError(f"No images found in {imgs_dir}.")
+    # Open the first frame only to read its size, and close it right away.
+    with Image.open(input_list[0]) as dummy_image:
+        width, height = dummy_image.size
+
+    # Define video codec options
+    video_options = {}
+
+    if g is not None:
+        video_options["g"] = str(g)
+
+    if crf is not None:
+        video_options["crf"] = str(crf)
+
+    if fast_decode:
+        key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
+        value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
+        video_options[key] = value
+
+    # Set logging level
+    if log_level is not None:
+        # "While less efficient, it is generally preferable to modify logging with Python’s logging"
+        logging.getLogger("libav").setLevel(log_level)
+
+    # Create and open output file (overwrite by default)
+    with av.open(str(video_path), "w") as output:
+        output_stream = output.add_stream(vcodec, fps, options=video_options)
+        output_stream.pix_fmt = pix_fmt
+        output_stream.width = width
+        output_stream.height = height
+
+        # Loop through input frames and encode them
+        for input_data in input_list:
+            with Image.open(input_data) as raw_image:
+                input_image = raw_image.convert("RGB")
+            input_frame = av.VideoFrame.from_image(input_image)
+            packet = output_stream.encode(input_frame)
+            if packet:
+                output.mux(packet)
+
+        # Flush the encoder
+        packet = output_stream.encode()
+        if packet:
+            output.mux(packet)
+
+    # Reset logging level
+    # NOTE(review): the level set on the "libav" Python logger above is not restored here - confirm
+    # whether that is intended.
+    if log_level is not None:
+        av.logging.restore_default_callback()
+
+    if not video_path.exists():
+        raise OSError(f"Video encoding did not work. File not found: {video_path}.")
+
+
+@dataclass
+class VideoFrame:
+    # TODO(rcadene, lhoestq): move to Hugging Face `datasets` repo
+    """
+    Provides a type for a dataset containing video frames.
+
+    Each value is described by a struct with two fields: `path` (string) and `timestamp` (float32).
+    Calling an instance returns that struct type.
+
+    Example:
+
+    ```python
+    data_dict = [{"image": {"path": "videos/episode_0.mp4", "timestamp": 0.3}}]
+    features = {"image": VideoFrame()}
+    Dataset.from_dict(data_dict, features=Features(features))
+    ```
+    """
+
+    # Storage type built with `pa.struct`; declared as a ClassVar, so it is not a dataclass field.
+    pa_type: ClassVar[Any] = pa.struct({"path": pa.string(), "timestamp": pa.float32()})
+    # Type tag fixed to "VideoFrame"; kept out of `__init__` and out of the generated `repr`.
+    _type: str = field(default="VideoFrame", init=False, repr=False)
+
+    def __call__(self):
+        # Expose the struct type declared on the class.
+        return self.pa_type
+
+
+# Runs at import time. `catch_warnings` restores the previous warning filters when the block exits,
+# so the "ignore" rule below only applies to the `register_feature` call.
+with warnings.catch_warnings():
+    # Ignore the UserWarning whose message starts with the text below (the argument is a regex).
+    warnings.filterwarnings(
+        "ignore",
+        "'register_feature' is experimental and might be subject to breaking changes in the future.",
+        category=UserWarning,
+    )
+    # to make VideoFrame available in HuggingFace `datasets`
+    register_feature(VideoFrame, "VideoFrame")
+
+
+def get_audio_info(video_path: Path | str) -> dict:
+    """Describe the first audio stream of the media file at `video_path`.
+
+    Args:
+        video_path: Path of the file to inspect.
+
+    Returns:
+        `{"has_audio": False}` when the file holds no audio stream. Otherwise a dict with the
+        "audio.channels", "audio.codec", "audio.bit_rate", "audio.sample_rate", "audio.bit_depth"
+        and "audio.channel_layout" entries, followed by "has_audio" set to True.
+    """
+    # Set logging level
+    logging.getLogger("libav").setLevel(av.logging.ERROR)
+
+    try:
+        # Getting audio stream information
+        with av.open(str(video_path), "r") as audio_file:
+            try:
+                audio_stream = audio_file.streams.audio[0]
+            except IndexError:
+                # No audio stream in this file
+                return {"has_audio": False}
+
+            return {
+                "audio.channels": audio_stream.channels,
+                "audio.codec": audio_stream.codec.canonical_name,
+                # In an ideal lossless case: bit depth x sample rate x channels = bit rate.
+                # In an actual compressed case, the bit rate is set according to the compression
+                # level: the lower the bit rate, the more compression is applied.
+                "audio.bit_rate": audio_stream.bit_rate,
+                "audio.sample_rate": audio_stream.sample_rate,  # Number of samples per second
+                # In an ideal lossless case: fixed number of bits per sample.
+                # In an actual compressed case: variable number of bits per sample (often reduced
+                # to match a given depth rate).
+                "audio.bit_depth": audio_stream.format.bits,
+                "audio.channel_layout": audio_stream.layout.name,
+                "has_audio": True,
+            }
+    finally:
+        # Reset logging level on every exit path, including when opening or reading the file fails
+        av.logging.restore_default_callback()
+
+
+def get_video_info(video_path: Path | str) -> dict:
+    """Describe the first video stream of the file at `video_path`, plus its audio stream.
+
+    Args:
+        video_path: Path of the file to inspect.
+
+    Returns:
+        An empty dict when the file holds no video stream. Otherwise a dict with the "video.height",
+        "video.width", "video.codec", "video.pix_fmt", "video.is_depth_map" (always False),
+        "video.fps" and "video.channels" entries, extended with the result of `get_audio_info`.
+
+    Raises:
+        ValueError: If the channel count cannot be derived from the stream pixel format.
+    """
+    # Set logging level
+    logging.getLogger("libav").setLevel(av.logging.ERROR)
+
+    try:
+        # Getting video stream information
+        with av.open(str(video_path), "r") as video_file:
+            try:
+                video_stream = video_file.streams.video[0]
+            except IndexError:
+                # No video stream in this file
+                return {}
+
+            video_info = {
+                "video.height": video_stream.height,
+                "video.width": video_stream.width,
+                "video.codec": video_stream.codec.canonical_name,
+                "video.pix_fmt": video_stream.pix_fmt,
+                "video.is_depth_map": False,
+                # Calculate fps from r_frame_rate
+                # NOTE(review): int() truncates, so a fractional rate such as 30000/1001 is stored as 29.
+                "video.fps": int(video_stream.base_rate),
+                "video.channels": get_video_pixel_channels(video_stream.pix_fmt),
+            }
+    finally:
+        # Reset logging level on every exit path, including when the pixel format is not recognized
+        av.logging.restore_default_callback()
+
+    # Adding audio stream information
+    video_info.update(get_audio_info(video_path))
+
+    return video_info
+
+
+def get_video_pixel_channels(pix_fmt: str) -> int:
+    """Return the number of channels of a video pixel format, inferred from its name.
+
+    Args:
+        pix_fmt: Pixel format name, e.g. "yuv420p", "rgb24" or "gray16le".
+
+    Returns:
+        1 for gray/depth/monochrome formats, 4 for RGB/BGR/YUV formats carrying an alpha channel,
+        3 for plain RGB/BGR/YUV formats.
+
+    Raises:
+        ValueError: If the name matches none of the known families.
+    """
+    if "gray" in pix_fmt or "depth" in pix_fmt or "monochrome" in pix_fmt:
+        return 1
+    # Alpha variants are tested first: names such as "argb" or "bgra" also contain "rgb"/"bgr"
+    # and would otherwise be reported as 3 channels.
+    elif any(tag in pix_fmt for tag in ("rgba", "argb", "bgra", "abgr", "yuva")):
+        return 4
+    elif any(tag in pix_fmt for tag in ("rgb", "bgr", "yuv")):
+        return 3
+    else:
+        raise ValueError(f"Unknown format: {pix_fmt}")
+
+
+def get_image_pixel_channels(image: Image.Image) -> int:
+    """Return the number of channels of an image, based on its `mode`.
+
+    Args:
+        image: Image whose `mode` attribute is inspected.
+
+    Returns:
+        1 for "L", 2 for "LA", 3 for "RGB" and 4 for "RGBA".
+
+    Raises:
+        ValueError: If the image mode is none of the four above.
+    """
+    if image.mode == "L":
+        return 1  # Grayscale
+    elif image.mode == "LA":
+        return 2  # Grayscale + Alpha
+    elif image.mode == "RGB":
+        return 3  # RGB
+    elif image.mode == "RGBA":
+        return 4  # RGBA
+    else:
+        # Name the offending mode so the failure can be diagnosed from the message alone
+        raise ValueError(f"Unknown format: {image.mode}")