Add Streaming Dataset (#1613)
Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>
This commit is contained in:
committed by
GitHub
parent
f55c6e89f0
commit
33cad37054
116
examples/5_train_with_streaming.py
Normal file
116
examples/5_train_with_streaming.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""This script demonstrates how to train a Diffusion Policy on the PushT environment,
|
||||||
|
using a dataset processed in streaming mode.
|
||||||
|
|
||||||
|
Once you have trained a model with this script, you can try to evaluate it on
|
||||||
|
examples/2_evaluate_pretrained_policy.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from lerobot.configs.types import FeatureType
|
||||||
|
from lerobot.constants import ACTION
|
||||||
|
from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
|
||||||
|
from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset
|
||||||
|
from lerobot.datasets.utils import dataset_to_policy_features
|
||||||
|
from lerobot.policies.act.configuration_act import ACTConfig
|
||||||
|
from lerobot.policies.act.modeling_act import ACTPolicy
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Train an ACT policy on a large dataset streamed from the Hugging Face Hub.

    Streaming avoids downloading the (potentially multi-TB) dataset locally:
    frames are fetched iteratively and shuffled on the fly, then fed to a
    standard PyTorch DataLoader/optimizer loop. A checkpoint is saved at the end.
    """
    # Create a directory to store the training checkpoint.
    output_directory = Path("outputs/train/example_streaming_dataset")
    output_directory.mkdir(parents=True, exist_ok=True)

    # Selects the "best" device available
    device = (
        torch.device("cuda")
        if torch.cuda.is_available()
        else torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )
    print(f"Using device: {device}")

    training_steps = 10
    log_freq = 1

    dataset_id = (
        "aractingi/droid_1.0.1"  # 26M frames! Would require 4TB of disk space if installed locally (:
    )
    dataset_metadata = LeRobotDatasetMetadata(dataset_id)
    features = dataset_to_policy_features(dataset_metadata.features)
    output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
    input_features = {key: ft for key, ft in features.items() if key not in output_features}

    # We can now instantiate our policy with this config and the dataset stats.
    cfg = ACTConfig(input_features=input_features, output_features=output_features)
    policy = ACTPolicy(cfg, dataset_stats=dataset_metadata.stats)
    policy.train()
    policy.to(device)

    # Delta timestamps are used to (1) augment frames used during training and (2) supervise the policy.
    # Here, we use delta-timestamps to only provide ground truth actions for supervision.
    delta_timestamps = {
        ACTION: [t / dataset_metadata.fps for t in range(cfg.n_action_steps)],
    }

    # Instantiating the training dataset in streaming mode avoids loading all data into memory: frames
    # are fetched iteratively rather than loaded all at once. Retrieved frames are shuffled across epochs.
    dataset = StreamingLeRobotDataset(dataset_id, delta_timestamps=delta_timestamps, tolerance_s=1e-3)

    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=4,
        batch_size=16,
        pin_memory=device.type != "cpu",
        drop_last=True,
        prefetch_factor=2,  # loads batches with multiprocessing while policy trains
    )

    # Run training loop.
    step = 0
    done = False
    while not done:
        for batch in dataloader:
            # Cast non-boolean tensors to float32 and move every tensor to the training device
            # in a single pass over the batch (non-tensor entries pass through untouched).
            batch = {
                k: (
                    (v.type(torch.float32) if v.dtype != torch.bool else v).to(device)
                    if isinstance(v, torch.Tensor)
                    else v
                )
                for k, v in batch.items()
            }
            loss, _ = policy.forward(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if step % log_freq == 0:
                print(f"step: {step} loss: {loss.item():.3f}")
            step += 1
            if step >= training_steps:
                done = True
                break

    # Save a policy checkpoint.
    policy.save_pretrained(output_directory)
||||||
@@ -37,6 +37,7 @@ class DatasetConfig:
|
|||||||
revision: str | None = None
|
revision: str | None = None
|
||||||
use_imagenet_stats: bool = True
|
use_imagenet_stats: bool = True
|
||||||
video_backend: str = field(default_factory=get_safe_default_codec)
|
video_backend: str = field(default_factory=get_safe_default_codec)
|
||||||
|
streaming: bool = False
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -52,3 +52,8 @@ HF_LEROBOT_HOME = Path(os.getenv("HF_LEROBOT_HOME", default_cache_path)).expandu
|
|||||||
# calibration dir
|
# calibration dir
|
||||||
default_calibration_path = HF_LEROBOT_HOME / "calibration"
|
default_calibration_path = HF_LEROBOT_HOME / "calibration"
|
||||||
HF_LEROBOT_CALIBRATION = Path(os.getenv("HF_LEROBOT_CALIBRATION", default_calibration_path)).expanduser()
|
HF_LEROBOT_CALIBRATION = Path(os.getenv("HF_LEROBOT_CALIBRATION", default_calibration_path)).expanduser()
|
||||||
|
|
||||||
|
|
||||||
|
# streaming datasets
|
||||||
|
LOOKBACK_BACKTRACKTABLE = 100
|
||||||
|
LOOKAHEAD_BACKTRACKTABLE = 100
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ from lerobot.datasets.lerobot_dataset import (
|
|||||||
LeRobotDatasetMetadata,
|
LeRobotDatasetMetadata,
|
||||||
MultiLeRobotDataset,
|
MultiLeRobotDataset,
|
||||||
)
|
)
|
||||||
|
from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset
|
||||||
from lerobot.datasets.transforms import ImageTransforms
|
from lerobot.datasets.transforms import ImageTransforms
|
||||||
|
|
||||||
IMAGENET_STATS = {
|
IMAGENET_STATS = {
|
||||||
@@ -87,15 +88,26 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
|
|||||||
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
|
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
|
||||||
)
|
)
|
||||||
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
|
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
|
||||||
dataset = LeRobotDataset(
|
if not cfg.dataset.streaming:
|
||||||
cfg.dataset.repo_id,
|
dataset = LeRobotDataset(
|
||||||
root=cfg.dataset.root,
|
cfg.dataset.repo_id,
|
||||||
episodes=cfg.dataset.episodes,
|
root=cfg.dataset.root,
|
||||||
delta_timestamps=delta_timestamps,
|
episodes=cfg.dataset.episodes,
|
||||||
image_transforms=image_transforms,
|
delta_timestamps=delta_timestamps,
|
||||||
revision=cfg.dataset.revision,
|
image_transforms=image_transforms,
|
||||||
video_backend=cfg.dataset.video_backend,
|
revision=cfg.dataset.revision,
|
||||||
)
|
video_backend=cfg.dataset.video_backend,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
dataset = StreamingLeRobotDataset(
|
||||||
|
cfg.dataset.repo_id,
|
||||||
|
root=cfg.dataset.root,
|
||||||
|
episodes=cfg.dataset.episodes,
|
||||||
|
delta_timestamps=delta_timestamps,
|
||||||
|
image_transforms=image_transforms,
|
||||||
|
revision=cfg.dataset.revision,
|
||||||
|
max_num_shards=cfg.num_workers,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
|
raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
|
||||||
dataset = MultiLeRobotDataset(
|
dataset = MultiLeRobotDataset(
|
||||||
|
|||||||
@@ -129,6 +129,10 @@ class LeRobotDatasetMetadata:
|
|||||||
ignore_patterns=ignore_patterns,
|
ignore_patterns=ignore_patterns,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
def url_root(self) -> str:
    """Remote root of this dataset on the Hugging Face Hub, e.g. ``hf://datasets/<repo_id>``."""
    return f"hf://datasets/{self.repo_id}"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _version(self) -> packaging.version.Version:
|
def _version(self) -> packaging.version.Version:
|
||||||
"""Codebase version used to create this dataset."""
|
"""Codebase version used to create this dataset."""
|
||||||
|
|||||||
535
src/lerobot/datasets/streaming_dataset.py
Normal file
535
src/lerobot/datasets/streaming_dataset.py
Normal file
@@ -0,0 +1,535 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from collections.abc import Callable, Generator, Iterator
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import datasets
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from datasets import load_dataset
|
||||||
|
|
||||||
|
from lerobot.constants import HF_LEROBOT_HOME, LOOKAHEAD_BACKTRACKTABLE, LOOKBACK_BACKTRACKTABLE
|
||||||
|
from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDatasetMetadata
|
||||||
|
from lerobot.datasets.utils import (
|
||||||
|
Backtrackable,
|
||||||
|
LookAheadError,
|
||||||
|
LookBackError,
|
||||||
|
check_version_compatibility,
|
||||||
|
find_float_index,
|
||||||
|
get_delta_indices,
|
||||||
|
is_float_in_list,
|
||||||
|
item_to_torch,
|
||||||
|
safe_shard,
|
||||||
|
)
|
||||||
|
from lerobot.datasets.video_utils import (
|
||||||
|
VideoDecoderCache,
|
||||||
|
decode_video_frames_torchcodec,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||||
|
"""LeRobotDataset with streaming capabilities.
|
||||||
|
|
||||||
|
This class extends LeRobotDataset to add streaming functionality, allowing data to be streamed
|
||||||
|
rather than loaded entirely into memory. This is especially useful for large datasets that may
|
||||||
|
not fit in memory or when you want to quickly explore a dataset without downloading it completely.
|
||||||
|
|
||||||
|
The key innovation is using a Backtrackable iterator that maintains a bounded buffer of recent
|
||||||
|
items, allowing us to access previous frames for delta timestamps without loading the entire
|
||||||
|
dataset into memory.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
Basic usage:
|
||||||
|
```python
|
||||||
|
from lerobot.common.datasets.streaming_dataset import StreamingLeRobotDataset
|
||||||
|
|
||||||
|
# Create a streaming dataset with delta timestamps
|
||||||
|
delta_timestamps = {
|
||||||
|
"observation.image": [-1.0, -0.5, 0.0], # 1 sec ago, 0.5 sec ago, current
|
||||||
|
"action": [0.0, 0.1, 0.2], # current, 0.1 sec future, 0.2 sec future
|
||||||
|
}
|
||||||
|
|
||||||
|
dataset = StreamingLeRobotDataset(
|
||||||
|
repo_id="your-dataset-repo-id",
|
||||||
|
delta_timestamps=delta_timestamps,
|
||||||
|
streaming=True,
|
||||||
|
buffer_size=1000,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Iterate over the dataset
|
||||||
|
for i, item in enumerate(dataset):
|
||||||
|
print(f"Sample {i}: Episode {item['episode_index']} Frame {item['frame_index']}")
|
||||||
|
# item will contain stacked frames according to delta_timestamps
|
||||||
|
if i >= 10:
|
||||||
|
break
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
repo_id: str,
|
||||||
|
root: str | Path | None = None,
|
||||||
|
episodes: list[int] | None = None,
|
||||||
|
image_transforms: Callable | None = None,
|
||||||
|
delta_timestamps: dict[list[float]] | None = None,
|
||||||
|
tolerance_s: float = 1e-4,
|
||||||
|
revision: str | None = None,
|
||||||
|
force_cache_sync: bool = False,
|
||||||
|
streaming: bool = True,
|
||||||
|
buffer_size: int = 1000,
|
||||||
|
max_num_shards: int = 16,
|
||||||
|
seed: int = 42,
|
||||||
|
rng: np.random.Generator | None = None,
|
||||||
|
shuffle: bool = True,
|
||||||
|
):
|
||||||
|
"""Initialize a StreamingLeRobotDataset.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
repo_id (str): This is the repo id that will be used to fetch the dataset.
|
||||||
|
root (Path | None, optional): Local directory to use for downloading/writing files.
|
||||||
|
episodes (list[int] | None, optional): If specified, this will only load episodes specified by
|
||||||
|
their episode_index in this list.
|
||||||
|
image_transforms (Callable | None, optional): Transform to apply to image data.
|
||||||
|
tolerance_s (float, optional): Tolerance in seconds for timestamp matching.
|
||||||
|
revision (str, optional): Git revision id (branch name, tag, or commit hash).
|
||||||
|
force_cache_sync (bool, optional): Flag to sync and refresh local files first.
|
||||||
|
streaming (bool, optional): Whether to stream the dataset or load it all. Defaults to True.
|
||||||
|
buffer_size (int, optional): Buffer size for shuffling when streaming. Defaults to 1000.
|
||||||
|
max_num_shards (int, optional): Number of shards to re-shard the input dataset into. Defaults to 16.
|
||||||
|
seed (int, optional): Reproducibility random seed.
|
||||||
|
rng (np.random.Generator | None, optional): Random number generator.
|
||||||
|
shuffle (bool, optional): Whether to shuffle the dataset across exhaustions. Defaults to True.
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
self.repo_id = repo_id
|
||||||
|
self.root = Path(root) if root else HF_LEROBOT_HOME / repo_id
|
||||||
|
self.streaming_from_local = root is not None
|
||||||
|
|
||||||
|
self.image_transforms = image_transforms
|
||||||
|
self.episodes = episodes
|
||||||
|
self.tolerance_s = tolerance_s
|
||||||
|
self.revision = revision if revision else CODEBASE_VERSION
|
||||||
|
self.seed = seed
|
||||||
|
self.rng = rng if rng is not None else np.random.default_rng(seed)
|
||||||
|
self.shuffle = shuffle
|
||||||
|
|
||||||
|
self.streaming = streaming
|
||||||
|
self.buffer_size = buffer_size
|
||||||
|
|
||||||
|
# We cache the video decoders to avoid re-initializing them at each frame (avoiding a ~10x slowdown)
|
||||||
|
self.video_decoder_cache = None
|
||||||
|
|
||||||
|
self.root.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
|
# Load metadata
|
||||||
|
self.meta = LeRobotDatasetMetadata(
|
||||||
|
self.repo_id, self.root, self.revision, force_cache_sync=force_cache_sync
|
||||||
|
)
|
||||||
|
# Check version
|
||||||
|
check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION)
|
||||||
|
|
||||||
|
self.delta_timestamps = None
|
||||||
|
self.delta_indices = None
|
||||||
|
|
||||||
|
if delta_timestamps is not None:
|
||||||
|
self._validate_delta_timestamp_keys(delta_timestamps) # raises ValueError if invalid
|
||||||
|
self.delta_timestamps = delta_timestamps
|
||||||
|
self.delta_indices = get_delta_indices(self.delta_timestamps, self.fps)
|
||||||
|
|
||||||
|
self.hf_dataset: datasets.IterableDataset = load_dataset(
|
||||||
|
self.repo_id if not self.streaming_from_local else str(self.root),
|
||||||
|
split="train",
|
||||||
|
streaming=self.streaming,
|
||||||
|
data_files="data/*/*.parquet",
|
||||||
|
revision=self.revision,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.num_shards = min(self.hf_dataset.num_shards, max_num_shards)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def num_frames(self):
|
||||||
|
return self.meta.total_frames
|
||||||
|
|
||||||
|
@property
|
||||||
|
def num_episodes(self):
|
||||||
|
return self.meta.total_episodes
|
||||||
|
|
||||||
|
@property
|
||||||
|
def fps(self):
|
||||||
|
return self.meta.fps
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _iter_random_indices(
|
||||||
|
rng: np.random.Generator, buffer_size: int, random_batch_size=100
|
||||||
|
) -> Iterator[int]:
|
||||||
|
while True:
|
||||||
|
yield from (int(i) for i in rng.integers(0, buffer_size, size=random_batch_size))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _infinite_generator_over_elements(rng: np.random.Generator, elements: list[int]) -> Iterator[int]:
|
||||||
|
while True:
|
||||||
|
yield rng.choice(elements)
|
||||||
|
|
||||||
|
# TODO(fracapuano): Implement multi-threaded prefetching to accelerate data loading.
|
||||||
|
# The current sequential iteration is a bottleneck. A producer-consumer pattern
|
||||||
|
# could be used with a ThreadPoolExecutor to run `make_frame` (especially video decoding)
|
||||||
|
# in parallel, feeding a queue from which this iterator will yield processed items.
|
||||||
|
def __iter__(self) -> Iterator[dict[str, torch.Tensor]]:
|
||||||
|
if self.video_decoder_cache is None:
|
||||||
|
self.video_decoder_cache = VideoDecoderCache()
|
||||||
|
|
||||||
|
# keep the same seed across exhaustions if shuffle is False, otherwise shuffle data across exhaustions
|
||||||
|
rng = np.random.default_rng(self.seed) if not self.shuffle else self.rng
|
||||||
|
|
||||||
|
buffer_indices_generator = self._iter_random_indices(rng, self.buffer_size)
|
||||||
|
|
||||||
|
idx_to_backtrack_dataset = {
|
||||||
|
idx: self._make_backtrackable_dataset(safe_shard(self.hf_dataset, idx, self.num_shards))
|
||||||
|
for idx in range(self.num_shards)
|
||||||
|
}
|
||||||
|
|
||||||
|
# This buffer is populated while iterating on the dataset's shards
|
||||||
|
# the logic is to add 2 levels of randomness:
|
||||||
|
# (1) sample one shard at random from the ones available, and
|
||||||
|
# (2) sample one frame from the shard sampled at (1)
|
||||||
|
frames_buffer = []
|
||||||
|
while available_shards := list(idx_to_backtrack_dataset.keys()):
|
||||||
|
shard_key = next(self._infinite_generator_over_elements(rng, available_shards))
|
||||||
|
backtrack_dataset = idx_to_backtrack_dataset[shard_key] # selects which shard to iterate on
|
||||||
|
|
||||||
|
try:
|
||||||
|
for frame in self.make_frame(backtrack_dataset):
|
||||||
|
if len(frames_buffer) == self.buffer_size:
|
||||||
|
i = next(buffer_indices_generator) # samples a element from the buffer
|
||||||
|
yield frames_buffer[i]
|
||||||
|
frames_buffer[i] = frame
|
||||||
|
else:
|
||||||
|
frames_buffer.append(frame)
|
||||||
|
break # random shard sampled, switch shard
|
||||||
|
except (
|
||||||
|
RuntimeError,
|
||||||
|
StopIteration,
|
||||||
|
): # NOTE: StopIteration inside a generator throws a RuntimeError since python 3.7
|
||||||
|
del idx_to_backtrack_dataset[shard_key] # Remove exhausted shard, onto another shard
|
||||||
|
|
||||||
|
# Once shards are all exhausted, shuffle the buffer and yield the remaining frames
|
||||||
|
rng.shuffle(frames_buffer)
|
||||||
|
yield from frames_buffer
|
||||||
|
|
||||||
|
def _get_window_steps(
|
||||||
|
self, delta_timestamps: dict[str, list[float]] | None = None, dynamic_bounds: bool = False
|
||||||
|
) -> tuple[int, int]:
|
||||||
|
if delta_timestamps is None:
|
||||||
|
return 1, 1
|
||||||
|
|
||||||
|
if not dynamic_bounds:
|
||||||
|
# Fix the windows
|
||||||
|
lookback = LOOKBACK_BACKTRACKTABLE
|
||||||
|
lookahead = LOOKAHEAD_BACKTRACKTABLE
|
||||||
|
else:
|
||||||
|
# Dynamically adjust the windows based on the given delta_timesteps
|
||||||
|
all_timestamps = sum(delta_timestamps.values(), [])
|
||||||
|
lookback = min(all_timestamps) * self.fps
|
||||||
|
lookahead = max(all_timestamps) * self.fps
|
||||||
|
|
||||||
|
# When lookback is >=0 it means no negative timesteps have been provided
|
||||||
|
lookback = 0 if lookback >= 0 else (lookback * -1)
|
||||||
|
|
||||||
|
return lookback, lookahead
|
||||||
|
|
||||||
|
def _make_backtrackable_dataset(self, dataset: datasets.IterableDataset) -> Backtrackable:
|
||||||
|
lookback, lookahead = self._get_window_steps(self.delta_timestamps)
|
||||||
|
return Backtrackable(dataset, history=lookback, lookahead=lookahead)
|
||||||
|
|
||||||
|
def _make_timestamps_from_indices(
|
||||||
|
self, start_ts: float, indices: dict[str, list[int]] | None = None
|
||||||
|
) -> dict[str, list[float]]:
|
||||||
|
if indices is not None:
|
||||||
|
return {
|
||||||
|
key: (
|
||||||
|
start_ts + torch.tensor(indices[key]) / self.fps
|
||||||
|
).tolist() # NOTE: why not delta_timestamps directly?
|
||||||
|
for key in self.delta_timestamps
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return dict.fromkeys(self.meta.video_keys, [start_ts])
|
||||||
|
|
||||||
|
def _make_padding_camera_frame(self, camera_key: str):
|
||||||
|
"""Variable-shape padding frame for given camera keys, given in (H, W, C)"""
|
||||||
|
return torch.zeros(self.meta.info["features"][camera_key]["shape"]).permute(-1, 0, 1)
|
||||||
|
|
||||||
|
def _get_video_frame_padding_mask(
|
||||||
|
self,
|
||||||
|
video_frames: dict[str, torch.Tensor],
|
||||||
|
query_timestamps: dict[str, list[float]],
|
||||||
|
original_timestamps: dict[str, list[float]],
|
||||||
|
) -> dict[str, torch.BoolTensor]:
|
||||||
|
padding_mask = {}
|
||||||
|
|
||||||
|
for video_key, timestamps in original_timestamps.items():
|
||||||
|
if video_key not in video_frames:
|
||||||
|
continue # only padding on video keys that are available
|
||||||
|
frames = []
|
||||||
|
mask = []
|
||||||
|
padding_frame = self._make_padding_camera_frame(video_key)
|
||||||
|
for ts in timestamps:
|
||||||
|
if is_float_in_list(ts, query_timestamps[video_key]):
|
||||||
|
idx = find_float_index(ts, query_timestamps[video_key])
|
||||||
|
frames.append(video_frames[video_key][idx, :])
|
||||||
|
mask.append(False)
|
||||||
|
else:
|
||||||
|
frames.append(padding_frame)
|
||||||
|
mask.append(True)
|
||||||
|
|
||||||
|
padding_mask[f"{video_key}_is_pad"] = torch.BoolTensor(mask)
|
||||||
|
|
||||||
|
return padding_mask
|
||||||
|
|
||||||
|
def make_frame(
|
||||||
|
self, dataset_iterator: Backtrackable, previous_dataset_iterator: Backtrackable | None = None
|
||||||
|
) -> Generator:
|
||||||
|
"""Makes a frame starting from a dataset iterator"""
|
||||||
|
item = next(dataset_iterator)
|
||||||
|
item = item_to_torch(item)
|
||||||
|
|
||||||
|
updates = [] # list of "updates" to apply to the item retrieved from hf_dataset (w/o camera features)
|
||||||
|
|
||||||
|
# Get episode index from the item
|
||||||
|
ep_idx = item["episode_index"]
|
||||||
|
|
||||||
|
# "timestamp" restarts from 0 for each episode, whereas we need a global timestep within the single .mp4 file (given by index/fps)
|
||||||
|
current_ts = item["index"] / self.fps
|
||||||
|
|
||||||
|
episode_boundaries_ts = {
|
||||||
|
key: (
|
||||||
|
self.meta.episodes[ep_idx][f"videos/{key}/from_timestamp"],
|
||||||
|
self.meta.episodes[ep_idx][f"videos/{key}/to_timestamp"],
|
||||||
|
)
|
||||||
|
for key in self.meta.video_keys
|
||||||
|
}
|
||||||
|
|
||||||
|
# Apply delta querying logic if necessary
|
||||||
|
if self.delta_indices is not None:
|
||||||
|
query_result, padding = self._get_delta_frames(dataset_iterator, item)
|
||||||
|
updates.append(query_result)
|
||||||
|
updates.append(padding)
|
||||||
|
|
||||||
|
# Load video frames, when needed
|
||||||
|
if len(self.meta.video_keys) > 0:
|
||||||
|
original_timestamps = self._make_timestamps_from_indices(current_ts, self.delta_indices)
|
||||||
|
|
||||||
|
# Some timestamps might not result available considering the episode's boundaries
|
||||||
|
query_timestamps = self._get_query_timestamps(
|
||||||
|
current_ts, self.delta_indices, episode_boundaries_ts
|
||||||
|
)
|
||||||
|
video_frames = self._query_videos(query_timestamps, ep_idx)
|
||||||
|
|
||||||
|
if self.image_transforms is not None:
|
||||||
|
image_keys = self.meta.camera_keys
|
||||||
|
for cam in image_keys:
|
||||||
|
video_frames[cam] = self.image_transforms(video_frames[cam])
|
||||||
|
|
||||||
|
updates.append(video_frames)
|
||||||
|
|
||||||
|
if self.delta_indices is not None:
|
||||||
|
# We always return the same number of frames. Unavailable frames are padded.
|
||||||
|
padding_mask = self._get_video_frame_padding_mask(
|
||||||
|
video_frames, query_timestamps, original_timestamps
|
||||||
|
)
|
||||||
|
updates.append(padding_mask)
|
||||||
|
|
||||||
|
result = item.copy()
|
||||||
|
for update in updates:
|
||||||
|
result.update(update)
|
||||||
|
|
||||||
|
result["task"] = self.meta.tasks.iloc[item["task_index"]].name
|
||||||
|
|
||||||
|
yield result
|
||||||
|
|
||||||
|
def _get_query_timestamps(
|
||||||
|
self,
|
||||||
|
current_ts: float,
|
||||||
|
query_indices: dict[str, list[int]] | None = None,
|
||||||
|
episode_boundaries_ts: dict[str, tuple[float, float]] | None = None,
|
||||||
|
) -> dict[str, list[float]]:
|
||||||
|
query_timestamps = {}
|
||||||
|
keys_to_timestamps = self._make_timestamps_from_indices(current_ts, query_indices)
|
||||||
|
for key in self.meta.video_keys:
|
||||||
|
if query_indices is not None and key in query_indices:
|
||||||
|
timestamps = keys_to_timestamps[key]
|
||||||
|
# Clamp out timesteps outside of episode boundaries
|
||||||
|
query_timestamps[key] = torch.clamp(
|
||||||
|
torch.tensor(timestamps), *episode_boundaries_ts[key]
|
||||||
|
).tolist()
|
||||||
|
|
||||||
|
else:
|
||||||
|
query_timestamps[key] = [current_ts]
|
||||||
|
|
||||||
|
return query_timestamps
|
||||||
|
|
||||||
|
def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict:
|
||||||
|
"""Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function
|
||||||
|
in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a
|
||||||
|
Segmentation Fault. This probably happens because a memory reference to the video loader is created in
|
||||||
|
the main process and a subprocess fails to access it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
item = {}
|
||||||
|
for video_key, query_ts in query_timestamps.items():
|
||||||
|
root = self.meta.url_root if self.streaming and not self.streaming_from_local else self.root
|
||||||
|
video_path = f"{root}/{self.meta.get_video_file_path(ep_idx, video_key)}"
|
||||||
|
frames = decode_video_frames_torchcodec(
|
||||||
|
video_path, query_ts, self.tolerance_s, decoder_cache=self.video_decoder_cache
|
||||||
|
)
|
||||||
|
|
||||||
|
item[video_key] = frames.squeeze(0) if len(query_ts) == 1 else frames
|
||||||
|
|
||||||
|
return item
|
||||||
|
|
||||||
|
def _get_delta_frames(self, dataset_iterator: Backtrackable, current_item: dict):
|
||||||
|
# TODO(fracapuano): Modularize this function, refactor the code
|
||||||
|
"""Get frames with delta offsets using the backtrackable iterator.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
current_item (dict): Current item from the iterator.
|
||||||
|
ep_idx (int): Episode index.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (query_result, padding) - frames at delta offsets and padding info.
|
||||||
|
"""
|
||||||
|
current_episode_idx = current_item["episode_index"]
|
||||||
|
|
||||||
|
# Prepare results
|
||||||
|
query_result = {}
|
||||||
|
padding = {}
|
||||||
|
|
||||||
|
for key, delta_indices in self.delta_indices.items():
|
||||||
|
if key in self.meta.video_keys:
|
||||||
|
continue # visual frames are decoded separately
|
||||||
|
|
||||||
|
target_frames = []
|
||||||
|
is_pad = []
|
||||||
|
|
||||||
|
# Create a results dictionary to store frames in processing order, then reconstruct original order for stacking
|
||||||
|
delta_results = {}
|
||||||
|
|
||||||
|
# Separate and sort deltas by difficulty (easier operations first)
|
||||||
|
negative_deltas = sorted([d for d in delta_indices if d < 0], reverse=True) # [-1, -2, -3, ...]
|
||||||
|
positive_deltas = sorted([d for d in delta_indices if d > 0]) # [1, 2, 3, ...]
|
||||||
|
zero_deltas = [d for d in delta_indices if d == 0]
|
||||||
|
|
||||||
|
# Process zero deltas (current frame)
|
||||||
|
for delta in zero_deltas:
|
||||||
|
delta_results[delta] = (
|
||||||
|
current_item[key],
|
||||||
|
False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process negative deltas in order of increasing difficulty
|
||||||
|
lookback_failed = False
|
||||||
|
|
||||||
|
last_successful_frame = current_item[key]
|
||||||
|
|
||||||
|
for delta in negative_deltas:
|
||||||
|
if lookback_failed:
|
||||||
|
delta_results[delta] = (last_successful_frame, True)
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
steps_back = abs(delta)
|
||||||
|
if dataset_iterator.can_peek_back(steps_back):
|
||||||
|
past_item = dataset_iterator.peek_back(steps_back)
|
||||||
|
past_item = item_to_torch(past_item)
|
||||||
|
|
||||||
|
if past_item["episode_index"] == current_episode_idx:
|
||||||
|
delta_results[delta] = (past_item[key], False)
|
||||||
|
last_successful_frame = past_item[key]
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise LookBackError("Retrieved frame is from different episode!")
|
||||||
|
else:
|
||||||
|
raise LookBackError("Cannot go back further than the history buffer!")
|
||||||
|
|
||||||
|
except LookBackError:
|
||||||
|
delta_results[delta] = (last_successful_frame, True)
|
||||||
|
lookback_failed = True # All subsequent negative deltas will also fail
|
||||||
|
|
||||||
|
# Process positive deltas in order of increasing difficulty
|
||||||
|
lookahead_failed = False
|
||||||
|
last_successful_frame = current_item[key]
|
||||||
|
|
||||||
|
for delta in positive_deltas:
|
||||||
|
if lookahead_failed:
|
||||||
|
delta_results[delta] = (last_successful_frame, True)
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
if dataset_iterator.can_peek_ahead(delta):
|
||||||
|
future_item = dataset_iterator.peek_ahead(delta)
|
||||||
|
future_item = item_to_torch(future_item)
|
||||||
|
|
||||||
|
if future_item["episode_index"] == current_episode_idx:
|
||||||
|
delta_results[delta] = (future_item[key], False)
|
||||||
|
last_successful_frame = future_item[key]
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise LookAheadError("Retrieved frame is from different episode!")
|
||||||
|
else:
|
||||||
|
raise LookAheadError("Cannot go ahead further than the lookahead buffer!")
|
||||||
|
|
||||||
|
except LookAheadError:
|
||||||
|
delta_results[delta] = (last_successful_frame, True)
|
||||||
|
lookahead_failed = True # All subsequent positive deltas will also fail
|
||||||
|
|
||||||
|
# Reconstruct original order for stacking
|
||||||
|
for delta in delta_indices:
|
||||||
|
frame, is_padded = delta_results[delta]
|
||||||
|
|
||||||
|
# add batch dimension for stacking
|
||||||
|
target_frames.append(frame) # frame.unsqueeze(0))
|
||||||
|
is_pad.append(is_padded)
|
||||||
|
|
||||||
|
# Stack frames and add to results
|
||||||
|
if target_frames:
|
||||||
|
query_result[key] = torch.stack(target_frames)
|
||||||
|
padding[f"{key}_is_pad"] = torch.BoolTensor(is_pad)
|
||||||
|
|
||||||
|
return query_result, padding
|
||||||
|
|
||||||
|
def _validate_delta_timestamp_keys(self, delta_timestamps: dict[str, list[float]] | None) -> None:
    """Validate that all keys in delta_timestamps correspond to actual features in the dataset.

    Args:
        delta_timestamps: Mapping from feature key to a list of time offsets (in seconds).
            ``None`` is accepted and skips validation entirely.

    Raises:
        ValueError: If any delta timestamp key doesn't correspond to a dataset feature.
    """
    # NOTE: the annotation was `dict[list[float]]`, which is not a valid
    # two-parameter mapping type; it is `dict[str, list[float]] | None` here.
    if delta_timestamps is None:
        return

    # A requested key that is not a dataset feature is a configuration error:
    # fail fast with an explicit message listing what is actually available.
    available_features = set(self.meta.features.keys())
    invalid_keys = set(delta_timestamps.keys()) - available_features

    if invalid_keys:
        raise ValueError(
            f"The following delta_timestamp keys do not correspond to dataset features: {invalid_keys}. "
            f"Available features are: {sorted(available_features)}"
        )
|
||||||
@@ -17,10 +17,11 @@ import contextlib
|
|||||||
import importlib.resources
|
import importlib.resources
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from collections.abc import Iterator
|
from collections import deque
|
||||||
|
from collections.abc import Iterable, Iterator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
from typing import Any
|
from typing import Any, Deque, Generic, TypeVar
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -86,6 +87,8 @@ DEFAULT_FEATURES = {
|
|||||||
"task_index": {"dtype": "int64", "shape": (1,), "names": None},
|
"task_index": {"dtype": "int64", "shape": (1,), "names": None},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
T = TypeVar("T")
|
||||||
|
|
||||||
|
|
||||||
def get_parquet_file_size_in_mb(parquet_path: str | Path) -> float:
|
def get_parquet_file_size_in_mb(parquet_path: str | Path) -> float:
|
||||||
metadata = pq.read_metadata(parquet_path)
|
metadata = pq.read_metadata(parquet_path)
|
||||||
@@ -776,3 +779,230 @@ def to_parquet_with_hf_images(df: pandas.DataFrame, path: Path) -> None:
|
|||||||
"""
|
"""
|
||||||
# TODO(qlhoest): replace this weird synthax by `df.to_parquet(path)` only
|
# TODO(qlhoest): replace this weird synthax by `df.to_parquet(path)` only
|
||||||
datasets.Dataset.from_dict(df.to_dict(orient="list")).to_parquet(path)
|
datasets.Dataset.from_dict(df.to_dict(orient="list")).to_parquet(path)
|
||||||
|
|
||||||
|
|
||||||
|
def item_to_torch(item: dict) -> dict:
    """Convert array-like values of a dataset item to PyTorch tensors, in place.

    This function is used to convert an item from a streaming dataset to PyTorch tensors.

    Args:
        item (dict): Dictionary of items from a dataset.

    Returns:
        dict: Dictionary with all tensor-like items converted to torch.Tensor.
    """
    for field_name, value in item.items():
        # "task" holds text and must stay untouched; everything else that is a
        # numpy array or a plain list becomes a tensor.
        if field_name in ["task"]:
            continue
        if isinstance(value, (np.ndarray, list)):
            item[field_name] = torch.tensor(value)
    return item
|
||||||
|
|
||||||
|
|
||||||
|
def is_float_in_list(target, float_list, threshold=1e-6):
    """Return True if `target` is within `threshold` of any element of `float_list`."""
    for candidate in float_list:
        if abs(target - candidate) <= threshold:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def find_float_index(target, float_list, threshold=1e-6):
    """Return the index of the first element of `float_list` within `threshold` of `target`, or -1."""
    matches = (idx for idx, candidate in enumerate(float_list) if abs(target - candidate) <= threshold)
    return next(matches, -1)
|
||||||
|
|
||||||
|
|
||||||
|
class LookBackError(Exception):
    """Raised when a Backtrackable is asked to look further back than its history buffer allows."""
|
||||||
|
|
||||||
|
|
||||||
|
class LookAheadError(Exception):
    """Raised when a Backtrackable is asked to look further ahead than its lookahead buffer allows."""
|
||||||
|
|
||||||
|
|
||||||
|
class Backtrackable(Generic[T]):
    """
    Wrap any iterator/iterable so you can step back up to `history` items
    and look ahead up to `lookahead` items.

    This is useful for streaming datasets where you need to access previous and future items
    but can't load the entire dataset into memory.

    Internal invariant: `_cursor == 0` means "at the present" (the current item is
    `_back_buf[-1]`); negative values mean we stepped back, and the current item is
    `_back_buf[_cursor - 1]`.

    Example:
    -------
    ```python
    ds = load_dataset("c4", "en", streaming=True, split="train")
    rev = Backtrackable(ds, history=3, lookahead=2)

    x0 = next(rev)  # forward
    x1 = next(rev)
    x2 = next(rev)

    # Look ahead
    x3_peek = rev.peek_ahead(1)  # next item without moving cursor
    x4_peek = rev.peek_ahead(2)  # two items ahead

    # Look back
    x1_again = rev.peek_back(1)  # previous item without moving cursor
    x0_again = rev.peek_back(2)  # two items back

    # Move backward
    x1_back = rev.prev()  # back one step
    next(rev)  # returns x2, continues forward from where we were
    ```
    """

    __slots__ = ("_source", "_back_buf", "_ahead_buf", "_cursor", "_history", "_lookahead")

    def __init__(self, iterable: Iterable[T], *, history: int = 1, lookahead: int = 0):
        """
        Args:
            iterable: Source to iterate over.
            history: Maximum number of already-consumed items kept for stepping back (>= 1).
            lookahead: Maximum number of items that may be pre-fetched; 0 means unbounded.
        """
        if history < 1:
            raise ValueError("history must be >= 1")
        # BUGFIX: the default `lookahead=0` used to be rejected by a `lookahead <= 0`
        # check, making the documented default unconstructible. Only negative values
        # are invalid; 0 selects the unbounded ahead buffer below.
        if lookahead < 0:
            raise ValueError("lookahead must be >= 0")

        self._source: Iterator[T] = iter(iterable)
        self._back_buf: Deque[T] = deque(maxlen=history)
        # lookahead == 0 -> unbounded deque, only filled on demand.
        self._ahead_buf: Deque[T] = deque(maxlen=lookahead) if lookahead > 0 else deque()
        self._cursor: int = 0
        self._history = history
        self._lookahead = lookahead

    def __iter__(self) -> "Backtrackable[T]":
        return self

    def __next__(self) -> T:
        # If we've stepped back, replay from the history buffer first.
        if self._cursor < 0:
            # BUGFIX: the cursor must be read *before* advancing; incrementing first
            # returned the item one position too old (inconsistent with peek_back
            # and with the docstring example).
            item = self._back_buf[self._cursor]
            self._cursor += 1
            return item

        # Prefer pre-fetched items from the ahead buffer, then the raw source.
        item = self._ahead_buf.popleft() if self._ahead_buf else next(self._source)

        # Record the item in history and mark us as "at the present".
        self._back_buf.append(item)
        self._cursor = 0
        return item

    def prev(self) -> T:
        """
        Step one item back in history and return it.
        Raises LookBackError if already at the oldest buffered item.
        """
        if len(self._back_buf) + self._cursor <= 1:
            raise LookBackError("At start of history")

        self._cursor -= 1
        # BUGFIX: with the invariant "current item is _back_buf[_cursor - 1]",
        # returning _back_buf[_cursor] handed back the item we were already on.
        return self._back_buf[self._cursor - 1]

    def peek_back(self, n: int = 1) -> T:
        """
        Look `n` items back (n=1 == previous item) without moving the cursor.
        """
        if n < 0 or n + 1 > len(self._back_buf) + self._cursor:
            raise LookBackError("peek_back distance out of range")

        return self._back_buf[self._cursor - (n + 1)]

    def peek_ahead(self, n: int = 1) -> T:
        """
        Look `n` items ahead (n=1 == next item) without moving the cursor.
        Fills the ahead buffer if necessary.
        """
        if n < 1:
            raise LookAheadError("peek_ahead distance must be 1 or more")
        # BUGFIX: the limit only applies when a finite lookahead was configured;
        # with lookahead == 0 the ahead buffer is unbounded (mirrors can_peek_ahead).
        elif self._lookahead > 0 and n > self._lookahead:
            raise LookAheadError("peek_ahead distance exceeds lookahead limit")

        # Pull from the source until the buffer holds at least n items.
        # NOTE(review): like the original, this peeks relative to the furthest item
        # consumed, not relative to a stepped-back cursor — confirm callers only
        # peek ahead while at the present.
        while len(self._ahead_buf) < n:
            try:
                self._ahead_buf.append(next(self._source))
            except StopIteration as err:
                raise LookAheadError("peek_ahead: not enough items in source") from err

        return self._ahead_buf[n - 1]

    def history(self) -> list[T]:
        """
        Return a copy of the buffered history (most recent last).
        The list length ≤ `history` argument passed at construction.
        """
        if self._cursor == 0:
            return list(self._back_buf)

        # When cursor<0, slice so the order remains chronological
        return list(self._back_buf)[: self._cursor or None]

    def lookahead_buffer(self) -> list[T]:
        """
        Return a copy of the current lookahead buffer.
        """
        return list(self._ahead_buf)

    def can_peek_back(self, steps: int = 1) -> bool:
        """
        Check if peek_back(steps) would succeed.
        """
        # BUGFIX: peek_back(n) requires n + 1 items at or before the cursor; the
        # previous `steps <= len + cursor` test reported True one step too early.
        return steps + 1 <= len(self._back_buf) + self._cursor

    def can_peek_ahead(self, steps: int = 1) -> bool:
        """
        Check if we can peek ahead `steps` items.
        This may involve trying to fill the ahead buffer.
        """
        if self._lookahead > 0 and steps > self._lookahead:
            return False

        # Try to fill ahead buffer to check if we can peek that far
        try:
            while len(self._ahead_buf) < steps:
                if self._lookahead > 0 and len(self._ahead_buf) >= self._lookahead:
                    return False
                self._ahead_buf.append(next(self._source))
            return True
        except StopIteration:
            return False

    def reset_cursor(self) -> None:
        """
        Reset cursor to the most recent position (equivalent to calling next()
        until you're back to the latest item).
        """
        self._cursor = 0

    def clear_ahead_buffer(self) -> None:
        """
        Clear the ahead buffer, discarding any pre-fetched items.
        """
        self._ahead_buf.clear()

    def switch_source_iterable(self, new_source: Iterable[T]) -> None:
        """
        Switch the source of the backtrackable to a new iterable, keeping the history.

        This is useful when iterating over a sequence of datasets. The history from the
        previous source is kept, but the lookahead buffer is cleared. The cursor is reset
        to the present.
        """
        self._source = iter(new_source)
        self.clear_ahead_buffer()
        self.reset_cursor()
|
||||||
|
|
||||||
|
|
||||||
|
def safe_shard(dataset: datasets.IterableDataset, index: int, num_shards: int) -> datasets.Dataset:
    """
    Safe shards the dataset.

    The shard index is clamped so it never exceeds the number of shards the
    underlying dataset actually exposes.
    """
    clamped_index = min(dataset.num_shards, index + 1) - 1
    return dataset.shard(num_shards, index=clamped_index)
|
||||||
|
|||||||
@@ -21,9 +21,11 @@ import tempfile
|
|||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from threading import Lock
|
||||||
from typing import Any, ClassVar
|
from typing import Any, ClassVar
|
||||||
|
|
||||||
import av
|
import av
|
||||||
|
import fsspec
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import torch
|
import torch
|
||||||
import torchvision
|
import torchvision
|
||||||
@@ -169,15 +171,68 @@ def decode_video_frames_torchvision(
|
|||||||
return closest_frames
|
return closest_frames
|
||||||
|
|
||||||
|
|
||||||
|
class VideoDecoderCache:
    """Thread-safe cache for video decoders to avoid expensive re-initialization."""

    def __init__(self):
        # Maps video path -> (decoder, open file handle); all access is lock-guarded.
        self._cache: dict[str, tuple[Any, Any]] = {}
        self._lock = Lock()

    def get_decoder(self, video_path: str):
        """Get a cached decoder or create a new one."""
        if not importlib.util.find_spec("torchcodec"):
            raise ImportError("torchcodec is required but not available.")
        from torchcodec.decoders import VideoDecoder

        cache_key = str(video_path)

        with self._lock:
            if cache_key not in self._cache:
                # Keep the fsspec file handle alive alongside the decoder; it is
                # closed explicitly in clear().
                file_handle = fsspec.open(cache_key).__enter__()
                decoder = VideoDecoder(file_handle, seek_mode="approximate")
                self._cache[cache_key] = (decoder, file_handle)
            return self._cache[cache_key][0]

    def clear(self):
        """Clear the cache and close file handles."""
        with self._lock:
            for _, file_handle in self._cache.values():
                file_handle.close()
            self._cache.clear()

    def size(self) -> int:
        """Return the number of cached decoders."""
        with self._lock:
            return len(self._cache)
|
||||||
|
|
||||||
|
|
||||||
|
class FrameTimestampError(ValueError):
    """Helper error raised when the retrieved frame timestamps exceed the queried ones."""


# Module-level default cache so repeated decodes of the same video reuse a decoder.
_default_decoder_cache = VideoDecoderCache()
|
||||||
|
|
||||||
|
|
||||||
def decode_video_frames_torchcodec(
|
def decode_video_frames_torchcodec(
|
||||||
video_path: Path | str,
|
video_path: Path | str,
|
||||||
timestamps: list[float],
|
timestamps: list[float],
|
||||||
tolerance_s: float,
|
tolerance_s: float,
|
||||||
device: str = "cpu",
|
|
||||||
log_loaded_timestamps: bool = False,
|
log_loaded_timestamps: bool = False,
|
||||||
|
decoder_cache: VideoDecoderCache | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
"""Loads frames associated with the requested timestamps of a video using torchcodec.
|
"""Loads frames associated with the requested timestamps of a video using torchcodec.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
video_path: Path to the video file.
|
||||||
|
timestamps: List of timestamps to extract frames.
|
||||||
|
tolerance_s: Allowed deviation in seconds for frame retrieval.
|
||||||
|
log_loaded_timestamps: Whether to log loaded timestamps.
|
||||||
|
decoder_cache: Optional decoder cache instance. Uses default if None.
|
||||||
|
|
||||||
Note: Setting device="cuda" outside the main process, e.g. in data loader workers, will lead to CUDA initialization errors.
|
Note: Setting device="cuda" outside the main process, e.g. in data loader workers, will lead to CUDA initialization errors.
|
||||||
|
|
||||||
Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
|
Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
|
||||||
@@ -186,27 +241,24 @@ def decode_video_frames_torchcodec(
|
|||||||
and all subsequent frames until reaching the requested frame. The number of key frames in a video
|
and all subsequent frames until reaching the requested frame. The number of key frames in a video
|
||||||
can be adjusted during encoding to take into account decoding time and video size in bytes.
|
can be adjusted during encoding to take into account decoding time and video size in bytes.
|
||||||
"""
|
"""
|
||||||
|
if decoder_cache is None:
|
||||||
|
decoder_cache = _default_decoder_cache
|
||||||
|
|
||||||
if importlib.util.find_spec("torchcodec"):
|
# Use cached decoder instead of creating new one each time
|
||||||
from torchcodec.decoders import VideoDecoder
|
decoder = decoder_cache.get_decoder(str(video_path))
|
||||||
else:
|
|
||||||
raise ImportError("torchcodec is required but not available.")
|
|
||||||
|
|
||||||
# initialize video decoder
|
|
||||||
decoder = VideoDecoder(video_path, device=device, seek_mode="approximate")
|
|
||||||
loaded_frames = []
|
|
||||||
loaded_ts = []
|
loaded_ts = []
|
||||||
|
loaded_frames = []
|
||||||
|
|
||||||
# get metadata for frame information
|
# get metadata for frame information
|
||||||
metadata = decoder.metadata
|
metadata = decoder.metadata
|
||||||
average_fps = metadata.average_fps
|
average_fps = metadata.average_fps
|
||||||
|
|
||||||
# convert timestamps to frame indices
|
# convert timestamps to frame indices
|
||||||
frame_indices = [round(ts * average_fps) for ts in timestamps]
|
frame_indices = [round(ts * average_fps) for ts in timestamps]
|
||||||
|
|
||||||
# retrieve frames based on indices
|
# retrieve frames based on indices
|
||||||
frames_batch = decoder.get_frames_at(indices=frame_indices)
|
frames_batch = decoder.get_frames_at(indices=frame_indices)
|
||||||
|
|
||||||
for frame, pts in zip(frames_batch.data, frames_batch.pts_seconds, strict=False):
|
for frame, pts in zip(frames_batch.data, frames_batch.pts_seconds, strict=True):
|
||||||
loaded_frames.append(frame)
|
loaded_frames.append(frame)
|
||||||
loaded_ts.append(pts.item())
|
loaded_ts.append(pts.item())
|
||||||
if log_loaded_timestamps:
|
if log_loaded_timestamps:
|
||||||
@@ -237,10 +289,14 @@ def decode_video_frames_torchcodec(
|
|||||||
if log_loaded_timestamps:
|
if log_loaded_timestamps:
|
||||||
logging.info(f"{closest_ts=}")
|
logging.info(f"{closest_ts=}")
|
||||||
|
|
||||||
# convert to float32 in [0,1] range (channel first)
|
# convert to float32 in [0,1] range
|
||||||
closest_frames = closest_frames.type(torch.float32) / 255
|
closest_frames = (closest_frames / 255.0).type(torch.float32)
|
||||||
|
|
||||||
|
if not len(timestamps) == len(closest_frames):
|
||||||
|
raise FrameTimestampError(
|
||||||
|
f"Retrieved timestamps differ from queried {set(closest_frames) - set(timestamps)}"
|
||||||
|
)
|
||||||
|
|
||||||
assert len(timestamps) == len(closest_frames)
|
|
||||||
return closest_frames
|
return closest_frames
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -179,10 +179,11 @@ def train(cfg: TrainPipelineConfig):
|
|||||||
dataset,
|
dataset,
|
||||||
num_workers=cfg.num_workers,
|
num_workers=cfg.num_workers,
|
||||||
batch_size=cfg.batch_size,
|
batch_size=cfg.batch_size,
|
||||||
shuffle=shuffle,
|
shuffle=shuffle and not cfg.dataset.streaming,
|
||||||
sampler=sampler,
|
sampler=sampler,
|
||||||
pin_memory=device.type == "cuda",
|
pin_memory=device.type == "cuda",
|
||||||
drop_last=False,
|
drop_last=False,
|
||||||
|
prefetch_factor=2,
|
||||||
)
|
)
|
||||||
dl_iter = cycle(dataloader)
|
dl_iter = cycle(dataloader)
|
||||||
|
|
||||||
@@ -208,6 +209,9 @@ def train(cfg: TrainPipelineConfig):
|
|||||||
|
|
||||||
for key in batch:
|
for key in batch:
|
||||||
if isinstance(batch[key], torch.Tensor):
|
if isinstance(batch[key], torch.Tensor):
|
||||||
|
if batch[key].dtype != torch.bool:
|
||||||
|
batch[key] = batch[key].type(torch.float32) if device.type == "mps" else batch[key]
|
||||||
|
|
||||||
batch[key] = batch[key].to(device, non_blocking=device.type == "cuda")
|
batch[key] = batch[key].to(device, non_blocking=device.type == "cuda")
|
||||||
|
|
||||||
train_tracker, output_dict = update_policy(
|
train_tracker, output_dict = update_policy(
|
||||||
|
|||||||
391
tests/datasets/test_streaming.py
Normal file
391
tests/datasets/test_streaming.py
Normal file
@@ -0,0 +1,391 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset
|
||||||
|
from lerobot.datasets.utils import safe_shard
|
||||||
|
from tests.fixtures.constants import DUMMY_REPO_ID
|
||||||
|
|
||||||
|
|
||||||
|
def get_frames_expected_order(streaming_ds: StreamingLeRobotDataset) -> list[int]:
    """Replicates the shuffling logic of StreamingLeRobotDataset to get the expected order of indices."""
    rng = np.random.default_rng(streaming_ds.seed)
    buffer_size = streaming_ds.buffer_size
    num_shards = streaming_ds.num_shards

    # Collect the frame indices that each shard would yield, in shard order.
    per_shard_indices = []
    for shard_idx in range(num_shards):
        shard = streaming_ds.hf_dataset.shard(num_shards, index=shard_idx)
        per_shard_indices.append([item["index"] for item in shard])

    iterators_by_shard = {shard_id: iter(indices) for shard_id, indices in enumerate(per_shard_indices)}
    buffer_slot_generator = streaming_ds._iter_random_indices(rng, buffer_size)

    frames_buffer: list[int] = []
    expected_indices: list[int] = []

    # Interleave shards exactly like the dataset does: pick a random live shard,
    # pull one frame, and run it through the shuffle buffer.
    while iterators_by_shard:
        live_shard_ids = list(iterators_by_shard.keys())
        if not live_shard_ids:
            break

        # Call _infinite_generator_over_elements with current available shards (key difference!)
        chosen_shard = next(streaming_ds._infinite_generator_over_elements(rng, live_shard_ids))

        try:
            incoming_frame = next(iterators_by_shard[chosen_shard])
            if len(frames_buffer) == buffer_size:
                slot = next(buffer_slot_generator)
                expected_indices.append(frames_buffer[slot])
                frames_buffer[slot] = incoming_frame
            else:
                frames_buffer.append(incoming_frame)
        except StopIteration:
            del iterators_by_shard[chosen_shard]  # Remove exhausted shard

    # Drain the remaining buffer in shuffled order, as the dataset does at epoch end.
    rng.shuffle(frames_buffer)
    expected_indices.extend(frames_buffer)

    return expected_indices
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_frame_consistency(tmp_path, lerobot_dataset_factory):
    """Every streamed frame must equal the same frame read from the in-memory dataset."""
    ds_num_frames = 400
    ds_num_episodes = 10
    buffer_size = 100

    local_path = tmp_path / "test"
    repo_id = f"{DUMMY_REPO_ID}"

    ds = lerobot_dataset_factory(
        root=local_path,
        repo_id=repo_id,
        total_episodes=ds_num_episodes,
        total_frames=ds_num_frames,
    )

    streaming_ds = iter(StreamingLeRobotDataset(repo_id=repo_id, root=local_path, buffer_size=buffer_size))

    key_checks = []
    for _ in range(ds_num_frames):
        streaming_frame = next(streaming_ds)
        frame_idx = streaming_frame["index"]
        target_frame = ds[frame_idx]

        for key in streaming_frame:
            left = streaming_frame[key]
            right = target_frame[key]

            if isinstance(left, str):
                check = left == right
            elif isinstance(left, torch.Tensor):
                check = torch.allclose(left, right) and left.shape == right.shape
            elif isinstance(left, float):
                check = left == right.item()  # right is a torch.Tensor
            else:
                # BUGFIX: values of any other type previously reused the stale `check`
                # from the preceding key (or raised NameError on the very first key).
                check = left == right

            key_checks.append((key, check))

        assert all(t[1] for t in key_checks), (
            f"Checking {list(filter(lambda t: not t[1], key_checks))[0][0]} left and right were found different (frame_idx: {frame_idx})"
        )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "shuffle",
    [False, True],
)
def test_frames_order_over_epochs(tmp_path, lerobot_dataset_factory, shuffle):
    """Test if streamed frames correspond to shuffling operations over in-memory dataset."""
    ds_num_frames = 400
    ds_num_episodes = 10
    buffer_size = 100
    seed = 42
    n_epochs = 3

    local_path = tmp_path / "test"
    repo_id = f"{DUMMY_REPO_ID}"

    lerobot_dataset_factory(
        root=local_path,
        repo_id=repo_id,
        total_episodes=ds_num_episodes,
        total_frames=ds_num_frames,
    )

    streaming_ds = StreamingLeRobotDataset(
        repo_id=repo_id, root=local_path, buffer_size=buffer_size, seed=seed, shuffle=shuffle
    )

    # The first epoch must reproduce the reference shuffle-buffer simulation exactly.
    first_epoch_indices = [frame["index"] for frame in streaming_ds]
    expected_indices = get_frames_expected_order(streaming_ds)
    assert first_epoch_indices == expected_indices, "First epoch indices do not match expected indices"

    # Re-run the simulation, then compare subsequent epochs against it: with
    # shuffling enabled, later epochs must diverge; without it they must match.
    expected_indices = get_frames_expected_order(streaming_ds)
    for _ in range(n_epochs):
        epoch_indices = [frame["index"] for frame in streaming_ds]
        frames_match = all(
            observed == expected for observed, expected in zip(epoch_indices, expected_indices, strict=True)
        )
        if shuffle:
            assert not frames_match
        else:
            assert frames_match
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "shuffle",
    [False, True],
)
def test_frames_order_with_shards(tmp_path, lerobot_dataset_factory, shuffle):
    """Test if streamed frames correspond to shuffling operations over in-memory dataset with multiple shards."""
    ds_num_frames = 100
    ds_num_episodes = 10
    buffer_size = 10
    seed = 42
    n_epochs = 3
    # Tiny data files + chunk size force the dataset to split into several shards.
    data_file_size_mb = 0.001
    chunks_size = 1

    local_path = tmp_path / "test"
    repo_id = f"{DUMMY_REPO_ID}-ciao"

    lerobot_dataset_factory(
        root=local_path,
        repo_id=repo_id,
        total_episodes=ds_num_episodes,
        total_frames=ds_num_frames,
        data_files_size_in_mb=data_file_size_mb,
        chunks_size=chunks_size,
    )

    streaming_ds = StreamingLeRobotDataset(
        repo_id=repo_id,
        root=local_path,
        buffer_size=buffer_size,
        seed=seed,
        shuffle=shuffle,
        max_num_shards=4,
    )

    first_epoch_indices = [frame["index"] for frame in streaming_ds]
    expected_indices = get_frames_expected_order(streaming_ds)
    assert first_epoch_indices == expected_indices, "First epoch indices do not match expected indices"

    for _ in range(n_epochs):
        # NOTE: this is the same as first_epoch_indices
        epoch_indices = [frame["index"] for frame in streaming_ds]
        frames_match = all(
            observed == expected for observed, expected in zip(epoch_indices, expected_indices, strict=True)
        )
        if shuffle:
            assert not frames_match
        else:
            assert frames_match
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "state_deltas, action_deltas",
    [
        ([-1, -0.5, -0.20, 0], [0, 1, 2, 3]),
        ([-1, -0.5, -0.20, 0], [-1.5, -1, -0.5, -0.20, -0.10, 0]),
        ([-2, -1, -0.5, 0], [0, 1, 2, 3]),
        ([-2, -1, -0.5, 0], [-1.5, -1, -0.5, -0.20, -0.10, 0]),
    ],
)
def test_frames_with_delta_consistency(tmp_path, lerobot_dataset_factory, state_deltas, action_deltas):
    """Streamed frames with delta_timestamps must match the in-memory dataset on non-padded regions."""
    ds_num_frames = 500
    ds_num_episodes = 10
    buffer_size = 100
    seed = 42

    local_path = tmp_path / "test"
    repo_id = f"{DUMMY_REPO_ID}-ciao"
    camera_key = "phone"

    delta_timestamps = {
        camera_key: state_deltas,
        "state": state_deltas,
        "action": action_deltas,
    }

    ds = lerobot_dataset_factory(
        root=local_path,
        repo_id=repo_id,
        total_episodes=ds_num_episodes,
        total_frames=ds_num_frames,
        delta_timestamps=delta_timestamps,
    )

    streaming_ds = iter(
        StreamingLeRobotDataset(
            repo_id=repo_id,
            root=local_path,
            buffer_size=buffer_size,
            seed=seed,
            shuffle=False,
            delta_timestamps=delta_timestamps,
        )
    )

    for i in range(ds_num_frames):
        streaming_frame = next(streaming_ds)
        frame_idx = streaming_frame["index"]
        target_frame = ds[frame_idx]

        assert set(streaming_frame.keys()) == set(target_frame.keys()), (
            f"Keys differ between streaming frame and target one. Differ at: {set(streaming_frame.keys()) - set(target_frame.keys())}"
        )

        key_checks = []
        for key in streaming_frame:
            left = streaming_frame[key]
            right = target_frame[key]

            if isinstance(left, str):
                check = left == right
            elif isinstance(left, torch.Tensor):
                if (
                    key not in ds.meta.camera_keys
                    and "is_pad" not in key
                    and f"{key}_is_pad" in streaming_frame
                ):
                    # comparing frames only on non-padded regions. Padding is applied to last-valid broadcasting
                    left = left[~streaming_frame[f"{key}_is_pad"]]
                    right = right[~target_frame[f"{key}_is_pad"]]

                check = torch.allclose(left, right) and left.shape == right.shape
            else:
                # BUGFIX: non-str/tensor values previously reused the stale `check`
                # from the preceding key (or raised NameError on the very first key).
                check = left == right

            key_checks.append((key, check))

        assert all(t[1] for t in key_checks), (
            f"Checking {list(filter(lambda t: not t[1], key_checks))[0][0]} left and right were found different (i: {i}, frame_idx: {frame_idx})"
        )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "state_deltas, action_deltas",
    [
        ([-1, -0.5, -0.20, 0], [0, 1, 2, 3, 10, 20]),
        ([-1, -0.5, -0.20, 0], [-20, -1.5, -1, -0.5, -0.20, -0.10, 0]),
        ([-2, -1, -0.5, 0], [0, 1, 2, 3, 10, 20]),
        ([-2, -1, -0.5, 0], [-20, -1.5, -1, -0.5, -0.20, -0.10, 0]),
    ],
)
def test_frames_with_delta_consistency_with_shards(
    tmp_path, lerobot_dataset_factory, state_deltas, action_deltas
):
    """Check that a sharded streaming dataset with ``delta_timestamps`` yields frames
    identical to random-access reads of the same (non-streaming) dataset.

    A tiny ``data_files_size_in_mb`` forces the dataset into many small files so that
    sharding is actually exercised. With ``shuffle=False`` the streaming iterator is
    expected to visit frames whose ``index`` can be looked up directly in the map-style
    dataset; each common key is then compared value-by-value.

    Args:
        tmp_path: pytest-provided temporary directory for the dataset root.
        lerobot_dataset_factory: project fixture that builds a local LeRobotDataset.
        state_deltas: delta timestamps applied to the camera and "state" features.
        action_deltas: delta timestamps applied to the "action" feature.
    """
    ds_num_frames = 100
    ds_num_episodes = 10
    buffer_size = 10
    data_file_size_mb = 0.001  # tiny files -> many data files, so sharding kicks in
    chunks_size = 1
    # Single source of truth for the shard count (previously the literal 4 was
    # duplicated between max_num_shards and the local sharding loop below).
    num_shards = 4

    seed = 42

    local_path = tmp_path / "test"
    repo_id = f"{DUMMY_REPO_ID}-ciao"
    camera_key = "phone"

    delta_timestamps = {
        camera_key: state_deltas,
        "state": state_deltas,
        "action": action_deltas,
    }

    # Ground-truth, random-access dataset.
    ds = lerobot_dataset_factory(
        root=local_path,
        repo_id=repo_id,
        total_episodes=ds_num_episodes,
        total_frames=ds_num_frames,
        delta_timestamps=delta_timestamps,
        data_files_size_in_mb=data_file_size_mb,
        chunks_size=chunks_size,
    )
    streaming_ds = StreamingLeRobotDataset(
        repo_id=repo_id,
        root=local_path,
        buffer_size=buffer_size,
        seed=seed,
        shuffle=False,
        delta_timestamps=delta_timestamps,
        max_num_shards=num_shards,
    )

    # NOTE(review): called for its side effect — presumably this materializes
    # streaming_ds.hf_dataset before it is sharded below; confirm it is required.
    iter(streaming_ds)

    shards_indices = []
    for shard_idx in range(num_shards):
        shard = safe_shard(streaming_ds.hf_dataset, shard_idx, num_shards)
        shard_indices = [item["index"] for item in shard]
        shards_indices.append(shard_indices)
    # TODO(review): shards_indices is collected but never asserted on — either add a
    # coverage/partition assertion (e.g. the shards form a disjoint cover of all
    # indices) or drop this computation.

    streaming_ds = iter(streaming_ds)

    for i in range(ds_num_frames):
        streaming_frame = next(streaming_ds)
        frame_idx = streaming_frame["index"]
        target_frame = ds[frame_idx]

        assert set(streaming_frame.keys()) == set(target_frame.keys()), (
            f"Keys differ between streaming frame and target one. Differ at: {set(streaming_frame.keys()) - set(target_frame.keys())}"
        )

        key_checks = []
        for key in streaming_frame:
            left = streaming_frame[key]
            right = target_frame[key]

            if isinstance(left, str):
                check = left == right
            elif isinstance(left, torch.Tensor):
                if (
                    key not in ds.meta.camera_keys
                    and "is_pad" not in key
                    and f"{key}_is_pad" in streaming_frame
                ):
                    # comparing frames only on non-padded regions. Padding is applied
                    # to last-valid broadcasting
                    left = left[~streaming_frame[f"{key}_is_pad"]]
                    right = right[~target_frame[f"{key}_is_pad"]]

                # Compare shapes BEFORE values: torch.allclose raises a RuntimeError
                # on non-broadcastable shapes, which would crash the test instead of
                # producing a clean assertion message.
                check = left.shape == right.shape and torch.allclose(left, right)
            elif isinstance(left, float):
                check = left == right.item()  # right is a torch.Tensor
            else:
                # Previously `check` was left unbound here, turning an unexpected
                # value type into a NameError; record it as a plain failure instead.
                check = False

            key_checks.append((key, check))

        assert all(t[1] for t in key_checks), (
            f"Checking {list(filter(lambda t: not t[1], key_checks))[0][0]} left and right were found different (i: {i}, frame_idx: {frame_idx})"
        )
|
||||||
Reference in New Issue
Block a user