Loads episode_data_index and stats during dataset __init__ (#85)

Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>
Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
2024-04-23 14:13:25 +02:00
parent e2168163cd
commit 1030ea0070
89 changed files with 1008 additions and 432 deletions
--- a/lerobot/common/datasets/aloha.py
+++ b/lerobot/common/datasets/aloha.py
@@ -1,9 +1,13 @@
 from pathlib import Path

 import torch
-from datasets import load_dataset, load_from_disk

-from lerobot.common.datasets.utils import load_previous_and_future_frames
+from lerobot.common.datasets.utils import (
+    load_episode_data_index,
+    load_hf_dataset,
+    load_previous_and_future_frames,
+    load_stats,
+)


 class AlohaDataset(torch.utils.data.Dataset):
@@ -27,7 +31,7 @@ class AlohaDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        dataset_id: str,
-        version: str | None = "v1.0",
+        version: str | None = "v1.1",
        root: Path | None = None,
        split: str = "train",
        transform: callable = None,
@@ -40,13 +44,10 @@ class AlohaDataset(torch.utils.data.Dataset):
        self.split = split
        self.transform = transform
        self.delta_timestamps = delta_timestamps
-        if self.root is not None:
-            self.hf_dataset = load_from_disk(Path(self.root) / self.dataset_id / self.split)
-        else:
-            self.hf_dataset = load_dataset(
-                f"lerobot/{self.dataset_id}", revision=self.version, split=self.split
-            )
-        self.hf_dataset = self.hf_dataset.with_format("torch")
+        # load data locally when root is provided, otherwise from the hub
+        self.hf_dataset = load_hf_dataset(dataset_id, version, root, split)
+        self.episode_data_index = load_episode_data_index(dataset_id, version, root)
+        self.stats = load_stats(dataset_id, version, root)

    @property
    def num_samples(self) -> int:
@@ -54,7 +55,7 @@ class AlohaDataset(torch.utils.data.Dataset):

    @property
    def num_episodes(self) -> int:
-        return len(self.hf_dataset.unique("episode_id"))
+        return len(self.hf_dataset.unique("episode_index"))

    def __len__(self):
        return self.num_samples
@@ -66,19 +67,11 @@ class AlohaDataset(torch.utils.data.Dataset):
            item = load_previous_and_future_frames(
                item,
                self.hf_dataset,
+                self.episode_data_index,
                self.delta_timestamps,
                tol=1 / self.fps - 1e-4,  # 1e-4 to account for possible numerical error
            )

-        # convert images from channel last (PIL) to channel first (pytorch)
-        for key in self.image_keys:
-            if item[key].ndim == 3:
-                item[key] = item[key].permute((2, 0, 1))  # h w c -> c h w
-            elif item[key].ndim == 4:
-                item[key] = item[key].permute((0, 3, 1, 2))  # t h w c -> t c h w
-            else:
-                raise ValueError(item[key].ndim)
-
        if self.transform is not None:
            item = self.transform(item)

--- a/lerobot/common/datasets/factory.py
+++ b/lerobot/common/datasets/factory.py
@@ -1,12 +1,10 @@
-import logging
 import os
 from pathlib import Path

 import torch
 from torchvision.transforms import v2

-from lerobot.common.datasets.utils import compute_stats
-from lerobot.common.transforms import NormalizeTransform, Prod
+from lerobot.common.transforms import NormalizeTransform

 DATA_DIR = Path(os.environ["DATA_DIR"]) if "DATA_DIR" in os.environ else None

@@ -52,32 +50,18 @@ def make_dataset(
            stats["action"]["min"] = torch.tensor([12.0, 25.0], dtype=torch.float32)
            stats["action"]["max"] = torch.tensor([511.0, 511.0], dtype=torch.float32)
        elif stats_path is None:
-            # load stats if the file exists already or compute stats and save it
-            if DATA_DIR is None:
-                # TODO(rcadene): clean stats
-                precomputed_stats_path = Path("data") / cfg.dataset_id / "stats.pth"
-            else:
-                precomputed_stats_path = DATA_DIR / cfg.dataset_id / "stats.pth"
-            if precomputed_stats_path.exists():
-                stats = torch.load(precomputed_stats_path)
-            else:
-                logging.info(f"compute_stats and save to {precomputed_stats_path}")
-                # Create a dataset for stats computation.
-                stats_dataset = clsfunc(
-                    dataset_id=cfg.dataset_id,
-                    split="train",
-                    root=DATA_DIR,
-                    transform=Prod(in_keys=clsfunc.image_keys, prod=1 / 255.0),
-                )
-                stats = compute_stats(stats_dataset)
-                precomputed_stats_path.parent.mkdir(parents=True, exist_ok=True)
-                torch.save(stats, precomputed_stats_path)
+            # load a first dataset to access precomputed stats
+            stats_dataset = clsfunc(
+                dataset_id=cfg.dataset_id,
+                split="train",
+                root=DATA_DIR,
+            )
+            stats = stats_dataset.stats
        else:
            stats = torch.load(stats_path)

        transforms = v2.Compose(
            [
-                Prod(in_keys=clsfunc.image_keys, prod=1 / 255.0),
                NormalizeTransform(
                    stats,
                    in_keys=[
--- a/lerobot/common/datasets/pusht.py
+++ b/lerobot/common/datasets/pusht.py
@@ -1,9 +1,13 @@
 from pathlib import Path

 import torch
-from datasets import load_dataset, load_from_disk

-from lerobot.common.datasets.utils import load_previous_and_future_frames
+from lerobot.common.datasets.utils import (
+    load_episode_data_index,
+    load_hf_dataset,
+    load_previous_and_future_frames,
+    load_stats,
+)


 class PushtDataset(torch.utils.data.Dataset):
@@ -25,7 +29,7 @@ class PushtDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        dataset_id: str = "pusht",
-        version: str | None = "v1.0",
+        version: str | None = "v1.1",
        root: Path | None = None,
        split: str = "train",
        transform: callable = None,
@@ -38,13 +42,10 @@ class PushtDataset(torch.utils.data.Dataset):
        self.split = split
        self.transform = transform
        self.delta_timestamps = delta_timestamps
-        if self.root is not None:
-            self.hf_dataset = load_from_disk(Path(self.root) / self.dataset_id / self.split)
-        else:
-            self.hf_dataset = load_dataset(
-                f"lerobot/{self.dataset_id}", revision=self.version, split=self.split
-            )
-        self.hf_dataset = self.hf_dataset.with_format("torch")
+        # load data locally when root is provided, otherwise from the hub
+        self.hf_dataset = load_hf_dataset(dataset_id, version, root, split)
+        self.episode_data_index = load_episode_data_index(dataset_id, version, root)
+        self.stats = load_stats(dataset_id, version, root)

    @property
    def num_samples(self) -> int:
@@ -52,7 +53,7 @@ class PushtDataset(torch.utils.data.Dataset):

    @property
    def num_episodes(self) -> int:
-        return len(self.hf_dataset.unique("episode_id"))
+        return len(self.episode_data_index["from"])

    def __len__(self):
        return self.num_samples
@@ -64,19 +65,11 @@ class PushtDataset(torch.utils.data.Dataset):
            item = load_previous_and_future_frames(
                item,
                self.hf_dataset,
+                self.episode_data_index,
                self.delta_timestamps,
                tol=1 / self.fps - 1e-4,  # 1e-4 to account for possible numerical error
            )

-        # convert images from channel last (PIL) to channel first (pytorch)
-        for key in self.image_keys:
-            if item[key].ndim == 3:
-                item[key] = item[key].permute((2, 0, 1))  # h w c -> c h w
-            elif item[key].ndim == 4:
-                item[key] = item[key].permute((0, 3, 1, 2))  # t h w c -> t c h w
-            else:
-                raise ValueError(item[key].ndim)
-
        if self.transform is not None:
            item = self.transform(item)

--- a/lerobot/common/datasets/utils.py
+++ b/lerobot/common/datasets/utils.py
@@ -1,15 +1,121 @@
 from copy import deepcopy
 from math import ceil
+from pathlib import Path

 import datasets
 import einops
 import torch
 import tqdm
+from datasets import Image, load_dataset, load_from_disk
+from huggingface_hub import hf_hub_download
+from PIL import Image as PILImage
+from safetensors.torch import load_file
+from torchvision import transforms
+
+
+def flatten_dict(d, parent_key="", sep="/"):
+    """Flatten a nested dictionary structure by collapsing nested keys into one key with a separator.
+    
+    For example:
+
+    >>> dct = {"a": {"b": 1, "c": {"d": 2}}, "e": 3}
+    >>> print(flatten_dict(dct))
+    {'a/b': 1, 'a/c/d': 2, 'e': 3}
+    """
+    items = []
+    for k, v in d.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_dict(v, new_key, sep=sep).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
+def unflatten_dict(d, sep="/"):
+    outdict = {}
+    for key, value in d.items():
+        parts = key.split(sep)
+        d = outdict
+        for part in parts[:-1]:
+            if part not in d:
+                d[part] = {}
+            d = d[part]
+        d[parts[-1]] = value
+    return outdict
+
+
+def hf_transform_to_torch(items_dict):
+    """Transform that converts items from a Hugging Face dataset (pyarrow)
+    to torch tensors. Importantly, images are converted from PIL, which corresponds to
+    a channel last representation (h w c) of uint8 type, to a torch image representation
+    with channel first (c h w) of float32 type in range [0,1].
+    """
+    for key in items_dict:
+        first_item = items_dict[key][0]
+        if isinstance(first_item, PILImage.Image):
+            to_tensor = transforms.ToTensor()
+            items_dict[key] = [to_tensor(img) for img in items_dict[key]]
+        else:
+            items_dict[key] = [torch.tensor(x) for x in items_dict[key]]
+    return items_dict
+
+
+def load_hf_dataset(dataset_id, version, root, split) -> datasets.Dataset:
+    """hf_dataset contains all the observations, states, actions, rewards, etc."""
+    if root is not None:
+        hf_dataset = load_from_disk(str(Path(root) / dataset_id / split))
+    else:
+        # TODO(rcadene): remove dataset_id everywhere and use repo_id instead
+        repo_id = f"lerobot/{dataset_id}"
+        hf_dataset = load_dataset(repo_id, revision=version, split=split)
+    hf_dataset.set_transform(hf_transform_to_torch)
+    return hf_dataset
+
+
+def load_episode_data_index(dataset_id, version, root) -> dict[str, torch.Tensor]:
+    """episode_data_index contains the range of indices for each episode
+
+    Example:
+    ```python
+    from_id = episode_data_index["from"][episode_id].item()
+    to_id = episode_data_index["to"][episode_id].item()
+    episode_frames = [dataset[i] for i in range(from_id, to_id)]
+    ```
+    """
+    if root is not None:
+        path = Path(root) / dataset_id / "meta_data" / "episode_data_index.safetensors"
+    else:
+        repo_id = f"lerobot/{dataset_id}"
+        path = hf_hub_download(
+            repo_id, "meta_data/episode_data_index.safetensors", repo_type="dataset", revision=version
+        )
+
+    return load_file(path)
+
+
+def load_stats(dataset_id, version, root) -> dict[str, dict[str, torch.Tensor]]:
+    """stats contains the statistics per modality computed over the full dataset, such as max, min, mean, std
+
+    Example:
+    ```python
+    normalized_action = (action - stats["action"]["mean"]) / stats["action"]["std"]
+    ```
+    """
+    if root is not None:
+        path = Path(root) / dataset_id / "meta_data" / "stats.safetensors"
+    else:
+        repo_id = f"lerobot/{dataset_id}"
+        path = hf_hub_download(repo_id, "meta_data/stats.safetensors", repo_type="dataset", revision=version)
+
+    stats = load_file(path)
+    return unflatten_dict(stats)


 def load_previous_and_future_frames(
    item: dict[str, torch.Tensor],
    hf_dataset: datasets.Dataset,
+    episode_data_index: dict[str, torch.Tensor],
    delta_timestamps: dict[str, list[float]],
    tol: float,
 ) -> dict[torch.Tensor]:
@@ -31,6 +137,8 @@ def load_previous_and_future_frames(
      corresponds to a different modality (e.g., "timestamp", "observation.image", "action").
    - hf_dataset (datasets.Dataset): A dictionary containing the full dataset. Each key corresponds to a different
      modality (e.g., "timestamp", "observation.image", "action").
+    - episode_data_index (dict): A dictionary containing two keys ("from" and "to") associated with dataset indices.
+      They indicate the start index (inclusive) and end index (exclusive) of each episode in the dataset.
    - delta_timestamps (dict): A dictionary containing lists of delta timestamps for each possible modality to be
      retrieved. These deltas are added to the item timestamp to form the query timestamps.
    - tol (float, optional): The tolerance level used to determine if a data point is close enough to the query
@@ -46,12 +154,14 @@ def load_previous_and_future_frames(
      issues with timestamps during data collection.
    """
    # get indices of the frames associated to the episode, and their timestamps
-    ep_data_id_from = item["episode_data_index_from"].item()
-    ep_data_id_to = item["episode_data_index_to"].item()
+    ep_id = item["episode_index"].item()
+    ep_data_id_from = episode_data_index["from"][ep_id].item()
+    ep_data_id_to = episode_data_index["to"][ep_id].item()
    ep_data_ids = torch.arange(ep_data_id_from, ep_data_id_to, 1)

    # load timestamps
    ep_timestamps = hf_dataset.select_columns("timestamp")[ep_data_id_from:ep_data_id_to]["timestamp"]
+    ep_timestamps = torch.stack(ep_timestamps)

    # we make the assumption that the timestamps are sorted
    ep_first_ts = ep_timestamps[0]
@@ -82,39 +192,57 @@ def load_previous_and_future_frames(

        # load frames modality
        item[key] = hf_dataset.select_columns(key)[data_ids][key]
+        item[key] = torch.stack(item[key])
        item[f"{key}_is_pad"] = is_pad

    return item


-def get_stats_einops_patterns(dataset):
-    """These einops patterns will be used to aggregate batches and compute statistics."""
-    stats_patterns = {
-        "action": "b c -> c",
-        "observation.state": "b c -> c",
-    }
-    for key in dataset.image_keys:
-        stats_patterns[key] = "b c h w -> c 1 1"
+def get_stats_einops_patterns(hf_dataset):
+    """These einops patterns will be used to aggregate batches and compute statistics.
+
+    Note: We assume the images of `hf_dataset` are in channel first format
+    """
+
+    dataloader = torch.utils.data.DataLoader(
+        hf_dataset,
+        num_workers=0,
+        batch_size=2,
+        shuffle=False,
+    )
+    batch = next(iter(dataloader))
+
+    stats_patterns = {}
+    for key, feats_type in hf_dataset.features.items():
+        # sanity check that tensors are not float64
+        assert batch[key].dtype != torch.float64
+
+        if isinstance(feats_type, Image):
+            # sanity check that images are channel first
+            _, c, h, w = batch[key].shape
+            assert c < h and c < w, f"expect channel first images, but instead {batch[key].shape}"
+
+            # sanity check that images are float32 in range [0,1]
+            assert batch[key].dtype == torch.float32, f"expect torch.float32, but instead {batch[key].dtype=}"
+            assert batch[key].max() <= 1, f"expect pixels lower than 1, but instead {batch[key].max()=}"
+            assert batch[key].min() >= 0, f"expect pixels greater than 0, but instead {batch[key].min()=}"
+
+            stats_patterns[key] = "b c h w -> c 1 1"
+        elif batch[key].ndim == 2:
+            stats_patterns[key] = "b c -> c "
+        elif batch[key].ndim == 1:
+            stats_patterns[key] = "b -> 1"
+        else:
+            raise ValueError(f"{key}, {feats_type}, {batch[key].shape}")
+
    return stats_patterns


-def compute_stats(dataset, batch_size=32, max_num_samples=None):
+def compute_stats(hf_dataset, batch_size=32, max_num_samples=None):
    if max_num_samples is None:
-        max_num_samples = len(dataset)
-    else:
-        raise NotImplementedError("We need to set shuffle=True, but this violate an assert for now.")
+        max_num_samples = len(hf_dataset)

-    dataloader = torch.utils.data.DataLoader(
-        dataset,
-        num_workers=4,
-        batch_size=batch_size,
-        shuffle=False,
-        # pin_memory=cfg.device != "cpu",
-        drop_last=False,
-    )
-
-    # get einops patterns to aggregate batches and compute statistics
-    stats_patterns = get_stats_einops_patterns(dataset)
+    stats_patterns = get_stats_einops_patterns(hf_dataset)

    # mean and std will be computed incrementally while max and min will track the running value.
    mean, std, max, min = {}, {}, {}, {}
@@ -124,10 +252,24 @@ def compute_stats(dataset, batch_size=32, max_num_samples=None):
        max[key] = torch.tensor(-float("inf")).float()
        min[key] = torch.tensor(float("inf")).float()

+    def create_seeded_dataloader(hf_dataset, batch_size, seed):
+        generator = torch.Generator()
+        generator.manual_seed(seed)
+        dataloader = torch.utils.data.DataLoader(
+            hf_dataset,
+            num_workers=4,
+            batch_size=batch_size,
+            shuffle=True,
+            drop_last=False,
+            generator=generator,
+        )
+        return dataloader
+
    # Note: Due to be refactored soon. The point of storing `first_batch` is to make sure we don't get
    # surprises when rerunning the sampler.
    first_batch = None
    running_item_count = 0  # for online mean computation
+    dataloader = create_seeded_dataloader(hf_dataset, batch_size, seed=1337)
    for i, batch in enumerate(
        tqdm.tqdm(dataloader, total=ceil(max_num_samples / batch_size), desc="Compute mean, min, max")
    ):
@@ -153,6 +295,7 @@ def compute_stats(dataset, batch_size=32, max_num_samples=None):

    first_batch_ = None
    running_item_count = 0  # for online std computation
+    dataloader = create_seeded_dataloader(hf_dataset, batch_size, seed=1337)
    for i, batch in enumerate(
        tqdm.tqdm(dataloader, total=ceil(max_num_samples / batch_size), desc="Compute std")
    ):
--- a/lerobot/common/datasets/xarm.py
+++ b/lerobot/common/datasets/xarm.py
@@ -1,25 +1,37 @@
 from pathlib import Path

 import torch
-from datasets import load_dataset, load_from_disk

-from lerobot.common.datasets.utils import load_previous_and_future_frames
+from lerobot.common.datasets.utils import (
+    load_episode_data_index,
+    load_hf_dataset,
+    load_previous_and_future_frames,
+    load_stats,
+)


 class XarmDataset(torch.utils.data.Dataset):
    """
    https://huggingface.co/datasets/lerobot/xarm_lift_medium
+    https://huggingface.co/datasets/lerobot/xarm_lift_medium_replay
+    https://huggingface.co/datasets/lerobot/xarm_push_medium
+    https://huggingface.co/datasets/lerobot/xarm_push_medium_replay
    """

    # Copied from lerobot/__init__.py
-    available_datasets = ["xarm_lift_medium"]
+    available_datasets = [
+        "xarm_lift_medium",
+        "xarm_lift_medium_replay",
+        "xarm_push_medium",
+        "xarm_push_medium_replay",
+    ]
    fps = 15
    image_keys = ["observation.image"]

    def __init__(
        self,
-        dataset_id: str = "xarm_lift_medium",
-        version: str | None = "v1.0",
+        dataset_id: str,
+        version: str | None = "v1.1",
        root: Path | None = None,
        split: str = "train",
        transform: callable = None,
@@ -32,13 +44,10 @@ class XarmDataset(torch.utils.data.Dataset):
        self.split = split
        self.transform = transform
        self.delta_timestamps = delta_timestamps
-        if self.root is not None:
-            self.hf_dataset = load_from_disk(Path(self.root) / self.dataset_id / self.split)
-        else:
-            self.hf_dataset = load_dataset(
-                f"lerobot/{self.dataset_id}", revision=self.version, split=self.split
-            )
-        self.hf_dataset = self.hf_dataset.with_format("torch")
+        # load data locally when root is provided, otherwise from the hub
+        self.hf_dataset = load_hf_dataset(dataset_id, version, root, split)
+        self.episode_data_index = load_episode_data_index(dataset_id, version, root)
+        self.stats = load_stats(dataset_id, version, root)

    @property
    def num_samples(self) -> int:
@@ -46,7 +55,7 @@ class XarmDataset(torch.utils.data.Dataset):

    @property
    def num_episodes(self) -> int:
-        return len(self.hf_dataset.unique("episode_id"))
+        return len(self.hf_dataset.unique("episode_index"))

    def __len__(self):
        return self.num_samples
@@ -58,19 +67,11 @@ class XarmDataset(torch.utils.data.Dataset):
            item = load_previous_and_future_frames(
                item,
                self.hf_dataset,
+                self.episode_data_index,
                self.delta_timestamps,
                tol=1 / self.fps - 1e-4,  # 1e-4 to account for possible numerical error
            )

-        # convert images from channel last (PIL) to channel first (pytorch)
-        for key in self.image_keys:
-            if item[key].ndim == 3:
-                item[key] = item[key].permute((2, 0, 1))  # h w c -> c h w
-            elif item[key].ndim == 4:
-                item[key] = item[key].permute((0, 3, 1, 2))  # t h w c -> t c h w
-            else:
-                raise ValueError(item[key].ndim)
-
        if self.transform is not None:
            item = self.transform(item)