Refactor push_dataset_to_hub (#118)

2024-04-30 14:25:41 +02:00
parent 2765877f28
commit e4e739f4f8
25 changed files with 1089 additions and 1192 deletions
--- a/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
@@ -4,20 +4,22 @@ useless dependencies when using datasets.
 """

 import io
+import logging
+import shutil
 from pathlib import Path

 import tqdm


-def download_raw(root, dataset_id) -> Path:
+def download_raw(raw_dir, dataset_id):
    if "pusht" in dataset_id:
-        return download_pusht(root=root, dataset_id=dataset_id)
+        download_pusht(raw_dir)
    elif "xarm" in dataset_id:
-        return download_xarm(root=root, dataset_id=dataset_id)
+        download_xarm(raw_dir)
    elif "aloha" in dataset_id:
-        return download_aloha(root=root, dataset_id=dataset_id)
+        download_aloha(raw_dir, dataset_id)
    elif "umi" in dataset_id:
-        return download_umi(root=root, dataset_id=dataset_id)
+        download_umi(raw_dir)
    else:
        raise ValueError(dataset_id)

@@ -45,50 +47,53 @@ def download_and_extract_zip(url: str, destination_folder: Path) -> bool:

        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(destination_folder)
-        return True
-    else:
-        return False


-def download_pusht(root: str, dataset_id: str = "pusht", fps: int = 10) -> Path:
+def download_pusht(raw_dir: str):
    pusht_url = "https://diffusion-policy.cs.columbia.edu/data/training/pusht.zip"
-    pusht_zarr = Path("pusht/pusht_cchi_v7_replay.zarr")

-    root = Path(root)
-    raw_dir: Path = root / f"{dataset_id}_raw"
-    zarr_path: Path = (raw_dir / pusht_zarr).resolve()
-    if not zarr_path.is_dir():
-        raw_dir.mkdir(parents=True, exist_ok=True)
-        download_and_extract_zip(pusht_url, raw_dir)
-    return zarr_path
+    raw_dir = Path(raw_dir)
+    raw_dir.mkdir(parents=True, exist_ok=True)
+    download_and_extract_zip(pusht_url, raw_dir)
+    # file is created inside a useful "pusht" directory, so we move it out and delete the dir
+    zarr_path = raw_dir / "pusht_cchi_v7_replay.zarr"
+    shutil.move(raw_dir / "pusht" / "pusht_cchi_v7_replay.zarr", zarr_path)
+    shutil.rmtree(raw_dir / "pusht")


-def download_xarm(root: str, dataset_id: str, fps: int = 15) -> Path:
-    root = Path(root)
-    raw_dir: Path = root / "xarm_datasets_raw"
-    if not raw_dir.exists():
-        import zipfile
+def download_xarm(raw_dir: Path):
+    """Download all xarm datasets at once"""
+    import zipfile

-        import gdown
+    import gdown

-        raw_dir.mkdir(parents=True, exist_ok=True)
-        # from https://github.com/fyhMer/fowm/blob/main/scripts/download_datasets.py
-        url = "https://drive.google.com/uc?id=1nhxpykGtPDhmQKm-_B8zBSywVRdgeVya"
-        zip_path = raw_dir / "data.zip"
-        gdown.download(url, str(zip_path), quiet=False)
-        print("Extracting...")
-        with zipfile.ZipFile(str(zip_path), "r") as zip_f:
-            for member in zip_f.namelist():
-                if member.startswith("data/xarm") and member.endswith(".pkl"):
-                    print(member)
-                    zip_f.extract(member=member)
-        zip_path.unlink()
-
-    dataset_path: Path = root / f"{dataset_id}"
-    return dataset_path
+    raw_dir = Path(raw_dir)
+    raw_dir.mkdir(parents=True, exist_ok=True)
+    # from https://github.com/fyhMer/fowm/blob/main/scripts/download_datasets.py
+    url = "https://drive.google.com/uc?id=1nhxpykGtPDhmQKm-_B8zBSywVRdgeVya"
+    zip_path = raw_dir / "data.zip"
+    gdown.download(url, str(zip_path), quiet=False)
+    print("Extracting...")
+    with zipfile.ZipFile(str(zip_path), "r") as zip_f:
+        for pkl_path in zip_f.namelist():
+            if pkl_path.startswith("data/xarm") and pkl_path.endswith(".pkl"):
+                zip_f.extract(member=pkl_path)
+                # move to corresponding raw directory
+                extract_dir = pkl_path.replace("/buffer.pkl", "")
+                raw_pkl_path = raw_dir / "buffer.pkl"
+                shutil.move(pkl_path, raw_pkl_path)
+                shutil.rmtree(extract_dir)
+    zip_path.unlink()


-def download_aloha(root: str, dataset_id: str) -> Path:
+def download_aloha(raw_dir: Path, dataset_id: str):
+    # TODO(rcadene): remove gdown and use hugging face download instead
+    import gdown
+
+    logging.warning(
+        "Aloha download is broken and requires a custom version of gdown which is not limited on number of files"
+    )
+
    folder_urls = {
        "aloha_sim_insertion_human": "https://drive.google.com/drive/folders/1RgyD0JgTX30H4IM5XZn8I3zSV_mr8pyF",
        "aloha_sim_insertion_scripted": "https://drive.google.com/drive/folders/1TsojQQSXtHEoGnqgJ3gmpPQR2DPLtS2N",
@@ -129,40 +134,32 @@ def download_aloha(root: str, dataset_id: str) -> Path:
        "aloha_sim_transfer_cube_human": ["top"],
        "aloha_sim_transfer_cube_scripted": ["top"],
    }
-    root = Path(root)
-    raw_dir: Path = root / f"{dataset_id}_raw"
-    if not raw_dir.is_dir():
-        import gdown

-        assert dataset_id in folder_urls
-        assert dataset_id in ep48_urls
-        assert dataset_id in ep49_urls
+    assert dataset_id in folder_urls
+    assert dataset_id in ep48_urls
+    assert dataset_id in ep49_urls

-        raw_dir.mkdir(parents=True, exist_ok=True)
+    raw_dir = Path(raw_dir)
+    raw_dir.mkdir(parents=True, exist_ok=True)

-        gdown.download_folder(folder_urls[dataset_id], output=str(raw_dir))
+    gdown.download_folder(folder_urls[dataset_id], output=str(raw_dir))

-        # because of the 50 files limit per directory, two files episode 48 and 49 were missing
-        gdown.download(ep48_urls[dataset_id], output=str(raw_dir / "episode_48.hdf5"), fuzzy=True)
-        gdown.download(ep49_urls[dataset_id], output=str(raw_dir / "episode_49.hdf5"), fuzzy=True)
-    return raw_dir
+    # because of the 50 files limit per directory, two files episode 48 and 49 were missing
+    gdown.download(ep48_urls[dataset_id], output=str(raw_dir / "episode_48.hdf5"), fuzzy=True)
+    gdown.download(ep49_urls[dataset_id], output=str(raw_dir / "episode_49.hdf5"), fuzzy=True)


-def download_umi(root: str, dataset_id: str) -> Path:
+def download_umi(raw_dir: Path):
    url_cup_in_the_wild = "https://real.stanford.edu/umi/data/zarr_datasets/cup_in_the_wild.zarr.zip"
-    cup_in_the_wild_zarr = Path("umi/cup_in_the_wild/cup_in_the_wild.zarr")
+    zarr_path = raw_dir / "cup_in_the_wild.zarr"

-    root = Path(root)
-    raw_dir: Path = root / f"{dataset_id}_raw"
-    zarr_path: Path = (raw_dir / cup_in_the_wild_zarr).resolve()
-    if not zarr_path.is_dir():
-        raw_dir.mkdir(parents=True, exist_ok=True)
-        download_and_extract_zip(url_cup_in_the_wild, zarr_path)
-    return zarr_path
+    raw_dir = Path(raw_dir)
+    raw_dir.mkdir(parents=True, exist_ok=True)
+    download_and_extract_zip(url_cup_in_the_wild, zarr_path)


 if __name__ == "__main__":
-    root = "data"
+    data_dir = Path("data")
    dataset_ids = [
        "pusht",
        "xarm_lift_medium",
@@ -176,4 +173,5 @@ if __name__ == "__main__":
        "umi_cup_in_the_wild",
    ]
    for dataset_id in dataset_ids:
-        download_raw(root=root, dataset_id=dataset_id)
+        raw_dir = data_dir / f"{dataset_id}_raw"
+        download_raw(raw_dir, dataset_id)
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
@@ -0,0 +1,163 @@
+"""
+Contains utilities to process raw data format of HDF5 files like in: https://github.com/tonyzhaozh/act
+"""
+
+import re
+import shutil
+from pathlib import Path
+
+import h5py
+import torch
+import tqdm
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+# TODO(rcadene): enable for PR video dataset
+# from lerobot.common.datasets.video_utils import encode_video_frames
+
+
+def check_format(raw_dir) -> bool:
+    cameras = ["top"]
+
+    hdf5_files: list[Path] = list(raw_dir.glob("episode_*.hdf5"))
+    assert len(hdf5_files) != 0
+    hdf5_files = sorted(hdf5_files, key=lambda x: int(re.search(r"episode_(\d+).hdf5", x.name).group(1)))
+
+    # Check if the sequence is consecutive eg episode_0, episode_1, episode_2, etc.
+    previous_number = None
+    for file in hdf5_files:
+        current_number = int(re.search(r"episode_(\d+).hdf5", file.name).group(1))
+        if previous_number is not None:
+            assert current_number - previous_number == 1
+        previous_number = current_number
+
+    for file in hdf5_files:
+        with h5py.File(file, "r") as file:
+            # Check for the expected datasets within the HDF5 file
+            required_datasets = ["/action", "/observations/qpos"]
+            # Add camera-specific image datasets to the required datasets
+            camera_datasets = [f"/observations/images/{cam}" for cam in cameras]
+            required_datasets.extend(camera_datasets)
+
+            assert all(dataset in file for dataset in required_datasets)
+
+
+def load_from_raw(raw_dir, out_dir, fps, video, debug):
+    hdf5_files = list(raw_dir.glob("*.hdf5"))
+    hdf5_files = sorted(hdf5_files, key=lambda x: int(re.search(r"episode_(\d+)", x.name).group(1)))
+    ep_dicts = []
+    episode_data_index = {"from": [], "to": []}
+
+    id_from = 0
+
+    for ep_path in tqdm.tqdm(hdf5_files):
+        with h5py.File(ep_path, "r") as ep:
+            ep_idx = int(re.search(r"episode_(\d+)", ep_path.name).group(1))
+            num_frames = ep["/action"].shape[0]
+
+            # last step of demonstration is considered done
+            done = torch.zeros(num_frames, dtype=torch.bool)
+            done[-1] = True
+
+            state = torch.from_numpy(ep["/observations/qpos"][:])
+            action = torch.from_numpy(ep["/action"][:])
+
+            ep_dict = {}
+
+            cameras = list(ep["/observations/images"].keys())
+            for cam in cameras:
+                img_key = f"observation.images.{cam}"
+                imgs_array = ep[f"/observations/images/{cam}"][:]  # b h w c
+                if video:
+                    # save png images in temporary directory
+                    tmp_imgs_dir = out_dir / "tmp_images"
+                    save_images_concurrently(imgs_array, tmp_imgs_dir)
+
+                    # encode images to a mp4 video
+                    video_path = out_dir / "videos" / f"{img_key}_episode_{ep_idx:06d}.mp4"
+                    encode_video_frames(tmp_imgs_dir, video_path, fps)  # noqa: F821
+
+                    # clean temporary images directory
+                    shutil.rmtree(tmp_imgs_dir)
+
+                    # store the episode idx
+                    ep_dict[img_key] = torch.tensor([ep_idx] * num_frames, dtype=torch.int)
+                else:
+                    ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
+
+            ep_dict["observation.state"] = state
+            ep_dict["action"] = action
+            ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
+            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+            ep_dict["next.done"] = done
+            # TODO(rcadene): compute reward and success
+            # ep_dict[""next.reward"] = reward
+            # ep_dict[""next.success"] = success
+
+            assert isinstance(ep_idx, int)
+            ep_dicts.append(ep_dict)
+
+            episode_data_index["from"].append(id_from)
+            episode_data_index["to"].append(id_from + num_frames)
+
+        id_from += num_frames
+
+        # process first episode only
+        if debug:
+            break
+
+    data_dict = concatenate_episodes(ep_dicts)
+    return data_dict, episode_data_index
+
+
+def to_hf_dataset(data_dict, video) -> Dataset:
+    features = {}
+
+    image_keys = [key for key in data_dict if "observation.images." in key]
+    for image_key in image_keys:
+        if video:
+            features[image_key] = Value(dtype="int64", id="video")
+        else:
+            features[image_key] = Image()
+
+    features["observation.state"] = Sequence(
+        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["action"] = Sequence(
+        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["episode_index"] = Value(dtype="int64", id=None)
+    features["frame_index"] = Value(dtype="int64", id=None)
+    features["timestamp"] = Value(dtype="float32", id=None)
+    features["next.done"] = Value(dtype="bool", id=None)
+    features["index"] = Value(dtype="int64", id=None)
+    # TODO(rcadene): add reward and success
+    # features["next.reward"] = Value(dtype="float32", id=None)
+    # features["next.success"] = Value(dtype="bool", id=None)
+
+    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
+    hf_dataset.set_transform(hf_transform_to_torch)
+    return hf_dataset
+
+
+def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
+    # sanity check
+    check_format(raw_dir)
+
+    if fps is None:
+        fps = 50
+
+    data_dir, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
+    hf_dataset = to_hf_dataset(data_dir, video)
+
+    info = {
+        "fps": fps,
+        "video": video,
+    }
+    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_processor.py
@@ -1,199 +0,0 @@
-import re
-from pathlib import Path
-
-import h5py
-import torch
-import tqdm
-from datasets import Dataset, Features, Image, Sequence, Value
-from PIL import Image as PILImage
-
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-
-
-class AlohaProcessor:
-    """
-    Process HDF5 files formatted like in: https://github.com/tonyzhaozh/act
-
-    Attributes:
-        folder_path (Path): Path to the directory containing HDF5 files.
-        cameras (list[str]): List of camera identifiers to check in the files.
-        fps (int): Frames per second used in timestamp calculations.
-
-    Methods:
-        is_valid() -> bool:
-            Validates if each HDF5 file within the folder contains all required datasets.
-        preprocess() -> dict:
-            Processes the files and returns structured data suitable for further analysis.
-        to_hf_dataset(data_dict: dict) -> Dataset:
-            Converts processed data into a Hugging Face Dataset object.
-    """
-
-    def __init__(self, folder_path: Path, cameras: list[str] | None = None, fps: int | None = None):
-        """
-        Initializes the AlohaProcessor with a specified directory path containing HDF5 files,
-        an optional list of cameras, and a frame rate.
-
-        Args:
-            folder_path (Path): The directory path where HDF5 files are stored.
-            cameras (list[str] | None): Optional list of cameras to validate within the files. Defaults to ['top'] if None.
-            fps (int): Frame rate for the datasets, used in time calculations. Default is 50.
-
-        Examples:
-            >>> processor = AlohaProcessor(Path("path_to_hdf5_directory"), ["camera1", "camera2"])
-            >>> processor.is_valid()
-            True
-        """
-        self.folder_path = folder_path
-        if cameras is None:
-            cameras = ["top"]
-        self.cameras = cameras
-        if fps is None:
-            fps = 50
-        self._fps = fps
-
-    @property
-    def fps(self) -> int:
-        return self._fps
-
-    def is_valid(self) -> bool:
-        """
-        Validates the HDF5 files in the specified folder to ensure they contain the required datasets
-        for actions, positions, and images for each specified camera.
-
-        Returns:
-            bool: True if all files are valid HDF5 files with all required datasets, False otherwise.
-        """
-        hdf5_files: list[Path] = list(self.folder_path.glob("episode_*.hdf5"))
-        if len(hdf5_files) == 0:
-            return False
-        try:
-            hdf5_files = sorted(
-                hdf5_files, key=lambda x: int(re.search(r"episode_(\d+).hdf5", x.name).group(1))
-            )
-        except AttributeError:
-            # All file names must contain a numerical identifier matching 'episode_(\\d+).hdf5
-            return False
-
-        # Check if the sequence is consecutive eg episode_0, episode_1, episode_2, etc.
-        # If not, return False
-        previous_number = None
-        for file in hdf5_files:
-            current_number = int(re.search(r"episode_(\d+).hdf5", file.name).group(1))
-            if previous_number is not None and current_number - previous_number != 1:
-                return False
-            previous_number = current_number
-
-        for file in hdf5_files:
-            try:
-                with h5py.File(file, "r") as file:
-                    # Check for the expected datasets within the HDF5 file
-                    required_datasets = ["/action", "/observations/qpos"]
-                    # Add camera-specific image datasets to the required datasets
-                    camera_datasets = [f"/observations/images/{cam}" for cam in self.cameras]
-                    required_datasets.extend(camera_datasets)
-
-                    if not all(dataset in file for dataset in required_datasets):
-                        return False
-            except OSError:
-                return False
-        return True
-
-    def preprocess(self):
-        """
-        Collects episode data from the HDF5 file and returns it as an AlohaStep named tuple.
-
-        Returns:
-            AlohaStep: Named tuple containing episode data.
-
-        Raises:
-            ValueError: If the file is not valid.
-        """
-        if not self.is_valid():
-            raise ValueError("The HDF5 file is invalid or does not contain the required datasets.")
-
-        hdf5_files = list(self.folder_path.glob("*.hdf5"))
-        hdf5_files = sorted(hdf5_files, key=lambda x: int(re.search(r"episode_(\d+)", x.name).group(1)))
-        ep_dicts = []
-        episode_data_index = {"from": [], "to": []}
-
-        id_from = 0
-
-        for ep_path in tqdm.tqdm(hdf5_files):
-            with h5py.File(ep_path, "r") as ep:
-                ep_id = int(re.search(r"episode_(\d+)", ep_path.name).group(1))
-                num_frames = ep["/action"].shape[0]
-
-                # last step of demonstration is considered done
-                done = torch.zeros(num_frames, dtype=torch.bool)
-                done[-1] = True
-
-                state = torch.from_numpy(ep["/observations/qpos"][:])
-                action = torch.from_numpy(ep["/action"][:])
-
-                ep_dict = {}
-
-                for cam in self.cameras:
-                    image = torch.from_numpy(ep[f"/observations/images/{cam}"][:])  # b h w c
-                    ep_dict[f"observation.images.{cam}"] = [PILImage.fromarray(x.numpy()) for x in image]
-
-                ep_dict.update(
-                    {
-                        "observation.state": state,
-                        "action": action,
-                        "episode_index": torch.tensor([ep_id] * num_frames),
-                        "frame_index": torch.arange(0, num_frames, 1),
-                        "timestamp": torch.arange(0, num_frames, 1) / self.fps,
-                        # TODO(rcadene): compute reward and success
-                        # "next.reward": reward,
-                        "next.done": done,
-                        # "next.success": success,
-                    }
-                )
-
-                assert isinstance(ep_id, int)
-                ep_dicts.append(ep_dict)
-
-                episode_data_index["from"].append(id_from)
-                episode_data_index["to"].append(id_from + num_frames)
-
-            id_from += num_frames
-
-        data_dict = concatenate_episodes(ep_dicts)
-        return data_dict, episode_data_index
-
-    def to_hf_dataset(self, data_dict) -> Dataset:
-        """
-        Converts a dictionary of data into a Hugging Face Dataset object.
-
-        Args:
-            data_dict (dict): A dictionary containing the data to be converted.
-
-        Returns:
-            Dataset: The converted Hugging Face Dataset object.
-        """
-        image_features = {f"observation.images.{cam}": Image() for cam in self.cameras}
-        features = {
-            "observation.state": Sequence(
-                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
-            ),
-            "action": Sequence(length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)),
-            "episode_index": Value(dtype="int64", id=None),
-            "frame_index": Value(dtype="int64", id=None),
-            "timestamp": Value(dtype="float32", id=None),
-            # "next.reward": Value(dtype="float32", id=None),
-            "next.done": Value(dtype="bool", id=None),
-            # "next.success": Value(dtype="bool", id=None),
-            "index": Value(dtype="int64", id=None),
-        }
-        update_features = {**image_features, **features}
-        features = Features(update_features)
-        hf_dataset = Dataset.from_dict(data_dict, features=features)
-        hf_dataset.set_transform(hf_transform_to_torch)
-
-        return hf_dataset
-
-    def cleanup(self):
-        pass
--- a/lerobot/common/datasets/push_dataset_to_hub/pusht_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/pusht_processor.py
@@ -1,180 +0,0 @@
-from pathlib import Path
-
-import numpy as np
-import torch
-import tqdm
-import zarr
-from datasets import Dataset, Features, Image, Sequence, Value
-from PIL import Image as PILImage
-
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-
-
-class PushTProcessor:
-    """Process zarr files formatted like in: https://github.com/real-stanford/diffusion_policy"""
-
-    def __init__(self, folder_path: Path, fps: int | None = None):
-        self.zarr_path = folder_path
-        if fps is None:
-            fps = 10
-        self._fps = fps
-
-    @property
-    def fps(self) -> int:
-        return self._fps
-
-    def is_valid(self):
-        try:
-            zarr_data = zarr.open(self.zarr_path, mode="r")
-        except Exception:
-            # TODO (azouitine): Handle the exception properly
-            return False
-        required_datasets = {
-            "data/action",
-            "data/img",
-            "data/keypoint",
-            "data/n_contacts",
-            "data/state",
-            "meta/episode_ends",
-        }
-        for dataset in required_datasets:
-            if dataset not in zarr_data:
-                return False
-        nb_frames = zarr_data["data/img"].shape[0]
-
-        required_datasets.remove("meta/episode_ends")
-
-        return all(nb_frames == zarr_data[dataset].shape[0] for dataset in required_datasets)
-
-    def preprocess(self):
-        try:
-            import pymunk
-            from gym_pusht.envs.pusht import PushTEnv, pymunk_to_shapely
-
-            from lerobot.common.datasets.push_dataset_to_hub._diffusion_policy_replay_buffer import (
-                ReplayBuffer as DiffusionPolicyReplayBuffer,
-            )
-        except ModuleNotFoundError as e:
-            print("`gym_pusht` is not installed. Please install it with `pip install 'lerobot[gym_pusht]'`")
-            raise e
-
-        # as define in env
-        success_threshold = 0.95  # 95% coverage,
-
-        dataset_dict = DiffusionPolicyReplayBuffer.copy_from_path(
-            self.zarr_path
-        )  # , keys=['img', 'state', 'action'])
-
-        episode_ids = torch.from_numpy(dataset_dict.get_episode_idxs())
-        num_episodes = dataset_dict.meta["episode_ends"].shape[0]
-        assert len(
-            {dataset_dict[key].shape[0] for key in dataset_dict.keys()}  # noqa: SIM118
-        ), "Some data type dont have the same number of total frames."
-
-        # TODO: verify that goal pose is expected to be fixed
-        goal_pos_angle = np.array([256, 256, np.pi / 4])  # x, y, theta (in radians)
-        goal_body = PushTEnv.get_goal_pose_body(goal_pos_angle)
-
-        imgs = torch.from_numpy(dataset_dict["img"])  # b h w c
-        states = torch.from_numpy(dataset_dict["state"])
-        actions = torch.from_numpy(dataset_dict["action"])
-
-        ep_dicts = []
-        episode_data_index = {"from": [], "to": []}
-
-        id_from = 0
-        for episode_id in tqdm.tqdm(range(num_episodes)):
-            id_to = dataset_dict.meta["episode_ends"][episode_id]
-
-            num_frames = id_to - id_from
-
-            assert (episode_ids[id_from:id_to] == episode_id).all()
-
-            image = imgs[id_from:id_to]
-            assert image.min() >= 0.0
-            assert image.max() <= 255.0
-            image = image.type(torch.uint8)
-
-            state = states[id_from:id_to]
-            agent_pos = state[:, :2]
-            block_pos = state[:, 2:4]
-            block_angle = state[:, 4]
-
-            reward = torch.zeros(num_frames)
-            success = torch.zeros(num_frames, dtype=torch.bool)
-            done = torch.zeros(num_frames, dtype=torch.bool)
-            for i in range(num_frames):
-                space = pymunk.Space()
-                space.gravity = 0, 0
-                space.damping = 0
-
-                # Add walls.
-                walls = [
-                    PushTEnv.add_segment(space, (5, 506), (5, 5), 2),
-                    PushTEnv.add_segment(space, (5, 5), (506, 5), 2),
-                    PushTEnv.add_segment(space, (506, 5), (506, 506), 2),
-                    PushTEnv.add_segment(space, (5, 506), (506, 506), 2),
-                ]
-                space.add(*walls)
-
-                block_body = PushTEnv.add_tee(space, block_pos[i].tolist(), block_angle[i].item())
-                goal_geom = pymunk_to_shapely(goal_body, block_body.shapes)
-                block_geom = pymunk_to_shapely(block_body, block_body.shapes)
-                intersection_area = goal_geom.intersection(block_geom).area
-                goal_area = goal_geom.area
-                coverage = intersection_area / goal_area
-                reward[i] = np.clip(coverage / success_threshold, 0, 1)
-                success[i] = coverage > success_threshold
-
-            # last step of demonstration is considered done
-            done[-1] = True
-
-            ep_dict = {
-                "observation.image": [PILImage.fromarray(x.numpy()) for x in image],
-                "observation.state": agent_pos,
-                "action": actions[id_from:id_to],
-                "episode_index": torch.tensor([episode_id] * num_frames, dtype=torch.int),
-                "frame_index": torch.arange(0, num_frames, 1),
-                "timestamp": torch.arange(0, num_frames, 1) / self.fps,
-                # "next.observation.image": image[1:],
-                # "next.observation.state": agent_pos[1:],
-                # TODO(rcadene): verify that reward and done are aligned with image and agent_pos
-                "next.reward": torch.cat([reward[1:], reward[[-1]]]),
-                "next.done": torch.cat([done[1:], done[[-1]]]),
-                "next.success": torch.cat([success[1:], success[[-1]]]),
-            }
-            ep_dicts.append(ep_dict)
-
-            episode_data_index["from"].append(id_from)
-            episode_data_index["to"].append(id_from + num_frames)
-
-            id_from += num_frames
-
-        data_dict = concatenate_episodes(ep_dicts)
-        return data_dict, episode_data_index
-
-    def to_hf_dataset(self, data_dict):
-        features = {
-            "observation.image": Image(),
-            "observation.state": Sequence(
-                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
-            ),
-            "action": Sequence(length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)),
-            "episode_index": Value(dtype="int64", id=None),
-            "frame_index": Value(dtype="int64", id=None),
-            "timestamp": Value(dtype="float32", id=None),
-            "next.reward": Value(dtype="float32", id=None),
-            "next.done": Value(dtype="bool", id=None),
-            "next.success": Value(dtype="bool", id=None),
-            "index": Value(dtype="int64", id=None),
-        }
-        features = Features(features)
-        hf_dataset = Dataset.from_dict(data_dict, features=features)
-        hf_dataset.set_transform(hf_transform_to_torch)
-        return hf_dataset
-
-    def cleanup(self):
-        pass
--- a/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py
@@ -0,0 +1,214 @@
+"""Process zarr files formatted like in: https://github.com/real-stanford/diffusion_policy"""
+
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+import tqdm
+import zarr
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+# TODO(rcadene): enable for PR video dataset
+# from lerobot.common.datasets.video_utils import encode_video_frames
+
+
+def check_format(raw_dir):
+    zarr_path = raw_dir / "pusht_cchi_v7_replay.zarr"
+    zarr_data = zarr.open(zarr_path, mode="r")
+
+    required_datasets = {
+        "data/action",
+        "data/img",
+        "data/keypoint",
+        "data/n_contacts",
+        "data/state",
+        "meta/episode_ends",
+    }
+    for dataset in required_datasets:
+        assert dataset in zarr_data
+    nb_frames = zarr_data["data/img"].shape[0]
+
+    required_datasets.remove("meta/episode_ends")
+
+    assert all(nb_frames == zarr_data[dataset].shape[0] for dataset in required_datasets)
+
+
+def load_from_raw(raw_dir, out_dir, fps, video, debug):
+    try:
+        import pymunk
+        from gym_pusht.envs.pusht import PushTEnv, pymunk_to_shapely
+
+        from lerobot.common.datasets.push_dataset_to_hub._diffusion_policy_replay_buffer import (
+            ReplayBuffer as DiffusionPolicyReplayBuffer,
+        )
+    except ModuleNotFoundError as e:
+        print("`gym_pusht` is not installed. Please install it with `pip install 'lerobot[gym_pusht]'`")
+        raise e
+    # as define in gmy-pusht env: https://github.com/huggingface/gym-pusht/blob/e0684ff988d223808c0a9dcfaba9dc4991791370/gym_pusht/envs/pusht.py#L174
+    success_threshold = 0.95  # 95% coverage,
+
+    zarr_path = raw_dir / "pusht_cchi_v7_replay.zarr"
+    zarr_data = DiffusionPolicyReplayBuffer.copy_from_path(zarr_path)
+
+    episode_ids = torch.from_numpy(zarr_data.get_episode_idxs())
+    num_episodes = zarr_data.meta["episode_ends"].shape[0]
+    assert len(
+        {zarr_data[key].shape[0] for key in zarr_data.keys()}  # noqa: SIM118
+    ), "Some data type dont have the same number of total frames."
+
+    # TODO(rcadene): verify that goal pose is expected to be fixed
+    goal_pos_angle = np.array([256, 256, np.pi / 4])  # x, y, theta (in radians)
+    goal_body = PushTEnv.get_goal_pose_body(goal_pos_angle)
+
+    imgs = torch.from_numpy(zarr_data["img"])  # b h w c
+    states = torch.from_numpy(zarr_data["state"])
+    actions = torch.from_numpy(zarr_data["action"])
+
+    ep_dicts = []
+    episode_data_index = {"from": [], "to": []}
+
+    id_from = 0
+    for ep_idx in tqdm.tqdm(range(num_episodes)):
+        id_to = zarr_data.meta["episode_ends"][ep_idx]
+        num_frames = id_to - id_from
+
+        # sanity check
+        assert (episode_ids[id_from:id_to] == ep_idx).all()
+
+        # get image
+        image = imgs[id_from:id_to]
+        assert image.min() >= 0.0
+        assert image.max() <= 255.0
+        image = image.type(torch.uint8)
+
+        # get state
+        state = states[id_from:id_to]
+        agent_pos = state[:, :2]
+        block_pos = state[:, 2:4]
+        block_angle = state[:, 4]
+
+        # get reward, success, done
+        reward = torch.zeros(num_frames)
+        success = torch.zeros(num_frames, dtype=torch.bool)
+        done = torch.zeros(num_frames, dtype=torch.bool)
+        for i in range(num_frames):
+            space = pymunk.Space()
+            space.gravity = 0, 0
+            space.damping = 0
+
+            # Add walls.
+            walls = [
+                PushTEnv.add_segment(space, (5, 506), (5, 5), 2),
+                PushTEnv.add_segment(space, (5, 5), (506, 5), 2),
+                PushTEnv.add_segment(space, (506, 5), (506, 506), 2),
+                PushTEnv.add_segment(space, (5, 506), (506, 506), 2),
+            ]
+            space.add(*walls)
+
+            block_body = PushTEnv.add_tee(space, block_pos[i].tolist(), block_angle[i].item())
+            goal_geom = pymunk_to_shapely(goal_body, block_body.shapes)
+            block_geom = pymunk_to_shapely(block_body, block_body.shapes)
+            intersection_area = goal_geom.intersection(block_geom).area
+            goal_area = goal_geom.area
+            coverage = intersection_area / goal_area
+            reward[i] = np.clip(coverage / success_threshold, 0, 1)
+            success[i] = coverage > success_threshold
+
+        # last step of demonstration is considered done
+        done[-1] = True
+
+        ep_dict = {}
+
+        imgs_array = [x.numpy() for x in image]
+        if video:
+            # save png images in temporary directory
+            tmp_imgs_dir = out_dir / "tmp_images"
+            save_images_concurrently(imgs_array, tmp_imgs_dir)
+
+            # encode images to a mp4 video
+            video_path = out_dir / "videos" / f"observation.image_episode_{ep_idx:06d}.mp4"
+            encode_video_frames(tmp_imgs_dir, video_path, fps)  # noqa: F821
+
+            # clean temporary images directory
+            shutil.rmtree(tmp_imgs_dir)
+
+            # store the episode index
+            ep_dict["observation.image"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int)
+        else:
+            ep_dict["observation.image"] = [PILImage.fromarray(x) for x in imgs_array]
+
+        ep_dict["observation.state"] = agent_pos
+        ep_dict["action"] = actions[id_from:id_to]
+        ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int)
+        ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+        ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+        # ep_dict["next.observation.image"] = image[1:],
+        # ep_dict["next.observation.state"] = agent_pos[1:],
+        # TODO(rcadene)] = verify that reward and done are aligned with image and agent_pos
+        ep_dict["next.reward"] = torch.cat([reward[1:], reward[[-1]]])
+        ep_dict["next.done"] = torch.cat([done[1:], done[[-1]]])
+        ep_dict["next.success"] = torch.cat([success[1:], success[[-1]]])
+        ep_dicts.append(ep_dict)
+
+        episode_data_index["from"].append(id_from)
+        episode_data_index["to"].append(id_from + num_frames)
+
+        id_from += num_frames
+
+        # process first episode only
+        if debug:
+            break
+
+    data_dict = concatenate_episodes(ep_dicts)
+    return data_dict, episode_data_index
+
+
+def to_hf_dataset(data_dict, video):
+    features = {}
+
+    if video:
+        features["observation.image"] = Value(dtype="int64", id="video")
+    else:
+        features["observation.image"] = Image()
+
+    features["observation.state"] = Sequence(
+        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["action"] = Sequence(
+        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["episode_index"] = Value(dtype="int64", id=None)
+    features["frame_index"] = Value(dtype="int64", id=None)
+    features["timestamp"] = Value(dtype="float32", id=None)
+    features["next.reward"] = Value(dtype="float32", id=None)
+    features["next.done"] = Value(dtype="bool", id=None)
+    features["next.success"] = Value(dtype="bool", id=None)
+    features["index"] = Value(dtype="int64", id=None)
+
+    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
+    hf_dataset.set_transform(hf_transform_to_torch)
+    return hf_dataset
+
+
+def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
+    # sanity check
+    check_format(raw_dir)
+
+    if fps is None:
+        fps = 10
+
+    data_dict, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
+    hf_dataset = to_hf_dataset(data_dict, video)
+
+    info = {
+        "fps": fps,
+        "video": video,
+    }
+    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/umi_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/umi_processor.py
@@ -1,280 +0,0 @@
-import os
-import re
-import shutil
-from glob import glob
-
-import numpy as np
-import torch
-import tqdm
-import zarr
-from datasets import Dataset, Features, Image, Sequence, Value
-from PIL import Image as PILImage
-
-from lerobot.common.datasets.push_dataset_to_hub._umi_imagecodecs_numcodecs import register_codecs
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-
-
-class UmiProcessor:
-    """
-     Process UMI (Universal Manipulation Interface) data stored in Zarr format like in: https://github.com/real-stanford/universal_manipulation_interface
-
-    Attributes:
-        folder_path (str): The path to the folder containing Zarr datasets.
-        fps (int): Frames per second, used to calculate timestamps for frames.
-
-    """
-
-    def __init__(self, folder_path: str, fps: int | None = None):
-        self.zarr_path = folder_path
-        if fps is None:
-            # https://arxiv.org/pdf/2402.10329#table.caption.16
-            fps = 10  # For umi cup in the wild
-        self._fps = fps
-        register_codecs()
-
-    @property
-    def fps(self) -> int:
-        return self._fps
-
-    def is_valid(self) -> bool:
-        """
-        Validates the Zarr folder to ensure it contains all required datasets with consistent frame counts.
-
-        Returns:
-            bool: True if all required datasets are present and have consistent frame counts, False otherwise.
-        """
-        # Check if the Zarr folder is valid
-        try:
-            zarr_data = zarr.open(self.zarr_path, mode="r")
-        except Exception:
-            # TODO (azouitine): Handle the exception properly
-            return False
-        required_datasets = {
-            "data/robot0_demo_end_pose",
-            "data/robot0_demo_start_pose",
-            "data/robot0_eef_pos",
-            "data/robot0_eef_rot_axis_angle",
-            "data/robot0_gripper_width",
-            "meta/episode_ends",
-            "data/camera0_rgb",
-        }
-        for dataset in required_datasets:
-            if dataset not in zarr_data:
-                return False
-        nb_frames = zarr_data["data/camera0_rgb"].shape[0]
-
-        required_datasets.remove("meta/episode_ends")
-
-        return all(nb_frames == zarr_data[dataset].shape[0] for dataset in required_datasets)
-
-    def preprocess(self):
-        """
-        Collects and processes all episodes from the Zarr dataset into structured data dictionaries.
-
-        Returns:
-            Tuple[Dict, Dict]: A tuple containing the structured episode data and episode index mappings.
-        """
-        zarr_data = zarr.open(self.zarr_path, mode="r")
-
-        # We process the image data separately because it is too large to fit in memory
-        end_pose = torch.from_numpy(zarr_data["data/robot0_demo_end_pose"][:])
-        start_pos = torch.from_numpy(zarr_data["data/robot0_demo_start_pose"][:])
-        eff_pos = torch.from_numpy(zarr_data["data/robot0_eef_pos"][:])
-        eff_rot_axis_angle = torch.from_numpy(zarr_data["data/robot0_eef_rot_axis_angle"][:])
-        gripper_width = torch.from_numpy(zarr_data["data/robot0_gripper_width"][:])
-
-        states_pos = torch.cat([eff_pos, eff_rot_axis_angle], dim=1)
-        states = torch.cat([states_pos, gripper_width], dim=1)
-
-        episode_ends = zarr_data["meta/episode_ends"][:]
-        num_episodes: int = episode_ends.shape[0]
-
-        episode_ids = torch.from_numpy(self.get_episode_idxs(episode_ends))
-
-        # We convert it in torch tensor later because the jit function does not support torch tensors
-        episode_ends = torch.from_numpy(episode_ends)
-
-        ep_dicts = []
-        episode_data_index = {"from": [], "to": []}
-        id_from = 0
-
-        for episode_id in tqdm.tqdm(range(num_episodes)):
-            id_to = episode_ends[episode_id]
-
-            num_frames = id_to - id_from
-
-            assert (
-                episode_ids[id_from:id_to] == episode_id
-            ).all(), f"episode_ids[{id_from}:{id_to}] != {episode_id}"
-
-            state = states[id_from:id_to]
-            ep_dict = {
-                # observation.image will be filled later
-                "observation.state": state,
-                "episode_index": torch.tensor([episode_id] * num_frames, dtype=torch.int),
-                "frame_index": torch.arange(0, num_frames, 1),
-                "timestamp": torch.arange(0, num_frames, 1) / self.fps,
-                "episode_data_index_from": torch.tensor([id_from] * num_frames),
-                "episode_data_index_to": torch.tensor([id_from + num_frames] * num_frames),
-                "end_pose": end_pose[id_from:id_to],
-                "start_pos": start_pos[id_from:id_to],
-                "gripper_width": gripper_width[id_from:id_to],
-            }
-            ep_dicts.append(ep_dict)
-            episode_data_index["from"].append(id_from)
-            episode_data_index["to"].append(id_from + num_frames)
-            id_from += num_frames
-
-        data_dict = concatenate_episodes(ep_dicts)
-
-        total_frames = id_from
-        data_dict["index"] = torch.arange(0, total_frames, 1)
-
-        print("Saving images to disk in temporary folder...")
-        # datasets.Image() can take a list of paths to images, so we save the images to a temporary folder
-        # to avoid loading them all in memory
-        _save_images_concurrently(
-            data=zarr_data, image_key="data/camera0_rgb", folder_path="tmp_umi_images", max_workers=12
-        )
-        print("Saving images to disk in temporary folder... Done")
-
-        # Sort files by number eg. 1.png, 2.png, 3.png, 9.png, 10.png instead of 1.png, 10.png, 2.png, 3.png, 9.png
-        # to correctly match the images with the data
-        images_path = sorted(
-            glob("tmp_umi_images/*"), key=lambda x: int(re.search(r"(\d+)\.png$", x).group(1))
-        )
-        data_dict["observation.image"] = images_path
-        print("Images saved to disk, do not forget to delete the folder tmp_umi_images/")
-
-        # Cleanup
-        return data_dict, episode_data_index
-
-    def to_hf_dataset(self, data_dict):
-        """
-        Converts the processed data dictionary into a Hugging Face dataset with defined features.
-
-        Args:
-            data_dict (Dict): The data dictionary containing tensors and episode information.
-
-        Returns:
-            Dataset: A Hugging Face dataset constructed from the provided data dictionary.
-        """
-        features = {
-            "observation.image": Image(),
-            "observation.state": Sequence(
-                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
-            ),
-            "episode_index": Value(dtype="int64", id=None),
-            "frame_index": Value(dtype="int64", id=None),
-            "timestamp": Value(dtype="float32", id=None),
-            "index": Value(dtype="int64", id=None),
-            "episode_data_index_from": Value(dtype="int64", id=None),
-            "episode_data_index_to": Value(dtype="int64", id=None),
-            # `start_pos` and `end_pos` respectively represent the positions of the end-effector
-            # at the beginning and the end of the episode.
-            # `gripper_width` indicates the distance between the grippers, and this value is included
-            # in the state vector, which comprises the concatenation of the end-effector position
-            # and gripper width.
-            "end_pose": Sequence(
-                length=data_dict["end_pose"].shape[1], feature=Value(dtype="float32", id=None)
-            ),
-            "start_pos": Sequence(
-                length=data_dict["start_pos"].shape[1], feature=Value(dtype="float32", id=None)
-            ),
-            "gripper_width": Sequence(
-                length=data_dict["gripper_width"].shape[1], feature=Value(dtype="float32", id=None)
-            ),
-        }
-        features = Features(features)
-        hf_dataset = Dataset.from_dict(data_dict, features=features)
-        hf_dataset.set_transform(hf_transform_to_torch)
-
-        return hf_dataset
-
-    def cleanup(self):
-        # Cleanup
-        if os.path.exists("tmp_umi_images"):
-            print("Removing temporary images folder")
-            shutil.rmtree("tmp_umi_images")
-            print("Cleanup done")
-
-    @classmethod
-    def get_episode_idxs(cls, episode_ends: np.ndarray) -> np.ndarray:
-        # Optimized and simplified version of this function: https://github.com/real-stanford/universal_manipulation_interface/blob/298776ce251f33b6b3185a98d6e7d1f9ad49168b/diffusion_policy/common/replay_buffer.py#L374
-        from numba import jit
-
-        @jit(nopython=True)
-        def _get_episode_idxs(episode_ends):
-            result = np.zeros((episode_ends[-1],), dtype=np.int64)
-            start_idx = 0
-            for episode_number, end_idx in enumerate(episode_ends):
-                result[start_idx:end_idx] = episode_number
-                start_idx = end_idx
-            return result
-
-        return _get_episode_idxs(episode_ends)
-
-
-def _clear_folder(folder_path: str):
-    """
-    Clears all the content of the specified folder. Creates the folder if it does not exist.
-
-    Args:
-    folder_path (str): Path to the folder to clear.
-
-    Examples:
-    >>> import os
-    >>> os.makedirs('example_folder', exist_ok=True)
-    >>> with open('example_folder/temp_file.txt', 'w') as f:
-    ...     f.write('example')
-    >>> clear_folder('example_folder')
-    >>> os.listdir('example_folder')
-    []
-    """
-    if os.path.exists(folder_path):
-        for filename in os.listdir(folder_path):
-            file_path = os.path.join(folder_path, filename)
-            try:
-                if os.path.isfile(file_path) or os.path.islink(file_path):
-                    os.unlink(file_path)
-                elif os.path.isdir(file_path):
-                    shutil.rmtree(file_path)
-            except Exception as e:
-                print(f"Failed to delete {file_path}. Reason: {e}")
-    else:
-        os.makedirs(folder_path)
-
-
-def _save_image(img_array: np.array, i: int, folder_path: str):
-    """
-    Saves a single image to the specified folder.
-
-    Args:
-    img_array (ndarray): The numpy array of the image.
-    i (int): Index of the image, used for naming.
-    folder_path (str): Path to the folder where the image will be saved.
-    """
-    img = PILImage.fromarray(img_array)
-    img_format = "PNG" if img_array.dtype == np.uint8 else "JPEG"
-    img.save(os.path.join(folder_path, f"{i}.{img_format.lower()}"), quality=100)
-
-
-def _save_images_concurrently(data: dict, image_key: str, folder_path: str, max_workers: int = 4):
-    from concurrent.futures import ThreadPoolExecutor
-
-    """
-    Saves images from the zarr_data to the specified folder using multithreading.
-
-    Args:
-    zarr_data (dict): A dictionary containing image data in an array format.
-    folder_path (str): Path to the folder where images will be saved.
-    max_workers (int): The maximum number of threads to use for saving images.
-    """
-    num_images = len(data["data/camera0_rgb"])
-    _clear_folder(folder_path)  # Clear or create folder first
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        [executor.submit(_save_image, data[image_key][i], i, folder_path) for i in range(num_images)]
--- a/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py
@@ -0,0 +1,207 @@
+"""Process UMI (Universal Manipulation Interface) data stored in Zarr format like in: https://github.com/real-stanford/universal_manipulation_interface"""
+
+import logging
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+import tqdm
+import zarr
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub._umi_imagecodecs_numcodecs import register_codecs
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+# TODO(rcadene): enable for PR video dataset
+# from lerobot.common.datasets.video_utils import encode_video_frames
+
+
+def check_format(raw_dir) -> bool:
+    zarr_path = raw_dir / "cup_in_the_wild.zarr"
+    zarr_data = zarr.open(zarr_path, mode="r")
+
+    required_datasets = {
+        "data/robot0_demo_end_pose",
+        "data/robot0_demo_start_pose",
+        "data/robot0_eef_pos",
+        "data/robot0_eef_rot_axis_angle",
+        "data/robot0_gripper_width",
+        "meta/episode_ends",
+        "data/camera0_rgb",
+    }
+    for dataset in required_datasets:
+        if dataset not in zarr_data:
+            return False
+
+    # mandatory to access zarr_data
+    register_codecs()
+    nb_frames = zarr_data["data/camera0_rgb"].shape[0]
+
+    required_datasets.remove("meta/episode_ends")
+    assert all(nb_frames == zarr_data[dataset].shape[0] for dataset in required_datasets)
+
+
+def get_episode_idxs(episode_ends: np.ndarray) -> np.ndarray:
+    # Optimized and simplified version of this function: https://github.com/real-stanford/universal_manipulation_interface/blob/298776ce251f33b6b3185a98d6e7d1f9ad49168b/diffusion_policy/common/replay_buffer.py#L374
+    from numba import jit
+
+    @jit(nopython=True)
+    def _get_episode_idxs(episode_ends):
+        result = np.zeros((episode_ends[-1],), dtype=np.int64)
+        start_idx = 0
+        for episode_number, end_idx in enumerate(episode_ends):
+            result[start_idx:end_idx] = episode_number
+            start_idx = end_idx
+        return result
+
+    return _get_episode_idxs(episode_ends)
+
+
+def load_from_raw(raw_dir, out_dir, fps, video, debug):
+    zarr_path = raw_dir / "cup_in_the_wild.zarr"
+    zarr_data = zarr.open(zarr_path, mode="r")
+
+    # We process the image data separately because it is too large to fit in memory
+    end_pose = torch.from_numpy(zarr_data["data/robot0_demo_end_pose"][:])
+    start_pos = torch.from_numpy(zarr_data["data/robot0_demo_start_pose"][:])
+    eff_pos = torch.from_numpy(zarr_data["data/robot0_eef_pos"][:])
+    eff_rot_axis_angle = torch.from_numpy(zarr_data["data/robot0_eef_rot_axis_angle"][:])
+    gripper_width = torch.from_numpy(zarr_data["data/robot0_gripper_width"][:])
+
+    states_pos = torch.cat([eff_pos, eff_rot_axis_angle], dim=1)
+    states = torch.cat([states_pos, gripper_width], dim=1)
+
+    episode_ends = zarr_data["meta/episode_ends"][:]
+    num_episodes = episode_ends.shape[0]
+
+    episode_ids = torch.from_numpy(get_episode_idxs(episode_ends))
+
+    # We convert it in torch tensor later because the jit function does not support torch tensors
+    episode_ends = torch.from_numpy(episode_ends)
+
+    ep_dicts = []
+    episode_data_index = {"from": [], "to": []}
+
+    id_from = 0
+    for ep_idx in tqdm.tqdm(range(num_episodes)):
+        id_to = episode_ends[ep_idx]
+        num_frames = id_to - id_from
+
+        # sanity heck
+        assert (episode_ids[id_from:id_to] == ep_idx).all()
+
+        # TODO(rcadene): save temporary images of the episode?
+
+        state = states[id_from:id_to]
+
+        ep_dict = {}
+
+        # load 57MB of images in RAM (400x224x224x3 uint8)
+        imgs_array = zarr_data["data/camera0_rgb"][id_from:id_to]
+        if video:
+            # save png images in temporary directory
+            tmp_imgs_dir = out_dir / "tmp_images"
+            save_images_concurrently(imgs_array, tmp_imgs_dir)
+
+            # encode images to a mp4 video
+            video_path = out_dir / "videos" / f"observation.image_episode_{ep_idx:06d}.mp4"
+            encode_video_frames(tmp_imgs_dir, video_path, fps)  # noqa: F821
+
+            # clean temporary images directory
+            shutil.rmtree(tmp_imgs_dir)
+
+            # store the episode index
+            ep_dict["observation.image"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int)
+        else:
+            ep_dict["observation.image"] = [PILImage.fromarray(x) for x in imgs_array]
+
+        ep_dict["observation.state"] = state
+        ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int)
+        ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+        ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+        ep_dict["episode_data_index_from"] = torch.tensor([id_from] * num_frames)
+        ep_dict["episode_data_index_to"] = torch.tensor([id_from + num_frames] * num_frames)
+        ep_dict["end_pose"] = end_pose[id_from:id_to]
+        ep_dict["start_pos"] = start_pos[id_from:id_to]
+        ep_dict["gripper_width"] = gripper_width[id_from:id_to]
+        ep_dicts.append(ep_dict)
+
+        episode_data_index["from"].append(id_from)
+        episode_data_index["to"].append(id_from + num_frames)
+        id_from += num_frames
+
+        # process first episode only
+        if debug:
+            break
+
+    data_dict = concatenate_episodes(ep_dicts)
+
+    total_frames = id_from
+    data_dict["index"] = torch.arange(0, total_frames, 1)
+
+    return data_dict, episode_data_index
+
+
+def to_hf_dataset(data_dict, video):
+    features = {}
+
+    if video:
+        features["observation.image"] = Value(dtype="int64", id="video")
+    else:
+        features["observation.image"] = Image()
+
+    features["observation.state"] = Sequence(
+        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["episode_index"] = Value(dtype="int64", id=None)
+    features["frame_index"] = Value(dtype="int64", id=None)
+    features["timestamp"] = Value(dtype="float32", id=None)
+    features["index"] = Value(dtype="int64", id=None)
+    features["episode_data_index_from"] = Value(dtype="int64", id=None)
+    features["episode_data_index_to"] = Value(dtype="int64", id=None)
+    # `start_pos` and `end_pos` respectively represent the positions of the end-effector
+    # at the beginning and the end of the episode.
+    # `gripper_width` indicates the distance between the grippers, and this value is included
+    # in the state vector, which comprises the concatenation of the end-effector position
+    # and gripper width.
+    features["end_pose"] = Sequence(
+        length=data_dict["end_pose"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["start_pos"] = Sequence(
+        length=data_dict["start_pos"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["gripper_width"] = Sequence(
+        length=data_dict["gripper_width"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+
+    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
+    hf_dataset.set_transform(hf_transform_to_torch)
+    return hf_dataset
+
+
+def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
+    # sanity check
+    check_format(raw_dir)
+
+    if fps is None:
+        # For umi cup in the wild: https://arxiv.org/pdf/2402.10329#table.caption.16
+        fps = 10
+
+    if not video:
+        logging.warning(
+            "Generating UMI dataset without `video=True` creates ~150GB on disk and requires ~80GB in RAM."
+        )
+
+    data_dict, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
+    hf_dataset = to_hf_dataset(data_dict, video)
+
+    info = {
+        "fps": fps,
+        "video": video,
+    }
+    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/utils.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/utils.py
@@ -1,3 +1,8 @@
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import numpy
+import PIL
 import torch


@@ -18,3 +23,16 @@ def concatenate_episodes(ep_dicts):
    total_frames = data_dict["frame_index"].shape[0]
    data_dict["index"] = torch.arange(0, total_frames, 1)
    return data_dict
+
+
+def save_images_concurrently(imgs_array: numpy.array, out_dir: Path, max_workers: int = 4):
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    def save_image(img_array, i, out_dir):
+        img = PIL.Image.fromarray(img_array)
+        img.save(str(out_dir / f"frame_{i:06d}.png"), quality=100)
+
+    num_images = len(imgs_array)
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        [executor.submit(save_image, imgs_array[i], i, out_dir) for i in range(num_images)]
--- a/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py
@@ -0,0 +1,163 @@
+"""Process pickle files formatted like in: https://github.com/fyhMer/fowm"""
+
+import pickle
+import shutil
+from pathlib import Path
+
+import einops
+import torch
+import tqdm
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+# TODO(rcadene): enable for PR video dataset
+# from lerobot.common.datasets.video_utils import encode_video_frames
+
+
+def check_format(raw_dir):
+    keys = {"actions", "rewards", "dones"}
+    nested_keys = {"observations": {"rgb", "state"}, "next_observations": {"rgb", "state"}}
+
+    xarm_files = list(raw_dir.glob("*.pkl"))
+    assert len(xarm_files) > 0
+
+    with open(xarm_files[0], "rb") as f:
+        dataset_dict = pickle.load(f)
+
+    assert isinstance(dataset_dict, dict)
+    assert all(k in dataset_dict for k in keys)
+
+    # Check for consistent lengths in nested keys
+    expected_len = len(dataset_dict["actions"])
+    assert all(len(dataset_dict[key]) == expected_len for key in keys if key in dataset_dict)
+
+    for key, subkeys in nested_keys.items():
+        nested_dict = dataset_dict.get(key, {})
+        assert all(len(nested_dict[subkey]) == expected_len for subkey in subkeys if subkey in nested_dict)
+
+
+def load_from_raw(raw_dir, out_dir, fps, video, debug):
+    pkl_path = raw_dir / "buffer.pkl"
+
+    with open(pkl_path, "rb") as f:
+        pkl_data = pickle.load(f)
+
+    ep_dicts = []
+    episode_data_index = {"from": [], "to": []}
+
+    id_from = 0
+    id_to = 0
+    ep_idx = 0
+    total_frames = pkl_data["actions"].shape[0]
+    for i in tqdm.tqdm(range(total_frames)):
+        id_to += 1
+
+        if not pkl_data["dones"][i]:
+            continue
+
+        num_frames = id_to - id_from
+
+        image = torch.tensor(pkl_data["observations"]["rgb"][id_from:id_to])
+        image = einops.rearrange(image, "b c h w -> b h w c")
+        state = torch.tensor(pkl_data["observations"]["state"][id_from:id_to])
+        action = torch.tensor(pkl_data["actions"][id_from:id_to])
+        # TODO(rcadene): we have a missing last frame which is the observation when the env is done
+        # it is critical to have this frame for tdmpc to predict a "done observation/state"
+        # next_image = torch.tensor(pkl_data["next_observations"]["rgb"][id_from:id_to])
+        # next_state = torch.tensor(pkl_data["next_observations"]["state"][id_from:id_to])
+        next_reward = torch.tensor(pkl_data["rewards"][id_from:id_to])
+        next_done = torch.tensor(pkl_data["dones"][id_from:id_to])
+
+        ep_dict = {}
+
+        imgs_array = [x.numpy() for x in image]
+        if video:
+            # save png images in temporary directory
+            tmp_imgs_dir = out_dir / "tmp_images"
+            save_images_concurrently(imgs_array, tmp_imgs_dir)
+
+            # encode images to a mp4 video
+            video_path = out_dir / "videos" / f"observation.image_episode_{ep_idx:06d}.mp4"
+            encode_video_frames(tmp_imgs_dir, video_path, fps)  # noqa: F821
+
+            # clean temporary images directory
+            shutil.rmtree(tmp_imgs_dir)
+
+            # store the episode index
+            ep_dict["observation.image"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int)
+        else:
+            ep_dict["observation.image"] = [PILImage.fromarray(x) for x in imgs_array]
+
+        ep_dict["observation.state"] = state
+        ep_dict["action"] = action
+        ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int)
+        ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+        ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+        # ep_dict["next.observation.image"] = next_image
+        # ep_dict["next.observation.state"] = next_state
+        ep_dict["next.reward"] = next_reward
+        ep_dict["next.done"] = next_done
+        ep_dicts.append(ep_dict)
+
+        episode_data_index["from"].append(id_from)
+        episode_data_index["to"].append(id_from + num_frames)
+
+        id_from = id_to
+        ep_idx += 1
+
+        # process first episode only
+        if debug:
+            break
+
+    data_dict = concatenate_episodes(ep_dicts)
+    return data_dict, episode_data_index
+
+
+def to_hf_dataset(data_dict, video):
+    features = {}
+
+    if video:
+        features["observation.image"] = Value(dtype="int64", id="video")
+    else:
+        features["observation.image"] = Image()
+
+    features["observation.state"] = Sequence(
+        length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["action"] = Sequence(
+        length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
+    )
+    features["episode_index"] = Value(dtype="int64", id=None)
+    features["frame_index"] = Value(dtype="int64", id=None)
+    features["timestamp"] = Value(dtype="float32", id=None)
+    features["next.reward"] = Value(dtype="float32", id=None)
+    features["next.done"] = Value(dtype="bool", id=None)
+    features["index"] = Value(dtype="int64", id=None)
+    # TODO(rcadene): add success
+    # features["next.success"] = Value(dtype='bool', id=None)
+
+    hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
+    hf_dataset.set_transform(hf_transform_to_torch)
+    return hf_dataset
+
+
+def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
+    # sanity check
+    check_format(raw_dir)
+
+    if fps is None:
+        fps = 15
+
+    data_dict, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
+    hf_dataset = to_hf_dataset(data_dict, video)
+
+    info = {
+        "fps": fps,
+        "video": video,
+    }
+    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/xarm_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/xarm_processor.py
@@ -1,145 +0,0 @@
-import pickle
-from pathlib import Path
-
-import einops
-import torch
-import tqdm
-from datasets import Dataset, Features, Image, Sequence, Value
-from PIL import Image as PILImage
-
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-
-
-class XarmProcessor:
-    """Process pickle files formatted like in: https://github.com/fyhMer/fowm"""
-
-    def __init__(self, folder_path: str, fps: int | None = None):
-        self.folder_path = Path(folder_path)
-        self.keys = {"actions", "rewards", "dones"}
-        self.nested_keys = {"observations": {"rgb", "state"}, "next_observations": {"rgb", "state"}}
-        if fps is None:
-            fps = 15
-        self._fps = fps
-
-    @property
-    def fps(self) -> int:
-        return self._fps
-
-    def is_valid(self) -> bool:
-        # get all .pkl files
-        xarm_files = list(self.folder_path.glob("*.pkl"))
-        if len(xarm_files) != 1:
-            return False
-
-        try:
-            with open(xarm_files[0], "rb") as f:
-                dataset_dict = pickle.load(f)
-        except Exception:
-            return False
-
-        if not isinstance(dataset_dict, dict):
-            return False
-
-        if not all(k in dataset_dict for k in self.keys):
-            return False
-
-        # Check for consistent lengths in nested keys
-        try:
-            expected_len = len(dataset_dict["actions"])
-            if any(len(dataset_dict[key]) != expected_len for key in self.keys if key in dataset_dict):
-                return False
-
-            for key, subkeys in self.nested_keys.items():
-                nested_dict = dataset_dict.get(key, {})
-                if any(
-                    len(nested_dict[subkey]) != expected_len for subkey in subkeys if subkey in nested_dict
-                ):
-                    return False
-        except KeyError:  # If any expected key or subkey is missing
-            return False
-
-        return True  # All checks passed
-
-    def preprocess(self):
-        if not self.is_valid():
-            raise ValueError("The Xarm file is invalid or does not contain the required datasets.")
-
-        xarm_files = list(self.folder_path.glob("*.pkl"))
-
-        with open(xarm_files[0], "rb") as f:
-            dataset_dict = pickle.load(f)
-        ep_dicts = []
-        episode_data_index = {"from": [], "to": []}
-
-        id_from = 0
-        id_to = 0
-        episode_id = 0
-        total_frames = dataset_dict["actions"].shape[0]
-        for i in tqdm.tqdm(range(total_frames)):
-            id_to += 1
-
-            if not dataset_dict["dones"][i]:
-                continue
-
-            num_frames = id_to - id_from
-
-            image = torch.tensor(dataset_dict["observations"]["rgb"][id_from:id_to])
-            image = einops.rearrange(image, "b c h w -> b h w c")
-            state = torch.tensor(dataset_dict["observations"]["state"][id_from:id_to])
-            action = torch.tensor(dataset_dict["actions"][id_from:id_to])
-            # TODO(rcadene): we have a missing last frame which is the observation when the env is done
-            # it is critical to have this frame for tdmpc to predict a "done observation/state"
-            # next_image = torch.tensor(dataset_dict["next_observations"]["rgb"][id_from:id_to])
-            # next_state = torch.tensor(dataset_dict["next_observations"]["state"][id_from:id_to])
-            next_reward = torch.tensor(dataset_dict["rewards"][id_from:id_to])
-            next_done = torch.tensor(dataset_dict["dones"][id_from:id_to])
-
-            ep_dict = {
-                "observation.image": [PILImage.fromarray(x.numpy()) for x in image],
-                "observation.state": state,
-                "action": action,
-                "episode_index": torch.tensor([episode_id] * num_frames, dtype=torch.int),
-                "frame_index": torch.arange(0, num_frames, 1),
-                "timestamp": torch.arange(0, num_frames, 1) / self.fps,
-                # "next.observation.image": next_image,
-                # "next.observation.state": next_state,
-                "next.reward": next_reward,
-                "next.done": next_done,
-            }
-            ep_dicts.append(ep_dict)
-
-            episode_data_index["from"].append(id_from)
-            episode_data_index["to"].append(id_from + num_frames)
-
-            id_from = id_to
-            episode_id += 1
-
-        data_dict = concatenate_episodes(ep_dicts)
-        return data_dict, episode_data_index
-
-    def to_hf_dataset(self, data_dict):
-        features = {
-            "observation.image": Image(),
-            "observation.state": Sequence(
-                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
-            ),
-            "action": Sequence(length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)),
-            "episode_index": Value(dtype="int64", id=None),
-            "frame_index": Value(dtype="int64", id=None),
-            "timestamp": Value(dtype="float32", id=None),
-            "next.reward": Value(dtype="float32", id=None),
-            "next.done": Value(dtype="bool", id=None),
-            #'next.success': Value(dtype='bool', id=None),
-            "index": Value(dtype="int64", id=None),
-        }
-        features = Features(features)
-        hf_dataset = Dataset.from_dict(data_dict, features=features)
-        hf_dataset.set_transform(hf_transform_to_torch)
-
-        return hf_dataset
-
-    def cleanup(self):
-        pass