Refactor the download and publication of the datasets and convert it into CLI script (#95)

Co-authored-by: Remi <re.cadene@gmail.com>
2024-04-29 00:08:17 +02:00
parent 81e490d46f
commit 55dc9f7f51
15 changed files with 1410 additions and 827 deletions
--- a/lerobot/common/datasets/push_dataset_to_hub/_diffusion_policy_replay_buffer.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_diffusion_policy_replay_buffer.py
@@ -0,0 +1,619 @@
+"""Helper code for loading PushT dataset from Diffusion Policy (https://diffusion-policy.cs.columbia.edu/)
+
+Copied from the original Diffusion Policy repository and used in our `download_and_upload_dataset.py` script.
+"""
+
+from __future__ import annotations
+
+import math
+import numbers
+import os
+from functools import cached_property
+
+import numcodecs
+import numpy as np
+import zarr
+
+
+def check_chunks_compatible(chunks: tuple, shape: tuple):
+    """Assert that `chunks` is a usable chunk specification for an array of shape `shape`.
+
+    Args:
+        chunks: Chunk length along every axis.
+        shape: Shape of the array the chunks are meant for.
+
+    Raises:
+        AssertionError: If the two tuples differ in length, or if any chunk length is not a
+            strictly positive integer.
+    """
+    assert len(shape) == len(chunks)
+    # `and` short-circuits, so non-integral entries are never compared against 0.
+    assert all(isinstance(extent, numbers.Integral) and extent > 0 for extent in chunks)
+
+
+def rechunk_recompress_array(group, name, chunks=None, chunk_length=None, compressor=None, tmp_key="_temp"):
+    # Rewrite the array `group[name]` with a new chunk layout and/or compressor.
+    #
+    # group: container holding the array; it must support item access, `move` and `del`.
+    # name: key of the array inside `group`.
+    # chunks: full chunk tuple to apply. When None it is derived from `chunk_length`.
+    # chunk_length: new chunk length for the first axis only (the other axes keep their
+    #     current chunking). Ignored when `chunks` is given.
+    # compressor: new compressor. When None the current compressor is kept.
+    # tmp_key: temporary key under which the old array is parked while it is copied.
+    #
+    # Returns the array stored under `name` (the untouched original if nothing changed).
+    old_arr = group[name]
+    if chunks is None:
+        chunks = (chunk_length,) + old_arr.chunks[1:] if chunk_length is not None else old_arr.chunks
+    check_chunks_compatible(chunks, old_arr.shape)
+
+    if compressor is None:
+        compressor = old_arr.compressor
+
+    if (chunks == old_arr.chunks) and (compressor == old_arr.compressor):
+        # no change
+        return old_arr
+
+    # rechunk recompress
+    # The old array is first moved aside so that the copy can be written under the original name.
+    group.move(name, tmp_key)
+    old_arr = group[tmp_key]
+    n_copied, n_skipped, n_bytes_copied = zarr.copy(
+        source=old_arr,
+        dest=group,
+        name=name,
+        chunks=chunks,
+        compressor=compressor,
+    )
+    # The parked original is no longer needed once the copy exists.
+    del group[tmp_key]
+    arr = group[name]
+    return arr
+
+
+def get_optimal_chunks(shape, dtype, target_chunk_bytes=2e6, max_chunk_length=None):
+    """
+    Pick a chunk shape whose byte size is close to `target_chunk_bytes`.
+
+    Trailing axes are kept whole for as long as they fit in the byte budget, the first axis
+    that no longer fits is cut to the largest length that does, and every axis before it
+    gets a chunk length of 1.
+
+    Common shapes
+    T,D
+    T,N,D
+    T,H,W,C
+    T,N,H,W,C
+
+    Args:
+        shape: Shape of the array to chunk.
+        dtype: Anything accepted by `np.dtype`; only its item size is used.
+        target_chunk_bytes: Byte budget of a single chunk.
+        max_chunk_length: Optional cap used in place of the first axis length.
+
+    Returns:
+        A tuple with one chunk length per axis of `shape`.
+    """
+    ndim = len(shape)
+    bytes_per_item = np.dtype(dtype).itemsize
+
+    # Work on the reversed shape so that index 0 is the innermost (last) axis.
+    dims = list(reversed(shape))
+    if max_chunk_length is not None:
+        dims[-1] = int(max_chunk_length)
+
+    # Find the axis at which the accumulated byte size crosses the budget. When no axis
+    # crosses it, the outermost axis is the one that gets cut.
+    cut = ndim - 1
+    for pos in range(ndim - 1):
+        inner_bytes = bytes_per_item * np.prod(dims[:pos])
+        outer_bytes = bytes_per_item * np.prod(dims[: pos + 1])
+        if inner_bytes <= target_chunk_bytes < outer_bytes:
+            cut = pos
+
+    whole_axes_bytes = bytes_per_item * np.prod(dims[:cut])
+    cut_length = min(dims[cut], math.ceil(target_chunk_bytes / whole_axes_bytes))
+
+    layout = dims[:cut] + [cut_length]
+    # Remaining outer axes are chunked one element at a time.
+    layout += [1] * (ndim - len(layout))
+    return tuple(layout[::-1])
+
+
+class ReplayBuffer:
+    """
+    Zarr-based temporal datastructure.
+    Assumes first dimension to be time. Only chunk in time dimension.
+
+    The buffer wraps a `root` mapping with two entries: `root["data"]` maps each key to an
+    array whose first axis concatenates all episodes, and `root["meta"]["episode_ends"]`
+    holds the cumulative end index of every episode. `root` is either a `zarr.Group`
+    ("zarr" backend) or a plain dict of numpy arrays ("numpy" backend).
+    """
+
+    def __init__(self, root: zarr.Group | dict[str, dict]):
+        """
+        Dummy constructor. Use copy_from* and create_from* class methods instead.
+
+        Args:
+            root: Mapping with a "data" entry and a "meta" entry containing "episode_ends".
+                Every array under "data" must have a first-axis length equal to the last
+                value of "episode_ends".
+        """
+        assert "data" in root
+        assert "meta" in root
+        assert "episode_ends" in root["meta"]
+        for value in root["data"].values():
+            assert value.shape[0] == root["meta"]["episode_ends"][-1]
+        self.root = root
+
+    # ============= create constructors ===============
+    @classmethod
+    def create_empty_zarr(cls, storage=None, root=None):
+        """
+        Create a zarr-backed buffer, adding the "data"/"meta" groups and an empty
+        "episode_ends" array when they are missing.
+
+        Args:
+            storage: Zarr store used to create the root group when `root` is None.
+                Defaults to a new `zarr.MemoryStore`.
+            root: Existing zarr group to use as root. Takes precedence over `storage`.
+        """
+        if root is None:
+            if storage is None:
+                storage = zarr.MemoryStore()
+            root = zarr.group(store=storage)
+        root.require_group("data", overwrite=False)
+        meta = root.require_group("meta", overwrite=False)
+        if "episode_ends" not in meta:
+            meta.zeros("episode_ends", shape=(0,), dtype=np.int64, compressor=None, overwrite=False)
+        return cls(root=root)
+
+    @classmethod
+    def create_empty_numpy(cls):
+        """Create an empty buffer backed by plain dicts and numpy arrays."""
+        root = {"data": {}, "meta": {"episode_ends": np.zeros((0,), dtype=np.int64)}}
+        return cls(root=root)
+
+    @classmethod
+    def create_from_group(cls, group, **kwargs):
+        """
+        Wrap a zarr group, initializing it as an empty buffer when it has no "data" entry.
+
+        `kwargs` are forwarded to `create_empty_zarr` or to the constructor.
+        """
+        if "data" not in group:
+            # create from scratch
+            buffer = cls.create_empty_zarr(root=group, **kwargs)
+        else:
+            # already exists
+            buffer = cls(root=group, **kwargs)
+        return buffer
+
+    @classmethod
+    def create_from_path(cls, zarr_path, mode="r", **kwargs):
+        """
+        Open an on-disk zarr directly (for dataset larger than memory).
+        Slower.
+
+        Args:
+            zarr_path: Path of the zarr store; a leading "~" is expanded.
+            mode: Mode passed to `zarr.open`.
+        """
+        group = zarr.open(os.path.expanduser(zarr_path), mode)
+        return cls.create_from_group(group, **kwargs)
+
+    # ============= copy constructors ===============
+    @classmethod
+    def copy_from_store(
+        cls,
+        src_store,
+        store=None,
+        keys=None,
+        chunks: dict[str, tuple] | None = None,
+        compressors: dict | str | numcodecs.abc.Codec | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        """
+        Load to memory.
+
+        Args:
+            src_store: Zarr store to read from.
+            store: Destination zarr store. When None, everything is read into numpy arrays
+                and the resulting buffer uses the numpy backend.
+            keys: Keys of the "data" group to copy. Defaults to all of them.
+            chunks: Per-key chunk tuples used for the destination arrays (zarr destination only).
+            compressors: Per-key (dict) or global compressor for the destination arrays
+                (zarr destination only).
+            if_exists: Forwarded to `zarr.copy_store` / `zarr.copy`.
+            **kwargs: Accepted but not used.
+        """
+        src_root = zarr.group(src_store)
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        root = None
+        if store is None:
+            # numpy backend
+            meta = {}
+            for key, value in src_root["meta"].items():
+                if len(value.shape) == 0:
+                    # zero-dimensional entries cannot be sliced with [:]
+                    meta[key] = np.array(value)
+                else:
+                    meta[key] = value[:]
+
+            if keys is None:
+                keys = src_root["data"].keys()
+            data = {}
+            for key in keys:
+                arr = src_root["data"][key]
+                data[key] = arr[:]
+
+            root = {"meta": meta, "data": data}
+        else:
+            root = zarr.group(store=store)
+            # copy without recompression
+            zarr.copy_store(
+                source=src_store, dest=store, source_path="/meta", dest_path="/meta", if_exists=if_exists
+            )
+            data_group = root.create_group("data", overwrite=True)
+            if keys is None:
+                keys = src_root["data"].keys()
+            for key in keys:
+                value = src_root["data"][key]
+                cks = cls._resolve_array_chunks(chunks=chunks, key=key, array=value)
+                cpr = cls._resolve_array_compressor(compressors=compressors, key=key, array=value)
+                if cks == value.chunks and cpr == value.compressor:
+                    # copy without recompression
+                    this_path = "/data/" + key
+                    zarr.copy_store(
+                        source=src_store,
+                        dest=store,
+                        source_path=this_path,
+                        dest_path=this_path,
+                        if_exists=if_exists,
+                    )
+                else:
+                    # copy with recompression
+                    zarr.copy(
+                        source=value,
+                        dest=data_group,
+                        name=key,
+                        chunks=cks,
+                        compressor=cpr,
+                        if_exists=if_exists,
+                    )
+        buffer = cls(root=root)
+        return buffer
+
+    @classmethod
+    def copy_from_path(
+        cls,
+        zarr_path,
+        backend=None,
+        store=None,
+        keys=None,
+        chunks: dict[str, tuple] | None = None,
+        compressors: dict | str | numcodecs.abc.Codec | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        """
+        Copy an on-disk zarr to in-memory compressed.
+        Recommended
+
+        Opens `zarr_path` read-only and delegates to `copy_from_store`. The deprecated
+        `backend="numpy"` argument is equivalent to `store=None`.
+        """
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        if backend == "numpy":
+            print("backend argument is deprecated!")
+            store = None
+        group = zarr.open(os.path.expanduser(zarr_path), "r")
+        return cls.copy_from_store(
+            src_store=group.store,
+            store=store,
+            keys=keys,
+            chunks=chunks,
+            compressors=compressors,
+            if_exists=if_exists,
+            **kwargs,
+        )
+
+    # ============= save methods ===============
+    def save_to_store(
+        self,
+        store,
+        chunks: dict[str, tuple] | None = None,
+        compressors: str | numcodecs.abc.Codec | dict | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        """
+        Write the buffer into the zarr store `store` and return that store.
+
+        Args:
+            store: Destination zarr store.
+            chunks: Per-key chunk tuples for the written data arrays.
+            compressors: Per-key (dict) or global compressor for the written data arrays.
+            if_exists: Forwarded to `zarr.copy_store` / `zarr.copy`.
+            **kwargs: Accepted but not used.
+        """
+        root = zarr.group(store)
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        if self.backend == "zarr":
+            # recompression free copy
+            zarr.copy_store(
+                source=self.root.store,
+                dest=store,
+                source_path="/meta",
+                dest_path="/meta",
+                if_exists=if_exists,
+            )
+        else:
+            meta_group = root.create_group("meta", overwrite=True)
+            # save meta, no chunking
+            for key, value in self.root["meta"].items():
+                _ = meta_group.array(name=key, data=value, shape=value.shape, chunks=value.shape)
+
+        # save data, chunk
+        data_group = root.create_group("data", overwrite=True)
+        for key, value in self.root["data"].items():
+            cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
+            cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
+            if isinstance(value, zarr.Array):
+                if cks == value.chunks and cpr == value.compressor:
+                    # copy without recompression
+                    this_path = "/data/" + key
+                    zarr.copy_store(
+                        source=self.root.store,
+                        dest=store,
+                        source_path=this_path,
+                        dest_path=this_path,
+                        if_exists=if_exists,
+                    )
+                else:
+                    # copy with recompression
+                    zarr.copy(
+                        source=value,
+                        dest=data_group,
+                        name=key,
+                        chunks=cks,
+                        compressor=cpr,
+                        if_exists=if_exists,
+                    )
+            else:
+                # numpy
+                _ = data_group.array(name=key, data=value, chunks=cks, compressor=cpr)
+        return store
+
+    def save_to_path(
+        self,
+        zarr_path,
+        chunks: dict[str, tuple] | None = None,
+        compressors: str | numcodecs.abc.Codec | dict | None = None,
+        if_exists="replace",
+        **kwargs,
+    ):
+        """Write the buffer to a `zarr.DirectoryStore` at `zarr_path` (see `save_to_store`)."""
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        store = zarr.DirectoryStore(os.path.expanduser(zarr_path))
+        return self.save_to_store(
+            store, chunks=chunks, compressors=compressors, if_exists=if_exists, **kwargs
+        )
+
+    @staticmethod
+    def resolve_compressor(compressor="default"):
+        """
+        Turn the aliases "default" and "disk" into Blosc codecs.
+
+        Any other value (including None) is returned unchanged.
+        """
+        if compressor == "default":
+            compressor = numcodecs.Blosc(cname="lz4", clevel=5, shuffle=numcodecs.Blosc.NOSHUFFLE)
+        elif compressor == "disk":
+            compressor = numcodecs.Blosc("zstd", clevel=5, shuffle=numcodecs.Blosc.BITSHUFFLE)
+        return compressor
+
+    @classmethod
+    def _resolve_array_compressor(cls, compressors: dict | str | numcodecs.abc.Codec, key, array):
+        """
+        Choose the compressor for `array` stored under `key`.
+
+        Precedence: the entry for `key` when `compressors` is a dict, then the array's own
+        compressor when it is a zarr array, then the "default" compressor. A non-dict
+        `compressors` applies to every key.
+        """
+        # allows compressor to be explicitly set to None
+        cpr = "nil"
+        if isinstance(compressors, dict):
+            if key in compressors:
+                cpr = cls.resolve_compressor(compressors[key])
+            elif isinstance(array, zarr.Array):
+                cpr = array.compressor
+        else:
+            cpr = cls.resolve_compressor(compressors)
+        # backup default
+        if cpr == "nil":
+            cpr = cls.resolve_compressor("default")
+        return cpr
+
+    @classmethod
+    def _resolve_array_chunks(cls, chunks: dict | tuple, key, array):
+        """
+        Choose the chunk tuple for `array` stored under `key`.
+
+        Precedence: the entry for `key` when `chunks` is a dict, then the array's own chunks
+        when it is a zarr array, then `get_optimal_chunks`. A tuple applies to every key.
+
+        Raises:
+            TypeError: If `chunks` is neither a dict nor a tuple.
+        """
+        cks = None
+        if isinstance(chunks, dict):
+            if key in chunks:
+                cks = chunks[key]
+            elif isinstance(array, zarr.Array):
+                cks = array.chunks
+        elif isinstance(chunks, tuple):
+            cks = chunks
+        else:
+            raise TypeError(f"Unsupported chunks type {type(chunks)}")
+        # backup default
+        if cks is None:
+            cks = get_optimal_chunks(shape=array.shape, dtype=array.dtype)
+        # check
+        check_chunks_compatible(chunks=cks, shape=array.shape)
+        return cks
+
+    # ============= properties =================
+    @cached_property
+    def data(self):
+        """The "data" entry of the root (looked up once, then cached)."""
+        return self.root["data"]
+
+    @cached_property
+    def meta(self):
+        """The "meta" entry of the root (looked up once, then cached)."""
+        return self.root["meta"]
+
+    def update_meta(self, data):
+        """
+        Store every entry of `data` in the meta group and return the group.
+
+        Values that are not numpy arrays are converted with `np.array`.
+
+        Raises:
+            TypeError: If a value converts to an object-dtype array.
+        """
+        # sanitize data
+        np_data = {}
+        for key, value in data.items():
+            if isinstance(value, np.ndarray):
+                np_data[key] = value
+            else:
+                arr = np.array(value)
+                if arr.dtype == object:
+                    raise TypeError(f"Invalid value type {type(value)}")
+                np_data[key] = arr
+
+        meta_group = self.meta
+        if self.backend == "zarr":
+            for key, value in np_data.items():
+                _ = meta_group.array(
+                    name=key, data=value, shape=value.shape, chunks=value.shape, overwrite=True
+                )
+        else:
+            meta_group.update(np_data)
+
+        return meta_group
+
+    @property
+    def episode_ends(self):
+        """The "episode_ends" array of the meta group."""
+        return self.meta["episode_ends"]
+
+    def get_episode_idxs(self):
+        """
+        Return the episode index of every step.
+
+        Returns:
+            An int64 array with one entry per step; entry `t` is the index of the episode
+            that contains step `t`. The array is empty when the buffer has no episodes.
+        """
+        # Vectorized form of the former nested Python loops. The previous `numba.jit(...)`
+        # call was a bare statement that never wrapped the helper, so it compiled nothing.
+        ends = np.asarray(self.episode_ends[:], dtype=np.int64)
+        lengths = np.diff(np.insert(ends, 0, 0))
+        return np.repeat(np.arange(len(ends), dtype=np.int64), lengths)
+
+    @property
+    def backend(self):
+        """"zarr" when the root is a `zarr.Group`, otherwise "numpy"."""
+        backend = "numpy"
+        if isinstance(self.root, zarr.Group):
+            backend = "zarr"
+        return backend
+
+    # =========== dict-like API ==============
+    def __repr__(self) -> str:
+        """Render the zarr tree for the zarr backend; fall back to the default repr otherwise."""
+        if self.backend == "zarr":
+            return str(self.root.tree())
+        else:
+            return super().__repr__()
+
+    def keys(self):
+        """Keys of the data group."""
+        return self.data.keys()
+
+    def values(self):
+        """Arrays of the data group."""
+        return self.data.values()
+
+    def items(self):
+        """(key, array) pairs of the data group."""
+        return self.data.items()
+
+    def __getitem__(self, key):
+        """Return the data array stored under `key`."""
+        return self.data[key]
+
+    def __contains__(self, key):
+        """Tell whether the data group has an array under `key`."""
+        return key in self.data
+
+    # =========== our API ==============
+    @property
+    def n_steps(self):
+        """Total number of steps, i.e. the last episode end (0 for an empty buffer)."""
+        if len(self.episode_ends) == 0:
+            return 0
+        return self.episode_ends[-1]
+
+    @property
+    def n_episodes(self):
+        """Number of episodes stored in the buffer."""
+        return len(self.episode_ends)
+
+    @property
+    def chunk_size(self):
+        """First-axis chunk length of the first data array (zarr backend), else None."""
+        if self.backend == "zarr":
+            return next(iter(self.data.arrays()))[-1].chunks[0]
+        return None
+
+    @property
+    def episode_lengths(self):
+        """Length of every episode, computed as the differences of `episode_ends`."""
+        ends = self.episode_ends[:]
+        ends = np.insert(ends, 0, 0)
+        lengths = np.diff(ends)
+        return lengths
+
+    def add_episode(
+        self,
+        data: dict[str, np.ndarray],
+        chunks: dict[str, tuple] | None = None,
+        compressors: str | numcodecs.abc.Codec | dict | None = None,
+    ):
+        """
+        Append one episode at the end of the buffer.
+
+        Args:
+            data: Non-empty mapping from key to array. All arrays must have at least one
+                dimension and share the same first-axis length (the episode length).
+            chunks: Chunking used when a key is created for the first time (zarr backend).
+            compressors: Compressor used when a key is created for the first time (zarr backend).
+        """
+        if chunks is None:
+            chunks = {}
+        if compressors is None:
+            compressors = {}
+        assert len(data) > 0
+        is_zarr = self.backend == "zarr"
+
+        curr_len = self.n_steps
+        episode_length = None
+        for value in data.values():
+            assert len(value.shape) >= 1
+            if episode_length is None:
+                episode_length = len(value)
+            else:
+                assert episode_length == len(value)
+        new_len = curr_len + episode_length
+
+        for key, value in data.items():
+            new_shape = (new_len,) + value.shape[1:]
+            # create array
+            if key not in self.data:
+                if is_zarr:
+                    cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
+                    cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
+                    arr = self.data.zeros(
+                        name=key, shape=new_shape, chunks=cks, dtype=value.dtype, compressor=cpr
+                    )
+                else:
+                    # copy data to prevent modify
+                    arr = np.zeros(shape=new_shape, dtype=value.dtype)
+                    self.data[key] = arr
+            else:
+                arr = self.data[key]
+                assert value.shape[1:] == arr.shape[1:]
+                # same method for both zarr and numpy
+                if is_zarr:
+                    arr.resize(new_shape)
+                else:
+                    arr.resize(new_shape, refcheck=False)
+            # copy data
+            arr[-value.shape[0] :] = value
+
+        # append to episode ends
+        episode_ends = self.episode_ends
+        if is_zarr:
+            episode_ends.resize(episode_ends.shape[0] + 1)
+        else:
+            episode_ends.resize(episode_ends.shape[0] + 1, refcheck=False)
+        episode_ends[-1] = new_len
+
+        # rechunk
+        # Grow the chunk length by 1.5x so that `episode_ends` stays within a single chunk.
+        if is_zarr and episode_ends.chunks[0] < episode_ends.shape[0]:
+            rechunk_recompress_array(self.meta, "episode_ends", chunk_length=int(episode_ends.shape[0] * 1.5))
+
+    def drop_episode(self):
+        """Remove the last episode by shrinking every data array and `episode_ends` in place."""
+        is_zarr = self.backend == "zarr"
+        episode_ends = self.episode_ends[:].copy()
+        assert len(episode_ends) > 0
+        # The new length is the end of the second-to-last episode (0 when only one exists).
+        start_idx = 0
+        if len(episode_ends) > 1:
+            start_idx = episode_ends[-2]
+        for value in self.data.values():
+            new_shape = (start_idx,) + value.shape[1:]
+            if is_zarr:
+                value.resize(new_shape)
+            else:
+                value.resize(new_shape, refcheck=False)
+        if is_zarr:
+            self.episode_ends.resize(len(episode_ends) - 1)
+        else:
+            self.episode_ends.resize(len(episode_ends) - 1, refcheck=False)
+
+    def pop_episode(self):
+        """Remove the last episode and return a copy of its data."""
+        assert self.n_episodes > 0
+        episode = self.get_episode(self.n_episodes - 1, copy=True)
+        self.drop_episode()
+        return episode
+
+    def extend(self, data):
+        """Alias of `add_episode` with default chunks and compressors."""
+        self.add_episode(data)
+
+    def get_episode(self, idx, copy=False):
+        """
+        Return the data of episode `idx` as a dict from key to array slice.
+
+        Negative indices count from the end; an out-of-range index raises IndexError.
+        """
+        # Indexing a range list normalizes negative indices and validates the bounds.
+        idx = list(range(len(self.episode_ends)))[idx]
+        start_idx = 0
+        if idx > 0:
+            start_idx = self.episode_ends[idx - 1]
+        end_idx = self.episode_ends[idx]
+        result = self.get_steps_slice(start_idx, end_idx, copy=copy)
+        return result
+
+    def get_episode_slice(self, idx):
+        """Return the `slice` of step indices covered by episode `idx`."""
+        start_idx = 0
+        if idx > 0:
+            start_idx = self.episode_ends[idx - 1]
+        end_idx = self.episode_ends[idx]
+        return slice(start_idx, end_idx)
+
+    def get_steps_slice(self, start, stop, step=None, copy=False):
+        """
+        Slice every data array with `slice(start, stop, step)`.
+
+        Args:
+            copy: When True, slices of numpy arrays are copied instead of returned as views.
+        """
+        _slice = slice(start, stop, step)
+
+        result = {}
+        for key, value in self.data.items():
+            x = value[_slice]
+            if copy and isinstance(value, np.ndarray):
+                x = x.copy()
+            result[key] = x
+        return result
+
+    # =========== chunking =============
+    def get_chunks(self) -> dict:
+        """Return the chunk tuple of every data array (zarr backend only)."""
+        assert self.backend == "zarr"
+        chunks = {}
+        for key, value in self.data.items():
+            chunks[key] = value.chunks
+        return chunks
+
+    def set_chunks(self, chunks: dict):
+        """Rechunk the data arrays named in `chunks`; unknown keys are ignored (zarr backend only)."""
+        assert self.backend == "zarr"
+        for key, value in chunks.items():
+            if key in self.data:
+                arr = self.data[key]
+                if value != arr.chunks:
+                    check_chunks_compatible(chunks=value, shape=arr.shape)
+                    rechunk_recompress_array(self.data, key, chunks=value)
+
+    def get_compressors(self) -> dict:
+        """Return the compressor of every data array (zarr backend only)."""
+        assert self.backend == "zarr"
+        compressors = {}
+        for key, value in self.data.items():
+            compressors[key] = value.compressor
+        return compressors
+
+    def set_compressors(self, compressors: dict):
+        """Recompress the data arrays named in `compressors`; unknown keys are ignored (zarr backend only)."""
+        assert self.backend == "zarr"
+        for key, value in compressors.items():
+            if key in self.data:
+                arr = self.data[key]
+                compressor = self.resolve_compressor(value)
+                if compressor != arr.compressor:
+                    rechunk_recompress_array(self.data, key, compressor=compressor)
--- a/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
@@ -0,0 +1,179 @@
+"""
+This file contains all obsolete download scripts. They are centralized here to not have to load
+useless dependencies when using datasets.
+"""
+
+import io
+from pathlib import Path
+
+import tqdm
+
+
+def download_raw(root, dataset_id) -> Path:
+    """Download the raw files of `dataset_id` under `root` and return their location.
+
+    The dataset family is selected by substring match on `dataset_id`, checked in the
+    order "pusht", "xarm", "aloha", "umi".
+
+    Raises:
+        ValueError: If `dataset_id` matches none of the known families.
+    """
+    known_families = ("pusht", "xarm", "aloha", "umi")
+    family = next((name for name in known_families if name in dataset_id), None)
+
+    if family is None:
+        raise ValueError(dataset_id)
+    if family == "pusht":
+        return download_pusht(root=root, dataset_id=dataset_id)
+    if family == "xarm":
+        return download_xarm(root=root, dataset_id=dataset_id)
+    if family == "aloha":
+        return download_aloha(root=root, dataset_id=dataset_id)
+    return download_umi(root=root, dataset_id=dataset_id)
+
+
+def download_and_extract_zip(url: str, destination_folder: Path) -> bool:
+    """Download the zip archive at `url` and extract it into `destination_folder`.
+
+    Args:
+        url: Address of the zip archive.
+        destination_folder: Directory the archive is extracted into.
+
+    Returns:
+        True when the archive was downloaded and extracted, False when the server did not
+        answer with HTTP status 200.
+    """
+    import tempfile
+    import zipfile
+
+    import requests
+
+    print(f"downloading from {url}")
+    # The context manager releases the connection even when the download is interrupted.
+    with requests.get(url, stream=True) as response:
+        if response.status_code != 200:
+            return False
+
+        total_size = int(response.headers.get("content-length", 0))
+
+        # Spool the archive to an anonymous temporary file instead of holding it in memory.
+        with tempfile.TemporaryFile() as zip_file:
+            with tqdm.tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
+                for chunk in response.iter_content(chunk_size=1024 * 1024):
+                    if chunk:
+                        zip_file.write(chunk)
+                        progress_bar.update(len(chunk))
+
+            zip_file.seek(0)
+
+            with zipfile.ZipFile(zip_file, "r") as zip_ref:
+                zip_ref.extractall(destination_folder)
+    return True
+
+
+def download_pusht(root: str, dataset_id: str = "pusht", fps: int = 10) -> Path:
+    """Download the PushT zarr dataset unless it is already present, and return its path.
+
+    Args:
+        root: Directory under which `<dataset_id>_raw` is created.
+        dataset_id: Name used to build the raw directory name.
+        fps: Not used by this function.
+
+    Returns:
+        Resolved path of the zarr directory inside the raw directory.
+
+    Raises:
+        RuntimeError: If the archive could not be downloaded.
+    """
+    pusht_url = "https://diffusion-policy.cs.columbia.edu/data/training/pusht.zip"
+    pusht_zarr = Path("pusht/pusht_cchi_v7_replay.zarr")
+
+    root = Path(root)
+    raw_dir: Path = root / f"{dataset_id}_raw"
+    zarr_path: Path = (raw_dir / pusht_zarr).resolve()
+    if not zarr_path.is_dir():
+        raw_dir.mkdir(parents=True, exist_ok=True)
+        # Fail loudly instead of returning a path that was never created.
+        if not download_and_extract_zip(pusht_url, raw_dir):
+            raise RuntimeError(f"Failed to download {pusht_url}")
+    return zarr_path
+
+
+def download_xarm(root: str, dataset_id: str, fps: int = 15) -> Path:
+    """Download the xarm datasets unless already done, and return the path of `dataset_id`.
+
+    A single archive holds all xarm datasets. It is fetched when `root/xarm_datasets_raw`
+    does not exist yet, and every `data/xarm*.pkl` member is placed under `root` with the
+    leading `data/` component removed, so that it ends up below `root/<dataset name>`.
+
+    Args:
+        root: Directory that receives the datasets.
+        dataset_id: Name of the dataset whose path is returned.
+        fps: Not used by this function.
+
+    Returns:
+        `root / dataset_id`.
+    """
+    root = Path(root)
+    raw_dir: Path = root / "xarm_datasets_raw"
+    if not raw_dir.exists():
+        import shutil
+        import zipfile
+
+        import gdown
+
+        raw_dir.mkdir(parents=True, exist_ok=True)
+        # from https://github.com/fyhMer/fowm/blob/main/scripts/download_datasets.py
+        url = "https://drive.google.com/uc?id=1nhxpykGtPDhmQKm-_B8zBSywVRdgeVya"
+        zip_path = raw_dir / "data.zip"
+        gdown.download(url, str(zip_path), quiet=False)
+        print("Extracting...")
+        extraction_root = (raw_dir / "data").resolve()
+        with zipfile.ZipFile(str(zip_path), "r") as zip_f:
+            for member in zip_f.namelist():
+                if member.startswith("data/xarm") and member.endswith(".pkl"):
+                    print(member)
+                    # Extract inside raw_dir (not the current working directory), then move
+                    # the file to the location that is returned below.
+                    extracted = Path(zip_f.extract(member=member, path=raw_dir)).resolve()
+                    target = root / extracted.relative_to(extraction_root)
+                    target.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.move(str(extracted), str(target))
+        zip_path.unlink()
+
+    dataset_path: Path = root / f"{dataset_id}"
+    return dataset_path
+
+
+def download_aloha(root: str, dataset_id: str) -> Path:
+    """Download one ALOHA simulation dataset unless already present, and return its raw directory.
+
+    Args:
+        root: Directory under which `<dataset_id>_raw` is created.
+        dataset_id: One of the keys of the URL tables below.
+
+    Returns:
+        `root / "<dataset_id>_raw"`.
+    """
+    raw_dir: Path = Path(root) / f"{dataset_id}_raw"
+    if raw_dir.is_dir():
+        # Already downloaded: nothing to do.
+        return raw_dir
+
+    import gdown
+
+    folder_urls = {
+        "aloha_sim_insertion_human": "https://drive.google.com/drive/folders/1RgyD0JgTX30H4IM5XZn8I3zSV_mr8pyF",
+        "aloha_sim_insertion_scripted": "https://drive.google.com/drive/folders/1TsojQQSXtHEoGnqgJ3gmpPQR2DPLtS2N",
+        "aloha_sim_transfer_cube_human": "https://drive.google.com/drive/folders/1sc-E4QYW7A0o23m1u2VWNGVq5smAsfCo",
+        "aloha_sim_transfer_cube_scripted": "https://drive.google.com/drive/folders/1aRyoOhQwxhyt1J8XgEig4s6kzaw__LXj",
+    }
+
+    ep48_urls = {
+        "aloha_sim_insertion_human": "https://drive.google.com/file/d/18Cudl6nikDtgRolea7je8iF_gGKzynOP/view?usp=drive_link",
+        "aloha_sim_insertion_scripted": "https://drive.google.com/file/d/1wfMSZ24oOh5KR_0aaP3Cnu_c4ZCveduB/view?usp=drive_link",
+        "aloha_sim_transfer_cube_human": "https://drive.google.com/file/d/18smMymtr8tIxaNUQ61gW6dG50pt3MvGq/view?usp=drive_link",
+        "aloha_sim_transfer_cube_scripted": "https://drive.google.com/file/d/1pnGIOd-E4-rhz2P3VxpknMKRZCoKt6eI/view?usp=drive_link",
+    }
+
+    ep49_urls = {
+        "aloha_sim_insertion_human": "https://drive.google.com/file/d/1C1kZYyROzs-PrLc0SkDgUgMi4-L3lauE/view?usp=drive_link",
+        "aloha_sim_insertion_scripted": "https://drive.google.com/file/d/17EuCUWS6uCCr6yyNzpXdcdE-_TTNCKtf/view?usp=drive_link",
+        "aloha_sim_transfer_cube_human": "https://drive.google.com/file/d/1Nk7l53d9sJoGDBKAOnNrExX5nLacATc6/view?usp=drive_link",
+        "aloha_sim_transfer_cube_scripted": "https://drive.google.com/file/d/1GKReZHrXU73NMiC5zKCq_UtqPVtYq8eo/view?usp=drive_link",
+    }
+    num_episodes = {  # noqa: F841 # we keep this for reference
+        "aloha_sim_insertion_human": 50,
+        "aloha_sim_insertion_scripted": 50,
+        "aloha_sim_transfer_cube_human": 50,
+        "aloha_sim_transfer_cube_scripted": 50,
+    }
+
+    episode_len = {  # noqa: F841 # we keep this for reference
+        "aloha_sim_insertion_human": 500,
+        "aloha_sim_insertion_scripted": 400,
+        "aloha_sim_transfer_cube_human": 400,
+        "aloha_sim_transfer_cube_scripted": 400,
+    }
+
+    cameras = {  # noqa: F841 # we keep this for reference
+        "aloha_sim_insertion_human": ["top"],
+        "aloha_sim_insertion_scripted": ["top"],
+        "aloha_sim_transfer_cube_human": ["top"],
+        "aloha_sim_transfer_cube_scripted": ["top"],
+    }
+
+    for url_table in (folder_urls, ep48_urls, ep49_urls):
+        assert dataset_id in url_table
+
+    raw_dir.mkdir(parents=True, exist_ok=True)
+
+    gdown.download_folder(folder_urls[dataset_id], output=str(raw_dir))
+
+    # because of the 50 files limit per directory, two files episode 48 and 49 were missing
+    for episode_file, url_table in (("episode_48.hdf5", ep48_urls), ("episode_49.hdf5", ep49_urls)):
+        gdown.download(url_table[dataset_id], output=str(raw_dir / episode_file), fuzzy=True)
+    return raw_dir
+
+
+def download_umi(root: str, dataset_id: str) -> Path:
+    """Download the UMI "cup in the wild" zarr dataset unless already present, and return its path.
+
+    Args:
+        root: Directory under which `<dataset_id>_raw` is created.
+        dataset_id: Name used to build the raw directory name.
+
+    Returns:
+        Resolved path of the zarr directory inside the raw directory.
+
+    Raises:
+        RuntimeError: If the archive could not be downloaded.
+    """
+    url_cup_in_the_wild = "https://real.stanford.edu/umi/data/zarr_datasets/cup_in_the_wild.zarr.zip"
+    cup_in_the_wild_zarr = Path("umi/cup_in_the_wild/cup_in_the_wild.zarr")
+
+    root = Path(root)
+    raw_dir: Path = root / f"{dataset_id}_raw"
+    zarr_path: Path = (raw_dir / cup_in_the_wild_zarr).resolve()
+    if not zarr_path.is_dir():
+        raw_dir.mkdir(parents=True, exist_ok=True)
+        # The archive is extracted directly into the zarr directory.
+        # Fail loudly instead of returning a path that was never created.
+        if not download_and_extract_zip(url_cup_in_the_wild, zarr_path):
+            raise RuntimeError(f"Failed to download {url_cup_in_the_wild}")
+    return zarr_path
+
+
+if __name__ == "__main__":
+    # When run as a script, download every dataset listed below into the "data" directory.
+    root = "data"
+    dataset_ids = [
+        "pusht",
+        "xarm_lift_medium",
+        "xarm_lift_medium_replay",
+        "xarm_push_medium",
+        "xarm_push_medium_replay",
+        "aloha_sim_insertion_human",
+        "aloha_sim_insertion_scripted",
+        "aloha_sim_transfer_cube_human",
+        "aloha_sim_transfer_cube_scripted",
+        "umi_cup_in_the_wild",
+    ]
+    # download_raw picks the downloader from the dataset name.
+    for dataset_id in dataset_ids:
+        download_raw(root=root, dataset_id=dataset_id)
--- a/lerobot/common/datasets/push_dataset_to_hub/_umi_imagecodecs_numcodecs.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_umi_imagecodecs_numcodecs.py
@@ -0,0 +1,311 @@
+# imagecodecs/numcodecs.py
+
+# Copyright (c) 2021-2022, Christoph Gohlke
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# Copied from: https://github.com/real-stanford/universal_manipulation_interface/blob/298776ce251f33b6b3185a98d6e7d1f9ad49168b/diffusion_policy/codecs/imagecodecs_numcodecs.py#L1
+"""Additional numcodecs implemented using imagecodecs."""
+
+__version__ = "2022.9.26"
+
+__all__ = ("register_codecs",)
+
+import imagecodecs
+import numpy
+from numcodecs.abc import Codec
+from numcodecs.registry import get_codec, register_codec
+
+# TODO (azouitine): Remove useless codecs
+
+
+def protective_squeeze(x: numpy.ndarray):
+    """
+    Collapse every axis in front of the last three into at most one leading axis.
+    Image dim expected to be *, H, W, C
+
+    When the leading axes hold more than one image in total they are merged into a
+    single axis (result is N, H, W, C); otherwise they are dropped (result is H, W, C).
+    Arrays with three or fewer axes keep their shape.
+    """
+    trailing_shape = x.shape[-3:]
+    leading_shape = x.shape[:-3]
+    if leading_shape and numpy.prod(leading_shape) > 1:
+        return x.reshape((-1,) + trailing_shape)
+    return x.reshape(trailing_shape)
+
+
+def get_default_image_compressor(**kwargs):
+    """Build the default image codec, letting ``kwargs`` override the preset options.
+
+    A ``JpegXl`` codec is returned when ``imagecodecs.JPEGXL`` is truthy, otherwise a
+    ``Jpeg2k`` codec. Keyword arguments are merged on top of the preset options of
+    whichever codec is chosen and forwarded to its constructor.
+    """
+    if not imagecodecs.JPEGXL:
+        # no JPEGXL: fall back to JPEG 2000
+        fallback_options = {"level": 50, **kwargs}
+        return Jpeg2k(**fallback_options)
+
+    # has JPEGXL
+    jpegxl_options = {
+        "effort": 3,
+        "distance": 0.3,
+        # bug in libjxl, invalid codestream for non-lossless
+        # when decoding speed > 1
+        "decodingspeed": 1,
+        **kwargs,
+    }
+    return JpegXl(**jpegxl_options)
+
+
+class Jpeg2k(Codec):
+    """JPEG 2000 codec for numcodecs.
+
+    Thin adapter: the options given at construction are stored on the instance and
+    forwarded to ``imagecodecs.jpeg2k_encode`` / ``imagecodecs.jpeg2k_decode``.
+    """
+
+    codec_id = "imagecodecs_jpeg2k"
+
+    def __init__(
+        self,
+        level=None,
+        codecformat=None,
+        colorspace=None,
+        tile=None,
+        reversible=None,
+        bitspersample=None,
+        resolutions=None,
+        numthreads=None,
+        verbose=0,
+    ):
+        """Store the encode/decode options; none of them is validated here."""
+        self.level = level
+        self.codecformat = codecformat
+        self.colorspace = colorspace
+        # Any iterable given for `tile` is normalized to a tuple; None is kept as None.
+        self.tile = None if tile is None else tuple(tile)
+        self.reversible = reversible
+        self.bitspersample = bitspersample
+        self.resolutions = resolutions
+        self.numthreads = numthreads
+        self.verbose = verbose
+
+    def encode(self, buf):
+        """Encode ``buf`` with ``imagecodecs.jpeg2k_encode`` using the stored options.
+
+        ``buf`` is first converted to a numpy array and passed through
+        ``protective_squeeze`` to collapse its leading axes.
+        """
+        buf = protective_squeeze(numpy.asarray(buf))
+        return imagecodecs.jpeg2k_encode(
+            buf,
+            level=self.level,
+            codecformat=self.codecformat,
+            colorspace=self.colorspace,
+            tile=self.tile,
+            reversible=self.reversible,
+            bitspersample=self.bitspersample,
+            resolutions=self.resolutions,
+            numthreads=self.numthreads,
+            verbose=self.verbose,
+        )
+
+    def decode(self, buf, out=None):
+        """Decode ``buf`` with ``imagecodecs.jpeg2k_decode``, forwarding ``out`` unchanged."""
+        return imagecodecs.jpeg2k_decode(buf, verbose=self.verbose, numthreads=self.numthreads, out=out)
+
+
+class JpegXl(Codec):
+    """JPEG XL codec for numcodecs.
+
+    Thin adapter: the options given at construction are stored on the instance and
+    forwarded to ``imagecodecs.jpegxl_encode`` / ``imagecodecs.jpegxl_decode``.
+    """
+
+    codec_id = "imagecodecs_jpegxl"
+
+    def __init__(
+        self,
+        # encode
+        level=None,
+        effort=None,
+        distance=None,
+        lossless=None,
+        decodingspeed=None,
+        photometric=None,
+        planar=None,
+        usecontainer=None,
+        # decode
+        index=None,
+        keeporientation=None,
+        # both
+        numthreads=None,
+    ):
+        """
+        Store the options used to encode a numpy array as a JPEG XL image and to decode it.
+        Float must be in nominal range 0..1.
+
+        Currently L, LA, RGB, RGBA images are supported in contig mode.
+        Extra channels are only supported for grayscale images in planar mode.
+
+        NOTE(review): every parameter of this constructor defaults to None. The
+        "Default to ..." values below presumably describe what the underlying encoder
+        does when an option is left unset - confirm against the imagecodecs/libjxl docs.
+
+        Parameters
+        ----------
+        level : Default to None, i.e. not overwriting lossless and decodingspeed options.
+            When < 0: Use lossless compression
+            When in [0,1,2,3,4]: Sets the decoding speed tier for the provided options.
+                Minimum is 0 (slowest to decode, best quality/density), and maximum
+                is 4 (fastest to decode, at the cost of some quality/density).
+        effort : Default to 3.
+            Sets encoder effort/speed level without affecting decoding speed.
+            Valid values are, from faster to slower speed: 1:lightning 2:thunder
+                3:falcon 4:cheetah 5:hare 6:wombat 7:squirrel 8:kitten 9:tortoise.
+            Speed: lightning, thunder, falcon, cheetah, hare, wombat, squirrel, kitten, tortoise
+            control the encoder effort in ascending order.
+            This also affects memory usage: using lower effort will typically reduce memory
+            consumption during encoding.
+            lightning and thunder are fast modes useful for lossless mode (modular).
+            falcon disables all of the following tools.
+            cheetah enables coefficient reordering, context clustering, and heuristics for selecting DCT sizes and quantization steps.
+            hare enables Gaborish filtering, chroma from luma, and an initial estimate of quantization steps.
+            wombat enables error diffusion quantization and full DCT size selection heuristics.
+            squirrel (default) enables dots, patches, and spline detection, and full context clustering.
+            kitten optimizes the adaptive quantization for a psychovisual metric.
+            tortoise enables a more thorough adaptive quantization search.
+        distance : Default to 1.0
+            Sets the distance level for lossy compression: target max butteraugli distance,
+            lower = higher quality. Range: 0 .. 15. 0.0 = mathematically lossless
+            (however, use JxlEncoderSetFrameLossless instead to use true lossless,
+            as setting distance to 0 alone is not the only requirement).
+            1.0 = visually lossless. Recommended range: 0.5 .. 3.0.
+        lossless : Default to False.
+            Use lossless encoding. The value is stored as ``bool(lossless)``.
+        decodingspeed : Default to 0.
+            Duplicate to level. [0,4]
+        photometric : Return JxlColorSpace value.
+            Default logic is quite complicated but works most of the time.
+            Accepted value:
+                int: [-1,3]
+                str: ['RGB',
+                    'WHITEISZERO', 'MINISWHITE',
+                    'BLACKISZERO', 'MINISBLACK', 'GRAY',
+                    'XYB', 'KNOWN']
+        planar : Enable multi-channel mode.
+            Default to false.
+        usecontainer :
+            Forces the encoder to use the box-based container format (BMFF)
+            even when not necessary.
+            When using JxlEncoderUseBoxes, JxlEncoderStoreJPEGMetadata or
+            JxlEncoderSetCodestreamLevel with level 10, the encoder will
+            automatically also use the container format, it is not necessary
+            to use JxlEncoderUseContainer for those use cases.
+            By default this setting is disabled.
+        index : Selectively decode frames for animation.
+            Default to 0, decode all frames.
+            When set to > 0, decode that frame index only.
+        keeporientation :
+            Enables or disables preserving of as-in-bitstream pixeldata orientation.
+            Some images are encoded with an Orientation tag indicating that the
+            decoder must perform a rotation and/or mirroring to the encoded image data.
+
+            If skip_reorientation is JXL_FALSE (the default): the decoder will apply
+            the transformation from the orientation setting, hence rendering the image
+            according to its specified intent. When producing a JxlBasicInfo, the decoder
+            will always set the orientation field to JXL_ORIENT_IDENTITY (matching the
+            returned pixel data) and also align xsize and ysize so that they correspond
+            to the width and the height of the returned pixel data.
+
+            If skip_reorientation is JXL_TRUE: the decoder will skip applying the
+            transformation from the orientation setting, returning the image in
+            the as-in-bitstream pixeldata orientation. This may be faster to decode
+            since the decoder doesn't have to apply the transformation, but can
+            cause wrong display of the image if the orientation tag is not correctly
+            taken into account by the user.
+
+            By default, this option is disabled, and the returned pixel data is
+            re-oriented according to the image's Orientation setting.
+        numthreads : Default to 1.
+            If <= 0, use all cores.
+            If > 32, clipped to 32.
+        """
+
+        self.level = level
+        self.effort = effort
+        self.distance = distance
+        # Coerced to a real bool, so an unset (None) value is stored as False.
+        self.lossless = bool(lossless)
+        self.decodingspeed = decodingspeed
+        self.photometric = photometric
+        self.planar = planar
+        self.usecontainer = usecontainer
+        self.index = index
+        self.keeporientation = keeporientation
+        self.numthreads = numthreads
+
+    def encode(self, buf):
+        """Encode ``buf`` with ``imagecodecs.jpegxl_encode`` using the stored encode options.
+
+        ``buf`` is first converted to a numpy array and passed through
+        ``protective_squeeze`` to collapse its leading axes.
+        """
+        # TODO: only squeeze all but last dim
+        buf = protective_squeeze(numpy.asarray(buf))
+        return imagecodecs.jpegxl_encode(
+            buf,
+            level=self.level,
+            effort=self.effort,
+            distance=self.distance,
+            lossless=self.lossless,
+            decodingspeed=self.decodingspeed,
+            photometric=self.photometric,
+            planar=self.planar,
+            usecontainer=self.usecontainer,
+            numthreads=self.numthreads,
+        )
+
+    def decode(self, buf, out=None):
+        """Decode ``buf`` with ``imagecodecs.jpegxl_decode``, forwarding ``out`` unchanged."""
+        return imagecodecs.jpegxl_decode(
+            buf,
+            index=self.index,
+            keeporientation=self.keeporientation,
+            numthreads=self.numthreads,
+            out=out,
+        )
+
+
+def _flat(out):
+    """Return a flat unsigned-byte view of a writable, contiguous buffer, else None.
+
+    ``None`` is returned when ``out`` is None, read-only, or not contiguous.
+    """
+    if out is not None:
+        buffer = memoryview(out)
+        if buffer.contiguous and not buffer.readonly:
+            return buffer.cast("B")
+    return None
+
+
+def register_codecs(codecs=None, force=False, verbose=True):
+    """Register codecs in this module with numcodecs.
+
+    Every module-level object exposing a ``codec_id`` attribute (except the name
+    ``Codec``) is considered. When ``codecs`` is given, only objects whose ``codec_id``
+    is in it are handled. A codec whose id is already known to numcodecs is skipped
+    unless ``force`` is true; ``verbose`` controls whether a warning is logged in
+    either case.
+    """
+    for name, candidate in globals().items():
+        if name == "Codec" or not hasattr(candidate, "codec_id"):
+            continue
+        if codecs is not None and candidate.codec_id not in codecs:
+            continue
+
+        already_registered = True
+        try:
+            get_codec({"id": candidate.codec_id})
+        except TypeError:
+            # registered, but failed
+            pass
+        except ValueError:
+            # not registered yet
+            already_registered = False
+
+        if already_registered:
+            if not force:
+                if verbose:
+                    log_warning(f"numcodec {candidate.codec_id!r} already registered")
+                continue
+            if verbose:
+                log_warning(f"replacing registered numcodec {candidate.codec_id!r}")
+        register_codec(candidate)
+
+
+def log_warning(msg, *args, **kwargs):
+    """Emit ``msg`` at WARNING level on this module's logger.
+
+    Extra positional and keyword arguments are passed straight to ``Logger.warning``.
+    """
+    import logging
+
+    module_logger = logging.getLogger(__name__)
+    module_logger.warning(msg, *args, **kwargs)
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_processor.py
@@ -0,0 +1,199 @@
+import re
+from pathlib import Path
+
+import h5py
+import torch
+import tqdm
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+
+class AlohaProcessor:
+    """
+    Process HDF5 files formatted like in: https://github.com/tonyzhaozh/act
+
+    Attributes:
+        folder_path (Path): Path to the directory containing HDF5 files.
+        cameras (list[str]): List of camera identifiers to check in the files.
+        fps (int): Frames per second used in timestamp calculations.
+
+    Methods:
+        is_valid() -> bool:
+            Validates if each HDF5 file within the folder contains all required datasets.
+        preprocess() -> tuple[dict, dict]:
+            Processes the files and returns the concatenated episode data together with
+            the per-episode frame index ranges.
+        to_hf_dataset(data_dict: dict) -> Dataset:
+            Converts processed data into a Hugging Face Dataset object.
+    """
+
+    def __init__(self, folder_path: Path, cameras: list[str] | None = None, fps: int | None = None):
+        """
+        Initializes the AlohaProcessor with a specified directory path containing HDF5 files,
+        an optional list of cameras, and a frame rate.
+
+        Args:
+            folder_path (Path): The directory path where HDF5 files are stored.
+            cameras (list[str] | None): Optional list of cameras to validate within the files. Defaults to ['top'] if None.
+            fps (int | None): Frame rate for the datasets, used in time calculations. Defaults to 50 if None.
+
+        Examples:
+            >>> processor = AlohaProcessor(Path("path_to_hdf5_directory"), ["camera1", "camera2"])
+            >>> processor.is_valid()
+            True
+        """
+        self.folder_path = folder_path
+        if cameras is None:
+            cameras = ["top"]
+        self.cameras = cameras
+        if fps is None:
+            fps = 50
+        self._fps = fps
+
+    @property
+    def fps(self) -> int:
+        """Frame rate used to derive the per-frame timestamps."""
+        return self._fps
+
+    def _sorted_episode_files(self) -> list[tuple[int, Path]]:
+        """
+        Lists the ``episode_*.hdf5`` files of the folder as ``(episode_number, path)`` pairs,
+        sorted by episode number.
+
+        Both `is_valid` and `preprocess` go through this helper so that they always look at
+        exactly the same set of files.
+
+        Raises:
+            ValueError: If a file name is not exactly of the form ``episode_<digits>.hdf5``.
+        """
+        episodes = []
+        for path in self.folder_path.glob("episode_*.hdf5"):
+            # fullmatch + escaped dot: names such as 'episode_12hdf5.hdf5' must not be
+            # silently read as episode 1.
+            match = re.fullmatch(r"episode_(\d+)\.hdf5", path.name)
+            if match is None:
+                raise ValueError(f"Unexpected episode file name: {path.name}")
+            episodes.append((int(match.group(1)), path))
+        episodes.sort(key=lambda item: item[0])
+        return episodes
+
+    def is_valid(self) -> bool:
+        """
+        Validates the HDF5 files in the specified folder to ensure they contain the required datasets
+        for actions, positions, and images for each specified camera.
+
+        Returns:
+            bool: True if the folder holds at least one episode file, the episode numbers are
+            consecutive, and every file is a readable HDF5 file with all required datasets;
+            False otherwise.
+        """
+        try:
+            episodes = self._sorted_episode_files()
+        except ValueError:
+            # All file names must carry a numerical identifier: 'episode_<number>.hdf5'
+            return False
+        if not episodes:
+            return False
+
+        # The sequence must be consecutive, e.g. episode_0, episode_1, episode_2, etc.
+        numbers = [number for number, _ in episodes]
+        if any(current - previous != 1 for previous, current in zip(numbers, numbers[1:])):
+            return False
+
+        # Expected datasets within each HDF5 file, including one image dataset per camera.
+        required_datasets = ["/action", "/observations/qpos"]
+        required_datasets.extend(f"/observations/images/{cam}" for cam in self.cameras)
+
+        for _, path in episodes:
+            try:
+                with h5py.File(path, "r") as ep:
+                    if not all(dataset in ep for dataset in required_datasets):
+                        return False
+            except OSError:
+                # Raised when the file cannot be opened as HDF5.
+                return False
+        return True
+
+    def preprocess(self):
+        """
+        Collects the data of every episode file and concatenates it.
+
+        Returns:
+            tuple[dict, dict]: The result of `concatenate_episodes` over all per-episode
+            dictionaries, and an index dictionary whose "from" / "to" lists give the global
+            frame range (end exclusive) of each episode in processing order.
+
+        Raises:
+            ValueError: If the folder does not pass `is_valid`.
+        """
+        if not self.is_valid():
+            raise ValueError("The HDF5 file is invalid or does not contain the required datasets.")
+
+        ep_dicts = []
+        episode_data_index = {"from": [], "to": []}
+
+        id_from = 0
+
+        for ep_id, ep_path in tqdm.tqdm(self._sorted_episode_files()):
+            with h5py.File(ep_path, "r") as ep:
+                num_frames = ep["/action"].shape[0]
+
+                # last step of demonstration is considered done
+                done = torch.zeros(num_frames, dtype=torch.bool)
+                done[-1] = True
+
+                state = torch.from_numpy(ep["/observations/qpos"][:])
+                action = torch.from_numpy(ep["/action"][:])
+
+                ep_dict = {}
+
+                for cam in self.cameras:
+                    image = torch.from_numpy(ep[f"/observations/images/{cam}"][:])  # b h w c
+                    ep_dict[f"observation.images.{cam}"] = [PILImage.fromarray(x.numpy()) for x in image]
+
+                ep_dict.update(
+                    {
+                        "observation.state": state,
+                        "action": action,
+                        "episode_index": torch.tensor([ep_id] * num_frames),
+                        "frame_index": torch.arange(0, num_frames, 1),
+                        "timestamp": torch.arange(0, num_frames, 1) / self.fps,
+                        # TODO(rcadene): compute reward and success
+                        # "next.reward": reward,
+                        "next.done": done,
+                        # "next.success": success,
+                    }
+                )
+
+                ep_dicts.append(ep_dict)
+
+                episode_data_index["from"].append(id_from)
+                episode_data_index["to"].append(id_from + num_frames)
+
+            id_from += num_frames
+
+        data_dict = concatenate_episodes(ep_dicts)
+        return data_dict, episode_data_index
+
+    def to_hf_dataset(self, data_dict) -> Dataset:
+        """
+        Converts a dictionary of data into a Hugging Face Dataset object.
+
+        Args:
+            data_dict (dict): A dictionary containing the data to be converted.
+
+        Returns:
+            Dataset: The converted Hugging Face Dataset object, with `hf_transform_to_torch`
+            set as its transform.
+        """
+        image_features = {f"observation.images.{cam}": Image() for cam in self.cameras}
+        features = {
+            "observation.state": Sequence(
+                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+            ),
+            "action": Sequence(length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)),
+            "episode_index": Value(dtype="int64", id=None),
+            "frame_index": Value(dtype="int64", id=None),
+            "timestamp": Value(dtype="float32", id=None),
+            # "next.reward": Value(dtype="float32", id=None),
+            "next.done": Value(dtype="bool", id=None),
+            # "next.success": Value(dtype="bool", id=None),
+            "index": Value(dtype="int64", id=None),
+        }
+        update_features = {**image_features, **features}
+        features = Features(update_features)
+        hf_dataset = Dataset.from_dict(data_dict, features=features)
+        hf_dataset.set_transform(hf_transform_to_torch)
+
+        return hf_dataset
+
+    def cleanup(self):
+        """No-op: this processor creates nothing that needs to be cleaned up."""
+        pass
--- a/lerobot/common/datasets/push_dataset_to_hub/pusht_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/pusht_processor.py
@@ -0,0 +1,180 @@
+from pathlib import Path
+
+import numpy as np
+import torch
+import tqdm
+import zarr
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+
+class PushTProcessor:
+    """Process zarr files formatted like in: https://github.com/real-stanford/diffusion_policy
+
+    Attributes:
+        zarr_path (Path): Path of the zarr store to read.
+        fps (int): Frames per second, used to compute the timestamp of each frame.
+    """
+
+    def __init__(self, folder_path: Path, fps: int | None = None):
+        """
+        Args:
+            folder_path (Path): Path of the zarr store to process.
+            fps (int | None): Frame rate used for timestamps. Defaults to 10 if None.
+        """
+        self.zarr_path = folder_path
+        if fps is None:
+            fps = 10
+        self._fps = fps
+
+    @property
+    def fps(self) -> int:
+        """Frame rate used to derive the per-frame timestamps."""
+        return self._fps
+
+    def is_valid(self):
+        """
+        Checks that the zarr store can be opened, contains every required dataset, and that
+        all the "data/*" datasets have the same number of frames as "data/img".
+
+        Returns:
+            bool: True if all the checks pass, False otherwise.
+        """
+        try:
+            zarr_data = zarr.open(self.zarr_path, mode="r")
+        except Exception:
+            # TODO (azouitine): Handle the exception properly
+            return False
+        required_datasets = {
+            "data/action",
+            "data/img",
+            "data/keypoint",
+            "data/n_contacts",
+            "data/state",
+            "meta/episode_ends",
+        }
+        for dataset in required_datasets:
+            if dataset not in zarr_data:
+                return False
+        nb_frames = zarr_data["data/img"].shape[0]
+
+        # "meta/episode_ends" is excluded from the frame-count comparison below.
+        required_datasets.remove("meta/episode_ends")
+
+        return all(nb_frames == zarr_data[dataset].shape[0] for dataset in required_datasets)
+
+    def preprocess(self):
+        """
+        Loads the replay buffer and builds one dictionary of frame data per episode.
+
+        For every frame the block is re-created from the recorded state and its overlap with
+        the fixed goal pose is measured: reward is the coverage divided by the success
+        threshold, clipped to [0, 1], and success is set when the coverage exceeds the
+        threshold. The "next.*" entries are the per-frame values shifted by one step, with
+        the last value repeated.
+
+        Returns:
+            tuple[dict, dict]: The result of `concatenate_episodes` over all per-episode
+            dictionaries, and an index dictionary whose "from" / "to" lists give the global
+            frame range (end exclusive) of each episode.
+
+        Raises:
+            ModuleNotFoundError: If `pymunk`, `gym_pusht` or the replay buffer module cannot be imported.
+            AssertionError: If the loaded data is inconsistent (frame counts, episode ids, image range).
+        """
+        try:
+            import pymunk
+            from gym_pusht.envs.pusht import PushTEnv, pymunk_to_shapely
+
+            from lerobot.common.datasets.push_dataset_to_hub._diffusion_policy_replay_buffer import (
+                ReplayBuffer as DiffusionPolicyReplayBuffer,
+            )
+        except ModuleNotFoundError as e:
+            print("`gym_pusht` is not installed. Please install it with `pip install 'lerobot[gym_pusht]'`")
+            raise e
+
+        # as define in env
+        success_threshold = 0.95  # 95% coverage,
+
+        dataset_dict = DiffusionPolicyReplayBuffer.copy_from_path(
+            self.zarr_path
+        )  # , keys=['img', 'state', 'action'])
+
+        episode_ids = torch.from_numpy(dataset_dict.get_episode_idxs())
+        num_episodes = dataset_dict.meta["episode_ends"].shape[0]
+        # Every array must hold the same number of frames, i.e. there is exactly one
+        # distinct length. (A bare `assert len(...)` would only check "at least one".)
+        frame_counts = {dataset_dict[key].shape[0] for key in dataset_dict.keys()}  # noqa: SIM118
+        assert len(frame_counts) == 1, "Some data type dont have the same number of total frames."
+
+        # TODO: verify that goal pose is expected to be fixed
+        goal_pos_angle = np.array([256, 256, np.pi / 4])  # x, y, theta (in radians)
+        goal_body = PushTEnv.get_goal_pose_body(goal_pos_angle)
+
+        imgs = torch.from_numpy(dataset_dict["img"])  # b h w c
+        states = torch.from_numpy(dataset_dict["state"])
+        actions = torch.from_numpy(dataset_dict["action"])
+
+        ep_dicts = []
+        episode_data_index = {"from": [], "to": []}
+
+        id_from = 0
+        for episode_id in tqdm.tqdm(range(num_episodes)):
+            # "episode_ends" holds the exclusive end frame of each episode.
+            id_to = dataset_dict.meta["episode_ends"][episode_id]
+
+            num_frames = id_to - id_from
+
+            assert (episode_ids[id_from:id_to] == episode_id).all()
+
+            image = imgs[id_from:id_to]
+            assert image.min() >= 0.0
+            assert image.max() <= 255.0
+            image = image.type(torch.uint8)
+
+            state = states[id_from:id_to]
+            agent_pos = state[:, :2]
+            block_pos = state[:, 2:4]
+            block_angle = state[:, 4]
+
+            reward = torch.zeros(num_frames)
+            success = torch.zeros(num_frames, dtype=torch.bool)
+            done = torch.zeros(num_frames, dtype=torch.bool)
+            for i in range(num_frames):
+                # A fresh space per frame, so each frame's block is built in isolation.
+                space = pymunk.Space()
+                space.gravity = 0, 0
+                space.damping = 0
+
+                # Add walls.
+                walls = [
+                    PushTEnv.add_segment(space, (5, 506), (5, 5), 2),
+                    PushTEnv.add_segment(space, (5, 5), (506, 5), 2),
+                    PushTEnv.add_segment(space, (506, 5), (506, 506), 2),
+                    PushTEnv.add_segment(space, (5, 506), (506, 506), 2),
+                ]
+                space.add(*walls)
+
+                block_body = PushTEnv.add_tee(space, block_pos[i].tolist(), block_angle[i].item())
+                goal_geom = pymunk_to_shapely(goal_body, block_body.shapes)
+                block_geom = pymunk_to_shapely(block_body, block_body.shapes)
+                intersection_area = goal_geom.intersection(block_geom).area
+                goal_area = goal_geom.area
+                coverage = intersection_area / goal_area
+                reward[i] = np.clip(coverage / success_threshold, 0, 1)
+                success[i] = coverage > success_threshold
+
+            # last step of demonstration is considered done
+            done[-1] = True
+
+            ep_dict = {
+                "observation.image": [PILImage.fromarray(x.numpy()) for x in image],
+                "observation.state": agent_pos,
+                "action": actions[id_from:id_to],
+                "episode_index": torch.tensor([episode_id] * num_frames, dtype=torch.int),
+                "frame_index": torch.arange(0, num_frames, 1),
+                "timestamp": torch.arange(0, num_frames, 1) / self.fps,
+                # "next.observation.image": image[1:],
+                # "next.observation.state": agent_pos[1:],
+                # TODO(rcadene): verify that reward and done are aligned with image and agent_pos
+                "next.reward": torch.cat([reward[1:], reward[[-1]]]),
+                "next.done": torch.cat([done[1:], done[[-1]]]),
+                "next.success": torch.cat([success[1:], success[[-1]]]),
+            }
+            ep_dicts.append(ep_dict)
+
+            episode_data_index["from"].append(id_from)
+            episode_data_index["to"].append(id_from + num_frames)
+
+            id_from += num_frames
+
+        data_dict = concatenate_episodes(ep_dicts)
+        return data_dict, episode_data_index
+
+    def to_hf_dataset(self, data_dict):
+        """
+        Converts a dictionary of data into a Hugging Face Dataset object.
+
+        Args:
+            data_dict (dict): A dictionary containing the data to be converted.
+
+        Returns:
+            Dataset: The converted dataset, with `hf_transform_to_torch` set as its transform.
+        """
+        features = {
+            "observation.image": Image(),
+            "observation.state": Sequence(
+                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+            ),
+            "action": Sequence(length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)),
+            "episode_index": Value(dtype="int64", id=None),
+            "frame_index": Value(dtype="int64", id=None),
+            "timestamp": Value(dtype="float32", id=None),
+            "next.reward": Value(dtype="float32", id=None),
+            "next.done": Value(dtype="bool", id=None),
+            "next.success": Value(dtype="bool", id=None),
+            "index": Value(dtype="int64", id=None),
+        }
+        features = Features(features)
+        hf_dataset = Dataset.from_dict(data_dict, features=features)
+        hf_dataset.set_transform(hf_transform_to_torch)
+        return hf_dataset
+
+    def cleanup(self):
+        """No-op: this processor creates nothing that needs to be cleaned up."""
+        pass
--- a/lerobot/common/datasets/push_dataset_to_hub/umi_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/umi_processor.py
@@ -0,0 +1,280 @@
+import os
+import re
+import shutil
+from glob import glob
+
+import numpy as np
+import torch
+import tqdm
+import zarr
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub._umi_imagecodecs_numcodecs import register_codecs
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+
+class UmiProcessor:
+    """
+    Process UMI (Universal Manipulation Interface) data stored in Zarr format like in: https://github.com/real-stanford/universal_manipulation_interface
+
+    Attributes:
+        zarr_path (str): The path to the folder containing the Zarr dataset.
+        fps (int): Frames per second, used to calculate timestamps for frames.
+
+    """
+
+    # Relative folder in which `preprocess` dumps the camera frames and which `cleanup` removes.
+    _TMP_IMAGES_DIR = "tmp_umi_images"
+
+    def __init__(self, folder_path: str, fps: int | None = None):
+        """
+        Args:
+            folder_path (str): Path handed to `zarr.open` when validating and preprocessing.
+            fps (int | None): Frames per second. Falls back to 15 when None.
+        """
+        self.zarr_path = folder_path
+        if fps is None:
+            # TODO (azouitine): Add reference to the paper
+            fps = 15
+        self._fps = fps
+        register_codecs()
+
+    @property
+    def fps(self) -> int:
+        """Frames per second used to derive the `timestamp` column."""
+        return self._fps
+
+    def is_valid(self) -> bool:
+        """
+        Validates the Zarr folder to ensure it contains all required datasets with consistent frame counts.
+
+        Returns:
+            bool: True if all required datasets are present and have consistent frame counts, False otherwise.
+        """
+        # Check if the Zarr folder is valid
+        try:
+            zarr_data = zarr.open(self.zarr_path, mode="r")
+        except Exception:
+            # TODO (azouitine): Handle the exception properly
+            return False
+
+        # Datasets whose first dimension must match the number of camera frames.
+        frame_aligned_datasets = {
+            "data/robot0_demo_end_pose",
+            "data/robot0_demo_start_pose",
+            "data/robot0_eef_pos",
+            "data/robot0_eef_rot_axis_angle",
+            "data/robot0_gripper_width",
+            "data/camera0_rgb",
+        }
+        # `meta/episode_ends` must exist too, but it has one entry per episode rather than per frame.
+        required_datasets = frame_aligned_datasets | {"meta/episode_ends"}
+        if any(dataset not in zarr_data for dataset in required_datasets):
+            return False
+
+        nb_frames = zarr_data["data/camera0_rgb"].shape[0]
+        return all(nb_frames == zarr_data[dataset].shape[0] for dataset in frame_aligned_datasets)
+
+    def preprocess(self):
+        """
+        Collects and processes all episodes from the Zarr dataset into structured data dictionaries.
+
+        The camera frames are not loaded in memory: they are written as image files in the
+        `tmp_umi_images` folder (relative to the current working directory) and
+        `data_dict["observation.image"]` holds the sorted list of their paths. Call `cleanup`
+        once the dataset has been built to delete that folder.
+
+        Returns:
+            Tuple[Dict, Dict]: A tuple containing the structured episode data and episode index mappings.
+        """
+        zarr_data = zarr.open(self.zarr_path, mode="r")
+
+        # We process the image data separately because it is too large to fit in memory
+        end_pose = torch.from_numpy(zarr_data["data/robot0_demo_end_pose"][:])
+        start_pos = torch.from_numpy(zarr_data["data/robot0_demo_start_pose"][:])
+        eef_pos = torch.from_numpy(zarr_data["data/robot0_eef_pos"][:])
+        eef_rot_axis_angle = torch.from_numpy(zarr_data["data/robot0_eef_rot_axis_angle"][:])
+        gripper_width = torch.from_numpy(zarr_data["data/robot0_gripper_width"][:])
+
+        # State vector = end-effector position, then axis-angle rotation, then gripper width.
+        states_pos = torch.cat([eef_pos, eef_rot_axis_angle], dim=1)
+        states = torch.cat([states_pos, gripper_width], dim=1)
+
+        episode_ends = zarr_data["meta/episode_ends"][:]
+        num_episodes: int = episode_ends.shape[0]
+
+        episode_ids = torch.from_numpy(self.get_episode_idxs(episode_ends))
+
+        ep_dicts = []
+        episode_data_index = {"from": [], "to": []}
+        id_from = 0
+
+        for episode_id in tqdm.tqdm(range(num_episodes)):
+            # Plain Python ints keep the slices, the error message and `episode_data_index`
+            # free of 0-d array/tensor wrappers.
+            id_to = int(episode_ends[episode_id])
+            num_frames = id_to - id_from
+
+            assert (
+                episode_ids[id_from:id_to] == episode_id
+            ).all(), f"episode_ids[{id_from}:{id_to}] != {episode_id}"
+
+            state = states[id_from:id_to]
+            ep_dict = {
+                # observation.image will be filled later
+                "observation.state": state,
+                "episode_index": torch.tensor([episode_id] * num_frames, dtype=torch.int),
+                "frame_index": torch.arange(0, num_frames, 1),
+                "timestamp": torch.arange(0, num_frames, 1) / self.fps,
+                "episode_data_index_from": torch.tensor([id_from] * num_frames),
+                "episode_data_index_to": torch.tensor([id_to] * num_frames),
+                "end_pose": end_pose[id_from:id_to],
+                "start_pos": start_pos[id_from:id_to],
+                "gripper_width": gripper_width[id_from:id_to],
+            }
+            ep_dicts.append(ep_dict)
+            episode_data_index["from"].append(id_from)
+            episode_data_index["to"].append(id_to)
+            id_from = id_to
+
+        data_dict = concatenate_episodes(ep_dicts)
+
+        total_frames = id_from
+        data_dict["index"] = torch.arange(0, total_frames, 1)
+
+        print("Saving images to disk in temporary folder...")
+        # datasets.Image() can take a list of paths to images, so we save the images to a temporary folder
+        # to avoid loading them all in memory
+        _save_images_concurrently(
+            data=zarr_data,
+            image_key="data/camera0_rgb",
+            folder_path=self._TMP_IMAGES_DIR,
+            max_workers=12,
+        )
+        print("Saving images to disk in temporary folder... Done")
+
+        # Sort files by number eg. 1.png, 2.png, 3.png, 9.png, 10.png instead of 1.png, 10.png, 2.png, 3.png, 9.png
+        # to correctly match the images with the data. Both extensions that `_save_image` can
+        # produce are accepted, so non-uint8 frames (saved as .jpeg) do not break the sort.
+        images_path = sorted(
+            glob(f"{self._TMP_IMAGES_DIR}/*"),
+            key=lambda x: int(re.search(r"(\d+)\.(?:png|jpeg)$", x).group(1)),
+        )
+        data_dict["observation.image"] = images_path
+        print(f"Images saved to disk, do not forget to delete the folder {self._TMP_IMAGES_DIR}/")
+
+        return data_dict, episode_data_index
+
+    def to_hf_dataset(self, data_dict):
+        """
+        Converts the processed data dictionary into a Hugging Face dataset with defined features.
+
+        Args:
+            data_dict (Dict): The data dictionary containing tensors and episode information.
+
+        Returns:
+            Dataset: A Hugging Face dataset constructed from the provided data dictionary.
+        """
+        features = {
+            "observation.image": Image(),
+            "observation.state": Sequence(
+                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+            ),
+            "episode_index": Value(dtype="int64", id=None),
+            "frame_index": Value(dtype="int64", id=None),
+            "timestamp": Value(dtype="float32", id=None),
+            "index": Value(dtype="int64", id=None),
+            "episode_data_index_from": Value(dtype="int64", id=None),
+            "episode_data_index_to": Value(dtype="int64", id=None),
+            # `start_pos` and `end_pose` respectively represent the positions of the end-effector
+            # at the beginning and the end of the episode.
+            # `gripper_width` indicates the distance between the grippers, and this value is included
+            # in the state vector, which comprises the concatenation of the end-effector position,
+            # its axis-angle rotation and the gripper width.
+            "end_pose": Sequence(
+                length=data_dict["end_pose"].shape[1], feature=Value(dtype="float32", id=None)
+            ),
+            "start_pos": Sequence(
+                length=data_dict["start_pos"].shape[1], feature=Value(dtype="float32", id=None)
+            ),
+            "gripper_width": Sequence(
+                length=data_dict["gripper_width"].shape[1], feature=Value(dtype="float32", id=None)
+            ),
+        }
+        features = Features(features)
+        hf_dataset = Dataset.from_dict(data_dict, features=features)
+        hf_dataset.set_transform(hf_transform_to_torch)
+
+        return hf_dataset
+
+    def cleanup(self):
+        """Delete the temporary images folder written by `preprocess`, if it exists."""
+        if os.path.exists(self._TMP_IMAGES_DIR):
+            print("Removing temporary images folder")
+            shutil.rmtree(self._TMP_IMAGES_DIR)
+            print("Cleanup done")
+
+    @classmethod
+    def get_episode_idxs(cls, episode_ends: np.ndarray) -> np.ndarray:
+        """
+        Expands cumulative episode end offsets into one episode number per frame.
+
+        Vectorized equivalent of this function: https://github.com/real-stanford/universal_manipulation_interface/blob/298776ce251f33b6b3185a98d6e7d1f9ad49168b/diffusion_policy/common/replay_buffer.py#L374
+
+        Args:
+            episode_ends (np.ndarray): Non-decreasing exclusive end index of each episode.
+
+        Returns:
+            np.ndarray: int64 array of length `episode_ends[-1]` (empty when there is no episode)
+            where entry `i` is the number of the episode that frame `i` belongs to.
+
+        Examples:
+        >>> UmiProcessor.get_episode_idxs(np.array([2, 5])).tolist()
+        [0, 0, 1, 1, 1]
+        """
+        episode_ends = np.asarray(episode_ends, dtype=np.int64)
+        # Length of every episode: difference between consecutive ends, the first one starting at 0.
+        episode_lengths = np.diff(episode_ends, prepend=0)
+        return np.repeat(np.arange(episode_ends.shape[0], dtype=np.int64), episode_lengths)
+
+
+def _clear_folder(folder_path: str):
+    """
+    Empties the specified folder, or creates it when it does not exist yet.
+
+    Files and symbolic links are unlinked and sub-directories are removed recursively. An entry
+    that cannot be deleted is reported with `print` and skipped.
+
+    Args:
+    folder_path (str): Path to the folder to clear.
+
+    Examples:
+    >>> import os
+    >>> os.makedirs('example_folder', exist_ok=True)
+    >>> with open('example_folder/temp_file.txt', 'w') as f:
+    ...     _ = f.write('example')
+    >>> _clear_folder('example_folder')
+    >>> os.listdir('example_folder')
+    []
+    """
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+        return
+
+    for entry_name in os.listdir(folder_path):
+        entry_path = os.path.join(folder_path, entry_name)
+        try:
+            unlinkable = os.path.isfile(entry_path) or os.path.islink(entry_path)
+            if unlinkable:
+                os.unlink(entry_path)
+            elif os.path.isdir(entry_path):
+                shutil.rmtree(entry_path)
+        except Exception as error:
+            print(f"Failed to delete {entry_path}. Reason: {error}")
+
+
+def _save_image(img_array: np.array, i: int, folder_path: str):
+    """
+    Writes one image into `folder_path` as `<i>.png` or `<i>.jpeg`.
+
+    The PNG format is used when the array dtype is `np.uint8`, JPEG otherwise.
+
+    Args:
+    img_array (ndarray): The numpy array of the image.
+    i (int): Index of the image, used for naming.
+    folder_path (str): Path to the folder where the image will be saved.
+    """
+    pil_image = PILImage.fromarray(img_array)
+    extension = "png" if img_array.dtype == np.uint8 else "jpeg"
+    destination = os.path.join(folder_path, f"{i}.{extension}")
+    pil_image.save(destination, quality=100)
+
+
+def _save_images_concurrently(data: dict, image_key: str, folder_path: str, max_workers: int = 4):
+    """
+    Saves the images stored under `image_key` in `data` to the specified folder using multithreading.
+
+    The folder is cleared (or created) first. Image number `i` is then handed to `_save_image`,
+    which names the file after `i`.
+
+    Args:
+    data (dict): Mapping (e.g. an opened zarr group) such that `data[image_key]` is an indexable
+        collection of image arrays.
+    image_key (str): Key of the image collection in `data`.
+    folder_path (str): Path to the folder where images will be saved.
+    max_workers (int): The maximum number of threads to use for saving images.
+
+    Raises:
+    Exception: Re-raises the first error encountered while saving an image, so that a partially
+        written folder is never silently used.
+    """
+    from concurrent.futures import ThreadPoolExecutor
+
+    images = data[image_key]
+    num_images = len(images)
+    _clear_folder(folder_path)  # Clear or create folder first
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(_save_image, images[i], i, folder_path) for i in range(num_images)]
+
+    # Leaving the `with` block waits for every task; `result()` surfaces any exception raised in a
+    # worker thread, which would otherwise be lost together with the discarded future.
+    for future in futures:
+        future.result()
--- a/lerobot/common/datasets/push_dataset_to_hub/utils.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/utils.py
@@ -0,0 +1,20 @@
+import torch
+
+
+def concatenate_episodes(ep_dicts):
+    """Merge a list of per-episode dictionaries into a single dataset-wide dictionary.
+
+    Every dictionary is expected to hold the same keys as the first one, including `frame_index`.
+    A key whose value is a tensor in the first episode is concatenated along the first dimension
+    with `torch.cat`; any other value is treated as a sequence (e.g. a list of images or of file
+    paths) and flattened into one list. The check is made on the container itself, so an episode
+    with zero frames does not break the merge.
+
+    Args:
+        ep_dicts: Non-empty list of per-episode dictionaries, in episode order.
+
+    Returns:
+        dict: The merged data, plus an `index` entry numbering all frames from 0 to the total
+        number of frames (taken from the merged `frame_index`) minus one.
+    """
+    data_dict = {}
+
+    first_episode = ep_dicts[0]
+    for key in first_episode:
+        if torch.is_tensor(first_episode[key]):
+            data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts])
+        else:
+            merged = []
+            for ep_dict in ep_dicts:
+                merged.extend(ep_dict[key])
+            data_dict[key] = merged
+
+    total_frames = data_dict["frame_index"].shape[0]
+    data_dict["index"] = torch.arange(0, total_frames, 1)
+    return data_dict
--- a/lerobot/common/datasets/push_dataset_to_hub/xarm_processor.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/xarm_processor.py
@@ -0,0 +1,145 @@
+import pickle
+from pathlib import Path
+
+import einops
+import torch
+import tqdm
+from datasets import Dataset, Features, Image, Sequence, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
+from lerobot.common.datasets.utils import (
+    hf_transform_to_torch,
+)
+
+
+class XarmProcessor:
+    """Process pickle files formatted like in: https://github.com/fyhMer/fowm
+
+    The folder must contain exactly one `.pkl` file holding a dictionary with the flat entries
+    listed in `self.keys` and the nested entries listed in `self.nested_keys`, all of the same
+    length.
+    """
+
+    def __init__(self, folder_path: str, fps: int | None = None):
+        """
+        Args:
+            folder_path (str): Folder searched for the single `*.pkl` file.
+            fps (int | None): Frames per second used for timestamps. Falls back to 15 when None.
+        """
+        self.folder_path = Path(folder_path)
+        self.keys = {"actions", "rewards", "dones", "masks"}
+        self.nested_keys = {"observations": {"rgb", "state"}, "next_observations": {"rgb", "state"}}
+        if fps is None:
+            fps = 15
+        self._fps = fps
+
+    @property
+    def fps(self) -> int:
+        """Frames per second used to derive the `timestamp` column."""
+        return self._fps
+
+    def _load_valid_dataset(self):
+        """Return the unpickled dataset dictionary, or None when the folder content is not valid."""
+        # get all .pkl files
+        xarm_files = list(self.folder_path.glob("*.pkl"))
+        if len(xarm_files) != 1:
+            return None
+
+        try:
+            with open(xarm_files[0], "rb") as f:
+                # NOTE: unpickling can execute arbitrary code; only process files from a trusted source.
+                dataset_dict = pickle.load(f)
+        except Exception:
+            return None
+
+        if not isinstance(dataset_dict, dict):
+            return None
+
+        if not all(k in dataset_dict for k in self.keys):
+            return None
+
+        # Check for consistent lengths in flat and nested keys. Nested keys are looked up
+        # directly so that a missing key or subkey invalidates the file instead of being skipped.
+        try:
+            expected_len = len(dataset_dict["actions"])
+            if any(len(dataset_dict[key]) != expected_len for key in self.keys):
+                return None
+
+            for key, subkeys in self.nested_keys.items():
+                nested_dict = dataset_dict[key]
+                if any(len(nested_dict[subkey]) != expected_len for subkey in subkeys):
+                    return None
+        except (KeyError, TypeError):  # Missing key/subkey, or an entry without a length
+            return None
+
+        return dataset_dict  # All checks passed
+
+    def is_valid(self) -> bool:
+        """Tell whether the folder holds exactly one pickle file with all required, equally long entries."""
+        return self._load_valid_dataset() is not None
+
+    def preprocess(self):
+        """Split the flat buffer into episodes and merge them into a dataset-wide dictionary.
+
+        An episode ends on every frame whose `dones` entry is truthy; frames located after the
+        last such entry are not part of any episode and are left out.
+
+        Returns:
+            Tuple[Dict, Dict]: The concatenated episode data and the `from`/`to` frame indices of
+            every episode.
+
+        Raises:
+            ValueError: If the folder content is invalid or contains no terminated episode.
+        """
+        # Validation and loading share a single unpickling of the (potentially large) file.
+        dataset_dict = self._load_valid_dataset()
+        if dataset_dict is None:
+            raise ValueError("The Xarm file is invalid or does not contain the required datasets.")
+
+        ep_dicts = []
+        episode_data_index = {"from": [], "to": []}
+
+        id_from = 0
+        episode_id = 0
+        total_frames = dataset_dict["actions"].shape[0]
+        for i in tqdm.tqdm(range(total_frames)):
+            if not dataset_dict["dones"][i]:
+                continue
+
+            # Frame `i` closes the current episode: it spans [id_from, id_to).
+            id_to = i + 1
+            num_frames = id_to - id_from
+
+            image = torch.tensor(dataset_dict["observations"]["rgb"][id_from:id_to])
+            # PIL expects channel-last images
+            image = einops.rearrange(image, "b c h w -> b h w c")
+            state = torch.tensor(dataset_dict["observations"]["state"][id_from:id_to])
+            action = torch.tensor(dataset_dict["actions"][id_from:id_to])
+            # TODO(rcadene): we have a missing last frame which is the observation when the env is done
+            # it is critical to have this frame for tdmpc to predict a "done observation/state"
+            # next_image = torch.tensor(dataset_dict["next_observations"]["rgb"][id_from:id_to])
+            # next_state = torch.tensor(dataset_dict["next_observations"]["state"][id_from:id_to])
+            next_reward = torch.tensor(dataset_dict["rewards"][id_from:id_to])
+            next_done = torch.tensor(dataset_dict["dones"][id_from:id_to])
+
+            ep_dict = {
+                "observation.image": [PILImage.fromarray(x.numpy()) for x in image],
+                "observation.state": state,
+                "action": action,
+                "episode_index": torch.tensor([episode_id] * num_frames, dtype=torch.int),
+                "frame_index": torch.arange(0, num_frames, 1),
+                "timestamp": torch.arange(0, num_frames, 1) / self.fps,
+                # "next.observation.image": next_image,
+                # "next.observation.state": next_state,
+                "next.reward": next_reward,
+                "next.done": next_done,
+            }
+            ep_dicts.append(ep_dict)
+
+            episode_data_index["from"].append(id_from)
+            episode_data_index["to"].append(id_to)
+
+            id_from = id_to
+            episode_id += 1
+
+        if not ep_dicts:
+            # Without this check the failure would be an opaque IndexError in `concatenate_episodes`.
+            raise ValueError("The Xarm file does not contain any terminated episode (no truthy `dones`).")
+
+        data_dict = concatenate_episodes(ep_dicts)
+        return data_dict, episode_data_index
+
+    def to_hf_dataset(self, data_dict):
+        """Convert the dictionary returned by `preprocess` into a Hugging Face dataset.
+
+        Args:
+            data_dict (Dict): The concatenated episode data.
+
+        Returns:
+            Dataset: The dataset, with `hf_transform_to_torch` set as its transform.
+        """
+        features = {
+            "observation.image": Image(),
+            "observation.state": Sequence(
+                length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
+            ),
+            "action": Sequence(length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)),
+            "episode_index": Value(dtype="int64", id=None),
+            "frame_index": Value(dtype="int64", id=None),
+            "timestamp": Value(dtype="float32", id=None),
+            "next.reward": Value(dtype="float32", id=None),
+            "next.done": Value(dtype="bool", id=None),
+            # NOTE: there is no `next.success` column, `preprocess` does not produce one.
+            "index": Value(dtype="int64", id=None),
+        }
+        features = Features(features)
+        hf_dataset = Dataset.from_dict(data_dict, features=features)
+        hf_dataset.set_transform(hf_transform_to_torch)
+
+        return hf_dataset
+
+    def cleanup(self):
+        """Nothing to clean up: `preprocess` does not write any temporary file."""
+        pass