Convert datasets to av1 encoding (#302)

2024-07-22 20:08:59 +02:00
parent 461d5472d3
commit 0b21210d72
571 changed files with 988 additions and 1311 deletions
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -35,9 +35,8 @@ from lerobot.common.datasets.utils import (
 )
 from lerobot.common.datasets.video_utils import VideoFrame, load_from_videos

-# For maintainers, see lerobot/common/datasets/push_dataset_to_hub/codebase_version.md
-CODEBASE_VERSION = "v1.5"
-
+# For maintainers, see lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
+CODEBASE_VERSION = "v1.6"
 DATA_DIR = Path(os.environ["DATA_DIR"]) if "DATA_DIR" in os.environ else None


--- a/lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
+++ b/lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
@@ -10,7 +10,8 @@ For instance, [`lerobot/pusht`](https://huggingface.co/datasets/lerobot/pusht) h
 - [v1.2](https://huggingface.co/datasets/lerobot/pusht/tree/v1.2)
 - [v1.3](https://huggingface.co/datasets/lerobot/pusht/tree/v1.3)
 - [v1.4](https://huggingface.co/datasets/lerobot/pusht/tree/v1.4)
- [v1.5](https://huggingface.co/datasets/lerobot/pusht/tree/v1.5) <-- last version
+- [v1.5](https://huggingface.co/datasets/lerobot/pusht/tree/v1.5)
+- [v1.6](https://huggingface.co/datasets/lerobot/pusht/tree/v1.6) <-- last version
 - [main](https://huggingface.co/datasets/lerobot/pusht/tree/main) <-- points to the last version

 Starting with v1.6, every dataset pushed to the hub or saved locally also have this version number in their
@@ -45,13 +46,11 @@ for repo_id in available_datasets:
    dataset_info = api.list_repo_refs(repo_id, repo_type="dataset")
    branches = [b.name for b in dataset_info.branches]
    if CODEBASE_VERSION in branches:
-        # First check if the newer version already exists.
-        print(f"Found existing branch for {repo_id}. Please contact a member of the core LeRobot team.")
-        print("Exiting early")
-        break
+        print(f"{repo_id} already @{CODEBASE_VERSION}, skipping.")
+        continue
    else:
        # Now create a branch named after the new version by branching out from "main"
        # which is expected to be the preceding version
        api.create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION, revision="main")
-        print(f"{repo_id} successfully updated")
+        print(f"{repo_id} successfully updated @{CODEBASE_VERSION}")
 ```
--- a/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_download_raw.py
@@ -19,8 +19,8 @@ This file contains download scripts for raw datasets.
 Example of usage:
 ```
 python lerobot/common/datasets/push_dataset_to_hub/_download_raw.py \
--raw-dir data/cadene/pusht_raw \
--repo-id cadene/pusht_raw
+--raw-dir data/lerobot-raw/pusht_raw \
+--repo-id lerobot-raw/pusht_raw
 ```
 """

@@ -31,63 +31,65 @@ from pathlib import Path

 from huggingface_hub import snapshot_download

-AVAILABLE_RAW_REPO_IDS = [
-    "lerobot-raw/aloha_mobile_cabinet_raw",
-    "lerobot-raw/aloha_mobile_chair_raw",
-    "lerobot-raw/aloha_mobile_elevator_raw",
-    "lerobot-raw/aloha_mobile_shrimp_raw",
-    "lerobot-raw/aloha_mobile_wash_pan_raw",
-    "lerobot-raw/aloha_mobile_wipe_wine_raw",
-    "lerobot-raw/aloha_sim_insertion_human_raw",
-    "lerobot-raw/aloha_sim_insertion_scripted_raw",
-    "lerobot-raw/aloha_sim_transfer_cube_human_raw",
-    "lerobot-raw/aloha_sim_transfer_cube_scripted_raw",
-    "lerobot-raw/aloha_static_battery_raw",
-    "lerobot-raw/aloha_static_candy_raw",
-    "lerobot-raw/aloha_static_coffee_new_raw",
-    "lerobot-raw/aloha_static_coffee_raw",
-    "lerobot-raw/aloha_static_cups_open_raw",
-    "lerobot-raw/aloha_static_fork_pick_up_raw",
-    "lerobot-raw/aloha_static_pingpong_test_raw",
-    "lerobot-raw/aloha_static_pro_pencil_raw",
-    "lerobot-raw/aloha_static_screw_driver_raw",
-    "lerobot-raw/aloha_static_tape_raw",
-    "lerobot-raw/aloha_static_thread_velcro_raw",
-    "lerobot-raw/aloha_static_towel_raw",
-    "lerobot-raw/aloha_static_vinh_cup_left_raw",
-    "lerobot-raw/aloha_static_vinh_cup_raw",
-    "lerobot-raw/aloha_static_ziploc_slide_raw",
-    "lerobot-raw/pusht_raw",
-    "lerobot-raw/umi_cup_in_the_wild_raw",
-    "lerobot-raw/unitreeh1_fold_clothes_raw",
-    "lerobot-raw/unitreeh1_rearrange_objects_raw",
-    "lerobot-raw/unitreeh1_two_robot_greeting_raw",
-    "lerobot-raw/unitreeh1_warehouse_raw",
-    "lerobot-raw/xarm_lift_medium_raw",
-    "lerobot-raw/xarm_lift_medium_replay_raw",
-    "lerobot-raw/xarm_push_medium_raw",
-    "lerobot-raw/xarm_push_medium_replay_raw",
-]
+from lerobot.common.datasets.push_dataset_to_hub.utils import check_repo_id
+
+# {raw_repo_id: raw_format}
+AVAILABLE_RAW_REPO_IDS = {
+    "lerobot-raw/aloha_mobile_cabinet_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_chair_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_elevator_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_shrimp_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_wash_pan_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_mobile_wipe_wine_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_insertion_human_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_insertion_scripted_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_transfer_cube_human_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_sim_transfer_cube_scripted_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_battery_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_candy_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_coffee_new_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_coffee_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_cups_open_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_fork_pick_up_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_pingpong_test_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_pro_pencil_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_screw_driver_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_tape_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_thread_velcro_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_towel_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_vinh_cup_left_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_vinh_cup_raw": "aloha_hdf5",
+    "lerobot-raw/aloha_static_ziploc_slide_raw": "aloha_hdf5",
+    "lerobot-raw/pusht_raw": "pusht_zarr",
+    "lerobot-raw/umi_cup_in_the_wild_raw": "umi_zarr",
+    "lerobot-raw/unitreeh1_fold_clothes_raw": "aloha_hdf5",
+    "lerobot-raw/unitreeh1_rearrange_objects_raw": "aloha_hdf5",
+    "lerobot-raw/unitreeh1_two_robot_greeting_raw": "aloha_hdf5",
+    "lerobot-raw/unitreeh1_warehouse_raw": "aloha_hdf5",
+    "lerobot-raw/xarm_lift_medium_raw": "xarm_pkl",
+    "lerobot-raw/xarm_lift_medium_replay_raw": "xarm_pkl",
+    "lerobot-raw/xarm_push_medium_raw": "xarm_pkl",
+    "lerobot-raw/xarm_push_medium_replay_raw": "xarm_pkl",
+}


 def download_raw(raw_dir: Path, repo_id: str):
-    # Check repo_id is well formated
-    if len(repo_id.split("/")) != 2:
-        raise ValueError(
-            f"`repo_id` is expected to contain a community or user id `/` the name of the dataset (e.g. 'lerobot/pusht'), but contains '{repo_id}'."
-        )
+    check_repo_id(repo_id)
    user_id, dataset_id = repo_id.split("/")

    if not dataset_id.endswith("_raw"):
        warnings.warn(
-            f"`dataset_id` ({dataset_id}) doesn't end with '_raw' (e.g. 'lerobot/pusht_raw'). Following this naming convention by renaming your repository is advised, but not mandatory.",
+            f"""`dataset_id` ({dataset_id}) doesn't end with '_raw' (e.g. 'lerobot/pusht_raw'). Following this
+             naming convention by renaming your repository is advised, but not mandatory.""",
            stacklevel=1,
        )

    # Send warning if raw_dir isn't well formated
    if raw_dir.parts[-2] != user_id or raw_dir.parts[-1] != dataset_id:
        warnings.warn(
-            f"`raw_dir` ({raw_dir}) doesn't contain a community or user id `/` the name of the dataset that match the `repo_id` (e.g. 'data/lerobot/pusht_raw'). Following this naming convention is advised, but not mandatory.",
+            f"""`raw_dir` ({raw_dir}) doesn't contain a community or user id `/` the name of the dataset that
+             match the `repo_id` (e.g. 'data/lerobot/pusht_raw'). Following this naming convention is advised,
+             but not mandatory.""",
            stacklevel=1,
        )
    raw_dir.mkdir(parents=True, exist_ok=True)
@@ -97,8 +99,9 @@ def download_raw(raw_dir: Path, repo_id: str):
    logging.info(f"Finish downloading from huggingface.co/{user_id} for {dataset_id}")


-def download_all_raw_datasets():
-    data_dir = Path("data")
+def download_all_raw_datasets(data_dir: Path | None = None):
+    if data_dir is None:
+        data_dir = Path("data")
    for repo_id in AVAILABLE_RAW_REPO_IDS:
        raw_dir = data_dir / repo_id
        download_raw(raw_dir, repo_id)
@@ -106,7 +109,8 @@ def download_all_raw_datasets():

 def main():
    parser = argparse.ArgumentParser(
-        description=f"A script to download raw datasets from Hugging Face hub to a local directory. Here is a non exhaustive list of available repositories to use in `--repo-id`: {AVAILABLE_RAW_REPO_IDS}",
+        description=f"""A script to download raw datasets from Hugging Face hub to a local directory. Here is a
+            non exhaustive list of available repositories to use in `--repo-id`: {AVAILABLE_RAW_REPO_IDS}""",
    )

    parser.add_argument(
@@ -119,7 +123,8 @@ def main():
        "--repo-id",
        type=str,
        required=True,
-        help="Repositery identifier on Hugging Face: a community or a user name `/` the name of the dataset (e.g. `lerobot/pusht_raw`, `cadene/aloha_sim_insertion_human_raw`).",
+        help="""Repositery identifier on Hugging Face: a community or a user name `/` the name of
+        the dataset (e.g. `lerobot/pusht_raw`, `cadene/aloha_sim_insertion_human_raw`).""",
    )
    args = parser.parse_args()
    download_raw(**vars(args))
--- a/lerobot/common/datasets/push_dataset_to_hub/_encode_datasets.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/_encode_datasets.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Use this script to batch encode LeRobot datasets from their raw format to LeRobotDataset and push their
+updated version to the hub. Under the hood, this script reuses 'push_dataset_to_hub.py'. It assumes that you
+have already downloaded the raw datasets, which you can do with the related '_download_raw.py' script.
+
+For instance, for codebase_version = 'v1.6', the following command was run, assuming raw datasets from
+lerobot-raw were downloaded in 'raw/datasets/directory':
+```bash
+python lerobot/common/datasets/push_dataset_to_hub/_encode_datasets.py \
+  --raw-dir raw/datasets/directory \
+  --raw-repo-ids lerobot-raw \
+  --local-dir push/datasets/directory \
+  --tests-data-dir tests/data \
+  --push-repo lerobot \
+  --vcodec libsvtav1 \
+  --pix-fmt yuv420p \
+  --g 2 \
+  --crf 30
+```
+"""
+
+import argparse
+from pathlib import Path
+
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
+from lerobot.common.datasets.push_dataset_to_hub._download_raw import AVAILABLE_RAW_REPO_IDS
+from lerobot.common.datasets.push_dataset_to_hub.utils import check_repo_id
+from lerobot.scripts.push_dataset_to_hub import push_dataset_to_hub
+
+
+def get_push_repo_id_from_raw(raw_repo_id: str, push_repo: str) -> str:
+    """Build the push repo id, e.g. ('lerobot-raw/pusht_raw', 'lerobot') -> 'lerobot/pusht'."""
+    raw_name = raw_repo_id.split("/")[1]
+    return f"{push_repo}/{raw_name.removesuffix('_raw')}"
+
+
+def encode_datasets(
+    raw_dir: Path,
+    raw_repo_ids: list[str],
+    push_repo: str,
+    vcodec: str,
+    pix_fmt: str,
+    g: int,
+    crf: int,
+    local_dir: Path | None = None,
+    tests_data_dir: Path | None = None,
+    raw_format: str | None = None,
+    dry_run: bool = False,
+) -> None:
+    """Convert the raw datasets found at `raw_dir / raw_repo_id` and push them under `push_repo`.
+
+    With `raw_repo_ids` equal to ['lerobot-raw'] (case-insensitive), every entry of `AVAILABLE_RAW_REPO_IDS`
+    is converted with its own format; otherwise `raw_format` is required (ValueError if missing) and is
+    applied to all ids. A missing raw dataset directory raises NotADirectoryError. With `dry_run`,
+    `push_dataset_to_hub` is never called: the planned conversions are only printed.
+    """
+    if len(raw_repo_ids) == 1 and raw_repo_ids[0].lower() == "lerobot-raw":
+        raw_repo_ids_format = AVAILABLE_RAW_REPO_IDS
+    elif raw_format is None:
+        raise ValueError("`raw_format` must be provided when `raw_repo_ids` is not ['lerobot-raw'].")
+    else:
+        raw_repo_ids_format = {id_: raw_format for id_ in raw_repo_ids}
+
+    for raw_repo_id, repo_raw_format in raw_repo_ids_format.items():
+        check_repo_id(raw_repo_id)
+        dataset_repo_id_push = get_push_repo_id_from_raw(raw_repo_id, push_repo)
+        dataset_raw_dir = raw_dir / raw_repo_id
+        dataset_dir = local_dir / dataset_repo_id_push if local_dir is not None else None
+        if not dataset_raw_dir.is_dir():
+            raise NotADirectoryError(dataset_raw_dir)
+
+        if dry_run:
+            print(
+                f"DRY RUN: {dataset_raw_dir}  -->  {dataset_dir}  -->  {dataset_repo_id_push}@{CODEBASE_VERSION}"
+            )
+            continue
+        push_dataset_to_hub(
+            dataset_raw_dir,
+            raw_format=repo_raw_format,
+            repo_id=dataset_repo_id_push,
+            local_dir=dataset_dir,
+            resume=True,
+            encoding={"vcodec": vcodec, "pix_fmt": pix_fmt, "g": g, "crf": crf},
+            tests_data_dir=tests_data_dir,
+        )
+
+
+def main():
+    """Command-line entry point: parse the options declared below and run `encode_datasets`.
+
+    Every option's destination name (dashes become underscores) matches a keyword parameter of
+    `encode_datasets`, so the parsed namespace is expanded directly into the call.
+    """
+    cli = argparse.ArgumentParser()
+
+    # Input side: where the raw datasets are located and which ones to convert.
+    cli.add_argument(
+        "--raw-dir",
+        type=Path,
+        default=Path("data"),
+        help="Directory where raw datasets are located.",
+    )
+    cli.add_argument(
+        "--raw-repo-ids",
+        type=str,
+        nargs="*",
+        default=["lerobot-raw"],
+        help="""Raw dataset repo ids. if 'lerobot-raw', the keys from `AVAILABLE_RAW_REPO_IDS` will be
+            used and raw datasets will be fetched from the 'lerobot-raw/' repo and pushed with their
+            associated format. It is assumed that each dataset is located at `raw_dir / raw_repo_id` """,
+    )
+    cli.add_argument(
+        "--raw-format",
+        type=str,
+        default=None,
+        help="""Raw format to use for the raw repo-ids. Must be specified if --raw-repo-ids is not
+            'lerobot-raw'""",
+    )
+
+    # Output side: optional local copy and the repo to upload to.
+    cli.add_argument(
+        "--local-dir",
+        type=Path,
+        default=None,
+        help="""When provided, writes the dataset converted to LeRobotDataset format in this directory
+        (e.g. `data/lerobot/aloha_mobile_chair`).""",
+    )
+    cli.add_argument(
+        "--push-repo",
+        type=str,
+        default="lerobot",
+        help="Repo to upload datasets to",
+    )
+
+    # The video encoding options all share one pattern, so they are registered from a table of
+    # (flag, type, default, help) rows, in the order they should appear in `--help`.
+    encoding_options = (
+        ("--vcodec", str, "libsvtav1", "Codec to use for encoding videos"),
+        ("--pix-fmt", str, "yuv420p", "Pixel formats (chroma subsampling) to be used for encoding"),
+        ("--g", int, 2, "Group of pictures sizes to be used for encoding."),
+        ("--crf", int, 30, "Constant rate factors to be used for encoding."),
+    )
+    for flag, value_type, default_value, help_text in encoding_options:
+        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)
+
+    # Remaining switches.
+    cli.add_argument(
+        "--tests-data-dir",
+        type=Path,
+        default=None,
+        help=(
+            "When provided, save tests artifacts into the given directory "
+            "(e.g. `--tests-data-dir tests/data` will save to tests/data/{--repo-id})."
+        ),
+    )
+    cli.add_argument(
+        "--dry-run",
+        type=int,
+        default=0,
+        help="If not set to 0, this script won't download or upload anything.",
+    )
+
+    # Expand the parsed namespace into keyword arguments.
+    cli_args = cli.parse_args()
+    encode_datasets(**vars(cli_args))
+
+
+if __name__ == "__main__":
+    main()
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
@@ -29,7 +29,11 @@ from datasets import Dataset, Features, Image, Sequence, Value
 from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -72,7 +76,14 @@ def check_format(raw_dir) -> bool:
                    assert c < h and c < w, f"Expect (h,w,c) image format but ({h=},{w=},{c=}) provided."


-def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episodes: list[int] | None = None):
+def load_from_raw(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int,
+    video: bool,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+):
    # only frames from simulation are uncompressed
    compressed_images = "sim" not in raw_dir.name

@@ -123,7 +134,7 @@ def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episod
                    # encode images to a mp4 video
                    fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
                    video_path = videos_dir / fname
-                    encode_video_frames(tmp_imgs_dir, video_path, fps)
+                    encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

                    # clean temporary images directory
                    shutil.rmtree(tmp_imgs_dir)
@@ -200,6 +211,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -207,7 +219,7 @@ def from_raw_to_lerobot_format(
    if fps is None:
        fps = 50

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding)
    hf_dataset = to_hf_dataset(data_dict, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -215,4 +227,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
@@ -81,8 +81,9 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
-    if video or episodes is not None:
+    if video or episodes or encoding is not None:
        # TODO(aliberts): support this
        raise NotImplementedError

--- a/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py
@@ -18,6 +18,7 @@ Contains utilities to process raw data format from dora-record
 """

 import re
+import warnings
 from pathlib import Path

 import pandas as pd
@@ -199,6 +200,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -211,6 +213,12 @@ def from_raw_to_lerobot_format(
    if not video:
        raise NotImplementedError()

+    if encoding is not None:
+        warnings.warn(
+            "Video encoding is currently done outside of LeRobot for the dora_parquet format.",
+            stacklevel=1,
+        )
+
    data_df = load_from_raw(raw_dir, videos_dir, fps, episodes)
    hf_dataset = to_hf_dataset(data_df, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
@@ -219,4 +227,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = "unknown"
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py
@@ -26,7 +26,11 @@ from datasets import Dataset, Features, Image, Sequence, Value
 from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -62,6 +66,7 @@ def load_from_raw(
    video: bool,
    episodes: list[int] | None = None,
    keypoints_instead_of_image: bool = False,
+    encoding: dict | None = None,
 ):
    try:
        import pymunk
@@ -172,7 +177,7 @@ def load_from_raw(
                # encode images to a mp4 video
                fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
                video_path = videos_dir / fname
-                encode_video_frames(tmp_imgs_dir, video_path, fps)
+                encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

                # clean temporary images directory
                shutil.rmtree(tmp_imgs_dir)
@@ -244,6 +249,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # Manually change this to True to use keypoints of the T instead of an image observation (but don't merge
    # with True). Also make sure to use video = 0 in the `push_dataset_to_hub.py` script.
@@ -255,7 +261,7 @@ def from_raw_to_lerobot_format(
    if fps is None:
        fps = 10

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, keypoints_instead_of_image)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, keypoints_instead_of_image, encoding)
    hf_dataset = to_hf_dataset(data_dict, video, keypoints_instead_of_image)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -263,4 +269,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video if not keypoints_instead_of_image else 0,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py
@@ -27,7 +27,11 @@ from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
 from lerobot.common.datasets.push_dataset_to_hub._umi_imagecodecs_numcodecs import register_codecs
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -60,7 +64,14 @@ def check_format(raw_dir) -> bool:
    assert all(nb_frames == zarr_data[dataset].shape[0] for dataset in required_datasets)


-def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episodes: list[int] | None = None):
+def load_from_raw(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int,
+    video: bool,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+):
    zarr_path = raw_dir / "cup_in_the_wild.zarr"
    zarr_data = zarr.open(zarr_path, mode="r")

@@ -88,49 +99,61 @@ def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episod
        to_ids.append(to_idx)
        from_idx = to_idx

+    ep_dicts_dir = videos_dir / "ep_dicts"
+    ep_dicts_dir.mkdir(exist_ok=True, parents=True)
    ep_dicts = []
+
    ep_ids = episodes if episodes else range(num_episodes)
    for ep_idx, selected_ep_idx in tqdm.tqdm(enumerate(ep_ids)):
-        from_idx = from_ids[selected_ep_idx]
-        to_idx = to_ids[selected_ep_idx]
-        num_frames = to_idx - from_idx
+        ep_dict_path = ep_dicts_dir / f"{ep_idx}"
+        if not ep_dict_path.is_file():
+            from_idx = from_ids[selected_ep_idx]
+            to_idx = to_ids[selected_ep_idx]
+            num_frames = to_idx - from_idx

-        # TODO(rcadene): save temporary images of the episode?
+            # TODO(rcadene): save temporary images of the episode?

-        state = states[from_idx:to_idx]
+            state = states[from_idx:to_idx]

-        ep_dict = {}
+            ep_dict = {}

-        # load 57MB of images in RAM (400x224x224x3 uint8)
-        imgs_array = zarr_data["data/camera0_rgb"][from_idx:to_idx]
-        img_key = "observation.image"
-        if video:
-            # save png images in temporary directory
-            tmp_imgs_dir = videos_dir / "tmp_images"
-            save_images_concurrently(imgs_array, tmp_imgs_dir)
+            # load 57MB of images in RAM (400x224x224x3 uint8)
+            imgs_array = zarr_data["data/camera0_rgb"][from_idx:to_idx]
+            img_key = "observation.image"
+            if video:
+                fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
+                video_path = videos_dir / fname
+                if not video_path.is_file():
+                    # save png images in temporary directory
+                    tmp_imgs_dir = videos_dir / "tmp_images"
+                    save_images_concurrently(imgs_array, tmp_imgs_dir)

-            # encode images to a mp4 video
-            fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
-            video_path = videos_dir / fname
-            encode_video_frames(tmp_imgs_dir, video_path, fps)
+                    # encode images to a mp4 video
+                    encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

-            # clean temporary images directory
-            shutil.rmtree(tmp_imgs_dir)
+                    # clean temporary images directory
+                    shutil.rmtree(tmp_imgs_dir)

-            # store the reference to the video frame
-            ep_dict[img_key] = [{"path": f"videos/{fname}", "timestamp": i / fps} for i in range(num_frames)]
+                # store the reference to the video frame
+                ep_dict[img_key] = [
+                    {"path": f"videos/{fname}", "timestamp": i / fps} for i in range(num_frames)
+                ]
+            else:
+                ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
+
+            ep_dict["observation.state"] = state
+            ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int64)
+            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
+            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
+            ep_dict["episode_data_index_from"] = torch.tensor([from_idx] * num_frames)
+            ep_dict["episode_data_index_to"] = torch.tensor([from_idx + num_frames] * num_frames)
+            ep_dict["end_pose"] = end_pose[from_idx:to_idx]
+            ep_dict["start_pos"] = start_pos[from_idx:to_idx]
+            ep_dict["gripper_width"] = gripper_width[from_idx:to_idx]
+            torch.save(ep_dict, ep_dict_path)
        else:
-            ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
+            ep_dict = torch.load(ep_dict_path)

-        ep_dict["observation.state"] = state
-        ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int64)
-        ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
-        ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
-        ep_dict["episode_data_index_from"] = torch.tensor([from_idx] * num_frames)
-        ep_dict["episode_data_index_to"] = torch.tensor([from_idx + num_frames] * num_frames)
-        ep_dict["end_pose"] = end_pose[from_idx:to_idx]
-        ep_dict["start_pos"] = start_pos[from_idx:to_idx]
-        ep_dict["gripper_width"] = gripper_width[from_idx:to_idx]
        ep_dicts.append(ep_dict)

    data_dict = concatenate_episodes(ep_dicts)
@@ -183,6 +206,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -196,7 +220,7 @@ def from_raw_to_lerobot_format(
            "Generating UMI dataset without `video=True` creates ~150GB on disk and requires ~80GB in RAM."
        )

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding)
    hf_dataset = to_hf_dataset(data_dict, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -204,4 +228,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/push_dataset_to_hub/utils.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/utils.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path

@@ -20,6 +21,8 @@ import numpy
 import PIL
 import torch

+from lerobot.common.datasets.video_utils import encode_video_frames
+

 def concatenate_episodes(ep_dicts):
    data_dict = {}
@@ -51,3 +54,21 @@ def save_images_concurrently(imgs_array: numpy.array, out_dir: Path, max_workers
    num_images = len(imgs_array)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        [executor.submit(save_image, imgs_array[i], i, out_dir) for i in range(num_images)]
+
+
+def get_default_encoding() -> dict:
+    """Collect the defaults that `encode_video_frames` declares for its ffmpeg encoding arguments."""
+    encoding_keys = ("vcodec", "pix_fmt", "g", "crf")
+    defaults = {}
+    for name, param in inspect.signature(encode_video_frames).parameters.items():
+        if name in encoding_keys and param.default is not inspect.Parameter.empty:
+            defaults[name] = param.default
+    return defaults
+
+
+def check_repo_id(repo_id: str) -> None:
+    """Raise ValueError unless `repo_id` is '<community or user id>/<dataset name>', both parts non-empty."""
+    if len(repo_id.split("/")) != 2 or not all(repo_id.split("/")):
+        # Single-line message: a triple-quoted string would embed a newline and its indentation.
+        expected = "`repo_id` is expected to contain a community or user id `/` the name of the dataset"
+        raise ValueError(f"{expected} (e.g. 'lerobot/pusht'), but contains '{repo_id}'.")
--- a/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py
@@ -26,7 +26,11 @@ from datasets import Dataset, Features, Image, Sequence, Value
 from PIL import Image as PILImage

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
+from lerobot.common.datasets.push_dataset_to_hub.utils import (
+    concatenate_episodes,
+    get_default_encoding,
+    save_images_concurrently,
+)
 from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
@@ -56,7 +60,14 @@ def check_format(raw_dir):
        assert all(len(nested_dict[subkey]) == expected_len for subkey in subkeys if subkey in nested_dict)


-def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episodes: list[int] | None = None):
+def load_from_raw(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int,
+    video: bool,
+    episodes: list[int] | None = None,
+    encoding: dict | None = None,
+):
    pkl_path = raw_dir / "buffer.pkl"

    with open(pkl_path, "rb") as f:
@@ -105,7 +116,7 @@ def load_from_raw(raw_dir: Path, videos_dir: Path, fps: int, video: bool, episod
            # encode images to a mp4 video
            fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
            video_path = videos_dir / fname
-            encode_video_frames(tmp_imgs_dir, video_path, fps)
+            encode_video_frames(tmp_imgs_dir, video_path, fps, **(encoding or {}))

            # clean temporary images directory
            shutil.rmtree(tmp_imgs_dir)
@@ -167,6 +178,7 @@ def from_raw_to_lerobot_format(
    fps: int | None = None,
    video: bool = True,
    episodes: list[int] | None = None,
+    encoding: dict | None = None,
 ):
    # sanity check
    check_format(raw_dir)
@@ -174,7 +186,7 @@ def from_raw_to_lerobot_format(
    if fps is None:
        fps = 15

-    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes)
+    data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding)
    hf_dataset = to_hf_dataset(data_dict, video)
    episode_data_index = calculate_episode_data_index(hf_dataset)
    info = {
@@ -182,4 +194,7 @@ def from_raw_to_lerobot_format(
        "fps": fps,
        "video": video,
    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -166,10 +166,10 @@ def encode_video_frames(
    imgs_dir: Path,
    video_path: Path,
    fps: int,
-    video_codec: str = "libsvtav1",
-    pixel_format: str = "yuv420p",
-    group_of_pictures_size: int | None = 2,
-    constant_rate_factor: int | None = 30,
+    vcodec: str = "libsvtav1",
+    pix_fmt: str = "yuv420p",
+    g: int | None = 2,
+    crf: int | None = 30,
    fast_decode: int = 0,
    log_level: str | None = "error",
    overwrite: bool = False,
@@ -183,20 +183,20 @@ def encode_video_frames(
            ("-f", "image2"),
            ("-r", str(fps)),
            ("-i", str(imgs_dir / "frame_%06d.png")),
-            ("-vcodec", video_codec),
-            ("-pix_fmt", pixel_format),
+            ("-vcodec", vcodec),
+            ("-pix_fmt", pix_fmt),
        ]
    )

-    if group_of_pictures_size is not None:
-        ffmpeg_args["-g"] = str(group_of_pictures_size)
+    if g is not None:
+        ffmpeg_args["-g"] = str(g)

-    if constant_rate_factor is not None:
-        ffmpeg_args["-crf"] = str(constant_rate_factor)
+    if crf is not None:
+        ffmpeg_args["-crf"] = str(crf)

    if fast_decode:
-        key = "-svtav1-params" if video_codec == "libsvtav1" else "-tune"
-        value = f"fast-decode={fast_decode}" if video_codec == "libsvtav1" else "fastdecode"
+        key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune"
+        value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
        ffmpeg_args[key] = value

    if log_level is not None: