Enable video_reader backend (#220)

Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
2024-06-19 17:15:25 +02:00
parent 48951662f2
commit 2abef3bef9
11 changed files with 464 additions and 220 deletions
--- a/lerobot/common/datasets/_video_benchmark/capture_camera_feed.py
+++ b/lerobot/common/datasets/_video_benchmark/capture_camera_feed.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Capture video feed from a camera as raw images."""
+
+import argparse
+import datetime as dt
+from pathlib import Path
+
+import cv2
+
+
+def display_and_save_video_stream(output_dir: Path, fps: int, width: int, height: int):
+    """Display the default webcam feed and save every frame as a PNG image.
+
+    Frames are written as `frame_{index:06d}.png` into a subfolder of `output_dir` named after the
+    date and time at which the capture started. The capture runs until 'q' is pressed in the
+    display window or until a frame cannot be read.
+
+    Args:
+        output_dir: Root directory in which the date/time capture folder is created.
+        fps: Frame rate requested from the camera.
+        width: Frame width requested from the camera.
+        height: Frame height requested from the camera.
+    """
+    now = dt.datetime.now()
+    capture_dir = output_dir / f"{now:%Y-%m-%d}" / f"{now:%H-%M-%S}"
+    # `exist_ok=True` already covers the case where the directory exists.
+    capture_dir.mkdir(parents=True, exist_ok=True)
+
+    # Opens the default webcam
+    cap = cv2.VideoCapture(0)
+    if not cap.isOpened():
+        print("Error: Could not open video stream.")
+        return
+
+    # Release the camera and close the window even if the loop is interrupted (e.g. Ctrl+C)
+    # or if displaying/writing a frame raises.
+    try:
+        cap.set(cv2.CAP_PROP_FPS, fps)
+        cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
+        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
+
+        frame_index = 0
+        while True:
+            ret, frame = cap.read()
+
+            if not ret:
+                print("Error: Could not read frame.")
+                break
+
+            cv2.imshow("Video Stream", frame)
+            cv2.imwrite(str(capture_dir / f"frame_{frame_index:06d}.png"), frame)
+            frame_index += 1
+
+            # Break the loop on 'q' key press
+            if cv2.waitKey(1) & 0xFF == ord("q"):
+                break
+    finally:
+        # Release the capture and destroy all windows
+        cap.release()
+        cv2.destroyAllWindows()
+
+
+if __name__ == "__main__":
+    # Command-line entry point: each option maps to the same-named keyword argument of
+    # `display_and_save_video_stream`.
+    cli = argparse.ArgumentParser()
+
+    cli.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("outputs/cam_capture/"),
+        help="Directory where the capture images are written. A subfolder named with the current date & time will be created inside it for each capture.",
+    )
+    # The remaining options are all integers and only differ by flag, default and help text.
+    int_options = (
+        ("--fps", 30, "Frames Per Second of the capture."),
+        ("--width", 1280, "Width of the captured images."),
+        ("--height", 720, "Height of the captured images."),
+    )
+    for flag, default_value, help_text in int_options:
+        cli.add_argument(flag, type=int, default=default_value, help=help_text)
+
+    cli_args = cli.parse_args()
+    display_and_save_video_stream(**vars(cli_args))
--- a/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
+++ b/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
@@ -13,6 +13,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Assess the performance of video decoding in various configurations.
+
+This script will run different video decoding benchmarks where one parameter varies at a time.
+These parameters and their values are specified in the BENCHMARKS dict.
+
+All of these benchmarks are evaluated within different timestamps modes corresponding to different frame-loading scenarios:
+    - `1_frame`: 1 single frame is loaded.
+    - `2_frames`: 2 consecutive frames are loaded.
+    - `2_frames_4_space`: 2 frames separated by 4 frames are loaded.
+    - `6_frames`: 6 consecutive frames are loaded.
+
+These values are more or less arbitrary and based on possible future usage.
+
+These benchmarks are run on the first episode of each dataset specified in DATASET_REPO_IDS.
+Note: These datasets need to be image datasets, not video datasets.
+"""
+
 import json
 import random
 import shutil
@@ -21,15 +38,38 @@ import time
 from pathlib import Path

 import einops
-import numpy
+import numpy as np
 import PIL
 import torch
+from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity

 from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
 from lerobot.common.datasets.video_utils import (
    decode_video_frames_torchvision,
 )

+# Root directory where benchmark outputs are written (one subfolder per timestamps mode).
+OUTPUT_DIR = Path("tmp/run_video_benchmark")
+# When True, benchmarks are not re-run: results saved by a previous run are only loaded and displayed.
+DRY_RUN = False
+
+# Datasets on which the benchmarks are run. They must be image datasets, not video datasets.
+DATASET_REPO_IDS = [
+    "lerobot/pusht_image",
+    "aliberts/aloha_mobile_shrimp_image",
+    "aliberts/paris_street",
+    "aliberts/kitchen",
+]
+# Frame-loading scenarios under which every benchmark is evaluated.
+TIMESTAMPS_MODES = [
+    "1_frame",
+    "2_frames",
+    "2_frames_4_space",
+    "6_frames",
+]
+# Maps the name of the config entry being varied to the list of values to benchmark.
+# Uncomment an entry to include it in the run.
+BENCHMARKS = {
+    # "pix_fmt": ["yuv420p", "yuv444p"],
+    # "g": [1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None],
+    # "crf": [0, 5, 10, 15, 20, None, 25, 30, 40, 50],
+    "backend": ["pyav", "video_reader"],
+}
+

 def get_directory_size(directory):
    total_size = 0
@@ -56,6 +96,10 @@ def run_video_benchmark(

    # TODO(rcadene): rewrite with hardcoding of original images and episodes
    dataset = LeRobotDataset(repo_id)
+    if dataset.video:
+        raise ValueError(
+            f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}"
+        )

    # Get fps
    fps = dataset.fps
@@ -68,10 +112,11 @@ def run_video_benchmark(
    if not imgs_dir.exists():
        imgs_dir.mkdir(parents=True, exist_ok=True)
        hf_dataset = dataset.hf_dataset.with_format(None)
-        imgs_dataset = hf_dataset.select_columns("observation.image")
+        img_keys = [key for key in hf_dataset.features if key.startswith("observation.image")]
+        imgs_dataset = hf_dataset.select_columns(img_keys[0])

        for i, item in enumerate(imgs_dataset):
-            img = item["observation.image"]
+            img = item[img_keys[0]]
            img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)

            if i >= ep_num_images - 1:
@@ -107,7 +152,7 @@ def run_video_benchmark(

    decoder = cfg["decoder"]
    decoder_kwgs = cfg["decoder_kwgs"]
-    device = cfg["device"]
+    backend = cfg["backend"]

    if decoder == "torchvision":
        decode_frames_fn = decode_video_frames_torchvision
@@ -116,12 +161,12 @@ def run_video_benchmark(

    # Estimate average loading time

-    def load_original_frames(imgs_dir, timestamps):
+    def load_original_frames(imgs_dir, timestamps) -> torch.Tensor:
        frames = []
        for ts in timestamps:
            idx = int(ts * fps)
            frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
-            frame = torch.from_numpy(numpy.array(frame))
+            frame = torch.from_numpy(np.array(frame))
            frame = frame.type(torch.float32) / 255
            frame = einops.rearrange(frame, "h w c -> c h w")
            frames.append(frame)
@@ -130,6 +175,9 @@ def run_video_benchmark(
    list_avg_load_time = []
    list_avg_load_time_from_images = []
    per_pixel_l2_errors = []
+    psnr_values = []
+    ssim_values = []
+    mse_values = []

    random.seed(seed)

@@ -142,7 +190,7 @@ def run_video_benchmark(
        elif timestamps_mode == "2_frames":
            timestamps = [ts - 1 / fps, ts]
        elif timestamps_mode == "2_frames_4_space":
-            timestamps = [ts - 4 / fps, ts]
+            timestamps = [ts - 5 / fps, ts]
        elif timestamps_mode == "6_frames":
            timestamps = [ts - i / fps for i in range(6)][::-1]
        else:
@@ -152,7 +200,7 @@ def run_video_benchmark(

        start_time_s = time.monotonic()
        frames = decode_frames_fn(
-            video_path, timestamps=timestamps, tolerance_s=1e-4, device=device, **decoder_kwgs
+            video_path, timestamps=timestamps, tolerance_s=1e-4, backend=backend, **decoder_kwgs
        )
        avg_load_time = (time.monotonic() - start_time_s) / num_frames
        list_avg_load_time.append(avg_load_time)
@@ -162,11 +210,19 @@ def run_video_benchmark(
        avg_load_time_from_images = (time.monotonic() - start_time_s) / num_frames
        list_avg_load_time_from_images.append(avg_load_time_from_images)

-        # Estimate average L2 error between original frames and decoded frames
+        # Estimate reconstruction error between original frames and decoded frames with various metrics
        for i, ts in enumerate(timestamps):
            # are_close = torch.allclose(frames[i], original_frames[i], atol=0.02)
            num_pixels = original_frames[i].numel()
            per_pixel_l2_error = torch.norm(frames[i] - original_frames[i], p=2).item() / num_pixels
+            per_pixel_l2_errors.append(per_pixel_l2_error)
+
+            frame_np, original_frame_np = frames[i].numpy(), original_frames[i].numpy()
+            psnr_values.append(peak_signal_noise_ratio(original_frame_np, frame_np, data_range=1.0))
+            ssim_values.append(
+                structural_similarity(original_frame_np, frame_np, data_range=1.0, channel_axis=0)
+            )
+            mse_values.append(mean_squared_error(original_frame_np, frame_np))

            # save decoded frames
            if t == 0:
@@ -179,15 +235,18 @@ def run_video_benchmark(
                original_frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
                original_frame.save(output_dir / f"original_frame_{i:06d}.png")

-            per_pixel_l2_errors.append(per_pixel_l2_error)
-
-    avg_load_time = float(numpy.array(list_avg_load_time).mean())
-    avg_load_time_from_images = float(numpy.array(list_avg_load_time_from_images).mean())
-    avg_per_pixel_l2_error = float(numpy.array(per_pixel_l2_errors).mean())
+    image_size = tuple(dataset[0][dataset.camera_keys[0]].shape[-2:])
+    avg_load_time = float(np.array(list_avg_load_time).mean())
+    avg_load_time_from_images = float(np.array(list_avg_load_time_from_images).mean())
+    avg_per_pixel_l2_error = float(np.array(per_pixel_l2_errors).mean())
+    avg_psnr = float(np.mean(psnr_values))
+    avg_ssim = float(np.mean(ssim_values))
+    avg_mse = float(np.mean(mse_values))

    # Save benchmark info

    info = {
+        "image_size": image_size,
        "sum_original_frames_size_bytes": sum_original_frames_size_bytes,
        "video_size_bytes": video_size_bytes,
        "avg_load_time_from_images": avg_load_time_from_images,
@@ -195,6 +254,9 @@ def run_video_benchmark(
        "compression_factor": sum_original_frames_size_bytes / video_size_bytes,
        "load_time_factor": avg_load_time_from_images / avg_load_time,
        "avg_per_pixel_l2_error": avg_per_pixel_l2_error,
+        "avg_psnr": avg_psnr,
+        "avg_ssim": avg_ssim,
+        "avg_mse": avg_mse,
    }

    with open(output_dir / "info.json", "w") as f:
@@ -234,138 +296,113 @@ def load_info(out_dir):
    return info


-def main():
-    out_dir = Path("tmp/run_video_benchmark")
-    dry_run = False
-    repo_ids = ["lerobot/pusht", "lerobot/umi_cup_in_the_wild"]
-    timestamps_modes = [
-        "1_frame",
-        "2_frames",
-        "2_frames_4_space",
-        "6_frames",
+def one_variable_study(
+    var_name: str, var_values: list, repo_ids: list, bench_dir: Path, timestamps_mode: str, dry_run: bool
+):
+    """Benchmark every value of a single config entry on each dataset and print a markdown table.
+
+    All other config entries keep the values of the base config defined below.
+
+    Args:
+        var_name: Key of the benchmark config to vary (e.g. "backend", "crf").
+        var_values: Values to try for `var_name`.
+        repo_ids: Datasets on which each value is benchmarked.
+        bench_dir: Directory under which each run writes (and from which it reloads) its results.
+        timestamps_mode: Frame-loading scenario forwarded to `run_video_benchmark`.
+        dry_run: If True, skip running the benchmark and only load previously saved results.
+    """
+    print(f"**`{var_name}`**")
+    headers = [
+        "repo_id",
+        "image_size",
+        var_name,
+        "compression_factor",
+        "load_time_factor",
+        "avg_per_pixel_l2_error",
+        "avg_psnr",
+        "avg_ssim",
+        "avg_mse",
     ]
-    for timestamps_mode in timestamps_modes:
-        bench_dir = out_dir / timestamps_mode
+    rows = []
+    base_cfg = {
+        "repo_id": None,
+        # video encoding
+        "g": 2,
+        "crf": None,
+        "pix_fmt": "yuv444p",
+        # video decoding
+        "backend": "pyav",
+        "decoder": "torchvision",
+        "decoder_kwgs": {},
+    }
+    for repo_id in repo_ids:
+        for val in var_values:
+            cfg = base_cfg.copy()
+            cfg["repo_id"] = repo_id
+            cfg[var_name] = val
+            run_dir = bench_dir / repo_id / f"torchvision_{var_name}_{val}"
+            if not dry_run:
+                run_video_benchmark(run_dir, cfg, timestamps_mode)
+            info = load_info(run_dir)
+            # `image_size` is saved as the last two dims of a frame tensor, i.e. (height, width)
+            # assuming channel-first (c, h, w) frames.
+            height, width = info["image_size"][0], info["image_size"][1]
+            rows.append(
+                [
+                    repo_id,
+                    f"{width} x {height}",
+                    val,
+                    info["compression_factor"],
+                    info["load_time_factor"],
+                    info["avg_per_pixel_l2_error"],
+                    info["avg_psnr"],
+                    info["avg_ssim"],
+                    info["avg_mse"],
+                ]
+            )
+    display_markdown_table(headers, rows)
+
+
+def best_study(repo_ids: list, bench_dir: Path, timestamps_mode: str, dry_run: bool):
+    """Benchmark the configuration considered best on each dataset and print a markdown table.
+
+    Change the config once you have decided what's best based on the one-variable studies.
+
+    Args:
+        repo_ids: Datasets on which the config is benchmarked.
+        bench_dir: Directory under which each run writes (and from which it reloads) its results.
+        timestamps_mode: Frame-loading scenario forwarded to `run_video_benchmark`.
+        dry_run: If True, skip running the benchmark and only load previously saved results.
+    """
+    print("**best**")
+    headers = [
+        "repo_id",
+        "image_size",
+        "compression_factor",
+        "load_time_factor",
+        "avg_per_pixel_l2_error",
+        "avg_psnr",
+        "avg_ssim",
+        "avg_mse",
+    ]
+    rows = []
+    for repo_id in repo_ids:
+        cfg = {
+            "repo_id": repo_id,
+            # video encoding
+            "g": 2,
+            "crf": None,
+            "pix_fmt": "yuv444p",
+            # video decoding
+            "backend": "video_reader",
+            "decoder": "torchvision",
+            "decoder_kwgs": {},
+        }
+        run_dir = bench_dir / repo_id / "torchvision_best"
+        if not dry_run:
+            run_video_benchmark(run_dir, cfg, timestamps_mode)
+        info = load_info(run_dir)
+        # `image_size` is saved as the last two dims of a frame tensor, i.e. (height, width)
+        # assuming channel-first (c, h, w) frames.
+        height, width = info["image_size"][0], info["image_size"][1]
+        # Each row must provide one value per header.
+        rows.append(
+            [
+                repo_id,
+                f"{width} x {height}",
+                info["compression_factor"],
+                info["load_time_factor"],
+                info["avg_per_pixel_l2_error"],
+                info["avg_psnr"],
+                info["avg_ssim"],
+                info["avg_mse"],
+            ]
+        )
+    display_markdown_table(headers, rows)
+
+
+def main():
+    for timestamps_mode in TIMESTAMPS_MODES:
+        bench_dir = OUTPUT_DIR / timestamps_mode

        print(f"### `{timestamps_mode}`")
        print()

-        print("**`pix_fmt`**")
-        headers = ["repo_id", "pix_fmt", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            for pix_fmt in ["yuv420p", "yuv444p"]:
-                cfg = {
-                    "repo_id": repo_id,
-                    # video encoding
-                    "g": 2,
-                    "crf": None,
-                    "pix_fmt": pix_fmt,
-                    # video decoding
-                    "device": "cpu",
-                    "decoder": "torchvision",
-                    "decoder_kwgs": {},
-                }
-                if not dry_run:
-                    run_video_benchmark(bench_dir / repo_id / f"torchvision_{pix_fmt}", cfg, timestamps_mode)
-                info = load_info(bench_dir / repo_id / f"torchvision_{pix_fmt}")
-                rows.append(
-                    [
-                        repo_id,
-                        pix_fmt,
-                        info["compression_factor"],
-                        info["load_time_factor"],
-                        info["avg_per_pixel_l2_error"],
-                    ]
-                )
-        display_markdown_table(headers, rows)
+        for name, values in BENCHMARKS.items():
+            one_variable_study(name, values, DATASET_REPO_IDS, bench_dir, timestamps_mode, DRY_RUN)

-        print("**`g`**")
-        headers = ["repo_id", "g", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            for g in [1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None]:
-                cfg = {
-                    "repo_id": repo_id,
-                    # video encoding
-                    "g": g,
-                    "pix_fmt": "yuv444p",
-                    # video decoding
-                    "device": "cpu",
-                    "decoder": "torchvision",
-                    "decoder_kwgs": {},
-                }
-                if not dry_run:
-                    run_video_benchmark(bench_dir / repo_id / f"torchvision_g_{g}", cfg, timestamps_mode)
-                info = load_info(bench_dir / repo_id / f"torchvision_g_{g}")
-                rows.append(
-                    [
-                        repo_id,
-                        g,
-                        info["compression_factor"],
-                        info["load_time_factor"],
-                        info["avg_per_pixel_l2_error"],
-                    ]
-                )
-        display_markdown_table(headers, rows)
-
-        print("**`crf`**")
-        headers = ["repo_id", "crf", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            for crf in [0, 5, 10, 15, 20, None, 25, 30, 40, 50]:
-                cfg = {
-                    "repo_id": repo_id,
-                    # video encoding
-                    "g": 2,
-                    "crf": crf,
-                    "pix_fmt": "yuv444p",
-                    # video decoding
-                    "device": "cpu",
-                    "decoder": "torchvision",
-                    "decoder_kwgs": {},
-                }
-                if not dry_run:
-                    run_video_benchmark(bench_dir / repo_id / f"torchvision_crf_{crf}", cfg, timestamps_mode)
-                info = load_info(bench_dir / repo_id / f"torchvision_crf_{crf}")
-                rows.append(
-                    [
-                        repo_id,
-                        crf,
-                        info["compression_factor"],
-                        info["load_time_factor"],
-                        info["avg_per_pixel_l2_error"],
-                    ]
-                )
-        display_markdown_table(headers, rows)
-
-        print("**best**")
-        headers = ["repo_id", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
-        rows = []
-        for repo_id in repo_ids:
-            cfg = {
-                "repo_id": repo_id,
-                # video encoding
-                "g": 2,
-                "crf": None,
-                "pix_fmt": "yuv444p",
-                # video decoding
-                "device": "cpu",
-                "decoder": "torchvision",
-                "decoder_kwgs": {},
-            }
-            if not dry_run:
-                run_video_benchmark(bench_dir / repo_id / "torchvision_best", cfg, timestamps_mode)
-            info = load_info(bench_dir / repo_id / "torchvision_best")
-            rows.append(
-                [
-                    repo_id,
-                    info["compression_factor"],
-                    info["load_time_factor"],
-                    info["avg_per_pixel_l2_error"],
-                ]
-            )
-        display_markdown_table(headers, rows)
+        # best_study(DATASET_REPO_IDS, bench_dir, timestamps_mode, DRY_RUN)


 if __name__ == "__main__":
--- a/lerobot/common/datasets/factory.py
+++ b/lerobot/common/datasets/factory.py
@@ -96,6 +96,7 @@ def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotData
            split=split,
            delta_timestamps=cfg.training.get("delta_timestamps"),
            image_transforms=image_transforms,
+            video_backend=cfg.video_backend,
        )
    else:
        dataset = MultiLeRobotDataset(
@@ -103,6 +104,7 @@ def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotData
            split=split,
            delta_timestamps=cfg.training.get("delta_timestamps"),
            image_transforms=image_transforms,
+            video_backend=cfg.video_backend,
        )

    if cfg.get("override_dataset_stats"):
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -48,6 +48,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        split: str = "train",
        image_transforms: Callable | None = None,
        delta_timestamps: dict[list[float]] | None = None,
+        video_backend: str | None = None,
    ):
        super().__init__()
        self.repo_id = repo_id
@@ -69,6 +70,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        self.info = load_info(repo_id, version, root)
        if self.video:
            self.videos_dir = load_videos(repo_id, version, root)
+            self.video_backend = video_backend if video_backend is not None else "pyav"

    @property
    def fps(self) -> int:
@@ -149,6 +151,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                self.video_frame_keys,
                self.videos_dir,
                self.tolerance_s,
+                self.video_backend,
            )

        if self.image_transforms is not None:
@@ -188,6 +191,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        stats=None,
        info=None,
        videos_dir=None,
+        video_backend=None,
    ) -> "LeRobotDataset":
        """Create a LeRobot Dataset from existing data and attributes instead of loading from the filesystem.

@@ -210,6 +214,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        obj.stats = stats
        obj.info = info if info is not None else {}
        obj.videos_dir = videos_dir
+        obj.video_backend = video_backend if video_backend is not None else "pyav"
        return obj


@@ -228,6 +233,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
        split: str = "train",
        image_transforms: Callable | None = None,
        delta_timestamps: dict[list[float]] | None = None,
+        video_backend: str | None = None,
    ):
        super().__init__()
        self.repo_ids = repo_ids
@@ -241,6 +247,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
                split=split,
                delta_timestamps=delta_timestamps,
                image_transforms=image_transforms,
+                video_backend=video_backend,
            )
            for repo_id in repo_ids
        ]
--- a/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contains utilities to process raw data format of png images files recorded with capture_camera_feed.py
+"""
+
+from pathlib import Path
+
+import torch
+from datasets import Dataset, Features, Image, Value
+from PIL import Image as PILImage
+
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
+from lerobot.common.datasets.utils import calculate_episode_data_index, hf_transform_to_torch
+from lerobot.common.datasets.video_utils import VideoFrame
+
+
+def check_format(raw_dir: Path) -> bool:
+    """Check that `raw_dir` contains at least one `frame_*.png` image.
+
+    Returns:
+        True when at least one matching image is found.
+
+    Raises:
+        ValueError: If no `frame_*.png` file is found directly inside `raw_dir`.
+    """
+    if next(raw_dir.glob("frame_*.png"), None) is None:
+        raise ValueError(f"No 'frame_*.png' image found in {raw_dir}")
+    return True
+
+
+def load_from_raw(raw_dir: Path, fps: int, episodes: list[int] | None = None):
+    """Build the data dict of a single episode from the `frame_*.png` images found in `raw_dir`.
+
+    Images are taken in sorted filename order; frame `i` gets the timestamp `i / fps`.
+    Selecting episodes is not supported yet, so `episodes` must be None.
+    """
+    if episodes is not None:
+        # TODO(aliberts): add support for multi-episodes.
+        raise NotImplementedError()
+
+    frame_paths = sorted(raw_dir.glob("frame_*.png"))
+    frame_count = len(frame_paths)
+
+    # All frames belong to one episode, with index 0.
+    episode = {
+        "observation.image": [PILImage.open(path) for path in frame_paths],
+        "episode_index": torch.tensor([0] * frame_count),
+        "frame_index": torch.arange(0, frame_count, 1),
+        "timestamp": torch.arange(0, frame_count, 1) / fps,
+    }
+
+    data_dict = concatenate_episodes([episode])
+    # Global frame index over the concatenated data.
+    data_dict["index"] = torch.arange(0, data_dict["frame_index"].shape[0], 1)
+    return data_dict
+
+
+def to_hf_dataset(data_dict, video) -> Dataset:
+    """Wrap `data_dict` into a Hugging Face dataset using `hf_transform_to_torch` as its transform.
+
+    The "observation.image" column is declared as `VideoFrame` when `video` is truthy, else `Image`.
+    """
+    image_feature = VideoFrame() if video else Image()
+    schema = Features(
+        {
+            "observation.image": image_feature,
+            "episode_index": Value(dtype="int64", id=None),
+            "frame_index": Value(dtype="int64", id=None),
+            "timestamp": Value(dtype="float32", id=None),
+            "index": Value(dtype="int64", id=None),
+        }
+    )
+
+    dataset = Dataset.from_dict(data_dict, features=schema)
+    dataset.set_transform(hf_transform_to_torch)
+    return dataset
+
+
+def from_raw_to_lerobot_format(
+    raw_dir: Path,
+    videos_dir: Path,
+    fps: int | None = None,
+    video: bool = True,
+    episodes: list[int] | None = None,
+):
+    """Convert a folder of `frame_*.png` images into the LeRobot dataset format.
+
+    Args:
+        raw_dir: Directory containing the `frame_*.png` images.
+        videos_dir: Currently unused, since video encoding is not supported for this format.
+        fps: Frame rate used to compute timestamps. Defaults to 30 when None.
+        video: Must be False for now. Note that the default (True) raises NotImplementedError.
+        episodes: Must be None for now.
+
+    Returns:
+        A tuple `(hf_dataset, episode_data_index, info)`, where `info` holds the "fps" and "video" values.
+
+    Raises:
+        NotImplementedError: If `video` is truthy or `episodes` is not None.
+        ValueError: If `raw_dir` contains no `frame_*.png` image.
+    """
+    if video or episodes is not None:
+        # TODO(aliberts): support this
+        raise NotImplementedError
+
+    # sanity check
+    check_format(raw_dir)
+
+    if fps is None:
+        fps = 30
+
+    # `load_from_raw` only accepts (raw_dir, fps, episodes); passing `videos_dir` and `video`
+    # as well would raise a TypeError.
+    data_dict = load_from_raw(raw_dir, fps, episodes)
+    hf_dataset = to_hf_dataset(data_dict, video)
+    episode_data_index = calculate_episode_data_index(hf_dataset)
+    info = {
+        "fps": fps,
+        "video": video,
+    }
+    return hf_dataset, episode_data_index, info
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -27,7 +27,11 @@ from datasets.features.features import register_feature


 def load_from_videos(
-    item: dict[str, torch.Tensor], video_frame_keys: list[str], videos_dir: Path, tolerance_s: float
+    item: dict[str, torch.Tensor],
+    video_frame_keys: list[str],
+    videos_dir: Path,
+    tolerance_s: float,
+    backend: str = "pyav",
 ):
    """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function
    in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a Segmentation Fault.
@@ -46,14 +50,14 @@ def load_from_videos(
                raise NotImplementedError("All video paths are expected to be the same for now.")
            video_path = data_dir / paths[0]

-            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s)
+            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
            item[key] = frames
        else:
            # load one frame
            timestamps = [item[key]["timestamp"]]
            video_path = data_dir / item[key]["path"]

-            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s)
+            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
            item[key] = frames[0]

    return item
@@ -63,11 +67,23 @@ def decode_video_frames_torchvision(
    video_path: str,
    timestamps: list[float],
    tolerance_s: float,
-    device: str = "cpu",
+    backend: str = "pyav",
    log_loaded_timestamps: bool = False,
 ):
    """Loads frames associated to the requested timestamps of a video

+    The backend can be either "pyav" (default) or "video_reader".
+    "video_reader" requires installing torchvision from source, see:
+    https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
+    (note that you need to compile against ffmpeg<4.3)
+
+    While both use cpu, "video_reader" is faster than "pyav" but requires additional setup.
+    See our benchmark results for more info on performance:
+    https://github.com/huggingface/lerobot/pull/220
+
+    See torchvision doc for more info on these two backends:
+    https://pytorch.org/vision/0.18/index.html?highlight=backend#torchvision.set_video_backend
+
    Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
    the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to
    that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame,
@@ -78,21 +94,9 @@ def decode_video_frames_torchvision(

    # set backend
    keyframes_only = False
-    if device == "cpu":
-        # explicitely use pyav
-        torchvision.set_video_backend("pyav")
+    torchvision.set_video_backend(backend)
+    if backend == "pyav":
        keyframes_only = True  # pyav doesnt support accuracte seek
-    elif device == "cuda":
-        # TODO(rcadene, aliberts): implement video decoding with GPU
-        # torchvision.set_video_backend("cuda")
-        # torchvision.set_video_backend("video_reader")
-        # requires installing torchvision from source, see: https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
-        # check possible bug: https://github.com/pytorch/vision/issues/7745
-        raise NotImplementedError(
-            "Video decoding on gpu with cuda is currently not supported. Use `device='cpu'`."
-        )
-    else:
-        raise ValueError(device)

    # set a video stream reader
    # TODO(rcadene): also load audio stream at the same time
@@ -120,7 +124,9 @@ def decode_video_frames_torchvision(
        if current_ts >= last_ts:
            break

-    reader.container.close()
+    if backend == "pyav":
+        reader.container.close()
+
    reader = None

    query_ts = torch.tensor(timestamps)
--- a/lerobot/configs/default.yaml
+++ b/lerobot/configs/default.yaml
@@ -28,6 +28,7 @@ seed: ???
 # "dataset_index" into the returned item. The index mapping is made according to the order in which the
 # datsets are provided.
 dataset_repo_id: lerobot/pusht
+video_backend: pyav

 training:
  offline_steps: ???
--- a/lerobot/scripts/push_dataset_to_hub.py
+++ b/lerobot/scripts/push_dataset_to_hub.py
@@ -55,7 +55,6 @@ from safetensors.torch import save_file

 from lerobot.common.datasets.compute_stats import compute_stats
 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
-from lerobot.common.datasets.push_dataset_to_hub._download_raw import download_raw
 from lerobot.common.datasets.utils import flatten_dict


@@ -70,6 +69,8 @@ def get_from_raw_to_lerobot_format_fn(raw_format: str):
        from lerobot.common.datasets.push_dataset_to_hub.dora_parquet_format import from_raw_to_lerobot_format
    elif raw_format == "xarm_pkl":
        from lerobot.common.datasets.push_dataset_to_hub.xarm_pkl_format import from_raw_to_lerobot_format
+    elif raw_format == "cam_png":
+        from lerobot.common.datasets.push_dataset_to_hub.cam_png_format import from_raw_to_lerobot_format
    else:
        raise ValueError(
            f"The selected {raw_format} can't be found. Did you add it to `lerobot/scripts/push_dataset_to_hub.py::get_from_raw_to_lerobot_format_fn`?"
@@ -182,10 +183,6 @@ def push_dataset_to_hub(
        meta_data_dir = Path(cache_dir) / "meta_data"
        videos_dir = Path(cache_dir) / "videos"

-    # Download the raw dataset if available
-    if not raw_dir.exists():
-        download_raw(raw_dir, dataset_id)
-
    if raw_format is None:
        # TODO(rcadene, adilzouitine): implement auto_find_raw_format
        raise NotImplementedError()