Enable video_reader backend (#220)

Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
2024-06-19 17:15:25 +02:00
parent 48951662f2
commit 2abef3bef9
11 changed files with 464 additions and 220 deletions
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -27,7 +27,11 @@ from datasets.features.features import register_feature


 def load_from_videos(
-    item: dict[str, torch.Tensor], video_frame_keys: list[str], videos_dir: Path, tolerance_s: float
+    item: dict[str, torch.Tensor],
+    video_frame_keys: list[str],
+    videos_dir: Path,
+    tolerance_s: float,
+    backend: str = "pyav",
 ):
    """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function
    in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a Segmentation Fault.
@@ -46,14 +50,14 @@ def load_from_videos(
                raise NotImplementedError("All video paths are expected to be the same for now.")
            video_path = data_dir / paths[0]

-            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s)
+            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
            item[key] = frames
        else:
            # load one frame
            timestamps = [item[key]["timestamp"]]
            video_path = data_dir / item[key]["path"]

-            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s)
+            frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
            item[key] = frames[0]

    return item
@@ -63,11 +67,23 @@ def decode_video_frames_torchvision(
    video_path: str,
    timestamps: list[float],
    tolerance_s: float,
-    device: str = "cpu",
+    backend: str = "pyav",
    log_loaded_timestamps: bool = False,
 ):
    """Loads frames associated to the requested timestamps of a video

+    The backend can be either "pyav" (default) or "video_reader".
+    "video_reader" requires installing torchvision from source, see:
+    https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
+    (note that you need to compile against ffmpeg<4.3)
+
+    While both use cpu, "video_reader" is faster than "pyav" but requires additional setup.
+    See our benchmark results for more info on performance:
+    https://github.com/huggingface/lerobot/pull/220
+
+    See torchvision doc for more info on these two backends:
+    https://pytorch.org/vision/0.18/index.html?highlight=backend#torchvision.set_video_backend
+
    Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
    the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to
    that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame,
@@ -78,21 +94,9 @@ def decode_video_frames_torchvision(

    # set backend
    keyframes_only = False
-    if device == "cpu":
-        # explicitely use pyav
-        torchvision.set_video_backend("pyav")
+    torchvision.set_video_backend(backend)
+    if backend == "pyav":
        keyframes_only = True  # pyav doesnt support accuracte seek
-    elif device == "cuda":
-        # TODO(rcadene, aliberts): implement video decoding with GPU
-        # torchvision.set_video_backend("cuda")
-        # torchvision.set_video_backend("video_reader")
-        # requires installing torchvision from source, see: https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
-        # check possible bug: https://github.com/pytorch/vision/issues/7745
-        raise NotImplementedError(
-            "Video decoding on gpu with cuda is currently not supported. Use `device='cpu'`."
-        )
-    else:
-        raise ValueError(device)

    # set a video stream reader
    # TODO(rcadene): also load audio stream at the same time
@@ -120,7 +124,9 @@ def decode_video_frames_torchvision(
        if current_ts >= last_ts:
            break

-    reader.container.close()
+    if backend == "pyav":
+        reader.container.close()
+
    reader = None

    query_ts = torch.tensor(timestamps)