Improve video benchmark (#282)

Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
Co-authored-by: Remi <re.cadene@gmail.com>
This commit is contained in:
Simon Alibert
2024-07-09 20:20:25 +02:00
committed by GitHub
parent cc2f6e7404
commit e410e5d711
11 changed files with 985 additions and 772 deletions

View File

@@ -16,6 +16,7 @@
import logging
import subprocess
import warnings
from collections import OrderedDict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, ClassVar
@@ -69,7 +70,7 @@ def decode_video_frames_torchvision(
tolerance_s: float,
backend: str = "pyav",
log_loaded_timestamps: bool = False,
):
) -> torch.Tensor:
"""Loads frames associated to the requested timestamps of a video
The backend can be either "pyav" (default) or "video_reader".
@@ -77,9 +78,8 @@ def decode_video_frames_torchvision(
https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
(note that you need to compile against ffmpeg<4.3)
While both use cpu, "video_reader" is faster than "pyav" but requires additional setup.
See our benchmark results for more info on performance:
https://github.com/huggingface/lerobot/pull/220
While both use cpu, "video_reader" is supposedly faster than "pyav" but requires additional setup.
For more info on video decoding, see `benchmark/video/README.md`
See torchvision doc for more info on these two backends:
https://pytorch.org/vision/0.18/index.html?highlight=backend#torchvision.set_video_backend
@@ -142,6 +142,10 @@ def decode_video_frames_torchvision(
"It means that the closest frame that can be loaded from the video is too far away in time."
"This might be due to synchronization issues with timestamps during data collection."
"To be safe, we advise to ignore this item during training."
f"\nqueried timestamps: {query_ts}"
f"\nloaded timestamps: {loaded_ts}"
f"\nvideo: {video_path}"
f"\nbackend: {backend}"
)
# get closest frames to the query timestamps
@@ -158,22 +162,52 @@ def decode_video_frames_torchvision(
return closest_frames
def encode_video_frames(imgs_dir: Path, video_path: Path, fps: int):
"""More info on ffmpeg arguments tuning on `lerobot/common/datasets/_video_benchmark/README.md`"""
def encode_video_frames(
imgs_dir: Path,
video_path: Path,
fps: int,
video_codec: str = "libsvtav1",
pixel_format: str = "yuv420p",
group_of_pictures_size: int | None = 2,
constant_rate_factor: int | None = 30,
fast_decode: int = 0,
log_level: str | None = "error",
overwrite: bool = False,
) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
video_path = Path(video_path)
video_path.parent.mkdir(parents=True, exist_ok=True)
ffmpeg_cmd = (
f"ffmpeg -r {fps} "
"-f image2 "
"-loglevel error "
f"-i {str(imgs_dir / 'frame_%06d.png')} "
"-vcodec libx264 "
"-g 2 "
"-pix_fmt yuv444p "
f"{str(video_path)}"
ffmpeg_args = OrderedDict(
[
("-f", "image2"),
("-r", str(fps)),
("-i", str(imgs_dir / "frame_%06d.png")),
("-vcodec", video_codec),
("-pix_fmt", pixel_format),
]
)
subprocess.run(ffmpeg_cmd.split(" "), check=True)
if group_of_pictures_size is not None:
ffmpeg_args["-g"] = str(group_of_pictures_size)
if constant_rate_factor is not None:
ffmpeg_args["-crf"] = str(constant_rate_factor)
if fast_decode:
key = "-svtav1-params" if video_codec == "libsvtav1" else "-tune"
value = f"fast-decode={fast_decode}" if video_codec == "libsvtav1" else "fastdecode"
ffmpeg_args[key] = value
if log_level is not None:
ffmpeg_args["-loglevel"] = str(log_level)
ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
if overwrite:
ffmpeg_args.append("-y")
ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)]
subprocess.run(ffmpeg_cmd, check=True)
@dataclass