feat(encoding): switching to PyAV for ffmpeg related tasks (#983)

2025-04-29 17:39:35 +02:00
parent 674e784aa9
commit 6d723c45a9
3 changed files with 115 additions and 94 deletions
--- a/benchmarks/video/run_video_benchmark.py
+++ b/benchmarks/video/run_video_benchmark.py
@@ -416,7 +416,7 @@ if __name__ == "__main__":
        "--vcodec",
        type=str,
        nargs="*",
-        default=["libx264", "libx265", "libsvtav1"],
+        default=["libx264", "hevc", "libsvtav1"],
        help="Video codecs to be tested",
    )
    parser.add_argument(
@@ -446,7 +446,7 @@ if __name__ == "__main__":
    #     nargs="*",
    #     default=[0, 1],
    #     help="Use the fastdecode tuning option. 0 disables it. "
-    #         "For libx264 and libx265, only 1 is possible. "
+    #         "For libx264 and libx265/hevc, only 1 is possible. "
    #         "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
    # )
    parser.add_argument(
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -13,16 +13,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import glob
 import importlib
 import json
 import logging
 import subprocess
 import warnings
 from collections import OrderedDict
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, ClassVar
 import av
 import pyarrow as pa
 import torch
 import torchvision
@@ -252,51 +251,83 @@ def encode_video_frames(
    g: int | None = 2,
    crf: int | None = 30,
    fast_decode: int = 0,
-    log_level: str | None = "error",
+    log_level: int | None = av.logging.ERROR,
    overwrite: bool = False,
 ) -> None:
    """More info on ffmpeg arguments tuning on `benchmarks/video/README.md`"""
    # Check encoder availability
    if vcodec not in ["h264", "hevc", "libsvtav1"]:
        raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")
    video_path = Path(video_path)
    imgs_dir = Path(imgs_dir)
    video_path.parent.mkdir(parents=True, exist_ok=True)
-    ffmpeg_args = OrderedDict(
+    video_path.parent.mkdir(parents=True, exist_ok=overwrite)
-        [
+
-            ("-f", "image2"),
+    # Encoders/pixel formats incompatibility check
-            ("-r", str(fps)),
+    if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
-            ("-i", str(imgs_dir / "frame_%06d.png")),
+        logging.warning(
-            ("-vcodec", vcodec),
+            f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
-            ("-pix_fmt", pix_fmt),
+        )
-        ]
+        pix_fmt = "yuv420p"
    # Get input frames
    template = "frame_" + ("[0-9]" * 6) + ".png"
    input_list = sorted(
        glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0])
    )
    # Define video output frame size (assuming all input frames are the same size)
    if len(input_list) == 0:
        raise FileNotFoundError(f"No images found in {imgs_dir}.")
    dummy_image = Image.open(input_list[0])
    width, height = dummy_image.size
    # Define video codec options
    video_options = {}
    if g is not None:
-        ffmpeg_args["-g"] = str(g)
+        video_options["g"] = str(g)
    if crf is not None:
-        ffmpeg_args["-crf"] = str(crf)
+        video_options["crf"] = str(crf)
    if fast_decode:
-        key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune"
+        key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
        value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
-        ffmpeg_args[key] = value
+        video_options[key] = value
    # Set logging level
    if log_level is not None:
-        ffmpeg_args["-loglevel"] = str(log_level)
+        # "While less efficient, it is generally preferable to modify logging with Python’s logging"
        logging.getLogger("libav").setLevel(log_level)
-    ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
+    # Create and open output file (overwrite by default)
-    if overwrite:
+    with av.open(str(video_path), "w") as output:
-        ffmpeg_args.append("-y")
+        output_stream = output.add_stream(vcodec, fps, options=video_options)
        output_stream.pix_fmt = pix_fmt
        output_stream.width = width
        output_stream.height = height
-    ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)]
+        # Loop through input frames and encode them
-    # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
+        for input_data in input_list:
-    subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
+            input_image = Image.open(input_data).convert("RGB")
            input_frame = av.VideoFrame.from_image(input_image)
            packet = output_stream.encode(input_frame)
            if packet:
                output.mux(packet)
        # Flush the encoder
        packet = output_stream.encode()
        if packet:
            output.mux(packet)
    # Reset logging level
    if log_level is not None:
        av.logging.restore_default_callback()
    if not video_path.exists():
-        raise OSError(
+        raise OSError(f"Video encoding did not work. File not found: {video_path}.")
            f"Video encoding did not work. File not found: {video_path}. "
            f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`"
        )
@dataclass
@@ -332,78 +363,68 @@ with warnings.catch_warnings():
 def get_audio_info(video_path: Path | str) -> dict:
-    ffprobe_audio_cmd = [
+    # Set logging level
-        "ffprobe",
+    logging.getLogger("libav").setLevel(av.logging.ERROR)
        "-v",
        "error",
        "-select_streams",
        "a:0",
        "-show_entries",
        "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
        "-of",
        "json",
        str(video_path),
    ]
    result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Error running ffprobe: {result.stderr}")
-    info = json.loads(result.stdout)
+    # Getting audio stream information
-    audio_stream_info = info["streams"][0] if info.get("streams") else None
+    audio_info = {}
-    if audio_stream_info is None:
+    with av.open(str(video_path), "r") as audio_file:
-        return {"has_audio": False}
+        try:
            audio_stream = audio_file.streams.audio[0]
        except IndexError:
            # Reset logging level
            av.logging.restore_default_callback()
            return {"has_audio": False}
-    # Return the information, defaulting to None if no audio stream is present
+        audio_info["audio.channels"] = audio_stream.channels
-    return {
+        audio_info["audio.codec"] = audio_stream.codec.canonical_name
-        "has_audio": True,
+        # In an ideal lossless case: bit depth x sample rate x channels = bit rate.
-        "audio.channels": audio_stream_info.get("channels", None),
+        # In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
-        "audio.codec": audio_stream_info.get("codec_name", None),
+        audio_info["audio.bit_rate"] = audio_stream.bit_rate
-        "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None,
+        audio_info["audio.sample_rate"] = audio_stream.sample_rate  # Number of samples per second
-        "audio.sample_rate": int(audio_stream_info["sample_rate"])
+        # In an ideal lossless case: fixed number of bits per sample.
-        if audio_stream_info.get("sample_rate")
+        # In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
-        else None,
+        audio_info["audio.bit_depth"] = audio_stream.format.bits
-        "audio.bit_depth": audio_stream_info.get("bit_depth", None),
+        audio_info["audio.channel_layout"] = audio_stream.layout.name
-        "audio.channel_layout": audio_stream_info.get("channel_layout", None),
+        audio_info["has_audio"] = True
-    }
+
    # Reset logging level
    av.logging.restore_default_callback()
    return audio_info
 def get_video_info(video_path: Path | str) -> dict:
-    ffprobe_video_cmd = [
+    # Set logging level
-        "ffprobe",
+    logging.getLogger("libav").setLevel(av.logging.ERROR)
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream=r_frame_rate,width,height,codec_name,nb_frames,duration,pix_fmt",
        "-of",
        "json",
        str(video_path),
    ]
    result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Error running ffprobe: {result.stderr}")
-    info = json.loads(result.stdout)
+    # Getting video stream information
-    video_stream_info = info["streams"][0]
+    video_info = {}
    with av.open(str(video_path), "r") as video_file:
        try:
            video_stream = video_file.streams.video[0]
        except IndexError:
            # Reset logging level
            av.logging.restore_default_callback()
            return {}
-    # Calculate fps from r_frame_rate
+        video_info["video.height"] = video_stream.height
-    r_frame_rate = video_stream_info["r_frame_rate"]
+        video_info["video.width"] = video_stream.width
-    num, denom = map(int, r_frame_rate.split("/"))
+        video_info["video.codec"] = video_stream.codec.canonical_name
-    fps = num / denom
+        video_info["video.pix_fmt"] = video_stream.pix_fmt
        video_info["video.is_depth_map"] = False
-    pixel_channels = get_video_pixel_channels(video_stream_info["pix_fmt"])
+        # Calculate fps from the stream's base_rate (replaces the ffprobe r_frame_rate parsing)
        video_info["video.fps"] = int(video_stream.base_rate)
-    video_info = {
+        pixel_channels = get_video_pixel_channels(video_stream.pix_fmt)
-        "video.fps": fps,
+        video_info["video.channels"] = pixel_channels
-        "video.height": video_stream_info["height"],
+
-        "video.width": video_stream_info["width"],
+    # Reset logging level
-        "video.channels": pixel_channels,
+    av.logging.restore_default_callback()
-        "video.codec": video_stream_info["codec_name"],
+
-        "video.pix_fmt": video_stream_info["pix_fmt"],
+    # Adding audio stream information
-        "video.is_depth_map": False,
+    video_info.update(**get_audio_info(video_path))
        **get_audio_info(video_path),
    }
    return video_info
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,7 @@ dependencies = [
    "omegaconf>=2.3.0",
    "opencv-python-headless>=4.9.0",
    "packaging>=24.2",
-    "av>=12.0.5",
+    "av>=14.2.0",
    "pymunk>=6.6.0",
    "pynput>=1.7.7",
    "pyzmq>=26.2.1",