Files
lerobot/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
Simon Alibert 2abef3bef9 Enable video_reader backend (#220)
Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
2024-06-19 17:15:25 +02:00

410 lines
14 KiB
Python

#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Assess the performance of video decoding in various configurations.
This script will run different video decoding benchmarks where one parameter varies at a time.
These parameters and their values are specified in the BENCHMARKS dict.
All of these benchmarks are evaluated within different timestamps modes corresponding to different frame-loading scenarios:
- `1_frame`: 1 single frame is loaded.
- `2_frames`: 2 consecutive frames are loaded.
- `2_frames_4_space`: 2 frames separated by 4 frames are loaded.
- `6_frames`: 6 consecutive frames are loaded.
These values are more or less arbitrary and based on possible future usage.
These benchmarks are run on the first episode of each dataset specified in DATASET_REPO_IDS.
Note: These datasets need to be image datasets, not video datasets.
"""
import json
import random
import shutil
import subprocess
import time
from pathlib import Path
import einops
import numpy as np
import PIL
import torch
from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.video_utils import (
decode_video_frames_torchvision,
)
# Root directory of all benchmark outputs; main() creates one sub-directory per timestamps mode.
OUTPUT_DIR = Path("tmp/run_video_benchmark")
# Passed as `dry_run` to the studies: when True, benchmarks are not re-run and the
# previously saved info.json files are only loaded and displayed.
DRY_RUN = False
# Repo ids handed to LeRobotDataset. They must be image datasets (run_video_benchmark
# raises ValueError for video datasets).
DATASET_REPO_IDS = [
"lerobot/pusht_image",
"aliberts/aloha_mobile_shrimp_image",
"aliberts/paris_street",
"aliberts/kitchen",
]
# Frame-loading scenarios understood by run_video_benchmark (see the module docstring).
TIMESTAMPS_MODES = [
"1_frame",
"2_frames",
"2_frames_4_space",
"6_frames",
]
# Maps a config key to the list of values to try for it; main() runs one
# one_variable_study per entry. Commented-out entries are currently disabled studies.
BENCHMARKS = {
# "pix_fmt": ["yuv420p", "yuv444p"],
# "g": [1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None],
# "crf": [0, 5, 10, 15, 20, None, 25, 30, 40, 50],
"backend": ["pyav", "video_reader"],
}
def get_directory_size(directory):
    """Return the cumulative size in bytes of every regular file under `directory`.

    The tree is walked recursively; directories themselves contribute nothing.
    """
    return sum(entry.stat().st_size for entry in directory.rglob("*") if entry.is_file())
def run_video_benchmark(
    output_dir,
    cfg,
    timestamps_mode,
    seed=1337,
):
    """Encode the first episode of an image dataset into a video and benchmark decoding it.

    The episode frames are exported as PNG files (cached under `tmp/data/images/<repo_id>/`),
    encoded with ffmpeg/libx264 according to `cfg`, then 50 random timestamps are decoded and
    compared with the original PNG frames in terms of loading time and reconstruction quality.

    Args:
        output_dir: Directory receiving the video, sample frames and `info.json`.
            It is deleted first if it already exists.
        cfg: Benchmark configuration. Keys read here: "repo_id", "pix_fmt", "decoder",
            "decoder_kwgs", "backend" (required) and "g", "crf" (optional, skipped when
            missing or None).
        timestamps_mode: One of "1_frame", "2_frames", "2_frames_4_space", "6_frames".
        seed: Seed for the `random` module, so that the sampled timestamps are reproducible.

    Returns:
        The dict of benchmark results, which is also written to `output_dir / "info.json"`.

    Raises:
        ValueError: If the dataset is a video dataset, or if `cfg["decoder"]` or
            `timestamps_mode` is not supported.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    output_dir = Path(output_dir)
    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    repo_id = cfg["repo_id"]

    # TODO(rcadene): rewrite with hardcoding of original images and episodes
    dataset = LeRobotDataset(repo_id)
    if dataset.video:
        raise ValueError(
            f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}"
        )

    fps = dataset.fps

    # Only the first episode is used: its end index is the number of frames to export.
    ep_num_images = dataset.episode_data_index["to"][0].item()

    def frame_index(ts):
        # Timestamps are built as k / fps, so `ts * fps` can land just below the integer k
        # (e.g. (0.3 - 0.1) * 10 == 1.9999999999999998). Truncating with int() would then
        # select the previous frame; rounding recovers the intended index.
        return round(ts * fps)

    # Export the first episode as PNG files, unless a previous run already did it.
    imgs_dir = Path(f"tmp/data/images/{repo_id}/observation.image_episode_000000")
    if not imgs_dir.exists():
        imgs_dir.mkdir(parents=True, exist_ok=True)
        hf_dataset = dataset.hf_dataset.with_format(None)
        img_keys = [key for key in hf_dataset.features if key.startswith("observation.image")]
        imgs_dataset = hf_dataset.select_columns(img_keys[0])

        for i, item in enumerate(imgs_dataset):
            img = item[img_keys[0]]
            img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)

            if i >= ep_num_images - 1:
                break

    sum_original_frames_size_bytes = get_directory_size(imgs_dir)

    # Encode images into video
    video_path = output_dir / "episode_0.mp4"

    g = cfg.get("g")
    crf = cfg.get("crf")
    pix_fmt = cfg["pix_fmt"]

    # The command is an argument list (not a string split on spaces) so that paths
    # containing whitespace are passed to ffmpeg intact.
    cmd = [
        "ffmpeg",
        "-r",
        str(fps),
        "-f",
        "image2",
        "-loglevel",
        "error",
        "-i",
        str(imgs_dir / "frame_%06d.png"),
        "-vcodec",
        "libx264",
    ]
    if g is not None:
        # Group-of-pictures size: ensures at least 1 keyframe every `g` frames.
        cmd += ["-g", str(g)]
        # "-keyint_min 10" would set a minimum of 10 frames between 2 key frames
        # "-sc_threshold 0" would disable scene change detection to lower the number of key frames
    if crf is not None:
        cmd += ["-crf", str(crf)]
    cmd += ["-pix_fmt", str(pix_fmt), str(video_path)]

    subprocess.run(cmd, check=True)

    video_size_bytes = video_path.stat().st_size

    # Set decoder
    decoder = cfg["decoder"]
    decoder_kwgs = cfg["decoder_kwgs"]
    backend = cfg["backend"]

    if decoder == "torchvision":
        decode_frames_fn = decode_video_frames_torchvision
    else:
        raise ValueError(decoder)

    def load_original_frames(imgs_dir, timestamps) -> list[torch.Tensor]:
        """Load the PNG frames matching `timestamps` as float32 tensors in [0, 1], laid out c h w."""
        frames = []
        for ts in timestamps:
            # The context manager closes the underlying file once the pixels are copied.
            with PIL.Image.open(imgs_dir / f"frame_{frame_index(ts):06d}.png") as img:
                frame = torch.from_numpy(np.array(img))
            frame = frame.type(torch.float32) / 255
            frame = einops.rearrange(frame, "h w c -> c h w")
            frames.append(frame)
        return frames

    # Estimate average loading time
    list_avg_load_time = []
    list_avg_load_time_from_images = []
    per_pixel_l2_errors = []
    psnr_values = []
    ssim_values = []
    mse_values = []

    random.seed(seed)

    for t in range(50):
        # Sample the timestamp of the last frame to load, keeping `fps` frames of margin
        # from both ends of the episode, then derive the other timestamps from the mode.
        ts = random.randint(fps, ep_num_images - fps) / fps

        if timestamps_mode == "1_frame":
            timestamps = [ts]
        elif timestamps_mode == "2_frames":
            timestamps = [ts - 1 / fps, ts]
        elif timestamps_mode == "2_frames_4_space":
            timestamps = [ts - 5 / fps, ts]
        elif timestamps_mode == "6_frames":
            timestamps = [ts - i / fps for i in range(6)][::-1]
        else:
            raise ValueError(timestamps_mode)

        num_frames = len(timestamps)

        start_time_s = time.monotonic()
        frames = decode_frames_fn(
            video_path, timestamps=timestamps, tolerance_s=1e-4, backend=backend, **decoder_kwgs
        )
        avg_load_time = (time.monotonic() - start_time_s) / num_frames
        list_avg_load_time.append(avg_load_time)

        start_time_s = time.monotonic()
        original_frames = load_original_frames(imgs_dir, timestamps)
        avg_load_time_from_images = (time.monotonic() - start_time_s) / num_frames
        list_avg_load_time_from_images.append(avg_load_time_from_images)

        # Estimate reconstruction error between original frames and decoded frames with various metrics
        for i, frame_ts in enumerate(timestamps):
            # are_close = torch.allclose(frames[i], original_frames[i], atol=0.02)
            num_pixels = original_frames[i].numel()
            per_pixel_l2_error = torch.norm(frames[i] - original_frames[i], p=2).item() / num_pixels
            per_pixel_l2_errors.append(per_pixel_l2_error)

            frame_np, original_frame_np = frames[i].numpy(), original_frames[i].numpy()
            psnr_values.append(peak_signal_noise_ratio(original_frame_np, frame_np, data_range=1.0))
            ssim_values.append(
                structural_similarity(original_frame_np, frame_np, data_range=1.0, channel_axis=0)
            )
            mse_values.append(mean_squared_error(original_frame_np, frame_np))

            # For the first sample only, save the decoded frames next to their originals
            # so they can be inspected visually.
            if t == 0:
                frame_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy()
                PIL.Image.fromarray(frame_hwc).save(output_dir / f"frame_{i:06d}.png")

                idx = frame_index(frame_ts)
                with PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png") as original_frame:
                    original_frame.save(output_dir / f"original_frame_{i:06d}.png")

    # Last two dimensions of the first camera tensor of the first dataset item.
    image_size = tuple(dataset[0][dataset.camera_keys[0]].shape[-2:])
    avg_load_time = float(np.array(list_avg_load_time).mean())
    avg_load_time_from_images = float(np.array(list_avg_load_time_from_images).mean())
    avg_per_pixel_l2_error = float(np.array(per_pixel_l2_errors).mean())
    avg_psnr = float(np.mean(psnr_values))
    avg_ssim = float(np.mean(ssim_values))
    avg_mse = float(np.mean(mse_values))

    # Save benchmark info
    info = {
        "image_size": image_size,
        "sum_original_frames_size_bytes": sum_original_frames_size_bytes,
        "video_size_bytes": video_size_bytes,
        "avg_load_time_from_images": avg_load_time_from_images,
        "avg_load_time": avg_load_time,
        "compression_factor": sum_original_frames_size_bytes / video_size_bytes,
        "load_time_factor": avg_load_time_from_images / avg_load_time,
        "avg_per_pixel_l2_error": avg_per_pixel_l2_error,
        "avg_psnr": avg_psnr,
        "avg_ssim": avg_ssim,
        "avg_mse": avg_mse,
    }

    with open(output_dir / "info.json", "w") as f:
        json.dump(info, f)

    return info
def display_markdown_table(headers, rows):
    """Print `rows` as a markdown table with `headers` as its first line, then a blank line.

    Cell formatting:
    - None is shown as "None";
    - floats are shown with 3 decimals, or 7 decimals when 3 decimals would display as zero;
    - every other value is converted with `str()`.

    The `rows` argument is left untouched: formatting happens on a copy.

    Args:
        headers: Column titles.
        rows: List of rows, each row being a list of cell values.
    """

    def format_cell(col):
        if col is None:
            return "None"
        if isinstance(col, float):
            text = f"{col:.3f}"
            # Too small to show with 3 decimals (either sign): fall back to more precision.
            if text in ("0.000", "-0.000"):
                text = f"{col:.7f}"
            return text
        # Covers int, str and anything else (e.g. tuples) that str.join would reject.
        return str(col)

    formatted_rows = [[format_cell(col) for col in row] for row in rows]

    header_line = "| " + " | ".join(str(header) for header in headers) + " |"
    separator_line = "| " + " | ".join("---" for _ in headers) + " |"
    body_lines = ["| " + " | ".join(row) + " |" for row in formatted_rows]
    markdown_table = "\n".join([header_line, separator_line, *body_lines])
    print(markdown_table)
    print()
def load_info(out_dir):
    """Read and return the parsed content of the `info.json` file stored in `out_dir`."""
    info_path = out_dir / "info.json"
    return json.loads(info_path.read_text())
def one_variable_study(
    var_name: str, var_values: list, repo_ids: list, bench_dir: Path, timestamps_mode: str, dry_run: bool
):
    """Benchmark every value of one config variable on every dataset and print a markdown table.

    Each run uses the base config below with `var_name` overridden by one of `var_values`.
    Results are stored in (and read back from)
    `bench_dir / repo_id / f"torchvision_{var_name}_{val}"`.

    Args:
        var_name: Config key to vary (e.g. "backend").
        var_values: Values to try for `var_name`.
        repo_ids: Dataset repo ids to benchmark.
        bench_dir: Root directory of the runs for this timestamps mode.
        timestamps_mode: Frame-loading scenario forwarded to `run_video_benchmark`.
        dry_run: When True, skip the benchmarks and only display the previously saved results.
    """
    print(f"**`{var_name}`**")
    headers = [
        "repo_id",
        "image_size",
        var_name,
        "compression_factor",
        "load_time_factor",
        "avg_per_pixel_l2_error",
        "avg_psnr",
        "avg_ssim",
        "avg_mse",
    ]
    rows = []
    base_cfg = {
        "repo_id": None,
        # video encoding
        "g": 2,
        "crf": None,
        "pix_fmt": "yuv444p",
        # video decoding
        "backend": "pyav",
        "decoder": "torchvision",
        "decoder_kwgs": {},
    }
    for repo_id in repo_ids:
        for val in var_values:
            cfg = base_cfg.copy()
            cfg["repo_id"] = repo_id
            cfg[var_name] = val

            run_dir = bench_dir / repo_id / f"torchvision_{var_name}_{val}"
            if not dry_run:
                run_video_benchmark(run_dir, cfg, timestamps_mode)

            info = load_info(run_dir)
            # "image_size" is saved as the last two dimensions of the image tensor, i.e.
            # (rows, columns) = (height, width) -- assuming channel-first tensors, which is
            # the layout the benchmark compares frames in. TODO confirm against the dataset.
            height, width = info["image_size"][0], info["image_size"][1]
            rows.append(
                [
                    repo_id,
                    f"{width} x {height}",
                    val,
                    info["compression_factor"],
                    info["load_time_factor"],
                    info["avg_per_pixel_l2_error"],
                    info["avg_psnr"],
                    info["avg_ssim"],
                    info["avg_mse"],
                ]
            )
    display_markdown_table(headers, rows)
def best_study(repo_ids: list, bench_dir: Path, timestamps_mode: str, dry_run: bool):
    """Benchmark the "best" config on every dataset and print a markdown table.

    Change the config once you have decided what's best based on the one-variable studies.
    Results are stored in (and read back from) `bench_dir / repo_id / "torchvision_best"`.

    Args:
        repo_ids: Dataset repo ids to benchmark.
        bench_dir: Root directory of the runs for this timestamps mode.
        timestamps_mode: Frame-loading scenario forwarded to `run_video_benchmark`.
        dry_run: When True, skip the benchmarks and only display the previously saved results.
    """
    print("**best**")
    headers = [
        "repo_id",
        "image_size",
        "compression_factor",
        "load_time_factor",
        "avg_per_pixel_l2_error",
        "avg_psnr",
        "avg_ssim",
        "avg_mse",
    ]
    rows = []
    for repo_id in repo_ids:
        cfg = {
            "repo_id": repo_id,
            # video encoding
            "g": 2,
            "crf": None,
            "pix_fmt": "yuv444p",
            # video decoding
            "backend": "video_reader",
            "decoder": "torchvision",
            "decoder_kwgs": {},
        }
        run_dir = bench_dir / repo_id / "torchvision_best"
        if not dry_run:
            run_video_benchmark(run_dir, cfg, timestamps_mode)

        info = load_info(run_dir)
        # "image_size" is saved as the last two dimensions of the image tensor, i.e.
        # (rows, columns) = (height, width) -- assuming channel-first tensors, which is
        # the layout the benchmark compares frames in. TODO confirm against the dataset.
        height, width = info["image_size"][0], info["image_size"][1]
        # One cell per header: the three quality metrics must be present as well,
        # otherwise the table body has fewer columns than its header.
        rows.append(
            [
                repo_id,
                f"{width} x {height}",
                info["compression_factor"],
                info["load_time_factor"],
                info["avg_per_pixel_l2_error"],
                info["avg_psnr"],
                info["avg_ssim"],
                info["avg_mse"],
            ]
        )
    display_markdown_table(headers, rows)
def main():
    """Run one study per BENCHMARKS entry, for each timestamps mode in TIMESTAMPS_MODES."""
    for mode in TIMESTAMPS_MODES:
        mode_dir = OUTPUT_DIR / mode
        # Markdown heading for this mode, followed by an empty line.
        print(f"### `{mode}`", end="\n\n")

        for var_name, var_values in BENCHMARKS.items():
            one_variable_study(var_name, var_values, DATASET_REPO_IDS, mode_dir, mode, DRY_RUN)

        # best_study(DATASET_REPO_IDS, bench_dir, timestamps_mode, DRY_RUN)


if __name__ == "__main__":
    main()