Enable video_reader backend (#220)
Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Capture video feed from a camera as raw images."""
|
||||
|
||||
import argparse
|
||||
import datetime as dt
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
|
||||
|
||||
def display_and_save_video_stream(output_dir: Path, fps: int, width: int, height: int):
|
||||
now = dt.datetime.now()
|
||||
capture_dir = output_dir / f"{now:%Y-%m-%d}" / f"{now:%H-%M-%S}"
|
||||
if not capture_dir.exists():
|
||||
capture_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Opens the default webcam
|
||||
cap = cv2.VideoCapture(0)
|
||||
if not cap.isOpened():
|
||||
print("Error: Could not open video stream.")
|
||||
return
|
||||
|
||||
cap.set(cv2.CAP_PROP_FPS, fps)
|
||||
cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
|
||||
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
|
||||
|
||||
frame_index = 0
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
|
||||
if not ret:
|
||||
print("Error: Could not read frame.")
|
||||
break
|
||||
|
||||
cv2.imshow("Video Stream", frame)
|
||||
cv2.imwrite(str(capture_dir / f"frame_{frame_index:06d}.png"), frame)
|
||||
frame_index += 1
|
||||
|
||||
# Break the loop on 'q' key press
|
||||
if cv2.waitKey(1) & 0xFF == ord("q"):
|
||||
break
|
||||
|
||||
# Release the capture and destroy all windows
|
||||
cap.release()
|
||||
cv2.destroyAllWindows()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=Path("outputs/cam_capture/"),
|
||||
help="Directory where the capture images are written. A subfolder named with the current date & time will be created inside it for each capture.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fps",
|
||||
type=int,
|
||||
default=30,
|
||||
help="Frames Per Second of the capture.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--width",
|
||||
type=int,
|
||||
default=1280,
|
||||
help="Width of the captured images.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--height",
|
||||
type=int,
|
||||
default=720,
|
||||
help="Height of the captured images.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
display_and_save_video_stream(**vars(args))
|
||||
@@ -13,6 +13,23 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Assess the performance of video decoding in various configurations.
|
||||
|
||||
This script will run different video decoding benchmarks where one parameter varies at a time.
|
||||
These parameters and theirs values are specified in the BENCHMARKS dict.
|
||||
|
||||
All of these benchmarks are evaluated within different timestamps modes corresponding to different frame-loading scenarios:
|
||||
- `1_frame`: 1 single frame is loaded.
|
||||
- `2_frames`: 2 consecutive frames are loaded.
|
||||
- `2_frames_4_space`: 2 frames separated by 4 frames are loaded.
|
||||
- `6_frames`: 6 consecutive frames are loaded.
|
||||
|
||||
These values are more or less arbitrary and based on possible future usage.
|
||||
|
||||
These benchmarks are run on the first episode of each dataset specified in DATASET_REPO_IDS.
|
||||
Note: These datasets need to be image datasets, not video datasets.
|
||||
"""
|
||||
|
||||
import json
|
||||
import random
|
||||
import shutil
|
||||
@@ -21,15 +38,38 @@ import time
|
||||
from pathlib import Path
|
||||
|
||||
import einops
|
||||
import numpy
|
||||
import numpy as np
|
||||
import PIL
|
||||
import torch
|
||||
from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
|
||||
|
||||
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.common.datasets.video_utils import (
|
||||
decode_video_frames_torchvision,
|
||||
)
|
||||
|
||||
OUTPUT_DIR = Path("tmp/run_video_benchmark")
|
||||
DRY_RUN = False
|
||||
|
||||
DATASET_REPO_IDS = [
|
||||
"lerobot/pusht_image",
|
||||
"aliberts/aloha_mobile_shrimp_image",
|
||||
"aliberts/paris_street",
|
||||
"aliberts/kitchen",
|
||||
]
|
||||
TIMESTAMPS_MODES = [
|
||||
"1_frame",
|
||||
"2_frames",
|
||||
"2_frames_4_space",
|
||||
"6_frames",
|
||||
]
|
||||
BENCHMARKS = {
|
||||
# "pix_fmt": ["yuv420p", "yuv444p"],
|
||||
# "g": [1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None],
|
||||
# "crf": [0, 5, 10, 15, 20, None, 25, 30, 40, 50],
|
||||
"backend": ["pyav", "video_reader"],
|
||||
}
|
||||
|
||||
|
||||
def get_directory_size(directory):
|
||||
total_size = 0
|
||||
@@ -56,6 +96,10 @@ def run_video_benchmark(
|
||||
|
||||
# TODO(rcadene): rewrite with hardcoding of original images and episodes
|
||||
dataset = LeRobotDataset(repo_id)
|
||||
if dataset.video:
|
||||
raise ValueError(
|
||||
f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}"
|
||||
)
|
||||
|
||||
# Get fps
|
||||
fps = dataset.fps
|
||||
@@ -68,10 +112,11 @@ def run_video_benchmark(
|
||||
if not imgs_dir.exists():
|
||||
imgs_dir.mkdir(parents=True, exist_ok=True)
|
||||
hf_dataset = dataset.hf_dataset.with_format(None)
|
||||
imgs_dataset = hf_dataset.select_columns("observation.image")
|
||||
img_keys = [key for key in hf_dataset.features if key.startswith("observation.image")]
|
||||
imgs_dataset = hf_dataset.select_columns(img_keys[0])
|
||||
|
||||
for i, item in enumerate(imgs_dataset):
|
||||
img = item["observation.image"]
|
||||
img = item[img_keys[0]]
|
||||
img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)
|
||||
|
||||
if i >= ep_num_images - 1:
|
||||
@@ -107,7 +152,7 @@ def run_video_benchmark(
|
||||
|
||||
decoder = cfg["decoder"]
|
||||
decoder_kwgs = cfg["decoder_kwgs"]
|
||||
device = cfg["device"]
|
||||
backend = cfg["backend"]
|
||||
|
||||
if decoder == "torchvision":
|
||||
decode_frames_fn = decode_video_frames_torchvision
|
||||
@@ -116,12 +161,12 @@ def run_video_benchmark(
|
||||
|
||||
# Estimate average loading time
|
||||
|
||||
def load_original_frames(imgs_dir, timestamps):
|
||||
def load_original_frames(imgs_dir, timestamps) -> torch.Tensor:
|
||||
frames = []
|
||||
for ts in timestamps:
|
||||
idx = int(ts * fps)
|
||||
frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
|
||||
frame = torch.from_numpy(numpy.array(frame))
|
||||
frame = torch.from_numpy(np.array(frame))
|
||||
frame = frame.type(torch.float32) / 255
|
||||
frame = einops.rearrange(frame, "h w c -> c h w")
|
||||
frames.append(frame)
|
||||
@@ -130,6 +175,9 @@ def run_video_benchmark(
|
||||
list_avg_load_time = []
|
||||
list_avg_load_time_from_images = []
|
||||
per_pixel_l2_errors = []
|
||||
psnr_values = []
|
||||
ssim_values = []
|
||||
mse_values = []
|
||||
|
||||
random.seed(seed)
|
||||
|
||||
@@ -142,7 +190,7 @@ def run_video_benchmark(
|
||||
elif timestamps_mode == "2_frames":
|
||||
timestamps = [ts - 1 / fps, ts]
|
||||
elif timestamps_mode == "2_frames_4_space":
|
||||
timestamps = [ts - 4 / fps, ts]
|
||||
timestamps = [ts - 5 / fps, ts]
|
||||
elif timestamps_mode == "6_frames":
|
||||
timestamps = [ts - i / fps for i in range(6)][::-1]
|
||||
else:
|
||||
@@ -152,7 +200,7 @@ def run_video_benchmark(
|
||||
|
||||
start_time_s = time.monotonic()
|
||||
frames = decode_frames_fn(
|
||||
video_path, timestamps=timestamps, tolerance_s=1e-4, device=device, **decoder_kwgs
|
||||
video_path, timestamps=timestamps, tolerance_s=1e-4, backend=backend, **decoder_kwgs
|
||||
)
|
||||
avg_load_time = (time.monotonic() - start_time_s) / num_frames
|
||||
list_avg_load_time.append(avg_load_time)
|
||||
@@ -162,11 +210,19 @@ def run_video_benchmark(
|
||||
avg_load_time_from_images = (time.monotonic() - start_time_s) / num_frames
|
||||
list_avg_load_time_from_images.append(avg_load_time_from_images)
|
||||
|
||||
# Estimate average L2 error between original frames and decoded frames
|
||||
# Estimate reconstruction error between original frames and decoded frames with various metrics
|
||||
for i, ts in enumerate(timestamps):
|
||||
# are_close = torch.allclose(frames[i], original_frames[i], atol=0.02)
|
||||
num_pixels = original_frames[i].numel()
|
||||
per_pixel_l2_error = torch.norm(frames[i] - original_frames[i], p=2).item() / num_pixels
|
||||
per_pixel_l2_errors.append(per_pixel_l2_error)
|
||||
|
||||
frame_np, original_frame_np = frames[i].numpy(), original_frames[i].numpy()
|
||||
psnr_values.append(peak_signal_noise_ratio(original_frame_np, frame_np, data_range=1.0))
|
||||
ssim_values.append(
|
||||
structural_similarity(original_frame_np, frame_np, data_range=1.0, channel_axis=0)
|
||||
)
|
||||
mse_values.append(mean_squared_error(original_frame_np, frame_np))
|
||||
|
||||
# save decoded frames
|
||||
if t == 0:
|
||||
@@ -179,15 +235,18 @@ def run_video_benchmark(
|
||||
original_frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
|
||||
original_frame.save(output_dir / f"original_frame_{i:06d}.png")
|
||||
|
||||
per_pixel_l2_errors.append(per_pixel_l2_error)
|
||||
|
||||
avg_load_time = float(numpy.array(list_avg_load_time).mean())
|
||||
avg_load_time_from_images = float(numpy.array(list_avg_load_time_from_images).mean())
|
||||
avg_per_pixel_l2_error = float(numpy.array(per_pixel_l2_errors).mean())
|
||||
image_size = tuple(dataset[0][dataset.camera_keys[0]].shape[-2:])
|
||||
avg_load_time = float(np.array(list_avg_load_time).mean())
|
||||
avg_load_time_from_images = float(np.array(list_avg_load_time_from_images).mean())
|
||||
avg_per_pixel_l2_error = float(np.array(per_pixel_l2_errors).mean())
|
||||
avg_psnr = float(np.mean(psnr_values))
|
||||
avg_ssim = float(np.mean(ssim_values))
|
||||
avg_mse = float(np.mean(mse_values))
|
||||
|
||||
# Save benchmark info
|
||||
|
||||
info = {
|
||||
"image_size": image_size,
|
||||
"sum_original_frames_size_bytes": sum_original_frames_size_bytes,
|
||||
"video_size_bytes": video_size_bytes,
|
||||
"avg_load_time_from_images": avg_load_time_from_images,
|
||||
@@ -195,6 +254,9 @@ def run_video_benchmark(
|
||||
"compression_factor": sum_original_frames_size_bytes / video_size_bytes,
|
||||
"load_time_factor": avg_load_time_from_images / avg_load_time,
|
||||
"avg_per_pixel_l2_error": avg_per_pixel_l2_error,
|
||||
"avg_psnr": avg_psnr,
|
||||
"avg_ssim": avg_ssim,
|
||||
"avg_mse": avg_mse,
|
||||
}
|
||||
|
||||
with open(output_dir / "info.json", "w") as f:
|
||||
@@ -234,138 +296,113 @@ def load_info(out_dir):
|
||||
return info
|
||||
|
||||
|
||||
def main():
|
||||
out_dir = Path("tmp/run_video_benchmark")
|
||||
dry_run = False
|
||||
repo_ids = ["lerobot/pusht", "lerobot/umi_cup_in_the_wild"]
|
||||
timestamps_modes = [
|
||||
"1_frame",
|
||||
"2_frames",
|
||||
"2_frames_4_space",
|
||||
"6_frames",
|
||||
def one_variable_study(
|
||||
var_name: str, var_values: list, repo_ids: list, bench_dir: Path, timestamps_mode: str, dry_run: bool
|
||||
):
|
||||
print(f"**`{var_name}`**")
|
||||
headers = [
|
||||
"repo_id",
|
||||
"image_size",
|
||||
var_name,
|
||||
"compression_factor",
|
||||
"load_time_factor",
|
||||
"avg_per_pixel_l2_error",
|
||||
"avg_psnr",
|
||||
"avg_ssim",
|
||||
"avg_mse",
|
||||
]
|
||||
for timestamps_mode in timestamps_modes:
|
||||
bench_dir = out_dir / timestamps_mode
|
||||
rows = []
|
||||
base_cfg = {
|
||||
"repo_id": None,
|
||||
# video encoding
|
||||
"g": 2,
|
||||
"crf": None,
|
||||
"pix_fmt": "yuv444p",
|
||||
# video decoding
|
||||
"backend": "pyav",
|
||||
"decoder": "torchvision",
|
||||
"decoder_kwgs": {},
|
||||
}
|
||||
for repo_id in repo_ids:
|
||||
for val in var_values:
|
||||
cfg = base_cfg.copy()
|
||||
cfg["repo_id"] = repo_id
|
||||
cfg[var_name] = val
|
||||
if not dry_run:
|
||||
run_video_benchmark(
|
||||
bench_dir / repo_id / f"torchvision_{var_name}_{val}", cfg, timestamps_mode
|
||||
)
|
||||
info = load_info(bench_dir / repo_id / f"torchvision_{var_name}_{val}")
|
||||
width, height = info["image_size"][0], info["image_size"][1]
|
||||
rows.append(
|
||||
[
|
||||
repo_id,
|
||||
f"{width} x {height}",
|
||||
val,
|
||||
info["compression_factor"],
|
||||
info["load_time_factor"],
|
||||
info["avg_per_pixel_l2_error"],
|
||||
info["avg_psnr"],
|
||||
info["avg_ssim"],
|
||||
info["avg_mse"],
|
||||
]
|
||||
)
|
||||
display_markdown_table(headers, rows)
|
||||
|
||||
|
||||
def best_study(repo_ids: list, bench_dir: Path, timestamps_mode: str, dry_run: bool):
|
||||
"""Change the config once you deciced what's best based on one-variable-studies"""
|
||||
print("**best**")
|
||||
headers = [
|
||||
"repo_id",
|
||||
"image_size",
|
||||
"compression_factor",
|
||||
"load_time_factor",
|
||||
"avg_per_pixel_l2_error",
|
||||
"avg_psnr",
|
||||
"avg_ssim",
|
||||
"avg_mse",
|
||||
]
|
||||
rows = []
|
||||
for repo_id in repo_ids:
|
||||
cfg = {
|
||||
"repo_id": repo_id,
|
||||
# video encoding
|
||||
"g": 2,
|
||||
"crf": None,
|
||||
"pix_fmt": "yuv444p",
|
||||
# video decoding
|
||||
"backend": "video_reader",
|
||||
"decoder": "torchvision",
|
||||
"decoder_kwgs": {},
|
||||
}
|
||||
if not dry_run:
|
||||
run_video_benchmark(bench_dir / repo_id / "torchvision_best", cfg, timestamps_mode)
|
||||
info = load_info(bench_dir / repo_id / "torchvision_best")
|
||||
width, height = info["image_size"][0], info["image_size"][1]
|
||||
rows.append(
|
||||
[
|
||||
repo_id,
|
||||
f"{width} x {height}",
|
||||
info["compression_factor"],
|
||||
info["load_time_factor"],
|
||||
info["avg_per_pixel_l2_error"],
|
||||
]
|
||||
)
|
||||
display_markdown_table(headers, rows)
|
||||
|
||||
|
||||
def main():
|
||||
for timestamps_mode in TIMESTAMPS_MODES:
|
||||
bench_dir = OUTPUT_DIR / timestamps_mode
|
||||
|
||||
print(f"### `{timestamps_mode}`")
|
||||
print()
|
||||
|
||||
print("**`pix_fmt`**")
|
||||
headers = ["repo_id", "pix_fmt", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
|
||||
rows = []
|
||||
for repo_id in repo_ids:
|
||||
for pix_fmt in ["yuv420p", "yuv444p"]:
|
||||
cfg = {
|
||||
"repo_id": repo_id,
|
||||
# video encoding
|
||||
"g": 2,
|
||||
"crf": None,
|
||||
"pix_fmt": pix_fmt,
|
||||
# video decoding
|
||||
"device": "cpu",
|
||||
"decoder": "torchvision",
|
||||
"decoder_kwgs": {},
|
||||
}
|
||||
if not dry_run:
|
||||
run_video_benchmark(bench_dir / repo_id / f"torchvision_{pix_fmt}", cfg, timestamps_mode)
|
||||
info = load_info(bench_dir / repo_id / f"torchvision_{pix_fmt}")
|
||||
rows.append(
|
||||
[
|
||||
repo_id,
|
||||
pix_fmt,
|
||||
info["compression_factor"],
|
||||
info["load_time_factor"],
|
||||
info["avg_per_pixel_l2_error"],
|
||||
]
|
||||
)
|
||||
display_markdown_table(headers, rows)
|
||||
for name, values in BENCHMARKS.items():
|
||||
one_variable_study(name, values, DATASET_REPO_IDS, bench_dir, timestamps_mode, DRY_RUN)
|
||||
|
||||
print("**`g`**")
|
||||
headers = ["repo_id", "g", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
|
||||
rows = []
|
||||
for repo_id in repo_ids:
|
||||
for g in [1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None]:
|
||||
cfg = {
|
||||
"repo_id": repo_id,
|
||||
# video encoding
|
||||
"g": g,
|
||||
"pix_fmt": "yuv444p",
|
||||
# video decoding
|
||||
"device": "cpu",
|
||||
"decoder": "torchvision",
|
||||
"decoder_kwgs": {},
|
||||
}
|
||||
if not dry_run:
|
||||
run_video_benchmark(bench_dir / repo_id / f"torchvision_g_{g}", cfg, timestamps_mode)
|
||||
info = load_info(bench_dir / repo_id / f"torchvision_g_{g}")
|
||||
rows.append(
|
||||
[
|
||||
repo_id,
|
||||
g,
|
||||
info["compression_factor"],
|
||||
info["load_time_factor"],
|
||||
info["avg_per_pixel_l2_error"],
|
||||
]
|
||||
)
|
||||
display_markdown_table(headers, rows)
|
||||
|
||||
print("**`crf`**")
|
||||
headers = ["repo_id", "crf", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
|
||||
rows = []
|
||||
for repo_id in repo_ids:
|
||||
for crf in [0, 5, 10, 15, 20, None, 25, 30, 40, 50]:
|
||||
cfg = {
|
||||
"repo_id": repo_id,
|
||||
# video encoding
|
||||
"g": 2,
|
||||
"crf": crf,
|
||||
"pix_fmt": "yuv444p",
|
||||
# video decoding
|
||||
"device": "cpu",
|
||||
"decoder": "torchvision",
|
||||
"decoder_kwgs": {},
|
||||
}
|
||||
if not dry_run:
|
||||
run_video_benchmark(bench_dir / repo_id / f"torchvision_crf_{crf}", cfg, timestamps_mode)
|
||||
info = load_info(bench_dir / repo_id / f"torchvision_crf_{crf}")
|
||||
rows.append(
|
||||
[
|
||||
repo_id,
|
||||
crf,
|
||||
info["compression_factor"],
|
||||
info["load_time_factor"],
|
||||
info["avg_per_pixel_l2_error"],
|
||||
]
|
||||
)
|
||||
display_markdown_table(headers, rows)
|
||||
|
||||
print("**best**")
|
||||
headers = ["repo_id", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
|
||||
rows = []
|
||||
for repo_id in repo_ids:
|
||||
cfg = {
|
||||
"repo_id": repo_id,
|
||||
# video encoding
|
||||
"g": 2,
|
||||
"crf": None,
|
||||
"pix_fmt": "yuv444p",
|
||||
# video decoding
|
||||
"device": "cpu",
|
||||
"decoder": "torchvision",
|
||||
"decoder_kwgs": {},
|
||||
}
|
||||
if not dry_run:
|
||||
run_video_benchmark(bench_dir / repo_id / "torchvision_best", cfg, timestamps_mode)
|
||||
info = load_info(bench_dir / repo_id / "torchvision_best")
|
||||
rows.append(
|
||||
[
|
||||
repo_id,
|
||||
info["compression_factor"],
|
||||
info["load_time_factor"],
|
||||
info["avg_per_pixel_l2_error"],
|
||||
]
|
||||
)
|
||||
display_markdown_table(headers, rows)
|
||||
# best_study(DATASET_REPO_IDS, bench_dir, timestamps_mode, DRY_RUN)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -96,6 +96,7 @@ def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotData
|
||||
split=split,
|
||||
delta_timestamps=cfg.training.get("delta_timestamps"),
|
||||
image_transforms=image_transforms,
|
||||
video_backend=cfg.video_backend,
|
||||
)
|
||||
else:
|
||||
dataset = MultiLeRobotDataset(
|
||||
@@ -103,6 +104,7 @@ def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotData
|
||||
split=split,
|
||||
delta_timestamps=cfg.training.get("delta_timestamps"),
|
||||
image_transforms=image_transforms,
|
||||
video_backend=cfg.video_backend,
|
||||
)
|
||||
|
||||
if cfg.get("override_dataset_stats"):
|
||||
|
||||
@@ -48,6 +48,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
split: str = "train",
|
||||
image_transforms: Callable | None = None,
|
||||
delta_timestamps: dict[list[float]] | None = None,
|
||||
video_backend: str | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.repo_id = repo_id
|
||||
@@ -69,6 +70,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
self.info = load_info(repo_id, version, root)
|
||||
if self.video:
|
||||
self.videos_dir = load_videos(repo_id, version, root)
|
||||
self.video_backend = video_backend if video_backend is not None else "pyav"
|
||||
|
||||
@property
|
||||
def fps(self) -> int:
|
||||
@@ -149,6 +151,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
self.video_frame_keys,
|
||||
self.videos_dir,
|
||||
self.tolerance_s,
|
||||
self.video_backend,
|
||||
)
|
||||
|
||||
if self.image_transforms is not None:
|
||||
@@ -188,6 +191,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
stats=None,
|
||||
info=None,
|
||||
videos_dir=None,
|
||||
video_backend=None,
|
||||
) -> "LeRobotDataset":
|
||||
"""Create a LeRobot Dataset from existing data and attributes instead of loading from the filesystem.
|
||||
|
||||
@@ -210,6 +214,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
obj.stats = stats
|
||||
obj.info = info if info is not None else {}
|
||||
obj.videos_dir = videos_dir
|
||||
obj.video_backend = video_backend if video_backend is not None else "pyav"
|
||||
return obj
|
||||
|
||||
|
||||
@@ -228,6 +233,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
|
||||
split: str = "train",
|
||||
image_transforms: Callable | None = None,
|
||||
delta_timestamps: dict[list[float]] | None = None,
|
||||
video_backend: str | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.repo_ids = repo_ids
|
||||
@@ -241,6 +247,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
|
||||
split=split,
|
||||
delta_timestamps=delta_timestamps,
|
||||
image_transforms=image_transforms,
|
||||
video_backend=video_backend,
|
||||
)
|
||||
for repo_id in repo_ids
|
||||
]
|
||||
|
||||
101
lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
Normal file
101
lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py
Normal file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Contains utilities to process raw data format of png images files recorded with capture_camera_feed.py
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from datasets import Dataset, Features, Image, Value
|
||||
from PIL import Image as PILImage
|
||||
|
||||
from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes
|
||||
from lerobot.common.datasets.utils import calculate_episode_data_index, hf_transform_to_torch
|
||||
from lerobot.common.datasets.video_utils import VideoFrame
|
||||
|
||||
|
||||
def check_format(raw_dir: Path) -> bool:
|
||||
image_paths = list(raw_dir.glob("frame_*.png"))
|
||||
if len(image_paths) == 0:
|
||||
raise ValueError
|
||||
|
||||
|
||||
def load_from_raw(raw_dir: Path, fps: int, episodes: list[int] | None = None):
|
||||
if episodes is not None:
|
||||
# TODO(aliberts): add support for multi-episodes.
|
||||
raise NotImplementedError()
|
||||
|
||||
ep_dict = {}
|
||||
ep_idx = 0
|
||||
|
||||
image_paths = sorted(raw_dir.glob("frame_*.png"))
|
||||
num_frames = len(image_paths)
|
||||
|
||||
ep_dict["observation.image"] = [PILImage.open(x) for x in image_paths]
|
||||
ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
|
||||
ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
|
||||
ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
|
||||
|
||||
ep_dicts = [ep_dict]
|
||||
data_dict = concatenate_episodes(ep_dicts)
|
||||
total_frames = data_dict["frame_index"].shape[0]
|
||||
data_dict["index"] = torch.arange(0, total_frames, 1)
|
||||
return data_dict
|
||||
|
||||
|
||||
def to_hf_dataset(data_dict, video) -> Dataset:
|
||||
features = {}
|
||||
if video:
|
||||
features["observation.image"] = VideoFrame()
|
||||
else:
|
||||
features["observation.image"] = Image()
|
||||
|
||||
features["episode_index"] = Value(dtype="int64", id=None)
|
||||
features["frame_index"] = Value(dtype="int64", id=None)
|
||||
features["timestamp"] = Value(dtype="float32", id=None)
|
||||
features["index"] = Value(dtype="int64", id=None)
|
||||
|
||||
hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
|
||||
hf_dataset.set_transform(hf_transform_to_torch)
|
||||
return hf_dataset
|
||||
|
||||
|
||||
def from_raw_to_lerobot_format(
|
||||
raw_dir: Path,
|
||||
videos_dir: Path,
|
||||
fps: int | None = None,
|
||||
video: bool = True,
|
||||
episodes: list[int] | None = None,
|
||||
):
|
||||
if video or episodes is not None:
|
||||
# TODO(aliberts): support this
|
||||
raise NotImplementedError
|
||||
|
||||
# sanity check
|
||||
check_format(raw_dir)
|
||||
|
||||
if fps is None:
|
||||
fps = 30
|
||||
|
||||
data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes)
|
||||
hf_dataset = to_hf_dataset(data_dict, video)
|
||||
episode_data_index = calculate_episode_data_index(hf_dataset)
|
||||
info = {
|
||||
"fps": fps,
|
||||
"video": video,
|
||||
}
|
||||
return hf_dataset, episode_data_index, info
|
||||
@@ -27,7 +27,11 @@ from datasets.features.features import register_feature
|
||||
|
||||
|
||||
def load_from_videos(
|
||||
item: dict[str, torch.Tensor], video_frame_keys: list[str], videos_dir: Path, tolerance_s: float
|
||||
item: dict[str, torch.Tensor],
|
||||
video_frame_keys: list[str],
|
||||
videos_dir: Path,
|
||||
tolerance_s: float,
|
||||
backend: str = "pyav",
|
||||
):
|
||||
"""Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function
|
||||
in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a Segmentation Fault.
|
||||
@@ -46,14 +50,14 @@ def load_from_videos(
|
||||
raise NotImplementedError("All video paths are expected to be the same for now.")
|
||||
video_path = data_dir / paths[0]
|
||||
|
||||
frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s)
|
||||
frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
|
||||
item[key] = frames
|
||||
else:
|
||||
# load one frame
|
||||
timestamps = [item[key]["timestamp"]]
|
||||
video_path = data_dir / item[key]["path"]
|
||||
|
||||
frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s)
|
||||
frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
|
||||
item[key] = frames[0]
|
||||
|
||||
return item
|
||||
@@ -63,11 +67,23 @@ def decode_video_frames_torchvision(
|
||||
video_path: str,
|
||||
timestamps: list[float],
|
||||
tolerance_s: float,
|
||||
device: str = "cpu",
|
||||
backend: str = "pyav",
|
||||
log_loaded_timestamps: bool = False,
|
||||
):
|
||||
"""Loads frames associated to the requested timestamps of a video
|
||||
|
||||
The backend can be either "pyav" (default) or "video_reader".
|
||||
"video_reader" requires installing torchvision from source, see:
|
||||
https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
|
||||
(note that you need to compile against ffmpeg<4.3)
|
||||
|
||||
While both use cpu, "video_reader" is faster than "pyav" but requires additional setup.
|
||||
See our benchmark results for more info on performance:
|
||||
https://github.com/huggingface/lerobot/pull/220
|
||||
|
||||
See torchvision doc for more info on these two backends:
|
||||
https://pytorch.org/vision/0.18/index.html?highlight=backend#torchvision.set_video_backend
|
||||
|
||||
Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
|
||||
the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to
|
||||
that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame,
|
||||
@@ -78,21 +94,9 @@ def decode_video_frames_torchvision(
|
||||
|
||||
# set backend
|
||||
keyframes_only = False
|
||||
if device == "cpu":
|
||||
# explicitely use pyav
|
||||
torchvision.set_video_backend("pyav")
|
||||
torchvision.set_video_backend(backend)
|
||||
if backend == "pyav":
|
||||
keyframes_only = True # pyav doesnt support accuracte seek
|
||||
elif device == "cuda":
|
||||
# TODO(rcadene, aliberts): implement video decoding with GPU
|
||||
# torchvision.set_video_backend("cuda")
|
||||
# torchvision.set_video_backend("video_reader")
|
||||
# requires installing torchvision from source, see: https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
|
||||
# check possible bug: https://github.com/pytorch/vision/issues/7745
|
||||
raise NotImplementedError(
|
||||
"Video decoding on gpu with cuda is currently not supported. Use `device='cpu'`."
|
||||
)
|
||||
else:
|
||||
raise ValueError(device)
|
||||
|
||||
# set a video stream reader
|
||||
# TODO(rcadene): also load audio stream at the same time
|
||||
@@ -120,7 +124,9 @@ def decode_video_frames_torchvision(
|
||||
if current_ts >= last_ts:
|
||||
break
|
||||
|
||||
reader.container.close()
|
||||
if backend == "pyav":
|
||||
reader.container.close()
|
||||
|
||||
reader = None
|
||||
|
||||
query_ts = torch.tensor(timestamps)
|
||||
|
||||
@@ -28,6 +28,7 @@ seed: ???
|
||||
# "dataset_index" into the returned item. The index mapping is made according to the order in which the
|
||||
# datsets are provided.
|
||||
dataset_repo_id: lerobot/pusht
|
||||
video_backend: pyav
|
||||
|
||||
training:
|
||||
offline_steps: ???
|
||||
|
||||
@@ -55,7 +55,6 @@ from safetensors.torch import save_file
|
||||
|
||||
from lerobot.common.datasets.compute_stats import compute_stats
|
||||
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
|
||||
from lerobot.common.datasets.push_dataset_to_hub._download_raw import download_raw
|
||||
from lerobot.common.datasets.utils import flatten_dict
|
||||
|
||||
|
||||
@@ -70,6 +69,8 @@ def get_from_raw_to_lerobot_format_fn(raw_format: str):
|
||||
from lerobot.common.datasets.push_dataset_to_hub.dora_parquet_format import from_raw_to_lerobot_format
|
||||
elif raw_format == "xarm_pkl":
|
||||
from lerobot.common.datasets.push_dataset_to_hub.xarm_pkl_format import from_raw_to_lerobot_format
|
||||
elif raw_format == "cam_png":
|
||||
from lerobot.common.datasets.push_dataset_to_hub.cam_png_format import from_raw_to_lerobot_format
|
||||
else:
|
||||
raise ValueError(
|
||||
f"The selected {raw_format} can't be found. Did you add it to `lerobot/scripts/push_dataset_to_hub.py::get_from_raw_to_lerobot_format_fn`?"
|
||||
@@ -182,10 +183,6 @@ def push_dataset_to_hub(
|
||||
meta_data_dir = Path(cache_dir) / "meta_data"
|
||||
videos_dir = Path(cache_dir) / "videos"
|
||||
|
||||
# Download the raw dataset if available
|
||||
if not raw_dir.exists():
|
||||
download_raw(raw_dir, dataset_id)
|
||||
|
||||
if raw_format is None:
|
||||
# TODO(rcadene, adilzouitine): implement auto_find_raw_format
|
||||
raise NotImplementedError()
|
||||
|
||||
Reference in New Issue
Block a user