most unit tests passing (TODO: convert datasets)

Remi Cadene
2025-04-16 21:30:58 +02:00
parent c2a05a1fde
commit 6b6a990f4c
22 changed files with 150 additions and 136 deletions

View File

@@ -51,6 +51,7 @@ from lerobot.common.datasets.utils import (
get_features_from_robot,
get_hf_dataset_size_in_mb,
get_hf_features_from_features,
get_parquet_file_size_in_mb,
get_parquet_num_frames,
get_safe_version,
get_video_duration_in_s,
@@ -59,15 +60,16 @@ from lerobot.common.datasets.utils import (
load_episodes,
load_info,
load_nested_dataset,
load_stats,
load_tasks,
update_chunk_file_indices,
validate_episode_buffer,
validate_frame,
write_info,
write_json,
write_stats,
write_tasks,
)
from lerobot.common.datasets.v30.convert_dataset_v21_to_v30 import get_parquet_file_size_in_mb
from lerobot.common.datasets.video_utils import (
VideoFrame,
decode_video_frames_torchvision,
@@ -111,8 +113,7 @@ class LeRobotDatasetMetadata:
check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
self.tasks = load_tasks(self.root)
self.episodes = load_episodes(self.root)
# TODO(rcadene): https://huggingface.slack.com/archives/C02V51Q3800/p1743517952388249?thread_ts=1742896075.499119&cid=C02V51Q3800
# self.stats = aggregate_stats(list(self.episodes_stats.values()))
self.stats = load_stats(self.root)
def pull_from_repo(
self,
@@ -272,10 +273,17 @@ class LeRobotDatasetMetadata:
chunk_idx, file_idx = 0, 0
df["meta/episodes/chunk_index"] = [chunk_idx]
df["meta/episodes/file_index"] = [file_idx]
df["dataset_from_index"] = [0]
df["dataset_to_index"] = [len(df)]
else:
# Retrieve information from the latest parquet file
latest_ep = self.episodes.with_format(
columns=["meta/episodes/chunk_index", "meta/episodes/file_index"]
columns=[
"meta/episodes/chunk_index",
"meta/episodes/file_index",
"dataset_from_index",
"dataset_to_index",
]
)[-1]
chunk_idx, file_idx = (
latest_ep["meta/episodes/chunk_index"],
@@ -285,16 +293,18 @@ class LeRobotDatasetMetadata:
latest_path = self.root / DEFAULT_EPISODES_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
latest_size_in_mb = get_parquet_file_size_in_mb(latest_path)
# Determine if a new parquet file is needed
if latest_size_in_mb + ep_size_in_mb >= self.files_size_in_mb:
# Size limit is reached, prepare new parquet file
chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self.chunks_size)
df["meta/episodes/chunk_index"] = [chunk_idx]
df["meta/episodes/file_index"] = [file_idx]
else:
# Update the existing parquet file with new row
df["meta/episodes/chunk_index"] = [chunk_idx]
df["meta/episodes/file_index"] = [file_idx]
# Update the existing pandas dataframe with the new row
df["meta/episodes/chunk_index"] = [chunk_idx]
df["meta/episodes/file_index"] = [file_idx]
df["dataset_from_index"] = [latest_ep["dataset_to_index"]]
df["dataset_to_index"] = [latest_ep["dataset_to_index"] + len(df)]
if latest_size_in_mb + ep_size_in_mb < self.files_size_in_mb:
# Size limit wasn't reached, concatenate the latest dataframe with the new one
latest_df = pd.read_parquet(latest_path)
df = pd.concat([latest_df, df], ignore_index=True)
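The rotation above leans on `update_chunk_file_indices`. A minimal sketch of its assumed behavior, matching the `chunk-{chunk_index:03d}/file-{file_index:03d}` layout (an illustrative reimplementation, not the helper shipped in `lerobot.common.datasets.utils`):

```python
def update_chunk_file_indices(chunk_idx: int, file_idx: int, chunks_size: int) -> tuple[int, int]:
    # Sketch: move to the next file slot, rolling over to a new chunk
    # directory once the current chunk holds `chunks_size` files.
    if file_idx == chunks_size - 1:
        return chunk_idx + 1, 0
    return chunk_idx, file_idx + 1
```

When the latest parquet file would exceed `files_size_in_mb`, the new episode row therefore lands in a fresh `chunk-XXX/file-XXX` path; otherwise it is appended to the existing file via the `pd.concat` above.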
@@ -333,8 +343,8 @@ class LeRobotDatasetMetadata:
self.update_video_info()
write_info(self.info, self.root)
self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats else episode_stats
# TODO: write stats
self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats
write_stats(self.stats, self.root)
def update_video_info(self) -> None:
"""
@@ -401,8 +411,7 @@ class LeRobotDatasetMetadata:
obj.tasks = None
obj.episodes = None
# TODO(rcadene) stats
obj.stats = {}
obj.stats = None
obj.info = create_empty_dataset_info(CODEBASE_VERSION, fps, robot_type, features, use_videos)
if len(obj.video_keys) > 0 and not use_videos:
raise ValueError("use_videos=False but the provided features contain video keys.")

View File

@@ -337,13 +337,11 @@ def compute_sampler_weights(
if len(offline_dataset) > 0:
offline_data_mask_indices = []
for start_index, end_index in zip(
offline_dataset.episode_data_index["from"],
offline_dataset.episode_data_index["to"],
offline_dataset.meta.episodes["dataset_from_index"],
offline_dataset.meta.episodes["dataset_to_index"],
strict=True,
):
offline_data_mask_indices.extend(
range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
)
offline_data_mask_indices.extend(range(start_index, end_index - offline_drop_n_last_frames))
offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
weights.append(

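As an aside on how such frame-eligibility masks are typically consumed: a hedged sketch (the `WeightedRandomSampler` wiring below is illustrative, not part of this file):

```python
import torch

# Illustrative only: turn a boolean frame-eligibility mask into sampling
# weights that are uniform over the eligible frames.
mask = torch.zeros(10, dtype=torch.bool)
mask[0:8] = True  # e.g. one episode minus its dropped last frames
weights = mask.float() / mask.sum()
sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples=4)
```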
View File

@@ -21,7 +21,8 @@ import torch
class EpisodeAwareSampler:
def __init__(
self,
episode_data_index: dict,
dataset_from_indices: list[int],
dataset_to_indices: list[int],
episode_indices_to_use: Union[list, None] = None,
drop_n_first_frames: int = 0,
drop_n_last_frames: int = 0,
@@ -30,7 +31,8 @@ class EpisodeAwareSampler:
"""Sampler that optionally incorporates episode boundary information.
Args:
episode_data_index: Dictionary with keys 'from' and 'to' containing the start and end indices of each episode.
dataset_from_indices: Start index of each episode in the dataset, one entry per episode.
dataset_to_indices: End index (exclusive) of each episode in the dataset, one entry per episode.
episode_indices_to_use: List of episode indices to use. If None, all episodes are used.
Assumes that episodes are indexed from 0 to N-1.
drop_n_first_frames: Number of frames to drop from the start of each episode.
@@ -39,12 +41,10 @@ class EpisodeAwareSampler:
"""
indices = []
for episode_idx, (start_index, end_index) in enumerate(
zip(episode_data_index["from"], episode_data_index["to"], strict=True)
zip(dataset_from_indices, dataset_to_indices, strict=True)
):
if episode_indices_to_use is None or episode_idx in episode_indices_to_use:
indices.extend(
range(start_index.item() + drop_n_first_frames, end_index.item() - drop_n_last_frames)
)
indices.extend(range(start_index + drop_n_first_frames, end_index - drop_n_last_frames))
self.indices = indices
self.shuffle = shuffle
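Given the signature change, a usage sketch, assuming `dataset` is a `LeRobotDataset` whose `meta.episodes` table carries the new columns (batch size and frame-drop values are illustrative):

```python
import torch

sampler = EpisodeAwareSampler(
    dataset_from_indices=dataset.meta.episodes["dataset_from_index"],
    dataset_to_indices=dataset.meta.episodes["dataset_to_index"],
    drop_n_last_frames=7,  # keep sampled windows clear of episode ends
    shuffle=True,
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, sampler=sampler)
```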

View File

@@ -21,7 +21,6 @@ import shutil
import subprocess
import tempfile
from collections.abc import Iterator
from itertools import accumulate
from pathlib import Path
from pprint import pformat
from types import SimpleNamespace
@@ -56,23 +55,23 @@ DEFAULT_FILE_SIZE_IN_MB = 500.0 # Max size per file
# Keep legacy for `convert_dataset_v21_to_v30.py`
LEGACY_EPISODES_PATH = "meta/episodes.jsonl"
LEGACY_STATS_PATH = "meta/stats.json"
LEGACY_EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
LEGACY_TASKS_PATH = "meta/tasks.jsonl"
LEGACY_DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
LEGACY_DEFAULT_PARQUET_PATH = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
# TODO
DEFAULT_IMAGE_PATH = "images/{image_key}/episode_{episode_index:06d}/frame_{frame_index:06d}.png"
DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.png"
INFO_PATH = "meta/info.json"
STATS_PATH = "meta/stats.json"
EPISODES_DIR = "meta/episodes"
DATA_DIR = "data"
VIDEO_DIR = "videos"
INFO_PATH = "meta/info.json"
CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_TASKS_PATH = "meta/tasks.parquet"
DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
@@ -95,6 +94,12 @@ DEFAULT_FEATURES = {
}
def get_parquet_file_size_in_mb(parquet_path):
metadata = pq.read_metadata(parquet_path)
# A row group's total_byte_size already covers every row in that group, so
# the uncompressed size is the sum over row groups, not num_rows times the
# first group's size.
uncompressed_size = sum(metadata.row_group(i).total_byte_size for i in range(metadata.num_row_groups))
return uncompressed_size / (1024**2)
def get_hf_dataset_size_in_mb(hf_ds: Dataset) -> float:
return hf_ds.data.nbytes / (1024**2)
@@ -317,7 +322,7 @@ def load_info(local_dir: Path) -> dict:
def write_stats(stats: dict, local_dir: Path):
serialized_stats = serialize_dict(stats)
write_json(serialized_stats, local_dir / LEGACY_STATS_PATH)
write_json(serialized_stats, local_dir / STATS_PATH)
def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:
@@ -326,9 +331,9 @@ def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:
def load_stats(local_dir: Path) -> dict[str, dict[str, np.ndarray]]:
if not (local_dir / LEGACY_STATS_PATH).exists():
if not (local_dir / STATS_PATH).exists():
return None
stats = load_json(local_dir / LEGACY_STATS_PATH)
stats = load_json(local_dir / STATS_PATH)
return cast_stats_to_numpy(stats)
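Both helpers now target the same `meta/stats.json`. A round-trip sketch, assuming stats are nested dicts of numpy arrays and that `write_json` creates parent directories (paths and feature names below are hypothetical):

```python
from pathlib import Path

import numpy as np

root = Path("/tmp/my_dataset")
stats = {"observation.state": {"mean": np.zeros(6), "std": np.ones(6)}}
write_stats(stats, root)     # serializes arrays to JSON at meta/stats.json
restored = load_stats(root)  # cast_stats_to_numpy turns lists back into arrays
assert np.array_equal(restored["observation.state"]["std"], stats["observation.state"]["std"])
```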
@@ -375,13 +380,6 @@ def write_episodes(episodes: Dataset, local_dir: Path):
if get_hf_dataset_size_in_mb(episodes) > DEFAULT_FILE_SIZE_IN_MB:
raise NotImplementedError("Contact a maintainer.")
def add_chunk_file_indices(row):
row["chunk_index"] = 0
row["file_index"] = 0
return row
episodes = episodes.map(add_chunk_file_indices)
fpath = local_dir / DEFAULT_EPISODES_PATH.format(chunk_index=0, file_index=0)
fpath.parent.mkdir(parents=True, exist_ok=True)
episodes.to_parquet(fpath)
@@ -642,20 +640,6 @@ def create_empty_dataset_info(
}
def get_episode_data_index(
episode_dicts: dict[dict], episodes: list[int] | None = None
) -> dict[str, torch.Tensor]:
episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in episode_dicts.items()}
if episodes is not None:
episode_lengths = {ep_idx: episode_lengths[ep_idx] for ep_idx in episodes}
cumulative_lengths = list(accumulate(episode_lengths.values()))
return {
"from": torch.LongTensor([0] + cumulative_lengths[:-1]),
"to": torch.LongTensor(cumulative_lengths),
}
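The deleted helper derived episode boundaries on the fly from episode lengths; the v3.0 layout stores the same values as `dataset_from_index`/`dataset_to_index` columns instead. The correspondence, mirroring the removed `accumulate`-based logic (episode lengths are hypothetical):

```python
from itertools import accumulate

lengths = [52, 48, 60]                  # hypothetical per-episode frame counts
to_indices = list(accumulate(lengths))  # [52, 100, 160] -> "to" / dataset_to_index
from_indices = [0] + to_indices[:-1]    # [0, 52, 100]   -> "from" / dataset_from_index
```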
def check_timestamps_sync(
timestamps: np.ndarray,
episode_indices: np.ndarray,

View File

@@ -123,10 +123,10 @@ from lerobot.common.datasets.utils import (
DEFAULT_CHUNK_SIZE,
DEFAULT_DATA_PATH,
DEFAULT_VIDEO_PATH,
LEGACY_EPISODES_PATH,
INFO_PATH,
LEGACY_STATS_PATH,
LEGACY_EPISODES_PATH,
LEGACY_TASKS_PATH,
STATS_PATH,
create_branch,
create_lerobot_dataset_card,
flatten_dict,
@@ -188,7 +188,7 @@ def convert_stats_to_json(v1_dir: Path, v2_dir: Path) -> None:
serialized_stats = {key: value.tolist() for key, value in stats.items()}
serialized_stats = unflatten_dict(serialized_stats)
json_path = v2_dir / LEGACY_STATS_PATH
json_path = v2_dir / STATS_PATH
json_path.parent.mkdir(exist_ok=True, parents=True)
with open(json_path, "w") as f:
json.dump(serialized_stats, f, indent=4)
@@ -296,9 +296,7 @@ def split_parquet_by_episodes(
for ep_idx in range(ep_chunk_start, ep_chunk_end):
ep_table = table.filter(pc.equal(table["episode_index"], ep_idx))
episode_lengths.insert(ep_idx, len(ep_table))
output_file = output_dir / DEFAULT_DATA_PATH.format(
episode_chunk=ep_chunk, episode_index=ep_idx
)
output_file = output_dir / DEFAULT_DATA_PATH.format(episode_chunk=ep_chunk, episode_index=ep_idx)
pq.write_table(ep_table, output_file)
return episode_lengths

View File

@@ -23,7 +23,7 @@ import logging
from huggingface_hub import HfApi
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
from lerobot.common.datasets.utils import LEGACY_EPISODES_STATS_PATH, LEGACY_STATS_PATH, load_stats, write_info
from lerobot.common.datasets.utils import LEGACY_EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info
from lerobot.common.datasets.v21.convert_stats import check_aggregate_stats, convert_stats
V20 = "v2.0"
@@ -60,15 +60,15 @@ def convert_dataset(
dataset.push_to_hub(branch=branch, tag_version=False, allow_patterns="meta/")
# delete old stats.json file
if (dataset.root / LEGACY_STATS_PATH).is_file:
(dataset.root / LEGACY_STATS_PATH).unlink()
if (dataset.root / STATS_PATH).is_file():
(dataset.root / STATS_PATH).unlink()
hub_api = HfApi()
if hub_api.file_exists(
repo_id=dataset.repo_id, filename=LEGACY_STATS_PATH, revision=branch, repo_type="dataset"
repo_id=dataset.repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset"
):
hub_api.delete_file(
path_in_repo=LEGACY_STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
path_in_repo=STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
)
hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")

View File

@@ -18,15 +18,16 @@ python lerobot/common/datasets/v30/convert_dataset_v21_to_v30.py \
"""
import argparse
import shutil
from pathlib import Path
import pandas as pd
import pyarrow.parquet as pq
import tqdm
from datasets import Dataset
from huggingface_hub import snapshot_download
from huggingface_hub import HfApi, snapshot_download
from lerobot.common.constants import HF_LEROBOT_HOME
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
from lerobot.common.datasets.utils import (
DEFAULT_CHUNK_SIZE,
DEFAULT_DATA_PATH,
@@ -34,6 +35,7 @@ from lerobot.common.datasets.utils import (
DEFAULT_VIDEO_PATH,
concat_video_files,
flatten_dict,
get_parquet_file_size_in_mb,
get_parquet_num_frames,
get_video_duration_in_s,
get_video_size_in_mb,
@@ -93,12 +95,6 @@ meta/info.json
"""
def get_parquet_file_size_in_mb(parquet_path):
metadata = pq.read_metadata(parquet_path)
uncompressed_size = metadata.num_rows * metadata.row_group(0).total_byte_size
return uncompressed_size / (1024**2)
# def generate_flat_ep_stats(episodes_stats):
# for ep_idx, ep_stats in episodes_stats.items():
# flat_ep_stats = flatten_dict(ep_stats)
@@ -148,8 +144,8 @@ def convert_data(root, new_root):
"episode_index": ep_idx,
"data/chunk_index": chunk_idx,
"data/file_index": file_idx,
"data/from_index": num_frames,
"data/to_index": num_frames + ep_num_frames,
"dataset_from_index": num_frames,
"dataset_to_index": num_frames + ep_num_frames,
}
size_in_mb += ep_size_in_mb
num_frames += ep_num_frames
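Because `num_frames` keeps accumulating across episodes, each episode's `[dataset_from_index, dataset_to_index)` is an absolute frame range over the concatenated dataset, which is what `compute_sampler_weights` and `EpisodeAwareSampler` consume. A trimmed sketch of that bookkeeping (episode sizes are hypothetical):

```python
num_frames = 0
episodes_metadata = []
for ep_idx, ep_num_frames in enumerate([52, 48, 60]):
    episodes_metadata.append({
        "episode_index": ep_idx,
        "dataset_from_index": num_frames,
        "dataset_to_index": num_frames + ep_num_frames,
    })
    num_frames += ep_num_frames
```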
@@ -337,6 +333,9 @@ def convert_dataset(
root = HF_LEROBOT_HOME / repo_id
new_root = HF_LEROBOT_HOME / f"{repo_id}_v30"
if new_root.is_dir():
shutil.rmtree(new_root)
snapshot_download(
repo_id,
repo_type="dataset",
@@ -350,6 +349,24 @@ def convert_dataset(
episodes_videos_metadata = convert_videos(root, new_root)
convert_episodes_metadata(root, new_root, episodes_metadata, episodes_videos_metadata)
shutil.move(str(root), str(root) + "_old")
shutil.move(str(new_root), str(root))
# TODO(rcadene)
if False:
hub_api = HfApi()
hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
hub_api.delete_files(
delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
repo_id=repo_id,
revision=branch,
repo_type="dataset",
)
hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
LeRobotDataset(repo_id).push_to_hub()
if __name__ == "__main__":
parser = argparse.ArgumentParser()