From f6b16f6d97155e3ce34ab2a1ec145e9413588197 Mon Sep 17 00:00:00 2001
From: Michel Aractingi
Date: Tue, 4 Nov 2025 15:56:41 +0100
Subject: [PATCH] fix(dataset_tools) Critical bug in modify features (#2342)

* fix bug in `_copy_data_with_feature_changes`

* Update src/lerobot/datasets/dataset_tools.py

Co-authored-by: Caroline Pascal
Signed-off-by: Michel Aractingi

* add missing import

---------

Signed-off-by: Michel Aractingi
Co-authored-by: Caroline Pascal
---
 src/lerobot/datasets/dataset_tools.py | 32 ++++++++++++---------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py
index 2735ba0a..2fb68dca 100644
--- a/src/lerobot/datasets/dataset_tools.py
+++ b/src/lerobot/datasets/dataset_tools.py
@@ -39,6 +39,7 @@ from lerobot.datasets.aggregate import aggregate_datasets
 from lerobot.datasets.compute_stats import aggregate_stats
 from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
 from lerobot.datasets.utils import (
+    DATA_DIR,
     DEFAULT_CHUNK_SIZE,
     DEFAULT_DATA_FILE_SIZE_IN_MB,
     DEFAULT_DATA_PATH,
@@ -962,28 +963,23 @@ def _copy_data_with_feature_changes(
     remove_features: list[str] | None = None,
 ) -> None:
     """Copy data while adding or removing features."""
-    if dataset.meta.episodes is None:
-        dataset.meta.episodes = load_episodes(dataset.meta.root)
+    data_dir = dataset.root / DATA_DIR
+    parquet_files = sorted(data_dir.glob("*/*.parquet"))
 
-    # Map file paths to episode indices to extract chunk/file indices
-    file_to_episodes: dict[Path, set[int]] = {}
-    for ep_idx in range(dataset.meta.total_episodes):
-        file_path = dataset.meta.get_data_file_path(ep_idx)
-        if file_path not in file_to_episodes:
-            file_to_episodes[file_path] = set()
-        file_to_episodes[file_path].add(ep_idx)
+    if not parquet_files:
+        raise ValueError(f"No parquet files found in {data_dir}")
 
     frame_idx = 0
 
-    for src_path in tqdm(sorted(file_to_episodes.keys()), desc="Processing data files"):
-        df = pd.read_parquet(dataset.root / src_path).reset_index(drop=True)
+    for src_path in tqdm(parquet_files, desc="Processing data files"):
+        df = pd.read_parquet(src_path).reset_index(drop=True)
 
-        # Get chunk_idx and file_idx from the source file's first episode
-        episodes_in_file = file_to_episodes[src_path]
-        first_ep_idx = min(episodes_in_file)
-        src_ep = dataset.meta.episodes[first_ep_idx]
-        chunk_idx = src_ep["data/chunk_index"]
-        file_idx = src_ep["data/file_index"]
+        relative_path = src_path.relative_to(dataset.root)
+        chunk_dir = relative_path.parts[1]
+        file_name = relative_path.parts[2]
+
+        chunk_idx = int(chunk_dir.split("-")[1])
+        file_idx = int(file_name.split("-")[1].split(".")[0])
 
         if remove_features:
             df = df.drop(columns=remove_features, errors="ignore")
@@ -1009,7 +1005,7 @@ def _copy_data_with_feature_changes(
             df[feature_name] = feature_slice
             frame_idx = end_idx
 
-        # Write using the preserved chunk_idx and file_idx from source
+        # Write using the same chunk/file structure as source
         dst_path = new_meta.root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
         dst_path.parent.mkdir(parents=True, exist_ok=True)
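
For reference, the rewritten loop walks the parquet files found on disk and parses the chunk
and file indices out of each path, instead of looking them up in dataset.meta.episodes. Below
is a minimal standalone sketch of that parsing, assuming the data/chunk-XXX/file-XXX.parquet
layout that DEFAULT_DATA_PATH encodes; the parse_chunk_and_file helper and the example path
are hypothetical, for illustration only:

    from pathlib import Path

    def parse_chunk_and_file(src_path: Path, dataset_root: Path) -> tuple[int, int]:
        """Recover (chunk_idx, file_idx) from a path like data/chunk-000/file-002.parquet."""
        relative = src_path.relative_to(dataset_root)  # parts: ("data", "chunk-000", "file-002.parquet")
        chunk_dir = relative.parts[1]                  # "chunk-000"
        file_name = relative.parts[2]                  # "file-002.parquet"
        chunk_idx = int(chunk_dir.split("-")[1])       # 0
        file_idx = int(file_name.split("-")[1].split(".")[0])  # 2
        return chunk_idx, file_idx

    root = Path("/tmp/my_dataset")  # hypothetical dataset root
    print(parse_chunk_and_file(root / "data/chunk-000/file-002.parquet", root))  # -> (0, 2)

Because the destination path is rebuilt from these same indices via DEFAULT_DATA_PATH, the copy
preserves the source dataset's chunk/file structure on disk.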