fix(dataset_tools) Critical bug in modify features (#2342)
Some checks failed
Documentation / Build Main Docs (push) Failing after 0s
Documentation / Build PR Docs (push) Has been skipped
Fast Tests / Fast Pytest Tests (push) Failing after 48s
Full Tests / Full Tests (push) Failing after 4s
Quality / Run Pre-commit Hooks (Lint, Format & Static Analysis) (push) Failing after 1m45s
Security / Secret Leaks Scan (push) Failing after 38s
Full Tests / Build and Push Docker (push) Has been cancelled
Full Tests / GPU Tests (push) Has been cancelled
Full Tests / Delete PR Image (push) Has been cancelled
Nightly / Build CPU Docker for Nightly (push) Has been cancelled
Nightly / Build GPU Docker for Nightly (push) Has been cancelled
Nightly / Nightly CPU Tests (push) Has been cancelled
Nightly / Nightly GPU Tests (push) Has been cancelled
Nightly / Nightly Multi-GPU Tests (push) Has been cancelled
Unbound Dependency Tests / Full Unbound Tests (push) Failing after 49s
Unbound Dependency Tests / Build and Push Docker (push) Has been cancelled
Unbound Dependency Tests / GPU Unbound Tests (push) Has been cancelled
Unbound Dependency Tests / Delete Unbound Image (push) Has been cancelled
Stale / Close Stale Issues and PRs (push) Failing after 10s
* fix bug in `_copy_data_with_feature_changes`

* Update src/lerobot/datasets/dataset_tools.py

Co-authored-by: Caroline Pascal <caroline8.pascal@gmail.com>
Signed-off-by: Michel Aractingi <michel.aractingi@huggingface.co>

* add missing import

---------

Signed-off-by: Michel Aractingi <michel.aractingi@huggingface.co>
Co-authored-by: Caroline Pascal <caroline8.pascal@gmail.com>
src/lerobot/datasets/dataset_tools.py

@@ -39,6 +39,7 @@ from lerobot.datasets.aggregate import aggregate_datasets
 from lerobot.datasets.compute_stats import aggregate_stats
 from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
 from lerobot.datasets.utils import (
+    DATA_DIR,
     DEFAULT_CHUNK_SIZE,
     DEFAULT_DATA_FILE_SIZE_IN_MB,
     DEFAULT_DATA_PATH,
@@ -962,28 +963,23 @@ def _copy_data_with_feature_changes(
     remove_features: list[str] | None = None,
 ) -> None:
     """Copy data while adding or removing features."""
-    if dataset.meta.episodes is None:
-        dataset.meta.episodes = load_episodes(dataset.meta.root)
+    data_dir = dataset.root / DATA_DIR
+    parquet_files = sorted(data_dir.glob("*/*.parquet"))
 
-    # Map file paths to episode indices to extract chunk/file indices
-    file_to_episodes: dict[Path, set[int]] = {}
-    for ep_idx in range(dataset.meta.total_episodes):
-        file_path = dataset.meta.get_data_file_path(ep_idx)
-        if file_path not in file_to_episodes:
-            file_to_episodes[file_path] = set()
-        file_to_episodes[file_path].add(ep_idx)
+    if not parquet_files:
+        raise ValueError(f"No parquet files found in {data_dir}")
 
     frame_idx = 0
 
-    for src_path in tqdm(sorted(file_to_episodes.keys()), desc="Processing data files"):
-        df = pd.read_parquet(dataset.root / src_path).reset_index(drop=True)
+    for src_path in tqdm(parquet_files, desc="Processing data files"):
+        df = pd.read_parquet(src_path).reset_index(drop=True)
 
-        # Get chunk_idx and file_idx from the source file's first episode
-        episodes_in_file = file_to_episodes[src_path]
-        first_ep_idx = min(episodes_in_file)
-        src_ep = dataset.meta.episodes[first_ep_idx]
-        chunk_idx = src_ep["data/chunk_index"]
-        file_idx = src_ep["data/file_index"]
+        relative_path = src_path.relative_to(dataset.root)
+        chunk_dir = relative_path.parts[1]
+        file_name = relative_path.parts[2]
+
+        chunk_idx = int(chunk_dir.split("-")[1])
+        file_idx = int(file_name.split("-")[1].split(".")[0])
 
         if remove_features:
             df = df.drop(columns=remove_features, errors="ignore")
@@ -1009,7 +1005,7 @@ def _copy_data_with_feature_changes(
             df[feature_name] = feature_slice
             frame_idx = end_idx
 
-        # Write using the preserved chunk_idx and file_idx from source
+        # Write using the same chunk/file structure as source
         dst_path = new_meta.root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
         dst_path.parent.mkdir(parents=True, exist_ok=True)
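For context, a minimal sketch of the path-based index recovery the patch switches to, assuming the data/chunk-XXX/file-XXX.parquet layout that the glob above targets; the root path and index values here are hypothetical, chosen only for illustration:

from pathlib import Path

# Hypothetical dataset root and one parquet file, mirroring the layout globbed above.
root = Path("/tmp/my_dataset")
src_path = root / "data" / "chunk-000" / "file-012.parquet"

# Same parsing as the patched loop: recover chunk/file indices from the file path
# itself rather than from the dataset.meta.episodes metadata the old code read.
relative_path = src_path.relative_to(root)               # data/chunk-000/file-012.parquet
chunk_idx = int(relative_path.parts[1].split("-")[1])    # "chunk-000"        -> 0
file_idx = int(relative_path.parts[2].split("-")[1].split(".")[0])  # "file-012.parquet" -> 12

print(chunk_idx, file_idx)  # 0 12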