Improve dataset examples (#82)

Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
Remi
2024-04-18 11:43:16 +02:00
committed by GitHub
parent d5c4b0c344
commit 0928afd37d
15 changed files with 274 additions and 165 deletions

View File

@@ -1,63 +0,0 @@
"""
This script is designed to facilitate the creation of a subset of an existing dataset by selecting a specific number of frames from the original dataset.
This subset can then be used for running quick unit tests.
The script takes an input directory containing the original dataset and an output directory where the subset of the dataset will be saved.
Additionally, the number of frames to include in the subset can be specified.
The script ensures that the subset is a representative sample of the original dataset by copying the specified number of frames and retaining the structure and format of the data.
Usage:
Run the script with the following command, specifying the path to the input data directory,
the path to the output data directory, and optionally the number of frames to include in the subset dataset:
`python tests/scripts/mock_dataset.py --in-data-dir path/to/input_data --out-data-dir path/to/output_data`
Example:
`python tests/scripts/mock_dataset.py --in-data-dir data/pusht --out-data-dir tests/data/pusht`
"""
import argparse
import shutil
from pathlib import Path
import torch
def mock_dataset(in_data_dir, out_data_dir, num_frames):
in_data_dir = Path(in_data_dir)
out_data_dir = Path(out_data_dir)
out_data_dir.mkdir(exist_ok=True, parents=True)
# copy the first `n` frames for each data key so that we have real data
in_data_dict = torch.load(in_data_dir / "data_dict.pth")
out_data_dict = {key: in_data_dict[key][:num_frames].clone() for key in in_data_dict}
torch.save(out_data_dict, out_data_dir / "data_dict.pth")
# recreate data_ids_per_episode that corresponds to the subset
episodes = in_data_dict["episode"][:num_frames].tolist()
data_ids_per_episode = {}
for idx, ep_id in enumerate(episodes):
if ep_id not in data_ids_per_episode:
data_ids_per_episode[ep_id] = []
data_ids_per_episode[ep_id].append(idx)
for ep_id in data_ids_per_episode:
data_ids_per_episode[ep_id] = torch.tensor(data_ids_per_episode[ep_id])
torch.save(data_ids_per_episode, out_data_dir / "data_ids_per_episode.pth")
# copy the full statistics of dataset since it's small
in_stats_path = in_data_dir / "stats.pth"
out_stats_path = out_data_dir / "stats.pth"
shutil.copy(in_stats_path, out_stats_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create a dataset with a subset of frames for quick testing.")
parser.add_argument("--in-data-dir", type=str, help="Path to input data")
parser.add_argument("--out-data-dir", type=str, help="Path to save the output data")
parser.add_argument("--num-frames", type=int, default=50, help="Number of frames to copy over")
args = parser.parse_args()
mock_dataset(args.in_data_dir, args.out_data_dir, args.num_frames)
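The deleted script targeted the old `data_dict.pth` on-disk layout, which this commit moves away from. A minimal sketch of how that layout was consumed, assuming the three files written above (paths from the docstring example):

import torch

# Load the subset written by mock_dataset.
data_dict = torch.load("tests/data/pusht/data_dict.pth")
data_ids_per_episode = torch.load("tests/data/pusht/data_ids_per_episode.pth")

# Every copied frame index lands in exactly one episode bucket.
num_frames = len(data_dict["episode"])
assert sum(len(ids) for ids in data_ids_per_episode.values()) == num_frames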

View File

@@ -50,7 +50,7 @@ def test_factory(env_name, dataset_id, policy_name):
         keys_ndim_required.append(
             (key, 3, True),
         )
-        assert dataset.data_dict[key].dtype == torch.uint8, f"{key}"
+        assert dataset.hf_dataset[key].dtype == torch.uint8, f"{key}"
 
     # test number of dimensions
     for key, ndim, required in keys_ndim_required:
@@ -121,16 +121,16 @@ def test_compute_stats():
         batch_size=len(dataset),
         shuffle=False,
     )
-    data_dict = next(iter(dataloader))
+    hf_dataset = next(iter(dataloader))
 
     # compute stats based on all frames from the dataset without any batching
     expected_stats = {}
     for k, pattern in stats_patterns.items():
         expected_stats[k] = {}
-        expected_stats[k]["mean"] = einops.reduce(data_dict[k], pattern, "mean")
-        expected_stats[k]["std"] = torch.sqrt(einops.reduce((data_dict[k] - expected_stats[k]["mean"]) ** 2, pattern, "mean"))
-        expected_stats[k]["min"] = einops.reduce(data_dict[k], pattern, "min")
-        expected_stats[k]["max"] = einops.reduce(data_dict[k], pattern, "max")
+        expected_stats[k]["mean"] = einops.reduce(hf_dataset[k], pattern, "mean")
+        expected_stats[k]["std"] = torch.sqrt(einops.reduce((hf_dataset[k] - expected_stats[k]["mean"]) ** 2, pattern, "mean"))
+        expected_stats[k]["min"] = einops.reduce(hf_dataset[k], pattern, "min")
+        expected_stats[k]["max"] = einops.reduce(hf_dataset[k], pattern, "max")
 
     # test computed stats match expected stats
     for k in stats_patterns:
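The reduction patterns come from `stats_patterns`, which is defined outside this hunk. As an illustration of what the test recomputes, here is a sketch for an image key (the `"b c h w -> c 1 1"` pattern and the shapes are assumptions, not taken from the diff):

import einops
import torch

batch = torch.rand(8, 3, 96, 96)  # hypothetical (batch, channel, height, width)
pattern = "b c h w -> c 1 1"  # assumed; the real patterns live in stats_patterns

mean = einops.reduce(batch, pattern, "mean")
std = torch.sqrt(einops.reduce((batch - mean) ** 2, pattern, "mean"))
min_ = einops.reduce(batch, pattern, "min")
max_ = einops.reduce(batch, pattern, "max")
assert mean.shape == std.shape == min_.shape == max_.shape == (3, 1, 1)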
@@ -153,47 +153,47 @@ def test_compute_stats():
 def test_load_previous_and_future_frames_within_tolerance():
-    data_dict = Dataset.from_dict({
+    hf_dataset = Dataset.from_dict({
         "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5],
         "index": [0, 1, 2, 3, 4],
         "episode_data_index_from": [0, 0, 0, 0, 0],
         "episode_data_index_to": [5, 5, 5, 5, 5],
     })
-    data_dict = data_dict.with_format("torch")
-    item = data_dict[2]
+    hf_dataset = hf_dataset.with_format("torch")
+    item = hf_dataset[2]
     delta_timestamps = {"index": [-0.2, 0, 0.139]}
     tol = 0.04
-    item = load_previous_and_future_frames(item, data_dict, delta_timestamps, tol)
+    item = load_previous_and_future_frames(item, hf_dataset, delta_timestamps, tol)
     data, is_pad = item["index"], item["index_is_pad"]
     assert torch.equal(data, torch.tensor([0, 2, 3])), "Data does not match expected values"
     assert not is_pad.any(), "Unexpected padding detected"
 
 
 def test_load_previous_and_future_frames_outside_tolerance_inside_episode_range():
-    data_dict = Dataset.from_dict({
+    hf_dataset = Dataset.from_dict({
         "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5],
         "index": [0, 1, 2, 3, 4],
         "episode_data_index_from": [0, 0, 0, 0, 0],
         "episode_data_index_to": [5, 5, 5, 5, 5],
     })
-    data_dict = data_dict.with_format("torch")
-    item = data_dict[2]
+    hf_dataset = hf_dataset.with_format("torch")
+    item = hf_dataset[2]
     delta_timestamps = {"index": [-0.2, 0, 0.141]}
     tol = 0.04
     with pytest.raises(AssertionError):
-        load_previous_and_future_frames(item, data_dict, delta_timestamps, tol)
+        load_previous_and_future_frames(item, hf_dataset, delta_timestamps, tol)
 
 
 def test_load_previous_and_future_frames_outside_tolerance_outside_episode_range():
-    data_dict = Dataset.from_dict({
+    hf_dataset = Dataset.from_dict({
         "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5],
         "index": [0, 1, 2, 3, 4],
         "episode_data_index_from": [0, 0, 0, 0, 0],
         "episode_data_index_to": [5, 5, 5, 5, 5],
     })
-    data_dict = data_dict.with_format("torch")
-    item = data_dict[2]
+    hf_dataset = hf_dataset.with_format("torch")
+    item = hf_dataset[2]
     delta_timestamps = {"index": [-0.3, -0.24, 0, 0.26, 0.3]}
     tol = 0.04
-    item = load_previous_and_future_frames(item, data_dict, delta_timestamps, tol)
+    item = load_previous_and_future_frames(item, hf_dataset, delta_timestamps, tol)
     data, is_pad = item["index"], item["index_is_pad"]
     assert torch.equal(data, torch.tensor([0, 0, 2, 4, 4])), "Data does not match expected values"
     assert torch.equal(is_pad, torch.tensor([True, False, False, True, True])), "Padding does not match expected values"
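The tolerance semantics these three tests pin down fit in a few lines. A sketch of the expected behavior (not the actual `load_previous_and_future_frames` implementation), using the numbers from the last test:

import torch

timestamps = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5])  # frames at 10 fps
queries = 0.3 + torch.tensor([-0.3, -0.24, 0, 0.26, 0.3])  # item 2 + deltas
tol = 0.04

# Take the nearest frame per query; queries farther than `tol` from every
# frame fall outside the episode range and are flagged as padding.
dist = torch.abs(queries[:, None] - timestamps[None, :])
min_dist, nearest = dist.min(dim=1)
is_pad = min_dist > tol

assert torch.equal(nearest, torch.tensor([0, 0, 2, 4, 4]))
assert torch.equal(is_pad, torch.tensor([True, False, False, True, True]))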

View File

@@ -1,4 +1,5 @@
 from pathlib import Path
+import subprocess
 
 
 def _find_and_replace(text: str, finds_and_replaces: list[tuple[str, str]]) -> str:
@@ -8,23 +9,29 @@ def _find_and_replace(text: str, finds_and_replaces: list[tuple[str, str]]) -> str:
     return text
 
 
+def _run_script(path):
+    subprocess.run(['python', path], check=True)
+
+
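Compared with the previous `exec`-based approach, each example now runs in a fresh interpreter, so module-level state cannot leak between tests, and `check=True` turns a non-zero exit code into a `CalledProcessError` that fails the test. Usage, as in the tests below:

_run_script("examples/2_load_lerobot_dataset.py")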
 def test_example_1():
-    path = "examples/1_visualize_dataset.py"
-    with open(path, "r") as file:
-        file_contents = file.read()
-    exec(file_contents)
-    assert Path("outputs/visualize_dataset/example/episode_0.mp4").exists()
+    path = "examples/1_load_hugging_face_dataset.py"
+    _run_script(path)
+    assert Path("outputs/examples/1_load_hugging_face_dataset/episode_5.mp4").exists()
 
 
-def test_examples_3_and_2():
+def test_example_2():
+    path = "examples/2_load_lerobot_dataset.py"
+    _run_script(path)
+    assert Path("outputs/examples/2_load_lerobot_dataset/episode_5.mp4").exists()
+
+
+def test_examples_4_and_3():
     """
     Train a model with example 3, check the outputs.
     Evaluate the trained model with example 2, check the outputs.
     """
-    path = "examples/3_train_policy.py"
+    path = "examples/4_train_policy.py"
 
     with open(path, "r") as file:
         file_contents = file.read()
@@ -46,7 +53,7 @@ def test_examples_3_and_2():
     for file_name in ["model.pt", "stats.pth", "config.yaml"]:
         assert Path(f"outputs/train/example_pusht_diffusion/{file_name}").exists()
 
-    path = "examples/2_evaluate_pretrained_policy.py"
+    path = "examples/3_evaluate_pretrained_policy.py"
 
     with open(path, "r") as file:
         file_contents = file.read()
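The body of `_find_and_replace` lies outside these hunks. A plausible sketch, consistent with its signature and with how the test patches an example before `exec`-ing it (the find/replace pair and variable name are hypothetical):

from pathlib import Path

def _find_and_replace(text: str, finds_and_replaces: list[tuple[str, str]]) -> str:
    for find, replace in finds_and_replaces:
        assert find in text, f"{find} not found"
        text = text.replace(find, replace)
    return text

# Hypothetical patch: shrink the training run so the example finishes quickly.
file_contents = Path("examples/4_train_policy.py").read_text()
file_contents = _find_and_replace(
    file_contents,
    [("training_steps = 5000", "training_steps = 1")],  # assumed variable name
)
exec(file_contents)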