Improve dataset examples (#82)
Co-authored-by: Alexander Soare <alexander.soare159@gmail.com>
@@ -1,63 +0,0 @@
-"""
-This script is designed to facilitate the creation of a subset of an existing dataset by selecting a specific number of frames from the original dataset.
-This subset can then be used for running quick unit tests.
-The script takes an input directory containing the original dataset and an output directory where the subset of the dataset will be saved.
-Additionally, the number of frames to include in the subset can be specified.
-The script ensures that the subset is a representative sample of the original dataset by copying the specified number of frames and retaining the structure and format of the data.
-
-Usage:
-Run the script with the following command, specifying the path to the input data directory,
-the path to the output data directory, and optionally the number of frames to include in the subset dataset:
-
-`python tests/scripts/mock_dataset.py --in-data-dir path/to/input_data --out-data-dir path/to/output_data`
-
-Example:
-`python tests/scripts/mock_dataset.py --in-data-dir data/pusht --out-data-dir tests/data/pusht`
-"""
-
-import argparse
-import shutil
-
-from pathlib import Path
-
-import torch
-
-
-def mock_dataset(in_data_dir, out_data_dir, num_frames):
-    in_data_dir = Path(in_data_dir)
-    out_data_dir = Path(out_data_dir)
-    out_data_dir.mkdir(exist_ok=True, parents=True)
-
-    # copy the first `n` frames for each data key so that we have real data
-    in_data_dict = torch.load(in_data_dir / "data_dict.pth")
-    out_data_dict = {key: in_data_dict[key][:num_frames].clone() for key in in_data_dict}
-    torch.save(out_data_dict, out_data_dir / "data_dict.pth")
-
-    # recreate data_ids_per_episode that corresponds to the subset
-    episodes = in_data_dict["episode"][:num_frames].tolist()
-    data_ids_per_episode = {}
-    for idx, ep_id in enumerate(episodes):
-        if ep_id not in data_ids_per_episode:
-            data_ids_per_episode[ep_id] = []
-        data_ids_per_episode[ep_id].append(idx)
-    for ep_id in data_ids_per_episode:
-        data_ids_per_episode[ep_id] = torch.tensor(data_ids_per_episode[ep_id])
-    torch.save(data_ids_per_episode, out_data_dir / "data_ids_per_episode.pth")
-
-    # copy the full statistics of dataset since it's small
-    in_stats_path = in_data_dir / "stats.pth"
-    out_stats_path = out_data_dir / "stats.pth"
-    shutil.copy(in_stats_path, out_stats_path)
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(description="Create a dataset with a subset of frames for quick testing.")
-
-    parser.add_argument("--in-data-dir", type=str, help="Path to input data")
-    parser.add_argument("--out-data-dir", type=str, help="Path to save the output data")
-    parser.add_argument("--num-frames", type=int, default=50, help="Number of frames to copy over")
-
-    args = parser.parse_args()
-
-    mock_dataset(args.in_data_dir, args.out_data_dir, args.num_frames)
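For reference, the deleted helper could also be called directly from Python rather than through the CLI. A minimal sketch, assuming `tests/scripts` is importable as a package (a packaging assumption, not something shown in this diff):

```python
# Hypothetical programmatic use, mirroring the docstring's CLI example.
from tests.scripts.mock_dataset import mock_dataset

# Copy the first 50 frames (the argparse default) of pusht into the test fixtures.
mock_dataset("data/pusht", "tests/data/pusht", num_frames=50)
```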
@@ -50,7 +50,7 @@ def test_factory(env_name, dataset_id, policy_name):
         keys_ndim_required.append(
             (key, 3, True),
         )
-        assert dataset.data_dict[key].dtype == torch.uint8, f"{key}"
+        assert dataset.hf_dataset[key].dtype == torch.uint8, f"{key}"
 
     # test number of dimensions
     for key, ndim, required in keys_ndim_required:
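The body of the dimension-check loop sits outside this hunk. One plausible reading, given the `(key, 3, True)` tuples and the uint8 assertion above, is that image frames are stored channel-first and checked per row; a hypothetical sketch (every name beyond those in the hunk is assumed):

```python
# Hypothetical sketch of the check driven by keys_ndim_required: a first-row
# image tensor from the torch-formatted dataset would have 3 dims (c, h, w).
for key, ndim, required in keys_ndim_required:
    if required:
        assert dataset.hf_dataset[0][key].ndim == ndim, f"{key}"
```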
@@ -121,16 +121,16 @@ def test_compute_stats():
         batch_size=len(dataset),
         shuffle=False,
     )
-    data_dict = next(iter(dataloader))
+    hf_dataset = next(iter(dataloader))
 
     # compute stats based on all frames from the dataset without any batching
     expected_stats = {}
     for k, pattern in stats_patterns.items():
         expected_stats[k] = {}
-        expected_stats[k]["mean"] = einops.reduce(data_dict[k], pattern, "mean")
-        expected_stats[k]["std"] = torch.sqrt(einops.reduce((data_dict[k] - expected_stats[k]["mean"]) ** 2, pattern, "mean"))
-        expected_stats[k]["min"] = einops.reduce(data_dict[k], pattern, "min")
-        expected_stats[k]["max"] = einops.reduce(data_dict[k], pattern, "max")
+        expected_stats[k]["mean"] = einops.reduce(hf_dataset[k], pattern, "mean")
+        expected_stats[k]["std"] = torch.sqrt(einops.reduce((hf_dataset[k] - expected_stats[k]["mean"]) ** 2, pattern, "mean"))
+        expected_stats[k]["min"] = einops.reduce(hf_dataset[k], pattern, "min")
+        expected_stats[k]["max"] = einops.reduce(hf_dataset[k], pattern, "max")
 
     # test computed stats match expected stats
    for k in stats_patterns:
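`stats_patterns` itself is defined earlier in the test and not shown in this hunk. As a hedged illustration, a pattern such as `"b c h w -> c 1 1"` (an assumed image pattern, not taken from this diff) reduces over the batch and spatial axes and keeps a per-channel statistic:

```python
import einops
import torch

# Assumed example: per-channel mean/std over batch and spatial dimensions.
images = torch.rand(8, 3, 96, 96)  # b c h w
mean = einops.reduce(images, "b c h w -> c 1 1", "mean")
std = torch.sqrt(einops.reduce((images - mean) ** 2, "b c h w -> c 1 1", "mean"))
print(mean.shape, std.shape)  # torch.Size([3, 1, 1]) torch.Size([3, 1, 1])
```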
@@ -153,47 +153,47 @@ def test_compute_stats():
 
 
 def test_load_previous_and_future_frames_within_tolerance():
-    data_dict = Dataset.from_dict({
+    hf_dataset = Dataset.from_dict({
         "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5],
         "index": [0, 1, 2, 3, 4],
         "episode_data_index_from": [0, 0, 0, 0, 0],
         "episode_data_index_to": [5, 5, 5, 5, 5],
     })
-    data_dict = data_dict.with_format("torch")
-    item = data_dict[2]
+    hf_dataset = hf_dataset.with_format("torch")
+    item = hf_dataset[2]
     delta_timestamps = {"index": [-0.2, 0, 0.139]}
     tol = 0.04
-    item = load_previous_and_future_frames(item, data_dict, delta_timestamps, tol)
+    item = load_previous_and_future_frames(item, hf_dataset, delta_timestamps, tol)
     data, is_pad = item["index"], item["index_is_pad"]
     assert torch.equal(data, torch.tensor([0, 2, 3])), "Data does not match expected values"
     assert not is_pad.any(), "Unexpected padding detected"
 
 def test_load_previous_and_future_frames_outside_tolerance_inside_episode_range():
-    data_dict = Dataset.from_dict({
+    hf_dataset = Dataset.from_dict({
         "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5],
         "index": [0, 1, 2, 3, 4],
         "episode_data_index_from": [0, 0, 0, 0, 0],
         "episode_data_index_to": [5, 5, 5, 5, 5],
     })
-    data_dict = data_dict.with_format("torch")
-    item = data_dict[2]
+    hf_dataset = hf_dataset.with_format("torch")
+    item = hf_dataset[2]
     delta_timestamps = {"index": [-0.2, 0, 0.141]}
     tol = 0.04
     with pytest.raises(AssertionError):
-        load_previous_and_future_frames(item, data_dict, delta_timestamps, tol)
+        load_previous_and_future_frames(item, hf_dataset, delta_timestamps, tol)
 
 def test_load_previous_and_future_frames_outside_tolerance_outside_episode_range():
-    data_dict = Dataset.from_dict({
+    hf_dataset = Dataset.from_dict({
         "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5],
         "index": [0, 1, 2, 3, 4],
         "episode_data_index_from": [0, 0, 0, 0, 0],
         "episode_data_index_to": [5, 5, 5, 5, 5],
     })
-    data_dict = data_dict.with_format("torch")
-    item = data_dict[2]
+    hf_dataset = hf_dataset.with_format("torch")
+    item = hf_dataset[2]
     delta_timestamps = {"index": [-0.3, -0.24, 0, 0.26, 0.3]}
     tol = 0.04
-    item = load_previous_and_future_frames(item, data_dict, delta_timestamps, tol)
+    item = load_previous_and_future_frames(item, hf_dataset, delta_timestamps, tol)
     data, is_pad = item["index"], item["index_is_pad"]
     assert torch.equal(data, torch.tensor([0, 0, 2, 4, 4])), "Data does not match expected values"
     assert torch.equal(is_pad, torch.tensor([True, False, False, True, True])), "Padding does not match expected values"
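To see where `[0, 0, 2, 4, 4]` comes from: the queried item at index 2 has timestamp 0.3, so the deltas `[-0.3, -0.24, 0, 0.26, 0.3]` map to query times `[0.0, 0.06, 0.3, 0.56, 0.6]`. Each query snaps to the nearest available timestamp; when that nearest frame is farther away than `tol` and the query falls outside the episode's time range, the value is clamped to the boundary frame and flagged as padding. A minimal sketch of that nearest-frame logic (an illustration of what the tests exercise, not the actual `load_previous_and_future_frames` implementation):

```python
import torch

timestamps = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5])
current_ts = timestamps[2]  # 0.3, the timestamp of the queried item
delta = torch.tensor([-0.3, -0.24, 0.0, 0.26, 0.3])
query_ts = current_ts + delta  # [0.0, 0.06, 0.3, 0.56, 0.6]

# distance from each query time to every available frame timestamp
dist = torch.abs(query_ts[:, None] - timestamps[None, :])
min_dist, nearest = dist.min(dim=1)

tol = 0.04
is_pad = min_dist > tol  # pad when no frame is close enough
print(nearest)  # tensor([0, 0, 2, 4, 4])
print(is_pad)   # tensor([ True, False, False,  True,  True])
```

The same arithmetic explains the first two tests: a delta of 0.139 queries 0.439, which lands 0.039 from frame 3 (inside `tol`), while 0.141 queries 0.441, landing 0.041 away; since that query is still inside the episode's time range, the mismatch raises an AssertionError instead of padding.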
@@ -1,4 +1,5 @@
 from pathlib import Path
+import subprocess
 
 
 def _find_and_replace(text: str, finds_and_replaces: list[tuple[str, str]]) -> str:
@@ -8,23 +9,29 @@ def _find_and_replace(text: str, finds_and_replaces: list[tuple[str, str]]) -> str:
     return text
 
 
+def _run_script(path):
+    subprocess.run(['python', path], check=True)
+
+
 def test_example_1():
-    path = "examples/1_visualize_dataset.py"
-
-    with open(path, "r") as file:
-        file_contents = file.read()
-    exec(file_contents)
-
-    assert Path("outputs/visualize_dataset/example/episode_0.mp4").exists()
+    path = "examples/1_load_hugging_face_dataset.py"
+    _run_script(path)
+    assert Path("outputs/examples/1_load_hugging_face_dataset/episode_5.mp4").exists()
 
 
-def test_examples_3_and_2():
+def test_example_2():
+    path = "examples/2_load_lerobot_dataset.py"
+    _run_script(path)
+    assert Path("outputs/examples/2_load_lerobot_dataset/episode_5.mp4").exists()
+
+
+def test_examples_4_and_3():
     """
     Train a model with example 3, check the outputs.
     Evaluate the trained model with example 2, check the outputs.
     """
 
-    path = "examples/3_train_policy.py"
+    path = "examples/4_train_policy.py"
 
     with open(path, "r") as file:
         file_contents = file.read()
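The new `_run_script` helper leans on standard `subprocess` semantics: with `check=True`, a non-zero exit status raises `subprocess.CalledProcessError`, so a crashing example script fails its test instead of passing silently. For instance, reusing a path from this diff:

```python
import subprocess

# check=True turns a non-zero exit code into subprocess.CalledProcessError.
subprocess.run(["python", "examples/2_load_lerobot_dataset.py"], check=True)
```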
@@ -46,7 +53,7 @@ def test_examples_3_and_2():
     for file_name in ["model.pt", "stats.pth", "config.yaml"]:
         assert Path(f"outputs/train/example_pusht_diffusion/{file_name}").exists()
 
-    path = "examples/2_evaluate_pretrained_policy.py"
+    path = "examples/3_evaluate_pretrained_policy.py"
 
     with open(path, "r") as file:
         file_contents = file.read()