Compare commits


2 Commits

Author        SHA1         Message    Date
Thomas Wolf   3c2dd1b881   fix test   2024-06-04 15:43:10 +02:00
Thomas Wolf   63a5d0be39   fix nans   2024-06-04 12:04:03 +02:00
11 changed files with 78 additions and 141 deletions

View File

@@ -1,18 +0,0 @@
on:
  push:
name: Secret Leaks
permissions:
  contents: read
jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main

View File

@@ -78,15 +78,29 @@ def load_from_raw(raw_dir: Path, out_dir: Path, fps: int):
    image_keys = [key for key in df if "observation.images." in key]
    num_unaligned_images = 0
    max_episode = 0
    def get_episode_index(row):
        nonlocal num_unaligned_images
        nonlocal max_episode
        episode_index_per_cam = {}
        for key in image_keys:
            if isinstance(row[key], float):
                num_unaligned_images += 1
                return float("nan")
            path = row[key][0]["path"]
            match = re.search(r"_(\d{6}).mp4", path)
            if not match:
                raise ValueError(path)
            episode_index = int(match.group(1))
            episode_index_per_cam[key] = episode_index
            if episode_index > max_episode:
                assert episode_index - max_episode == 1
                max_episode = episode_index
            else:
                assert episode_index == max_episode
        if len(set(episode_index_per_cam.values())) != 1:
            raise ValueError(
                f"All cameras are expected to belong to the same episode, but getting {episode_index_per_cam}"
@@ -111,11 +125,24 @@ def load_from_raw(raw_dir: Path, out_dir: Path, fps: int):
del df["timestamp_utc"]
# sanity check
has_nan = df.isna().any().any()
if has_nan:
raise ValueError("Dataset contains Nan values.")
num_rows_with_nan = df.isna().any(axis=1).sum()
assert (
num_rows_with_nan == num_unaligned_images
), f"Found {num_rows_with_nan} rows with NaN values but {num_unaligned_images} unaligned images."
if num_unaligned_images > max_episode * 2:
# We allow a few unaligned images, typically at the beginning and end of the episodes for instance
# but if there are too many, we raise an error to avoid large chunks of missing data
raise ValueError(
f"Found {num_unaligned_images} unaligned images out of {max_episode} episodes. "
f"Check the timestamps of the cameras."
)
# Drop rows with NaN values now that we double checked and convert episode_index to int
df = df.dropna()
df["episode_index"] = df["episode_index"].astype(int)
# sanity check episode indices go from 0 to n-1
assert df["episode_index"].max() == max_episode
ep_ids = [ep_idx for ep_idx, _ in df.groupby("episode_index")]
expected_ep_ids = list(range(df["episode_index"].max() + 1))
if ep_ids != expected_ep_ids:
@@ -214,8 +241,6 @@ def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=Tru
    if fps is None:
        fps = 30
    else:
        raise NotImplementedError()
    if not video:
        raise NotImplementedError()
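
The NaN handling in this file boils down to: count the rows where any camera frame is missing, tolerate only a handful of them per episode, then drop those rows and verify that the episode indices remain contiguous from 0 to n-1. A rough standalone sketch of the drop-and-check step with a toy table (the column values are made up):

```python
import numpy as np
import pandas as pd

# Toy frame table: a NaN episode_index stands for an unaligned camera image.
df = pd.DataFrame({"episode_index": [0.0, 0.0, np.nan, 1.0, 1.0]})

num_rows_with_nan = df.isna().any(axis=1).sum()  # 1 in this toy example

# Drop the unaligned rows, then cast the episode index back to int.
df = df.dropna()
df["episode_index"] = df["episode_index"].astype(int)

# Sanity check: episode indices should go from 0 to n-1 without gaps.
ep_ids = [ep_idx for ep_idx, _ in df.groupby("episode_index")]
expected_ep_ids = list(range(df["episode_index"].max() + 1))
assert ep_ids == expected_ep_ids, f"{ep_ids} != {expected_ep_ids}"
```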

View File

@@ -243,10 +243,11 @@ def load_previous_and_future_frames(
        is_pad = min_ > tolerance_s
        # check violated query timestamps are all outside the episode range
        assert ((query_ts[is_pad] < ep_first_ts) | (ep_last_ts < query_ts[is_pad])).all(), (
            f"One or several timestamps unexpectedly violate the tolerance ({min_} > {tolerance_s=}) inside episode range."
            "This might be due to synchronization issues with timestamps during data collection."
        )
        if not ((query_ts[is_pad] < ep_first_ts) | (ep_last_ts < query_ts[is_pad])).all():
            raise ValueError(
                f"One or several timestamps unexpectedly violate the tolerance ({min_} > {tolerance_s=}) inside episode range."
                "This might be due to synchronization issues with timestamps during data collection."
            )
        # get dataset indices corresponding to frames to be loaded
        data_ids = ep_data_ids[argmin_]
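
The assert-to-ValueError swap keeps the same tolerance logic: queries whose nearest frame is farther than `tolerance_s` are marked as padding, and that is only acceptable when the query falls before the first or after the last timestamp of the episode. A rough numpy sketch of that logic with made-up timestamps (not the library's actual implementation, which operates on torch tensors):

```python
import numpy as np

ep_ts = np.array([0.0, 0.1, 0.2, 0.3])  # made-up episode timestamps (s)
query_ts = np.array([-0.2, 0.1, 0.45])  # timestamps we want frames for
tolerance_s = 0.04

# Distance from every query to its nearest episode frame.
dist = np.abs(query_ts[:, None] - ep_ts[None, :])
argmin_ = dist.argmin(axis=1)
min_ = dist[np.arange(len(query_ts)), argmin_]

is_pad = min_ > tolerance_s  # queries with no frame within tolerance become padding

# Out-of-tolerance queries must lie outside the episode's time range.
ep_first_ts, ep_last_ts = ep_ts[0], ep_ts[-1]
if not ((query_ts[is_pad] < ep_first_ts) | (ep_last_ts < query_ts[is_pad])).all():
    raise ValueError("A timestamp violates the tolerance inside the episode range.")

print(is_pad)  # [ True False  True] -> only the queries outside [0.0, 0.3] are padded
```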

View File

@@ -189,7 +189,7 @@ class Logger:
training_state["scheduler"] = scheduler.state_dict()
torch.save(training_state, save_dir / self.training_state_file_name)
def save_checkpont(
def save_checkpoint(
self,
train_step: int,
policy: Policy,

View File

@@ -147,7 +147,7 @@ class Normalize(nn.Module):
                assert not torch.isinf(min).any(), _no_stats_error_str("min")
                assert not torch.isinf(max).any(), _no_stats_error_str("max")
                # normalize to [0,1]
                batch[key] = (batch[key] - min) / (max - min + 1e-8)
                batch[key] = (batch[key] - min) / (max - min)
                # normalize to [-1, 1]
                batch[key] = batch[key] * 2 - 1
            else:
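
The only difference between the two `min_max` lines above is the `1e-8` term in the denominator, which keeps the division finite when a feature is constant (`max` equal to `min`) instead of producing inf or NaN. A small standalone sketch of the normalization (a hypothetical helper, not the repository's `Normalize` module):

```python
import torch

def min_max_to_unit_range(x: torch.Tensor, min_: torch.Tensor, max_: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Map x from [min_, max_] to [-1, 1]; eps keeps the division finite when max_ == min_."""
    x = (x - min_) / (max_ - min_ + eps)  # normalize to [0, 1]
    return x * 2 - 1                      # normalize to [-1, 1]

x = torch.tensor([0.0, 5.0, 10.0])
print(min_max_to_unit_range(x, x.min(), x.max()))  # ~ tensor([-1., 0., 1.])
```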

View File

@@ -13,71 +13,39 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Use this script to get a quick summary of your system config.
It should be able to run without any of LeRobot's dependencies or LeRobot itself installed.
"""
import platform
HAS_HF_HUB = True
HAS_HF_DATASETS = True
HAS_NP = True
HAS_TORCH = True
HAS_LEROBOT = True
import huggingface_hub
try:
    import huggingface_hub
except ImportError:
    HAS_HF_HUB = False
# import dataset
import numpy as np
import torch
try:
    import datasets
except ImportError:
    HAS_HF_DATASETS = False
from lerobot import __version__ as version
try:
    import numpy as np
except ImportError:
    HAS_NP = False
try:
    import torch
except ImportError:
    HAS_TORCH = False
try:
    import lerobot
except ImportError:
    HAS_LEROBOT = False
lerobot_version = lerobot.__version__ if HAS_LEROBOT else "N/A"
hf_hub_version = huggingface_hub.__version__ if HAS_HF_HUB else "N/A"
hf_datasets_version = datasets.__version__ if HAS_HF_DATASETS else "N/A"
np_version = np.__version__ if HAS_NP else "N/A"
torch_version = torch.__version__ if HAS_TORCH else "N/A"
torch_cuda_available = torch.cuda.is_available() if HAS_TORCH else "N/A"
cuda_version = torch._C._cuda_getCompiledVersion() if HAS_TORCH and torch.version.cuda is not None else "N/A"
pt_version = torch.__version__
pt_cuda_available = torch.cuda.is_available()
pt_cuda_available = torch.cuda.is_available()
cuda_version = torch._C._cuda_getCompiledVersion() if torch.version.cuda is not None else "N/A"
# TODO(aliberts): refactor into an actual command `lerobot env`
def display_sys_info() -> dict:
"""Run this to get basic system info to help for tracking issues & bugs."""
info = {
"`lerobot` version": lerobot_version,
"`lerobot` version": version,
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Huggingface_hub version": hf_hub_version,
"Dataset version": hf_datasets_version,
"Numpy version": np_version,
"PyTorch version (GPU?)": f"{torch_version} ({torch_cuda_available})",
"Huggingface_hub version": huggingface_hub.__version__,
# TODO(aliberts): Add dataset when https://github.com/huggingface/lerobot/pull/73 is merged
# "Dataset version": dataset.__version__,
"Numpy version": np.__version__,
"PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
"Cuda version": cuda_version,
"Using GPU in script?": "<fill in>",
# "Using distributed or parallel set-up in script?": "<fill in>",
"Using distributed or parallel set-up in script?": "<fill in>",
}
print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the last point.\n")
print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
print(format_dict(info))
return info

View File

@@ -164,7 +164,10 @@ def rollout(
        # VectorEnv stores is_success in `info["final_info"][env_index]["is_success"]`. "final_info" isn't
        # available of none of the envs finished.
        if "final_info" in info:
            successes = [info["is_success"] if info is not None else False for info in info["final_info"]]
            successes = [
                info["is_success"] if info is not None and "is_success" in info else False
                for info in info["final_info"]
            ]
        else:
            successes = [False] * env.num_envs
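
The extra `"is_success" in info` guard matters because, as the comment above notes, vectorized envs only populate `final_info` when at least one sub-environment finished on that step, and the per-env entries can be None (or lack the key) for the others. A small self-contained sketch of the same selection logic with a made-up `info` dict:

```python
# Made-up `info` for three sub-envs where only the first one finished this step.
info = {"final_info": [{"is_success": True}, None, None]}
num_envs = 3

if "final_info" in info:
    successes = [
        fi["is_success"] if fi is not None and "is_success" in fi else False
        for fi in info["final_info"]
    ]
else:
    successes = [False] * num_envs

print(successes)  # [True, False, False]
```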
@@ -209,7 +212,7 @@ def eval_policy(
    policy: torch.nn.Module,
    n_episodes: int,
    max_episodes_rendered: int = 0,
    videos_dir: Path | None = None,
    video_dir: Path | None = None,
    return_episode_data: bool = False,
    start_seed: int | None = None,
    enable_progbar: bool = False,
@@ -221,7 +224,7 @@ def eval_policy(
        policy: The policy.
        n_episodes: The number of episodes to evaluate.
        max_episodes_rendered: Maximum number of episodes to render into videos.
        videos_dir: Where to save rendered videos.
        video_dir: Where to save rendered videos.
        return_episode_data: Whether to return episode data for online training. Incorporates the data into
            the "episodes" key of the returned dictionary.
        start_seed: The first seed to use for the first individual rollout. For all subsequent rollouts the
@@ -347,8 +350,8 @@ def eval_policy(
            ):
                if n_episodes_rendered >= max_episodes_rendered:
                    break
                videos_dir.mkdir(parents=True, exist_ok=True)
                video_path = videos_dir / f"eval_episode_{n_episodes_rendered}.mp4"
                video_dir.mkdir(parents=True, exist_ok=True)
                video_path = video_dir / f"eval_episode_{n_episodes_rendered}.mp4"
                video_paths.append(str(video_path))
                thread = threading.Thread(
                    target=write_video,
@@ -503,10 +506,9 @@ def _compile_episode_data(
    }
def main(
def eval(
    pretrained_policy_path: str | None = None,
    hydra_cfg_path: str | None = None,
    out_dir: str | None = None,
    config_overrides: list[str] | None = None,
):
    assert (pretrained_policy_path is None) ^ (hydra_cfg_path is None)
@@ -514,8 +516,12 @@ def main(
        hydra_cfg = init_hydra_config(pretrained_policy_path / "config.yaml", config_overrides)
    else:
        hydra_cfg = init_hydra_config(hydra_cfg_path, config_overrides)
    out_dir = (
        f"outputs/eval/{dt.now().strftime('%Y-%m-%d/%H-%M-%S')}_{hydra_cfg.env.name}_{hydra_cfg.policy.name}"
    )
    if out_dir is None:
        out_dir = f"outputs/eval/{dt.now().strftime('%Y-%m-%d/%H-%M-%S')}_{hydra_cfg.env.name}_{hydra_cfg.policy.name}"
        raise NotImplementedError()
    # Check device is available
    device = get_safe_torch_device(hydra_cfg.device, log=True)
@@ -543,7 +549,7 @@ def main(
            policy,
            hydra_cfg.eval.n_episodes,
            max_episodes_rendered=10,
            videos_dir=Path(out_dir) / "videos",
            video_dir=Path(out_dir) / "eval",
            start_seed=hydra_cfg.seed,
            enable_progbar=True,
            enable_inner_progbar=True,
@@ -583,13 +589,6 @@ if __name__ == "__main__":
        ),
    )
    parser.add_argument("--revision", help="Optionally provide the Hugging Face Hub revision ID.")
    parser.add_argument(
        "--out-dir",
        help=(
            "Where to save the evaluation outputs. If not provided, outputs are saved in "
            "outputs/eval/{timestamp}_{env_name}_{policy_name}"
        ),
    )
    parser.add_argument(
        "overrides",
        nargs="*",
@@ -598,7 +597,7 @@ if __name__ == "__main__":
    args = parser.parse_args()
    if args.pretrained_policy_name_or_path is None:
        main(hydra_cfg_path=args.config, out_dir=args.out_dir, config_overrides=args.overrides)
        eval(hydra_cfg_path=args.config, config_overrides=args.overrides)
    else:
        try:
            pretrained_policy_path = Path(
@@ -622,8 +621,4 @@ if __name__ == "__main__":
"repo ID, nor is it an existing local directory."
)
main(
pretrained_policy_path=pretrained_policy_path,
out_dir=args.out_dir,
config_overrides=args.overrides,
)
eval(pretrained_policy_path=pretrained_policy_path, config_overrides=args.overrides)

View File

@@ -150,7 +150,6 @@ def log_train_info(logger: Logger, info, step, cfg, dataset, is_offline):
    grad_norm = info["grad_norm"]
    lr = info["lr"]
    update_s = info["update_s"]
    dataloading_s = info["dataloading_s"]
    # A sample is an (observation,action) pair, where observation and action
    # can be on multiple timestamps. In a batch, we have `batch_size`` number of samples.
@@ -171,7 +170,6 @@ def log_train_info(logger: Logger, info, step, cfg, dataset, is_offline):
f"lr:{lr:0.1e}",
# in seconds
f"updt_s:{update_s:.3f}",
f"data_s:{dataloading_s:.3f}", # if not ~0, you are bottlenecked by cpu or io
]
logging.info(" ".join(log_items))
@@ -327,9 +325,6 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
    # Note: this helper will be used in offline and online training loops.
    def evaluate_and_checkpoint_if_needed(step):
        _num_digits = max(6, len(str(cfg.training.offline_steps + cfg.training.online_steps)))
        step_identifier = f"{step:0{_num_digits}d}"
        if cfg.training.eval_freq > 0 and step % cfg.training.eval_freq == 0:
            logging.info(f"Eval policy at step {step}")
            with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.use_amp else nullcontext():
@@ -337,7 +332,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
                    eval_env,
                    policy,
                    cfg.eval.n_episodes,
                    videos_dir=Path(out_dir) / "eval" / f"videos_step_{step_identifier}",
                    video_dir=Path(out_dir) / "eval",
                    max_episodes_rendered=4,
                    start_seed=cfg.seed,
                )
@@ -350,12 +345,14 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
logging.info(f"Checkpoint policy after step {step}")
# Note: Save with step as the identifier, and format it to have at least 6 digits but more if
# needed (choose 6 as a minimum for consistency without being overkill).
logger.save_checkpont(
logger.save_checkpoint(
step,
policy,
optimizer,
lr_scheduler,
identifier=step_identifier,
identifier=str(step).zfill(
max(6, len(str(cfg.training.offline_steps + cfg.training.online_steps)))
),
)
logging.info("Resume training")
@@ -385,10 +382,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
    for _ in range(step, cfg.training.offline_steps):
        if step == 0:
            logging.info("Start offline training on a fixed dataset")
        start_time = time.perf_counter()
        batch = next(dl_iter)
        dataloading_s = time.perf_counter() - start_time
        for key in batch:
            batch[key] = batch[key].to(device, non_blocking=True)
@@ -403,8 +397,6 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
            use_amp=cfg.use_amp,
        )
        train_info["dataloading_s"] = dataloading_s
        if step % cfg.training.log_freq == 0:
            log_train_info(logger, train_info, step, cfg, offline_dataset, is_offline=True)
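
The `dataloading_s` metric wired through these hunks simply times how long the training loop blocks on `next(dl_iter)`; per the log comment above, a value that is not close to zero means the GPU is waiting on CPU or IO. A minimal sketch of that timing pattern outside the training script (the `dl_iter` name is assumed to be an iterator over a torch DataLoader):

```python
import time

def timed_next(dl_iter):
    """Fetch the next batch and measure how long we waited on the dataloader."""
    start_time = time.perf_counter()
    batch = next(dl_iter)
    dataloading_s = time.perf_counter() - start_time
    return batch, dataloading_s

# Usage sketch:
#   batch, dataloading_s = timed_next(dl_iter)
#   train_info["dataloading_s"] = dataloading_s  # if not ~0, you are bottlenecked by cpu or io
```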

View File

@@ -106,7 +106,6 @@ def visualize_dataset(
    ws_port: int = 9087,
    save: bool = False,
    output_dir: Path | None = None,
    root: Path | None = None,
) -> Path | None:
    if save:
        assert (
@@ -114,7 +113,7 @@ def visualize_dataset(
), "Set an output directory where to write .rrd files with `--output-dir path/to/directory`."
logging.info("Loading dataset")
dataset = LeRobotDataset(repo_id, root=root)
dataset = LeRobotDataset(repo_id)
logging.info("Loading dataloader")
episode_sampler = EpisodeSampler(dataset, episode_index)
@@ -257,12 +256,6 @@ def main():
help="Directory path to write a .rrd file when `--save 1` is set.",
)
parser.add_argument(
"--root",
type=str,
help="Root directory for a dataset stored on a local machine.",
)
args = parser.parse_args()
visualize_dataset(**vars(args))

View File

@@ -244,7 +244,7 @@ def test_load_previous_and_future_frames_outside_tolerance_inside_episode_range(
    delta_timestamps = {"index": [-0.2, 0, 0.141]}
    tol = 0.04
    item = hf_dataset[2]
    with pytest.raises(AssertionError):
    with pytest.raises(ValueError):
        load_previous_and_future_frames(item, hf_dataset, episode_data_index, delta_timestamps, tol)

View File

@@ -13,8 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import pytest
from lerobot.scripts.visualize_dataset import visualize_dataset
@@ -33,20 +31,3 @@ def test_visualize_dataset(tmpdir, repo_id):
        output_dir=tmpdir,
    )
    assert rrd_path.exists()
@pytest.mark.parametrize(
    "repo_id",
    ["lerobot/pusht"],
)
@pytest.mark.parametrize("root", [Path(__file__).parent / "data"])
def test_visualize_local_dataset(tmpdir, repo_id, root):
    rrd_path = visualize_dataset(
        repo_id,
        episode_index=0,
        batch_size=32,
        save=True,
        output_dir=tmpdir,
        root=root,
    )
    assert rrd_path.exists()