From 299effe0f16622bae1593b3de2c772d685c96363 Mon Sep 17 00:00:00 2001
From: Adil Zouitine <adilzouitinegm@gmail.com>
Date: Thu, 24 Apr 2025 10:18:03 +0200
Subject: [PATCH] [HIL-SERL] Update CI to allow installation of prerelease
 versions for lerobot (#1018)

Co-authored-by: imstevenpmwork <steven.palma@huggingface.co>
---
 .github/workflows/test.yml                    |   2 +-
 lerobot/common/datasets/lerobot_dataset.py    |  11 +-
 lerobot/common/envs/utils.py                  |  45 ++-
 .../common/robot_devices/control_configs.py   |   2 +-
 lerobot/common/robot_devices/control_utils.py |  14 +-
 lerobot/scripts/control_robot.py              |   2 +-
 lerobot/scripts/control_sim_robot.py          |  15 +-
 pyproject.toml                                |   5 +-
 .../classifier/test_modelling_classifier.py   |  60 +++-
 tests/test_train_hilserl_classifier.py        | 310 ------------------
 10 files changed, 78 insertions(+), 388 deletions(-)
 delete mode 100644 tests/test_train_hilserl_classifier.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7de51ce5..d91c5364 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -101,7 +101,7 @@ jobs:
           python-version: "3.10"
 
       - name: Install lerobot
-        run: uv sync --extra "test" --prerelease=allow
+        run: uv sync --extra "test"
 
       - name: Test with pytest
         run: |
diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py
index 42d7a3fc..d8da85d6 100644
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -318,7 +318,7 @@ class LeRobotDatasetMetadata:
         obj.root.mkdir(parents=True, exist_ok=False)
 
         if robot is not None:
-            features = {**(features or {}), **get_features_from_robot(robot)}
+            features = get_features_from_robot(robot, use_videos)
             robot_type = robot.robot_type
             if not all(cam.fps == fps for cam in robot.cameras.values()):
                 logging.warning(
@@ -821,9 +821,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
 
             if self.features[key]["dtype"] in ["image", "video"]:
                 img_path = self._get_image_file_path(
-                    episode_index=self.episode_buffer["episode_index"],
-                    image_key=key,
-                    frame_index=frame_index,
+                    episode_index=self.episode_buffer["episode_index"], image_key=key, frame_index=frame_index
                 )
                 if frame_index == 0:
                     img_path.parent.mkdir(parents=True, exist_ok=True)
@@ -869,10 +867,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         for key, ft in self.features.items():
             # index, episode_index, task_index are already processed above, and image and video
             # are processed separately by storing image path and frame info as meta data
-            if key in ["index", "episode_index", "task_index"] or ft["dtype"] in [
-                "image",
-                "video",
-            ]:
+            if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]:
                 continue
             episode_buffer[key] = np.stack(episode_buffer[key])
 
diff --git a/lerobot/common/envs/utils.py b/lerobot/common/envs/utils.py
index b73e19ea..83334f87 100644
--- a/lerobot/common/envs/utils.py
+++ b/lerobot/common/envs/utils.py
@@ -37,35 +37,29 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten
     """
     # map to expected inputs for the policy
     return_observations = {}
-    # TODO: You have to merge all tensors from agent key and extra key
-    # You don't keep sensor param key in the observation
-    # And you keep sensor data rgb
-    for key, img in observations.items():
-        if "images" not in key:
-            continue
+    if "pixels" in observations:
+        if isinstance(observations["pixels"], dict):
+            imgs = {f"observation.images.{key}": img for key, img in observations["pixels"].items()}
+        else:
+            imgs = {"observation.image": observations["pixels"]}
 
-        # TODO(aliberts, rcadene): use transforms.ToTensor()?
-        if not torch.is_tensor(img):
+        for imgkey, img in imgs.items():
+            # TODO(aliberts, rcadene): use transforms.ToTensor()?
             img = torch.from_numpy(img)
 
-        if img.ndim == 3:
-            img = img.unsqueeze(0)
+            # sanity check that images are channel last
+            _, h, w, c = img.shape
+            assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}"
 
-        # sanity check that images are channel last
-        _, h, w, c = img.shape
-        assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}"
+            # sanity check that images are uint8
+            assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}"
 
-        # sanity check that images are uint8
-        assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}"
+            # convert to channel first of type float32 in range [0,1]
+            img = einops.rearrange(img, "b h w c -> b c h w").contiguous()
+            img = img.type(torch.float32)
+            img /= 255
 
-        # convert to channel first of type float32 in range [0,1]
-        img = einops.rearrange(img, "b h w c -> b c h w").contiguous()
-        img = img.type(torch.float32)
-        img /= 255
-
-        return_observations[key] = img
-        # obs state agent qpos and qvel
-        # image
+            return_observations[imgkey] = img
 
     if "environment_state" in observations:
         return_observations["observation.environment_state"] = torch.from_numpy(
@@ -74,8 +68,7 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten
 
     # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing
     # requirement for "agent_pos"
-    # return_observations["observation.state"] = torch.from_numpy(observations["agent_pos"]).float()
-    return_observations["observation.state"] = observations["observation.state"].float()
+    return_observations["observation.state"] = torch.from_numpy(observations["agent_pos"]).float()
     return return_observations
 
 
@@ -93,7 +86,7 @@ def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
         else:
             feature = ft
 
-        policy_key = env_cfg.features_map.get(key, key)
+        policy_key = env_cfg.features_map[key]
         policy_features[policy_key] = feature
 
     return policy_features
diff --git a/lerobot/common/robot_devices/control_configs.py b/lerobot/common/robot_devices/control_configs.py
index 00577ddb..aad17d7b 100644
--- a/lerobot/common/robot_devices/control_configs.py
+++ b/lerobot/common/robot_devices/control_configs.py
@@ -88,7 +88,7 @@ class RecordControlConfig(ControlConfig):
     # Resume recording on an existing dataset.
     resume: bool = False
     # Reset follower arms to an initial configuration.
-    reset_follower_arms: bool = True
+    reset_follower_arms: bool = False
 
     def __post_init__(self):
         # HACK: We parse again the cli args here to get the pretrained path if there was one.
diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py
index 525b006e..bef13143 100644
--- a/lerobot/common/robot_devices/control_utils.py
+++ b/lerobot/common/robot_devices/control_utils.py
@@ -129,22 +129,16 @@ def predict_action(observation, policy, device, use_amp):
     return action
 
 
-def init_keyboard_listener(assign_rewards=False):
+def init_keyboard_listener():
     """
     Initializes a keyboard listener to enable early termination of an episode
     or environment reset by pressing the right arrow key ('->'). This may require
     sudo permissions to allow the terminal to monitor keyboard events.
-
-    Args:
-        assign_rewards (bool): If True, allows annotating the collected trajectory
-        with a binary reward at the end of the episode to indicate success.
     """
     events = {}
     events["exit_early"] = False
     events["rerecord_episode"] = False
     events["stop_recording"] = False
-    if assign_rewards:
-        events["next.reward"] = 0
 
     if is_headless():
         logging.warning(
@@ -169,12 +163,6 @@ def init_keyboard_listener(assign_rewards=False):
                 print("Escape key pressed. Stopping data recording...")
                 events["stop_recording"] = True
                 events["exit_early"] = True
-            elif assign_rewards and key == keyboard.Key.space:
-                events["next.reward"] = 1 if events["next.reward"] == 0 else 0
-                print(
-                    "Space key pressed. Assigning new reward to the subsequent frames. New reward:",
-                    events["next.reward"],
-                )
 
         except Exception as e:
             print(f"Error handling key press: {e}")
diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py
index bf91d7db..658aefd5 100644
--- a/lerobot/scripts/control_robot.py
+++ b/lerobot/scripts/control_robot.py
@@ -276,7 +276,7 @@ def record(
 
     if not robot.is_connected:
         robot.connect()
-    listener, events = init_keyboard_listener(assign_rewards=cfg.assign_rewards)
+    listener, events = init_keyboard_listener()
 
     # Execute a few seconds without recording to:
     # 1. teleoperate the robot to move it in starting position if no policy provided,
diff --git a/lerobot/scripts/control_sim_robot.py b/lerobot/scripts/control_sim_robot.py
index 5e62b88d..c33c6e72 100644
--- a/lerobot/scripts/control_sim_robot.py
+++ b/lerobot/scripts/control_sim_robot.py
@@ -201,14 +201,9 @@ def record(
     resume: bool = False,
     local_files_only: bool = False,
     run_compute_stats: bool = True,
-    assign_rewards: bool = False,
 ) -> LeRobotDataset:
     # Load pretrained policy
 
-    extra_features = (
-        {"next.reward": {"dtype": "int64", "shape": (1,), "names": None}} if assign_rewards else None
-    )
-
     policy = None
     if pretrained_policy_name_or_path is not None:
         policy, policy_fps, device, use_amp = init_policy(pretrained_policy_name_or_path, policy_overrides)
@@ -221,7 +216,7 @@ def record(
         raise ValueError("Either policy or process_action_fn has to be set to enable control in sim.")
 
     # initialize listener before sim env
-    listener, events = init_keyboard_listener(assign_rewards=assign_rewards)
+    listener, events = init_keyboard_listener()
 
     # create sim env
     env = env()
@@ -269,7 +264,6 @@ def record(
             "shape": env.action_space.shape,
             "names": None,
         }
-        features = {**features, **extra_features}
 
         # Create empty dataset or load existing saved episodes
         sanity_check_dataset_name(repo_id, policy)
@@ -321,13 +315,6 @@ def record(
                 "timestamp": env_timestamp,
             }
 
-            # Overwrite environment reward with manually assigned reward
-            if assign_rewards:
-                frame["next.reward"] = events["next.reward"]
-
-                # Should success always be false to match what we do in control_utils?
-                frame["next.success"] = False
-
             for key in image_keys:
                 if not key.startswith("observation.image"):
                     frame["observation.image." + key] = observation[key]
diff --git a/pyproject.toml b/pyproject.toml
index eb82f338..e8ec5275 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ dependencies = [
     "datasets>=2.19.0",
     "deepdiff>=7.0.1",
     "diffusers>=0.27.2",
-    "draccus>=0.10.0",
+    "draccus==0.10.0",
     "einops>=0.8.0",
     "flask>=3.0.3",
     "gdown>=5.1.0",
@@ -70,7 +70,7 @@ dependencies = [
     "pyzmq>=26.2.1",
     "rerun-sdk>=0.21.0",
     "termcolor>=2.4.0",
-    "torch>=2.2.1",
+    "torch>=2.2.1,<=2.6.0",
     "torchcodec>=0.2.1; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')",
     "torchmetrics>=1.6.0",
     "torchvision>=0.21.0",
@@ -89,7 +89,6 @@ dynamixel = ["dynamixel-sdk>=3.7.31", "pynput>=1.7.7"]
 feetech = ["feetech-servo-sdk>=1.0.0", "pynput>=1.7.7"]
 hilserl = ["transformers>=4.48.0", "torchmetrics>=1.6.0"]
 intelrealsense = ["pyrealsense2>=2.55.1.6486 ; sys_platform != 'darwin'"]
-mani_skill = ["mani-skill==3.0.0b20"]
 pi0 = ["transformers>=4.48.0"]
 pusht = ["gym-pusht>=0.1.5 ; python_version < '4.0'"]
 stretch = [
diff --git a/tests/policies/hilserl/classifier/test_modelling_classifier.py b/tests/policies/hilserl/classifier/test_modelling_classifier.py
index e8223a52..53edb10e 100644
--- a/tests/policies/hilserl/classifier/test_modelling_classifier.py
+++ b/tests/policies/hilserl/classifier/test_modelling_classifier.py
@@ -4,6 +4,7 @@ from lerobot.common.policies.hilserl.classifier.modeling_classifier import (
     ClassifierConfig,
     ClassifierOutput,
 )
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
 from tests.utils import require_package
 
 
@@ -27,19 +28,39 @@ def test_binary_classifier_with_default_params():
     )
 
     config = ClassifierConfig()
+    config.input_features = {
+        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
+    }
+    config.output_features = {
+        "next.reward": PolicyFeature(type=FeatureType.REWARD, shape=(1,)),
+    }
+    config.normalization_mapping = {
+        "VISUAL": NormalizationMode.IDENTITY,
+        "REWARD": NormalizationMode.IDENTITY,
+    }
+    config.num_cameras = 1
     classifier = Classifier(config)
 
     batch_size = 10
 
-    input = torch.rand(batch_size, 3, 224, 224)
-    output = classifier(input)
+    input = {
+        "observation.image": torch.rand((batch_size, 3, 224, 224)),
+        "next.reward": torch.randint(low=0, high=2, size=(batch_size,)).float(),
+    }
+
+    images, labels = classifier.extract_images_and_labels(input)
+    assert len(images) == 1
+    assert images[0].shape == torch.Size([batch_size, 3, 224, 224])
+    assert labels.shape == torch.Size([batch_size])
+
+    output = classifier.predict(images)
 
     assert output is not None
-    assert output.logits.shape == torch.Size([batch_size])
+    assert output.logits.size() == torch.Size([batch_size])
     assert not torch.isnan(output.logits).any(), "Tensor contains NaN values"
     assert output.probabilities.shape == torch.Size([batch_size])
     assert not torch.isnan(output.probabilities).any(), "Tensor contains NaN values"
-    assert output.hidden_states.shape == torch.Size([batch_size, 2048])
+    assert output.hidden_states.shape == torch.Size([batch_size, 512])
     assert not torch.isnan(output.hidden_states).any(), "Tensor contains NaN values"
 
 
@@ -50,20 +71,37 @@ def test_multiclass_classifier():
     )
 
     num_classes = 5
-    config = ClassifierConfig(num_classes=num_classes)
+    config = ClassifierConfig()
+    config.input_features = {
+        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
+    }
+    config.output_features = {
+        "next.reward": PolicyFeature(type=FeatureType.REWARD, shape=(num_classes,)),
+    }
+    config.num_cameras = 1
+    config.num_classes = num_classes
     classifier = Classifier(config)
 
     batch_size = 10
 
-    input = torch.rand(batch_size, 3, 224, 224)
-    output = classifier(input)
+    input = {
+        "observation.image": torch.rand((batch_size, 3, 224, 224)),
+        "next.reward": torch.rand((batch_size, num_classes)),
+    }
+
+    images, labels = classifier.extract_images_and_labels(input)
+    assert len(images) == 1
+    assert images[0].shape == torch.Size([batch_size, 3, 224, 224])
+    assert labels.shape == torch.Size([batch_size, num_classes])
+
+    output = classifier.predict(images)
 
     assert output is not None
     assert output.logits.shape == torch.Size([batch_size, num_classes])
     assert not torch.isnan(output.logits).any(), "Tensor contains NaN values"
     assert output.probabilities.shape == torch.Size([batch_size, num_classes])
     assert not torch.isnan(output.probabilities).any(), "Tensor contains NaN values"
-    assert output.hidden_states.shape == torch.Size([batch_size, 2048])
+    assert output.hidden_states.shape == torch.Size([batch_size, 512])
     assert not torch.isnan(output.hidden_states).any(), "Tensor contains NaN values"
 
 
@@ -87,9 +125,9 @@ def test_explicit_device_setup():
         Classifier,
     )
 
-    config = ClassifierConfig(device="meta")
-    assert config.device == "meta"
+    config = ClassifierConfig(device="cpu")
+    assert config.device == "cpu"
 
     classifier = Classifier(config)
     for p in classifier.parameters():
-        assert p.device == torch.device("meta")
+        assert p.device == torch.device("cpu")
diff --git a/tests/test_train_hilserl_classifier.py b/tests/test_train_hilserl_classifier.py
deleted file mode 100644
index 58344913..00000000
--- a/tests/test_train_hilserl_classifier.py
+++ /dev/null
@@ -1,310 +0,0 @@
-import os
-import tempfile
-from pathlib import Path
-from unittest.mock import MagicMock, patch
-
-import pytest
-import torch
-from hydra import compose, initialize_config_dir
-from torch import nn
-from torch.utils.data import Dataset
-
-from lerobot.common.policies.hilserl.classifier.configuration_classifier import (
-    ClassifierConfig,
-)
-from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier
-from lerobot.scripts.train_hilserl_classifier import (
-    create_balanced_sampler,
-    train,
-    train_epoch,
-    validate,
-)
-
-
-class MockDataset(Dataset):
-    def __init__(self, data):
-        self.data = data
-        self.meta = MagicMock()
-        self.meta.stats = {}
-
-    def __getitem__(self, idx):
-        return self.data[idx]
-
-    def __len__(self):
-        return len(self.data)
-
-
-def make_dummy_model():
-    model_config = ClassifierConfig(
-        num_classes=2,
-        model_name="hf-tiny-model-private/tiny-random-ResNetModel",
-        num_cameras=1,
-    )
-    model = Classifier(config=model_config)
-    return model
-
-
-def test_create_balanced_sampler():
-    # Mock dataset with imbalanced classes
-    data = [
-        {"label": 0},
-        {"label": 0},
-        {"label": 1},
-        {"label": 0},
-        {"label": 1},
-        {"label": 1},
-        {"label": 1},
-        {"label": 1},
-    ]
-    dataset = MockDataset(data)
-    cfg = MagicMock()
-    cfg.training.label_key = "label"
-
-    sampler = create_balanced_sampler(dataset, cfg)
-
-    # Get weights from the sampler
-    weights = sampler.weights.float()
-
-    # Check that samples have appropriate weights
-    labels = [item["label"] for item in data]
-    class_counts = torch.tensor([labels.count(0), labels.count(1)], dtype=torch.float32)
-    class_weights = 1.0 / class_counts
-    expected_weights = torch.tensor([class_weights[label] for label in labels], dtype=torch.float32)
-
-    # Test that the weights are correct
-    assert torch.allclose(weights, expected_weights)
-
-
-def test_train_epoch():
-    model = make_dummy_model()
-    # Mock components
-    model.train = MagicMock()
-
-    train_loader = [
-        {
-            "image": torch.rand(2, 3, 224, 224),
-            "label": torch.tensor([0.0, 1.0]),
-        }
-    ]
-
-    criterion = nn.BCEWithLogitsLoss()
-    optimizer = MagicMock()
-    grad_scaler = MagicMock()
-    device = torch.device("cpu")
-    logger = MagicMock()
-    step = 0
-    cfg = MagicMock()
-    cfg.training.image_keys = ["image"]
-    cfg.training.label_key = "label"
-    cfg.training.use_amp = False
-
-    # Call the function under test
-    train_epoch(
-        model,
-        train_loader,
-        criterion,
-        optimizer,
-        grad_scaler,
-        device,
-        logger,
-        step,
-        cfg,
-    )
-
-    # Check that model.train() was called
-    model.train.assert_called_once()
-
-    # Check that optimizer.zero_grad() was called
-    optimizer.zero_grad.assert_called()
-
-    # Check that logger.log_dict was called
-    logger.log_dict.assert_called()
-
-
-def test_validate():
-    model = make_dummy_model()
-
-    # Mock components
-    model.eval = MagicMock()
-    val_loader = [
-        {
-            "image": torch.rand(2, 3, 224, 224),
-            "label": torch.tensor([0.0, 1.0]),
-        }
-    ]
-    criterion = nn.BCEWithLogitsLoss()
-    device = torch.device("cpu")
-    logger = MagicMock()
-    cfg = MagicMock()
-    cfg.training.image_keys = ["image"]
-    cfg.training.label_key = "label"
-    cfg.training.use_amp = False
-
-    # Call validate
-    accuracy, eval_info = validate(model, val_loader, criterion, device, logger, cfg)
-
-    # Check that model.eval() was called
-    model.eval.assert_called_once()
-
-    # Check accuracy/eval_info are calculated and of the correct type
-    assert isinstance(accuracy, float)
-    assert isinstance(eval_info, dict)
-
-
-def test_train_epoch_multiple_cameras():
-    model_config = ClassifierConfig(
-        num_classes=2,
-        model_name="hf-tiny-model-private/tiny-random-ResNetModel",
-        num_cameras=2,
-    )
-    model = Classifier(config=model_config)
-
-    # Mock components
-    model.train = MagicMock()
-
-    train_loader = [
-        {
-            "image_1": torch.rand(2, 3, 224, 224),
-            "image_2": torch.rand(2, 3, 224, 224),
-            "label": torch.tensor([0.0, 1.0]),
-        }
-    ]
-
-    criterion = nn.BCEWithLogitsLoss()
-    optimizer = MagicMock()
-    grad_scaler = MagicMock()
-    device = torch.device("cpu")
-    logger = MagicMock()
-    step = 0
-    cfg = MagicMock()
-    cfg.training.image_keys = ["image_1", "image_2"]
-    cfg.training.label_key = "label"
-    cfg.training.use_amp = False
-
-    # Call the function under test
-    train_epoch(
-        model,
-        train_loader,
-        criterion,
-        optimizer,
-        grad_scaler,
-        device,
-        logger,
-        step,
-        cfg,
-    )
-
-    # Check that model.train() was called
-    model.train.assert_called_once()
-
-    # Check that optimizer.zero_grad() was called
-    optimizer.zero_grad.assert_called()
-
-    # Check that logger.log_dict was called
-    logger.log_dict.assert_called()
-
-
-@pytest.mark.parametrize("resume", [True, False])
-@patch("lerobot.scripts.train_hilserl_classifier.init_hydra_config")
-@patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_checkpoint_dir")
-@patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_pretrained_model_dir")
-@patch("lerobot.scripts.train_hilserl_classifier.Logger")
-@patch("lerobot.scripts.train_hilserl_classifier.LeRobotDataset")
-@patch("lerobot.scripts.train_hilserl_classifier.get_model")
-def test_resume_function(
-    mock_get_model,
-    mock_dataset,
-    mock_logger,
-    mock_get_last_pretrained_model_dir,
-    mock_get_last_checkpoint_dir,
-    mock_init_hydra_config,
-    resume,
-):
-    # Initialize Hydra
-    test_file_dir = os.path.dirname(os.path.abspath(__file__))
-    config_dir = os.path.abspath(os.path.join(test_file_dir, "..", "lerobot", "configs", "policy"))
-    assert os.path.exists(config_dir), f"Config directory does not exist at {config_dir}"
-
-    with initialize_config_dir(config_dir=config_dir, job_name="test_app", version_base="1.2"):
-        cfg = compose(
-            config_name="hilserl_classifier",
-            overrides=[
-                "device=cpu",
-                "seed=42",
-                f"output_dir={tempfile.mkdtemp()}",
-                "wandb.enable=False",
-                f"resume={resume}",
-                "dataset_repo_id=dataset_repo_id",
-                "train_split_proportion=0.8",
-                "training.num_workers=0",
-                "training.batch_size=2",
-                "training.image_keys=[image]",
-                "training.label_key=label",
-                "training.use_amp=False",
-                "training.num_epochs=1",
-                "eval.batch_size=2",
-            ],
-        )
-
-    # Mock the init_hydra_config function to return cfg
-    mock_init_hydra_config.return_value = cfg
-
-    # Mock dataset
-    dataset = MockDataset([{"image": torch.rand(3, 224, 224), "label": i % 2} for i in range(10)])
-    mock_dataset.return_value = dataset
-
-    # Mock checkpoint handling
-    mock_checkpoint_dir = MagicMock(spec=Path)
-    mock_checkpoint_dir.exists.return_value = resume  # Only exists if resuming
-    mock_get_last_checkpoint_dir.return_value = mock_checkpoint_dir
-    mock_get_last_pretrained_model_dir.return_value = Path(tempfile.mkdtemp())
-
-    # Mock logger
-    logger = MagicMock()
-    resumed_step = 1000
-    if resume:
-        logger.load_last_training_state.return_value = resumed_step
-    else:
-        logger.load_last_training_state.return_value = 0
-    mock_logger.return_value = logger
-
-    # Instantiate the model and set make_policy to return it
-    model = make_dummy_model()
-    mock_get_model.return_value = model
-
-    # Call train
-    train(cfg)
-
-    # Check that checkpoint handling methods were called
-    if resume:
-        mock_get_last_checkpoint_dir.assert_called_once_with(Path(cfg.output_dir))
-        mock_get_last_pretrained_model_dir.assert_called_once_with(Path(cfg.output_dir))
-        mock_checkpoint_dir.exists.assert_called_once()
-        logger.load_last_training_state.assert_called_once()
-    else:
-        mock_get_last_checkpoint_dir.assert_not_called()
-        mock_get_last_pretrained_model_dir.assert_not_called()
-        mock_checkpoint_dir.exists.assert_not_called()
-        logger.load_last_training_state.assert_not_called()
-
-    # Collect the steps from logger.log_dict calls
-    train_log_calls = logger.log_dict.call_args_list
-
-    # Extract the steps used in the train logging
-    steps = []
-    for call in train_log_calls:
-        mode = call.kwargs.get("mode", call.args[2] if len(call.args) > 2 else None)
-        if mode == "train":
-            step = call.kwargs.get("step", call.args[1] if len(call.args) > 1 else None)
-            steps.append(step)
-
-    expected_start_step = resumed_step if resume else 0
-
-    # Calculate expected_steps
-    train_size = int(cfg.train_split_proportion * len(dataset))
-    batch_size = cfg.training.batch_size
-    num_batches = (train_size + batch_size - 1) // batch_size
-
-    expected_steps = [expected_start_step + i for i in range(num_batches)]
-
-    assert steps == expected_steps, f"Expected steps {expected_steps}, got {steps}"