From 3b24ad3c849f7b5de3b58cad653c6197321b765f Mon Sep 17 00:00:00 2001
From: Michel Aractingi
Date: Tue, 15 Apr 2025 18:12:21 +0200
Subject: [PATCH] Fixes for the reward classifier

---
 lerobot/common/envs/configs.py                    | 14 +----
 .../classifier/configuration_classifier.py        | 22 ++++---
 .../hilserl/classifier/modeling_classifier.py     | 39 +++++--------
 lerobot/scripts/server/gym_manipulator.py         | 58 +++++++++++--------
 4 files changed, 64 insertions(+), 69 deletions(-)

diff --git a/lerobot/common/envs/configs.py b/lerobot/common/envs/configs.py
index 77220c3c..3b493dd1 100644
--- a/lerobot/common/envs/configs.py
+++ b/lerobot/common/envs/configs.py
@@ -225,12 +225,7 @@ class HILSerlRobotEnvConfig(EnvConfig):
     device: str = "cuda"
     push_to_hub: bool = True
     pretrained_policy_name_or_path: Optional[str] = None
-    reward_classifier: dict[str, str | None] = field(
-        default_factory=lambda: {
-            "pretrained_path": None,
-            "config_path": None,
-        }
-    )
+    reward_classifier_pretrained_path: Optional[str] = None
 
     def gym_kwargs(self) -> dict:
         return {}
@@ -271,12 +266,7 @@ class ManiskillEnvConfig(EnvConfig):
             "observation.state": OBS_ROBOT,
         }
     )
-    reward_classifier: dict[str, str | None] = field(
-        default_factory=lambda: {
-            "pretrained_path": None,
-            "config_path": None,
-        }
-    )
+    reward_classifier_pretrained_path: Optional[str] = None
 
     @property
     def gym_kwargs(self) -> dict:
diff --git a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py
index d04c189b..4c006103 100644
--- a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py
+++ b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py
@@ -20,10 +20,13 @@ class ClassifierConfig(PreTrainedConfig):
     model_type: str = "cnn"  # "transformer" or "cnn"
     num_cameras: int = 2
     learning_rate: float = 1e-4
-    normalization_mode = None
-    # output_features: Dict[str, PolicyFeature] = field(
-    #     default_factory=lambda: {"next.reward": PolicyFeature(type=FeatureType.REWARD, shape=(1,))}
-    # )
+    weight_decay: float = 0.01
+    grad_clip_norm: float = 1.0
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.MEAN_STD,
+        }
+    )
 
     @property
     def observation_delta_indices(self) -> List | None:
@@ -40,8 +43,8 @@ class ClassifierConfig(PreTrainedConfig):
     def get_optimizer_preset(self) -> OptimizerConfig:
         return AdamWConfig(
             lr=self.learning_rate,
-            weight_decay=0.01,
-            grad_clip_norm=1.0,
+            weight_decay=self.weight_decay,
+            grad_clip_norm=self.grad_clip_norm,
         )
 
     def get_scheduler_preset(self) -> LRSchedulerConfig | None:
@@ -49,5 +52,8 @@
 
     def validate_features(self) -> None:
         """Validate feature configurations."""
-        # Classifier doesn't need specific feature validation
-        pass
+        has_image = any(key.startswith("observation.image") for key in self.input_features)
+        if not has_image:
+            raise ValueError(
+                "You must provide an image observation (key starting with 'observation.image') in the input features"
+            )
diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
index 3db6394e..74f94fbc 100644
--- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
+++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
@@ -139,11 +139,7 @@ class Classifier(PreTrainedPolicy):
 
     def extract_images_and_labels(self, batch: Dict[str, Tensor]) -> Tuple[list, Tensor]:
         """Extract image tensors and label tensors from batch."""
-        # Find image keys in input features
-        image_keys = [key for key in self.config.input_features if key.startswith(OBS_IMAGE)]
-
-        # Extract the images and labels
-        images = [batch[key] for key in image_keys]
+        images = [batch[key] for key in self.config.input_features if key.startswith(OBS_IMAGE)]
         labels = batch["next.reward"]
 
         return images, labels
@@ -197,9 +193,9 @@ class Classifier(PreTrainedPolicy):
 
         return loss, output_dict
 
-    def predict_reward(self, batch, threshold=0.6):
+    def predict_reward(self, batch, threshold=0.5):
         """Legacy method for compatibility."""
-        images, _ = self.extract_images_and_labels(batch)
+        images = [batch[key] for key in self.config.input_features if key.startswith(OBS_IMAGE)]
         if self.config.num_classes == 2:
             probs = self.predict(images).probabilities
             logging.debug(f"Predicted reward images: {probs}")
@@ -207,8 +203,6 @@ class Classifier(PreTrainedPolicy):
         else:
             return torch.argmax(self.predict(images).probabilities, dim=1)
 
-    # Methods required by PreTrainedPolicy abstract class
-
     def get_optim_params(self) -> dict:
         """Return optimizer parameters for the policy."""
         return {
@@ -217,21 +211,16 @@ class Classifier(PreTrainedPolicy):
             "weight_decay": getattr(self.config, "weight_decay", 0.01),
         }
 
-    def reset(self):
-        """Reset any stateful components (required by PreTrainedPolicy)."""
-        # Classifier doesn't have stateful components that need resetting
-        pass
-
     def select_action(self, batch: Dict[str, Tensor]) -> Tensor:
-        """Return action (class prediction) based on input observation."""
-        images, _ = self.extract_images_and_labels(batch)
+        """
+        This method is required by PreTrainedPolicy but not used for reward classifiers.
+        The reward classifier is not an actor and does not select actions.
+        """
+        raise NotImplementedError("Reward classifiers do not select actions")
 
-        with torch.no_grad():
-            outputs = self.predict(images)
-
-        if self.config.num_classes == 2:
-            # For binary classification return 0 or 1
-            return (outputs.probabilities > 0.5).float()
-        else:
-            # For multi-class return the predicted class
-            return torch.argmax(outputs.probabilities, dim=1)
+    def reset(self):
+        """
+        This method is required by PreTrainedPolicy but not used for reward classifiers.
+        The reward classifier is stateless, so there is no internal state to reset.
+        """
+        pass
diff --git a/lerobot/scripts/server/gym_manipulator.py b/lerobot/scripts/server/gym_manipulator.py
index 1a4caee3..d2871263 100644
--- a/lerobot/scripts/server/gym_manipulator.py
+++ b/lerobot/scripts/server/gym_manipulator.py
@@ -362,20 +362,20 @@ class RewardWrapper(gym.Wrapper):
         """
         self.env = env
 
-        # NOTE: We got 15% speedup by compiling the model
-        self.reward_classifier = torch.compile(reward_classifier)
-
         if isinstance(device, str):
             device = torch.device(device)
         self.device = device
 
+        self.reward_classifier = torch.compile(reward_classifier)
+        self.reward_classifier.to(self.device)
+
     def step(self, action):
-        observation, reward, terminated, truncated, info = self.env.step(action)
-        images = [
-            observation[key].to(self.device, non_blocking=self.device.type == "cuda")
+        observation, _, terminated, truncated, info = self.env.step(action)
+        images = {
+            key: observation[key].to(self.device, non_blocking=self.device.type == "cuda")
             for key in observation
             if "image" in key
-        ]
+        }
         start_time = time.perf_counter()
         with torch.inference_mode():
             success = (
@@ -1184,7 +1184,9 @@ def make_robot_env(cfg) -> gym.vector.VectorEnv:
     )
 
     # Add reward computation and control wrappers
-    # env = RewardWrapper(env=env, reward_classifier=reward_classifier, device=cfg.device)
+    reward_classifier = init_reward_classifier(cfg)
+    if reward_classifier is not None:
+        env = RewardWrapper(env=env, reward_classifier=reward_classifier, device=cfg.device)
     env = TimeLimitWrapper(env=env, control_time_s=cfg.wrapper.control_time_s, fps=cfg.fps)
     if cfg.wrapper.use_gripper:
         env = GripperActionWrapper(env=env, quantization_threshold=cfg.wrapper.gripper_quantization_threshold)
@@ -1227,26 +1229,34 @@ def make_robot_env(cfg) -> gym.vector.VectorEnv:
     return env
 
 
-def get_classifier(cfg):
-    if (
-        cfg.wrapper.reward_classifier_pretrained_path is None
-        or cfg.wrapper.reward_classifier_config_file is None
-    ):
+def init_reward_classifier(cfg):
+    """
+    Load a reward classifier policy from a pretrained path if configured.
+
+    Args:
+        cfg: The environment configuration containing the classifier path
+
+    Returns:
+        The loaded classifier model or None if not configured
+    """
+    if cfg.reward_classifier_pretrained_path is None:
        return None
 
-    from lerobot.common.policies.hilserl.classifier.configuration_classifier import (
-        ClassifierConfig,
-    )
-    from lerobot.common.policies.hilserl.classifier.modeling_classifier import (
-        Classifier,
+    from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier
+
+    # Get device from config or default to CPU
+    device = getattr(cfg, "device", "cpu")
+
+    # Load the classifier directly using from_pretrained
+    classifier = Classifier.from_pretrained(
+        pretrained_name_or_path=cfg.reward_classifier_pretrained_path,
     )
 
-    classifier_config = _policy_cfg_from_hydra_cfg(ClassifierConfig, cfg)
-    classifier_config.num_cameras = len(cfg.training.image_keys)  # TODO automate these paths
-    model = Classifier(classifier_config)
-    model.load_state_dict(Classifier.from_pretrained(pretrained_path).state_dict())
-    model = model.to(device)
-    return model
+    # Ensure model is on the correct device
+    classifier.to(device)
+    classifier.eval()  # Set to evaluation mode
+
+    return classifier
 
 
 def record_dataset(env, policy, cfg):
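
Usage note (reviewer addition, not part of the patch): the sketch below exercises a reloaded classifier the same way the updated RewardWrapper.step() does, passing a dict of image observations to predict_reward(). The checkpoint path, camera key, and image shape are placeholders and must match the classifier's own input_features.

import torch

from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier

# Load the pretrained classifier the way init_reward_classifier() does (placeholder path).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier = Classifier.from_pretrained("outputs/reward_classifier")
classifier.to(device)
classifier.eval()

# One tensor per camera, keyed like the env observations; each key must appear in the
# classifier's config.input_features (the key and image shape below are placeholders).
batch = {"observation.images.front": torch.rand(1, 3, 128, 128, device=device)}

with torch.inference_mode():
    # For the binary case this returns 0./1. depending on the predicted success probability.
    reward = classifier.predict_reward(batch, threshold=0.5)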