From 71ec721e4889c515775323b606b6a493cf7dc8ac Mon Sep 17 00:00:00 2001
From: Michel Aractingi
Date: Wed, 22 Jan 2025 01:15:45 +0100
Subject: [PATCH] cleaned eval_on_robot.py; readded policy; fixed doc strings

---
 lerobot/scripts/eval_on_robot.py | 165 +++++++++++++++++++------------
 1 file changed, 102 insertions(+), 63 deletions(-)

diff --git a/lerobot/scripts/eval_on_robot.py b/lerobot/scripts/eval_on_robot.py
index 842c1a28..66ccc605 100644
--- a/lerobot/scripts/eval_on_robot.py
+++ b/lerobot/scripts/eval_on_robot.py
@@ -15,25 +15,34 @@
 # limitations under the License.
 """Evaluate a policy by running rollouts on the real robot and computing metrics.
 
-Usage examples: evaluate a checkpoint from the LeRobot training script for 10 episodes.
+This script supports performing human interventions during rollouts.
+Human interventions allow the user to take control of the robot from the policy
+and correct its behavior. It is specifically designed for reinforcement learning
+experiments and HIL-SERL (human-in-the-loop reinforcement learning) methods.
 
-```
-python lerobot/scripts/eval_on_robot.py \
-    -p outputs/train/model/checkpoints/005000/pretrained_model \
-    eval.n_episodes=10
-```
+### How to Use
 
-Test reward classifier with teleoperation (you need to press space to take over)
+To roll out a policy on the robot:
 
 ```
 python lerobot/scripts/eval_on_robot.py \
     --robot-path lerobot/configs/robot/so100.yaml \
+    --pretrained-policy-path-or-name path/to/pretrained_model \
+    --policy-config path/to/policy/config.yaml \
+    --display-cameras 1
+```
+
+If you trained a reward classifier for your task, you can also evaluate it with this script.
+To annotate the rollouts with rewards from a pre-trained reward classifier, run:
+
+```
+python lerobot/scripts/eval_on_robot.py \
+    --robot-path lerobot/configs/robot/so100.yaml \
+    --pretrained-policy-path-or-name path/to/pretrained_model \
+    --policy-config path/to/policy/config.yaml \
     --reward-classifier-pretrained-path outputs/classifier/checkpoints/best/pretrained_model \
     --reward-classifier-config-file lerobot/configs/policy/hilserl_classifier.yaml \
     --display-cameras 1
 ```
-
-**NOTE** (michel-aractingi): This script is incomplete and it is being prepared
-for running training on the real robot.
 """
 
 import argparse
@@ -46,7 +55,8 @@
 import torch
 from tqdm import trange
 from lerobot.common.policies.policy_protocol import Policy
-from lerobot.common.robot_devices.control_utils import busy_wait, is_headless, reset_follower_position
+from lerobot.common.policies.utils import get_device_from_parameters
+from lerobot.common.robot_devices.control_utils import busy_wait, is_headless, reset_follower_position, predict_action
 from lerobot.common.robot_devices.robots.factory import Robot, make_robot
 from lerobot.common.utils.utils import (
     init_hydra_config,
@@ -69,7 +79,6 @@ def get_classifier(pretrained_path, config_path):
     classifier_config.num_cameras = len(cfg.training.image_keys)  # TODO automate these paths
     model = Classifier(classifier_config)
     model.load_state_dict(Classifier.from_pretrained(pretrained_path).state_dict())
-    model = model.to("mps")
     return model
 
 
@@ -81,48 +90,45 @@ def rollout(
     control_time_s: float = 20,
     use_amp: bool = True,
     display_cameras: bool = False,
+    device: str = "cpu",
 ) -> dict:
     """Run a batched policy rollout on the real robot.
 
-    The return dictionary contains:
-        "robot": A a dictionary of (batch, sequence + 1, *) tensors mapped to observation
-            keys. NOTE the that this has an extra sequence element relative to the other keys in the
-            dictionary. This is because an extra observation is included for after the environment is
-            terminated or truncated.
-        "action": A (batch, sequence, action_dim) tensor of actions applied based on the observations (not
-            including the last observations).
-        "reward": A (batch, sequence) tensor of rewards received for applying the actions.
-        "success": A (batch, sequence) tensor of success conditions (the only time this can be True is upon
-            environment termination/truncation).
-        "done": A (batch, sequence) tensor of **cumulative** done conditions. For any given batch element,
-            the first True is followed by True's all the way till the end. This can be used for masking
-            extraneous elements from the sequences above.
+    This function executes a rollout using the provided policy and robot interface,
+    stepping the control loop at the requested frequency for a fixed control duration.
+
+    The returned dictionary contains rollout statistics, which can be used for analysis and debugging.
 
     Args:
-        robot: The robot class that defines the interface with the real robot.
-        policy: The policy. Must be a PyTorch nn module.
+        robot: The robot interface for interacting with the real robot hardware.
+        policy: The policy to execute. Must be a PyTorch `nn.Module` object.
+        reward_classifier: A module that classifies rewards during the rollout.
+        fps: The control frequency at which the policy is executed.
+        control_time_s: The total control duration of the rollout in seconds.
+        use_amp: Whether to use automatic mixed precision (AMP) for policy evaluation.
+        display_cameras: If True, displays the camera streams during the rollout.
+        device: The device to use for computations (e.g., "cpu", "cuda", or "mps").
 
     Returns:
-        The dictionary described above.
+        Dictionary with the actions, rewards and done flags collected during the rollout.
     """
-    # TODO (michel-aractingi): Infer the device from policy parameters when policy is added
-    # assert isinstance(policy, nn.Module), "Policy must be a PyTorch nn module."
-    # device = get_device_from_parameters(policy)
-
     # define keyboard listener
     listener, events = init_keyboard_listener()
 
     # Reset the policy. TODO (michel-aractingi) add real policy evaluation once the code is ready.
-    # policy.reset()
+    if policy is not None:
+        policy.reset()
 
     # NOTE: sorting to make sure the key sequence is the same during training and testing.
     observation = robot.capture_observation()
     image_keys = [key for key in observation if "image" in key]
     image_keys.sort()
+
     all_actions = []
     all_rewards = []
-    all_successes = []
+
+    indices_from_policy = []
 
     start_episode_t = time.perf_counter()
     init_pos = robot.follower_arms["main"].read("Present_Position")
@@ -141,27 +147,32 @@ def rollout(
         else:
             # explore with policy
             with torch.inference_mode():
-                # TODO (michel-aractingi) replace this part with policy (predict_action)
-                action = robot.follower_arms["main"].read("Present_Position")
-                action = torch.from_numpy(action)
+                # TODO (michel-aractingi) in place temporarily for testing purposes
+                if policy is None:
+                    action = robot.follower_arms["main"].read("Present_Position")
+                    action = torch.from_numpy(action)
+                    indices_from_policy.append(False)
+                else:
+                    action = predict_action(observation, policy, device, use_amp)
+                    indices_from_policy.append(True)
+
                 robot.send_action(action)
-                # action = predict_action(observation, policy, device, use_amp)
+                observation = robot.capture_observation()
 
-        observation = robot.capture_observation()
         images = []
         for key in image_keys:
             if display_cameras:
                 cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR))
                 cv2.waitKey(1)
-            images.append(observation[key].to("mps"))
+            images.append(observation[key].to(device))
 
         reward = reward_classifier.predict_reward(images) if reward_classifier is not None else 0.0
+
+        # TODO send data through the server as soon as you have it
+
         all_rewards.append(reward)
-
-        # print("REWARD : ", reward)
-
         all_actions.append(action)
-        all_successes.append(torch.tensor([False]))
 
         dt_s = time.perf_counter() - start_loop_t
         busy_wait(1 / fps - dt_s)
@@ -180,7 +191,6 @@ def rollout(
     ret = {
         "action": torch.stack(all_actions, dim=1),
         "next.reward": torch.stack(all_rewards, dim=1),
-        "next.success": torch.stack(all_successes, dim=1),
        "done": dones,
     }
 
@@ -199,14 +209,32 @@ def eval_policy(
     display_cameras: bool = False,
     reward_classifier_pretrained_path: str | None = None,
     reward_classifier_config_file: str | None = None,
+    device: str | None = None,
 ) -> dict:
     """
+    Evaluate a policy on a real robot by running multiple episodes and collecting metrics.
+
+    This function executes rollouts of the specified policy on the robot, computes metrics
+    for the rollouts, and optionally evaluates a reward classifier if provided.
+
     Args:
-        env: The batch of environments.
-        policy: The policy.
-        n_episodes: The number of episodes to evaluate.
+        robot: The robot interface used to interact with the real robot hardware.
+        policy: The policy to be evaluated. Must be a PyTorch neural network module.
+        fps: Frames per second (control frequency) for running the policy.
+        n_episodes: The number of episodes to evaluate the policy for.
+        control_time_s: The maximum duration of each episode in seconds.
+        use_amp: Whether to use automatic mixed precision (AMP) for policy evaluation.
+        display_cameras: Whether to display the camera streams during rollouts.
+        reward_classifier_pretrained_path: Path to the pretrained reward classifier.
+            If provided, the reward classifier is evaluated during rollouts.
+        reward_classifier_config_file: Path to the configuration file for the reward classifier.
+            Required if `reward_classifier_pretrained_path` is provided.
+        device: The device for computations (e.g., "cpu", "cuda", or "mps").
+
     Returns:
-        Dictionary with metrics and data regarding the rollouts.
+ "dict": A dictionary containing the following rollout metrics and data: + - "metrics": Evaluation metrics such as cumulative rewards, success rates, etc. + - "rollout_data": Detailed data from the rollouts, including observations, actions, rewards, and done flags. """ # TODO (michel-aractingi) comment this out for testing with a fixed policy # assert isinstance(policy, Policy) @@ -214,22 +242,22 @@ def eval_policy( sum_rewards = [] max_rewards = [] - successes = [] rollouts = [] start_eval = time.perf_counter() progbar = trange(n_episodes, desc="Evaluating policy on real robot") - reward_classifier = get_classifier(reward_classifier_pretrained_path, reward_classifier_config_file) + reward_classifier = get_classifier(reward_classifier_pretrained_path, reward_classifier_config_file).to(device)ยง + + device = get_device_from_parameters(policy) if device is None else device for _ in progbar: rollout_data = rollout( - robot, policy, reward_classifier, fps, control_time_s, use_amp, display_cameras + robot, policy, reward_classifier, fps, control_time_s, use_amp, display_cameras, device ) rollouts.append(rollout_data) sum_rewards.append(sum(rollout_data["next.reward"])) max_rewards.append(max(rollout_data["next.reward"])) - successes.append(rollout_data["next.success"][-1]) info = { "per_episode": [ @@ -237,21 +265,18 @@ def eval_policy( "episode_ix": i, "sum_reward": sum_reward, "max_reward": max_reward, - "pc_success": success * 100, } - for i, (sum_reward, max_reward, success) in enumerate( + for i, (sum_reward, max_reward) in enumerate( zip( sum_rewards[:n_episodes], max_rewards[:n_episodes], - successes[:n_episodes], strict=False, ) ) ], "aggregated": { - "avg_sum_reward": float(np.nanmean(torch.cat(sum_rewards[:n_episodes]))), + "avg_sum_reward": float(np.nanmean(torch.cat(sum_rewards[:n_episodes]))), "avg_max_reward": float(np.nanmean(torch.cat(max_rewards[:n_episodes]))), - "pc_success": float(np.nanmean(torch.cat(successes[:n_episodes])) * 100), "eval_s": time.time() - start_eval, "eval_ep_s": (time.time() - start_eval) / n_episodes, }, @@ -264,9 +289,18 @@ def eval_policy( def init_keyboard_listener(): - # Allow to exit early while recording an episode or resetting the environment, - # by tapping the right arrow key '->'. This might require a sudo permission - # to allow your terminal to monitor keyboard events. + """ + Initialize a keyboard listener for controlling the recording and human intervention process. + + Keyboard controls: (Note that this might require sudo permissions to monitor keyboard events) + - Right Arrow Key ('->'): Stops the current recording and exits early, useful for ending an episode + and moving the next episode recording. + - Left Arrow Key ('<-'): Re-records the current episode, allowing the user to start over. + - Space Bar: Controls the human intervention process in three steps: + 1. First press pauses the policy and prompts the user to position the leader similar to the follower. + 2. Second press initiates human interventions, allowing teleop control of the robot. + 3. Third press resumes the policy rollout. + """ events = {} events["exit_early"] = False events["rerecord_episode"] = False @@ -302,10 +336,15 @@ def init_keyboard_listener(): ) events["pause_policy"] = True log_say("Human intervention stage. Get ready to take over.", play_sounds=True) - else: + elif events["pause_policy"] and not events["human_intervention_step"]: events["human_intervention_step"] = True print("Space key pressed. 
Human intervention starting.") log_say("Starting human intervention.", play_sounds=True) + elif events["human_intervention_step"]: + events["human_intervention_step"] = False + events["pause_policy"] = False + print("Space key pressed. Human intervention ending, policy resumes control.") + log_say("Policy resuming.", play_sounds=True) except Exception as e: print(f"Error handling key press: {e}")