add fixes for reproducibility

Alexander Soare
2024-03-22 15:06:57 +00:00
parent b9047fbdd2
commit 15ff3b3af8
4 changed files with 45 additions and 14 deletions


@@ -1,3 +1,4 @@
+import json
 import logging
 import threading
 import time
@@ -41,6 +42,7 @@ def eval_policy(
     sum_rewards = []
     max_rewards = []
     successes = []
+    seeds = []
     threads = []  # for video saving threads
     episode_counter = 0  # for saving the correct number of videos
@@ -53,11 +55,15 @@ def eval_policy(
             if save_video or (return_first_video and i == 0):  # noqa: B023
                 ep_frames.append(env.render())  # noqa: B023
 
+        # Clear the policy's action queue before the start of a new rollout.
+        if policy is not None:
+            policy.clear_action_queue()
+
+        env.start()  # needed to be able to get the seeds the first time as BatchedEnvs are lazy
+        seeds.extend(env._next_seed)
         with torch.inference_mode():
             # TODO(alexander-soare): When `break_when_any_done == False` this rolls out for max_steps even when all
             # envs are done the first time. But we only use the first rollout. This is a waste of compute.
-            if policy is not None:
-                policy.clear_action_queue()
             rollout = env.rollout(
                 max_steps=max_steps,
                 policy=policy,
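A note on the ordering above: batched envs spawn their workers lazily, so state populated at startup (like the seed list read here from the private _next_seed attribute) only exists once start() has run. A toy sketch of the failure mode this guards against; LazyBatchedEnv is a hypothetical stand-in, not the real API:

class LazyBatchedEnv:
    """Toy stand-in: workers (and their seeds) only exist after start()."""

    def __init__(self, num_envs):
        self.num_envs = num_envs
        self._next_seed = None  # not populated until the workers start

    def start(self):
        # The real batched env forks workers here; we only mimic the side
        # effect of the per-worker seed list becoming available.
        self._next_seed = list(range(self.num_envs))

env = LazyBatchedEnv(4)
# Reading env._next_seed here would record None rather than the seeds.
env.start()
seeds = list(env._next_seed)  # [0, 1, 2, 3]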
@@ -65,8 +71,8 @@ def eval_policy(
                 callback=maybe_render_frame,
                 break_when_any_done=env.batch_size[0] == 1,
             )
-        # Figure out where in each rollout sequence the first done condition was encountered (results after this won't
-        # be included).
+        # Figure out where in each rollout sequence the first done condition was encountered (results after
+        # this won't be included).
         # Note: this assumes that the shape of the done key is (batch_size, max_steps, 1).
         # Note: this relies on a property of argmax: that it returns the first occurrence as a tiebreaker.
         rollout_steps = rollout["next", "done"].shape[1]
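The argmax tiebreak the second note leans on is easy to check in isolation. A quick sketch, with done flags shaped (batch_size, max_steps, 1) as the first note states; the cast to int is there because bool support in torch.argmax varies across versions:

import torch

# One env, four steps: done first becomes True at step 2 and stays True.
done = torch.tensor([[False, False, True, True]]).unsqueeze(-1)  # shape (1, 4, 1)

# argmax returns the first occurrence of the maximum along the step dim,
# so it lands on the first done step despite the later ties.
first_done_step = done.to(torch.int).argmax(dim=1)  # tensor([[2]])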
@@ -108,11 +114,31 @@ def eval_policy(
         thread.join()
 
     info = {
-        "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
-        "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
-        "pc_success": np.nanmean(successes[:num_episodes]) * 100,
-        "eval_s": time.time() - start,
-        "eval_ep_s": (time.time() - start) / num_episodes,
+        "micro": [
+            {
+                "episode_ix": i,
+                "sum_reward": sum_reward,
+                "max_reward": max_reward,
+                "success": success,
+                "seed": seed,
+            }
+            for i, (sum_reward, max_reward, success, seed) in enumerate(
+                zip(
+                    sum_rewards[:num_episodes],
+                    max_rewards[:num_episodes],
+                    successes[:num_episodes],
+                    seeds[:num_episodes],
+                    strict=True,
+                )
+            )
+        ],
+        "macro": {
+            "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
+            "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
+            "pc_success": np.nanmean(successes[:num_episodes]) * 100,
+            "eval_s": time.time() - start,
+            "eval_ep_s": (time.time() - start) / num_episodes,
+        },
     }
     if return_first_video:
         return info, first_video
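For reference, the returned info dict now splits per-episode records ("micro") from the aggregates ("macro") that were previously top-level; it has roughly this shape, with placeholder values for illustration only:

info = {
    "micro": [
        {"episode_ix": 0, "sum_reward": 12.5, "max_reward": 1.0, "success": True, "seed": 1000},
        {"episode_ix": 1, "sum_reward": 0.0, "max_reward": 0.0, "success": False, "seed": 1001},
    ],
    "macro": {
        "avg_sum_reward": 6.25,  # mean of the micro sum_rewards
        "avg_max_reward": 0.5,
        "pc_success": 50.0,      # percentage of successful episodes
        "eval_s": 31.7,          # wall-clock seconds, placeholder
        "eval_ep_s": 15.85,      # eval_s / num_episodes
    },
}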
@@ -156,7 +182,7 @@ def eval(cfg: dict, out_dir=None):
         # when policy is None, rollout a random policy
         policy = None
 
-    metrics = eval_policy(
+    info = eval_policy(
         env,
         policy=policy,
         save_video=True,
@@ -165,7 +191,11 @@ def eval(cfg: dict, out_dir=None):
         max_steps=cfg.env.episode_length,
         num_episodes=cfg.eval_episodes,
     )
-    print(metrics)
+    print(info["macro"])
+
+    # Save info
+    with open(Path(out_dir) / "eval_info.json", "w") as f:
+        json.dump(info, f, indent=2)
 
     logging.info("End of eval")
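With the seeds persisted in eval_info.json, an episode of interest can later be replayed deterministically. A minimal sketch, assuming a torchrl-style env exposing set_seed() and rollout() as used above; make_env, policy, and the episode index are hypothetical, and out_dir/cfg come from the surrounding eval() context:

import json
from pathlib import Path

info = json.loads((Path(out_dir) / "eval_info.json").read_text())
seed = info["micro"][0]["seed"]  # seed recorded for the episode to replay

env = make_env(num_envs=1)       # hypothetical constructor for a single env
env.set_seed(seed)               # torchrl envs are (re)seeded via set_seed()
rollout = env.rollout(max_steps=cfg.env.episode_length, policy=policy)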