diff --git a/README.md b/README.md
index c31d2b6c2..925d143ae 100644
--- a/README.md
+++ b/README.md
@@ -22,21 +22,24 @@ python setup.py develop
 
 ```
 python lerobot/scripts/train.py \
---config-name=pusht hydra.job.name=pusht
+hydra.job.name=pusht \
+env=pusht
 ```
 
 ### Visualize offline buffer
 
 ```
 python lerobot/scripts/visualize_dataset.py \
---config-name=pusht hydra.run.dir=tmp/$(date +"%Y_%m_%d")
+hydra.run.dir=tmp/$(date +"%Y_%m_%d") \
+env=pusht
 ```
 
 ### Visualize online buffer / Eval
 
 ```
 python lerobot/scripts/eval.py \
---config-name=pusht hydra.run.dir=tmp/$(date +"%Y_%m_%d")
+hydra.run.dir=tmp/$(date +"%Y_%m_%d") \
+env=pusht
 ```
 
 
diff --git a/lerobot/configs/env/pusht.yaml b/lerobot/configs/env/pusht.yaml
index 60fc594eb..fd0f70c13 100644
--- a/lerobot/configs/env/pusht.yaml
+++ b/lerobot/configs/env/pusht.yaml
@@ -3,6 +3,7 @@
 eval_episodes: 50
 eval_freq: 7500
 save_freq: 75000
+log_freq: 250
 # TODO: same as simxarm, need to adjust
 offline_steps: 25000
 online_steps: 25000
diff --git a/lerobot/configs/policy/diffusion.yaml b/lerobot/configs/policy/diffusion.yaml
index 40c708db8..cfd37ab11 100644
--- a/lerobot/configs/policy/diffusion.yaml
+++ b/lerobot/configs/policy/diffusion.yaml
@@ -21,6 +21,9 @@ past_action_visible: False
 keypoint_visible_rate: 1.0
 obs_as_global_cond: True
 
+offline_steps: 50000
+online_steps: 0
+
 policy:
   name: diffusion
 
diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py
index b0f678343..eaade622b 100644
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -5,6 +5,7 @@ import hydra
 import imageio
 import numpy as np
 import torch
+import tqdm
 from tensordict.nn import TensorDictModule
 from termcolor import colored
 from torchrl.envs import EnvBase
@@ -32,7 +33,7 @@ def eval_policy(
     max_rewards = []
     successes = []
     threads = []
-    for i in range(num_episodes):
+    for i in tqdm.tqdm(range(num_episodes)):
         tensordict = env.reset()
         ep_frames = []
 
diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py
index 1f516cce0..99e82217b 100644
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -50,7 +50,7 @@ def log_training_metrics(L, metrics, step, online_episode_idx, start_time, is_of
 
 
 def eval_policy_and_log(
-    env, td_policy, step, online_episode_idx, start_time, is_offline, cfg, L
+    env, td_policy, step, online_episode_idx, start_time, cfg, L, is_offline
 ):
     common_metrics = {
         "episode": online_episode_idx,
@@ -83,7 +83,10 @@ def train(cfg: dict, out_dir=None, job_name=None):
     set_seed(cfg.seed)
     print(colored("Work dir:", "yellow", attrs=["bold"]), out_dir)
 
+    print("make_env")
     env = make_env(cfg)
+
+    print("make_policy")
     policy = make_policy(cfg)
 
     td_policy = TensorDictModule(
@@ -92,12 +95,12 @@ def train(cfg: dict, out_dir=None, job_name=None):
         out_keys=["action"],
     )
 
-    # initialize offline dataset
-
+    print("make_offline_buffer")
     offline_buffer = make_offline_buffer(cfg)
 
     # TODO(rcadene): move balanced_sampling, per_alpha, per_beta outside policy
     if cfg.policy.balanced_sampling:
+        print("make online_buffer")
         num_traj_per_batch = cfg.policy.batch_size
 
         online_sampler = PrioritizedSliceSampler(
@@ -117,15 +120,16 @@ def train(cfg: dict, out_dir=None, job_name=None):
 
     online_episode_idx = 0
     start_time = time.time()
-    step = 0
+    step = 0  # number of policy update
 
-    # First eval with a random model or pretrained
+    print("First eval_policy_and_log with a random model or pretrained")
     eval_policy_and_log(
-        env, td_policy, step, online_episode_idx, start_time, is_offline, cfg, L
+        env, td_policy, step, online_episode_idx, start_time, cfg, L, is_offline=True
     )
 
-    # Train offline
-    for _ in range(cfg.offline_steps):
+    for offline_step in range(cfg.offline_steps):
+        if offline_step == 0:
+            print("Start offline training on a fixed dataset")
         # TODO(rcadene): is it ok if step_t=0 = 0 and not 1 as previously done?
         metrics = policy.update(offline_buffer, step)
 
@@ -136,7 +140,14 @@ def train(cfg: dict, out_dir=None, job_name=None):
 
         if step > 0 and step % cfg.eval_freq == 0:
             eval_policy_and_log(
-                env, td_policy, step, online_episode_idx, start_time, is_offline, cfg, L
+                env,
+                td_policy,
+                step,
+                online_episode_idx,
+                start_time,
+                cfg,
+                L,
+                is_offline=True,
             )
 
         if step > 0 and cfg.save_model and step % cfg.save_freq == 0:
@@ -145,10 +156,12 @@ def train(cfg: dict, out_dir=None, job_name=None):
 
         step += 1
 
-    # Train online
     demo_buffer = offline_buffer if cfg.policy.balanced_sampling else None
-    for _ in range(cfg.online_steps):
+    for env_step in range(cfg.online_steps):
+        if env_step == 0:
+            print("Start online training by interacting with environment")
         # TODO: use SyncDataCollector for that?
+        # TODO: add configurable number of rollout? (default=1)
         with torch.no_grad():
             rollout = env.rollout(
                 max_steps=cfg.env.episode_length,
@@ -191,9 +204,9 @@ def train(cfg: dict, out_dir=None, job_name=None):
                 step,
                 online_episode_idx,
                 start_time,
-                is_offline,
                 cfg,
                 L,
+                is_offline=False,
             )
 
         if step > 0 and cfg.save_model and step % cfg.save_freq == 0: