Add resume training (#205)

Co-authored-by: Remi <re.cadene@gmail.com>
This commit is contained in:
Alexander Soare
2024-05-28 12:04:23 +01:00
committed by GitHub
parent 7ec76ee235
commit e3b9f1c19b
15 changed files with 486 additions and 191 deletions

View File

@@ -5,10 +5,17 @@ defaults:
hydra:
run:
# Set `dir` to where you would like to save all of the run outputs. If you run another training session
# with the same value for `dir`, its contents will be overwritten unless you set `resume` to true.
dir: outputs/train/${now:%Y-%m-%d}/${now:%H-%M-%S}_${env.name}_${policy.name}_${hydra.job.name}
job:
name: default
# Set `resume` to true to resume a previous run. In order for this to work, you will need to make sure
# `hydra.run.dir` is the directory of an existing run with at least one checkpoint in it.
# Note that when resuming a run, the default behavior is to use the configuration from the checkpoint,
# regardless of what's provided with the training command at the time of resumption.
resume: false
device: cuda # cpu
# `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
# automatic gradient scaling is used.
@@ -29,7 +36,7 @@ training:
eval_freq: ???
save_freq: ???
log_freq: 250
save_model: true
save_checkpoint: true
eval:
n_episodes: 1
@@ -40,7 +47,7 @@ eval:
wandb:
enable: false
# Set to true to disable saving an artifact despite save_model == True
# Set to true to disable saving an artifact even when `training.save_checkpoint` is true.
disable_artifact: false
project: lerobot
notes: ""

View File

@@ -15,7 +15,7 @@ training:
eval_freq: 10000
save_freq: 100000
log_freq: 250
save_model: true
save_checkpoint: true
batch_size: 8
lr: 1e-5

View File

@@ -27,7 +27,7 @@ training:
eval_freq: 5000
save_freq: 5000
log_freq: 250
save_model: true
save_checkpoint: true
batch_size: 64
grad_clip_norm: 10