Add rich annotations and update OpenPI FSDP explanations
This commit is contained in:
@@ -40,7 +40,7 @@ python scripts/download_paligemma.py
|
||||
You may adjust other training parameters based on your available GPUs and training budget:
|
||||
- `num_train_steps`: Total number of training steps
|
||||
- `num_workers`: Number of data loading workers
|
||||
- `fsdp_devices`: Number of GPUs per node
|
||||
- `fsdp_devices`: Number of GPUs per node used by FSDP to shard model parameters, gradients, and optimizer states across devices, reducing per-device memory usage
|
||||
- `batch_size`: Batch size per GPU
|
||||
- `save_interval`: Checkpoint saving interval (in steps)
|
||||
|
||||
|
||||
@@ -202,8 +202,6 @@ python scripts/train_jax_multinode.py \
|
||||
pretrain-interndata-a1 \
|
||||
--exp-name=pretrain-interndata-a1 \
|
||||
--num_workers=12 \
|
||||
--fsdp_devices=8 \
|
||||
--batch_size=512 \
|
||||
--num_train_steps=2000000 \
|
||||
--save_interval=5000
|
||||
|
||||
|
||||
@@ -1814,7 +1814,6 @@ _CONFIGS = [
|
||||
pytorch_weight_path="",
|
||||
num_train_steps=2_000_000,
|
||||
num_workers=12,
|
||||
fsdp_devices=8,
|
||||
batch_size=512,
|
||||
save_interval=5000,
|
||||
lr_schedule=_optimizer.WarmupConstantSchedule(),
|
||||
@@ -1844,7 +1843,6 @@ _CONFIGS = [
|
||||
pytorch_weight_path="",
|
||||
num_train_steps=30_000,
|
||||
num_workers=32,
|
||||
fsdp_devices=8,
|
||||
batch_size=128,
|
||||
save_interval=5000,
|
||||
),
|
||||
@@ -1872,7 +1870,6 @@ _CONFIGS = [
|
||||
pytorch_weight_path="",
|
||||
num_train_steps=30_000,
|
||||
num_workers=32,
|
||||
fsdp_devices=8,
|
||||
batch_size=128,
|
||||
save_interval=5000,
|
||||
),
|
||||
@@ -1901,4 +1898,4 @@ def check_lerobot_repo(repo_dir: str):
|
||||
if os.path.isdir(os.path.join(repo_dir, "data")) and os.path.isdir(os.path.join(repo_dir, "meta")) and os.path.isdir(os.path.join(repo_dir, "videos")):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
Reference in New Issue
Block a user