rich annotations & update open-pi fsdp explanations

This commit is contained in:
Leon998
2026-03-18 13:59:52 +08:00
parent 814f3c3526
commit 4934c4794e
11 changed files with 349 additions and 32 deletions

View File

@@ -40,7 +40,7 @@ python scripts/download_paligemma.py
You may adjust other training parameters based on your available GPUs and training budget:
- `num_train_steps`: Total number of training steps
- `num_workers`: Number of data loading workers
- `fsdp_devices`: Number of GPUs per node
- `fsdp_devices`: Number of GPUs per node used for FSDP, which shards model parameters, gradients, and optimizer states across devices to reduce per-GPU memory usage
- `batch_size`: Batch size per GPU
- `save_interval`: Checkpoint saving interval (in steps)

View File

@@ -202,8 +202,6 @@ python scripts/train_jax_multinode.py \
pretrain-interndata-a1 \
--exp-name=pretrain-interndata-a1 \
--num_workers=12 \
--fsdp_devices=8 \
--batch_size=512 \
--num_train_steps=2000000 \
--save_interval=5000

View File

@@ -1814,7 +1814,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=2_000_000,
num_workers=12,
fsdp_devices=8,
batch_size=512,
save_interval=5000,
lr_schedule=_optimizer.WarmupConstantSchedule(),
@@ -1844,7 +1843,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=30_000,
num_workers=32,
fsdp_devices=8,
batch_size=128,
save_interval=5000,
),
@@ -1872,7 +1870,6 @@ _CONFIGS = [
pytorch_weight_path="",
num_train_steps=30_000,
num_workers=32,
fsdp_devices=8,
batch_size=128,
save_interval=5000,
),
@@ -1901,4 +1898,4 @@ def check_lerobot_repo(repo_dir: str):
if os.path.isdir(os.path.join(repo_dir, "data")) and os.path.isdir(os.path.join(repo_dir, "meta")) and os.path.isdir(os.path.join(repo_dir, "videos")):
return True
else:
return False
return False