Files
lerobot/examples/train_smolvla.sh
2025-05-30 11:14:47 +00:00

106 lines
2.6 KiB
Bash

#!/bin/bash
# SLURM batch script: requests one node / one task / one GPU and then runs
# lerobot/scripts/train.py starting from a pretrained SmolVLA checkpoint
# (see the --policy.path argument at the bottom of the script).
#
# Only lines that start with exactly "#SBATCH" are read by sbatch. Lines with
# extra leading '#' ("##SBATCH", "###SBATCH") are disabled alternatives
# (other partitions, accounts, QoS levels) kept for reference.
#SBATCH --job-name=lerobot_smolvla_test_task1_main_pretrained
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gpus-per-node=1
#SBATCH --mail-type=END,FAIL
#SBATCH --output=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/slurm/lerobot_smolvla_test_task1_main_pretrained.out
###SBATCH --nodelist=jean-zay-a101
#SBATCH --cpus-per-task=20
###SBATCH --exclusive
#SBATCH --time=40:00:00
#SBATCH --mail-user=mustafa.shukor@isir.upmc.fr
# --- disabled: V100 partition / account ---
##SBATCH --partition=gpu_p2
##SBATCH --qos=qos_gpu-t3
###SBATCH -C v100-32g
##SBATCH -A dyf@v100
# --- disabled: A100 partition / accounts / QoS ---
##SBATCH --partition=gpu_p5
##SBATCH -C a100
###SBATCH -A dyf@a100
##SBATCH -A lqm@a100
##SBATCH --qos=qos_gpu_a100-dev
##SBATCH --qos=qos_gpu_a100-t3
# --- active: H100 partition, lqm@h100 account, t4 QoS ---
#SBATCH --partition=gpu_p6
#SBATCH -C h100
#SBATCH -A lqm@h100
##SBATCH --qos=qos_gpu_h100-dev
#SBATCH --qos=qos_gpu_h100-t4
# ---------------------------------------------------------------------------
# Runtime environment: enter the repository checkout, activate the conda
# environment, and point the Hugging Face / W&B caches at $WORK in offline mode.
# ---------------------------------------------------------------------------

# Abort if the checkout is missing: otherwise the core-file cleanup below and
# the training command would run from whatever directory the job started in.
cd ~/lerobot_pi || { echo "error: cannot cd to ~/lerobot_pi" >&2; exit 1; }
source ~/.bashrc
source activate lerobot_main
export LC_ALL=C

# Remove core dumps left in the checkout. -f keeps this silent when no
# core-* file exists (the unmatched glob is passed through literally).
rm -f -- core-*

# Every cache location below is derived from $WORK. Stop here if it is unset
# or empty so the paths never silently resolve to /.cache/... at the root.
: "${WORK:?WORK must be set to the work/scratch directory}"

export TRANSFORMERS_CACHE="${WORK}/.cache/huggingface/transformers"
export HF_HOME="${WORK}/.cache/huggingface"
export DATA_DIR="${WORK}/.cache/huggingface/datasets"
export HF_LEROBOT_HOME="${WORK}/.cache/huggingface/lerobot"
# export LEROBOT_HOME=

# Offline mode for the Hugging Face datasets and hub libraries.
export HF_DATASETS_OFFLINE=1
export HF_HUB_OFFLINE=1

# Weights & Biases: cache on lustre and run in offline mode.
export WANDB_CACHE_DIR=/lustre/fsn1/projects/rech/dyf/ugz83ue/wandb
export WANDB_MODE=offline

export TOKENIZERS_PARALLELISM=false

# Re-enter the checkout in case one of the sourced files changed directory.
cd ~/lerobot_pi || { echo "error: cannot cd to ~/lerobot_pi" >&2; exit 1; }
# ###### dgx
# source ~/miniconda3/bin/activate
# conda activate lerobot
# export WORK=/home/mustafa_shukor
# ---- Experiment configuration (V3 So100) ----

# Dataset: repository id handed to --dataset.repo_id, plus the short name
# that is embedded in the run/task name.
DATASET_NAME="so100_v3_task_1"
REPO_ID="danaaubakirova/svla_so100_task1_v3"

# Policy: POLICY is the value for --policy.type (only referenced by the
# commented-out from-scratch command); POLICY_NAME goes into the task name.
POLICY_NAME="smolvla"
POLICY="smolvla"

# Training budget: forwarded as --batch_size and --steps respectively.
BATCH_SIZE="64"
OFFLINE_STEPS="200000"
# TASK_NAME=lerobot_${DATASET_NAME}_${POLICY_NAME}
# TRAIN_DIR=$WORK/logs/lerobot/$TASK_NAME
# echo $TRAIN_DIR
# rm -r $TRAIN_DIR
# python lerobot/scripts/train.py \
# --policy.type=$POLICY \
# --dataset.repo_id=$REPO_ID \
# --output_dir=$TRAIN_DIR \
# --batch_size=$BATCH_SIZE \
# --steps=$OFFLINE_STEPS
# ---------------------------------------------------------------------------
# Training run that starts from a pretrained policy checkpoint.
#
# Reads:   WORK, DATASET_NAME, POLICY_NAME, REPO_ID, BATCH_SIZE, OFFLINE_STEPS
# Writes:  TASK_NAME, TRAIN_DIR, POLICY_PATH
# Effects: deletes any existing $TRAIN_DIR, then launches train.py with
#          $TRAIN_DIR as its output directory.
# ---------------------------------------------------------------------------
TASK_NAME="lerobot_${DATASET_NAME}_${POLICY_NAME}_pretrained"

# ${WORK:?} aborts the script when WORK is unset or empty, so the recursive
# delete below can never be aimed at /logs/lerobot/... on the root filesystem.
TRAIN_DIR="${WORK:?WORK must be set to the work/scratch directory}/logs/lerobot/${TASK_NAME}"
printf '%s\n' "$TRAIN_DIR"

# Start from a clean output directory. -f makes a missing directory (first
# run) a silent no-op; ${TRAIN_DIR:?} refuses to run with an empty path.
rm -rf -- "${TRAIN_DIR:?}"

# Checkpoint directory passed to --policy.path.
POLICY_PATH=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/lerobot/lerobot_so100_community_v1_v2_v3clean2_smolpi0_lr1e-4bs64steps400000gpus4freeze32_imgtoktrue_cross_attn_gap1_vlml16_causalacttrue_sa2_smolvlm2500_nobs1_expw0.75_feat2_lrvlm1e-4_trans0true_decaylr2.5e-630000_camfalse_fps3030_idlefalse/checkpoints/280000/test_smolvla/

# Last command of the script: its exit status becomes the job's exit status.
python lerobot/scripts/train.py \
  --policy.path="$POLICY_PATH" \
  --dataset.repo_id="$REPO_ID" \
  --output_dir="$TRAIN_DIR" \
  --batch_size="$BATCH_SIZE" \
  --steps="$OFFLINE_STEPS"