launch training
This commit is contained in:
BIN
._pyproject.toml
Normal file
BIN
._pyproject.toml
Normal file
Binary file not shown.
BIN
examples/._config.json
Normal file
BIN
examples/._config.json
Normal file
Binary file not shown.
BIN
examples/._train_config.json
Normal file
BIN
examples/._train_config.json
Normal file
Binary file not shown.
BIN
examples/._train_smolvla.sh
Normal file
BIN
examples/._train_smolvla.sh
Normal file
Binary file not shown.
@@ -1,28 +1,68 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
#SBATCH --job-name=lerobot_smolvla_test_task1_main_pretrained
|
||||||
|
#SBATCH --nodes=1
|
||||||
|
#SBATCH --ntasks=1
|
||||||
|
#SBATCH --gpus-per-node=1
|
||||||
|
#SBATCH --mail-type=END,FAIL
|
||||||
|
#SBATCH --output=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/slurm/lerobot_smolvla_test_task1_main_pretrained.out
|
||||||
|
###SBATCH --nodelist=jean-zay-a101
|
||||||
|
#SBATCH --cpus-per-task=20
|
||||||
|
###SBATCH --exclusive
|
||||||
|
#SBATCH --time=40:00:00
|
||||||
|
#SBATCH --mail-user=mustafa.shukor@isir.upmc.fr
|
||||||
|
|
||||||
|
|
||||||
|
##SBATCH --partition=gpu_p2
|
||||||
|
##SBATCH --qos=qos_gpu-t3
|
||||||
|
###SBATCH -C v100-32g
|
||||||
|
##SBATCH -A dyf@v100
|
||||||
|
|
||||||
|
##SBATCH --partition=gpu_p5
|
||||||
|
##SBATCH -C a100
|
||||||
|
###SBATCH -A dyf@a100
|
||||||
|
##SBATCH -A lqm@a100
|
||||||
|
##SBATCH --qos=qos_gpu_a100-dev
|
||||||
|
##SBATCH --qos=qos_gpu_a100-t3
|
||||||
|
|
||||||
|
#SBATCH --partition=gpu_p6
|
||||||
|
#SBATCH -C h100
|
||||||
|
#SBATCH -A lqm@h100
|
||||||
|
##SBATCH --qos=qos_gpu_h100-dev
|
||||||
|
#SBATCH --qos=qos_gpu_h100-t4
|
||||||
|
|
||||||
|
|
||||||
|
cd ~/lerobot_pi
|
||||||
|
source ~/.bashrc
|
||||||
|
source activate lerobot_main
|
||||||
|
export LC_ALL=C
|
||||||
|
|
||||||
|
rm core-*
|
||||||
|
|
||||||
|
export TRANSFORMERS_CACHE=$WORK/.cache/huggingface/transformers
|
||||||
|
export HF_HOME=$WORK/.cache/huggingface
|
||||||
|
export DATA_DIR=$WORK/.cache/huggingface/datasets
|
||||||
|
export HF_LEROBOT_HOME=$WORK/.cache/huggingface/lerobot
|
||||||
|
# export LEROBOT_HOME=
|
||||||
|
|
||||||
|
export HF_DATASETS_OFFLINE=1
|
||||||
|
export HF_HUB_OFFLINE=1
|
||||||
|
|
||||||
|
export WANDB_CACHE_DIR=/lustre/fsn1/projects/rech/dyf/ugz83ue/wandb
|
||||||
|
export WANDB_MODE=offline
|
||||||
|
|
||||||
|
export TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
|
|
||||||
cd ~/lerobot_pi
|
cd ~/lerobot_pi
|
||||||
|
|
||||||
source ~/miniconda3/bin/activate
|
# ###### dgx
|
||||||
conda activate lerobot
|
# source ~/miniconda3/bin/activate
|
||||||
|
# conda activate lerobot
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export WORK=/home/mustafa_shukor
|
# export WORK=/home/mustafa_shukor
|
||||||
# export TRANSFORMERS_CACHE=$WORK/.cache/huggingface/transformers
|
|
||||||
# export HF_HOME=$WORK/.cache/huggingface
|
|
||||||
# export DATA_DIR=$WORK/.cache/huggingface/datasets
|
|
||||||
# export HF_LEROBOT_HOME=$WORK/.cache/huggingface/lerobot
|
|
||||||
|
|
||||||
# export HF_DATASETS_OFFLINE=1
|
|
||||||
# export HF_HUB_OFFLINE=1
|
|
||||||
|
|
||||||
# export WANDB_CACHE_DIR=/lustre/fsn1/projects/rech/dyf/ugz83ue/wandb
|
|
||||||
# export WANDB_MODE=offline
|
|
||||||
|
|
||||||
|
|
||||||
## then later
|
|
||||||
## wandb sync wandb/offline-run-*
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -35,23 +75,32 @@ POLICY=smolvla
|
|||||||
POLICY_NAME=smolvla
|
POLICY_NAME=smolvla
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
OFFLINE_STEPS=200000
|
OFFLINE_STEPS=200000
|
||||||
BATCH_SIZE=64
|
BATCH_SIZE=64
|
||||||
|
|
||||||
|
|
||||||
TASK_NAME=lerobot_${DATASET_NAME}_${POLICY_NAME}
|
# TASK_NAME=lerobot_${DATASET_NAME}_${POLICY_NAME}
|
||||||
|
# TRAIN_DIR=$WORK/logs/lerobot/$TASK_NAME
|
||||||
|
# echo $TRAIN_DIR
|
||||||
|
# rm -r $TRAIN_DIR
|
||||||
|
# python lerobot/scripts/train.py \
|
||||||
|
# --policy.type=$POLICY \
|
||||||
|
# --dataset.repo_id=$REPO_ID \
|
||||||
|
# --output_dir=$TRAIN_DIR \
|
||||||
|
# --batch_size=$BATCH_SIZE \
|
||||||
|
# --steps=$OFFLINE_STEPS
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
TASK_NAME=lerobot_${DATASET_NAME}_${POLICY_NAME}_pretrained
|
||||||
TRAIN_DIR=$WORK/logs/lerobot/$TASK_NAME
|
TRAIN_DIR=$WORK/logs/lerobot/$TASK_NAME
|
||||||
echo $TRAIN_DIR
|
echo $TRAIN_DIR
|
||||||
|
|
||||||
|
|
||||||
rm -r $TRAIN_DIR
|
rm -r $TRAIN_DIR
|
||||||
CUDA_VISIBLE_DEVICES=2 python lerobot/scripts/train.py \
|
POLICY_PATH=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/lerobot/lerobot_so100_community_v1_v2_v3clean2_smolpi0_lr1e-4bs64steps400000gpus4freeze32_imgtoktrue_cross_attn_gap1_vlml16_causalacttrue_sa2_smolvlm2500_nobs1_expw0.75_feat2_lrvlm1e-4_trans0true_decaylr2.5e-630000_camfalse_fps3030_idlefalse/checkpoints/280000/test_smolvla/
|
||||||
--policy.type=$POLICY \
|
python lerobot/scripts/train.py \
|
||||||
|
--policy.path=$POLICY_PATH \
|
||||||
--dataset.repo_id=$REPO_ID \
|
--dataset.repo_id=$REPO_ID \
|
||||||
--output_dir=$TRAIN_DIR
|
--output_dir=$TRAIN_DIR \
|
||||||
|
--batch_size=$BATCH_SIZE \
|
||||||
|
--steps=$OFFLINE_STEPS
|
||||||
|
|
||||||
|
|||||||
@@ -86,6 +86,7 @@ dynamixel = ["dynamixel-sdk>=3.7.31", "pynput>=1.7.7"]
|
|||||||
feetech = ["feetech-servo-sdk>=1.0.0", "pynput>=1.7.7"]
|
feetech = ["feetech-servo-sdk>=1.0.0", "pynput>=1.7.7"]
|
||||||
intelrealsense = ["pyrealsense2>=2.55.1.6486 ; sys_platform != 'darwin'"]
|
intelrealsense = ["pyrealsense2>=2.55.1.6486 ; sys_platform != 'darwin'"]
|
||||||
pi0 = ["transformers>=4.48.0"]
|
pi0 = ["transformers>=4.48.0"]
|
||||||
|
smolvla = ["transformers>=4.50.3", "pytest>=8.3.5", "num2words>=0.5.14", "accelerate>=1.7.0"]
|
||||||
pusht = ["gym-pusht>=0.1.5 ; python_version < '4.0'"]
|
pusht = ["gym-pusht>=0.1.5 ; python_version < '4.0'"]
|
||||||
stretch = [
|
stretch = [
|
||||||
"hello-robot-stretch-body>=0.7.27 ; python_version < '4.0' and sys_platform == 'linux'",
|
"hello-robot-stretch-body>=0.7.27 ; python_version < '4.0' and sys_platform == 'linux'",
|
||||||
|
|||||||
BIN
slurm/mshukor/jz/._train_smolpi0_so100_multidatasets.sh
Normal file
BIN
slurm/mshukor/jz/._train_smolpi0_so100_multidatasets.sh
Normal file
Binary file not shown.
Binary file not shown.
BIN
slurm/mshukor/jz/._upload_models_to_hub.sh
Normal file
BIN
slurm/mshukor/jz/._upload_models_to_hub.sh
Normal file
Binary file not shown.
Reference in New Issue
Block a user